由于个人需要国内所有省、市、区的 IP 地址段与经纬度的对应关系,并且最好是可以直接导入数据库使用的数据源,经调研发现可以借助百度 API 来实现:先获取所有的 IP 地址段,再逐个调用 API 查询对应的位置信息。
实现上主要使用 Tornado 编写异步爬虫,代码如下:
import time
import json
from datetime import timedelta
from bs4 import BeautifulSoup
import os
from tornado import httpclient, gen, ioloop, queues
from api import map_ip_region_provice
# Baidu API key ("ak" query parameter of base_url).
# NOTE(review): the key is hard-coded in the source; consider loading it from
# configuration or the environment instead — confirm with the owner.
AK = "umG08TkpEwmRWkUPx4MaM0FGW3h4i4L" # you can apply for one with a Baidu developer account
# Baidu IP-location query template: placeholder {0} sits in the `ip`
# parameter and {1} in the `ak` parameter.
base_url = 'http://api.map.baidu.com/location/ip?ak={1}&ip={0}'
# GitHub page (HTML "blob" view) listing the Chinese IP ranges; get_iplist()
# scrapes the ranges out of this page.
IP_URL = 'https://github.com/17mon/china_ip_list/blob/master/china_ip_list.txt'
concurrency = 20 # concurrency level
def is_file_exist(result=None):
    """Save or load the IP list cached in ``iplist.txt``.

    Args:
        result: optional list of IP range strings.  When it is non-empty the
            entries are written to ``iplist.txt`` (one per line), replacing
            any previous content.

    Returns:
        list: ``result`` itself when it is non-empty; otherwise the
        non-blank lines read back from ``iplist.txt``, or ``[]`` when that
        file does not exist.
    """
    if result:
        with open('iplist.txt', 'w') as ipfile:
            for item in result:
                ipfile.write(item)
                ipfile.write('\n')
        return result

    # No data supplied: this is a cache lookup, so the file is only read.
    # Opening it in a write mode here would truncate the cached list.
    if not os.path.exists('iplist.txt'):
        return []
    with open('iplist.txt') as ipfile:
        stripped = (line.strip() for line in ipfile)
        return [entry for entry in stripped if entry]
def get_iplist():
    """Return the list of IP ranges, downloading it from GitHub if needed.

    ``is_file_exist()`` is consulted first; a non-empty answer is returned
    unchanged.  Otherwise the page at ``IP_URL`` is fetched with a blocking
    ``HTTPClient``, the IP ranges are scraped out of its HTML, passed to
    ``is_file_exist(result)`` and returned.

    Returns:
        list: the IP range strings.  The list is empty when the download
        fails or the page does not contain the expected table.
    """
    # Check whether the IP list is already available, to save time.
    cached = is_file_exist()
    if cached:
        return cached

    result = []
    http_client = httpclient.HTTPClient()
    try:
        body = http_client.fetch(IP_URL).body
    except Exception as e:
        # Covers httpclient.HTTPError as well as other failures such as
        # IOError.  Without a body there is nothing to parse, so stop here
        # instead of continuing with an unbound variable.
        print("Error: " + str(e))
        return result
    finally:
        # Release the client on both the success and the failure path.
        http_client.close()

    soup = BeautifulSoup(body, 'lxml')
    # attrs must be a dict mapping the attribute name to the wanted value.
    table = soup.find(
        'table',
        attrs={'class': 'highlight tab-size js-file-line-container'})
    if table is None:
        print("Error: ip table not found in " + IP_URL)
        return result

    cells = table.find_all(
        'td', attrs={'class': 'blob-code blob-code-inner js-file-line'})
    for cell in cells:
        # .string is None for cells without exactly one text child; skip
        # those and blank cells so only real entries are collected.
        text = cell.string
        if text is None:
            continue
        text = str(text).strip()
        if text:
            result.append(text)
    print('fetch ip .....')
    is_file_exist(result)
    return result
具体可以查看