使用Python自带的标准库获取历史天气网(http://lishi.tianqi.com)上城市的历史气温数据。需要设置的是city名(该名称需要到历史天气网上查看),以及需要获取的年份和月份。
# -*- coding: utf-8 -*-
"""
@author: CC
"""
import re
import urllib2
import csv
import time
"""
设置需要爬取的地区,使用该地区的拼音
设置需要爬取的年、月
"""
# Pinyin name of the city to scrape; it is used as a path segment of the page
# URL and as the base name of the output CSV file.
city = 'wuhan'
# Years to fetch, given as strings (they are concatenated with the month).
years = ['2017']
# Months to fetch, given as zero-padded two-digit strings; each one is appended
# directly after the year in the page URL (e.g. "201706").
months = ['06', '07', '08','09']
def getHtml(city, year, month):
    """Download one month's history page for a city from lishi.tianqi.com.

    Args:
        city: City identifier used in the site's URLs (e.g. 'wuhan').
        year: Year of the page; converted with str().
        month: Month of the page; converted with str() and appended directly
            after the year, so it should already be zero-padded (e.g. '06').

    Returns:
        The raw response body exactly as returned by ``response.read()``.

    Errors raised by ``urllib2.urlopen`` are not caught here and propagate to
    the caller.
    """
    url = 'http://lishi.tianqi.com/' + city + '/' + str(year) + str(month) + '.html'
    # Echo the URL so progress is visible while scraping several months.
    print(url)
    request = urllib2.Request(url)
    # Send a browser-like User-Agent instead of urllib2's default one
    # (presumably the site rejects or alters responses for the default agent).
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36')
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Always release the connection, even if reading the body fails.
        response.close()
def getTemp(html):
    """Extract (date, high, low) rows from a monthly history page.

    All whitespace is removed from the page before matching, which is why the
    patterns below look for '<divclass=' rather than '<div class='.

    Args:
        html: Page source as returned by getHtml(). May be empty or None.

    Returns:
        A list of (date, high, low) tuples of strings, one per '<ul>' row of
        the first 'tqtongji2' table, in page order. An empty list is returned
        when the input is empty or the table is not present in the page.
        Rows with fewer than three '<li>' cells are skipped.
    """
    if not html:
        return []

    text = "".join(html.split())
    tables = re.findall(
        '<divclass="tqtongji2">(.*?)</div><divstyle="clear:both">', text)
    if not tables:
        # No statistics table in this page: nothing to extract.
        return []

    # Compiled once here rather than on every loop iteration.
    cell_pattern = re.compile('<li>(.*?)</li>')
    anchor_pattern = re.compile('>(.*?)</a>')

    rows = []
    # Only bare '<ul>' tags match; a '<ul class=...>' row (which becomes
    # '<ulclass=...>' after whitespace removal) is not picked up.
    for ul in re.findall('<ul>(.*?)</ul>', tables[0]):
        cells = cell_pattern.findall(ul)
        if len(cells) < 3:
            continue
        # The date is normally wrapped in a link; when the cell has no '<a>'
        # element, use the cell text itself as the date.
        linked = anchor_pattern.findall(cells[0])
        date = linked[0] if linked else cells[0]
        rows.append((date, cells[1], cells[2]))
    return rows
if __name__ == "__main__":
    # Scrape every configured year/month of the configured city and collect
    # the rows in "<city>.csv" in the current directory.
    with open(city + '.csv', 'wb+') as f:
        writer = csv.writer(f)
        # Header line naming the three columns of each data row.
        writer.writerow(('time', 'high', 'low'))
        # Years vary slowest, months fastest.
        for year, month in [(y, m) for y in years for m in months]:
            rows = getTemp(getHtml(city, year, month))
            writer.writerows(rows)
            print(year + month + ' OK!')
            # Pause after each page (presumably to avoid hammering the site).
            time.sleep(2)