# -*- coding: utf-8 -*-
# 目标:
# 爬取大众点评某地区的酒店信息,包括酒店名,平均价格,评价人数,标签等,并将其写入txt,导入数据库。
# 所用模块:urllib,urllib2,re,BeautifulSoup
# 大致步骤:
# (1)获取页面所在首页url,及相应的headers;
# (2)页面解析,获取信息,写入txt,并尝试获取下一页的url,若得到,则以此更新url,继续(2),若找不到,则停止,进入(3);
# (3)将所得到的txt文档中的数据一次性导入mysql。
#-*-coding:utf-8-*-
'''
created by zwg in 2016-10-15
'''
import sys
reload(sys)
# Force Python 2's default codec to utf-8 so the scraped Chinese text can be
# printed and written without explicit .encode() calls on every string.
# (reload(sys) restores setdefaultencoding, which site.py normally deletes.)
sys.setdefaultencoding('utf-8')
import urllib
import urllib2
import re
import copy
from bs4 import BeautifulSoup
import MySQLdb
class get_data:
def get_html(self,url,headers):
opener=urllib2.build_opener()
headers_copy=headers.items()
for i in headers_copy:
opener.addheaders=[i]
urllib2.install_opener(opener)
self.url=url
page=opener.open(url)
self.html=page.read()
self.soup=BeautifulSoup(self.html,'lxml',from_encoding='utf-8')
self.opener=opener
def get_nextpage(self):
basic_url='http://www.dianping.com'
next_url=self.soup.find_all('a',class_='next')
new_url=next_url[0]['href']
self.url=basic_url+new_url
self.html=self.opener.open(self.url)
self.soup=BeautifulSoup(self.html,'lxml',from_encoding='utf-8')
def get_one_data(self):
hotel_li = self.soup.find_all('li', class_='hotel-block')
pattern1 = re.compile('''"title":"(.+)"''')
info = []
for i in hotel_li:
s = i['data-hippo']
s1 = pattern1.findall(s)[0]
name = s1 # 酒店名
p_class = i.find_all('p', class_='hotel-tags')[0]
p11 = p_class.find_all('span')
comment = ''
for j in p11:
comment = comment + j.string + ','
comment=comment[0:len(comment)-1]
p_price = i.find_all('strong')[0]
price = p_price.string # 酒店价格
price.replace(' ', '')
if price == '\n':
price = 'None'
p_people = i.find_all('a', class_='comments')[0]
number = p_people.string # 评论人数
number = number.replace('(', '')
number = number.replace(')', '')
number = number.replace(' ', '')
if not number.isdigit():
number = 'None'
p=i.find_all('p',class_='place')
place=str(p[0].a.string)
info.append((name, price, place, number, comment))
print '%-20s%-10s%-10s%-5s%s' % (name, price, place, number, comment)
self.info=info
def write_to_txt(self, file1):
for i in self.info:
a, b, c, d, e = i
s = ('%s\t%s\t%s\t%s\t%s\n') % (a, b, c, d, e)
file1.writelines(s)
def get_all_data(self,file1):
for i in xrange(5):
self.get_one_data()
self.write_to_txt(file1)
self.get_nextpage()
# ---- script entry: scrape, dump to txt, bulk-load into MySQL ----
url = 'http://www.dianping.com/guangzhou/hotel/p1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
G = get_data()
G.get_html(url, headers)
# open() rather than the Python-2-only file() builtin
file1 = open('dazhong.txt', 'a+')
conn = MySQLdb.connect('127.0.0.1', 'root', '1234', 'school')
cursor = conn.cursor()
conn.set_character_set('utf8')
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')
try:
    G.get_all_data(file1)
finally:
    # BUGFIX: close (and therefore flush) the txt file BEFORE LOAD DATA reads
    # it — otherwise rows still sitting in the write buffer are silently
    # missing from the import.
    file1.close()
# NOTE(review): this absolute path must match where dazhong.txt was actually
# written above (relative to the CWD) — confirm the script runs from
# D:/Python/web_crawler.
sql = "load data local infile 'D:/Python/web_crawler/dazhong.txt' " \
      "into table hotel_info fields terminated by '\t'"
try:
    cursor.execute(sql)
    conn.commit()
finally:
    # always release the DB handles, even if the bulk load fails
    cursor.close()
    conn.close()
# 实现通过,Done!