# -*- coding: utf-8 -*-
# 目标:
# 爬取大众点评某地区的酒店信息,包括酒店名,平均价格,评价人数,标签等,并将其写入txt,导入数据库。
# 所用模块:urllib,urllib2,re,BeautifulSoup
# 大致步骤:
# (1)获取页面所在首页url,及相应的headers;
# (2)页面解析,获取信息,写入txt,并尝试获取下一页的url,若得到,则以此更新url,继续(2),若找不到,则停止,进入(3);
# (3)将所得到的txt文档中的数据一次性导入mysql。
#-*-coding:utf-8-*-
'''
created by zwg in 2016-10-15
'''
import sys
reload(sys)
# Force Python 2's default codec to utf-8 so the scraped Chinese text can be
# printed and written without explicit .encode() calls on every string.
# (reload(sys) restores setdefaultencoding, which site.py normally deletes.)
sys.setdefaultencoding('utf-8')
import urllib
import urllib2
import re
import copy
from bs4 import BeautifulSoup
import MySQLdb
class get_data:
def get_html(self,url,headers):
opener=urllib2.build_opener()
headers_copy=headers.items()
for i in headers_copy:
opener.addheaders=[i]
urllib2.install_opener(opener)
self.url=url
page=opener.open(url)
self.html=page.read()
self.soup=BeautifulSoup(self.html,'lxml',from_encoding='utf-8')
self.opener=opener
def get_nextpage(self):
basic_url='http://www.dianping.com'
next_url=self.soup.find_all('a',class_='next')
new_url=next_url[0]['href']
self.url=basic_url+new_url
self.html=self.opener.open(self.url)
self.soup=BeautifulSoup(self.html,'lxml',from_encoding='utf-8')
def get_one_data(self):
hotel_li = self.soup.find_all('li', class_='hotel-block')
pattern1 = re.compile('''"title":"(.+)"''')
info = []
for i in hotel_li:
s = i['data-hippo']
s1 = pattern1.findall(s)[0]
name = s1 # 酒店名
p_class = i.find_all('p', class_='hotel-tags')[0]
p11 = p_class.find_all('span')
comment = ''
for j in p11:
comment = comment + j.string + ','
comment=comment[0:len(comment)-1]
p_price = i.find_all('strong')[0]
price = p_price.string # 酒店价格
price.replace(' ', '')
if price == '\n':
price = 'None'
p_people = i.find_all('a', class_='comments')[0]
number = p_people.string # 评论人数
number = number.replace('(', '')
number = number.replace(')', '')
number = number.replace(' ', '')
if not number.isdigit():
number = 'None'
p=i.find_all('p',class_='place')
place=str(p[0].a.string)
info.append((name, price, place, number, comment))
print '%-20s%-10s%-10s%-5s%s' % (name, price, place, number, comment)
self.info=info
def write_to_txt(self, file1):
for i in self.info:
a, b, c, d, e = i
s = ('%s\t%s\t%s\t%s\t%s\n') % (a, b, c, d, e)
file1.writelines(s)
def get_all_data(self,file1):
for i in xrange(5):
self.get_one_data()
self.write_to_txt(file1)
self.get_nextpage()
# ---- script entry: scrape, dump to txt, bulk-load into MySQL ----
url = 'http://www.dianping.com/guangzhou/hotel/p1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
G = get_data()
G.get_html(url, headers)
# open() rather than the Python-2-only file() builtin
file1 = open('dazhong.txt', 'a+')
conn = MySQLdb.connect('127.0.0.1', 'root', '1234', 'school')
cursor = conn.cursor()
conn.set_character_set('utf8')
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')
try:
    G.get_all_data(file1)
finally:
    # BUGFIX: close (and therefore flush) the txt file BEFORE LOAD DATA reads
    # it — otherwise rows still sitting in the write buffer are silently
    # missing from the import.
    file1.close()
# NOTE(review): this absolute path must match where dazhong.txt was actually
# written above (relative to the CWD) — confirm the script runs from
# D:/Python/web_crawler.
sql = "load data local infile 'D:/Python/web_crawler/dazhong.txt' " \
      "into table hotel_info fields terminated by '\t'"
try:
    cursor.execute(sql)
    conn.commit()
finally:
    # always release the DB handles, even if the bulk load fails
    cursor.close()
    conn.close()
# 实现通过,Done!