Python_大众点评网站数据爬虫

目标:

爬取大众点评某地区的酒店信息,包括酒店名,平均价格,评价人数,标签等,并将其写入txt,导入数据库。

所用模块:urllib,urllib2,re,BeautifulSoup(bs4),MySQLdb

大致步骤:

(1)获取页面所在首页url,及相应的headers;

(2)页面解析,获取信息,写入txt,并尝试获取下一页的url,若得到,则以此更新url,继续(2),若找不到,则停止,进入(3);

(3)将所得到的txt文档中的数据通过 LOAD DATA LOCAL INFILE 一次性导入MySQL。

#-*-coding:utf-8-*-
'''
created by zwg in 2016-10-15
'''
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import urllib
import urllib2
import re
import copy
from bs4 import BeautifulSoup
import MySQLdb

class get_data:
    """Scrape hotel listings from dianping.com result pages.

    Typical use: call ``get_html`` once with the first page URL, then
    ``get_all_data`` to parse successive pages and append the rows to a
    text file as tab-separated values.

    Attributes set while running:
        opener: the urllib2 opener carrying the request headers.
        url:    URL of the page currently loaded.
        html:   raw body of the page currently loaded.
        soup:   BeautifulSoup tree built from ``html``.
        info:   list of (name, price, place, number, comment) tuples
                parsed from the current page by ``get_one_data``.
    """

    BASE_URL = 'http://www.dianping.com'

    # Non-greedy so that quoted fields following "title" in the
    # data-hippo attribute are not swallowed into the hotel name.
    _TITLE_RE = re.compile(r'"title":"(.+?)"')

    def get_html(self, url, headers):
        """Build an opener that sends *headers* and load the page at *url*.

        Args:
            url: address of the first listing page.
            headers: dict mapping header names to values.
        """
        opener = urllib2.build_opener()
        # Assign the whole list at once; assigning inside a loop would
        # keep only the last header.
        opener.addheaders = list(headers.items())
        urllib2.install_opener(opener)
        self.opener = opener
        self._load(url)

    def _load(self, url):
        """Fetch *url* with the stored opener and refresh url/html/soup."""
        page = self.opener.open(url)
        try:
            html = page.read()
        finally:
            page.close()
        self.url = url
        self.html = html
        self.soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')

    def get_nextpage(self):
        """Follow the ``<a class="next">`` link of the current page.

        Returns:
            True if a next page was found and loaded, False when the
            current page has no usable "next" link (state is unchanged).
        """
        link = self.soup.find('a', class_='next')
        href = link.get('href') if link is not None else None
        if not href:
            return False
        if href.startswith('http'):
            next_url = href
        else:
            next_url = self.BASE_URL + href
        self._load(next_url)
        return True

    @staticmethod
    def _extract_title(raw):
        """Return the "title" value from a data-hippo string, or 'None'."""
        match = get_data._TITLE_RE.search(raw or '')
        return match.group(1) if match else 'None'

    @staticmethod
    def _clean_price(text):
        """Drop all whitespace from *text*; return 'None' if nothing is left."""
        cleaned = ''.join((text or '').split())
        return cleaned or 'None'

    @staticmethod
    def _clean_count(text):
        """Drop whitespace and parentheses; return 'None' unless all digits."""
        cleaned = ''.join((text or '').split())
        cleaned = cleaned.replace('(', '').replace(')', '')
        return cleaned if cleaned.isdigit() else 'None'

    @staticmethod
    def _string_of(tag):
        """Return ``tag.string``, or None when the tag itself is missing."""
        return tag.string if tag is not None else None

    def get_one_data(self):
        """Parse every ``li.hotel-block`` of the current page into ``self.info``.

        Each row is ``(name, price, place, number, comment)``. A field whose
        element is missing or empty is stored as the string 'None' (tags
        become an empty string). Each row is also printed.
        """
        info = []
        for block in self.soup.find_all('li', class_='hotel-block'):
            name = self._extract_title(block.get('data-hippo'))  # hotel name

            tag_box = block.find('p', class_='hotel-tags')
            spans = tag_box.find_all('span') if tag_box is not None else []
            # Skip spans without a single string child instead of failing.
            comment = ','.join(s.string for s in spans if s.string)

            # average price
            price = self._clean_price(self._string_of(block.find('strong')))

            # number of reviews
            number = self._clean_count(
                self._string_of(block.find('a', class_='comments')))

            place_box = block.find('p', class_='place')
            place_link = place_box.a if place_box is not None else None
            place = str(place_link.string) if place_link is not None else 'None'

            row = (name, price, place, number, comment)
            info.append(row)
            print('%-20s%-10s%-10s%-5s%s' % row)
        self.info = info

    def write_to_txt(self, file1):
        """Write the rows in ``self.info`` to *file1*, one tab-separated line each."""
        for row in self.info:
            file1.write('%s\t%s\t%s\t%s\t%s\n' % row)

    def get_all_data(self, file1, max_pages=5):
        """Parse and write up to *max_pages* pages, starting from the loaded one.

        Stops early when a page has no "next" link. No page beyond the last
        one written is fetched.

        Args:
            file1: writable file object receiving the rows.
            max_pages: maximum number of pages to process (default 5).
        """
        for page_index in range(max_pages):
            self.get_one_data()
            self.write_to_txt(file1)
            if page_index == max_pages - 1:
                break
            if not self.get_nextpage():
                break





# Entry script: crawl the Guangzhou hotel listing, append the rows to
# dazhong.txt, then bulk-load that file into the MySQL table hotel_info.
url='http://www.dianping.com/guangzhou/hotel/p1'
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
G=get_data()
G.get_html(url,headers)
conn=MySQLdb.connect('127.0.0.1','root','1234','school',)
try:
    cursor=conn.cursor()
    # Make the connection exchange text as utf8.
    conn.set_character_set('utf8')
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    # The with-block closes (and therefore flushes) the file before MySQL
    # reads it; otherwise buffered rows could be missing from the import.
    with open('dazhong.txt', 'a+') as file1:
        G.get_all_data(file1)
    # NOTE(review): the file is written relative to the working directory
    # but loaded from an absolute path; the two only match when the script
    # runs from D:/Python/web_crawler - confirm.
    sql="load data local infile 'D:/Python/web_crawler/dazhong.txt' " \
        "into table hotel_info fields terminated by '\t'"
    cursor.execute(sql)
    conn.commit()
    cursor.close()
finally:
    # Release the connection even if the crawl or the import fails.
    conn.close()

实现通过,Done!

    原文作者:像在吹
    原文地址: https://blog.csdn.net/zhangweiguo_717/article/details/52891777
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞