Python 爬虫学习 2:爬取租房网站信息

任务:爬取租房网站上 300 个房源的信息(详情页中的价格、位置、户主名字、性别等)

注意:超链接的获取、性别获取

from bs4 import BeautifulSoup
import requests, time
page_link = [] # Every detail-page link is stored here; the detail parser walks this list and visits each entry.
def get_page_link(page_number):
    """Collect detail-page links from search pages 1 through ``page_number``.

    Every search-result page is downloaded and the ``href`` of each
    ``a.resule_img_a`` anchor is appended to the module-level ``page_link``
    list.

    Args:
        page_number: Number of the last search-result page to crawl
            (crawling starts at page 1 and includes this page).
    """
    # range() excludes its stop value, so use page_number + 1 to make sure the
    # last requested page is actually visited.
    for each_number in range(1, page_number + 1):
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # The anchors carrying the detail-page link use the class
        # 'resule_img_a' (spelling as it appears in the selector).
        for link in soup.select('a.resule_img_a'):
            href = link.get('href')
            # Skip anchors without a target so later requests never get None.
            if href:
                page_link.append(href)


def print_gender(class_name):
    """Translate the host icon CSS class into a gender label.

    Returns '女' for 'member_ico1', '男' for 'member_ico', and None for any
    other value.
    """
    known_icons = (
        ('member_ico1', '女'),
        ('member_ico', '男'),
    )
    for icon_class, label in known_icons:
        if class_name == icon_class:
            return label
    return None

# Sample detail-page URL. Nothing in the visible code reads this global:
# get_attractions() receives its own ``url`` argument.
url = 'http://bj.xiaozhu.com/fangzi/1508951935.html'
def get_attractions(url, count):
    """Fetch one listing detail page and print the extracted fields.

    The page is parsed for title, address, price, main picture, host name and
    host gender. The resulting dict (keys 'ti', 'add', 'pri', 'pic', 'name',
    'gender', 'count') is printed. If any of the selectors matches nothing,
    a short notice is printed instead of raising.

    Args:
        url: Address of the listing detail page.
        count: Running number of this listing; stored under 'count'.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    address = soup.select(' div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    price = soup.select(' div.day_l > span')
    picture = soup.select('#curBigImage')
    host_name = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    host_gender = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')

    # zip() stops at the shortest list, so an unmatched selector yields no
    # iterations at all; start from None so that case can be detected.
    data = None
    for ti, add, pri, pic, name, gender in zip(title, address, price, picture, host_name, host_gender):
        # The gender icon div may lack a class attribute; avoid indexing None.
        icon_classes = gender.get('class') or []
        data = {
            'ti': ti.get_text(),
            'add': add.get_text(strip=True),  # strip=True trims surrounding whitespace
            'pri': pri.get_text(),
            'pic': pic.get('src'),
            'name': name.get_text(),
            'gender': print_gender(icon_classes[0]) if icon_classes else None,
        }

    if data is None:
        # Previously this path crashed with UnboundLocalError on data['count'].
        print('No listing details found for {} (count={})'.format(url, count))
        return

    data['count'] = count  # attach the running counter

    print(data)
    
get_page_link(13)  # gather detail-page links from the search-result pages
# Visit at most 300 listings. Slicing + enumerate starts at the first link
# (index 0) and never indexes past the end when fewer links were collected.
for i, detail_link in enumerate(page_link[:300], start=1):
    # time.sleep(2)  # uncomment to throttle requests
    get_attractions(detail_link, i)

 

 

    原文作者:竞biubiubiu
    原文地址: https://blog.csdn.net/weixin_36650342/article/details/60466780
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞