import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Shared HTTP request headers: a desktop Chrome User-Agent so the site
# serves the normal desktop markup instead of blocking the scraper.
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
# Module-level accumulator: get_details() appends one dict per listing here,
# and the __main__ block dumps it to Excel at the end of the crawl.
total=[]
def get_details(url):
    """Scrape one Lianjia second-hand-housing results page.

    Fetches *url*, parses every listing's title, district, room layout,
    floor area, unit price and total price, and appends one dict per
    listing to the module-level ``total`` accumulator.

    Parameters
    ----------
    url : str
        URL of a single results page, e.g.
        ``http://sh.lianjia.com/ershoufang/d2``.

    Returns
    -------
    list
        The module-level ``total`` list (also mutated in place). On a
        fetch/parse failure an error is printed and ``total`` is
        returned unchanged.
    """
    try:
        res = requests.get(url, headers=headers, timeout=20)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Common CSS path down to one listing's <div>; shared by every
        # selector below (the original repeated this string six times).
        base = ('#js-ershoufangList > div.content-wrapper > div.content'
                ' > div.m-list > ul > li > div')

        # Listing titles.
        titles = [a.text.strip()
                  for a in soup.select(base + ' > div.prop-title > a')]

        # District / neighbourhood: the second link in the info row.
        districts = [a.text.strip() for a in soup.select(
            base + ' > div.info-table > div'
                   ' > span.info-col.row2-text > a:nth-of-type(2)')]

        # The first info row's span holds "<layout> | <area>平..." —
        # parsed twice: once for the layout, once for the area.
        row1 = soup.select(
            base + ' > div.info-table > div:nth-of-type(1) > span')
        # Room layout, e.g. "2室1厅" (text before the first space).
        layouts = [s.text.split(' ')[0].strip() for s in row1]
        # Floor area in square metres: the text between '|' and '平'.
        # Raw string avoids the invalid-escape warning of '\|'.
        areas = [float(m.strip()) for s in row1
                 for m in re.findall(r'\|(.*?)平', s.text)]

        # Unit price (yuan per square metre), e.g. "单价58210元/平".
        unit_spans = soup.select(
            base + ' > div.info-table > div:nth-of-type(2)'
                   ' > span.info-col.price-item.minor')
        unit_prices = [int(m) for s in unit_spans
                       for m in re.findall(r'价(.+?)元', s.text)]

        # Total price, in units of 10,000 yuan (万).
        total_prices = [int(s.text.strip()) for s in soup.select(
            base + ' > div.info-table > div:nth-of-type(1) > div'
                   ' > span.total-price.strong-num')]

        for title, district, layout, area, unit, price in zip(
                titles, districts, layouts, areas, unit_prices, total_prices):
            total.append({'标题': title,
                          '地区': district,
                          '室厅': layout,
                          '面积(平方)': area,
                          '元每平方': unit,
                          '价格(万)': price})
    except Exception as e:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit) and now reports the cause
        # instead of discarding it.
        print('抓取失败:', e)
    return total
if __name__ == '__main__':
    # Crawl result pages 1..40; page i lives at /ershoufang/d{i}.
    # Each call appends that page's listings to the global `total`.
    for page in range(1, 41):
        url = 'http://sh.lianjia.com/ershoufang/d{}'.format(page)
        get_details(url)
        print('第{}页抓取完毕'.format(page))
        # Throttle requests so the site is not hammered (and the
        # scraper is less likely to be rate-limited or banned).
        time.sleep(5)
    # Dump everything scraped to an Excel sheet.
    # (pandas is now imported at the top of the file rather than
    # mid-script; note .xls output requires the legacy xlwt engine —
    # modern pandas only writes .xlsx.)
    df = pd.DataFrame(total)
    df.to_excel('lianjia-3.xls')
# python爬取链家网40页二手房信息
# 原文作者: castingA3T
# 原文地址: https://blog.csdn.net/castingA3T/article/details/78927198
# 本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。