Scraping 58.com rental listings with Scrapy (Part 2)

This part walks through the crawler code itself.
First, the Item definition.
collection is the name of the MongoDB collection the records go into; the remaining fields correspond to the data items we want to scrape.

import scrapy
from scrapy import Field


class chengduItem(scrapy.Item):
    collection = 'chuzu'  # name of the MongoDB collection the pipeline writes into
    describe = Field()
    size = Field()
    region = Field()
    address = Field()
    agent = Field()
    landlord = Field()
    image = Field()
    price = Field()

Next, the page-parsing part.
Locate the element you want in Chrome, right-click it and choose Copy → Copy XPath to get an XPath expression, then assign the extracted value to the matching item key. Note that Chrome gives you an absolute path (like the commented-out /html/body/... expressions in the code below); here they are rewritten as shorter, class-based relative XPaths, which break less easily.

import scrapy

from ..items import chengduItem  # adjust the import path to your project's items module


class ChengduSpider(scrapy.Spider):
    name = 'chengdu'
    allowed_domains = ['cd.58.com']
    start_urls = ['https://cd.58.com/chuzu/0/']

    def parse(self, response):
        # every listing is an <li> under ul.listUl; the last <li> holds the pager
        ul = response.selector.xpath('//ul[@class="listUl"]/li')
        for li in ul[:-1]:
            item = chengduItem()
            item['describe'] = li.xpath('./div[@class="des"]/h2/a/text()')\
                .extract_first().strip()
            item['size'] = li.xpath('./div[@class="des"]/p[1]/text()')\
                .extract_first().strip()
            item['region'] = li.xpath('./div[@class="des"]/p[2]/a[1]/text()')\
                .extract_first().strip()
            add_p = li.xpath('./div[@class="des"]/p[2]/text()')\
                .extract()[-1].strip()
            add_a = li.xpath('./div[@class="des"]/p[2]/a[2]/text()').extract_first()
            if add_a is None:
                item['address'] = add_p
            else:
                item['address'] = add_a
            # /html/body/div[5]/div/div[5]/div[2]/ul/li[1]/div[2]/p[3]/text()
            item['landlord'] = li.xpath('./div[@class="des"]/p[3]/text()')\
                .extract()[-1].strip()
            item['image'] = li.xpath('./div[@class="img_list"]/a/img/@lazy_src')\
                .extract_first().strip()
            # /html/body/div[5]/div/div[5]/div[2]/ul/li[2]/div[3]/div[2]/b
            item['price'] = li.xpath('./div[3]/div[2]/b/text()').extract_first()
            yield item

            # //*[@id="bottom_ad_li"]/div[2]/a[6]/span
        next_page = ul[-1].xpath('./div[2]/a[@class="next"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.parse)
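
Before running the spider it can be worth checking these XPath expressions interactively with scrapy shell (a quick sketch; 58.com's markup and anti-crawling measures may have changed since this was written):

scrapy shell "https://cd.58.com/chuzu/0/"
# then, inside the shell:
lis = response.xpath('//ul[@class="listUl"]/li')
lis[0].xpath('./div[@class="des"]/h2/a/text()').extract_first()   # first listing title
lis[-1].xpath('./div[2]/a[@class="next"]/@href').extract_first()  # next-page href

If these come back None, the page layout has probably changed and the XPaths need updating.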

Finally, the data cleaning and database-writing part.

import pymongo


class Chengdu58Pipeline(object):
    def __init__(self):
        # 58.com renders digits with an obfuscated web font, so they arrive as
        # glyphs like 閏/麣/龒 (&#x... entities in the HTML); map them back to real digits
        self.d = {'閏': '4', '麣': '1', '龒': '2', '齤': '7', '驋': '5',
                  '餼': '0', '龥': '8', '龤': '3', '鑶': '6', '鸺': '9'}

    def process_item(self, item, spider):
        # restore the digits in size, dropping non-breaking spaces (\xa0)
        t = []
        for i in list(item['size']):
            if i != '\xa0':
                if i in self.d:
                    t.append(self.d[i])
                else:
                    t.append(i)
        item['size'] = "".join(t)

        # restore the digits in describe
        t.clear()
        for i in list(item['describe']):
            if i != '\xa0':
                if i in self.d:
                    t.append(self.d[i])
                else:
                    t.append(i)
        item['describe'] = "".join(t)

        # price should contain nothing but obfuscated digits; keep any other
        # character (e.g. a dash in a price range) as-is just in case
        t.clear()
        for i in list(item['price']):
            t.append(self.d.get(i, i))
        item['price'] = "".join(t)

        # strip the stray leading colon from landlord
        item["landlord"] = item["landlord"][1:]

        return item
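
Because this pipeline only uses dictionary-style access on the item, the glyph decoding can be sanity-checked outside Scrapy with a plain dict. A minimal sketch (the sample strings are made up for illustration, built from glyphs in the mapping above):

p = Chengdu58Pipeline()
fake = {
    'describe': '麣室龒厅 精装修',  # decodes to "1室2厅 精装修"
    'size': '閏驋\xa0㎡',           # decodes to "45㎡" (the \xa0 is dropped)
    'price': '麣餼餼餼',            # decodes to "1000"
    'landlord': ':个人房源',        # leading colon gets stripped
}
p.process_item(fake, None)
print(fake)

The second pipeline then writes the cleaned items into MongoDB: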

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # read the MongoDB URI and database name from settings.py
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    # open the MongoDB connection when the spider starts
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # insert the item into the collection named on the item class
    def process_item(self, item, spider):
        # insert() was removed in PyMongo 4, so use insert_one() instead
        self.db[item.collection].insert_one(dict(item))
        return item

    # close the MongoDB connection when the spider finishes
    def close_spider(self, spider):
        self.client.close()
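
Both pipelines have to be enabled in settings.py, together with the MongoDB connection details that from_crawler reads. A minimal sketch, assuming the Scrapy project module is called chengdu58 (substitute your own project name; the URI and database name below are placeholders too):

# settings.py
ITEM_PIPELINES = {
    # lower number runs first: decode the glyphs before writing to MongoDB
    'chengdu58.pipelines.Chengdu58Pipeline': 300,
    'chengdu58.pipelines.MongoPipeline': 400,
}

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'chengdu58'

The collection name ('chuzu') is not set here; it comes from the collection attribute defined on the item class.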

Original author: Houtasu
Original post: https://www.jianshu.com/p/eb331e6e48bd