This section walks through the crawler code itself.
First, the item definition. `collection` holds the name of the MongoDB collection to write into; the remaining fields map one-to-one to the data we want to scrape.
import scrapy
from scrapy import Field

class chengduItem(scrapy.Item):
    collection = 'chuzu'  # MongoDB collection name, read by the pipeline
    describe = Field()    # listing title
    size = Field()        # layout / floor area
    region = Field()      # district
    address = Field()     # street or complex
    agent = Field()       # agency info
    landlord = Field()    # lister type
    image = Field()       # thumbnail URL
    price = Field()       # monthly rent
Next, the page-parsing logic.
Open the page in Chrome, locate the element you want, then right-click and choose Copy → Copy XPath to grab its XPath directly; assign each extracted value to the matching item key and you are done.
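One wrinkle: Copy XPath yields an absolute path for one specific listing, so inside the per-listing loop below it is re-rooted on each li node. A minimal sketch of that conversion (the title variable is just for illustration):

# Chrome's Copy XPath gives an absolute path to a single listing:
#   /html/body/div[5]/div/div[5]/div[2]/ul/li[1]/div[2]/h2/a
# Rewritten relative to the current <li> so it matches every listing:
title = li.xpath('./div[@class="des"]/h2/a/text()').extract_first()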
import scrapy
from chengdu58.items import chengduItem  # adjust to your project's package name

class ChengduSpider(scrapy.Spider):
    name = 'chengdu'
    allowed_domains = ['cd.58.com']
    start_urls = ['https://cd.58.com/chuzu/0/']

    def parse(self, response):
        # Each listing is an <li> under the listUl list; the last <li>
        # holds the pagination bar, so it is skipped here and reused below.
        ul = response.xpath('//ul[@class="listUl"]/li')
        for li in ul[:-1]:
            item = chengduItem()
            item['describe'] = li.xpath('./div[@class="des"]/h2/a/text()')\
                .extract_first().strip()
            item['size'] = li.xpath('./div[@class="des"]/p[1]/text()')\
                .extract_first().strip()
            item['region'] = li.xpath('./div[@class="des"]/p[2]/a[1]/text()')\
                .extract_first().strip()
            # The address is either the second <a> in p[2] or, when that
            # link is absent, the trailing text node of p[2] itself.
            add_p = li.xpath('./div[@class="des"]/p[2]/text()')\
                .extract()[-1].strip()
            add_a = li.xpath('./div[@class="des"]/p[2]/a[2]/text()').extract_first()
            item['address'] = add_p if add_a is None else add_a
            # Copied XPath: /html/body/div[5]/div/div[5]/div[2]/ul/li[1]/div[2]/p[3]/text()
            item['landlord'] = li.xpath('./div[@class="des"]/p[3]/text()')\
                .extract()[-1].strip()
            # Images are lazy-loaded, so the URL lives in @lazy_src, not @src.
            item['image'] = li.xpath('./div[@class="img_list"]/a/img/@lazy_src')\
                .extract_first().strip()
            # Copied XPath: /html/body/div[5]/div/div[5]/div[2]/ul/li[2]/div[3]/div[2]/b
            item['price'] = li.xpath('./div[3]/div[2]/b/text()').extract_first()
            yield item
        # The "next page" link sits inside the pagination <li> skipped above.
        # Copied XPath: //*[@id="bottom_ad_li"]/div[2]/a[6]/span
        next_page = ul[-1].xpath('./div[2]/a[@class="next"]/@href').extract_first()
        if next_page:
            # urljoin handles the case where the href is relative
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
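With the spider saved under the project's spiders/ directory, the crawl is started the standard Scrapy way with `scrapy crawl chengdu` from the project root (add `-o out.json` to also dump the items to a file for inspection).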
Finally, data cleaning and the MongoDB write-out.
class Chengdu58Pipeline(object):
    def __init__(self):
        # 58.com renders digits with an obfuscated web font, so the scraped
        # text contains private-use glyphs instead of numbers. This table
        # maps each glyph back to the digit it displays as.
        self.d = {'閏': '4', '麣': '1', '龒': '2', '齤': '7', '驋': '5',
                  '餼': '0', '龥': '8', '龤': '3', '鑶': '6', '鸺': '9'}

    def decode(self, text):
        # Drop non-breaking spaces and swap obfuscated glyphs for digits;
        # any other character (e.g. the dash in a price range) passes
        # through unchanged, which also avoids a KeyError on unmapped chars.
        return "".join(self.d.get(c, c) for c in text if c != '\xa0')

    def process_item(self, item, spider):
        # Restore the real digits in size, describe and price.
        item['size'] = self.decode(item['size'])
        item['describe'] = self.decode(item['describe'])
        item['price'] = self.decode(item['price'])
        # Strip the stray leading colon from the landlord field.
        item["landlord"] = item["landlord"][1:]
        return item
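As a quick sanity check, the glyph table can be exercised on its own; the sample string below is hypothetical, assembled from glyphs in the mapping:

pipe = Chengdu58Pipeline()
# '麣龒餼餼' displays as '1200' once the font trick is undone
print(pipe.decode('麣龒餼餼元/月'))  # -> '1200元/月'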
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # Read the Mongo connection URI and database name from settings.py.
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    # Open the MongoDB connection when the spider starts.
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    # Insert each item into the collection named on the item class.
    # insert_one replaces the insert() method deprecated in pymongo 3.
    def process_item(self, item, spider):
        self.db[item.collection].insert_one(dict(item))
        return item

    # Close the connection when the spider finishes.
    def close_spider(self, spider):
        self.client.close()
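For the two pipelines to run, they must be registered in settings.py together with the Mongo connection details. A minimal sketch, assuming the project package is named chengdu58 and MongoDB runs locally on the default port:

# settings.py (values are assumptions for a local setup)
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'chengdu58'
ITEM_PIPELINES = {
    # lower number runs first, so glyphs are decoded before the DB write
    'chengdu58.pipelines.Chengdu58Pipeline': 300,
    'chengdu58.pipelines.MongoPipeline': 400,
}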