本程序使用 Scrapy 框架抓取豆瓣图书 Top 250 排行榜中的图书信息。
目标网址为:https://book.douban.com/top250
python版本:3.5
settings.py设置
# User-Agent string configured for the crawler (a desktop Chrome 45 on
# Windows 10 identity), split across lines only for readability.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.101 Safari/537.36'
)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Douban250BookItem(scrapy.Item):
    """Fields collected for a single book on the Douban Top 250 list."""

    # Book title.
    name = scrapy.Field()
    # Price.
    price = scrapy.Field()
    # Year of publication.
    edition_year = scrapy.Field()
    # Publisher.
    publisher = scrapy.Field()
    # Rating score.
    ratings = scrapy.Field()
bookspider.py
# -*- coding:utf-8 -*-
'''by sudo rm -rf http://imchenkun.com'''
import scrapy
from douban250book.items import Douban250BookItem
class BookSpider(scrapy.Spider):
    """Spider that collects book entries from the Douban Top 250 book list.

    ``parse`` handles the start page: it yields the books found on that page
    and schedules every other page linked from the paginator.
    ``parse_next`` turns each ``<tr class="item">`` row into a
    ``Douban250BookItem``.
    """

    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250',
    ]

    def parse(self, response):
        """Yield the start page's books, then one request per other page.

        Args:
            response: the downloaded start page.

        Yields:
            ``Douban250BookItem`` objects for the start page, followed by
            ``scrapy.Request`` objects whose callback is ``parse_next``.
        """
        # The start page is already downloaded: extract its items right here
        # instead of requesting the same URL a second time.
        yield from self.parse_next(response)

        # Remaining pages. Selecting @href directly skips anchors without an
        # href instead of raising IndexError on them.
        hrefs = response.xpath('//div[@class="paginator"]/a/@href').extract()
        for href in hrefs:
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the current page.
            yield scrapy.Request(
                response.urljoin(href), callback=self.parse_next
            )

    def parse_next(self, response):
        """Yield one ``Douban250BookItem`` per book row in ``response``.

        A missing node yields ``None`` for that field rather than aborting
        the whole page with an IndexError.
        """
        for row in response.xpath('//tr[@class="item"]'):
            book = Douban250BookItem()
            book['name'] = row.xpath('td[2]/div[1]/a/@title').extract_first()
            book['ratings'] = row.xpath(
                'td[2]/div[2]/span[2]/text()'
            ).extract_first()

            info = row.xpath('td[2]/p/text()').extract_first()
            publisher, edition_year, price = self._split_publication_info(info)
            book['publisher'] = publisher
            book['edition_year'] = edition_year
            book['price'] = price
            yield book

    @staticmethod
    def _split_publication_info(info):
        """Split a book's description line into publisher, year and price.

        NOTE(review): assumes the line is '/'-separated and ends with
        ``... / publisher / year / price`` (the layout observed on
        book.douban.com/top250) -- confirm against the live page. Parts are
        taken from the end because the number of leading author/translator
        segments varies.

        Args:
            info: text of the description paragraph, or ``None``.

        Returns:
            ``(publisher, edition_year, price)``. When ``info`` is ``None``
            all three are ``None``. When it has fewer than three segments the
            layout is unknown, so the whole stripped text is returned as the
            price (what was stored before) and the other two are ``None``.
        """
        if info is None:
            return None, None, None
        parts = [part.strip() for part in info.split('/')]
        if len(parts) < 3:
            return None, None, info.strip()
        return parts[-3], parts[-2], parts[-1]