Problem: using ImagesPipeline raises the error "No module named 'PIL'".
Solution: run pip install Pillow; once the dependency is installed the error goes away.
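If you want to confirm the dependency is in place before starting the crawl, a quick import check is enough; note that Pillow installs under the package name PIL, which is what ImagesPipeline imports:

# Sanity check (run in a Python shell): Pillow provides the "PIL" package.
from PIL import Image
print(Image)   # an ImportError here means Pillow is still missing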
Main task: crawl one hundred art-category images from 360 image search (image.so.com).
Open cmd and cd into the folder where you want to keep the project.
step1: scrapy startproject so_image
step2: cd so_image
step3: scrapy genspider images image.so.com
Then make the following changes:
step4: modify the Item (items.py)

import scrapy

class SoImageItem(scrapy.Item):
    # image_urls holds the download URLs; images is filled in by ImagesPipeline.
    image_urls = scrapy.Field()
    images = scrapy.Field()
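For context, ImagesPipeline reads the URLs from image_urls and, after downloading, writes the results into images as a list of dicts (one per file, with url/path/checksum). A rough sketch of a processed item, with placeholder values:

# Illustrative only: approximate shape of an item after ImagesPipeline has run.
{
    'image_urls': ['http://example.com/pic1.jpg'],
    'images': [{
        'url': 'http://example.com/pic1.jpg',
        'path': 'full/<sha1-of-url>.jpg',   # relative to IMAGES_STORE
        'checksum': '<md5-of-file-contents>',
    }],
}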
step5: modify settings.py

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'download_images'   # downloaded files end up under this folder, in a "full" subfolder
# Don't forget to turn off the robots protocol as well:
ROBOTSTXT_OBEY = False
step6: write your own spider (images.py)
# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request, cmdline


class ImagesSpider(scrapy.Spider):
    # Paged JSON API of 360 image search, "art" channel; %s is the start offset (sn).
    BASE_URL = 'http://image.so.com/zj?ch=art&sn=%s&listtype=new&temp=1'
    start_index = 0
    MAX_DOWNLOAD_NUM = 100  # crawl limit

    name = 'images'
    # allowed_domains = ['image.so.com']
    start_urls = [BASE_URL % 0]

    def parse(self, response):
        # Parse the JSON response with the json module.
        infos = json.loads(response.body.decode('utf-8'))
        # Collect every image download URL into a list and assign it to image_urls,
        # which ImagesPipeline will pick up and download.
        # ('qhimg_url' is the field name the API returned at the time of writing.)
        yield {'image_urls': [info['qhimg_url'] for info in infos['list']]}
        self.start_index += infos['count']
        if infos['count'] > 0 and self.start_index < self.MAX_DOWNLOAD_NUM:
            yield Request(self.BASE_URL % self.start_index)


def main():
    # Run the crawl from here instead of opening cmd and typing the command by hand.
    cmdline.execute(['scrapy', 'crawl', 'images'])


# Call main() only when this file is executed directly, so Scrapy can still
# import the spider module without re-running the command.
if __name__ == '__main__':
    main()
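For reference, parse() depends on the rough shape of the JSON that the zj endpoint returns. The field names below ('count', 'list', 'qhimg_url') are those used in this example and may change on the 360 side, so verify them against a live response:

# Illustrative only: approximate shape of one page returned by the zj endpoint
# (placeholder URLs; each list entry also carries other metadata fields).
{
    'count': 30,    # results in this page; parse() uses it to advance the sn offset
    'list': [
        {'qhimg_url': 'http://example.com/pic1.jpg'},
        {'qhimg_url': 'http://example.com/pic2.jpg'},
    ],
}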