图片下载
pipelines.py文件设置
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
import json
import codecs
import requests
from Logo import settings
import os
# Store scraped items in a JSON file (one JSON object per line).
class JsonWithEncodingPipeline(object):
    """Item pipeline that writes every item to ``logo.json`` as UTF-8 JSON.

    Each item is converted to a ``dict`` and serialized on its own line.
    ``ensure_ascii=False`` keeps non-ASCII text (e.g. Chinese) readable in
    the output instead of emitting ``\\uXXXX`` escapes.
    """

    def __init__(self):
        # Opened in 'w' mode: a previous logo.json is truncated on start-up.
        self.file = codecs.open('logo.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged."""
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        ``close_spider`` is the hook name Scrapy invokes on item pipelines;
        the previous name ``spider_closed`` was never called automatically,
        so the file was never explicitly flushed and closed.
        """
        self.file.close()

    # Backward-compatible alias for code that connected the old method name
    # to the ``spider_closed`` signal by hand. Closing twice is harmless.
    spider_closed = close_spider
# Image download, approach 1: build on Scrapy's ImagesPipeline.
class DownloadImagesPipeline(ImagesPipeline):
    """Download every URL in ``item['imageurl']`` and store it under a
    per-country directory, named after the matching ``item['carname']`` entry.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per image URL.

        The item and the URL's position in the list travel in ``meta`` so
        that ``file_path`` can pick the matching name. ``enumerate`` is used
        rather than ``list.index``: ``index`` returns the first match, which
        gives the wrong position when the same URL occurs more than once.
        """
        for index, image_url in enumerate(item['imageurl']):
            yield Request(image_url, meta={'item': item, 'index': index})

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path (relative to IMAGES_STORE) for ``request``.

        ``item`` is accepted because newer Scrapy releases pass it as a
        keyword argument; when it is absent the item carried in
        ``request.meta`` is used.
        """
        if item is None:
            item = request.meta['item']
        # Position of this image in item['imageurl'], set in get_media_requests.
        index = request.meta['index']
        # File extension (jpg, png, ...) is whatever follows the last dot of
        # the last URL path segment.
        extension = request.url.split('/')[-1].split('.')[-1]
        image_guid = item['carname'][index] + '.' + extension
        # NOTE(review): item['country'] is expected to already be a plain
        # string (e.g. ''.join(...) applied by the spider); otherwise the
        # directory name is rendered with escaped characters - confirm.
        filename = u'full/{0}/{1}'.format(item['country'], image_guid)
        return filename
# Image download, approach 2: fetch the images directly with ``requests``.
class ImageDownloadPipeline(object):
    """Download every URL in ``item['imageurl']`` into
    ``<IMAGES_STORE>/<spider.name>/`` and record the local paths in
    ``item['images']``.
    """

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the pipeline indefinitely.
    download_timeout = 30

    def process_item(self, item, spider):
        """Download the item's images and return the item.

        Items without an ``'imageurl'`` field are passed through untouched.
        Files that already exist on disk are not downloaded again. Network
        and HTTP errors propagate to the caller.
        """
        if 'imageurl' not in item:
            return item

        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        images = []  # local paths of all images belonging to this item
        for image_url in item['imageurl']:
            # Drop scheme and host ("http:", "", "host") and flatten the
            # remaining URL path into a single file name.
            image_file_name = '_'.join(image_url.split('/')[3:])
            file_path = '%s/%s' % (dir_path, image_file_name)
            images.append(file_path)
            if os.path.exists(file_path):
                continue
            self._download(image_url, file_path)

        item['images'] = images
        return item

    def _download(self, image_url, file_path):
        """Stream ``image_url`` to ``file_path`` without leaving partial files.

        The data is written to ``file_path + '.part'`` and renamed only after
        the download succeeded, so a failed request can never leave behind an
        empty or truncated file that later runs would treat as "already
        downloaded".
        """
        partial_path = file_path + '.part'
        response = requests.get(image_url, stream=True,
                                timeout=self.download_timeout)
        try:
            # Do not store an HTTP error page as if it were the image.
            response.raise_for_status()
            with open(partial_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
            os.rename(partial_path, file_path)
        except Exception:
            # Clean up the incomplete file, then let the error propagate.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        finally:
            response.close()
settings.py文件设置
# Enabled item pipelines mapped to their order value. The commented-out
# entries are the alternative pipelines; uncomment one to enable it.
ITEM_PIPELINES = {
    # 'sucai.pipelines.SucaiPipeline': 1,
    # 'Logo.pipelines.JsonWithEncodingPipeline': 2,
    # 'Logo.pipelines.DownloadImagesPipeline': 1,
    'Logo.pipelines.ImageDownloadPipeline': 3,
}
# Root directory for downloaded images. A forward slash is used because
# '.\Image' contains the invalid escape sequence '\I' and only works as a
# path separator on Windows; './Image' is accepted on every platform and
# matches the '/'-joined paths built in the pipelines.
IMAGES_STORE = './Image'