scrapy-redis 图片下载两种方法

2023年3月20日 233次阅读 来源: 啤酒找尿布

图片下载

pipelines.py文件设置

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
import json
import codecs
import requests
from Logo import settings
import os

# Store scraped items in a JSON Lines file
class JsonWithEncodingPipeline(object):
    """Item pipeline that writes every item to ``logo.json``, one JSON object per line.

    The file is opened (and truncated) when the pipeline is instantiated.
    It is written as UTF-8 with ``ensure_ascii=False`` so non-ASCII text is
    stored as readable characters rather than backslash escapes.
    """

    def __init__(self):
        self.file = codecs.open('logo.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and return it unchanged.

        Returning the item lets later pipelines keep processing it.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        ``close_spider`` is the hook Scrapy invokes on item pipelines; without
        it the file would only be closed (and flushed) at interpreter exit.
        """
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for :meth:`close_spider`.

        Scrapy does not call a pipeline method with this name on its own
        (only if it is explicitly connected to the ``spider_closed`` signal),
        so the real work lives in ``close_spider``. Closing twice is harmless.
        """
        self.close_spider(spider)

# Image download, method 1: subclass Scrapy's ImagesPipeline
class DownloadImagesPipeline(ImagesPipeline):
    """Download every URL in ``item['imageurl']`` and name each file after the car.

    ``item['carname']`` is indexed with the position of the URL inside
    ``item['imageurl']``, so the two sequences are expected to be parallel.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per image URL.

        The item and the URL's position are attached as ``meta`` so that
        ``file_path`` can build the target file name. ``enumerate`` is used
        instead of ``list.index`` so that duplicate URLs keep their own
        position (``index`` always returns the first match).
        """
        for index, image_url in enumerate(item['imageurl']):
            yield Request(image_url, meta={'item': item, 'index': index})

    def file_path(self, request, response=None, info=None, item=None):
        """Return the storage path ``full/<country>/<carname>.<ext>`` for a request.

        ``item`` is optional: newer Scrapy versions pass it as a keyword
        argument, older ones do not, in which case the item stored in
        ``request.meta`` by ``get_media_requests`` is used.
        """
        if item is None:
            item = request.meta['item']
        # Position of this image inside item['imageurl'].
        index = request.meta['index']

        # Extension is whatever follows the last dot of the last URL segment
        # (e.g. jpg, png).
        extension = request.url.split('/')[-1].split('.')[-1]
        image_guid = item['carname'][index] + '.' + extension
        # NOTE (from the original author): item['country'] must already be a
        # plain string (e.g. built with ''.join(...) in the spider); otherwise
        # the directory name ends up as an escaped list representation.
        filename = u'full/{0}/{1}'.format(item['country'], image_guid)
        return filename


# Image download, method 2: fetch the images directly with requests
class ImageDownloadPipeline(object):
    """Item pipeline that saves ``item['imageurl']`` images under IMAGES_STORE/<spider name>.

    After processing, ``item['images']`` holds the local paths of all images
    that exist on disk (already present or freshly downloaded).
    """

    # Seconds to wait for the server before giving up on a single image.
    DOWNLOAD_TIMEOUT = 30

    def process_item(self, item, spider):
        """Download the item's images and record their local paths.

        Items without an ``'imageurl'`` field are returned untouched.
        """
        if 'imageurl' not in item:
            return item

        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        images = []
        for image_url in item['imageurl']:
            # File name = everything after the host part of the URL, with the
            # path separators replaced by underscores.
            image_file_name = '_'.join(image_url.split('/')[3:])
            file_path = '%s/%s' % (dir_path, image_file_name)
            # Skip the download when the file is already there; only list
            # paths that really exist on disk.
            if os.path.exists(file_path) or self._download(image_url, file_path, spider):
                images.append(file_path)

        item['images'] = images
        return item

    def _download(self, image_url, file_path, spider):
        """Stream ``image_url`` into ``file_path``; return True on success.

        The data is written to a ``.part`` file that is renamed only after the
        whole body arrived, so an interrupted or failed download never leaves
        a truncated file that a later run would mistake for a finished one.
        """
        part_path = file_path + '.part'
        try:
            with requests.get(image_url, stream=True,
                              timeout=self.DOWNLOAD_TIMEOUT) as response:
                # Do not store HTTP error pages as images.
                response.raise_for_status()
                with open(part_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        handle.write(block)
            os.rename(part_path, file_path)
        except requests.RequestException as exc:
            spider.logger.warning('Failed to download %s: %s', image_url, exc)
            return False
        finally:
            # After a successful rename the .part file is gone; otherwise
            # remove the leftover before returning or re-raising.
            if os.path.exists(part_path):
                os.remove(part_path)
        return True

settings.py文件设置

# Pipelines to enable, mapped to their order value: items pass through the
# pipelines from the lowest value to the highest. Uncomment the ones you need.
ITEM_PIPELINES = {
    # 'sucai.pipelines.SucaiPipeline': 1,
    # 'Logo.pipelines.JsonWithEncodingPipeline': 2,
    # 'Logo.pipelines.DownloadImagesPipeline': 1,
    'Logo.pipelines.ImageDownloadPipeline': 3,
}
# Directory where downloaded images are stored. Raw string so the backslash
# is taken literally ('\I' is not a valid escape sequence); the value is
# unchanged.
IMAGES_STORE = r'.\Image'

    原文作者：啤酒找尿布
    原文地址: https://www.jianshu.com/p/639e2a599532
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。