# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
import json
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from common import ProxyGenerator
# Module-level shared proxy provider (project-local helper from `common`);
# instantiated once at import time and reused by ProxyMiddlware instances.
client = ProxyGenerator()
class ProxyMiddlware(object):
    """Downloader middleware that attaches an HTTP proxy to outgoing requests.

    NOTE(review): the class name is misspelled ("Middlware") but is kept
    unchanged because Scrapy settings reference middlewares by dotted path.
    """

    # Single source of truth for the hard-coded fallback proxy; previously this
    # literal was duplicated in two places inside process_request().
    DEFAULT_PROXY = 'http://222.94.146.18:9999'

    def __init__(self):
        # Shared ProxyGenerator client created at module import time.
        self.client = client

    def process_request(self, request, spider):
        """Assign a proxy to the request unless the request already has one."""
        if not request.meta.get('proxy'):
            # proxy = self.get_proxy()  # enable to rotate proxies from the pool
            proxy = {'http': self.DEFAULT_PROXY, 'https': self.DEFAULT_PROXY}
            print('当前IP:', proxy)
            request.meta['proxy'] = self.DEFAULT_PROXY
        # NOTE: a commented-out process_response() retry hook (re-queue the
        # request with a fresh proxy on non-200 responses) was removed as dead
        # code; re-add it if ban-detection retries are needed.

    def get_proxy(self):
        """Return a random proxy string drawn from the generator's pool."""
        proxy_list = self.client.get_ips()
        return random.choice(proxy_list)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """User-agent rotation middleware.

    Picks a random entry from the ``MY_USER_AGENT`` setting for every outgoing
    request and writes it into the request's ``User-Agent`` header.
    """

    def __init__(self, user_agent):
        super().__init__()
        # Pool of user-agent strings to rotate through (from MY_USER_AGENT).
        self.user_agent = user_agent
        # Default header template kept on the instance; not applied per-request
        # by this middleware as written — presumably for manual use. TODO confirm.
        self.headers = {
            "Proxy-Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
            "Referer": 'http://www.baidu.com/',
            "Accept-Charset": "gb2312,gbk;q=0.7,utf-8;q=0.7,*;q=0.7",
        }

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's MY_USER_AGENT setting."""
        return cls(user_agent=crawler.settings.get('MY_USER_AGENT'))

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent with a randomly chosen one."""
        # self.headers['Referer'] = request.url
        request.headers['User-Agent'] = random.choice(self.user_agent)