Both Request and Response pass through the downloader middlewares first, so the middlewares are where we define the headers and params that need to be added.
The two most important classes in Scrapy are Request and Response.
from scrapy import Request

class Request(object_ref):
    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):
Requests are generated by the spider.
User-Agent
1. We can add headers in the following way:
class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def parse(self, response):
        ...
        yield scrapy.Request(url, headers=self.headers, callback=self.parse)
2. Adding a random User-Agent
We can prepare a list of user-agent strings in advance and pick one at random to put into the header for each request. However, a ready-made Python library already does this job for us:
https://github.com/hellysmile/fake-useragent
pip install fake-useragent
from fake_useragent import UserAgent
ua = UserAgent()
ua.random
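Besides ua.random, fake-useragent can also return a User-Agent for a specific browser, which is what the RANDOM_UA_TYPE setting in the middleware below relies on. A quick sketch (the actual strings returned vary from run to run):

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)   # a random User-Agent from any browser
print(ua.chrome)   # a random Chrome User-Agent
print(ua.firefox)  # a random Firefox User-Agent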
We define the random request header in a middleware, since every request passes through the middlewares before it is sent out.
# middlewares.py
import logging

from fake_useragent import UserAgent

logger = logging.getLogger(__name__)


class RandomUserAgentMiddleware(object):
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.per_proxy = crawler.settings.get('RANDOM_UA_PER_PROXY', False)
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
        self.proxy2ua = {}

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            """Gets random UA based on the type setting (random, firefox...)"""
            return getattr(self.ua, self.ua_type)

        if self.per_proxy:
            # keep using the same User-Agent for the same proxy
            proxy = request.meta.get('proxy')
            if proxy not in self.proxy2ua:
                self.proxy2ua[proxy] = get_ua()
                logger.debug('Assign User-Agent %s to Proxy %s'
                             % (self.proxy2ua[proxy], proxy))
            request.headers.setdefault('User-Agent', self.proxy2ua[proxy])
        else:
            request.headers.setdefault('User-Agent', get_ua())
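For Scrapy to actually call this middleware, it also has to be enabled in settings.py. A minimal sketch, assuming the project module is called myproject (an illustrative name); the built-in UserAgentMiddleware is disabled so it does not overwrite the header we set:

# settings.py
RANDOM_UA_TYPE = "random"  # or "chrome", "firefox", ... (read by the middleware above)

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
    # disable Scrapy's default User-Agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}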
IP Proxy
The proxy IP is passed to the request through its meta attribute:
def process_request(self, request, spider):
    get_ip = GetIP()  # GetIP is our own helper class for fetching a proxy IP (defined below)
    request.meta["proxy"] = get_ip.get_random_ip()
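This process_request belongs in a downloader middleware class; like the User-Agent middleware above, it must be registered in settings.py before Scrapy will run it. A minimal sketch, assuming the class is named RandomProxyMiddleware and lives in myproject/middlewares.py (both names illustrative):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomProxyMiddleware': 544,
}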
There are two ways to get proxy IPs:
1. Write our own crawler that scrapes free proxy IPs from the Xici (xicidaili) site and stores them in a database for later use.
2. pip install scrapy-proxies (see the settings sketch below)
https://github.com/aivarsk/scrapy-proxies
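If we go with scrapy-proxies, it is configured through settings.py roughly as follows (a sketch based on the project's README; check the README for the exact, current options):

# settings.py
RETRY_TIMES = 10  # proxies fail often, so retry more
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}

PROXY_LIST = '/path/to/proxy/list.txt'  # one proxy per line
PROXY_MODE = 0  # 0 = pick a random proxy from the list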
Below is the code for the first approach, crawling the free proxy IPs from Xici:
import requests
from bs4 import BeautifulSoup
import pymysql

conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='123456',
    database='xiciip',
    port=3306
)
cursor = conn.cursor()


def crawl_ips():
    # crawl free proxy IPs from xicidaili.com
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    for i in range(1, 100):
        print("==================", i)
        resp = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        text = resp.text
        soup = BeautifulSoup(text, "html.parser")
        tr_list = soup.select("tr")
        tr_list = tr_list[1:]  # skip the table header row
        for td_list in tr_list:
            td_ip = td_list.select("td")[1].get_text()
            td_port = td_list.select("td")[2].get_text()
            http_type = td_list.select("td")[5].get_text()
            # the speed column keeps a value such as "0.5秒" in the div's title attribute
            speed = float((td_list.select("td")[6].div.get('title'))[:-1])
            if speed > 1:
                # skip proxies slower than one second
                continue
            insert_sql = """insert into iplist(http_type,ip,port,speed) values(%s,%s,%s,%s)"""
            cursor.execute(insert_sql, (http_type, td_ip, td_port, speed))
            conn.commit()
            print(td_ip)
    conn.close()
class GetIP(object):
    def delete_ip(self, ip):
        # remove an invalid ip from the database
        delete_sql = """delete from iplist where ip=%s"""
        cursor.execute(delete_sql, (ip,))
        conn.commit()
        return True

    def judge_ip(self, http_type, ip, port):
        # check whether the ip is usable by requesting a test page through it
        http_url = "http://www.baidu.com"
        proxy_url = "{0}://{1}:{2}".format(http_type, ip, port)
        try:
            # requests expects the scheme as the key, e.g. {"http": "http://ip:port"}
            proxy = {http_type.lower(): proxy_url}
            response = requests.get(http_url, proxies=proxy)
        except Exception:
            print("Invalid ip and port")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if code >= 200 and code < 300:
                print('effective ip')
                return True
            else:
                print("Invalid ip and port")
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        # pick one random usable ip from the database
        random_sql = """
            select http_type, ip, port from iplist order by rand() limit 1
        """
        cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            http_type = ip_info[0]
            ip = ip_info[1]
            port = ip_info[2]
            judge_re = self.judge_ip(http_type, ip, port)
            if judge_re:
                return http_type + "://" + ip + ":" + port
            else:
                return self.get_random_ip()
if __name__ == "__main__":
    crawl_ips()
    # getip = GetIP()
    # print(getip.get_random_ip())
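The insert and select statements above assume an iplist table already exists in the xiciip database. The schema is not shown in the code; one possible layout matching the columns used (an illustrative sketch, not necessarily the original schema) could be created like this:

create_sql = """
create table if not exists iplist (
    ip varchar(20) not null primary key,
    port varchar(10) not null,
    http_type varchar(10) not null,
    speed float
)
"""
cursor.execute(create_sql)
conn.commit()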