#1 Why use scrapy:
Because a crawler I wrote myself with the Python requests library could not retrieve HTML generated by JavaScript, the fetched HTML was incomplete.
#2 What is scrapy-redis
scrapy-redis is a plugin for scrapy: with it, scrapy automatically pulls the links waiting to be crawled from Redis and fetches the corresponding pages. It is simple to use and lets you stand up a distributed crawler framework very quickly.
#3 Installation
# install scrapy
$pip install scrapy
# install scrapy-redis
$pip install scrapy-redis
#4 Creating a project
$scrapy startproject <name> [path]
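For example, creating the MyScrapy project referenced below (the layout shown is what scrapy generates by default; the name MyScrapy is just the one used in this article):
$scrapy startproject MyScrapy
# MyScrapy/
#     scrapy.cfg          deploy/config file
#     MyScrapy/           the project's Python module
#         settings.py     where scrapy-redis is configured (see #5)
#         items.py
#         middlewares.py
#         pipelines.py
#         spiders/        spiders such as XXXSpider.py live here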
#5 Configuring and using scrapy-redis
Omitted here; see the source and README at https://github.com/rmax/scrapy-redis
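The essentials amount to a few lines in MyScrapy/settings.py. A minimal sketch based on the common settings listed in that README, assuming a Redis server on the local default port:
# MyScrapy/settings.py
# Schedule requests through a queue kept in Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Share one duplicates filter among all spider instances via Redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Location of the Redis server (assumption: localhost:6379).
REDIS_URL = 'redis://localhost:6379'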
#6 Integrating scrapy into your own project
/yourProjectRoot/MyScrapy/spiders/XXXSpider.py
(MyScrapy is the scrapy project folder)
from scrapy_redis.spiders import RedisSpider


class XXXSpider(RedisSpider):
    # custom request headers, filled in as needed
    _headers = {
    }
    # spider name
    name = 'xxxSpider'
    # the Redis list key this spider pops start URLs from
    redis_key = "spider-queue:xxxSpider"

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list.
        # domain = kwargs.pop('domain', '')
        # self.allowed_domains = filter(None, domain.split(','))
        super(XXXSpider, self).__init__(*args, **kwargs)

    def make_requests_from_url(self, url):
        """
        Apply custom settings to each request built from a queued URL.
        :param url: a URL popped from the Redis list
        :return: the configured Request
        """
        request = super(XXXSpider, self).make_requests_from_url(url)
        # replace() returns a copy of the request carrying our headers,
        # so scrapy still wraps them in its Headers object
        return request.replace(headers=self._headers)

    def parse(self, response):
        # do whatever you want with the response
        pass

    @classmethod
    def run(cls):
        import scrapy.cmdline as sm
        # launch the spider through scrapy's own command-line machinery
        sm.execute(["scrapy", "crawl", cls.name])
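Note that make_requests_from_url was deprecated in Scrapy 1.4; on newer Scrapy and scrapy-redis versions the equivalent hook to override is make_request_from_data.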
Then, from your own project, call
XXXSpider.run()
to launch and debug this spider.
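The spider blocks until its Redis list has something in it, so seed the queue with at least one start URL. A sketch using redis-cli against a local Redis; the URL is only a placeholder:
$redis-cli lpush spider-queue:xxxSpider https://example.com/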