Scrapy学习笔记(7)-定制动态可配置爬虫

前言

    最近一直想维护一个代理IP池,在网上找了三十多个免费提供代理IP的网站,想把这些代理都抓取下来存到本地数据库,再写一个守护进程定时去验证可用性和连接速度,剔除失效代理,以此来保证库里面始终都有特定数量的优质代理IP。那么问题来了,这么多网站每个网站的页面布局或者说网页源码都不一样,数据抓取规则也不一样,如果针对每个网站都硬编码一份spider代码,这工作量貌似有点大,而且一旦目标站点调整布局,我们之前写好的spider代码很可能就得再次修改。此时我迫切地希望能有一个框架可以通过只写一份spider代码和维护多个网站的爬取规则,就能自动抓取这些网站的信息,很庆幸的是强大的Scrapy 可以做到这点,本文记录实现过程。

技术点

1.根据爬取规则自动生成spider

2.使用Scrapy核心API操作spider

3.Linux下Redis的安装和使用

4.基于Redis和SQLAlchemy对Scrapy Item去重并存储

实现过程

    笔者系统为CentOS,关于Redis的安装和配置可以参考这里:http://jinbitou.net/2016/10/28/2110.html,Windows环境的读者请自行Google。本次项目的目录结构如下:

《Scrapy学习笔记(7)-定制动态可配置爬虫》

主要文件和目录解释:

model目录存放的是数据库表的映射文件

spiders目录存放的是spider代码

initDB.py是用来初始化数据库的,自动根据model目录下的映射文件在数据库中建好相应的表

pipelines.py实现将抓取到的数据持久化到数据库

proxy_middlewares.py自定义中间件,用来设定代理IP

useragent_middlewares.py自定义中间件,用来随机切换UA

run.py总控脚本,从规则表中读取规则,动态生成spider并启动

废话不多说,上代码!

__init__.py(文件1)

# -*- coding: utf-8 -*-
"""Database bootstrap for the proxy-pool project.

Creates the SQLAlchemy declarative base and engine shared by every model
module, and exposes a session factory.
"""
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Declarative base class that every model (Proxy, Rule) inherits from.
Base = declarative_base()

# Engine for the local MySQL database.
# NOTE(review): credentials are hard-coded -- consider reading them from
# configuration or the environment instead.
engine = create_engine('mysql+mysqldb://root:123456@localhost:3306/scrapy?charset=utf8')


def loadSession():
    """Return a new SQLAlchemy session bound to the shared engine."""
    Session = sessionmaker(bind=engine)
    session = Session()
    return session

proxy.py(文件2)

from sqlalchemy import Column, String, Integer, DateTime
from . import Base
import datetime


class Proxy(Base):
    """ORM mapping for one harvested proxy IP (table ``proxies``)."""
    __tablename__ = 'proxies'

    ip_port = Column(String(30), primary_key=True, nullable=False)  # "ip:port", primary key
    type = Column(String(20), nullable=True, default="")            # protocol type (HTTP/HTTPS/...)
    level = Column(String(20), nullable=True, default="")           # anonymity level
    location = Column(String(100), nullable=True, default="")       # region the IP belongs to
    speed = Column(String(20), nullable=True, default="")           # connection speed
    lifetime = Column(String(20), nullable=True, default="")        # reported lifetime
    lastcheck = Column(String(20), nullable=True, default="")       # last check time on the source site
    source = Column(String(500), nullable=False)                    # page URL the proxy was scraped from
    rule_id = Column(Integer, nullable=False)                       # id of the rule (site/spider) that produced it
    indate = Column(DateTime, nullable=False)                       # insertion timestamp

    def __init__(self, ip_port, source, type, level, location, speed,
                 lifetime, lastcheck, rule_id):
        """Populate all columns; ``indate`` is stamped with the current time."""
        self.ip_port = ip_port
        self.type = type
        self.level = level
        self.location = location
        self.speed = speed
        self.source = source
        self.lifetime = lifetime
        self.lastcheck = lastcheck
        self.rule_id = rule_id
        # Store a real datetime object (the original passed a pre-formatted
        # string, which only works by accident with the MySQLdb driver).
        self.indate = datetime.datetime.now()

rules.py(文件3)

# -*- coding: utf-8 -*-
from sqlalchemy import Column, String, Integer
from sqlalchemy import Sequence
from . import Base


class Rule(Base):
    """One configurable crawl rule: everything a spider needs to scrape a site
    (entry URLs, link patterns and the XPath expressions for each field)."""
    __tablename__ = 'rules'

    # Auto-incrementing primary key (Sequence is ignored by MySQL but kept
    # for portability to sequence-based backends).
    id = Column(Integer, Sequence('id', start=1, increment=1), primary_key=True)
    name = Column(String(100), nullable=False)                        # spider name
    allowed_domains = Column(String(500), nullable=False)             # comma-separated allowed domains
    start_urls = Column(String(500), nullable=False)                  # comma-separated entry URLs
    next_page = Column(String(500), nullable=False, default="")       # xpath: "next page" link
    allow_url = Column(String(500), nullable=False)                   # regex: links to follow
    extract_from = Column(String(500), nullable=False, default="")    # xpath: restrict parsing region
    loop_xpath = Column(String(500), nullable=False)                  # xpath: one node per proxy row
    ip_xpath = Column(String(500), nullable=False)                    # xpath: IP address
    port_xpath = Column(String(500), nullable=False, default="")      # xpath: port
    location1_xpath = Column(String(500), nullable=False)             # xpath: region (part 1)
    location2_xpath = Column(String(500), nullable=False, default="") # xpath: region (part 2)
    speed_xpath = Column(String(500), nullable=False, default="")     # xpath: connection speed
    lifetime_xpath = Column(String(500), nullable=False, default="")  # xpath: lifetime
    type_xpath = Column(String(500), nullable=False, default="")      # xpath: protocol type
    level_xpath = Column(String(500), nullable=False, default="")     # xpath: anonymity level
    lastcheck_xpath = Column(String(500), nullable=False, default="") # xpath: last check time
    enable = Column(Integer, nullable=False)                          # rule switch: 1 = on, 0 = off

proxy_spider.py(文件4)

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class IpProxyPoolItem(scrapy.Item):
    """Container for one scraped proxy record (javabean-like data holder)."""
    ip_port = scrapy.Field()
    type = scrapy.Field()
    level = scrapy.Field()
    location = scrapy.Field()
    speed = scrapy.Field()
    lifetime = scrapy.Field()
    lastcheck = scrapy.Field()
    rule_id = scrapy.Field()
    source = scrapy.Field()


class ProxySpiderSpider(CrawlSpider):
    """CrawlSpider whose crawl rules and field XPaths come from a DB Rule row,
    so one spider class serves every configured site."""
    name = 'MagicSpider'

    def __init__(self, rule, *args, **kwargs):
        """Build the spider dynamically from *rule* (a model.rules.Rule row)."""
        self.rule = rule
        self.name = rule.name
        # Comma-separated DB fields become the lists Scrapy expects.
        self.allowed_domains = rule.allowed_domains.split(',')
        self.start_urls = rule.start_urls.split(',')
        rule_list = []
        # Follow "next page" links when the rule defines an xpath for them.
        if len(rule.next_page):
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
        # Extract data from pages whose URL matches allow_url.
        rule_list.append(Rule(LinkExtractor(
            allow=rule.allow_url.split(','),
            unique=True),            # deduplicate extracted links
            follow=True,             # keep crawling from matched pages
            callback='parse_item'))  # parse_item pulls the data out
        self.rules = tuple(rule_list)
        # CrawlSpider.__init__ compiles self.rules; forwarding *args/**kwargs
        # keeps compatibility with Scrapy's crawler API.
        super(ProxySpiderSpider, self).__init__(*args, **kwargs)

    def _extract(self, selector, xpath):
        """Return the stripped first match of *xpath* under *selector*, or ""
        when the xpath is empty or matches nothing."""
        if not len(xpath):
            return ""
        value = selector.xpath(xpath).extract_first()
        # strip() removes surrounding whitespace ('\n', '\r', '\t', ' ').
        return value.strip() if value is not None else ""

    def parse_item(self, response):
        """Yield one IpProxyPoolItem per proxy row found on *response*."""
        if not len(self.rule.loop_xpath):
            return
        for proxy in response.xpath(self.rule.loop_xpath):
            ip = self._extract(proxy, self.rule.ip_xpath)
            port = self._extract(proxy, self.rule.port_xpath)
            location1 = self._extract(proxy, self.rule.location1_xpath)
            location2 = self._extract(proxy, self.rule.location2_xpath)
            lifetime = self._extract(proxy, self.rule.lifetime_xpath)
            lastcheck = self._extract(proxy, self.rule.lastcheck_xpath)
            level = self._extract(proxy, self.rule.level_xpath)
            type_ = self._extract(proxy, self.rule.type_xpath)
            speed = self._extract(proxy, self.rule.speed_xpath)

            item = IpProxyPoolItem()
            item['ip_port'] = (":".join([ip, port])) if len(port) else ip
            item['type'] = type_
            item['level'] = level
            item['location'] = (" ".join([location1, location2])) if len(location2) else location1
            item['speed'] = speed
            item['lifetime'] = lifetime
            item['lastcheck'] = lastcheck
            item['rule_id'] = self.rule.id
            item['source'] = response.url
            yield item

pipelines.py(文件6)

# -*- coding: utf-8 -*-
import MySQLdb
from scrapy.exceptions import DropItem
import redis
from model import loadSession
from model import proxy
import logging

logger = logging.getLogger(__name__)

# Redis connection used only for fast ip:port de-duplication.
Redis = redis.StrictRedis(host='localhost', port=6379, db=0)


class DuplicatesPipeline(object):
    """Drop items whose ip:port was already seen (membership tracked in Redis)."""

    def process_item(self, item, spider):
        key = 'ip_port:%s' % item['ip_port']
        if Redis.exists(key):
            raise DropItem("Duplicate item found: %s" % item)
        Redis.set(key, 1)
        return item


class IpProxyPoolPipeline(object):
    """Persist scraped proxies to MySQL through SQLAlchemy."""

    def process_item(self, item, spider):
        # Explicitly drop invalid items (the original fell through returning
        # None, which silently starves downstream pipelines).
        if not len(item['ip_port']):
            logger.warning("ip_port is invalid!")
            raise DropItem("ip_port is invalid: %s" % item)
        record = proxy.Proxy(
            ip_port=item['ip_port'],
            type=item['type'],
            level=item['level'],
            location=item['location'],
            speed=item['speed'],
            lifetime=item['lifetime'],
            lastcheck=item['lastcheck'],
            rule_id=item['rule_id'],
            source=item['source'])
        session = loadSession()
        try:
            # merge() inserts or updates on the ip_port primary key.
            session.merge(record)
            session.commit()
        except MySQLdb.IntegrityError as e:  # py3 syntax; original used py2 "except X, e"
            logger.warning("MySQL Error: %s" % str(e))
        finally:
            session.close()  # avoid leaking a DB connection per item
        return item

proxy_middlewares.py(文件7)

# -*- coding: utf-8 -*-
import random
import logging

logger = logging.getLogger(__name__)


class ProxyMiddleware(object):
    """Downloader middleware that routes each request through a random proxy."""

    # Static pool of proxy addresses.
    # NOTE(review): consider loading these from the proxy table this project
    # builds, instead of hard-coding a few that will quickly go stale.
    proxyList = [
        '124.88.67.18:80',
        '124.88.67.52:843',
        '110.77.169.30:8080',
        '58.246.194.70:8080',
        '159.232.214.68:8080',
    ]

    def process_request(self, request, spider):
        # Pick a random proxy for every request to spread the load and
        # reduce the chance of any single IP getting banned.
        pro_adr = random.choice(self.proxyList)
        logger.info("Current Proxy <%s>", pro_adr)
        request.meta['proxy'] = "http://" + pro_adr

useragent_middlewares.py(文件9)

# -*-coding:utf-8-*-
"""Anti-ban measure: rotate the User-Agent header through a pool of values."""
import random
import logging
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

logger = logging.getLogger(__name__)


class UserAgent(UserAgentMiddleware):
    """Downloader middleware that picks a random UA string per request."""

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # Log which UA is in use for this request.
            logger.info('Current UserAgent: %s', ua)
            request.headers.setdefault('User-Agent', ua)

    # Pool of User-Agent strings; adjacent string literals are concatenated.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50"
        "(KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50"
        "(KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0;"
        ".NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 "
        "(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; "
        "AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; "
        "Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; "
        "AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; "
        "Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; "
        "Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; "
        "InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 "
        "(KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 "
        "(KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]

initDB.py(文件5)

# -*- coding: utf-8 -*-
"""Initialize the database: create every table mapped from Base subclasses
and seed one crawl rule (for ip84.com)."""
from model import loadSession
from model import proxy
from model.rules import Rule
from model import Base, engine

# Find all Base subclasses and create their tables if missing.
Base.metadata.create_all(engine)

session = loadSession()

# Seed rule for ip84.com.
item = Rule()
item.name = "ip84"
item.allowed_domains = "ip84.com"
item.start_urls = "http://ip84.com/gn/1"
item.next_page = "//a[@class='next_page']"
# Raw string so \d reaches the regex engine intact.
item.allow_url = r"/gn/\d+"
item.loop_xpath = "//table[@class='list']/tr[position()>1]"
item.ip_xpath = "td[1]/text()"
item.port_xpath = "td[2]/text()"
item.location1_xpath = "td[3]/a[1]/text()"
item.location2_xpath = "td[3]/a[2]/text()"
item.speed_xpath = "td[6]/text()"
item.type_xpath = "td[5]/text()"
item.level_xpath = "td[4]/text()"
item.enable = 1  # Integer column; 1 = rule active

session.add(item)
session.commit()

run.py(文件8)

# -*- coding: utf-8 -*-
"""Control script: read enabled rules from the DB, build one dynamically
configured spider per rule, and run them all in a single CrawlerProcess."""
from spiders.proxy_spider import ProxySpiderSpider
from model import loadSession
from model.rules import Rule
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings()

# Scrapy's pipeline architecture lets features be added by configuration
# alone: register the de-duplication and persistence pipelines.
settings.set("ITEM_PIPELINES", {
    'pipelines.DuplicatesPipeline': 200,
    'pipelines.IpProxyPoolPipeline': 300,
})

# Default request headers.
settings.set("DEFAULT_REQUEST_HEADERS", {
    'Accept': 'text/html, application/xhtml+xml, application/xml',
    'Accept-Language': 'zh-CN,zh;q=0.8',
})

# Custom middlewares: rotate the UA and the proxy IP; disable the stock
# UserAgent middleware.  Module path fixed to "downloadermiddlewares"
# (plural), matching the actual Scrapy package.
settings.set("DOWNLOADER_MIDDLEWARES", {
    'useragent_middlewares.UserAgent': 1,
    'proxy_middlewares.ProxyMiddleware': 100,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
})

# One-second delay between requests.
settings.set("DOWNLOAD_DELAY", 1)
# BUG FIX: the original called settings.get() here, which only reads a value
# and changes nothing; set() is required to actually apply these options.
settings.set("COOKIES_ENABLED", False)       # disable cookies
settings.set("ROBOTSTXT_OBEY", True)         # honour robots.txt

process = CrawlerProcess(settings)
session = loadSession()
# Only rules whose enable flag is 1 are crawled.
rules = session.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    process.crawl(ProxySpiderSpider, rule)
process.start()

开始爬取

shell中执行python run.py即可开始采集数据…

新增两个网站的爬取规则:

http://www.xicidaili.com/

《Scrapy学习笔记(7)-定制动态可配置爬虫》

http://www.kuaidaili.com/free/

《Scrapy学习笔记(7)-定制动态可配置爬虫》

检验结果:

《Scrapy学习笔记(7)-定制动态可配置爬虫》

三条规则生成的spider在两个小时不到已经采集了6万多IP。

《Scrapy学习笔记(7)-定制动态可配置爬虫》

更多原创文章,尽在金笔头博客

    原文作者:leeyis
    原文地址: https://www.jianshu.com/p/97d0beff34a8
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞