Python Web Crawler (7) - Deep Crawling with CrawlSpider

Before diving into deep crawling, here is a simple and practical library, fake-useragent, which generates fake User-Agent values for the request headers:

# Install
pip install fake-useragent
# Usage
import requests
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'User-Agent': ua.random}
url = 'URL of the page to crawl'
resp = requests.get(url, headers=headers)
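
If you are working inside a Scrapy project, the same idea fits naturally into a downloader middleware. The sketch below is only an illustration (the class name and where you place it are placeholders); it would then be enabled through DOWNLOADER_MIDDLEWARES in settings.py, just like the custom middlewares shown in section 6.

# Sketch: a Scrapy downloader middleware that sets a random User-Agent with fake-useragent.
# The class name is a placeholder; enable it via DOWNLOADER_MIDDLEWARES in settings.py.
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Overwrite the User-Agent header of every outgoing request
        request.headers['User-Agent'] = self.ua.random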

1. Deep crawling with CrawlSpider

scrapy.spiders.CrawlSpider

 Create a project: scrapy startproject <project_name>

 Create a spider: scrapy genspider -t crawl <spider_name> <domains>

 Core rule handling: from scrapy.spiders import CrawlSpider, Rule
 Core link extraction: from scrapy.linkextractors import LinkExtractor

  • rules: a set of Rule objects (each built around a link extractor) that tells the spider which links to follow.

  • Each Rule can also take a callback used to parse the downloaded responses; the template-generated parse_item() method is an example of extracting data from a response. A minimal spider skeleton is sketched right after this list.

  • Fetch a page for experimentation with the shell command: scrapy shell http://baidu.com
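
Putting these pieces together, a minimal CrawlSpider looks roughly like the sketch below; the spider name, domain, URL pattern and XPath are placeholders for illustration, not part of any project in this article.

# -*- coding: utf-8 -*-
# Minimal CrawlSpider sketch; name, domain, pattern and XPath are illustrative placeholders.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # Follow pagination links and hand every matching response to parse_item()
        Rule(LinkExtractor(allow=(r'page=\d+',)), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Extract whatever fields you need; this XPath is only an example
        for title in response.xpath('//h3/a/text()').extract():
            yield {'title': title}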

2. Link extraction: LinkExtractor

class scrapy.linkextractors.LinkExtractor(
    allow = (),             # regular expressions; only matching URLs are extracted
    deny = (),              # regular expressions; matching URLs are never extracted
    allow_domains = (),     # only links under these domains are extracted
    deny_domains = (),      # links under these domains are never extracted
    deny_extensions = (),   # file extensions to skip when extracting links
    restrict_xpaths = (),   # only extract links inside regions matched by these XPaths; works together with allow
    tags = (),              # tag names to extract links from
    attrs = (),             # tag attributes to read link URLs from
    canonicalize = (),      # whether to canonicalize extracted URLs
    unique = True,          # drop duplicate link requests
    process_value = None    # callable applied to each extracted value
)
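
As a quick sanity check, a LinkExtractor can also be tried out interactively in the Scrapy shell; the allow pattern here is only an illustration:

# Inside a shell session started with:  scrapy shell http://baidu.com
from scrapy.linkextractors import LinkExtractor

# Keep only links whose URL matches the (illustrative) pattern 'news'
link_extractor = LinkExtractor(allow=(r'news',))
for link in link_extractor.extract_links(response):   # returns a list of Link objects
    print(link.url, link.text)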

3. Crawl rules: rules

rules = [
    Rule(
        link_extractor,         # a LinkExtractor object
        callback=None,          # callback invoked for each response produced by the extracted links; never name it 'parse'
        cb_kwargs=None,         # dict of keyword arguments passed to the callback
        follow=None,            # boolean; whether to keep following links from each response
        process_links=None,     # callable that filters the link list returned by the link extractor; called for every extracted list
        process_request=None    # callable that filters requests; called for every extracted request
    )
]
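
For example, process_links lets you filter or rewrite the extracted links before requests are scheduled. The sketch below reuses the illustrative skeleton from section 1 and drops links to a hypothetical /login page:

# Sketch: filtering extracted links with process_links; names are illustrative.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class FilteredSpider(CrawlSpider):
    name = 'filtered_example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        Rule(LinkExtractor(allow=(r'page=\d+',)),
             callback='parse_item',
             process_links='drop_login_links',  # spider method name, called for every link list
             follow=True),
    )

    def drop_login_links(self, links):
        # 'links' is the list the LinkExtractor produced for one response
        return [link for link in links if '/login' not in link.url]

    def parse_item(self, response):
        yield {'url': response.url}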

4. How to run a spider directly in PyCharm

1. Create a start.py file in the project root

# -*- coding:utf-8 -*-
from scrapy import cmdline  # import Scrapy's command-line interface
cmdline.execute('scrapy crawl dang'.split())
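
Alternatively, instead of going through cmdline, the crawl can be driven programmatically with CrawlerProcess; this is only a sketch using the same spider name as above:

# Alternative start.py: run the spider programmatically with CrawlerProcess.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('dang')   # spider name, same as in 'scrapy crawl dang'
process.start()         # blocks until the crawl finishes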

2. As shown in the screenshots below

(Screenshot: click Edit Configurations)
(Screenshot: add the Python file)
(Screenshot: after configuring, click OK)
(Screenshot: click Run)

After all this configuration it turns out that you can simply run start.py directly; none of the extra setup is actually needed.

5. Using CrawlSpider to crawl Python job listings from liepin.com

  • Create the project
scrapy startproject liep
  • Generate the spider file automatically
scrapy genspider lp liepin.com
  • items.py
# -*- coding: utf-8 -*-

import scrapy


class LiepItem(scrapy.Item):

    name = scrapy.Field()
    company = scrapy.Field()
    salary = scrapy.Field()
    address = scrapy.Field()
    # required working experience
    experience = scrapy.Field()
  • pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class LiepPipeline(object):
    def __init__(self):
        self.file = open('liepin.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(text + '\n')
        print('QAQ ----> writing data')
        return item

    def close_spider(self, spider):
        self.file.close()
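
As a side note, for a quick test the custom pipeline can be skipped entirely and Scrapy's built-in feed export used instead:

scrapy crawl lp -o liepin.json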
  • lp.py
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from liep.items import LiepItem
import re

class LpSpider(CrawlSpider):
    reg = re.compile(r'\s*')
    name = 'lp'
    allowed_domains = ['www.liepin.com']
    start_urls = ['https://www.liepin.com/zhaopin/?pubTime=&ckid=6f6956c5d999c17e&fromSearchBtn=2&compkind=&isAnalysis=&init=-1&searchType=1&dqs=020&industryType=&jobKind=&sortFlag=15&degradeFlag=0&industries=040&salary=0%240&compscale=&key=python&clean_condition=&headckid=7a006343bdb04f47&curPage=0',]

    # Rule for extracting the pagination links to follow
    page_link = LinkExtractor(allow=(r'&curPage=\d+',))
    # Crawl rules: a tuple of Rule objects
    rules = (
        Rule(page_link, callback='parse_content', follow=True),
    )

    # Callback that parses each listing page
    def parse_content(self, response):
        # Select the region containing the data we need
        job_list = response.xpath('//div[@class="job-info"]')
        for job in job_list:
            # A fresh Item for each job posting
            item = LiepItem()
            name = job.xpath('.//h3/a')
            item['name'] = self.reg.sub('', name.xpath('string(.)').extract()[0])
            item['company'] = job.xpath('.//p[@class="company-name"]/a/text()').extract()
            item['salary'] = job.xpath('.//span[@class="text-warning"]/text()').extract()
            item['address'] = job.xpath('.//p[@class="condition clearfix"]//a/text()').extract()
            item['experience'] = job.xpath('.//p[@class="condition clearfix"]//span[3]/text()').extract()

            yield item
  • settings.py
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
}

# Uncomment ITEM_PIPELINES and register this project's pipeline
ITEM_PIPELINES = {
   'liep.pipelines.LiepPipeline': 300,
}
  • The crawl results, liepin.json
{
  "salary": "12-24万",
  "company": "嗨皮(上海)网络科技股份有限公司",
  "name": "python开发工程师",
  "experience": "3年工作经验",
  "address": "上海"
}{
  "salary": "14-28万",
  "company": "第一弹",
  "name": "Python后端开发",
  "experience": "3年工作经验",
  "address": "上海"
}{
  "salary": "12-18万",
  "company": "易路软件",
  "name": "Python中级开发工程师",
  "experience": "3年工作经验",
  "address": "上海-闵行区"
}{
  "salary": "11-21万",
  "company": "信用飞/首付游",
  "name": "Python开发工程师(风控方向)",
  "experience": "1年工作经验",
  "address": "上海-徐汇区"
}{
  "salary": "13-24万",
  "company": "联车科技",
  "name": "python开发",
  "experience": "3年工作经验",
  "address": "上海"
}{
  "salary": "12-24万",
  "company": "寻仟信息",
  "name": "Python开发工程师",
  "experience": "1年工作经验",
  "address": "上海"
}{
  "salary": "12-22万",
  "company": "ifuwo",
  "name": "Python开发工程师",
  "experience": "1年工作经验",
  "address": "上海-浦东新区"
}{
  "salary": "12-24万",
  "company": "小葫芦",
  "name": "python开发工程师",
  "experience": "1年工作经验",
  "address": "上海"
}{
  "salary": "14-24万",
  "company": "ifuwo",
  "name": "python后台工程师",
  "experience": "2年工作经验",
  "address": "上海-浦东新区"
}{
  "salary": "面议",
  "company": "森浦资讯",
  "name": "Python开发工程师",
  "experience": "2年工作经验",
  "address": "上海"
}{
  "salary": "14-24万",
  "company": "优刻得",
  "name": "OPL-python运维开发",
  "experience": "2年工作经验",
  "address": "上海"
}{
  "salary": "面议",
  "company": "上海聪牛金融信息服务有限公司",
  "name": "python开发工程师",
  "experience": "2年工作经验",
  "address": "上海"
}{
  "salary": "12-30万",
  "company": "进馨网络",
  "name": "python开发工程师",
  "experience": "3年工作经验",
  "address": "上海"
}{
  "salary": "12-18万",
  "company": "载信软件",
  "name": "Python工程师",
  "experience": "1年工作经验",
  "address": "上海"
}{
  "salary": "14-24万",
  "company": "优刻得",
  "name": "OPL-python运维开发J10605",
  "experience": "1年工作经验",
  "address": "上海"
}{
  "salary": "10-24万",
  "company": "上海霄骋信息科技有限公司",
  "name": "Python爬虫开发工程师",
  "experience": "2年工作经验",
  "address": "上海"
}{
  "salary": "面议",
  "company": "五五海淘",
  "name": "Python",
  "experience": "1年工作经验",
  "address": "上海"
}
.................
.................

6. Using middleware to set request headers and proxies

(Screenshot: the Scrapy API documentation's description of downloader middleware)

  • settings.py
# -*- coding: utf-8 -*-



BOT_NAME = 'tea'

SPIDER_MODULES = ['tea.spiders']
NEWSPIDER_MODULE = 'tea.spiders'

# Write the program's log output to the specified file
LOG_FILE = 's.log'
# Log level: DEBUG is the most verbose and records everything -- then INFO, WARNING...
# verbose <DEBUG> -> summary <INFO> -> warnings <WARNING> -> errors <ERROR> ...
LOG_LEVEL = 'INFO'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tea (+http://www.yourdomain.com)'
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) "
    ]




# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   # 'tea.middlewares.MyCustomDownloaderMiddleware': 543,
    'tea.middlewares.UseragentMiddleware': 543,
    'tea.middlewares.ProxyMiddleware':600,
}

PROXY = [
    {"ip_port":"178.62.47.236:80"},
    {"ip_port":"125.77.25.116:80"},
    {"ip_port":"13.58.249.76:8080"},
    {"ip_port":"37.204.253.2:8081"},
    {"ip_port":"78.47.174.243:3128"},
    {"ip_port":"139.59.235.243:3128", "user_password":"admin:123123"}
]

  • middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random
import base64
from tea.settings import USER_AGENTS, PROXY

# A custom downloader middleware -- it must be enabled in settings.py to take effect
class UseragentMiddleware(object):
    # process_request handles every outgoing request; the first argument is the request
    # being processed, the second is the spider.
    # It must return None or a Request: None means processing is done and the request is
    # handed on to the next middleware; a Request is sent back to the engine and re-queued.
    def process_request(self, request, spider):
        print('----QAQ-----')
        # Pick a random User-Agent
        useragent = random.choice(USER_AGENTS)
        # Add the User-Agent header to the request
        request.headers.setdefault('User-agent', useragent)
        print('---->headers successful')
        return None

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        print('------->-_-')
        proxy = random.choice(PROXY)
        # Attach the proxy to the request
        print(proxy['ip_port'], proxy.get('user_password', None))
        request.meta['proxy'] = proxy.get('ip_port')

        # Proxy authentication (only if credentials are configured)
        if proxy.get('user_password', None):
            # base64 works on bytes in Python 3, hence the encode/decode round-trip
            b64 = base64.b64encode(proxy.get('user_password').encode('utf-8')).decode('utf-8')
            print(b64)
            request.headers['Proxy-Authorization'] = 'Basic ' + b64
            print('======proxy======')

(Screenshot: the request headers and the proxy IP have been added)

7. Crawling product details from meici.com and storing them in a database

The code is as follows:
(Screenshot: project structure)

  • items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class MeiciItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class JsArticleItem(scrapy.Item):
    brand = scrapy.Field()
    productitle = scrapy.Field()
    price = scrapy.Field()
    color = scrapy.Field()
    szie = scrapy.Field()
    proimg = scrapy.Field()
    prodata = scrapy.Field()
    brandstory = scrapy.Field()
    brandimg = scrapy.Field()
    meiciid = scrapy.Field()
  • middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class MeiciSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
  • pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import pymysql
from meici import settings

class MeiciPipeline(object):
    def process_item(self, item, spider):
        return item

class WebcrawlerScrapyPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host=settings.MYSQL_HOST,
            db=settings.MYSQL_DBNAME,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWD,
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        print(item['meiciid'])
        # Only insert the product if its meiciid is not already in the table
        self.cursor.execute("""select meiciid from goods where meiciid = %s;""", (item['meiciid'],))

        ret = self.cursor.fetchone()

        if not ret:
            self.cursor.execute(
                """insert into goods(brand,productitle,price,color,
                szie,proimg,prodata,brandstory,brandimg,meiciid) 
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);""",
                (item['brand'],
                 item['productitle'],
                 item['price'],
                 item['color'],
                 item['szie'],
                 item['proimg'],
                 item['prodata'],
                 item['brandstory'],
                 item['brandimg'],
                 item['meiciid']))
            self.connect.commit()
            print("Product saved successfully")
        return item
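
The pipeline assumes that a goods table already exists in the meici database. A hypothetical one-off script to create it might look like the sketch below; the column types are assumptions, not taken from the original article.

# Hypothetical helper to create the goods table the pipeline expects.
# Column types are assumptions; adjust them to your data.
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='960226',
                       db='meici', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS goods (
        id INT AUTO_INCREMENT PRIMARY KEY,
        brand VARCHAR(255),
        productitle VARCHAR(255),
        price VARCHAR(64),
        color VARCHAR(64),
        szie VARCHAR(255),
        proimg TEXT,
        prodata TEXT,
        brandstory TEXT,
        brandimg VARCHAR(255),
        meiciid VARCHAR(64)
    ) DEFAULT CHARSET=utf8;
""")
conn.commit()
conn.close()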
  • settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for meici project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'meici'

SPIDER_MODULES = ['meici.spiders']
NEWSPIDER_MODULE = 'meici.spiders'

# MySQL database configuration
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'meici'          # database name; change as needed
MYSQL_USER = 'root'             # database user; change as needed
MYSQL_PASSWD = '960226'         # database password; change as needed

MYSQL_PORT = 3306               # database port (used in dbhelper)



# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'meici (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'meici.middlewares.MeiciSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'meici.middlewares.MyCustomDownloaderMiddleware': 543,
# }
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'meici.pipelines.WebcrawlerScrapyPipeline': 300,  # save to the MySQL database
    'meici.pipelines.MeiciPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
  • meicispider.py
# -*- coding: utf-8 -*-
import scrapy
import re
import json
from meici import items
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Meicispider(CrawlSpider):
    name = 'meici'
    allowed_domains = ['meici.com']
    start_urls = ['http://www.meici.com/product/detail/id/300251/saleid/156692.html',
                  ]
    rules = (
        Rule(LinkExtractor(allow=(r'/product/\w+/id/\d+/saleid/\d+\.html',)), callback='parse_item', follow=True),
    )


    def parse_item(self, response):
        reg = re.compile(r'\s*')
        xml = response
        brand = xml.xpath('//*[@id="content"]/div/div[1]/div[2]/h1/a/text()').extract()[0]
        productitle = xml.xpath('//*[@id="content"]/div/div[1]/div[2]/div[1]/div/text()').extract()[0]
        price = xml.xpath('//*[@id="content"]/div/div[1]/div[2]/div[2]/div/div/span/em/text()').extract()[0]
        # a = re.compile('class="colorcur" color-id="(\d*)" title="(.*)">')
        # color = re.findall(a, response)

        color = xml.xpath('//li[@class="colorcur"]/@title').extract()[0]
        szie = xml.xpath('//div[@class="pro_size"]//ul/li/a/text()').extract()
        proimg = xml.xpath('//div[@class="proImg"]//img/@src').extract()
        prodata1 = xml.xpath('//div[@class="proTableinfo"]//th//text()').extract()
        prodata2 = xml.xpath('//div[@class="proTableinfo"]//td//text()').extract()
        brandstory = xml.xpath('//div[@class="proBrand_l"]/p/text()').extract()[0]
        brandimg = xml.xpath('//div[@class="proBrand_r"]/img/@src').extract()[0]
        # print brandStory
        meiciid = xml.xpath('//td[@class="product_sku"]/text()').extract()[0]

        # print brand,productitle,price
        # print color,szie
        # print proimg
        # print len(prodata1),len(prodata2)
        # print brandstory
        # print brandimg
        # print meiciid
        del prodata2[9]
        del prodata2[10]
        key = []
        for i in prodata1:
            # i = "'" + i + "'"
            i=reg.sub("",i)
            key.append(i)
        value = []
        for j in prodata2:
            # j = "'" + j + "'"
            j = reg.sub("", j)
            value.append(j)
        prodata = dict(zip(key, value))
        prodata = json.dumps(prodata, ensure_ascii=False)
        # print prodata

        item = items.JsArticleItem()
        item['brand'] = brand
        item['productitle'] = productitle
        item['price'] = price
        item['color'] = color
        item['szie'] = str(szie)
        item['proimg'] = str(proimg)
        item['prodata'] = prodata
        item['brandstory'] = brandstory
        item['brandimg'] = brandimg
        item['meiciid'] = meiciid

        yield item

If you found this article useful, you can follow my WeChat official account: Python攻城狮.


    Original author: 一只写程序的猿
    Original article: https://www.jianshu.com/p/1476a181fc57
    This article was reposted from the web to share knowledge only; if it infringes any rights, please contact the blogger to have it removed.