Installing Scrapy
pip3 install Scrapy
(on macOS)
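If the install succeeded, the scrapy command is on your PATH. A quick sanity check (the exact version printed depends on what pip installed):

scrapy version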
Building a Scrapy spider takes four steps (the sketch after this list maps each step to a command):
- Create a new project
- Define the target: decide which data you want to scrape (the Item fields)
- Build the spider: write the spider and let it crawl the pages
- Store the content: design an item pipeline to persist the scraped data
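A rough end-to-end sketch of those four steps as shell commands. The project name ITcast and spider name itcast are the ones used later in this tutorial:

scrapy startproject ITcast              # 1. create a new project
cd ITcast
# 2. define the target: declare fields in ITcast/items.py
scrapy genspider itcast itcast.cn       # 3. create the spider, then write its parse() logic
scrapy crawl itcast -o teachers.json    # 4. run it; -o serializes the scraped items to a file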
Command reference
After installing, run scrapy in a terminal with no arguments; it prints the version and a summary of the available commands:
scrapy bench
: run a quick benchmark of Scrapy's performance on your machine
scrapy fetch <url>
: download the page at the given URL and dump it to stdout
scrapy genspider
: generate a new spider from a template
scrapy runspider
: run a standalone spider file (without needing a project)
scrapy shell
: open an interactive console for inspecting a page and testing selectors
vi settings.py
: open the project's settings file in an editor
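A few concrete invocations (the URL is the teacher-listing page scraped later in this tutorial; treat the file names as illustrative):

scrapy bench
scrapy fetch "http://www.itcast.cn/channel/teacher.shtml" > teacher.html
scrapy genspider itcast itcast.cn        # creates itcast.py from the basic template
scrapy runspider itcast.py               # run that single file without scrapy crawl
scrapy shell "http://www.itcast.cn/channel/teacher.shtml"
# inside the shell, the downloaded response is already available:
# >>> response.xpath("//div[@class='li_txt']/h3/text()").extract_first()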
Step-by-step walkthrough
Create a new project
scrapy startproject <project-name>
This automatically generates the project skeleton.
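For a project named ITcast, the generated layout looks like this (Scrapy 1.x; older versions may lack middlewares.py):

ITcast/
    scrapy.cfg                # deployment configuration, marks the project root
    ITcast/
        __init__.py
        items.py              # declare the data fields to scrape
        middlewares.py        # spider and downloader middlewares
        pipelines.py          # post-process and store scraped items
        settings.py           # project settings
        spiders/              # your spider code lives here
            __init__.py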
A walkthrough of the generated files:
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ItcastItem(scrapy.Item):  # an Item declares the fields your spider will fill in
    # define the fields for your item here like:
    # name = scrapy.Field()
    # teacher's name
    name = scrapy.Field()
    # teacher's job title
    title = scrapy.Field()
    # teacher's bio
    info = scrapy.Field()
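An Item behaves like a dict with a fixed set of allowed keys. A quick sketch of how ItcastItem is used (the sample value is made up):

item = ItcastItem()
item['name'] = 'Teacher Zhang'   # assign a declared field
print(item['name'])              # read it back like a dict
# item['age'] = 18               # would raise KeyError: 'age' is not a declared field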
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for ITcast project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'ITcast'
SPIDER_MODULES = ['ITcast.spiders'] # where Scrapy looks for spiders (configured by default)
NEWSPIDER_MODULE = 'ITcast.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ITcast (+http://www.yourdomain.com)'
# Obey robots.txt rules # whether the crawler respects the target site's robots.txt
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 # maximum concurrent requests (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 # delay between downloads, in seconds
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 # max concurrent requests per domain
#CONCURRENT_REQUESTS_PER_IP = 16 # max concurrent requests per IP
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers: # headers sent with every request
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = { # spider middlewares
# 'ITcast.middlewares.ItcastSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = { # downloader middlewares, a dict of {path: priority}
# 'ITcast.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = { # extensions (e.g., the telnet console for monitoring)
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { # item pipelines, a dict of {path: priority}
# 'ITcast.pipelines.ItcastPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
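In practice, a small project like this only uncomments and tweaks a handful of these. A minimal sketch (the user-agent string is just an example, and ItcastPipeline is the class Scrapy generates in pipelines.py):

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36'  # identify as a browser
DOWNLOAD_DELAY = 1                 # be polite: wait one second between requests
ITEM_PIPELINES = {
    'ITcast.pipelines.ItcastPipeline': 300,   # lower number = runs earlier (0-1000)
}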
itcast.py
# -*- coding: utf-8 -*-
import scrapy
from ITcast.items import ItcastItem


class ItcastSpider(scrapy.Spider):
    name = 'itcast'                  # spider name; required when launching (scrapy crawl itcast)
    allowed_domains = ['itcast.cn']  # optional; URLs outside these domains are not crawled
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']  # URLs to start from; any iterable works

    def parse(self, response):       # called with the response once each URL has been downloaded
        node_list = response.xpath("//div[@class='li_txt']")
        items = []
        for node in node_list:
            item = ItcastItem()
            # xpath() always returns a list of selectors;
            # .extract() converts them to Unicode strings
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            items.append(item)
        return items
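To launch the spider and store what parse() returns, run this from the project root (the -o flag serializes items; the format is inferred from the file extension):

scrapy crawl itcast -o teachers.json

Instead of collecting items in a list and returning it, spiders more commonly yield each item inside the loop, which streams items to the pipelines as they are scraped. For step 4 (storing content through a pipeline rather than -o), here is a minimal sketch of pipelines.py, assuming the ITEM_PIPELINES entry shown in the settings sketch above is enabled:

# -*- coding: utf-8 -*-
import json

class ItcastPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.f = open('teachers.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # called for every item the spider returns/yields
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # pass the item on to any later pipelines

    def close_spider(self, spider):
        # called once when the spider finishes
        self.f.close()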