This article uses the Scrapy framework to crawl all of the novels on the Dingdian novel site (23us.com).
1. Installing Scrapy
There are plenty of installation tutorials online, so I won't repeat them here.
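For reference, Scrapy can usually be installed with pip (this assumes pip is already set up; the project in this article is written for Python 2):
pip install scrapy
pip install beautifulsoup4 lxml
The second line installs BeautifulSoup and lxml, which the pipeline further below also uses.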
2. About Scrapy
The Scrapy framework is excellent: it crawls asynchronously, which saves a lot of time. This crawl could have been done with the same plain, sequential approach as before, but that felt far too slow given the amount of data involved.
For an introduction to the framework itself, there are likewise plenty of examples online.
3. The implementation
Create the project with:
scrapy startproject dingdian
Then add the spider file (written by hand, or generated with the genspider command shown after the tree); the final directory layout is:
├── dingdian
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── mydingdian.py
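The spider file can be created by hand, or a skeleton can be generated with Scrapy's standard genspider command and then replaced with the code below (the file name and domain here simply mirror the ones used in this project):
cd dingdian
scrapy genspider mydingdian 23us.com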
The main program:
mydingdian.py
#coding:utf-8
import scrapy
import re
from scrapy.http import Request
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem

class Myspider(scrapy.Spider):
    name = "dingdian"
    allowed_domains = ["23us.com"]
    bash_url = "http://www.23us.com/class/"
    bashurl = '.html'

    def start_requests(self):
        #for i in range(1,11):
        for i in range(7, 8):
            url = self.bash_url + str(i) + "_1" + self.bashurl
            yield Request(url, self.parse)

    def parse(self, response):
        baseurl = response.url  # the url here looks like http://www.23us.com/class/*_1.html
        max_num = response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()  # maximum page number of the current category
        print max_num
        baseurl = baseurl[:-7]
        #for num in xrange(1,int(max_num)+1):
        for num in xrange(1, 3):
            newurl = baseurl + "_" + str(num) + self.bashurl
            # The behaviour with and without dont_filter differs: with dont_filter the first page is crawled, without it it is not.
            # Scrapy deduplicates request URLs (RFPDupeFilter); dont_filter tells it not to deduplicate this URL.
            yield Request(newurl, dont_filter=True, callback=self.get_name)  # hand the new page url over to get_name for processing

    def get_name(self, response):
        for nameinfo in response.xpath('//tr'):
            novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()  # novel url
            name = nameinfo.xpath('td[1]/a/text()').extract_first()  # novel name
            if novelurl:
                yield Request(novelurl, dont_filter=True, callback=self.get_novelcontent, meta={'name': name})
        '''
        # Alternatively, grab the novel details on the current listing page:
        #print nameinfo
        name = nameinfo.xpath('td[1]/a/text()').extract_first()  # novel name
        author = nameinfo.xpath('td[3]/text()').extract_first()  # author
        novelurl = nameinfo.xpath('td[1]/a/@href').extract_first()  # novel url
        serialstatus = nameinfo.xpath('td[6]/text()').extract_first()  # status
        serialnumber = nameinfo.xpath('td[4]/text()').extract_first()  # word count
        if novelurl:
            targentcontent['novel_name'] = name
            targentcontent['author'] = author
            targentcontent['novelurl'] = novelurl
            targentcontent['serialstatus'] = serialstatus
            targentcontent['serialnumber'] = serialnumber
            #print name,author,novelurl,serialstatus,serialnumber
            yield Request(novelurl, callback=self.get_novelcontent, meta={'targentcontent': targentcontent})
        The novel details do not need to be passed along at this stage.
        '''

    def get_novelcontent(self, response):
        #targentcontent=response.meta['targentcontent']
        #print targentcontent['novelurl'],targentcontent['name']
        #title = response.xpath('//dd[1]/h1/text()').extract_first()
        novel_name = response.meta['name']  # novel name
        author = response.xpath('//tr[1]/td[2]/text()').extract_first()  # author
        novelurl = response.url  # novel url
        serialstatus = response.xpath('//tr[1]/td[3]/text()').extract_first()  # status
        serialnumber = response.xpath('//tr[2]/td[2]/text()').extract_first()  # word count
        category = response.xpath('//tr[1]/td[1]/a/text()').extract_first()  # category
        name_id = novelurl[-5:]  # novel id (last part of the url)
        collect_num_total = response.xpath('//tr[2]/td[1]/text()').extract_first()  # total bookmarks
        click_num_total = response.xpath('//tr[3]/td[1]/text()').extract_first()  # total clicks
        #chapterlistul=response.xpath('//dd[2]/div[2]/p[2]/a/text()').extract_first()
        chapterlisturl = response.xpath('//dd[2]/div[2]/p[2]/a/@href').extract_first()  # chapter list url
        novel_breif = response.xpath('//dd[2]/p[2]').extract_first()  # synopsis (raw html of the paragraph)
        targentcontent = DingdianItem()
        targentcontent['novel_name'] = novel_name
        targentcontent['author'] = author
        targentcontent['novelurl'] = novelurl
        targentcontent['serialstatus'] = serialstatus
        targentcontent['serialnumber'] = serialnumber
        targentcontent['category'] = category
        targentcontent['name_id'] = name_id
        targentcontent['collect_num_total'] = collect_num_total
        targentcontent['click_num_total'] = click_num_total
        targentcontent['novel_breif'] = novel_breif
        #yield targentcontent
        #print novel_name,author,novelurl,serialstatus,serialnumber,category,name_id,collect_num_total,click_num_total,chapterlisturl
        yield Request(chapterlisturl, dont_filter=True, callback=self.get_charaterurl, meta={'targentcontent': targentcontent})

    def get_charaterurl(self, response):
        #print response.url
        item = response.meta['targentcontent']
        for contents in response.xpath('//table/tr'):
            for content in contents.xpath('td'):
                if content.xpath('a/text()').extract_first():
                    #print content.xpath('a/text()').extract_first()
                    item['chapterurl'] = response.url + content.xpath('a/@href').extract_first()  # chapter url
                    item['chaptername'] = content.xpath('a/text()').extract_first()  # chapter title
                    yield item
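If any of the XPath expressions above stop matching (the page layout assumed here may have changed), they can be tested interactively with scrapy shell; the URL below is just the first category page built by start_requests:
scrapy shell "http://www.23us.com/class/7_1.html"
# then, at the shell prompt:
response.xpath('//*[@id="pagelink"]/a[14]/text()').extract_first()  # max page number
response.xpath('//tr/td[1]/a/@href').extract()  # novel links on the listing page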
The stored fields are defined in items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class DingdianItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    novel_name = scrapy.Field()         # novel name
    author = scrapy.Field()             # author
    novelurl = scrapy.Field()           # novel url
    serialstatus = scrapy.Field()       # status
    serialnumber = scrapy.Field()       # word count
    category = scrapy.Field()           # category
    name_id = scrapy.Field()            # novel id
    collect_num_total = scrapy.Field()  # total bookmarks
    click_num_total = scrapy.Field()    # total clicks
    novel_breif = scrapy.Field()        # synopsis
    chapterurl = scrapy.Field()         # chapter url
    chaptername = scrapy.Field()        # chapter title
The settings, settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for dingdian project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'dingdian'
SPIDER_MODULES = ['dingdian.spiders']
NEWSPIDER_MODULE = 'dingdian.spiders'
PAGE_STORGE = "novels"  # custom setting: root directory where the pipeline saves the downloaded novels
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dingdian (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'dingdian.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'dingdian.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dingdian.pipelines.DingdianPipeline': 100,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Finally, the data processing and storage:
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from dingdian import settings
import os
import urllib2
from dingdian.items import DingdianItem
#from dingdian.items import DDNovelContentItem
from bs4 import BeautifulSoup as bs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class DingdianPipeline(object):
    def process_item(self, item, spider):
        dir_path = "%s/%s" % (settings.PAGE_STORGE, spider.name)
        if not os.path.exists(dir_path):
            #print "dir_path is %s" % dir_path
            os.makedirs(dir_path)
        if isinstance(item, DingdianItem):
            novelpath = dir_path + '/' + item['novel_name']
            print novelpath
            if not os.path.exists(novelpath):
                os.makedirs(novelpath)
            # write the novel info file once per novel
            novelbreif = item['novel_name'] + "_简介"
            novelbreifpath = novelpath + '/' + novelbreif + '.txt'
            if not os.path.exists(novelbreifpath):
                with open(novelbreifpath, 'wb') as novel_write:
                    novel_write.write(item['novel_name'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['author'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['novelurl'])
                    novel_write.write('\n')
                    novel_write.write(item['serialstatus'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['serialnumber'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['category'])
                    novel_write.write('\n')
                    novel_write.write(item['name_id'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['collect_num_total'])
                    novel_write.write('\t|\t')
                    novel_write.write(item['click_num_total'])
                    novel_write.write('\n')
                    novel_write.write(item['novel_breif'])
            # download the chapter page and save its text under the novel's directory
            titlename = item['chaptername']
            titlenamepath = novelpath + '/' + titlename + '.txt'
            print titlenamepath
            chapterurl = item['chapterurl']
            html = urllib2.urlopen(chapterurl).read()
            soup1 = bs(html, 'lxml')
            if not os.path.exists(titlenamepath):
                with open(titlenamepath, 'wb') as file_write:
                    cont = soup1.find("dd", attrs={"id": "contents"}).getText()
                    #print cont
                    file_write.write(cont)
        return item
The -o books.csv option tells Scrapy to export the scraped items to a CSV file. Besides CSV, Scrapy also supports JSON and XML output.
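For example, any of the following runs the spider and also exports the items through Scrapy's built-in feed exporters:
scrapy crawl dingdian -o books.csv
scrapy crawl dingdian -o books.json
scrapy crawl dingdian -o books.xml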
To run the full crawl:
scrapy crawl dingdian
If there are no errors, wait a few hours and you will find a large pile of novels sitting on your computer.