爬取起点中文网小说(只爬了第一页的小说,可以爬取所有的小说,但是太多了,就只爬了第一页的小说)
源代码:
"""
Created on Wed Jan 02 2019
@author: Super Huan
"""
# Python爬取起点小说
import requests
from lxml import etree
import os
class spider():
    """Crawl the qidian.com listing page and save every chapter as a text file.

    For each book linked from the listing page a directory named after the
    book is created in the current working directory, and each chapter linked
    from the book's page is written to ``<book>/<chapter>.txt`` (UTF-8).
    """

    # Listing page crawled by startRequest(); also the base for resolving hrefs.
    START_URL = 'https://www.qidian.com/all'
    # Seconds to wait on each HTTP request before giving up.
    TIMEOUT = 10
    # Characters that cannot appear in a file name on common file systems,
    # each mapped to '_'.
    _UNSAFE_CHARS = str.maketrans({c: '_' for c in '\\/:*?"<>|\r\n\t'})

    @classmethod
    def _safe_name(cls, name):
        """Return *name* made usable as a single file or directory name."""
        cleaned = name.strip().translate(cls._UNSAFE_CHARS)
        # Never return an empty component: it would collapse the target path.
        return cleaned or '_'

    @classmethod
    def _absolute_url(cls, src):
        """Resolve a scraped href against START_URL.

        Handles protocol-relative (``//host/path``), site-relative and
        already-absolute links, always inheriting https for the first two.
        """
        from urllib.parse import urljoin
        return urljoin(cls.START_URL, src)

    def _fetch_html(self, url):
        """Download *url* and return the parsed lxml HTML tree.

        Raises:
            requests.RequestException: on timeout, connection failure or a
                non-2xx HTTP status.
        """
        response = requests.get(url, timeout=self.TIMEOUT)
        response.raise_for_status()
        return etree.HTML(response.content.decode())

    @staticmethod
    def _iter_links(html, xpath):
        """Yield ``(text, href)`` for every ``<a>`` element matched by *xpath*.

        Text and href are read from the same element, so they cannot drift
        out of step the way two independently zipped result lists can.
        """
        if html is None:
            # Guard against a missing tree (nothing could be parsed).
            return
        for link in html.xpath(xpath):
            text = ''.join(link.xpath('text()'))
            href = link.get('href')
            if text.strip() and href:
                yield text, href

    def startRequest(self):
        """Crawl the listing page and download every book found on it."""
        html = self._fetch_html(self.START_URL)
        for bigTitle, bigSrc in self._iter_links(
                html, '//div[@class="book-mid-info"]/h4/a'):
            bookDir = self._safe_name(bigTitle)
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(bookDir, exist_ok=True)
            self.fileData(bookDir, bigSrc)

    def fileData(self, bigTitle, bigSrc):
        """Download every chapter linked from one book page.

        Args:
            bigTitle: directory the chapter files are written into.
            bigSrc: href of the book page as scraped from the listing.
        """
        html = self._fetch_html(self._absolute_url(bigSrc))
        for litTitle, litSrc in self._iter_links(html, '//ul[@class="cf"]/li/a'):
            self.finallyFile(litTitle, litSrc, bigTitle)

    def finallyFile(self, title, url, bigTitle):
        """Save one chapter to ``<bigTitle>/<title>.txt`` unless it already exists.

        Args:
            title: chapter title; sanitized before being used as a file name.
            url: href of the chapter page as scraped from the book page.
            bigTitle: existing directory to write the chapter file into.
        """
        fileName = os.path.join(bigTitle, self._safe_name(title) + '.txt')
        # Check before downloading so a re-run does not re-fetch saved chapters.
        if os.path.exists(fileName):
            return
        print('正在抓取文章', fileName)
        html = self._fetch_html(self._absolute_url(url))
        text = '\n'.join(
            html.xpath('//div[@class="read-content j_readContent"]/p/text()'))
        with open(fileName, 'w', encoding='utf-8') as f:
            f.write(text)
if __name__ == '__main__':
    # Run the crawl only when executed as a script, so that importing this
    # module does not start network requests. The instance gets its own name
    # rather than rebinding `spider`, which would shadow the class itself.
    crawler = spider()
    crawler.startRequest()
进群:960410445 获取更多源码案例!