Python Web Scraper!

Scrape novels from Qidian (起点中文网). Only the first listing page is crawled here; the same approach can walk every listing page, but that is far too much data, so this example stops at page one (see the pagination sketch after the code).

Source code:

"""
created on Web Jan 02 2019
@author: Super Huan
"""
# Python爬取起点小说
import requests
from lxml import etree
import os
class spider():
 def startRequest(self):
 response = requests.get('https://www.qidian.com/all')
 html = etree.HTML(response.content.decode())
 bigTitleList = html.xpath('//div[@class="book-mid-info"]/h4/a/text()')
 bigSrcList = html.xpath('//div[@class="book-mid-info"]/h4/a/@href')
 for bigTitle, bigSrc in zip(bigTitleList, bigSrcList):
 if os.path.exists(bigTitle) == False:
 os.mkdir(bigTitle)
 self.fileData(bigTitle, bigSrc)
 def fileData(self, bigTitle, bigSrc):
 response = requests.get('http:' + bigSrc)
 html = etree.HTML(response.content.decode())
 litTitleList = html.xpath('//ul[@class="cf"]/li/a/text()')
 litSrcList = html.xpath('//ul[@class="cf"]/li/a/@href')
 for litTitle, litSrc in zip(litTitleList, litSrcList):
 self.finallyFile(litTitle, litSrc, bigTitle)
 def finallyFile(self, title, url, bigTitle):
 response = requests.get('https:' + url)
 html = etree.HTML(response.content.decode())
 text = '
'.join(html.xpath('//div[@class="read-content j_readContent"]/p/text()'))
 fileName = bigTitle + '/' + title + '.txt'
 print('正在抓取文章', fileName)
 if os.path.exists(fileName) == False:
 with open(fileName, 'a', encoding='utf-8') as f:
 f.write(text)
spider = spider()
spider.startRequest()
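
The class above only walks the first listing page, as noted at the top. Below is a minimal sketch of paginating the crawl; it assumes Qidian's listing follows the https://www.qidian.com/all?page=N pattern, and the max_pages limit and User-Agent header are illustrative additions, not part of the original code.

# Hedged sketch: crawl several listing pages instead of just the first one.
# Assumes Qidian paginates as https://www.qidian.com/all?page=N; max_pages and
# the User-Agent header are illustrative assumptions, not from the original post.
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}  # some sites reject UA-less requests
max_pages = 3                            # keep the example crawl small

for page in range(1, max_pages + 1):
    response = requests.get('https://www.qidian.com/all',
                            params={'page': page}, headers=headers)
    html = etree.HTML(response.content.decode())
    titles = html.xpath('//div[@class="book-mid-info"]/h4/a/text()')
    hrefs = html.xpath('//div[@class="book-mid-info"]/h4/a/@href')
    for title, href in zip(titles, hrefs):
        print(page, title, 'https:' + href)  # hand (title, href) to fileData here

Each (title, href) pair collected this way could be passed straight to fileData, exactly as startRequest does for page one.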




    Original author: 萌新程序员
    Original article: https://zhuanlan.zhihu.com/p/54687966
    This article is reposted from the web to share knowledge only; if there is any infringement, please contact the blogger to have it removed.