爬取币世界标红快讯内容(移动版)
# 引入依赖
from lxml import etree
import requests
import pymongo
import time
# MongoDB connection. Replace the host placeholder below with your own server
# address; a reachable MongoDB installation is required.
client = pymongo.MongoClient('写你自己的数据库地址', 27017)
mydb = client['mydb']
information = mydb['information']  # collection the scraped items are stored in

# Read the clock once so that both strings below describe the same instant
# (two separate localtime() calls could straddle a second/hour rollover).
_now = time.localtime()
currentTime = time.strftime("%m%d%H", _now)  # month+day+hour key, 6 characters
saveTime = time.strftime("%Y-%m-%d %H:%M:%S", _now)  # timestamp stored with each item

# Present the request as coming from mobile Safari on an iPhone.
header = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
def get_url(url, state_path=r'C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest'):
    """Fetch one listing page and store its highlighted news items in MongoDB.

    The page is parsed for ``section.focus`` elements under the
    ``kuaixun_list`` container. The id of the first such element's parent is
    written, together with ``currentTime``, to ``state_path``. Every item
    whose id is greater than that id minus 20 is inserted into the
    ``information`` collection.

    Args:
        url: Address of the listing page to download.
        state_path: File that receives the ``"<currentTime> <id>"`` marker.
            Defaults to the path the script has always used; replace it with
            your own location.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If an element id is not an integer string.
    """
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status()
    selector = etree.HTML(response.text)
    infos = selector.xpath('//div[@id="kuaixun_list"]/div/article/section[@class="focus"]')
    if not infos:
        # No highlighted items on this page: nothing to record or store.
        return

    newest_ids = infos[0].xpath('../@id')
    if not newest_ids:
        return
    saveId = newest_ids[0]

    # Record the hour key and the first item's id for the next run.
    with open(state_path, 'w', encoding='utf-8') as state_file:
        state_file.write(currentTime + ' ' + saveId)

    # NOTE(review): the cutoff is derived from this page's own first id, so
    # the same items qualify on every run and will be inserted again unless
    # the collection enforces uniqueness -- confirm whether that is intended.
    threshold = int(saveId) - 20

    for info in infos:
        titles = info.xpath('h3[@class="text_title"]/text()')
        contents = info.xpath('p[@class="text_show"]/text()')
        ids = info.xpath('../@id')
        if not (titles and contents and ids):
            # Incomplete entry: skip it quietly.
            continue

        infoId = ids[0]
        data = {
            'title': titles[0].strip(),
            'id': infoId,
            'date': saveTime,
            'content': contents[0].strip(),
            'source': 'bishijie'
        }
        print(data)
        if int(infoId) > threshold:
            print('插入了一条新数据!')
            information.insert_one(data)
        else:
            print('无新数据产生!')
if __name__ == '__main__':
    # Read the "<MMDDHH> <id>" marker left by the previous run. The file is
    # closed before crawling because get_url() rewrites it. A missing file
    # (first run) is treated as an empty marker instead of crashing.
    # Replace the path below with your own file location.
    try:
        with open('C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest', 'r', encoding='utf-8') as fs:
            line = fs.read()
    except FileNotFoundError:
        line = ''

    # The first six characters hold the hour key written by the last run.
    fileDate = line[0:6]
    if fileDate != currentTime:
        print('时间不一致,宕机使用当前系统时间进行爬取!')
    else:
        print('时间一致, 正常运行!')

    # Both cases crawl the same address; only the message above differs.
    urls = ['http://m.bishijie.com/kuaixun?fm=' + currentTime]
    for url in urls:
        get_url(url)
        time.sleep(2)  # pause between requests
主要需要掌握的内容:XPath 语法、Python 文件操作、Python 基础语法
本文内容比较基础,写得不好,请多多指教!大家一起进步!
我的其他关于python的文章
Python爬虫入门
Python爬虫之使用MongoDB存储数据