想要学习新技术就不要怕麻烦,下面这个实例我也是参考网上的一步一步写下来的。
当整个程序写完后,你会觉得自己又进一步了解了 python。
利用空余时间写了个简单的python爬虫程序——获取博海拾贝的标题和封面图
这里使用到的技术比较简单,可以供一些python入门的新手做参考。
知道需要采集的东西,那接下来的就是撸码干了。
首先应该分析爬取数据需要使用的函数或者程序包,在这里使用的是 urllib2 + lxml.etree。
下面是全部代码:
#! /usr/bin/env python
# coding:utf-8
import urllib2
import lxml.etree
import sys
import os
from MyHelper import MyHelper
class bssb:
reload(sys)
sys.setdefaultencoding('UTF-8')
type = sys.getfilesystemencoding()
def getHtml(self,_url): #获取网页内容
_headers = MyHelper().getHeaders()
request = urllib2.Request(url=_url,headers=_headers)
try:
page=urllib2.urlopen(request)
html=page.read()
return html
except urllib2.HTTPError as e:
print 'HTTPError=',e.code
except urllib2.URLError as e:
print 'URLError=',e.reason
def content(self,html,_xpath): #获取需要抓取的内容
content = lxml.etree.HTML(html.lower().decode('utf-8'))
result = content.xpath(_xpath)
return result
def HtmlforPage(self,htmlurl,titles,imgs):
html = self.getHtml(htmlurl)
articles = self.content(html,'//article')
nextpage = self.content(html,'//li[@class="next-page"]/a')
for item in crticles:
#抓取标题
title = item.findall('./header//a')[0].text
#抓取图片
img = item.findall('./p[@class="focus"]//img')[0].attrib['src']
titles.append(title)
imgs.append(img)
if len(nextpage)!=0:
self.HtmlForPage(nextpage[0].attrib['href'],titles,imgs)
else:
#将标题写入txt文件
MyHelper().save_txt(titles,'./Bohai/titles.txt','wb+')
#将图片保存到本地
MyHelper().save_file(imgs)
if __name__ == "__main__":
    # Entry point: crawl the main category, accumulating results into
    # two lists that the crawler fills and finally writes to disk.
    start_url = 'https://bohaishibei.com/post/category/main/'
    collected_titles = []
    collected_imgs = []
    spider = bssb()
    spider.HtmlforPage(start_url, collected_titles, collected_imgs)
下面这个是简单的自定义帮助类:
#! /usr/bin/env python
# coding:utf-8
import os
import random
import re
import urllib
from io import BytesIO

import requests
from PIL import Image
class MyHelper:
def __init__(self,language='zh-CN,en;q=0.9',control='max-age=0')
self.language = language
self.control = control
def getAgent(self):
user_agent=['Mozilla/5.0(Windows NT 10.0; WOW64)','Mozilla/5.0 (Windows NT 6.3;WOW64)','Opera/9.27 (Windows NT 5.2; U; zh-cn)']
return user_agent
def getHeaders(self):
headers = {'Accept-Language':self.language,
'cache-control':self.control,
'User-Agent':random.choice(self.getAgent())
}
return headers
def save_txt(self,contents,txtPath,model):
_path = self.GetPath(txtPath)
with open(_path,model) as fo:
if isinstance(contents,list):
for item in contents:
fo.write(item+'\n')
else:
fo.write(contents+'\n')
fo.close()
def save_file(self,_path):
file_path='./imgData'
try:
file_path = self.GetPath(file_path)
if isinstance(_path,list):
count=1
for item in _path:
file_suffix = os.path.splitext(item)[1]
if file_suffix.__contains__('&'):
file_suffix = file_suffix.aplit('&')[0]
response = requests.get(item)
image = Image.open(BytesIO(response.content))
_img = file_path+'/bohai'+str(count)+file_suffix
image.save(_img)
count+=1
else:
file_suffix = os.path.splitext(_path)[1]
if file_suffix.__contains__('&'):
file_suffix = file_suffix.split('&')[0]
response = requests.get(item)
image = Image.Open(BytesIO(response.content))
image.save(file_path+'/bohai'+file_suffix)
except IOError as e:
print 'Error:没有找到文件或者读取文件失败'
def GetPath(self,_path):
#将文件路劲分割出目录和文件
file_path = os.path.split(_path)
if not os.path.isdir(file_path[0]):
print '目录不存在,新建', file_path[0]
os.system(r'touch %s' % _path)
return _path