#!/usr/bin/python
# coding:utf-8
# 多进程爬虫
import requests
import re
import time
import multiprocessing
import threading
# Directory that downloaded images are written into.
# NOTE(review): "threa_imgs" looks like a typo for "thread_imgs", but renaming
# it changes where files land on disk — confirm before fixing.
SAVE_PATH = "./threa_imgs/"
# Compiled regex that captures every .jpg image URL from Tieba post HTML.
jpgReg = re.compile(r'<img class="BDE_Image" src="(.+?\.jpg)"')
# Fetch the HTML content of a page by URL.
def getHtmlContent(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    A timeout is set so a stalled server cannot hang the crawler
    indefinitely (requests waits forever by default).
    """
    page = requests.get(url, timeout=10)
    page.encoding = "UTF-8"
    return page.text
# Extract all .jpg image URLs from an HTML document.
def getJPGs(html):
    """Return the list of .jpg URLs matched by ``jpgReg`` in *html*."""
    return jpgReg.findall(html)
def getSavePath():
    """Return the directory images are saved into, creating it if needed.

    Aborts the program when the directory cannot be created.
    """
    # issetDir creates the directory when missing and reports success.
    if not issetDir(SAVE_PATH):
        print("图片保存文件夹创建失败")
        # Same effect as exit(), but does not depend on the `site` module.
        raise SystemExit
    return SAVE_PATH
def load_img(imgurl, file):
    """Download the single image at *imgurl* into the directory *file*."""
    # The last path segment of the URL becomes the local file name.
    filename = imgurl.split('/')[-1]
    target = "{}{}".format(file, filename)
    data = requests.get(imgurl).content
    with open(target, 'wb') as out:
        out.write(data)
# Create the directory if it does not already exist.
def issetDir(path):
    """Ensure *path* exists as a directory; return True on success.

    The original checked ``os.path.exists`` and then returned True from
    both branches; ``makedirs(exist_ok=True)`` does the same thing in one
    call and also avoids the race between the existence check and the
    creation.
    """
    import os
    path = path.strip()
    # Drop a trailing backslash so makedirs is not given an empty component.
    path = path.rstrip("\\")
    os.makedirs(path, exist_ok=True)
    return True
def load_imgs(url, file):
    """Download every image on the page at *url* into directory *file*,
    using one thread per image."""
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:49.0) Gecko/20100101 Firefox/49.0',
    }
    html = requests.get(url, headers=headers).text
    # One worker thread per matched image URL.
    workers = [
        threading.Thread(target=load_img, args=(src, file))
        for src in re.findall(jpgReg, html)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(url, 'is ok')
def main():
    """Download the images of each page in a separate worker process."""
    # Wall-clock start, for the timing report printed at the end.
    start_t = time.time()
    url = 'http://tieba.baidu.com/p/2256306796'
    path = getSavePath()
    # Each page is handed to a worker process; inside the process,
    # load_imgs fans out into one download thread per image.
    pool = multiprocessing.Pool(processes=4)
    pool.apply_async(func=load_imgs, args=(url, path))
    pool.close()
    pool.join()
    use_time = time.time() - start_t
    print("多进程需要{} 秒".format(use_time))


if __name__ == '__main__':
    main()
# Multi-process crawl: each worker process spawns per-image download threads.
# Approach adapted from the Hopetree blog; multi-process crawling is much
# faster than a single process.