Python 入门爬取图片2.0 多进程+多线程

#!/usr/bin/python
# coding:utf-8
# 多进程爬虫
import requests
import re
import time
import multiprocessing
import threading

# Directory where downloaded images are saved
SAVE_PATH = "./threa_imgs/"
# Regex capturing the src URL of every .jpg <img> tag whose class is BDE_Image
jpgReg = re.compile(r'<img class="BDE_Image" src="(.+?\.jpg)"')

def getHtmlContent(url, timeout=10):
    """Fetch a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    page = requests.get(url, timeout=timeout)
    # Decode as UTF-8 regardless of the encoding requests guessed.
    page.encoding = "UTF-8"
    return page.text

def getJPGs(html):
    """Return the list of jpg image URLs matched by ``jpgReg`` in *html*."""
    return jpgReg.findall(html)

def getSavePath():
    """Return the image save directory, creating it first if needed.

    Returns:
        ``SAVE_PATH`` once ``issetDir`` reports the directory is available.

    Raises:
        SystemExit: With exit status 1 when the directory cannot be
            prepared.
    """
    if not issetDir(SAVE_PATH):
        print("图片保存文件夹创建失败")
        # exit() is injected by the site module for interactive sessions and
        # would report success (status 0); signal the failure explicitly.
        raise SystemExit(1)
    return SAVE_PATH

def load_img(imgurl, file, timeout=10):
    """Download a single image into the given folder.

    A failed download is reported on stdout and skipped, so one bad image
    does not leave a bogus file behind.

    Args:
        imgurl: URL of the image; the text after its last ``/`` is used as
            the local file name.
        file: Destination folder (with or without a trailing separator).
        timeout: Seconds to wait for the server before giving up.
    """
    import os

    name = imgurl.split('/')[-1]
    # os.path.join tolerates a folder passed without a trailing separator.
    target = os.path.join(file, name)
    try:
        response = requests.get(imgurl, timeout=timeout)
        # Avoid saving an HTTP error page under a .jpg name.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(imgurl, 'failed:', exc)
        return
    with open(target, 'wb') as f:
        f.write(response.content)

def issetDir(path):
    """Ensure the directory *path* exists, creating missing parents.

    Args:
        path: Directory path. Surrounding whitespace and trailing
            backslashes are removed before use.

    Returns:
        True when the directory exists afterwards; False when it could not
        be created (for example the path is empty or is occupied by a
        regular file).
    """
    import os

    path = path.strip().rstrip("\\")
    try:
        # exist_ok avoids the exists-then-create race when several workers
        # try to create the same directory at once.
        os.makedirs(path, exist_ok=True)
    except OSError:
        return False
    return os.path.isdir(path)

def load_imgs(url, file, timeout=10):
    """Download every jpg image found on one page, one thread per image.

    Args:
        url: Address of the page to scan for images.
        file: Destination folder handed to ``load_img`` for each image.
        timeout: Seconds to wait for the page request before giving up.

    Raises:
        requests.exceptions.RequestException: If fetching the page fails
            or times out.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:49.0) Gecko/20100101 Firefox/49.0',
    }
    html = requests.get(url, headers=headers, timeout=timeout).text
    # Drop repeated URLs so two threads never write the same file at once.
    image_urls = list(dict.fromkeys(re.findall(jpgReg, html)))
    threads = [
        threading.Thread(target=load_img, args=(image_url, file))
        for image_url in image_urls
    ]
    for worker in threads:
        worker.start()
    # Wait for every download before reporting the page as finished.
    for worker in threads:
        worker.join()
    print(url,'is ok')

def main():
    """Download one page's images in a worker process and print the elapsed time."""
    # Record the start time so the total duration can be reported.
    start_t = time.time()
    url = 'http://tieba.baidu.com/p/2256306796'
    path = getSavePath()
    pool = multiprocessing.Pool(processes=4)
    result = pool.apply_async(func=load_imgs, args=(url, path))
    pool.close()
    pool.join()
    try:
        # apply_async stores any exception raised in the worker; get()
        # re-raises it here so a failed download is not silently ignored.
        result.get()
    except Exception as exc:
        print("下载失败: {}".format(exc))
    use_time = (time.time() - start_t)
    print("多进程需要{} 秒".format(use_time))

# Run only when executed as a script; when multiprocessing starts workers by
# re-importing this module, the guard keeps them from calling main() again.
if __name__ == '__main__':
    main()

启用多进程,每个进程内再开启多线程下载。此多进程+多线程的写法参考了 Hopetree 的博客,多进程爬取速度比单进程快很多。

    原文作者:python入门
    原文地址: https://my.oschina.net/18y/blog/1844180
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞