import re
from urllib import request

import requests
import gevent
from gevent import monkey

monkey.patch_all()


def get_html_text(url):
    """Fetch *url* with a browser-like User-Agent and return the decoded
    page text, or None when the request fails."""
    try:
        hd = {'User-Agent': 'Mozilla/5.0'}  # disguise as a regular browser
        r = requests.get(url, timeout=10, headers=hd)
        r.raise_for_status()
        # Trust the content-sniffed encoding over the (often wrong) header.
        r.encoding = r.apparent_encoding
        print(len(r.text))
        return r.text
    except requests.RequestException as result:
        print('错误类型:', result)
        return None


def html_text_parser(img_list, html):
    """Append every image URL found in *html* to *img_list* and return it.

    The real image links on this Douyu page live in a JSON blob near the
    end of the document rather than in ordinary <img> tags, so a regex on
    the raw source is used instead of an HTML parser such as BeautifulSoup.
    Tolerates html=None (a failed fetch) by matching against ''.
    """
    img_pat = r'"rs\w+":"(.*?g)"'  # values like "rs1":"https://....jpg"
    links = re.compile(img_pat, re.S).findall(html or '')
    print(len(links))
    print(links)
    for link in links:
        if link:  # skip empty captures
            img_list.append(link)
    return img_list


def _download_one(index, link):
    """Download a single image URL and write it to disk as '<index>.<ext>'.

    Any network or filesystem error (urllib's URLError is an OSError
    subclass) is swallowed so one dead link does not abort the batch.
    """
    try:
        with request.urlopen(link, timeout=10) as resp:
            content = resp.read()
        # Derive the extension from the URL's last path segment; fall back
        # to 'jpg' when it does not look like a real extension.
        tail = link.rsplit('/', 1)[-1]
        ext = tail.rsplit('.', 1)[-1] if '.' in tail else ''
        if not (0 < len(ext) <= 4):
            ext = 'jpg'
        path = '{}.{}'.format(index, ext)
        with open(path, 'wb') as f:
            f.write(content)
    except OSError:
        pass


def get_douyu_img(Img_list):
    """Download every URL in *Img_list* concurrently, one greenlet per
    image (monkey.patch_all() makes urllib cooperative)."""
    gevent.joinall([
        gevent.spawn(_download_one, i, link)
        for i, link in enumerate(Img_list)
    ])


def main():
    url = 'https://www.douyu.com/g_yz'
    html = get_html_text(url)
    if html is None:
        return  # fetch failed; nothing to parse or download
    # NOTE: fetch -> parse -> download is a strict pipeline, so only the
    # downloads are parallelized (inside get_douyu_img). The previous
    # version additionally re-spawned all three stages via gevent after
    # already running them, fetching the page and every image twice.
    Img_list = html_text_parser([], html)
    get_douyu_img(Img_list)


if __name__ == '__main__':
    main()