"""Small scraper: fetch listing pages, extract image URLs, download the images.

The script works in three stages:
1. build and send an HTTP request for a listing page,
2. pull image URLs out of the returned HTML with a regular expression,
3. save every image into a local directory.
"""
import os
import re
from time import sleep
from urllib import request, parse

# Headers sent with every listing-page request (a browser-like User-Agent).
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'}

# Captures the value of the `src2` attribute of the first <img> that follows
# each `<div class="box picblock...` opening tag.  re.S lets `.` span newlines.
_IMG_PATTERN = re.compile(r'<div class="box picblock.*?<img src2="(.*?)"', re.S)


# 1. Fetching the data
def request_by(url, page):
    """Build the request object for one listing page.

    Args:
        url: Base URL of the listing, without the ".html" suffix.
        page: 1-based page number.  Page 1 maps to "<url>.html"; any other
            page maps to "<url>_<page>.html".

    Returns:
        A `urllib.request.Request` for the page, carrying the User-Agent
        header defined in this module.
    """
    if page == 1:
        page_url = url + ".html"
    else:
        page_url = url + "_" + str(page) + ".html"
    print("正在访问:", page_url)
    return request.Request(url=page_url, headers=_HEADERS)


def get_html_from(req, delay=1):
    """Send *req* and return the response body decoded as UTF-8.

    Args:
        req: A `urllib.request.Request` (or a URL string) to open.
        delay: Seconds to sleep after each request.  Defaults to 1.

    Returns:
        The response body as a `str`.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even if read() fails.
    with request.urlopen(req) as res:
        body = res.read()
    # Pause after every request.
    sleep(delay)
    return body.decode("utf-8")


# 2. Parsing the data
def anylasis_data(html):
    """Return the list of image URLs found in *html*.

    Each URL is the `src2` attribute value matched by `_IMG_PATTERN`; the
    list is empty when nothing matches.
    """
    return _IMG_PATTERN.findall(html)


# 3. Storing the data
def download_imgs(imgs, save_dir="./meinv/", delay=1):
    """Download every URL in *imgs* into *save_dir*.

    Args:
        imgs: Iterable of image URLs, e.g.
            http://pic1.sc.chinaz.com/Files/pic/pic9/201904/zzpic17564_s.jpg
        save_dir: Target directory; created if it does not exist yet.
            Defaults to "./meinv/".
        delay: Seconds to sleep after each download.  Defaults to 1.

    Raises:
        urllib.error.URLError: If a download fails.
    """
    # urlretrieve cannot create missing directories, so make sure the
    # target exists before the first download.
    os.makedirs(save_dir, exist_ok=True)
    for img in imgs:
        # The file name is the last path segment of the URL.
        img_name = img.split("/")[-1]
        print("正在下载图片:", img)
        request.urlretrieve(url=img, filename=os.path.join(save_dir, img_name))
        sleep(delay)


def main():
    """Scrape the configured listing pages and download their images."""
    page_url = "http://sc.chinaz.com/tupian/meinvxiezhen"
    for i in range(1, 2):
        req = request_by(url=page_url, page=i)
        res = get_html_from(req)
        imgs = anylasis_data(res)
        download_imgs(imgs)


if __name__ == '__main__':
    main()