Three key points (each is sketched briefly right after the list below). I have hidden the 1024 site address, so this crawler only works for scraping 1024. For each post it creates a folder named after the post title and saves that post's images into it.
url_start sets the starting page and url_end sets the ending page.
If you run into problems, leave a comment and I will reply when I see it.
1. Encoding
2. Matching the article-page links
3. Folder operations
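To make the three points concrete, here is a minimal sketch that is not part of the original script: list_url, the simplified link regex, and example_post_title are hypothetical placeholders standing in for the hidden 1024 address, the real link pattern, and an actual post title.

import os
import re
import requests

# 1. Encoding: the site serves gbk pages, so set the encoding before reading .text
list_url = 'url1'                      # hypothetical placeholder for the hidden list-page address
resp = requests.get(list_url)
resp.encoding = 'gbk'

# 2. Article-page link matching: pull the htm_xxx.html fragments out of the list page
article_paths = re.findall(r'href="htm_(.+?\.html)"', resp.text)

# 3. Folder operations: create a folder per post if it does not exist yet
folder_name = 'example_post_title'     # in the real script this comes from the post's <h4> title
if not os.path.isdir(folder_name):
    os.mkdir(folder_name)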
import requests
import re
import time
from bs4 import BeautifulSoup
import os
url_start = 'url1'   # starting list page (the real 1024 address is hidden)
url_end = 'url2'     # ending list page
# Fetch the image links on an article page and save the images into a folder named after the post
def getIMG(article_url):
    # time.sleep(1)   # optional throttle between posts
    urls = []
    try:
        html = requests.get(article_url)
        html.encoding = 'gbk'   # the site is gbk-encoded
        soup = BeautifulSoup(html.text, 'html.parser')
        part_picURL = re.findall(r"src='http://img(.+?\.jpg)'", html.text, re.S)
        for each in part_picURL:
            picURL = 'http://img' + each
            urls.append(picURL)
        i = 0
        for each in urls:
            try:
                pic = requests.get(each, timeout=10)
                folder_name = soup.select('h4')[0].text   # the post title becomes the folder name
                if not os.path.isdir(folder_name):
                    os.mkdir(folder_name)
                    print('Folder $' + folder_name + '$ created')
                file_name = folder_name + '/' + folder_name + str(i) + '.jpg'
                with open(file_name, 'wb') as fp:
                    fp.write(pic.content)
                i += 1
            except:
                # skip any image that fails to download
                pass
        print('Images downloaded')
    except:
        pass
    return urls
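If you only want to grab a single post, getIMG can also be called on its own. A small usage sketch, where article_url is a hypothetical placeholder you would replace with a real post URL:

# hypothetical single-post usage; substitute a real post URL for the placeholder
article_url = 'http://cl.gtta.pw/htm_xxx.html'   # placeholder, not a real link
saved_urls = getIMG(article_url)
print(len(saved_urls), 'image links found')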
url_list = []
# Collect the article links on the current list page, then recurse through the "next page" links
def getlist(url_Start):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'}
    req = requests.get(url_Start, headers=headers)
    req.encoding = 'gbk'
    # match the article-page links on the list page
    url_index = re.findall(r'"打開新窗口" href="htm_(.+?\.html)" target="_blank">', req.text, re.S)
    for p in url_index:
        full_url = 'http://cl.gtta.pw/htm_' + p
        url_list.append(full_url)
    # decide whether to move on to the next page
    urls_next = re.findall(r'false;}"></a><a href="(.*?)">下一頁', req.text, re.S)[0]
    url_next = 'http://cl.gtta.pw/' + urls_next
    if url_next != url_end:
        getlist(url_next)
    else:
        print('Reached the last page')
    return url_list
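One note on the stop condition: getlist assumes every list page contains a 下一頁 link, so the findall(...)[0] lookup raises an IndexError on a page without one. A defensive variant, sketched as a standalone helper (not part of the original script) that getlist could call instead:

# illustrative helper: return the next-page URL, or None when there is no 下一頁 link
def find_next_page(page_html):
    matches = re.findall(r'false;}"></a><a href="(.*?)">下一頁', page_html, re.S)
    if matches:
        return 'http://cl.gtta.pw/' + matches[0]
    return None

getlist could then stop either when this helper returns None or when the returned URL equals url_end.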
lists = getlist(url_start)
print(len(lists))
for article_url in lists:   # avoid shadowing the built-in name list
    img = getIMG(article_url)
    print(img)