import requests
import re
import os
import threading
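
# Crawl all article-list pages of one or more CSDN blogs, follow each
# post's link, and save every post's HTML under the local ./blog/ directory.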
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'}

def get_onepage(url):
    # Download one page and return its HTML text, or None on failure.
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException as e:
        print('Request failed:', e)
        return None

def save_html(html, filename):
    # Save the page as UTF-8 bytes to ./blog/<filename>.html,
    # creating the blog/ directory on first use.
    html = html.encode('utf-8')
    blog_path = os.path.join(os.path.abspath('.'), 'blog')
    os.makedirs(blog_path, exist_ok=True)  # safe even if another thread created it first
    try:
        with open('./blog/' + filename + '.html', 'wb') as fout:
            fout.write(html)
    except IOError as e:
        print(e)

def parse_onepage(html):
    # Extract every post URL from a list page, download each post,
    # and save it under a filename built from the post title.
    pattern = re.compile('<span class="link_view" title="阅读次数"><a href="(.*?)" title="阅读次数">', re.S)
    blog_urls = re.findall(pattern, html)
    for url in blog_urls:
        url2 = 'http://blog.csdn.net/' + url
        html2 = get_onepage(url2)
        if html2 is None:
            continue
        pattern2 = re.compile('<article>.*?<h1 class="csdn_top">(.*?)</h1>', re.S)
        filename = re.findall(pattern2, html2)
        filename = "".join(filename)
        # Strip whitespace and punctuation (ASCII and full-width) that is unwanted in filenames.
        string = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+", '', filename)
        save_html(html2, string)

def total(html):
    pattern = re.compile('<div id="papelist" class="pagelist">.*?共(.*?)页</span>', re.S)
    total = "".join(re.findall(pattern, html))
    # print(total)
    return int(total)

def main(blogname):
    # Read the total page count from the blog's front page, then crawl
    # every article-list page of that blog.
    url = 'http://blog.csdn.net/' + blogname
    html = get_onepage(url)
    t = total(html) + 1
    for i in range(1, t):
        print("Crawling page", i)
        url2 = 'http://blog.csdn.net/' + blogname + '/article/list/' + str(i)
        html = get_onepage(url2)
        parse_onepage(html)
    print("*********** Crawl finished ************")

if __name__ == '__main__':
    # One crawler thread per blog. The usernames below are placeholders:
    # replace them with the CSDN usernames you want to crawl.
    threads = []
    t1 = threading.Thread(target=main, args=('blogname1',))  # first blog's username
    threads.append(t1)
    t2 = threading.Thread(target=main, args=('blogname2',))  # second blog's username
    threads.append(t2)
    # Start all threads first so the blogs are crawled concurrently.
    for t in threads:
        # Daemon status must be set before start(); daemon threads are
        # terminated when the main thread exits.
        t.daemon = True
        t.start()
    # join() blocks the main thread until each worker thread has finished.
    for t in threads:
        t.join()
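
# Usage (assuming the placeholder usernames above were replaced): run the
# script directly; it creates a blog/ directory next to it and saves each
# crawled post there as <title>.html.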