PS: Install the required libraries in advance: Requests, BeautifulSoup, and lxml (e.g. pip install requests beautifulsoup4 lxml). The later examples also need xlwt and PyMySQL.
1. A classic crawler: scraping the KuGou Top 500
import requests
from bs4 import BeautifulSoup
import time  # imports

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}  # request headers

def get_info(url):  # fetch and parse one chart page
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, song_time in zip(ranks, titles, times):  # pair up the matched elements; 'song_time' avoids shadowing the time module
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],  # titles look like "singer - song"
            'song': title.get_text().split('-')[1],
            'time': song_time.get_text().strip()
        }
        print(data)

if __name__ == '__main__':
    urls = ['http://www.kugou.com/yy/rank/home/{}-8888.html'.format(str(i)) for i in range(1, 24)]  # 23 URLs: the Top 500 spans 23 pages on KuGou
    for url in urls:
        get_info(url)
        time.sleep(1)  # pause between requests
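If you would rather keep the results than print them, one minimal sketch (assuming Python 3; you would have get_info collect its data dicts into a list and pass them in) uses the standard csv module:

import csv

def save_rows(rows, filename='kugou_top500.csv'):  # rows: the data dicts built in get_info
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['rank', 'singer', 'song', 'time'])
        writer.writeheader()  # column names first
        writer.writerows(rows)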
2. Regular-expression crawlers: scraping Doupo Cangqiong / Qiushibaike jokes
import requests
import re
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}

f = open('G:/doupo.txt', 'a+')  # output file; change the path to suit your machine

def get_info(url):
    res = requests.get(url, headers=headers)
    if res.status_code == 200:  # only parse pages that exist
        contents = re.findall('<p>(.*?)</p>', res.content.decode('utf-8'), re.S)
        for content in contents:
            f.write(content + '\n')
    else:
        pass  # skip missing chapters

if __name__ == '__main__':
    urls = ['http://www.doupoxs.com/doupocangqiong/{}.html'.format(str(i)) for i in range(2, 3)]  # widen the range to fetch more chapters
    for url in urls:
        get_info(url)
        time.sleep(1)
    f.close()
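A module-level file handle that must be closed by hand is easy to get wrong; the same crawl can be restructured (my variant, same URLs and regex) so a with block owns the file:

def crawl_chapters(urls, path='G:/doupo.txt'):
    with open(path, 'a+', encoding='utf-8') as out:  # the file closes automatically, even on errors
        for url in urls:
            res = requests.get(url, headers=headers)
            if res.status_code == 200:
                for content in re.findall('<p>(.*?)</p>', res.text, re.S):  # res.text decodes with the detected encoding
                    out.write(content + '\n')
            time.sleep(1)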
The second program scrapes text jokes from Qiushibaike:

import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}

info_lists = []

def judgment_sex(class_name):  # map the avatar CSS class to a gender label
    if class_name == 'womenIcon':
        return 'female'
    else:
        return 'male'

def get_info(url):
    res = requests.get(url, headers=headers)  # pass the headers, or the site may reject the request
    ids = re.findall('<h2>(.*?)</h2>', res.text, re.S)
    levels = re.findall(r'<div class="articleGender \D+Icon">(.*?)</div>', res.text, re.S)  # raw strings keep \D and \d literal for the regex engine
    sexs = re.findall('<div class="articleGender (.*?)">', res.text, re.S)
    contents = re.findall('<div class="content">.*?<span>(.*?)</span>', res.text, re.S)
    laughs = re.findall(r'<span class="stats-vote"><i class="number">(\d+)</i>', res.text, re.S)
    comments = re.findall(r'<i class="number">(\d+)</i> 评论', res.text, re.S)  # '评论' is the page's Chinese label for "comments"
    for id, level, sex, content, laugh, comment in zip(ids, levels, sexs, contents, laughs, comments):
        info = {
            'id': id,
            'level': level,
            'sex': judgment_sex(sex),
            'content': content,
            'laugh': laugh,
            'comment': comment
        }
        info_lists.append(info)

if __name__ == '__main__':
    urls = ['http://www.qiushibaike.com/text/page/{}/'.format(str(i)) for i in range(1, 36)]
    for url in urls:
        get_info(url)
    for info_list in info_lists:
        print(info_list)
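To persist the collected jokes instead of printing them, a minimal sketch with the standard json module (the file name qiushi.json is my own choice) could follow the main loop:

import json

with open('qiushi.json', 'w', encoding='utf-8') as f:
    json.dump(info_lists, f, ensure_ascii=False, indent=2)  # ensure_ascii=False keeps the Chinese text readable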
3. Exporting data to Excel with lxml: scraping novel listings from Qidian
The full program:
import xlwt
import requests
from lxml import etree
import time

all_info_list = []

def get_info(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//ul[@class="all-img-list cf"]/li')  # one <li> per book
    for info in infos:
        title = info.xpath('div[2]/h4/a/text()')[0]
        author = info.xpath('div[2]/p[1]/a[1]/text()')[0]
        style_1 = info.xpath('div[2]/p[1]/a[2]/text()')[0]
        style_2 = info.xpath('div[2]/p[1]/a[3]/text()')[0]
        style = style_1 + '·' + style_2
        complete = info.xpath('div[2]/p[1]/span/text()')[0]
        introduce = info.xpath('div[2]/p[2]/text()')[0].strip()
        word = info.xpath('div[2]/p[3]/span/text()')[0].strip('万字')  # strip the '万字' (10,000-character) unit suffix
        info_list = [title, author, style, complete, introduce, word]
        all_info_list.append(info_list)

if __name__ == '__main__':
    urls = ['http://a.qidian.com/?page={}'.format(str(i)) for i in range(1, 101)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # pause between requests
    header = ['title', 'author', 'style', 'complete', 'introduce', 'word']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h in range(len(header)):
        sheet.write(0, h, header[h])  # header row
    for i, row in enumerate(all_info_list, start=1):  # data rows start below the header; enumerate replaces the manual counters
        for j, data in enumerate(row):
            sheet.write(i, j, data)
    book.save('xiaoshuo.xls')
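To verify the export, the workbook can be read back with xlrd, the companion reader for the .xls format xlwt writes; a quick spot check might be:

import xlrd

wb = xlrd.open_workbook('xiaoshuo.xls')
sheet = wb.sheet_by_index(0)
print(sheet.nrows, 'rows')   # header plus one row per book
print(sheet.row_values(0))   # the header row
print(sheet.row_values(1))   # the first data row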
4. Scraping images: downloading pictures from mzitu.com
import requests
import os
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Referer': 'http://www.mzitu.com'}  # the site checks the Referer header against hotlinking

# list of page URLs
urls = ["https://www.mzitu.com/page/{}".format(str(i)) for i in range(2, 11)]

# target directory; change it to suit your machine
path = 'G:/photo/'
os.makedirs(path, exist_ok=True)  # create the directory if it does not exist

# fetch the images on one page and write them to local files
def get_girlphoto(url):
    data = requests.get(url, headers=headers)
    selector = etree.HTML(data.text)
    # collect the image URLs (lazy-loaded, hence the data-original attribute)
    girlphoto_urls = selector.xpath('//div/ul/li/a/img/@data-original')
    print(girlphoto_urls)
    # download each image; images must be written in binary mode
    for item in girlphoto_urls:
        data = requests.get(item, headers=headers)
        with open(path + item[-10:], 'wb') as f:  # last 10 characters of the URL as the file name; with closes the file for us
            f.write(data.content)

if __name__ == '__main__':  # entry point
    for url in urls:
        get_girlphoto(url)
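Reading a whole image into memory via data.content is fine for small files; for larger ones, a streamed variant of the download step (my sketch, using requests' standard stream mode) avoids buffering the full response:

def download(img_url, filename):
    with requests.get(img_url, headers=headers, stream=True) as r:
        r.raise_for_status()  # fail loudly on 4xx/5xx instead of saving an error page
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):  # write the body piece by piece
                f.write(chunk)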
5. Scraping into a database: the Douban Music Top 250
import requests
import time  # imports
from lxml import etree
import pymysql

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}  # request headers

def get_info(url):  # fetch one page and insert its rows into MySQL
    db = pymysql.connect(host='localhost', user='root', password='123456', database='music')
    cursor = db.cursor()
    wb_data = requests.get(url, headers=headers)
    selector = etree.HTML(wb_data.text)
    names = selector.xpath('//div[@class="pl2"]/a/text()')
    titles = selector.xpath('//p[@class="pl"]/text()')
    for name, title in zip(names, titles):  # pair each album name with its detail line
        a = name.strip()
        b = title.split('/')[0]  # artist
        c = title.split('/')[1]  # release date
        cursor.execute("insert into music values(%s, %s, %s)", (a, b, c))  # parameterized query avoids SQL injection and quoting bugs
        db.commit()
    cursor.close()
    db.close()

if __name__ == '__main__':
    urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]  # 10 URLs: 25 entries per page across the Top 250
    for url in urls:
        get_info(url)
        time.sleep(1)
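The code assumes a MySQL database named music containing a table music with three text columns. A setup sketch, with column names that are my own guesses (the INSERT above does not name columns, so only the count and order matter), could be:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456')
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS music CHARACTER SET utf8mb4")
cursor.execute(
    "CREATE TABLE IF NOT EXISTS music.music ("
    "name VARCHAR(255), "    # album name
    "author VARCHAR(255), "  # artist
    "pub_date VARCHAR(64))"  # release date, kept as text
)
db.commit()
cursor.close()
db.close()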