Python——爬虫(解析简单标签)
# -*- coding: utf-8 -*-
# Parse a local HTML file with BeautifulSoup and extract tag text/attributes.
from bs4 import BeautifulSoup

# Open the file to parse; the `with` statement closes it automatically,
# so no explicit close() call is needed afterwards.
with open('test.html', 'r', encoding='utf-8') as wb_data:
    Soup = BeautifulSoup(wb_data, 'lxml')  # feed the open file to the parser

print(Soup)  # show the parsed document
print("--------------\n")

# CSS selector paths for the target elements
# (these can be copied directly from the browser's element inspector)
images = Soup.select('body > img')
urls = Soup.select('#binner > ul > li > a')
print(images, urls, sep='\n---------------\n')  # show the matched elements

# Extract tag contents: get_text() returns the text content,
# get('attr') returns an attribute value (or None if absent).
records = []  # renamed from `list` to avoid shadowing the builtin
for image, url in zip(images, urls):
    records.append({
        'image': image.get_text(),
        'image_src': image.get('src'),
        'url': url.get_text(),
        'url_href': url.get('href'),
        'value': url.get('value'),
    })

# Print only the entries whose `value` attribute exceeds 2.0
for item in records:
    if float(item['value']) > 2.0:
        print(item)
html文件:
<!-- Sample page consumed by the script: three <img> tags matched by
     'body > img', and three <a> tags matched by '#binner > ul > li > a'. -->
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
</head>
<body>
<img src = "test1.png"/>
<img src = "test2.png"/>
<img src = "test3.png"/>
<!-- NOTE(review): `value` is not a standard attribute on <a>; the script
     reads it via a.get('value') and filters on float(value) > 2.0. -->
<div id = 'binner'>
<ul>
<li><a href = 'http://www.runoob.com/' value = '3.1'>菜鸟教程</a></li>
<li><a href = 'http://www.baidu.com/' value = '1.2'>百度主页</a></li>
<li><a href = 'http://www.sdu.edu.cn/' value = '2.4'>山东大学</a></li>
</ul>
</div>
</body>
</html>
上面是一个爬取本地html文件的实例,下面是爬取一个“真实”网站的例子:
# -*- coding: utf-8 -*-
# Scrape a live site: fetch the page with requests, parse it with BeautifulSoup,
# then select link elements by their CSS paths.
from bs4 import BeautifulSoup
import requests

url = 'https://hao.360.cn/?a1004'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text, 'lxml')  # parse the response body

# CSS paths copied from the browser's element inspector; the
# :nth-child(...) part is dropped so ALL matching <li> links are selected.
#famous-section > ul.list.first.gclearfix > li:nth-child(7) > a
url_famous = soup.select('#famous-section > ul.list.first.gclearfix > li > a')
#famous-section > ul.list.last.gclearfix > li:nth-child(1) > a
# BUG FIX: the original used append(), which inserts the whole result list
# as a single nested element; extend() adds each matched tag individually.
url_famous.extend(soup.select('#famous-section > ul.list.last.gclearfix > li > a'))
print(url_famous)
#focus_news > ul > li:nth-child(1) > a
url_focus = soup.select('#focus_news > ul > li > a')
print(url_focus)
注意:使用beautifulsoup解析网页元素的时候,需要传入网页元素的CSS路径。可以在浏览器中审查(检查)元素,然后在对应节点上右击,选择“复制CSS选择器(Copy selector)”获得。