Case 1: Crawling a JD product page
import requests

url = 'https://item.jd.com/2967929.html'
try:
    r = requests.get(url)
    r.raise_for_status()              # raise an exception for non-200 responses
    r.encoding = r.apparent_encoding  # guess the encoding from the page content
    print(r.text[:1000])
except:
    print('FINDING ERRORS')
Case 2: Crawling an Amazon product page
Amazon rejects requests that identify themselves with the default python-requests User-Agent, so the headers have to be replaced with a browser-like User-Agent.
import requests

url = 'https://www.amazon.cn/gp/product/B01M8L5Z3Y'
try:
    kv = {'user-agent': 'Mozilla/5.0'}    # pretend to be a browser
    r = requests.get(url, headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except:
    print('FINDING ERRORS')
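As a quick sanity check that the substituted User-Agent is what actually goes out on the wire, the headers of the request that requests sent can be inspected; a minimal sketch reusing the URL and header dictionary from above:

import requests

kv = {'user-agent': 'Mozilla/5.0'}
r = requests.get('https://www.amazon.cn/gp/product/B01M8L5Z3Y', headers=kv)
print(r.request.headers)    # the headers that were actually sent, including the replaced User-Agent
print(r.status_code)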
Case 3: Submitting keyword searches to Baidu and 360
Baidu's keyword interface:
http://www.baidu.com/s?wd=keyword
360's keyword interface:
http://www.so.com/s?q=keyword
import requests

keyword = 'Python'
try:
    kv = {'wd': keyword}
    r = requests.get('http://www.baidu.com/s', params=kv)
    print(r.request.url)              # the full URL that was actually requested
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except:
    print('FINDING ERRORS')
import requests

keyword = 'Python'
try:
    kv = {'q': keyword}
    r = requests.get('http://www.so.com/s', params=kv)
    print(r.request.url)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except:
    print('FINDING ERRORS')
Case 4: Crawling an image and saving it to disk
import requests
import os

url = 'http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg'
root = 'D://pics//'
path = root + url.split('/')[-1]      # use the last part of the URL as the file name
try:
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as f:
            f.write(r.content)        # write the binary content, i.e. the image itself
        print('saved files')
    else:
        print('files have already existed')
except:
    print('Failure')
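For larger files the whole response does not have to be held in memory; requests can stream the body and write it in chunks. A minimal sketch of the same download done that way, assuming the URL and save path used above:

import requests

url = 'http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg'
path = 'D://pics//' + url.split('/')[-1]
r = requests.get(url, stream=True)                   # fetch the body lazily instead of all at once
r.raise_for_status()
with open(path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):    # write the image 8 KB at a time
        f.write(chunk)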
Case 5: IP address lookup
import requests

url = 'http://m.ip138.com/ip.asp?ip='
try:
    r = requests.get(url + '202.204.80.112')
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[-500:])      # the lookup result is near the end of the page
except:
    print('Failed')
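The query string here is built by plain string concatenation; the same request can also be written with the params argument used in Case 3. A small sketch of that variant, assuming the same interface and test IP as above:

import requests

kv = {'ip': '202.204.80.112'}
r = requests.get('http://m.ip138.com/ip.asp', params=kv)
print(r.request.url)    # http://m.ip138.com/ip.asp?ip=202.204.80.112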
Case 6: University ranking
Input: the URL of the university-ranking page
Output: the ranking information printed to the screen (rank, university name, total score)
Technical route: Requests, BeautifulSoup
Step 1: fetch the ranking page content from the web
Step 2: extract the information from the page into a suitable data structure
Step 3: use the data structure to display and output the result
import requests
import bs4
from bs4 import BeautifulSoup

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print('ERRORS')
        return ''
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, 'html.parser')
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):    # skip NavigableString children such as newlines
            tds = tr('td')                     # tr('td') is shorthand for tr.find_all('td')
            ulist.append([tds[0].string, tds[1].string, tds[2].string])
def printUnivList(ulist, num):
    print('{:^10}\t{:^6}\t{:^10}'.format('Ranking', 'School Name', 'Marks'))
    for i in range(num):
        u = ulist[i]
        print('{:^10}\t{:^6}\t{:^10}'.format(u[0], u[1], u[2]))
def main():
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)    # show the top 20 entries

main()
Case 7: Taobao product information
Goal: crawl Taobao search result pages and extract the product names and prices
Key points: Taobao's search interface and how pagination is handled (each result page holds 44 items, so page i starts at offset 44*i via the s parameter)
import requests
import re

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return 'Error'
def parsePage(ilt, html):
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)    # prices embedded in the page's JSON
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)         # titles embedded in the page's JSON
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])    # eval strips the surrounding quotes
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print('Errors')
def printGoodsList(ilt):
    tplt = '{:4}\t{:8}\t{:16}'
    print(tplt.format('NO', 'PRICE', 'ITEM NAME'))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))
def main():
    goods = 'bags'
    depth = 2                                        # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)    # s is the start offset of each page
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    printGoodsList(infoList)

main()
Case 8: Targeted crawler for stock data
Goal: obtain the names and trading information of all stocks listed on the Shanghai and Shenzhen exchanges
Output: saved to a file
Steps:
Step 1: get the stock list from 东方财富网 (Eastmoney)
Step 2: for each stock in the list, fetch its details from 百度股票 (Baidu Gupiao)
Step 3: save the results to a file
import traceback
import re
import requests
from bs4 import BeautifulSoup
import bs4

def getHTMLText(url):    # same page-fetching helper as in Case 7
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ''
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r'[s][hz]\d{6}', href)[0])    # codes look like sh600000 / sz000001
        except:
            continue
def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock + '.html'
        html = getHTMLText(url)
        try:
            if html == '':
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'StockName': name.text.split()[0]})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except:
            traceback.print_exc()
            continue
def main():
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D://BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
Case 9: Scrapy targeted crawler for stock data
# spider (defined under the project's spiders/ directory)
import scrapy
import re


class StockSpider(scrapy.Spider):
    name = 'stocks'
    start_urls = ['http://quote.eastmoney.com/stocklist.html']

    def parse(self, response):
        # collect every stock code linked from the list page
        for href in response.css('a::attr(href)').extract():
            try:
                stock = re.findall(r'[s][hz]\d{6}', href)[0]
                url = 'https://gupiao.baidu.com/stock/' + stock + '.html'
                yield scrapy.Request(url, callback=self.parse_stock)
            except:
                continue
    def parse_stock(self, response):
        infoDict = {}
        stockInfo = response.css('.stock-bets')
        name = stockInfo.css('.bets-name').extract()[0]
        keyList = stockInfo.css('dt').extract()
        valueList = stockInfo.css('dd').extract()
        for i in range(len(keyList)):
            key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5]    # strip the surrounding <dt> tags
            try:
                val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5]
            except:
                val = '--'
            infoDict[key] = val
        infoDict.update(
            {'stockName': re.findall(r'\s.*\(', name)[0].split()[0]
                          + re.findall(r'\>.*\<', name)[0][1:-1]})
        yield infoDict
# pipelines (defined in the project's pipelines.py)
class BaidustocksPipeline(object):
    def process_item(self, item, spider):
        return item


class BaidustocksInfoPipeline(object):
    def open_spider(self, spider):
        self.f = open('BaiduStockInfo.txt', 'w')

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except:
            pass
        return item
# register the pipeline in the project's settings.py
ITEM_PIPELINES = {
    'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
}
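The three fragments above belong to different files of the Scrapy project: the spider goes under spiders/, the pipeline classes in pipelines.py, and the ITEM_PIPELINES setting in settings.py. With those in place, the crawl is started from the project root with `scrapy crawl stocks`, where stocks is the name attribute defined on the spider.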