猫眼网页的结构非常简单,这里就不做网页分析了,直接上代码:
main.py
from get_info import get_info

# URL template for the Maoyan board. ``{}`` receives the page index and the
# trailing ``0`` turns it into the offset (page 3 -> ``offset=30``).
url = 'http://maoyan.com/board/4?offset={}0'

if __name__ == '__main__':
    # Walk page indices 0 through 9, scraping and storing each page in turn.
    for page in range(10):
        get_info(url, page)
get_info.py
def get_html(url, page, timeout=10):
    """Download one page and parse it into a BeautifulSoup tree.

    Args:
        url: URL template containing one ``{}`` placeholder, filled with
            ``page`` via ``str.format``.
        page: Value substituted into the template.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: On connection failure, on
            timeout, or (via ``raise_for_status``) on a 4xx/5xx response.
    """
    import requests
    from bs4 import BeautifulSoup

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the whole crawl hangs.
    response = requests.get(url.format(page), timeout=timeout)
    # Fail right here on an HTTP error instead of handing an error page to
    # the caller as if it were a valid listing.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')
def get_a_rank(html, num):
    """Return the text of the ``<i>`` directly under the ``num``-th ``<dd>``.

    Args:
        html: Parsed page exposing a BeautifulSoup-style ``select`` method.
        num: 1-based position of the ``<dd>`` entry inside ``div.main > dl``.

    Returns:
        The ``text`` of the first element matched by the selector.

    Raises:
        IndexError: If the selector matches nothing.
    """
    selector = 'div.main > dl > dd:nth-of-type({}) > i'.format(num)
    matches = html.select(selector)
    first_match = matches[0]
    return first_match.text
def get_a_title(html, num):
    """Return the link text inside ``p.name`` of the ``num``-th ``<dd>``.

    Args:
        html: Parsed page exposing a BeautifulSoup-style ``select`` method.
        num: 1-based position of the ``<dd>`` entry inside ``div.main > dl``.

    Returns:
        The ``text`` of the first ``<a>`` matched by the selector.

    Raises:
        IndexError: If the selector matches nothing.
    """
    selector = 'div.main > dl > dd:nth-of-type({}) > div > div > div.movie-item-info > p.name > a'.format(num)
    anchors = html.select(selector)
    first_anchor = anchors[0]
    return first_anchor.text
def get_a_star(html, num):
    """Return the value part of the ``p.star`` line of the ``num``-th ``<dd>``.

    The element text is treated as ``<label><colon><value>``: everything up
    to and including the last colon is dropped and the remainder is stripped
    of surrounding whitespace. Text without any colon is returned whole
    (stripped).

    Args:
        html: Parsed page exposing a BeautifulSoup-style ``select`` method.
        num: 1-based position of the ``<dd>`` entry inside ``div.main > dl``.

    Returns:
        The text after the last colon, without leading/trailing whitespace.

    Raises:
        IndexError: If the selector matches nothing.
    """
    matches = html.select('div.main > dl > dd:nth-of-type({}) > div > div > div.movie-item-info > p.star'.format(num))
    raw_text = matches[0].text
    # NOTE(review): the live markup presumably separates the label with a
    # full-width colon (U+FF1A) rather than ASCII ':' -- confirm against the
    # page. Both forms are accepted so the label is removed either way.
    normalized = raw_text.replace('\uff1a', ':')
    # The element text may carry newlines/indentation around the value.
    return normalized.split(':')[-1].strip()
def get_a_releasetime(html, num):
    """Return the value part of the ``p.releasetime`` line of the ``num``-th ``<dd>``.

    The element text is treated as ``<label><colon><value>``: everything up
    to and including the last colon is dropped and the remainder is stripped
    of surrounding whitespace. Text without any colon is returned whole
    (stripped).

    Args:
        html: Parsed page exposing a BeautifulSoup-style ``select`` method.
        num: 1-based position of the ``<dd>`` entry inside ``div.main > dl``.

    Returns:
        The text after the last colon, without leading/trailing whitespace.

    Raises:
        IndexError: If the selector matches nothing.
    """
    matches = html.select('div.main > dl > dd:nth-of-type({}) > div > div > div.movie-item-info > p.releasetime'.format(num))
    raw_text = matches[0].text
    # NOTE(review): the live markup presumably separates the label with a
    # full-width colon (U+FF1A) rather than ASCII ':' -- confirm against the
    # page. Both forms are accepted so the label is removed either way.
    normalized = raw_text.replace('\uff1a', ':')
    # The element text may carry newlines/indentation around the value.
    return normalized.split(':')[-1].strip()
def get_a_score(html, num):
    """Return the score text of the ``num``-th ``<dd>``.

    The score is read from two sibling elements, ``i.integer`` and
    ``i.fraction``, whose texts are concatenated in that order.

    Args:
        html: Parsed page exposing a BeautifulSoup-style ``select`` method.
        num: 1-based position of the ``<dd>`` entry inside ``div.main > dl``.

    Returns:
        Text of the first ``i.integer`` match followed by the text of the
        first ``i.fraction`` match.

    Raises:
        IndexError: If either selector matches nothing.
    """
    integer_selector = 'div.main > dl > dd:nth-of-type({}) > div > div > div.movie-item-number.score-num > p > i.integer'.format(num)
    integer_nodes = html.select(integer_selector)
    fraction_selector = 'div.main > dl > dd:nth-of-type({}) > div > div > div.movie-item-number.score-num > p > i.fraction'.format(num)
    fraction_nodes = html.select(fraction_selector)
    whole_part = integer_nodes[0].text
    decimal_part = fraction_nodes[0].text
    return whole_part + decimal_part
def get_a_img_src(html, num):
    """Return the ``data-src`` attribute of the ``num``-th entry's board image.

    Args:
        html: Parsed page exposing a BeautifulSoup-style ``select`` method.
        num: 1-based position of the ``<dd>`` entry inside ``div.main > dl``.

    Returns:
        Whatever ``.get('data-src')`` yields for the first ``img.board-img``
        match.

    Raises:
        IndexError: If the selector matches nothing.
    """
    selector = 'div.main > dl > dd:nth-of-type({}) > a > img.board-img'.format(num)
    images = html.select(selector)
    first_image = images[0]
    # NOTE: the image source read here differs from the one shown on the
    # rendered web page.
    return first_image.get('data-src')
from pymongo_database import movie_item_info
def get_info(url, page):
    """Scrape entries 1-10 of one page and store each as a document.

    The page is fetched and parsed once with ``get_html``; for each entry
    position the six field extractors run in a fixed order and the resulting
    dict is passed to ``movie_item_info.insert_one``.

    Args:
        url: URL template forwarded to ``get_html``.
        page: Page index forwarded to ``get_html``.
    """
    soup = get_html(url, page)
    # (document key, extractor) pairs; tuple order fixes both the call order
    # and the key order of the stored document.
    field_extractors = (
        ('rank', get_a_rank),
        ('name', get_a_title),
        ('star', get_a_star),
        ('releasetime', get_a_releasetime),
        ('score', get_a_score),
        ('img', get_a_img_src),
    )
    for position in range(1, 11):
        document = {
            field: extract(soup, position)
            for field, extract in field_extractors
        }
        # presumably a pymongo collection (imported from pymongo_database)
        movie_item_info.insert_one(document)