python 爬虫获取网页 html 内容以及下载附件的方法

2019年6月16日 · 348次阅读 · 来源: XnCSD

python 爬虫获取网页 html 内容以及下载附件的方法

Python 爬虫获取网页 HTML 内容的两种方法：获取静态网页，以及使用浏览器获取动态内容；另附下载附件的方法。

from urllib.request import urlopen
from urllib import request
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from selenium import webdriver
import socket
import time


def get_static_url_content(url, encoding='utf-8', timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
    """Fetch a static web page and parse it into a BeautifulSoup tree.

    :param url: URL of the page to fetch.
    :param encoding: Encoding name handed to BeautifulSoup as
        ``from_encoding`` for decoding the raw response bytes.
    :param timeout: Timeout forwarded to ``urlopen``; defaults to the
        global socket default timeout.
    :return: ``BeautifulSoup`` object built with the ``html.parser`` backend.
    :raises urllib.error.URLError: propagated from ``urlopen`` when the
        request fails.
    """
    # Send a browser-like User-Agent header with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = request.Request(url, headers=headers)
    # Read inside a context manager so the HTTP response is always closed,
    # even when read() raises.
    with urlopen(req, timeout=timeout) as response:
        raw_html = response.read()
    return BeautifulSoup(raw_html, "html.parser", from_encoding=encoding)


def get_driver_url_content(url, encoding='utf-8', timeout=3):
    """Load a page in Chrome through Selenium and parse the rendered HTML.

    :param url: URL of the page to load.
    :param encoding: Kept for backward compatibility only. It is not used:
        ``driver.page_source`` is already a decoded ``str``, and
        BeautifulSoup ignores ``from_encoding`` (with a warning) for
        ``str`` input.
    :param timeout: Number of seconds to sleep after navigating, giving
        the page time to render dynamic content. This is a fixed wait,
        not a network timeout.
    :return: ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    chromedriver_path = '/path/to/chromedriver'
    driver = webdriver.Chrome(executable_path=chromedriver_path)
    # PhantomJS can be used instead of Chrome:
    # driver = webdriver.PhantomJS(executable_path="/path/to/phantomjs")
    try:
        driver.get(url)
        time.sleep(timeout)
        page_source = driver.page_source
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window. Running it in ``finally`` ensures the browser
        # is shut down even when navigation fails.
        driver.quit()
    return BeautifulSoup(page_source, 'html.parser')

def load_appendix(url, filename):
    """Download an attachment and save it to a local file.

    :param url: URL of the attachment (documents and images both work).
    :param filename: Path of the local file to write the download to.
    :return: None
    """
    urlretrieve(url, filename=filename)

    原文作者：XnCSD
    原文地址: https://blog.csdn.net/XnCSD/article/details/85221527
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。