Environment:
- python 2.7.13
- pip
- lxml, installed with pip install lxml; used mainly for XPath node lookups (the re module could be used instead, as sketched below)
- pytube, installed with pip install pytube
- a proxy/VPN tool to reach YouTube
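The lxml dependency is only used for two XPath lookups on the watch page; as noted above, the re module could stand in for it. Below is a rough, hedged sketch of that idea: the get_related_links helper is hypothetical, and the "thumb-wrapper" markup it assumes is simply what the XPath in the script below targets.

# coding: utf-8
import re

def get_related_links(html):
    # Regex counterpart of the XPath //div[@class="thumb-wrapper"]/a/@href
    # used in getUrl() below; the surrounding markup is an assumption.
    pattern = r'<div class="thumb-wrapper">\s*<a[^>]*href="(/watch\?v=[^"]+)"'
    return re.findall(pattern, html)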
Source code:
# coding: utf-8
__author__ = "zwzhou"
__date__ = "2017-03-19"

import sys
import getopt
import urllib2

from pytube import YouTube
from lxml import etree

def getHtml(url):
    """Fetch a page with a browser-like User-Agent and return its HTML."""
    user_agent = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.13 '
                  '(KHTML, like Gecko) Chrome/24.0.1284.0 Safari/537.13')
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    html = response.read()
    return html

def getUrl(html):
    """Parse the related-video links on the page, download the ones that
    qualify, then recurse into the first related video ("next page")."""
    global savepath
    global maxNumber
    global timeThreshold
    global cur_count
    global videoLists
    tree = etree.HTML(html)
    # related-video links and their durations shown in the sidebar
    urllist = tree.xpath(u'//div[@class="thumb-wrapper"]/a/@href')
    urllist_time = tree.xpath(u'//div[@class="thumb-wrapper"]/a/span/span/text()')
    baseurl = r'https://www.youtube.com'
    for (item_name, item_length) in zip(urllist, urllist_time):
        try:
            yt = YouTube(baseurl + item_name)
        except Exception:
            print "Something went wrong while opening this video, skipping it"
            continue
        print "video name: " + yt.filename
        print "video time: " + item_length
        if yt.filename in videoLists:  # file already downloaded
            print "This video has been downloaded!"
        else:
            if checktime(item_length):
                video = yt.filter('mp4')[-1]  # pick the last (highest-resolution) mp4 stream
                print "Now loading %s ------------>" % yt.filename
                video.download(savepath)
                print "---------------> %s is downloaded!" % yt.filename
                cur_count += 1
                videoLists.append(yt.filename)
                if cur_count >= maxNumber:  # reached the requested number
                    print 'There are %d videos downloaded! This task is completed!' % maxNumber
                    # TODO: if necessary, videoLists can be logged
                    sys.exit()
            else:
                print 'This video is too long; it will be skipped.'
    if urllist:
        getUrl(getHtml(baseurl + urllist[0]))  # crawl the next page

def checktime(timelength):
    """Return True if the duration string (mm:ss or hh:mm:ss) is below timeThreshold seconds."""
    global timeThreshold
    seconds = 0
    for part in timelength.split(':'):
        seconds = seconds * 60 + int(part)
    return seconds < timeThreshold

def usage():
    print '''
usage: python dl_youtube.py [option] [arg]
options and args:
    -s : download path
    -t : duration threshold (in seconds) below which a video will be downloaded
    -u : start URL to crawl; may be given more than once
    -n : stop after this many videos have been downloaded; default is 10000
    -h : print this help message
'''

if __name__ == "__main__":
    start_urls = ['https://www.youtube.com/watch?v=TThzH_sJo6o']
    videoLists = []  # filenames already downloaded, to avoid duplicates
    # defaults
    savepath = r"D://MyDownloads"
    maxNumber = 10000
    timeThreshold = 240
    cur_count = 0
    opts, args = getopt.getopt(sys.argv[1:], 'hs:t:n:u:')
    for op, value in opts:
        if op == "-s":    # download path, default D://MyDownloads
            savepath = value
        elif op == '-t':  # duration limit, default 240 s
            timeThreshold = int(value)
        elif op == "-h":  # help
            usage()
            sys.exit()
        elif op == '-n':
            maxNumber = int(value)
        elif op == '-u':  # initial URLs to crawl (the default start URL above is kept as well)
            start_urls.append(value)
    for item in start_urls:
        html = getHtml(item)
        getUrl(html)
Usage:
- python dl_youtube.py -n 10 -s D://MyDownloads -t 600 -u https://www.youtube.com/watch?v=TThzH_sJo6o
  This starts crawling from https://www.youtube.com/watch?v=TThzH_sJo6o and downloads 10 videos shorter than 600 seconds (10 minutes) into the D://MyDownloads folder.
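As a quick illustration of how the -t value feeds the duration check (using the checktime helper from the script above; the durations are made-up examples):

# timeThreshold = 600, i.e. the -t value in the command above
checktime(u"5:59")     # 359 s  -> True,  downloaded
checktime(u"12:30")    # 750 s  -> False, skipped
checktime(u"1:02:03")  # 3723 s -> False, skipped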