用Python抓取网易云音乐缓存中的音乐文件,封面和歌词

3个晚上, 用宝宝睡着之后的时间, 终于搞定了, 也学习了Python.

不得不说, Python用起来真爽.

我一直是网易云音乐的会员, 听到好听的歌就要保存下来. 但有的歌只能听,不能下载, 或者到了月末500首歌的下载额度满了, 就强迫症的不爽.

下面代码中主要的点:

  1. 缓存目录下的.idx 里有文件大小和当前缓存进度, 可以判断是不是缓存完了.
  2. 缓存目录下的.info文件里有文件格式.
  3. 缓存目录下的.uc文件是加密了的音乐文件, 解密方式是将每个字节与 0xa3 做异或(XOR)运算.
  4. songId最重要, 在文件名里解析出来.
  5. 通过songId在网站上爬歌曲信息. 网易做了反爬处理, 需要用到selenium+firefox, 再转到一个frame里去解析.
  6. 歌曲信息里有专辑封面图片的链接, 直接下载.
  7. 通过网易的API获取歌词.

之前爬过一个网站上的所有影片信息, 其实比这个简单得多. 这次主要是应对网易的反爬机制比较麻烦.

import requests
import os
import json
from bs4 import BeautifulSoup
import socket
import urllib
from selenium import webdriver
import re
import time


# Root folder under which decoded songs, album covers and lyrics are written.
songStorageDir = "d:/0-music"
# Sub-folder (appended to songStorageDir) that holds the per-song metadata
# JSON files written and re-read by GetSongInfo.
songIdDir = "/`````songid"
# Cache folder of the Cloud Music client that is scanned for .uc/.idx/.info
# files. Built from the LOCALAPPDATA environment variable, so evaluating this
# line raises KeyError when that variable is not set.
cloudMusicCacheDir = os.environ['LOCALAPPDATA'] + "\\Netease\\CloudMusic\\Cache\\Cache"
# Lyric folder of the client.
# NOTE(review): not referenced by any code visible in this file - confirm it is still needed.
cloudMusicLyricDir = os.environ['LOCALAPPDATA'] + "\\Netease\\CloudMusic\\webdata\\lyric"

def GetSongInfo( songId, fileSize, format ):
    """Return the metadata dict for a song, scraping it on a cache miss.

    The metadata is cached as ``<songStorageDir><songIdDir>/<songId>.json``.
    When that file exists, parses, and carries a truthy ``"songId"``, it is
    returned as-is. Otherwise the song page is opened in a headless Firefox,
    the JSON embedded in the second ``<script>`` of the ``g_iframe`` frame's
    ``<head>`` is parsed, and the result is extended with ``songId``,
    ``format``, ``fileSize``, ``artist`` and ``album`` before being cached.

    Args:
        songId: Song id as a string (it is concatenated into paths and URLs).
        fileSize: Size of the cached audio file, stored under ``"fileSize"``.
        format: Audio format, stored under ``"format"``.

    Returns:
        The metadata dict, or ``None`` when the page could not be scraped or
        its content could not be parsed.
    """
    songInfoFilePath = songStorageDir + songIdDir + "/" + songId + ".json"
    if os.path.isfile( songInfoFilePath ):
        try:
            with open( songInfoFilePath, "rb" ) as songInfoFile:
                songInfo = json.load( songInfoFile )
            if songInfo["songId"]:
                return songInfo
        except (OSError, ValueError, KeyError, TypeError):
            # Unreadable or incomplete cache entry: fall through and re-scrape.
            print( "Load song info fail! path:" + songInfoFilePath )

    # add_argument/options=/find_element(by, value) are accepted by both
    # Selenium 3.8+ and Selenium 4, unlike set_headless(), firefox_options=
    # and find_element_by_xpath(), which Selenium 4 removed.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument( "-headless" )
    browser = webdriver.Firefox( options=fireFoxOptions )
    songUrl = "http://music.163.com/song?id=" + songId
    try:
        browser.get( songUrl )
        try:
            # The real song information lives inside this frame.
            browser.switch_to.frame( 'g_iframe' )
        except Exception:
            print( "switch to frame fail! song url:" + songUrl )
            return None
        songInfoJsonStr = browser.find_element( "xpath", "/html/head/script[2]" ).get_attribute( "innerHTML" )
    finally:
        # Always shut the browser down, including on the early return above
        # and on any exception, so no Firefox process is left behind.
        browser.quit()

    try:
        songInfo = json.loads( songInfoJsonStr )
        # NOTE(review): assumes the description has the shape
        # "<label>:<artist>。<label>:<album>。..." - confirm against the live page.
        descriptionParts = songInfo["description"].split( '。' )
        artist = descriptionParts[0].split( ':' )[1]
        album = descriptionParts[1].split( ':' )[1]
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        print( "parse song info fail! song url:" + songUrl )
        return None

    songInfo["songId"] = songId
    songInfo["format"] = format
    songInfo["fileSize"] = fileSize
    songInfo["artist"] = artist
    songInfo["album"] = album

    # Persist the result so the next scan does not need the browser again.
    os.makedirs( os.path.dirname( songInfoFilePath ), exist_ok=True )
    with open( songInfoFilePath, "w", encoding='utf-8' ) as f:
        json.dump( songInfo, f )
    return songInfo

def DecodeCloudMusicCacheFile( cacheFilePath, storageFilePath, key=0xa3 ):
    """Decode a cache file by XOR-ing every byte with ``key`` and save it.

    Args:
        cacheFilePath: Path of the encoded source file (read in binary mode).
        storageFilePath: Path the decoded bytes are written to; an existing
            file is overwritten.
        key: Single-byte XOR key in the range 0-255. Defaults to 0xa3.

    Raises:
        OSError: If either file cannot be opened, read or written.
        ValueError: If ``key`` is outside 0-255.
    """
    # A 256-entry translation table lets bytes.translate do the XOR in one
    # C-level pass instead of a Python loop over every byte of the file.
    xorTable = bytes( value ^ key for value in range( 256 ) )
    with open( cacheFilePath, 'rb' ) as cacheFile:
        encoded = cacheFile.read()
    with open( storageFilePath, 'wb' ) as out:
        out.write( encoded.translate( xorTable ) )

def GetSongDir( songInfo ):
    """Return (and create if needed) the folder a song's files are stored in.

    The folder is ``<songStorageDir>/<artist>/<album>/``, where artist and
    album come from ``songInfo["artist"]`` and ``songInfo["album"]`` with each
    of the characters ``\\ / : * ? " < > |`` replaced by ``-``.

    Args:
        songInfo: Song metadata dict with ``"artist"`` and ``"album"`` strings.

    Returns:
        The folder path, ending in ``/``.
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # "\/", which newer Python versions warn about. The pattern is unchanged.
    illegalChars = r'[\\/:*?"<>|]'
    album = re.sub( illegalChars, '-', songInfo["album"] )
    artist = re.sub( illegalChars, '-', songInfo["artist"] )
    songDir = songStorageDir + "/" + artist + "/" + album + "/"
    os.makedirs( songDir, 0o777, exist_ok=True )
    return songDir

def GetSongFilePathWithoutExt( songInfo ):
    """Return the song's storage path without a file extension.

    The path is the folder returned by ``GetSongDir`` followed by
    ``songInfo["title"]`` with each of the characters ``\\ / : * ? " < > |``
    replaced by ``-``. Callers append the extension they need.

    Args:
        songInfo: Song metadata dict; ``"title"`` is read here and the dict is
            passed on to ``GetSongDir``.

    Returns:
        The extension-less file path as a string.
    """
    songDir = GetSongDir( songInfo )
    # Raw string: the previous non-raw literal contained the invalid escape
    # "\/", which newer Python versions warn about. The pattern is unchanged.
    fileName = re.sub( r'[\\/:*?"<>|]', '-', songInfo["title"] )
    return songDir + fileName

def SaveSongFile( ucFilePath, songInfo ):
    """Decode a cache file into the song's storage location unless already there.

    The target is ``GetSongFilePathWithoutExt(songInfo)`` plus ``"."`` and
    ``songInfo["format"]``. Decoding is skipped when a file already exists at
    the target and is at least ``songInfo["fileSize"]`` bytes long.

    Args:
        ucFilePath: Path of the encoded cache file.
        songInfo: Song metadata dict.

    Returns:
        The path of the stored song file.
    """
    basePath = GetSongFilePathWithoutExt( songInfo )
    targetPath = basePath + "." + songInfo["format"]
    # A large-enough existing file counts as a finished earlier save.
    alreadySaved = (
        os.path.isfile( targetPath )
        and os.path.getsize( targetPath ) >= int( songInfo["fileSize"] )
    )
    if not alreadySaved:
        DecodeCloudMusicCacheFile( ucFilePath, targetPath )
        print( "Save song to :" + targetPath )
    return targetPath

def SaveAlbumCover( songInfo ):
    """Download the album cover into the song's folder as ``album.jpg``.

    Nothing is downloaded when ``album.jpg`` already exists. The image URL is
    the first entry of ``songInfo["images"]``. Download failures are reported
    on stdout and otherwise ignored.

    Args:
        songInfo: Song metadata dict with an ``"images"`` list of URLs.
    """
    # Import the submodules explicitly: a bare "import urllib" does not
    # guarantee that urllib.request / urllib.error are loaded.
    import urllib.error
    import urllib.request

    coverFilePath = GetSongDir( songInfo ) + "album.jpg"
    if os.path.isfile( coverFilePath ):
        return
    imgUrl = songInfo["images"][0]
    try:
        urllib.request.urlretrieve( imgUrl, coverFilePath )
    except urllib.error.HTTPError as err:
        print( "img get fail! url:\t" + imgUrl )
        print( err )
        return
    except Exception:
        print( "img get fail unknown except! url:\t" + imgUrl )
        return
    # Only reached when the download succeeded.
    print( "Save album cover to:" + coverFilePath )

def SaveLyric( songInfo ):
    """Fetch the song's lyric JSON and write its lyric text to a ``.lrc`` file.

    The raw API response is stored next to the song as ``<song>.lyric.json``
    and is only downloaded when that file does not exist yet. The text under
    ``["lrc"]["lyric"]`` is then written to ``<song>.lrc`` as UTF-8. Failures
    to download or parse, and responses without lyric text, are reported or
    skipped without raising.

    Args:
        songInfo: Song metadata dict; ``"songId"`` must be a string.
    """
    # Import the submodule explicitly: a bare "import urllib" does not
    # guarantee that urllib.request is loaded.
    import urllib.request

    songPathWithoutExt = GetSongFilePathWithoutExt( songInfo )
    lyricJsonFilePath = songPathWithoutExt + ".lyric.json"
    if not os.path.exists( lyricJsonFilePath ):
        try:
            songId = songInfo["songId"]
            lyricUrl = "http://music.163.com/api/song/lyric?os=pc&id=" + songId + "&lv=-1&kv=-1&tv=-1"
            urllib.request.urlretrieve( lyricUrl, lyricJsonFilePath )
        except Exception:
            print( "Save lyric fail! song info:" + json.dumps( songInfo ))
            # Without the download there is nothing to parse below.
            return

    try:
        with open( lyricJsonFilePath, "rb" ) as lyricJsonFile:
            lyricJson = json.load( lyricJsonFile )
    except (OSError, ValueError):
        print( "Load lyric json fail! path:" + lyricJsonFilePath )
        return

    lrcSection = lyricJson.get( "lrc" ) if isinstance( lyricJson, dict ) else None
    if not isinstance( lrcSection, dict ) or "lyric" not in lrcSection:
        return
    lyric = lrcSection["lyric"]
    if not isinstance( lyric, str ):
        # e.g. a null lyric in the response: nothing to write.
        return
    # Turn any literal backslash-n sequences left in the text into line breaks.
    lyric = lyric.replace( "\\n", "\n" )
    with open( songPathWithoutExt + ".lrc", "w", encoding='utf-8' ) as lrcFile:
        lrcFile.write( lyric )

def HackCloudMusicCache():
    """Scan the client cache folder and export every fully cached song.

    For each ``*.uc`` file in ``cloudMusicCacheDir`` that has sibling ``.idx``
    and ``.info`` files, the index is checked to see whether caching has
    finished. If so, the song metadata is looked up and the song file, album
    cover and lyric are saved. Files that are incomplete, unreadable or whose
    metadata cannot be obtained are skipped.
    """
    for fileName in os.listdir( cloudMusicCacheDir ):
        ucFilePath = cloudMusicCacheDir + "\\" + fileName
        if not ( fileName.endswith( '.uc' ) and os.path.isfile( ucFilePath ) ):
            continue

        # The first "-"-separated part of the file name is the song id.
        songId = fileName.split( "-" )[0]

        # Swap only the trailing extension; str.replace would also rewrite a
        # ".uc" that happens to occur earlier in the path.
        pathWithoutExt = os.path.splitext( ucFilePath )[0]
        idxFilePath = pathWithoutExt + '.idx'
        infoFilePath = pathWithoutExt + '.info'
        # When the .info or .idx file is missing the song has not finished playing.
        if not ( os.path.exists( idxFilePath ) and os.path.exists( infoFilePath ) ):
            continue

        try:
            with open( idxFilePath, 'rb' ) as idxFile:
                idxJson = json.load( idxFile )
            fileSize = int( idxJson['size'] )
            # Second space-separated number of the first "zone" entry.
            # NOTE(review): presumably the last cached byte offset - confirm.
            zoneEnd = int( idxJson['zone'][0].split( ' ' )[1] )
        except (OSError, ValueError, KeyError, IndexError, TypeError, AttributeError):
            print( "Read idx file fail! path:" + idxFilePath )
            continue

        if fileSize > zoneEnd + 1:
            print( "File " + ucFilePath + " has not finished!")
            continue

        try:
            with open( infoFilePath, 'rb' ) as infoFile:
                infoJson = json.load( infoFile )
            musicFileFormat = infoJson['format']
        except (OSError, ValueError, KeyError, TypeError):
            print( "Read info file fail! path:" + infoFilePath )
            continue

        songInfo = GetSongInfo( songId, fileSize, musicFileFormat )
        # GetSongInfo returns None when the song page could not be scraped.
        if not songInfo or not songInfo["songId"]:
            continue

        SaveSongFile( ucFilePath, songInfo )
        SaveAlbumCover( songInfo )  # save the cover
        SaveLyric( songInfo )       # save the lyric



# Poll forever: run one export pass over the cache, then wait five minutes.
while True:
    print( "--------------------Start hack cloud music cache!------------------" )
    HackCloudMusicCache()
    time.sleep( 5 * 60 )

    原文作者:李之兴
    原文地址: https://zhuanlan.zhihu.com/p/35881988
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞