3个晚上, 用宝宝睡着之后的时间, 终于搞定了, 也学习了Python.
不得不说, Python用起来真爽.
我一直是网易云音乐的会员, 听到好听的歌就要保存下来. 但有的歌只能听, 不能下载, 或者到了月末500首歌的下载额度用完了, 有强迫症的我就很不爽.
下面代码中主要的点:
- 缓存目录下的.idx 里有文件大小和当前缓存进度, 可以判断是不是缓存完了.
- 缓存目录下的.info文件里有文件格式.
- 缓存目录下的.uc文件是加密了的音乐文件, 解密方式是把每个字节和 0xa3 做异或(XOR)运算.
- songId最重要, 在文件名里解析出来.
- 通过songId在网站上爬歌曲信息. 网易做了反爬处理, 需要用到selenium+firefox, 再转到一个frame里去解析.
- 歌曲信息里有专辑封面图片的链接, 直接下载.
- 通过网易的API获取歌词.
之前爬过一个网站上所有影片信息, 其实比这个简单得多. 这次主要是应对网易的反爬机制比较麻烦.
import requests
import os
import json
from bs4 import BeautifulSoup
import socket
import urllib
from selenium import webdriver
import re
import time
# Root directory under which decoded songs, album covers and lyrics are written.
songStorageDir = "d:/0-music"
# Path fragment appended to songStorageDir; holds the per-song metadata JSON cache.
songIdDir = "/`````songid"
# Directory that is scanned for the client's .uc/.idx/.info cache files.
# Built from the LOCALAPPDATA environment variable, so a KeyError is raised
# here if that variable is not set.
cloudMusicCacheDir = os.environ['LOCALAPPDATA'] + "\\Netease\\CloudMusic\\Cache\\Cache"
# Lyric directory of the client; not referenced by any code visible in this file.
cloudMusicLyricDir = os.environ['LOCALAPPDATA'] + "\\Netease\\CloudMusic\\webdata\\lyric"
def GetSongInfo( songId, fileSize, format ):
    """Return the metadata dict for a song, preferring the on-disk JSON cache.

    The cache file is songStorageDir + songIdDir + "/" + songId + ".json".
    If it exists, parses, and holds a truthy "songId" entry, it is returned
    as-is.  Otherwise the song page is loaded in a headless Firefox driven
    by Selenium, the JSON found in the second <script> of the page head
    (inside the 'g_iframe' frame) is parsed, enriched and written to the
    cache file.

    Args:
        songId: Song id as a string (it is concatenated into paths and URLs).
        fileSize: Size of the cached music file; stored under "fileSize".
        format: Music file format; stored under "format".

    Returns:
        The song info dict, with "songId", "format", "fileSize", "artist"
        and "album" added, or None when the 'g_iframe' frame cannot be
        entered.

    Raises:
        IndexError / KeyError: if the scraped "description" does not have
            the "<label>:<artist>。<label>:<album>。" layout this parser
            expects.
    """
    songInfoFilePath = songStorageDir + songIdDir + "/" + songId + ".json"
    if os.path.isfile( songInfoFilePath ):
        try:
            # 'with' closes the handle even when json.load raises.
            with open( songInfoFilePath, "rb" ) as songInfoFile:
                songInfo = json.load( songInfoFile )
            if songInfo["songId"]:
                return songInfo
        except ( OSError, ValueError, KeyError, TypeError ):
            # Unreadable or malformed cache entry: fall through and re-scrape.
            print( "Load song info fail! path:" + songInfoFilePath )

    # NOTE(review): set_headless(), firefox_options= and
    # find_element_by_xpath() are kept exactly as the original called them;
    # confirm they exist in the installed Selenium version.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.set_headless()
    browser = webdriver.Firefox( firefox_options=fireFoxOptions )
    songUrl = "http://music.163.com/song?id=" + songId
    try:
        browser.get( songUrl )
        try:
            # The real song information lives inside this frame.
            browser.switch_to.frame( 'g_iframe' )
        except Exception:
            print( "switch to frame fail! song url:" + songUrl )
            return None
        songInfoJsonStr = browser.find_element_by_xpath( "/html/head/script[2]" ).get_attribute( "innerHTML" )
    finally:
        # Always release the browser process, including on the early return
        # above and on any exception raised while scraping.
        browser.quit()

    songInfo = json.loads( songInfoJsonStr )
    songInfo["songId"] = songId
    songInfo["format"] = format
    songInfo["fileSize"] = fileSize
    descriptionParts = songInfo["description"].split( '。' )
    songInfo["artist"] = descriptionParts[0].split( ':' )[1]
    songInfo["album"] = descriptionParts[1].split( ':' )[1]

    # Persist the result so the next scan does not have to start a browser.
    os.makedirs( os.path.dirname( songInfoFilePath ), exist_ok=True )
    with open( songInfoFilePath, "w", encoding='utf-8' ) as f:
        json.dump( songInfo, f )
    return songInfo
def DecodeCloudMusicCacheFile( cacheFilePath, storageFilePath ):
    """Decode a cache file by XOR-ing every byte with 0xa3.

    The whole input file is read into memory, transformed, and written to
    storageFilePath (created or truncated).

    Args:
        cacheFilePath: Path of the encoded input file.
        storageFilePath: Path the decoded bytes are written to.

    Raises:
        OSError: if either file cannot be opened, read or written.
    """
    # A 256-entry lookup table lets bytes.translate() apply the XOR in one
    # C-level pass instead of a Python loop over every byte.
    xorTable = bytes( value ^ 0xa3 for value in range( 256 ) )
    with open( cacheFilePath, 'rb' ) as cacheFile:
        encoded = cacheFile.read()
    with open( storageFilePath, 'wb' ) as out:
        out.write( encoded.translate( xorTable ) )
def GetSongDir( songInfo ):
    """Return (and create if needed) the directory for a song's album.

    The directory is songStorageDir + "/" + artist + "/" + album + "/",
    where each of the characters \\ / : * ? " < > | in the artist and album
    names is replaced by '-'.

    Args:
        songInfo: Dict with string entries "album" and "artist".

    Returns:
        The directory path, ending with "/".
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence "\/", which Python warns about; the compiled pattern is the same.
    illegalChars = r'[\\/:*?"<>|]'
    album = re.sub( illegalChars, '-', songInfo["album"] )
    artist = re.sub( illegalChars, '-', songInfo["artist"] )
    songDir = songStorageDir + "/" + artist + "/" + album + "/"
    os.makedirs( songDir, mode=0o777, exist_ok=True )
    return songDir
def GetSongFilePathWithoutExt( songInfo ):
    """Return the song's storage path without a file extension.

    The path is the directory returned by GetSongDir( songInfo ) followed by
    the song title, with each of the characters \\ / : * ? " < > | in the
    title replaced by '-'.

    Args:
        songInfo: Dict with a string "title" entry, plus whatever
            GetSongDir reads from it.

    Returns:
        Directory plus sanitized title, with no extension appended.
    """
    songTitle = songInfo["title"]
    songDir = GetSongDir( songInfo )
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence "\/", which Python warns about; the compiled pattern is the same.
    fileName = re.sub( r'[\\/:*?"<>|]', '-', songTitle )
    return songDir + fileName
def SaveSongFile( ucFilePath, songInfo ):
    """Decode a cached song into the music library unless it is already there.

    The target path is GetSongFilePathWithoutExt( songInfo ) + "." +
    songInfo["format"].  Decoding is skipped when that file already exists
    and is at least int( songInfo["fileSize"] ) bytes long.

    Args:
        ucFilePath: Path of the encoded cache file.
        songInfo: Dict providing "format" and "fileSize".

    Returns:
        The target file path, whether or not it was (re)written.
    """
    basePath = GetSongFilePathWithoutExt( songInfo )
    targetPath = "".join( ( basePath, ".", songInfo["format"] ) )

    # Already present with an acceptable size: nothing to do.
    alreadySaved = (
        os.path.isfile( targetPath )
        and os.path.getsize( targetPath ) >= int( songInfo["fileSize"] )
    )
    if not alreadySaved:
        DecodeCloudMusicCacheFile( ucFilePath, targetPath )
        print( "Save song to :" + targetPath )
    return targetPath
def SaveAlbumCover( songInfo ):
    """Download the album cover to "album.jpg" in the song's directory.

    Nothing is done when the cover file already exists.  Download failures
    are reported on stdout and otherwise ignored (best effort).

    Args:
        songInfo: Dict whose "images" entry is a list with the cover URL as
            its first element, plus whatever GetSongDir reads from it.
    """
    # `import urllib` alone does not load these submodules; import them
    # explicitly so the attribute lookups below cannot fail.
    import urllib.error
    import urllib.request

    coverFilePath = GetSongDir( songInfo ) + "album.jpg"
    if os.path.isfile( coverFilePath ):
        return

    try:
        imgUrl = songInfo["images"][0]
    except ( KeyError, IndexError, TypeError ):
        print( "img get fail! no image url in song info" )
        return

    try:
        urllib.request.urlretrieve( imgUrl, coverFilePath )
    except urllib.error.HTTPError as err:
        print( "img get fail! url:\t" + imgUrl )
        print( err )
    except Exception:
        print( "img get fail unknown except! url:\t" + imgUrl )
    else:
        # Report success only when the download actually completed.
        print( "Save album cover to:" + coverFilePath )
def SaveLyric( songInfo ):
    """Fetch the song's lyric JSON and write the lyric text to a .lrc file.

    The raw API response is kept next to the song as "<song>.lyric.json"
    and is only downloaded when that file does not exist yet.  The value at
    ["lrc"]["lyric"] is then written, UTF-8 encoded, to "<song>.lrc".
    Failures are reported on stdout and the function returns without
    writing the .lrc file.

    Args:
        songInfo: Dict with a string "songId" entry, plus whatever
            GetSongFilePathWithoutExt reads from it.
    """
    # `import urllib` alone does not load the request submodule.
    import urllib.request

    basePath = GetSongFilePathWithoutExt( songInfo )
    lyricJsonFilePath = basePath + ".lyric.json"
    if not os.path.exists( lyricJsonFilePath ):
        try:
            songId = songInfo["songId"]
            lyricUrl = "http://music.163.com/api/song/lyric?os=pc&id=" + songId + "&lv=-1&kv=-1&tv=-1"
            urllib.request.urlretrieve( lyricUrl, lyricJsonFilePath )
        except Exception:
            print( "Save lyric fail! song info:" + json.dumps( songInfo ) )
            # Without the downloaded file there is nothing to parse below.
            return

    try:
        with open( lyricJsonFilePath, "rb" ) as lyricJsonFile:
            lyricJson = json.load( lyricJsonFile )
    except ( OSError, ValueError ):
        print( "Load lyric json fail! path:" + lyricJsonFilePath )
        return

    lrc = lyricJson.get( "lrc" ) if isinstance( lyricJson, dict ) else None
    if not isinstance( lrc, dict ) or "lyric" not in lrc:
        return
    lyric = lrc["lyric"]
    if not isinstance( lyric, str ):
        # e.g. a JSON null lyric: nothing to write.
        return

    # Turn literal backslash-n sequences into real line breaks.
    lyric = lyric.replace( "\\n", "\n" )
    with open( basePath + ".lrc", "w", encoding='utf-8' ) as lrcFile:
        lrcFile.write( lyric )
def HackCloudMusicCache():
    """Scan the cache directory once and export every fully cached song.

    For each "*.uc" file in cloudMusicCacheDir that has sibling ".idx" and
    ".info" files, the idx file is used to decide whether caching has
    finished (size <= end of the first zone + 1).  Finished songs are
    looked up with GetSongInfo and then passed to SaveSongFile,
    SaveAlbumCover and SaveLyric.  Files whose idx/info metadata cannot be
    read are reported on stdout and skipped so one bad entry does not abort
    the scan.
    """
    ucSuffix = '.uc'
    for fileName in os.listdir( cloudMusicCacheDir ):
        if not fileName.endswith( ucSuffix ):
            continue
        ucFilePath = cloudMusicCacheDir + "\\" + fileName
        if not os.path.isfile( ucFilePath ):
            continue

        # The first '-'-separated part of the file name is the song id.
        songId = fileName.split( "-" )[0]

        # Swap only the trailing extension; str.replace would also rewrite
        # any other ".uc" occurring earlier in the path.
        basePath = ucFilePath[:-len( ucSuffix )]
        idxFilePath = basePath + '.idx'
        infoFilePath = basePath + '.info'
        # While the info or idx file is missing the song has not been
        # played to the end yet.
        if not ( os.path.exists( idxFilePath ) and os.path.exists( infoFilePath ) ):
            continue

        try:
            with open( idxFilePath, 'rb' ) as idxFile:
                idxJson = json.load( idxFile )
            fileSize = int( idxJson['size'] )
            zoneEnd = int( idxJson['zone'][0].split( ' ' )[1] )
            if fileSize > zoneEnd + 1:
                print( "File " + ucFilePath + " has not finished!" )
                continue
            with open( infoFilePath, 'rb' ) as infoFile:
                infoJson = json.load( infoFile )
            musicFileFormat = infoJson['format']
        except ( OSError, ValueError, KeyError, IndexError, TypeError, AttributeError ):
            print( "Read cache metadata fail! file:" + ucFilePath )
            continue

        songInfo = GetSongInfo( songId, fileSize, musicFileFormat )
        # GetSongInfo returns None when the song page could not be scraped.
        if not songInfo or not songInfo.get( "songId" ):
            continue

        SaveSongFile( ucFilePath, songInfo )
        SaveAlbumCover( songInfo )  # save the album cover
        SaveLyric( songInfo )       # save the lyrics
# Rescan the cache forever, pausing five minutes between passes.
while True:
    print( "--------------------Start hack cloud music cache!------------------" )
    HackCloudMusicCache()
    time.sleep( 5 * 60 )