爬虫爬取千千音乐榜单音乐 - 千羽易行
最近做了一个爬取千千音乐的demo,免去了下载歌曲还要先安装对应客户端的麻烦。我刚开始接触爬虫,可能写得不太好,求别喷!话不多说,进入正题。
1.获取主页信息(获取各个榜单的url)
这里想说一下千千音乐的登录问题:可能是因为我在浏览器的其他地方登录了百度账号,点击退出之后它又会自动登录上。本来想通过代码登录来获取cookie等登录信息,但我懒得清除缓存了,
索性直接从抓包工具中把请求头全部复制过来,稍微修改一下:
# Fetch the home page and hand every chart linked from it to gotolist()
def gethomepage():
    """Download the music.taihe.com home page and process each chart it links to.

    The page is requested with headers copied from a browser capture. Every
    ``<h4 class="more-rank">`` heading is expected to contain a link to one
    chart; for each such link ``gotolist`` is called with the absolute chart
    URL, the shared headers and session, the link's ``title`` attribute and
    the last path segment of the link (used as the chart's short name).

    Headings without a usable link are skipped instead of aborting the run.

    Returns:
        None. All work happens through the calls to ``gotolist``.
    """
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the crawler forever.
    request_timeout = 30

    # One session is shared with gotolist() so the connection is reused.
    s = requests.Session()
    home_url = 'http://music.taihe.com/'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # NOTE(review): this cookie was copied from a logged-in browser
        # session and carries account credentials (BDUSS). It should not be
        # committed or published; load it from configuration instead.
        'Cookie': 'log_sid=1561218778562E9DB28E6A3CDA8ED552F27E3703A9AB4; BAIDUID=E9DB28E6A3CDA8ED552F27E3703A9AB4:FG=1; BDUSS=3AtOE5xTDJnOTBGb2h6UXVYVnZxTEl-Z2VKc0w2V0kyUVV6MmticWxmaHdlVEZkSUFBQUFBJCQAAAAAAAAAAAEAAADQRIc5uqO~3cqvwMMzNjUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHDsCV1w7Aldc; flash_tip_pop=true; tracesrc=-1%7C%7C-1; u_lo=0; u_id=; u_t=; u_login=1; userid=965166288; app_vip=show; Hm_lvt_d0ad46e4afeacf34cd12de4c9b553aa6=1561206432,1561209820; __qianqian_pop_tt=8; Hm_lpvt_d0ad46e4afeacf34cd12de4c9b553aa6=1561218967',
        # 'Host': 'music.taihe.com',
        'Referer': 'http://music.taihe.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }

    r = s.get(home_url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(r.text, 'lxml')

    # Links to the new-song, hot and network-song charts
    for heading in soup.find_all('h4', class_='more-rank'):
        link = heading.find('a')
        if link is None or not link.get('href'):
            # Heading without a chart link: nothing to crawl
            continue
        href = link['href']
        # Last path segment of the chart link, used as its short name
        entitle = href.split('/')[-1]
        # Fall back to the short name when the link has no title attribute
        title = link.get('title') or entitle
        # The href is joined to the site root to form the absolute chart URL
        bd_url = 'http://music.taihe.com' + href
        gotolist(bd_url, headers, s, title, entitle)
2.获取每个榜单中的每首歌曲的id
# Collect the song ids of one chart and pass them on as a comma-separated string
def gotolist(bd_url, headers, s, title, entitle):
    """Fetch one chart page, extract its song ids and forward them to getjson.

    Args:
        bd_url: Absolute URL of the chart page.
        headers: Request headers sent with the page request.
        s: Session object used to perform the request.
        title: Chart title, passed through to ``getjson`` unchanged.
        entitle: Short chart name, passed through to ``getjson`` unchanged.

    Each ``.song-item`` element is expected to contain
    ``<span class="song-title"><a href=".../<id>">``; the id is the last path
    segment of that link. Rows without such a link are skipped.

    Returns:
        None. The joined id string is handed to ``getjson``.
    """
    # Time out instead of hanging forever on a stalled connection (seconds)
    r = s.get(bd_url, headers=headers, timeout=30)
    r.encoding = 'utf8'
    soup = BeautifulSoup(r.text, 'lxml')

    song_ids = []
    for item in soup.select('.song-item'):
        # Search the already-parsed element directly; re-parsing its markup
        # for every row is unnecessary work.
        title_span = item.find('span', class_='song-title')
        link = title_span.find('a') if title_span is not None else None
        if link is None or not link.get('href'):
            continue
        song_id = link['href'].split('/')[-1]
        if song_id:
            song_ids.append(song_id)

    getjson(','.join(song_ids), title, entitle)
3.根据歌曲id获取每首歌曲的基本信息
# Request the basic information of the songs whose ids are in `num`.
# `num` (comma-separated song ids) and `headers` come from the enclosing scope.
json_url = 'http://play.taihe.com/data/music/songlink'
# Form payload for the song-link endpoint. Only 'songIds' is set from our own
# data; the remaining fields are fixed values sent with every request
# (presumably copied from the site's own request - TODO confirm).
formdata = {
    'songIds': num,
    'hq': '0',
    'type': 'm4a,mp3',
    'rate': '',
    'pt': '0',
    'flag': '-1',
    's2p': '-1',
    'prerate': '-1',
    'bwt': '-1',
    'dur': '-1',
    'bat': '-1',
    'bp': '-1',
    'pos': '-1',
    'auto': '-1',
}
# A timeout (seconds) keeps a stalled connection from blocking the crawler
# indefinitely.
r = requests.post(json_url, headers=headers, data=formdata, timeout=30)
# Keep the returned per-song records in a list
songlist = json.loads(r.text)['data']['songList']
4.遍历并下载歌曲
下面代码中 r = requests.get(music_url, timeout = 500) 这一行的
timeout = 500 一定要加上,数值(单位为秒)可以按实际情况填写。因为我下载的时候如果不加这个参数,下载到中途连接就会被服务器关闭,从而报错。
# Download every song in `songlist` into paihangbang/<entitle>/.
# `songlist` and `entitle` come from the enclosing scope.

# Characters that cannot appear in a file name on common platforms; a song or
# artist name containing one of them (e.g. '/') would otherwise point at a
# non-existent directory and the song would be lost.
unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

# The target directory is the same for every song, so create it once:
# parent directory plus one sub-directory per chart.
dirname = os.path.join('paihangbang', entitle)
os.makedirs(dirname, exist_ok=True)

# Walk the songs and look up each download/playback address
for song in songlist:
    try:
        music_url = song['linkinfo']['128']['songLink']
    except (KeyError, TypeError):
        # No '128' link in this record: skip the song rather than abort the
        # whole chart.
        print('skipped a song without a 128 link')
        continue
    print(music_url)

    # Files are named "<song name>-<artist name>.mp3"
    basename = str(song['songName']) + '-' + str(song['artistName'])
    filename = os.path.join(dirname, basename.translate(unsafe_chars) + '.mp3')

    try:
        # Explicit timeout (seconds); adjust as needed
        r = requests.get(music_url, timeout=500)
        with open(filename, 'wb') as fp:
            fp.write(r.content)
    except FileNotFoundError:
        print(filename + '未找到!')

    # Short pause between downloads
    time.sleep(1)
以上就是全部的代码,下载成功后的目录是这样的: