# Lazy — posting the source code directly.
# The output still looks a bit odd; haven't bothered to clean it up with regular expressions.
import requests
from bs4 import BeautifulSoup as bs
import re
import time
# Scrape AZLyrics: read an artist name from stdin, collect the song links
# from the artist's page, then download each song's lyrics and append them
# to a local text file.  AZLyrics drops the connection after roughly 30
# rapid requests, so only BATCH_SIZE songs are fetched per run.

ROOT_ADDRESS = "https://www.azlyrics.com/"
BATCH_SIZE = 25          # stay safely under the ~30-requests-per-session limit
LYRICS_DIV_INDEX = 9     # the lyrics appear to live in the 10th <div> of a song page — fragile, site-layout dependent

# hrefs on the artist page look like "../lyrics/<artist>/<song>.html"
# (dots escaped so "." is literal, unlike the original unanchored pattern)
SONG_LINK_PATTERN = re.compile(r"\.\./lyrics/.*")
# The lyric text sits between the licensing HTML comment ("-->") and the
# closing "</div><br/><br/>" once newlines are stripped from the page.
LYRICS_PATTERN = re.compile(r"(?<=-->).*(?=</div><br/><br/>)")


def fetch_song_urls(artist_name):
    """Return the absolute lyric-page URLs listed on the artist's AZLyrics page.

    The artist page URL is built AZLyrics-style: spaces removed, lowercased,
    grouped under its first letter (e.g. "Foo Bar" -> /f/foobar.html).
    """
    slug = artist_name.replace(" ", "").lower()
    page_url = f"{ROOT_ADDRESS}{slug[0]}/{slug}.html"
    response = requests.get(page_url)
    soup = bs(response.content, "html.parser")
    urls = []
    for link in soup.find_all("a"):
        match = SONG_LINK_PATTERN.match(str(link.get("href")))
        if match is not None:
            # Drop the leading "../" and re-anchor the link at the site root.
            urls.append(ROOT_ADDRESS + match.group(0)[3:])
    return urls


def extract_lyrics(div_html):
    """Return the lyrics from a song page's lyrics <div>, or None if absent.

    Newlines are flattened first (the pattern has no DOTALL flag), then the
    <br/> separators are rewritten as ", " so each song is a single line.
    """
    flattened = div_html.replace("\n", "")
    found = LYRICS_PATTERN.search(flattened)
    if found is None:
        return None
    return found.group(0).replace("<br/><br/>", ", ").replace("<br/>", ", ")


def download_lyrics(song_urls, batch_size=BATCH_SIZE):
    """Download up to batch_size songs and append their lyrics to the output file.

    Slicing (rather than indexing by position) avoids an IndexError when the
    artist has fewer songs than batch_size.
    """
    with open("./python mma 歌词/lyrics.txt", "a+", encoding="utf-8") as outfile:
        for index, url in enumerate(song_urls[:batch_size]):
            response = requests.get(url)
            time.sleep(1)  # be polite: rapid-fire requests get the connection closed
            soup = bs(response.content, "html.parser")
            div_html = str(soup.find_all("div")[LYRICS_DIV_INDEX])
            lyrics = extract_lyrics(div_html)
            if lyrics is not None:
                outfile.write(lyrics + "\n")
                print("Success to save #" + str(index))


def main():
    """Entry point: read the artist name from stdin and save one batch of lyrics."""
    singer_name = input()
    download_lyrics(fetch_song_urls(singer_name))


if __name__ == "__main__":
    main()
# This site only allows about 30 requests per session — any more and it closes
# the connection automatically — so a counter is used to cap how many songs
# are downloaded per connection.