Spaces:
Running
Running
import bs4, re, time, os | |
from urllib.parse import quote | |
# from .jaro import jaro_distance | |
# from songscope import jaro_distance | |
from songscope import * | |
# Characters that filtr() keeps when normalising a name for use in a URL:
# the lowercase ASCII letters followed by the decimal digits.
letters = 'abcdefghijklmnopqrstuvwxyz' + '0123456789'
def htmlFind(page):
    """Parse a response body and return the parsed document's ``find`` method.

    ``page`` must expose the raw body as bytes through ``page.content``.
    The returned bound method can be called directly, e.g.
    ``htmlFind(page)('div', {'id': 'listAlbum'})``.
    """
    # v3.0
    # The bytes are decoded here (rather than reading page.text) to support
    # variant unicodes.
    markup = page.content.decode()
    document = bs4.BeautifulSoup(markup, "html.parser")
    return document.find
def htmlFindAll(page):
    """Parse a response body and return the parsed document's ``find_all`` method.

    ``page`` must expose the raw body as bytes through ``page.content``.
    The returned bound method can be called directly, e.g.
    ``htmlFindAll(page)('div', {'class': None})``.
    """
    # v3.0
    # Changed page.text -> page.content.decode() to support variant unicodes
    soup = bs4.BeautifulSoup(
        page.content.decode(),
        "html.parser"
    )
    # ``findAll`` is the legacy camelCase alias of ``find_all``; return the
    # current name so that callers do not go through the deprecated alias.
    return soup.find_all
def filtr(inpt, isFile=False):
    """Strip unwanted characters from ``inpt``.

    With ``isFile`` truthy, only the characters ``<>:"/\\|?*`` are removed and
    the case is left untouched. Otherwise every character is lowercased and
    kept only if it appears in the module-level ``letters`` string.
    """
    if not isFile:
        lowered = (ch.lower() for ch in inpt)
        return ''.join(ch for ch in lowered if ch in letters)
    forbidden = r'<>:"/\|?*'
    return ''.join(ch for ch in inpt if ch not in forbidden)
def normalGet(artist='', title='', _type=0):
    """Build an AZLyrics URL directly from an artist name and a song title.

    Both values are normalised with ``filtr`` first. When ``_type`` is truthy
    the artist-page URL is built from the first character of the normalised
    artist name, printed, and returned; otherwise the lyrics-page URL for the
    artist/title pair is returned.
    """
    artist_slug = filtr(artist)
    title_slug = filtr(title)
    if not _type:
        return 'https://www.azlyrics.com/lyrics/{}/{}.html'.format(
            artist_slug, title_slug
        )
    artist_url = 'https://www.azlyrics.com/{}/{}.html'.format(
        artist_slug[0], artist_slug
    )
    print(artist_url)
    return artist_url
def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0, proxies=None):
    """Look up an AZLyrics URL through a web search engine.

    Parameters:
        srch_eng: 'google' or 'duckduckgo'; any other value falls back to
            'google'.
        acc: minimum similarity (compared with ``>=``) that the artist and
            the title must each reach for the first hit to be accepted.
        get_func: callable invoked as ``get_func(url, proxies)``; its result
            must expose the response body as ``.text``.
        artist, title: search terms; either may be empty.
        _type: falsy to look for a lyrics page, truthy to look for an
            artist page.
        proxies: passed through to ``get_func``; defaults to an empty dict.

    Returns:
        The matched URL prefixed with 'https://www.', or 0 when the search
        returned no AZLyrics link or the first hit is not similar enough.
    """
    # Local import: the module only imports ``quote`` at file level.
    from urllib.parse import quote_plus

    # Avoid sharing one mutable default dict between calls.
    if proxies is None:
        proxies = {}

    # Join artist and title with a single space when both are present, then
    # form-encode the query. quote_plus() turns spaces into '+'; the previous
    # quote(data.replace(' ', '+')) escaped those '+' signs to '%2B', so the
    # engine was asked to search for literal plus characters.
    data = ' '.join(part for part in (artist, title) if part)
    encoded_data = quote_plus(data)

    # Perform a search (for accuracy) [Custom search engine]
    search_engines = {
        'google': 'https://www.google.com/search?q=',
        'duckduckgo': 'https://duckduckgo.com/html/?q='
    }
    slctd_srch_engn = srch_eng if srch_eng in search_engines else 'google'
    google_page = get_func(
        '{}{}+site%3Aazlyrics.com'.format(
            search_engines[slctd_srch_engn],
            encoded_data
        ),
        proxies
    )

    # Choose between lyrics or song according to function used.
    # Index 0 captures (url, artist, title); index 1 captures (url, artist).
    regex = [
        r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+)\.html)',
        r'(azlyrics\.com\/[a-z0-9]+\/(\w+)\.html)'
    ]
    # ex result: [('azlyrics.com/t/taylorswift.html', 'taylorswift')]
    # result[0][0] = 'azlyrics.com/t/taylorswift.html'
    # Any truthy _type selects the artist pattern instead of indexing past
    # the end of the list.
    results = re.findall(
        regex[1 if _type else 0],
        google_page.text
    )
    if not results:
        # Report the engine that was actually queried.
        print(slctd_srch_engn.title(), 'found nothing!')
        return 0

    best = results[0]
    # Calculate jaro similarity for artist and title. ``jaro_distance`` is
    # expected to come from the module-level ``from songscope import *``.
    jaro_artist = 1.0
    jaro_title = 1.0
    if artist:
        jaro_artist = jaro_distance(
            artist.replace(' ', ''),
            best[1]
        )
    # Artist-page matches carry no title group, so there is nothing to
    # compare the title against (indexing best[2] used to raise IndexError).
    if title and len(best) > 2:
        jaro_title = jaro_distance(
            title.replace(' ', ''),
            best[2]
        )
    if jaro_artist >= acc and jaro_title >= acc:
        return 'https://www.' + best[0]
    print('Similarity <', acc)
    return 0
# v3.0.5: Re-coded parseLyric to be more efficient
def parseLyric(page):
    """Return the text of the longest class-less ``<div>`` on ``page``.

    NOTE(review): presumably the lyrics on an AZLyrics song page live in the
    largest ``<div>`` without a ``class`` attribute - confirm against the
    current page layout.

    Returns an empty string when the page contains no such ``<div>``
    (previously ``max()`` raised ValueError on the empty list).
    """
    divs = [div.text for div in htmlFindAll(page)('div', {'class': None})]
    return max(divs, key=len, default='')
def _absoluteSongUrl(href):
    """Return ``href`` as an absolute AZLyrics URL when it is site-relative."""
    base = 'http://www.azlyrics.com'
    # Azlyrics puts hrefs with/without base url
    if href.startswith('..'):
        # '../lyrics/artist/song.html' -> base + '/lyrics/artist/song.html'
        return base + href[2:]
    if href.startswith('/lyrics/'):
        return base + href
    return href


def parseSongs(page):
    """Collect the songs listed on an AZLyrics artist page.

    Returns a dict keyed by the link text of each song. Every value is a dict
    with the keys 'year', 'album', 'type' and 'url'. Songs that share the
    same link text overwrite each other (the last one wins).

    When the page has a ``<div id="listAlbum">``, album information is taken
    from the ``<div>`` elements that precede each song link. Otherwise the
    links inside ``<div class="listalbum-item">`` elements are used and the
    album fields are left empty.
    """
    songs = {}
    album_list = htmlFind(page)('div', {'id': 'listAlbum'})
    if album_list is not None:
        curType, curName, curYear = '', '', ''
        # Walk every descendant tag in document order.
        for elmnt in album_list.find_all(True):
            # v3.0.3: Removed break after script due to google ads inside
            # listAlbum is using script tag, which results in not all songs
            # retrieved
            # album info are inside divs
            if elmnt.name == 'div':
                if elmnt.text == 'other songs:':
                    curType, curName, curYear = 'Others', '', ''
                else:
                    # Separating to (album, name, year)
                    rgx = re.findall(
                        r'(.*):\s"(.*)"\s\(([0-9]+)\)', elmnt.text
                    )
                    if rgx:
                        curType, curName, curYear = rgx[0]
            elif elmnt.name == 'a':
                href = elmnt.get('href')
                if href is None:
                    # Anchors without href are not song links; indexing
                    # them used to raise KeyError.
                    continue
                songs[elmnt.text] = {
                    'year': curYear,
                    'album': curName,
                    'type': curType,
                    'url': _absoluteSongUrl(href)
                }
    # v 3.0
    # Some artists have no albums, so we cover this
    else:
        for div in htmlFindAll(page)('div', {'class': 'listalbum-item'}):
            a = div.find('a')
            href = a.get('href') if a is not None else None
            if href is None:
                # Item without a usable link: skip instead of crashing.
                continue
            songs[a.text] = {
                'year': '',
                'album': '',
                'type': '',
                # v3.0.1: fix relative urls -> absolute url
                'url': _absoluteSongUrl(href)
            }
    return songs