Spaces:
Sleeping
Sleeping
File size: 5,403 Bytes
dfd2747 59d0659 dfd2747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import bs4, re, time, os
from urllib.parse import quote
# from .jaro import jaro_distance
# from songscope import jaro_distance
from songscope import *
# Characters kept by filtr() when building URL slugs: lowercase ASCII letters
# and digits. Every other character is dropped from artist/title names.
letters = 'abcdefghijklmnopqrstuvwxyz0123456789'
def htmlFind(page):
    """Parse *page* as HTML and return the bound ``find`` method of the soup.

    *page* must expose its raw body as ``page.content`` with a ``decode()``
    method; the decoded markup is parsed with the "html.parser" backend.

    v3.0: the body is decoded from ``page.content`` instead of being read
    from ``page.text`` to support variant unicodes.
    """
    markup = page.content.decode()
    parsed = bs4.BeautifulSoup(markup, "html.parser")
    return parsed.find
def htmlFindAll(page):
    """Parse *page* as HTML and return the bound ``find_all`` method of the soup.

    *page* must expose its raw body as ``page.content`` with a ``decode()``
    method; the decoded markup is parsed with the "html.parser" backend.

    v3.0: the body is decoded from ``page.content`` instead of being read
    from ``page.text`` to support variant unicodes.
    """
    soup = bs4.BeautifulSoup(
        page.content.decode(),
        "html.parser"
    )
    # ``findAll`` is the legacy Beautiful Soup 3 spelling; ``find_all`` is the
    # supported name and accepts the same arguments.
    return soup.find_all
def filtr(inpt, isFile=False):
    """Strip unwanted characters from *inpt*.

    With ``isFile`` set, only the characters ``<>:"/\\|?*`` are removed and
    case is preserved. Otherwise every character is lowercased and kept only
    if its lowercase form occurs in the module-level ``letters`` string.
    """
    if not isFile:
        kept = []
        for char in inpt:
            lowered = char.lower()
            if lowered in letters:
                kept.append(lowered)
        return ''.join(kept)

    forbidden = r'<>:"/\|?*'
    return ''.join(char for char in inpt if char not in forbidden)
def normalGet(artist='', title='', _type=0):
    """Build an azlyrics.com URL directly from the artist and title names.

    Both names are first reduced with ``filtr``. When ``_type`` is falsy the
    lyrics-page URL for (artist, title) is returned. Otherwise the artist-page
    URL is printed and returned; it is filed under the first character of the
    filtered artist name, so an artist that filters down to an empty string
    raises IndexError.
    """
    art = filtr(artist)
    tit = filtr(title)
    if not _type:
        return 'https://www.azlyrics.com/lyrics/{}/{}.html'.format(art, tit)

    artist_url = 'https://www.azlyrics.com/{}/{}.html'.format(art[0], art)
    print(artist_url)
    return artist_url
def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0, proxies=None):
    """Look up an azlyrics.com URL through a web search engine.

    Args:
        srch_eng: Search engine name, 'google' or 'duckduckgo'. Any other
            value falls back to 'google'.
        acc: Minimum similarity; the ``jaro_distance`` score of the artist
            and of the title must each be >= this value.
        get_func: Callable invoked as ``get_func(url, proxies)``. Its return
            value must expose the response body as ``.text``.
        artist: Artist name (optional).
        title: Song title (optional).
        _type: Index of the URL pattern to look for: 0 matches a lyrics page
            (artist and title), 1 matches an artist page (artist only).
        proxies: Passed through to ``get_func``. Defaults to an empty dict.

    Returns:
        'https://www.' plus the first matching azlyrics URL, or 0 when the
        search found nothing or the match is not similar enough.
    """
    # Local import: the file only imports ``quote`` at module level.
    from urllib.parse import quote_plus

    if proxies is None:
        proxies = {}

    # quote_plus turns spaces into '+' and escapes everything else. Quoting a
    # string that already contains '+' would escape the separators to %2B.
    data = ' '.join(part for part in (artist, title) if part)
    encoded_data = quote_plus(data)

    # Perform a search (for accuracy) [Custom search engine]
    search_engines = {
        'google': 'https://www.google.com/search?q=',
        'duckduckgo': 'https://duckduckgo.com/html/?q='
    }
    slctd_srch_engn = srch_eng if srch_eng in search_engines else 'google'

    google_page = get_func(
        '{}{}+site%3Aazlyrics.com'.format(
            search_engines[slctd_srch_engn],
            encoded_data
        ),
        proxies
    )

    # Choose between lyrics or song according to function used
    regex = [
        r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+)\.html)',
        r'(azlyrics\.com\/[a-z0-9]+\/(\w+)\.html)'
    ]
    # ex result: [('azlyrics.com/t/taylorswift.html', 'taylorswift')]
    # result[0][0] = 'azlyrics.com/t/taylorswift.html'
    results = re.findall(regex[_type], google_page.text)

    if not results:
        print(srch_eng.title(), 'found nothing!')
        return 0

    best = results[0]
    # calculate jaro similarity for artist and title
    jaro_artist = 1.0
    jaro_title = 1.0
    if artist:
        jaro_artist = jaro_distance(artist.replace(' ', ''), best[1])
    # The artist-page pattern captures no title group, so there is nothing to
    # compare a title against; indexing best[2] there would raise IndexError.
    if title and len(best) > 2:
        jaro_title = jaro_distance(title.replace(' ', ''), best[2])

    if jaro_artist >= acc and jaro_title >= acc:
        return 'https://www.' + best[0]

    print('Similarity <', acc)
    return 0
# v3.0.5: Re-coded ParseLyrics to be more efficient
def parseLyric(page):
    """Return the text of the longest class-less <div> found in *page*.

    The page is searched for every <div> that has no ``class`` attribute and
    the longest text among them is returned (presumably the lyrics container;
    verify against the current azlyrics markup). Returns '' when the page
    contains no such div.
    """
    divs = [i.text for i in htmlFindAll(page)('div', {'class': None})]
    # default='' keeps max() from raising ValueError on a page with no match.
    return max(divs, key=len, default='')
def _absoluteSongUrl(href):
    """Turn a site-relative song href into an absolute azlyrics.com URL."""
    base = 'http://www.azlyrics.com'
    # Azlyrics puts hrefs with/without base url
    if href.startswith('/lyrics/'):
        return base + href
    if href.startswith('..'):
        # v3.0.1: fix relative urls -> absolute url ('../lyrics/..' form)
        return base + href[2:]
    return href


def parseSongs(page):
    """Collect the songs listed on an artist page.

    Returns a dict keyed by the link text of each song. Every value is a dict
    with the keys 'year', 'album', 'type' and 'url'. When the page has a
    ``<div id="listAlbum">``, the album info comes from the most recent album
    header div seen before the link. Otherwise the songs are read from the
    ``listalbum-item`` divs and the album fields are left empty. Links without
    an ``href`` are skipped.
    """
    songs = {}
    Parent = htmlFind(page)('div', {'id': 'listAlbum'})
    if Parent:
        curType, curName, curYear = '', '', ''
        for elmnt in Parent.find_all():
            # v3.0.3: Removed break after script due to google ads inside
            # listAlbum is using script tag, which results in not all songs
            # retrieved
            # album info are inside divs
            if elmnt.name == 'div':
                if elmnt.text == 'other songs:':
                    curType, curName, curYear = 'Others', '', ''
                else:
                    # Separating to (album, name, year)
                    rgx = re.findall(r'(.*):\s"(.*)"\s\(([0-9]+)\)', elmnt.text)
                    if rgx:
                        curType, curName, curYear = rgx[0]
            if elmnt.name == 'a':
                href = elmnt.get('href')
                if href is None:
                    # Anchor without a target cannot be a song link.
                    continue
                songs[elmnt.text] = {
                    'year': curYear,
                    'album': curName,
                    'type': curType,
                    'url': _absoluteSongUrl(href)
                }
    # v 3.0
    # Some artists have no albums, so we cover this
    else:
        for div in htmlFindAll(page)('div', {'class': 'listalbum-item'}):
            a = div.find('a')
            href = a.get('href') if a is not None else None
            if href is None:
                # Item without a usable link; nothing to record.
                continue
            songs[a.text] = {
                'year': '',
                'album': '',
                'type': '',
                'url': _absoluteSongUrl(href)
            }
    return songs