Spaces:

kmaurinjones
/

SongScope

Running

App Files Files Community

SongScope / tools.py

kmaurinjones

Update tools.py

59d0659 about 1 year ago

raw

history blame

No virus

5.4 kB

	import bs4, re, time, os
	from urllib.parse import quote
	# from .jaro import jaro_distance
	# from songscope import jaro_distance
	from songscope import *

	# Characters that filtr() keeps when normalising names: a-z followed by 0-9.
	letters = (
	    'abcdefghijklmnopqrstuvwxyz'
	    '0123456789'
	)

	def htmlFind(page):
	    """Parse *page* as HTML and return the soup's bound ``find`` method.

	    ``page`` must expose its body as ``page.content`` with a ``decode()``
	    method (presumably a requests-style response -- confirm against callers).
	    """
	    # v3.0: decode page.content rather than reading page.text, to support
	    # variant unicodes.
	    markup = page.content.decode()
	    return bs4.BeautifulSoup(markup, "html.parser").find

	def htmlFindAll(page):
	    """Parse *page* as HTML and return the soup's bound ``find_all`` method.

	    ``page`` must expose its body as ``page.content`` with a ``decode()``
	    method (presumably a requests-style response -- confirm against callers).

	    The returned callable takes the same arguments as BeautifulSoup's
	    ``find_all`` and returns every matching element.
	    """
	    # v3.0: decode page.content rather than reading page.text, to support
	    # variant unicodes.
	    soup = bs4.BeautifulSoup(
	        page.content.decode(),
	        "html.parser",
	    )
	    # ``findAll`` is only a deprecated alias of ``find_all``; return the
	    # supported name so callers do not trigger deprecation warnings.
	    return soup.find_all

	def filtr(inpt, isFile=False):
	    """Strip unwanted characters from *inpt*.

	    With ``isFile`` true, every character is kept except those listed in
	    the forbidden set below. Otherwise each character is lower-cased and
	    kept only when the lower-cased form occurs in the module-level
	    ``letters`` string.
	    """
	    if not isFile:
	        lowered = (ch.lower() for ch in inpt)
	        return ''.join(ch for ch in lowered if ch in letters)

	    forbidden = r'<>:"/\\|?*'
	    return ''.join(ch for ch in inpt if ch not in forbidden)

	def normalGet(artist='', title='', _type=0):
	    """Build an AZLyrics URL directly from *artist* and *title*.

	    Both names are normalised with ``filtr`` first. A falsy ``_type``
	    yields the lyrics-page URL; a truthy ``_type`` yields the artist-page
	    URL, which is also printed before being returned.

	    Raises IndexError when ``_type`` is truthy and the filtered artist is
	    empty, because the artist page is filed under the artist's first
	    character.
	    """
	    art = filtr(artist)
	    tit = filtr(title)

	    if not _type:
	        return 'https://www.azlyrics.com/lyrics/{}/{}.html'.format(art, tit)

	    artist_url = 'https://www.azlyrics.com/{}/{}.html'.format(art[0], art)
	    print(artist_url)
	    return artist_url

	def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0, proxies=None):
	    """Look up an AZLyrics URL through a web search limited to azlyrics.com.

	    Args:
	        srch_eng: Search engine key, 'google' or 'duckduckgo'. Any other
	            value falls back to 'google'.
	        acc: Minimum Jaro similarity (compared with ``>=``) that the artist
	            and title found in the result URL must reach.
	        get_func: Callable invoked as ``get_func(url, proxies)``; the object
	            it returns must expose the response body as ``.text``.
	        artist: Artist name to search for (may be empty).
	        title: Song title to search for (may be empty).
	        _type: Index of the URL pattern to look for: 0 for a lyrics page,
	            1 for an artist page.
	        proxies: Passed through to ``get_func``. Defaults to an empty dict.

	    Returns:
	        The first matching URL prefixed with 'https://www.', or 0 when the
	        search produced no match or the similarity is below ``acc``.
	    """
	    from urllib.parse import quote_plus

	    # Avoid sharing one mutable default dict between calls.
	    if proxies is None:
	        proxies = {}

	    # Join the non-empty parts with a single space, then encode the query.
	    # quote_plus turns spaces into '+' and escapes everything else; the
	    # previous quote(data.replace(' ', '+')) escaped the '+' itself to
	    # '%2B', so the engine received literal plus signs instead of spaces.
	    data = ' '.join(part for part in (artist, title) if part)
	    encoded_data = quote_plus(data)

	    # Perform a search (for accuracy) [Custom search engine]
	    search_engines = {
	        'google': 'https://www.google.com/search?q=',
	        'duckduckgo': 'https://duckduckgo.com/html/?q=',
	    }
	    slctd_srch_engn = srch_eng if srch_eng in search_engines else 'google'

	    google_page = get_func(
	        '{}{}+site%3Aazlyrics.com'.format(
	            search_engines[slctd_srch_engn],
	            encoded_data,
	        ),
	        proxies,
	    )

	    # Choose between lyrics or song according to function used.
	    # The dot before "html" is escaped so it only matches a literal '.'.
	    regex = [
	        r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+)\.html)',
	        r'(azlyrics\.com\/[a-z0-9]+\/(\w+)\.html)',
	    ]

	    # ex result: [('azlyrics.com/t/taylorswift.html', 'taylorswift')]
	    # result[0][0] = 'azlyrics.com/t/taylorswift.html'
	    results = re.findall(regex[_type], google_page.text)

	    if not results:
	        # Report the engine that was actually queried (after fallback).
	        print(slctd_srch_engn.title(), 'found nothing!')
	        return 0

	    first = results[0]

	    # Calculate jaro similarity for artist and title; a part that was not
	    # supplied counts as a perfect match.
	    jaro_artist = 1.0
	    jaro_title = 1.0

	    if artist:
	        jaro_artist = jaro_distance(artist.replace(' ', ''), first[1])
	    # The artist-page pattern captures no title group, so only compare the
	    # title when the match actually carries one.
	    if title and len(first) > 2:
	        jaro_title = jaro_distance(title.replace(' ', ''), first[2])

	    if jaro_artist >= acc and jaro_title >= acc:
	        return 'https://www.' + first[0]

	    print('Similarity <', acc)
	    return 0

	# v3.0.5: Re-coded ParseLyrics to be more efficient
	def parseLyric(page):
	    """Return the text of the longest class-less ``<div>`` found in *page*.

	    The divs are collected with ``htmlFindAll(page)('div', {'class': None})``
	    and the one with the longest text is returned. When the page has no
	    such div, an empty string is returned instead of letting ``max()``
	    raise ValueError on an empty sequence.
	    """
	    divs = [div.text for div in htmlFindAll(page)('div', {'class': None})]
	    return max(divs, key=len, default='')

	def _absoluteUrl(href):
	    """Return *href* as an absolute AZLyrics URL when it is a known relative form."""
	    base = 'http://www.azlyrics.com'
	    # Azlyrics puts hrefs with/without base url:
	    # '../lyrics/...' (v3.0.1: relative -> absolute) and '/lyrics/...'.
	    if href.startswith('..'):
	        return base + href[2:]
	    if href.startswith('/lyrics/'):
	        return base + href.strip('.')
	    return href

	def parseSongs(page):
	    """Collect the songs listed on an artist page.

	    Returns a dict keyed by the anchor text of each song link. Every value
	    is a dict with the keys 'year', 'album', 'type' and 'url'. When the
	    page has a ``<div id="listAlbum">``, album information is taken from
	    the most recent album header div that precedes the link. Otherwise the
	    ``listalbum-item`` divs are used and the album fields are left empty.
	    """
	    songs = {}
	    parent = htmlFind(page)('div', {'id': 'listAlbum'})

	    if not parent:
	        # v 3.0
	        # Some artists have no albums, so we cover this
	        for div in htmlFindAll(page)('div', {'class': 'listalbum-item'}):
	            a = div.find('a')
	            # Skip items without a usable link instead of crashing on them.
	            if a is None or a.get('href') is None:
	                continue
	            songs[a.text] = {
	                'year': '',
	                'album': '',
	                'type': '',
	                'url': _absoluteUrl(a['href']),
	            }
	        return songs

	    # Separating to (album, name, year), e.g. 'album: "Name" (2008)'.
	    # The groups must be '(.*)': a bare '(.)' only matches one-character
	    # names, so album info was never captured.
	    album_rgx = re.compile(r'(.*):\s"(.*)"\s\(([0-9]+)\)')

	    curType, curName, curYear = '', '', ''

	    # v3.0.3: do not stop at <script> tags; google ads inside listAlbum use
	    # them, and breaking there meant not all songs were retrieved.
	    for elmnt in parent.find_all():
	        # album info are inside divs
	        if elmnt.name == 'div':
	            if elmnt.text == 'other songs:':
	                curType, curName, curYear = 'Others', '', ''
	            else:
	                found = album_rgx.search(elmnt.text)
	                if found:
	                    curType, curName, curYear = found.groups()
	        elif elmnt.name == 'a':
	            href = elmnt.get('href')
	            # Anchors without an href cannot be song links; skip them
	            # rather than raising KeyError.
	            if href is None:
	                continue
	            songs[elmnt.text] = {
	                'year': curYear,
	                'album': curName,
	                'type': curType,
	                'url': _absoluteUrl(href),
	            }
	    return songs