"""Helpers for locating and scraping artist and lyric pages.

The functions here build relative page paths, look pages up through a search
engine, and extract lyrics / song listings from fetched pages.
"""

import os
import re
import time
from urllib.parse import quote, quote_plus

import bs4

# from .jaro import jaro_distance
from songscope import jaro_distance

# Characters kept by filtr() when normalising artist / title names.
letters = 'abcdefghijklmnopqrstuvwxyz0123456789'

# Characters dropped by filtr(..., isFile=True).
_FORBIDDEN_FILENAME_CHARS = r'<>:"/\|?*'

# NOTE(review): the prefix put in front of relative song hrefs is an empty
# string in this file, so "absolute" URLs are currently just the trimmed
# href. Presumably the site's base URL belongs here -- confirm.
_BASE_URL = ''

# Search-result URL patterns, indexed by googleGet()'s `_type`:
#   0 -> lyrics page, groups: (matched url, artist, title)
#   1 -> artist page, groups: (matched url, artist)
_RESULT_PATTERNS = (
    re.compile(r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+)\.html)'),
    re.compile(r'(azlyrics\.com\/[a-z0-9]+\/(\w+)\.html)'),
)

# Album header text, groups: (type, name, year).
_ALBUM_INFO = re.compile(r'(.*):\s"(.*)"\s\(([0-9]+)\)')


def htmlFind(page):
    """Parse a fetched page and return the soup's bound ``find`` method.

    ``page`` must expose the raw body as bytes in ``page.content``.
    """
    # v3.0
    # Changed page.text -> page.content.decode() to support variant unicodes
    soup = bs4.BeautifulSoup(page.content.decode(), "html.parser")
    return soup.find


def htmlFindAll(page):
    """Parse a fetched page and return the soup's bound ``find_all`` method.

    ``page`` must expose the raw body as bytes in ``page.content``.
    """
    # v3.0
    # Changed page.text -> page.content.decode() to support variant unicodes
    soup = bs4.BeautifulSoup(page.content.decode(), "html.parser")
    return soup.find_all


def filtr(inpt, isFile=False):
    """Strip unwanted characters from ``inpt``.

    With ``isFile`` true, only the characters ``<>:"/\\|?*`` are removed and
    case is preserved. Otherwise the text is lower-cased and everything that
    is not in ``letters`` (a-z, 0-9) is dropped.
    """
    if isFile:
        return ''.join(
            ch for ch in inpt if ch not in _FORBIDDEN_FILENAME_CHARS
        )
    lowered = (ch.lower() for ch in inpt)
    return ''.join(ch for ch in lowered if ch in letters)


def normalGet(artist='', title='', _type=0):
    """Build the relative page path for an artist or a song.

    Args:
        artist: Artist name; normalised with filtr().
        title: Song title; normalised with filtr().
        _type: Falsy -> ``'<artist>/<title>.html'``.
            Truthy -> ``'<first char of artist>/<artist>.html'``; this path
            is also printed.

    Raises:
        IndexError: If ``_type`` is truthy and the normalised artist is
            empty.
    """
    art, tit = filtr(artist), filtr(title)
    if _type:
        path = '{}/{}.html'.format(art[0], art)
        print(path)
        return path
    return '{}/{}.html'.format(art, tit)


def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0,
              proxies=None):
    """Find a page URL through a search engine and sanity-check the hit.

    The first URL in the search response that matches the pattern selected
    by ``_type`` is compared against the requested artist / title using
    jaro_distance(); it is accepted only if every compared score is at
    least ``acc``.

    Args:
        srch_eng: Search engine key; unknown keys fall back to ``'google'``.
        acc: Minimum similarity score required to accept the hit.
        get_func: Callable invoked as ``get_func(url, proxies)``; its result
            must expose the response body as ``.text``.
        artist: Artist name (may be empty).
        title: Song title (may be empty).
        _type: 0 to look for a lyrics page, 1 for an artist page.
        proxies: Passed through to ``get_func``; defaults to an empty dict.

    Returns:
        ``'https://www.' + <matched url>`` on success, otherwise ``0``
        (after printing the reason).
    """
    if proxies is None:
        # Fresh dict per call instead of a shared mutable default.
        proxies = {}

    # Encode artist and title to avoid url encoding errors.
    # quote_plus() turns spaces into '+' and escapes everything else;
    # quoting a string that already contains '+' would emit '%2B' instead.
    separator = ' ' if artist and title else ''
    encoded_data = quote_plus(artist + separator + title)

    # Perform a search (for accuracy) [Custom search engine]
    # NOTE(review): both endpoint prefixes are empty strings in this file,
    # so the request target is just the encoded query. Presumably the search
    # URLs belong here -- confirm.
    search_engines = {
        'google': '',
        'duckduckgo': '',
    }
    engine = srch_eng if srch_eng in search_engines else 'google'

    search_page = get_func(
        '{}{}'.format(search_engines[engine], encoded_data),
        proxies,
    )

    # Choose between lyrics or song according to function used
    results = _RESULT_PATTERNS[_type].findall(search_page.text)
    if not results:
        print(srch_eng.title(), 'found nothing!')
        return 0

    # Each result is a tuple: (matched url, artist[, title])
    match = results[0]

    # calculate jaro similarity for artist and title
    jaro_artist = 1.0
    jaro_title = 1.0
    if artist:
        jaro_artist = jaro_distance(artist.replace(' ', ''), match[1])
    # The artist-page pattern captures no title group, so only compare the
    # title when the match actually carries one.
    if title and len(match) > 2:
        jaro_title = jaro_distance(title.replace(' ', ''), match[2])

    if jaro_artist >= acc and jaro_title >= acc:
        return 'https://www.' + match[0]

    print('Similarity <', acc)
    return 0


# v3.0.5: Re-coded ParseLyrics to be more efficient
def parseLyric(page):
    """Return the text of the longest class-less ``<div>`` on the page.

    Raises:
        ValueError: If the page contains no ``<div>`` without a class.
    """
    divs = [div.text for div in htmlFindAll(page)('div', {'class': None})]
    return max(divs, key=len)


def parseSongs(page):
    """Extract the song listing from an artist page.

    Returns:
        A dict mapping song title (link text) to a dict with the keys
        ``'year'``, ``'album'``, ``'type'`` and ``'url'``. Songs listed
        without album information get empty strings for the first three.
        Later entries with the same title overwrite earlier ones.
    """
    songs = {}
    parent = htmlFind(page)('div', {'id': 'listAlbum'})

    if parent is not None:
        cur_type, cur_name, cur_year = '', '', ''
        # v3.0.3: Do not stop at <script> tags: google ads inside listAlbum
        # use them, and breaking there left later songs unretrieved.
        for elmnt in parent.find_all():
            # album info are inside divs
            if elmnt.name == 'div':
                if elmnt.text == 'other songs:':
                    cur_type, cur_name, cur_year = 'Others', '', ''
                else:
                    # Separating to (album, name, year)
                    info = _ALBUM_INFO.findall(elmnt.text)
                    if info:
                        cur_type, cur_name, cur_year = info[0]
            elif elmnt.name == 'a':
                href = elmnt.get('href')
                if href is None:
                    # Anchor without a target: nothing to record.
                    continue
                # Azlyrics puts hrefs with/without base url
                if href.startswith('/lyrics/'):
                    url = _BASE_URL + href.strip('.')
                else:
                    url = href
                songs[elmnt.text] = {
                    'year': cur_year,
                    'album': cur_name,
                    'type': cur_type,
                    'url': url,
                }
    # v 3.0
    # Some artists have no albums, so we cover this
    else:
        for div in htmlFindAll(page)('div', {'class': 'listalbum-item'}):
            anchor = div.find('a')
            if anchor is None:
                continue
            href = anchor.get('href')
            if href is None:
                continue
            # v3.0.1: fix relative urls -> absolute url
            url = _BASE_URL + href[2:] if href[:2] == '..' else href
            songs[anchor.text] = {
                'year': '',
                'album': '',
                'type': '',
                'url': url,
            }
    return songs