File size: 5,377 Bytes
dfd2747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import bs4, re, time, os
from urllib.parse import quote
# from .jaro import jaro_distance
from songscope import jaro_distance

# Characters that filtr() keeps (after lower-casing) when isFile is False:
# the lowercase ASCII letters and the digits.
letters = 'abcdefghijklmnopqrstuvwxyz0123456789'

def htmlFind(page):
    """Parse *page* and return the bound ``find`` method of its soup.

    The markup is taken from ``page.content.decode()`` rather than
    ``page.text`` (changed in v3.0 to support variant unicodes).
    NOTE(review): assumes ``page`` exposes its raw body as bytes in
    ``.content`` (requests-style response) -- confirm against callers.
    """
    markup = page.content.decode()
    return bs4.BeautifulSoup(markup, "html.parser").find

def htmlFindAll(page):
    """Parse *page* and return the bound ``find_all`` method of its soup.

    The markup is taken from ``page.content.decode()`` rather than
    ``page.text`` (changed in v3.0 to support variant unicodes).
    NOTE(review): assumes ``page`` exposes its raw body as bytes in
    ``.content`` (requests-style response) -- confirm against callers.
    """
    soup = bs4.BeautifulSoup(page.content.decode(), "html.parser")
    # ``findAll`` is the legacy BS3 spelling, deprecated in Beautiful Soup 4;
    # ``find_all`` is the supported name and takes the same arguments.
    return soup.find_all

def filtr(inpt, isFile=False):
    """Strip unwanted characters from *inpt*.

    With ``isFile`` true, only the characters ``<>:"/\\|?*`` are removed and
    case is preserved. Otherwise every character is lower-cased and kept
    only if the lower-cased form occurs in the module-level ``letters``.
    """
    if not isFile:
        lowered = (ch.lower() for ch in inpt)
        return ''.join(ch for ch in lowered if ch in letters)

    forbidden = r'<>:"/\|?*'
    return ''.join(ch for ch in inpt if ch not in forbidden)

def normalGet(artist='', title='', _type=0):
    """Build an azlyrics URL directly from the artist and title.

    Both names are first reduced with ``filtr``. With a falsy ``_type`` the
    lyrics-page URL is returned. With a truthy ``_type`` the artist-page URL
    (filed under the first character of the filtered artist) is printed and
    returned; this raises ``IndexError`` if the filtered artist is empty.
    """
    art, tit = filtr(artist), filtr(title)
    if not _type:
        return 'https://www.azlyrics.com/lyrics/{}/{}.html'.format(art, tit)

    artist_url = 'https://www.azlyrics.com/{}/{}.html'.format(art[0], art)
    print(artist_url)
    return artist_url

def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0, proxies=None):
    """Look up an azlyrics URL through a web search engine.

    Args:
        srch_eng: Search engine name, ``'google'`` or ``'duckduckgo'``.
            Any other value falls back to ``'google'``.
        acc: Minimum ``jaro_distance`` score that the artist and the title
            must each reach against the slugs found in the matched URL.
        get_func: Callable invoked as ``get_func(url, proxies)``; the object
            it returns must expose the response body as ``.text``.
        artist: Artist name; skipped in the similarity check when empty.
        title: Song title; skipped in the similarity check when empty.
        _type: Falsy to look for a lyrics page, truthy to look for an
            artist page.
        proxies: Passed through to ``get_func``. Defaults to a fresh empty
            dict on every call.

    Returns:
        The matched URL prefixed with ``'https://www.'``, or ``0`` when the
        search page contains no azlyrics link or the similarity is below
        ``acc`` (a message is printed in both failure cases).
    """
    # A fresh dict per call: a shared ``{}`` default would leak any mutation
    # made by get_func into later calls.
    if proxies is None:
        proxies = {}

    search_engines = {
        'google': 'https://www.google.com/search?q=',
        'duckduckgo': 'https://duckduckgo.com/html/?q='
    }
    engine = srch_eng if srch_eng in search_engines else 'google'

    # Percent-encode first, then turn the encoded spaces into '+'.
    # Replacing spaces with '+' *before* quote() would encode the separators
    # themselves to '%2B', i.e. search for literal plus signs.
    query = ' '.join(part for part in (artist, title) if part)
    encoded_query = quote(query).replace('%20', '+')

    search_page = get_func(
        '{}{}+site%3Aazlyrics.com'.format(search_engines[engine], encoded_query),
        proxies
    )

    # Index 0: lyrics page (artist slug + title slug).
    # Index 1: artist page (artist slug only).
    patterns = (
        r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+)\.html)',
        r'(azlyrics\.com\/[a-z0-9]+\/(\w+)\.html)'
    )

    # ex result: [('azlyrics.com/t/taylorswift.html', 'taylorswift')]
    # results[0][0] = 'azlyrics.com/t/taylorswift.html'
    results = re.findall(patterns[1 if _type else 0], search_page.text)

    if not results:
        # Report the engine that was actually queried (after fallback).
        print(engine.title(), 'found nothing!')
        return 0

    match = results[0]

    # Similarity defaults to a perfect score for any name that was not given.
    jaro_artist = 1.0
    jaro_title = 1.0
    if artist:
        jaro_artist = jaro_distance(artist.replace(' ', ''), match[1])
    # The artist-page pattern captures no title slug, so there is nothing to
    # compare a title against in that mode.
    if title and len(match) > 2:
        jaro_title = jaro_distance(title.replace(' ', ''), match[2])

    if jaro_artist >= acc and jaro_title >= acc:
        return 'https://www.' + match[0]

    print('Similarity <', acc)
    return 0

# v3.0.5: Re-coded ParseLyrics to be more efficient
def parseLyric(page):
    """Return the text of the longest class-less ``<div>`` in *page*.

    The longest ``<div>`` without a ``class`` attribute is taken to be the
    lyrics. Returns ``''`` when the page contains no such div instead of
    letting ``max()`` raise ``ValueError`` on an empty sequence.
    """
    texts = (div.text for div in htmlFindAll(page)('div', {'class': None}))
    return max(texts, key=len, default='')

def _songUrl(href):
    """Return *href* as an absolute azlyrics URL when it is site-relative."""
    base = 'http://www.azlyrics.com'
    # Azlyrics puts hrefs with/without base url: '/lyrics/...',
    # '../lyrics/...' (v3.0.1) or already absolute.
    if href.startswith('/lyrics/'):
        return base + href
    if href.startswith('..'):
        return base + href[2:]
    return href


def parseSongs(page):
    """Collect the songs listed on an azlyrics artist page.

    Returns:
        A dict keyed by the link text of each song. Every value is a dict
        with the keys ``'year'``, ``'album'``, ``'type'`` and ``'url'``.
        The first three come from the most recent album header seen before
        the link (``'Others'``/``''``/``''`` after an "other songs:" header)
        and are empty strings when the page has no ``listAlbum`` container.
        Links without an ``href`` are skipped.
    """
    songs = {}
    parent = htmlFind(page)('div', {'id': 'listAlbum'})

    # v 3.0
    # Some artists have no albums, so we cover this
    if parent is None:
        for div in htmlFindAll(page)('div', {'class': 'listalbum-item'}):
            anchor = div.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href is None:
                continue
            songs[anchor.text] = {
                'year': '',
                'album': '',
                'type': '',
                'url': _songUrl(href)
            }
        return songs

    # Separating to (album, name, year); compiled once, outside the loop.
    album_rgx = re.compile(r'(.*):\s"(.*)"\s\(([0-9]+)\)')
    curType, curName, curYear = '', '', ''

    # find_all(True) yields every descendant tag (same result as the
    # deprecated findChildren()).
    # v3.0.3: no break on <script>; google ads inside listAlbum use script
    # tags, and stopping there drops the songs that follow.
    for elmnt in parent.find_all(True):
        # album info are inside divs
        if elmnt.name == 'div':
            if elmnt.text == 'other songs:':
                curType, curName, curYear = 'Others', '', ''
            else:
                header = album_rgx.search(elmnt.text)
                if header:
                    curType, curName, curYear = header.groups()
        elif elmnt.name == 'a':
            href = elmnt.get('href')
            if href is None:
                # Anchor without a target: nothing to link to.
                continue
            songs[elmnt.text] = {
                'year': curYear,
                'album': curName,
                'type': curType,
                'url': _songUrl(href)
            }
    return songs