kmaurinjones committed on
Commit
dfd2747
1 Parent(s): 618c903

Upload 4 files

Browse files
Files changed (4) hide show
  1. __init__.py +8 -0
  2. azapi.py +5 -3
  3. requester.py +52 -0
  4. tools.py +149 -0
__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from azapi import AZlyrics
2
+
3
+ __name__ = 'azapi.azapi.AZlyrics'
4
+ __author__ = 'Khaled H. El-Morshedy'
5
+ __url__ = 'https://github.com/elmoiv/azapi'
6
+ __description__ = 'Get Lyrics from AZLyrics.com like a Boss ~(0_0)~'
7
+ __license__ = 'GPL-v3.0'
8
+ __version__ = '3.0.7'
azapi.py CHANGED
@@ -1,5 +1,7 @@
1
- from .requester import Requester
2
- from .tools import *
 
 
3
 
4
  class AZlyrics(Requester):
5
  """
@@ -175,4 +177,4 @@ class AZlyrics(Requester):
175
 
176
  # Store songs for later usage
177
  self.songs = parseSongs(albums_page)
178
- return self.songs
 
1
+ # from .requester import Requester
2
+ # from .tools import *
3
+ from requester import Requester
4
+ from tools import *
5
 
6
  class AZlyrics(Requester):
7
  """
 
177
 
178
  # Store songs for later usage
179
  self.songs = parseSongs(albums_page)
180
+ return self.songs
requester.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests, random
2
+
3
+ userAgents = '''Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
4
+ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36
5
+ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.11 Safari/535.19
6
+ Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11
7
+ Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.3 Safari/532.2
8
+ Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.0 Safari/532.2
9
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.220.1 Safari/532.1
10
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.6 Safari/532.1
11
+ Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.5 Safari/532.1
12
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.5 Safari/532.1
13
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1
14
+ Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1
15
+ Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1
16
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1
17
+ Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.0 Safari/532.0
18
+ Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.0 Safari/532.0
19
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5
20
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.2 Safari/530.5
21
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.2 Safari/530.5
22
+ Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4
23
+ Mozilla/5.0 (Windows; U; Windows NT 5.2; eu) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4
24
+ Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4
25
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.5
26
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4
27
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.170.0 Safari/530.1
28
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1
29
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1
30
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.164.0 Safari/530.1
31
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.162.0 Safari/530.0
32
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.160.0 Safari/530.0
33
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10
34
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10
35
+ Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/528.11 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.11
36
+ Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9
37
+ Mozilla/5.0 (Linux; U; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13
38
+ Mozilla/5.0 (Macintosh; U; Mac OS X 10_6_1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5
39
+ Mozilla/5.0 (Macintosh; U; Mac OS X 10_5_7; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5
40
+ Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9
41
+ Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/ Safari/530.6
42
+ Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5'''
43
+
44
class Requester():
    """Small HTTP helper that sends every request with a random User-Agent.

    ``USER_AGENTS`` is built once, when the class is created, from the
    module-level ``userAgents`` string (one User-Agent per line).
    """

    # splitlines() copes with CRLF as well as LF line endings, and blank
    # entries are dropped, so neither a stray '\r' nor an empty string can
    # ever be picked as the header value.
    USER_AGENTS = [ua.strip() for ua in userAgents.splitlines() if ua.strip()]

    # Inspired from: https://github.com/brianchesley/Lyrics/blob/master/lyrics_data_scrape.py
    def get(self, url, _proxies=None):
        """Send a GET request to *url* and return the ``requests`` response.

        Args:
            url: Address to fetch.
            _proxies: Optional proxies mapping handed to ``requests.get``.
                ``None`` (the default) means "no proxies"; a fresh empty
                dict is used instead of a shared mutable default argument.
        """
        proxies = {} if _proxies is None else _proxies
        return requests.get(
            url,
            headers={'User-Agent': random.choice(self.USER_AGENTS)},
            proxies=proxies
        )

    def head(self, url, _proxies=None):
        """Send a HEAD request to *url* and return the ``requests`` response.

        Args:
            url: Address to query.
            _proxies: Optional proxies mapping handed to ``requests.head``.
                ``None`` (the default) means "no proxies".
        """
        proxies = {} if _proxies is None else _proxies
        return requests.head(
            url,
            headers={'User-Agent': random.choice(self.USER_AGENTS)},
            proxies=proxies
        )
tools.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bs4, re, time, os
2
+ from urllib.parse import quote
3
+ # from .jaro import jaro_distance
4
+ from songscope import jaro_distance
5
+
6
+ letters = 'abcdefghijklmnopqrstuvwxyz0123456789'
7
+
8
def htmlFind(page):
    """Parse *page* and return the bound ``find`` method of the parsed soup.

    The markup is taken from ``page.content.decode()`` rather than
    ``page.text``; that change was made in v3.0 to support variant unicodes.
    """
    markup = page.content.decode()
    return bs4.BeautifulSoup(markup, "html.parser").find
16
+
17
def htmlFindAll(page):
    """Parse *page* and return the bound ``find_all`` method of the parsed soup.

    The markup is taken from ``page.content.decode()`` rather than
    ``page.text``; that change was made in v3.0 to support variant unicodes.

    Returns:
        A callable accepting the usual Beautiful Soup search arguments and
        returning every matching element.
    """
    soup = bs4.BeautifulSoup(
        page.content.decode(),
        "html.parser"
    )
    # find_all is the current name; findAll is only the legacy camelCase
    # alias of the very same search, so callers see identical results.
    return soup.find_all
25
+
26
def filtr(inpt, isFile=False):
    """Strip unwanted characters from *inpt*.

    With ``isFile`` false (the default) the result is lower-cased and keeps
    only characters found in the module-level ``letters`` string.
    With ``isFile`` true, case is preserved and only the characters
    ``<>:"/\\|?*`` are removed.
    """
    if not isFile:
        lowered = (ch.lower() for ch in inpt)
        return ''.join(ch for ch in lowered if ch in letters)

    forbidden = r'<>:"/\|?*'
    return ''.join(ch for ch in inpt if ch not in forbidden)
30
+
31
def normalGet(artist='', title='', _type=0):
    """Build an AZLyrics URL directly from *artist* and *title*.

    Both values are first reduced with ``filtr``.

    * ``_type`` falsy: return the lyrics URL
      ``https://www.azlyrics.com/lyrics/<artist>/<title>.html``.
    * ``_type`` truthy: print and return the artist-page URL
      ``https://www.azlyrics.com/<first char of artist>/<artist>.html``.
      Raises ``IndexError`` when the filtered artist is empty.
    """
    art = filtr(artist)
    tit = filtr(title)

    if not _type:
        return 'https://www.azlyrics.com/lyrics/{}/{}.html'.format(art, tit)

    # Format the artist-page URL once, echo it, then hand it back.
    artist_url = 'https://www.azlyrics.com/{}/{}.html'.format(art[0], art)
    print(artist_url)
    return artist_url
37
+
38
def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0, proxies=None):
    """Look up an AZLyrics URL for *artist* / *title* through a search engine.

    Args:
        srch_eng: Search engine key, ``'google'`` or ``'duckduckgo'``.
            Any other value falls back to ``'google'``.
        acc: Minimum similarity score that both the artist and the title
            comparison must reach for the first hit to be accepted.
        get_func: Callable invoked as ``get_func(url, proxies)``; its return
            value must expose the page body as ``.text``.
        artist: Artist name (may be empty).
        title: Song title (may be empty).
        _type: Index of the URL pattern to look for: ``0`` for a lyrics page,
            ``1`` for an artist page.
        proxies: Proxies mapping forwarded to ``get_func``. ``None`` (the
            default) is replaced by a fresh empty dict.

    Returns:
        ``'https://www.' + <matched url>`` when the first hit is similar
        enough, otherwise ``0`` (after printing the reason).
    """
    # Function-scope import: the module header only imports ``quote``.
    from urllib.parse import quote_plus

    if proxies is None:
        proxies = {}

    # Join only the non-empty parts so no stray separator is sent.
    data = ' '.join(part for part in (artist, title) if part)

    # quote_plus encodes spaces as '+' and escapes everything else. The former
    # quote(data.replace(' ', '+')) escaped the '+' itself to '%2B', so the
    # engine received the words glued together by literal plus signs.
    encoded_data = quote_plus(data)

    # Perform a search (for accuracy) [Custom search engine]
    search_engines = {
        'google': 'https://www.google.com/search?q=',
        'duckduckgo': 'https://duckduckgo.com/html/?q='
    }
    engine = srch_eng if srch_eng in search_engines else 'google'

    search_page = get_func(
        '{}{}+site%3Aazlyrics.com'.format(search_engines[engine], encoded_data),
        proxies
    )

    # Choose between lyrics or song according to function used.
    # The dot before "html" is escaped so it only matches a literal '.'.
    patterns = [
        r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+)\.html)',
        r'(azlyrics\.com\/[a-z0-9]+\/(\w+)\.html)'
    ]

    # ex result: [('azlyrics.com/t/taylorswift.html', 'taylorswift')]
    # result[0][0] = 'azlyrics.com/t/taylorswift.html'
    results = re.findall(patterns[_type], search_page.text)

    if not results:
        # Report the engine that was actually queried; this also avoids
        # calling .title() on a non-string srch_eng.
        print(engine.title(), 'found nothing!')
        return 0

    first_hit = results[0]

    # Similarity defaults to a perfect score for parts that were not supplied.
    jaro_artist = 1.0
    jaro_title = 1.0

    if artist:
        jaro_artist = jaro_distance(
            artist.replace(' ', ''),
            first_hit[1]
        )
    # The artist-page pattern captures no title group, so only compare the
    # title when the match really carries one (previously an IndexError).
    if title and len(first_hit) > 2:
        jaro_title = jaro_distance(
            title.replace(' ', ''),
            first_hit[2]
        )

    if jaro_artist >= acc and jaro_title >= acc:
        return 'https://www.' + first_hit[0]

    print('Similarity <', acc)
    return 0
97
+
98
+ # v3.0.5: Re-coded ParseLyrics to be more efficient
99
def parseLyric(page):
    """Return the text of the longest class-less ``<div>`` found on *page*.

    The longest such div is presumably the lyrics body (assumption based on
    the function name; the code itself only selects by length).

    Returns:
        The text of the longest ``<div>`` that has no ``class`` attribute, or
        an empty string when the page contains no such div.
    """
    divs = [div.text for div in htmlFindAll(page)('div', {'class': None})]
    # default='' keeps max() from raising ValueError on an empty candidate list.
    return max(divs, key=len, default='')
102
+
103
def _absoluteUrl(href):
    """Return *href* as an absolute AZLyrics URL where it is site-relative.

    Azlyrics puts hrefs with/without base url: a leading ``..`` is dropped,
    and a path starting with a single ``/`` gets the site prefix. Anything
    else (already absolute, protocol-relative, ...) is returned unchanged.
    """
    if href.startswith('..'):
        href = href[2:]
    if href.startswith('/') and not href.startswith('//'):
        return 'http://www.azlyrics.com' + href
    return href


def parseSongs(page):
    """Collect the songs listed on an artist page.

    Args:
        page: Response object of the artist page (parsed through
            ``htmlFind`` / ``htmlFindAll``).

    Returns:
        Dict mapping each song's link text to a dict with the keys
        ``'year'``, ``'album'``, ``'type'`` and ``'url'``. When the page has
        no ``div#listAlbum``, songs are read from the ``listalbum-item`` divs
        instead and the first three fields are empty strings.
    """
    songs = {}
    album_list = htmlFind(page)('div', {'id': 'listAlbum'})
    if album_list:
        curType, curName, curYear = '', '', ''

        # find_all(True) yields every descendant tag in document order.
        for elmnt in album_list.find_all(True):

            # v3.0.3: no break on <script> tags; google ads inside listAlbum
            # use script tags, and stopping there dropped the remaining songs.

            # album info are inside divs
            if elmnt.name == 'div':
                if elmnt.text == 'other songs:':
                    curType, curName, curYear = 'Others', '', ''
                else:
                    # Separating to (album, name, year)
                    found = re.search(r'(.*):\s"(.*)"\s\(([0-9]+)\)', elmnt.text)
                    if found:
                        curType, curName, curYear = found.groups()
            elif elmnt.name == 'a':
                href = elmnt.get('href')
                if not href:
                    # An anchor without a link cannot be a song entry.
                    continue
                songs[elmnt.text] = {
                    'year': curYear,
                    'album': curName,
                    'type': curType,
                    'url': _absoluteUrl(href)
                }
    # v 3.0
    # Some artists have no albums, so we cover this
    else:
        for div in htmlFindAll(page)('div', {'class': 'listalbum-item'}):
            a = div.find('a')
            if a is None or not a.get('href'):
                # Skip items that hold no usable link instead of crashing.
                continue
            songs[a.text] = {
                'year': '',
                'album': '',
                'type': '',
                # v3.0.1: fix relative urls -> absolute url
                'url': _absoluteUrl(a['href'])
            }
    return songs