Spaces:
Running
Running
Commit
•
dfd2747
1
Parent(s):
618c903
Upload 4 files
Browse files- __init__.py +8 -0
- azapi.py +5 -3
- requester.py +52 -0
- tools.py +149 -0
__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from azapi import AZlyrics
|
2 |
+
|
3 |
+
__name__ = 'azapi.azapi.AZlyrics'
|
4 |
+
__author__ = 'Khaled H. El-Morshedy'
|
5 |
+
__url__ = 'https://github.com/elmoiv/azapi'
|
6 |
+
__description__ = 'Get Lyrics from AZLyrics.com like a Boss ~(0_0)~'
|
7 |
+
__license__ = 'GPL-v3.0'
|
8 |
+
__version__ = '3.0.7'
|
azapi.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
-
from .requester import Requester
|
2 |
-
from .tools import *
|
|
|
|
|
3 |
|
4 |
class AZlyrics(Requester):
|
5 |
"""
|
@@ -175,4 +177,4 @@ class AZlyrics(Requester):
|
|
175 |
|
176 |
# Store songs for later usage
|
177 |
self.songs = parseSongs(albums_page)
|
178 |
-
return self.songs
|
|
|
1 |
+
# from .requester import Requester
|
2 |
+
# from .tools import *
|
3 |
+
from requester import Requester
|
4 |
+
from tools import *
|
5 |
|
6 |
class AZlyrics(Requester):
|
7 |
"""
|
|
|
177 |
|
178 |
# Store songs for later usage
|
179 |
self.songs = parseSongs(albums_page)
|
180 |
+
return self.songs
|
requester.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests, random
|
2 |
+
|
3 |
+
userAgents = '''Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
|
4 |
+
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36
|
5 |
+
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.11 Safari/535.19
|
6 |
+
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11
|
7 |
+
Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.3 Safari/532.2
|
8 |
+
Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.0 Safari/532.2
|
9 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.220.1 Safari/532.1
|
10 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.6 Safari/532.1
|
11 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.5 Safari/532.1
|
12 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.5 Safari/532.1
|
13 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.4 Safari/532.1
|
14 |
+
Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1
|
15 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1
|
16 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1
|
17 |
+
Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.0 Safari/532.0
|
18 |
+
Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.0 Safari/532.0
|
19 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.23 Safari/530.5
|
20 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.2 Safari/530.5
|
21 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.2 Safari/530.5
|
22 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4
|
23 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.2; eu) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4
|
24 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4
|
25 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.5
|
26 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.171.0 Safari/530.4
|
27 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.170.0 Safari/530.1
|
28 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.169.0 Safari/530.1
|
29 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.168.0 Safari/530.1
|
30 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.1 (KHTML, like Gecko) Chrome/2.0.164.0 Safari/530.1
|
31 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.162.0 Safari/530.0
|
32 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.160.0 Safari/530.0
|
33 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10
|
34 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10
|
35 |
+
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/528.11 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.11
|
36 |
+
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9
|
37 |
+
Mozilla/5.0 (Linux; U; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13
|
38 |
+
Mozilla/5.0 (Macintosh; U; Mac OS X 10_6_1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5
|
39 |
+
Mozilla/5.0 (Macintosh; U; Mac OS X 10_5_7; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5
|
40 |
+
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9
|
41 |
+
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/ Safari/530.6
|
42 |
+
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5'''
|
43 |
+
|
44 |
+
class Requester:
    """Issue HTTP requests through ``requests`` with a randomly chosen User-Agent.

    Every call picks one entry of ``USER_AGENTS`` at random and sends it as
    the ``User-Agent`` header.
    """

    # One User-Agent per line of the module-level ``userAgents`` text.
    USER_AGENTS = userAgents.split('\n')

    # Inspired from: https://github.com/brianchesley/Lyrics/blob/master/lyrics_data_scrape.py
    def get(self, url, _proxies=None):
        """Send a GET request to ``url`` and return the ``requests`` response.

        ``_proxies`` is forwarded as the ``proxies`` argument of
        ``requests.get``; when omitted an empty dict is used.
        """
        # A ``None`` sentinel avoids sharing one mutable dict between calls.
        proxies = {} if _proxies is None else _proxies
        headers = {'User-Agent': random.choice(self.USER_AGENTS)}
        return requests.get(url, headers=headers, proxies=proxies)

    def head(self, url, _proxies=None):
        """Send a HEAD request to ``url`` and return the ``requests`` response.

        ``_proxies`` is forwarded as the ``proxies`` argument of
        ``requests.head``; when omitted an empty dict is used.
        """
        proxies = {} if _proxies is None else _proxies
        headers = {'User-Agent': random.choice(self.USER_AGENTS)}
        return requests.head(url, headers=headers, proxies=proxies)
|
tools.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import bs4, re, time, os
|
2 |
+
from urllib.parse import quote
|
3 |
+
# from .jaro import jaro_distance
|
4 |
+
from songscope import jaro_distance
|
5 |
+
|
6 |
+
letters = 'abcdefghijklmnopqrstuvwxyz0123456789'
|
7 |
+
|
8 |
+
def htmlFind(page):
    """Parse ``page`` as HTML and return the ``find`` method of the soup.

    ``page`` must expose the raw response body as bytes in ``page.content``.
    """
    # v3.0: decode page.content instead of reading page.text,
    # to support variant unicodes.
    markup = page.content.decode()
    return bs4.BeautifulSoup(markup, "html.parser").find
|
16 |
+
|
17 |
+
def htmlFindAll(page):
    """Parse ``page`` as HTML and return the ``find_all`` method of the soup.

    ``page`` must expose the raw response body as bytes in ``page.content``.
    The returned callable accepts the usual Beautiful Soup search arguments
    (tag name, attribute dict, ...).
    """
    # v3.0
    # Changed page.text -> page.content.decode() to support variant unicodes
    soup = bs4.BeautifulSoup(
        page.content.decode(),
        "html.parser"
    )
    # ``findAll`` is the deprecated spelling of ``find_all``; both run the
    # same search, so callers see identical results.
    return soup.find_all
|
25 |
+
|
26 |
+
def filtr(inpt, isFile=False):
    """Strip unwanted characters from ``inpt``.

    With ``isFile`` set, only the characters ``<>:"/\\|?*`` are removed and
    the case is left alone.  Otherwise every character is lower-cased and
    kept only when it appears in the module-level ``letters`` string.
    """
    if not isFile:
        lowered = (char.lower() for char in inpt)
        return ''.join(char for char in lowered if char in letters)

    forbidden = r'<>:"/\|?*'
    return ''.join(char for char in inpt if char not in forbidden)
|
30 |
+
|
31 |
+
def normalGet(artist='', title='', _type=0):
    """Build an AZLyrics URL directly from the artist and title.

    Both values are first reduced with ``filtr``.  A falsy ``_type`` yields
    the lyrics-page URL; a truthy ``_type`` yields the artist-page URL, which
    is also printed before being returned.
    """
    art = filtr(artist)
    tit = filtr(title)

    if not _type:
        return 'https://www.azlyrics.com/lyrics/{}/{}.html'.format(art, tit)

    # NOTE: art[0] raises IndexError when the artist filters down to ''.
    artist_url = 'https://www.azlyrics.com/{}/{}.html'.format(art[0], art)
    print(artist_url)
    return artist_url
|
37 |
+
|
38 |
+
def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0, proxies=None):
    """Find an AZLyrics URL by querying a web search engine.

    Parameters:
        srch_eng: 'google' or 'duckduckgo'; any other value falls back to
            'google'.
        acc: minimum similarity that both the artist and the title must
            reach for the first hit to be accepted.
        get_func: callable invoked as ``get_func(url, proxies)``; its result
            must expose the page body as ``.text``.
        artist, title: search terms; either may be empty.
        _type: index of the URL pattern to look for, 0 for a lyrics page and
            1 for an artist page.
        proxies: forwarded to ``get_func``; defaults to an empty dict.

    Returns:
        The matched URL prefixed with ``https://www.``, or ``0`` when
        nothing was found or the similarity is below ``acc``.
    """
    # Imported locally because the module only imports ``quote``.
    from urllib.parse import quote_plus

    # A ``None`` sentinel avoids sharing one mutable dict between calls.
    if proxies is None:
        proxies = {}

    # quote_plus() encodes spaces as '+' and escapes everything else.  The
    # previous quote(data.replace(' ', '+')) escaped the '+' separators
    # themselves into '%2B'.
    data = ' '.join(part for part in (artist, title) if part)
    encoded_data = quote_plus(data)

    # Perform a search (for accuracy) [Custom search engine]
    search_engines = {
        'google': 'https://www.google.com/search?q=',
        'duckduckgo': 'https://duckduckgo.com/html/?q='
    }
    slctd_srch_engn = srch_eng if srch_eng in search_engines else 'google'

    google_page = get_func(
        '{}{}+site%3Aazlyrics.com'.format(
            search_engines[slctd_srch_engn],
            encoded_data
        ),
        proxies
    )

    # Choose between lyrics or song according to function used
    regex = [
        r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+)\.html)',
        r'(azlyrics\.com\/[a-z0-9]+\/(\w+)\.html)'
    ]

    # ex result: [('azlyrics.com/t/taylorswift.html', 'taylorswift')]
    # result[0][0] = 'azlyrics.com/t/taylorswift.html'
    results = re.findall(regex[_type], google_page.text)

    if not results:
        # Report the engine that was actually queried.
        print(slctd_srch_engn.title(), 'found nothing!')
        return 0

    best = results[0]

    # Similarity defaults to a perfect score for terms that were not given.
    # NOTE(review): jaro_distance presumably returns a value in [0, 1] -
    # confirm against its definition.
    jaro_artist = 1.0
    jaro_title = 1.0
    if artist:
        jaro_artist = jaro_distance(artist.replace(' ', ''), best[1])
    # The artist-page pattern captures no title group, so there is nothing
    # to compare the title against in that case.
    if title and len(best) > 2:
        jaro_title = jaro_distance(title.replace(' ', ''), best[2])

    if jaro_artist >= acc and jaro_title >= acc:
        return 'https://www.' + best[0]

    print('Similarity <', acc)
    return 0
|
97 |
+
|
98 |
+
# v3.0.5: Re-coded ParseLyrics to be more efficient
|
99 |
+
def parseLyric(page):
    """Return the text of the longest ``div`` without a ``class`` attribute.

    The lyrics are taken to be the longest class-less div on the page.
    Returns an empty string when the page contains no such div.
    """
    divs = [div.text for div in htmlFindAll(page)('div', {'class': None})]
    # max() raises ValueError on an empty sequence, so supply a default.
    return max(divs, key=len, default='')
|
102 |
+
|
103 |
+
def _absoluteUrl(href):
    """Return ``href`` as an absolute AZLyrics URL when it is site-relative."""
    base = 'http://www.azlyrics.com'
    # Root-relative link, e.g. '/lyrics/artist/song.html'
    if href.startswith('/lyrics/'):
        return base + href
    # Parent-relative link, e.g. '../lyrics/artist/song.html'
    if href.startswith('..'):
        return base + href[2:]
    # Azlyrics also emits hrefs that already carry the base url
    return href


def parseSongs(page):
    """Collect the songs listed on an artist page.

    Returns a dict keyed by song title.  Each value is a dict with the keys
    ``'year'``, ``'album'``, ``'type'`` and ``'url'``.  When the page has a
    ``div`` with id ``listAlbum``, album information is read from the divs
    inside it and attached to the links that follow.  Otherwise every
    ``div`` of class ``listalbum-item`` contributes one song with empty
    album fields.  Links without an ``href`` are skipped.
    """
    songs = {}
    parent = htmlFind(page)('div', {'id': 'listAlbum'})

    if parent:
        curType, curName, curYear = '', '', ''

        # v3.0.3: script tags no longer stop the scan, because google ads
        # inside listAlbum use script tags and would cut the song list short.
        for elmnt in parent.find_all():
            # album info are inside divs
            if elmnt.name == 'div':
                if elmnt.text == 'other songs:':
                    curType, curName, curYear = 'Others', '', ''
                else:
                    # Separating to (album, name, year)
                    rgx = re.findall(r'(.*):\s"(.*)"\s\(([0-9]+)\)', elmnt.text)
                    if rgx:
                        curType, curName, curYear = rgx[0]
            elif elmnt.name == 'a':
                href = elmnt.get('href')
                # An <a> without href cannot be turned into a song url.
                if href is None:
                    continue
                songs[elmnt.text] = {
                    'year': curYear,
                    'album': curName,
                    'type': curType,
                    'url': _absoluteUrl(href),
                }
        return songs

    # v 3.0
    # Some artists have no albums, so we cover this
    for div in htmlFindAll(page)('div', {'class': 'listalbum-item'}):
        a = div.find('a')
        href = a.get('href') if a is not None else None
        # Skip items that carry no usable link.
        if href is None:
            continue
        songs[a.text] = {
            'year': '',
            'album': '',
            'type': '',
            # v3.0.1: fix relative urls -> absolute url
            'url': _absoluteUrl(href),
        }
    return songs
|