Spaces:
Sleeping
Sleeping
kmaurinjones
committed on
Commit
•
618c903
1
Parent(s):
6781c46
Update songscope.py
Browse files- songscope.py +4 -26
songscope.py
CHANGED
@@ -12,7 +12,7 @@ from nltk.sentiment import SentimentIntensityAnalyzer
|
|
12 |
nltk.download("vader_lexicon")
|
13 |
|
14 |
# jaro distance from scratch because importing or installing the module didn't work
|
15 |
-
def
|
16 |
if not s1 or not s2:
|
17 |
return 0.0
|
18 |
|
@@ -60,7 +60,6 @@ def random_delay(min_val: float, max_val: float, print_delay: bool = False):
|
|
60 |
The maximum amount of time to delay (in seconds).
|
61 |
print_delay: bool
|
62 |
Whether or not to print the delay time.
|
63 |
-
|
64 |
Returns:
|
65 |
-------
|
66 |
val: float
|
@@ -78,12 +77,10 @@ def find_artist(artist_name: str):
|
|
78 |
|
79 |
This function sends an HTTP request to azlyrics.com, scrapes the HTML content
|
80 |
to find the artist's page, and returns the URL to that page.
|
81 |
-
|
82 |
Parameters:
|
83 |
----------
|
84 |
artist_name: str
|
85 |
The name of the artist.
|
86 |
-
|
87 |
Returns:
|
88 |
-------
|
89 |
url: str
|
@@ -119,7 +116,7 @@ def find_artist(artist_name: str):
|
|
119 |
max_sim = -100
|
120 |
for id, name in enumerate(artist_urls):
|
121 |
# dist = Levenshtein.jaro(artist_name, name)
|
122 |
-
dist =
|
123 |
if max_sim < dist:
|
124 |
max_sim = dist
|
125 |
min_id = id
|
@@ -133,23 +130,19 @@ def follow_lyrics(lyric_url: str):
|
|
133 |
This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
|
134 |
and extracts the song lyrics. The lyrics are cleaned by removing unnecessary HTML tags
|
135 |
and whitespace. The function returns the lyrics as a string.
|
136 |
-
|
137 |
Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
|
138 |
specifically prohibits the usage of their content by third-party lyrics providers.
|
139 |
Always review the website's policies and ensure you are compliant before scraping data.
|
140 |
-
|
141 |
Parameters:
|
142 |
------
|
143 |
`lyric_url`: str
|
144 |
The URL of the song lyrics on azlyrics.com.
|
145 |
`song_title`: str
|
146 |
Title of the song
|
147 |
-
|
148 |
Returns:
|
149 |
------
|
150 |
`lyrics_str`:
|
151 |
The lyrics of the song as a single string.
|
152 |
-
|
153 |
Raises:
|
154 |
------
|
155 |
`ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
|
@@ -190,17 +183,14 @@ def follow_lyrics(lyric_url: str):
|
|
190 |
def find_artist(artist_name: str) -> str:
|
191 |
"""
|
192 |
Finds the link for the artist page on azlyrics.com.
|
193 |
-
|
194 |
Parameters:
|
195 |
------
|
196 |
`artist_name`: str
|
197 |
The name of the artist.
|
198 |
-
|
199 |
Returns:
|
200 |
------
|
201 |
`url`: str
|
202 |
The URL of the artist page on azlyrics.com.
|
203 |
-
|
204 |
Raises:
|
205 |
------
|
206 |
`ValueError`: If the artist page cannot be found.
|
@@ -245,7 +235,7 @@ def find_artist(artist_name: str) -> str:
|
|
245 |
max_sim = -100
|
246 |
for id, name in enumerate(artist_urls):
|
247 |
# dist = Levenshtein.jaro(artist_name, name)
|
248 |
-
dist =
|
249 |
if max_sim < dist:
|
250 |
max_sim = dist
|
251 |
min_id = id
|
@@ -256,12 +246,10 @@ def find_artist(artist_name: str) -> str:
|
|
256 |
def flatten_list(lst: list):
|
257 |
"""
|
258 |
Flattens all inner lists (all depths) of a list into a list of depth == 1.
|
259 |
-
|
260 |
Parameters:
|
261 |
------
|
262 |
`lst`: List
|
263 |
The list to be flattened.
|
264 |
-
|
265 |
Returns:
|
266 |
------
|
267 |
`result`: List
|
@@ -280,12 +268,10 @@ def process_lyrics(lyrics: str):
|
|
280 |
"""
|
281 |
Pre-processes the lyrics by replacing `\r` with an empty string, splitting the lyrics
|
282 |
by `\n`, and removing consecutive whitespace list items.
|
283 |
-
|
284 |
Parameters:
|
285 |
------
|
286 |
`lyrics`: str
|
287 |
The lyrics to be pre-processed.
|
288 |
-
|
289 |
Returns:
|
290 |
------
|
291 |
`cleaned_lines`: List
|
@@ -306,12 +292,10 @@ def process_lyrics(lyrics: str):
|
|
306 |
def sectionize(lyrics: str):
|
307 |
"""
|
308 |
Splits the pre-processed lyrics into sections.
|
309 |
-
|
310 |
Parameters:
|
311 |
------
|
312 |
`lyrics`: str
|
313 |
The pre-processed lyrics.
|
314 |
-
|
315 |
Returns:
|
316 |
------
|
317 |
`all_sections`: List
|
@@ -347,12 +331,10 @@ def analyze_sentiment_vader(text: str):
|
|
347 |
"""
|
348 |
Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary and sEntiment Reasoner)
|
349 |
sentiment analysis model.
|
350 |
-
|
351 |
Parameters:
|
352 |
------
|
353 |
`text`: str
|
354 |
The text to be analyzed.
|
355 |
-
|
356 |
Returns:
|
357 |
------
|
358 |
`label`: str
|
@@ -378,23 +360,19 @@ def analyze_sentiment_vader(text: str):
|
|
378 |
def get_sentiments(df):
|
379 |
"""
|
380 |
Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.
|
381 |
-
|
382 |
This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk)
|
383 |
to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
|
384 |
('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
|
385 |
each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
|
386 |
DataFrame is returned.
|
387 |
-
|
388 |
Parameters:
|
389 |
-----------
|
390 |
df : pandas DataFrame
|
391 |
The DataFrame containing the 'lyrics' column to be analyzed.
|
392 |
-
|
393 |
Returns:
|
394 |
--------
|
395 |
df : pandas DataFrame
|
396 |
The modified DataFrame with sentiment analysis added as new columns.
|
397 |
-
|
398 |
Raises:
|
399 |
-------
|
400 |
None.
|
@@ -451,7 +429,7 @@ def get_metadata(artist_name: str, song_titles: list = None) -> dict:
|
|
451 |
max_sim = -100
|
452 |
for id, az_name in enumerate(az_titles):
|
453 |
# dist = Levenshtein.jaro(title, az_name)
|
454 |
-
dist =
|
455 |
if max_sim < dist:
|
456 |
max_sim = dist
|
457 |
min_id = id
|
|
|
12 |
nltk.download("vader_lexicon")
|
13 |
|
14 |
# jaro distance from scratch because importing or installing the module didn't work
|
15 |
+
def jaro_distance(s1, s2):
|
16 |
if not s1 or not s2:
|
17 |
return 0.0
|
18 |
|
|
|
60 |
The maximum amount of time to delay (in seconds).
|
61 |
print_delay: bool
|
62 |
Whether or not to print the delay time.
|
|
|
63 |
Returns:
|
64 |
-------
|
65 |
val: float
|
|
|
77 |
|
78 |
This function sends an HTTP request to azlyrics.com, scrapes the HTML content
|
79 |
to find the artist's page, and returns the URL to that page.
|
|
|
80 |
Parameters:
|
81 |
----------
|
82 |
artist_name: str
|
83 |
The name of the artist.
|
|
|
84 |
Returns:
|
85 |
-------
|
86 |
url: str
|
|
|
116 |
max_sim = -100
|
117 |
for id, name in enumerate(artist_urls):
|
118 |
# dist = Levenshtein.jaro(artist_name, name)
|
119 |
+
dist = jaro_distance(artist_name, name)
|
120 |
if max_sim < dist:
|
121 |
max_sim = dist
|
122 |
min_id = id
|
|
|
130 |
This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
|
131 |
and extracts the song lyrics. The lyrics are cleaned by removing unnecessary HTML tags
|
132 |
and whitespace. The function returns the lyrics as a string.
|
|
|
133 |
Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
|
134 |
specifically prohibits the usage of their content by third-party lyrics providers.
|
135 |
Always review the website's policies and ensure you are compliant before scraping data.
|
|
|
136 |
Parameters:
|
137 |
------
|
138 |
`lyric_url`: str
|
139 |
The URL of the song lyrics on azlyrics.com.
|
140 |
`song_title`: str
|
141 |
Title of the song
|
|
|
142 |
Returns:
|
143 |
------
|
144 |
`lyrics_str`:
|
145 |
The lyrics of the song as a single string.
|
|
|
146 |
Raises:
|
147 |
------
|
148 |
`ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
|
|
|
183 |
def find_artist(artist_name: str) -> str:
|
184 |
"""
|
185 |
Finds the link for the artist page on azlyrics.com.
|
|
|
186 |
Parameters:
|
187 |
------
|
188 |
`artist_name`: str
|
189 |
The name of the artist.
|
|
|
190 |
Returns:
|
191 |
------
|
192 |
`url`: str
|
193 |
The URL of the artist page on azlyrics.com.
|
|
|
194 |
Raises:
|
195 |
------
|
196 |
`ValueError`: If the artist page cannot be found.
|
|
|
235 |
max_sim = -100
|
236 |
for id, name in enumerate(artist_urls):
|
237 |
# dist = Levenshtein.jaro(artist_name, name)
|
238 |
+
dist = jaro_distance(artist_name, name)
|
239 |
if max_sim < dist:
|
240 |
max_sim = dist
|
241 |
min_id = id
|
|
|
246 |
def flatten_list(lst: list):
|
247 |
"""
|
248 |
Flattens all inner lists (all depths) of a list into a list of depth == 1.
|
|
|
249 |
Parameters:
|
250 |
------
|
251 |
`lst`: List
|
252 |
The list to be flattened.
|
|
|
253 |
Returns:
|
254 |
------
|
255 |
`result`: List
|
|
|
268 |
"""
|
269 |
Pre-processes the lyrics by replacing `\r` with an empty string, splitting the lyrics
|
270 |
by `\n`, and removing consecutive whitespace list items.
|
|
|
271 |
Parameters:
|
272 |
------
|
273 |
`lyrics`: str
|
274 |
The lyrics to be pre-processed.
|
|
|
275 |
Returns:
|
276 |
------
|
277 |
`cleaned_lines`: List
|
|
|
292 |
def sectionize(lyrics: str):
|
293 |
"""
|
294 |
Splits the pre-processed lyrics into sections.
|
|
|
295 |
Parameters:
|
296 |
------
|
297 |
`lyrics`: str
|
298 |
The pre-processed lyrics.
|
|
|
299 |
Returns:
|
300 |
------
|
301 |
`all_sections`: List
|
|
|
331 |
"""
|
332 |
Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary and sEntiment Reasoner)
|
333 |
sentiment analysis model.
|
|
|
334 |
Parameters:
|
335 |
------
|
336 |
`text`: str
|
337 |
The text to be analyzed.
|
|
|
338 |
Returns:
|
339 |
------
|
340 |
`label`: str
|
|
|
360 |
def get_sentiments(df):
|
361 |
"""
|
362 |
Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.
|
|
|
363 |
This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk)
|
364 |
to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
|
365 |
('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
|
366 |
each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
|
367 |
DataFrame is returned.
|
|
|
368 |
Parameters:
|
369 |
-----------
|
370 |
df : pandas DataFrame
|
371 |
The DataFrame containing the 'lyrics' column to be analyzed.
|
|
|
372 |
Returns:
|
373 |
--------
|
374 |
df : pandas DataFrame
|
375 |
The modified DataFrame with sentiment analysis added as new columns.
|
|
|
376 |
Raises:
|
377 |
-------
|
378 |
None.
|
|
|
429 |
max_sim = -100
|
430 |
for id, az_name in enumerate(az_titles):
|
431 |
# dist = Levenshtein.jaro(title, az_name)
|
432 |
+
dist = jaro_distance(title, az_name)
|
433 |
if max_sim < dist:
|
434 |
max_sim = dist
|
435 |
min_id = id
|