kmaurinjones committed on
Commit
618c903
1 Parent(s): 6781c46

Update songscope.py

Browse files
Files changed (1) hide show
  1. songscope.py +4 -26
songscope.py CHANGED
@@ -12,7 +12,7 @@ from nltk.sentiment import SentimentIntensityAnalyzer
12
  nltk.download("vader_lexicon")
13
 
14
  # jaro distance from scratch because importing or installing the module didn't work
15
- def homemade_jaro(s1, s2):
16
  if not s1 or not s2:
17
  return 0.0
18
 
@@ -60,7 +60,6 @@ def random_delay(min_val: float, max_val: float, print_delay: bool = False):
60
  The maximum amount of time to delay (in seconds).
61
  print_delay: bool
62
  Whether or not to print the delay time.
63
-
64
  Returns:
65
  -------
66
  val: float
@@ -78,12 +77,10 @@ def find_artist(artist_name: str):
78
 
79
  This function sends an HTTP request to azlyrics.com, scrapes the HTML content
80
  to find the artist's page, and returns the URL to that page.
81
-
82
  Parameters:
83
  ----------
84
  artist_name: str
85
  The name of the artist.
86
-
87
  Returns:
88
  -------
89
  url: str
@@ -119,7 +116,7 @@ def find_artist(artist_name: str):
119
  max_sim = -100
120
  for id, name in enumerate(artist_urls):
121
  # dist = Levenshtein.jaro(artist_name, name)
122
- dist = homemade_jaro(artist_name, name)
123
  if max_sim < dist:
124
  max_sim = dist
125
  min_id = id
@@ -133,23 +130,19 @@ def follow_lyrics(lyric_url: str):
133
  This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
134
  and extracts the song lyrics. The lyrics are cleaned by removing unnecessary HTML tags
135
  and whitespace. The function returns the lyrics as a string.
136
-
137
  Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
138
  specifically prohibits the usage of their content by third-party lyrics providers.
139
  Always review the website's policies and ensure you are compliant before scraping data.
140
-
141
  Parameters:
142
  ------
143
  `lyric_url`: str
144
  The URL of the song lyrics on azlyrics.com.
145
  `song_title`: str
146
  Title of the song
147
-
148
  Returns:
149
  ------
150
  `lyrics_str`:
151
  The lyrics of the song as a single string.
152
-
153
  Raises:
154
  ------
155
  `ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
@@ -190,17 +183,14 @@ def follow_lyrics(lyric_url: str):
190
  def find_artist(artist_name: str) -> str:
191
  """
192
  Finds the link for the artist page on azlyrics.com.
193
-
194
  Parameters:
195
  ------
196
  `artist_name`: str
197
  The name of the artist.
198
-
199
  Returns:
200
  ------
201
  `url`: str
202
  The URL of the artist page on azlyrics.com.
203
-
204
  Raises:
205
  ------
206
  `ValueError`: If the artist page cannot be found.
@@ -245,7 +235,7 @@ def find_artist(artist_name: str) -> str:
245
  max_sim = -100
246
  for id, name in enumerate(artist_urls):
247
  # dist = Levenshtein.jaro(artist_name, name)
248
- dist = homemade_jaro(artist_name, name)
249
  if max_sim < dist:
250
  max_sim = dist
251
  min_id = id
@@ -256,12 +246,10 @@ def find_artist(artist_name: str) -> str:
256
  def flatten_list(lst: list):
257
  """
258
  Flattens all inner lists (all depths) of a list into a list of depth == 1.
259
-
260
  Parameters:
261
  ------
262
  `lst`: List
263
  The list to be flattened.
264
-
265
  Returns:
266
  ------
267
  `result`: List
@@ -280,12 +268,10 @@ def process_lyrics(lyrics: str):
280
  """
281
  Pre-processes the lyrics by replacing `\r` with an empty string, splitting the lyrics
282
  by `\n`, and removing consecutive whitespace list items.
283
-
284
  Parameters:
285
  ------
286
  `lyrics`: str
287
  The lyrics to be pre-processed.
288
-
289
  Returns:
290
  ------
291
  `cleaned_lines`: List
@@ -306,12 +292,10 @@ def process_lyrics(lyrics: str):
306
  def sectionize(lyrics: str):
307
  """
308
  Splits the pre-processed lyrics into sections.
309
-
310
  Parameters:
311
  ------
312
  `lyrics`: str
313
  The pre-processed lyrics.
314
-
315
  Returns:
316
  ------
317
  `all_sections`: List
@@ -347,12 +331,10 @@ def analyze_sentiment_vader(text: str):
347
  """
348
  Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary and sEntiment Reasoner)
349
  sentiment analysis model.
350
-
351
  Parameters:
352
  ------
353
  `text`: str
354
  The text to be analyzed.
355
-
356
  Returns:
357
  ------
358
  `label`: str
@@ -378,23 +360,19 @@ def analyze_sentiment_vader(text: str):
378
  def get_sentiments(df):
379
  """
380
  Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.
381
-
382
  This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk)
383
  to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
384
  ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
385
  each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
386
  DataFrame is returned.
387
-
388
  Parameters:
389
  -----------
390
  df : pandas DataFrame
391
  The DataFrame containing the 'lyrics' column to be analyzed.
392
-
393
  Returns:
394
  --------
395
  df : pandas DataFrame
396
  The modified DataFrame with sentiment analysis added as new columns.
397
-
398
  Raises:
399
  -------
400
  None.
@@ -451,7 +429,7 @@ def get_metadata(artist_name: str, song_titles: list = None) -> dict:
451
  max_sim = -100
452
  for id, az_name in enumerate(az_titles):
453
  # dist = Levenshtein.jaro(title, az_name)
454
- dist = homemade_jaro(title, az_name)
455
  if max_sim < dist:
456
  max_sim = dist
457
  min_id = id
 
12
  nltk.download("vader_lexicon")
13
 
14
  # jaro distance from scratch because importing or installing the module didn't work
15
+ def jaro_distance(s1, s2):
16
  if not s1 or not s2:
17
  return 0.0
18
 
 
60
  The maximum amount of time to delay (in seconds).
61
  print_delay: bool
62
  Whether or not to print the delay time.
 
63
  Returns:
64
  -------
65
  val: float
 
77
 
78
  This function sends an HTTP request to azlyrics.com, scrapes the HTML content
79
  to find the artist's page, and returns the URL to that page.
 
80
  Parameters:
81
  ----------
82
  artist_name: str
83
  The name of the artist.
 
84
  Returns:
85
  -------
86
  url: str
 
116
  max_sim = -100
117
  for id, name in enumerate(artist_urls):
118
  # dist = Levenshtein.jaro(artist_name, name)
119
+ dist = jaro_distance(artist_name, name)
120
  if max_sim < dist:
121
  max_sim = dist
122
  min_id = id
 
130
  This function sends an HTTP request to the given `lyric_url`, parses the HTML content,
131
  and extracts the song lyrics. The lyrics are cleaned by removing unnecessary HTML tags
132
  and whitespace. The function returns the lyrics as a string.
 
133
  Note: Web scraping may be against the terms of service of some websites. Azlyrics.com
134
  specifically prohibits the usage of their content by third-party lyrics providers.
135
  Always review the website's policies and ensure you are compliant before scraping data.
 
136
  Parameters:
137
  ------
138
  `lyric_url`: str
139
  The URL of the song lyrics on azlyrics.com.
140
  `song_title`: str
141
  Title of the song
 
142
  Returns:
143
  ------
144
  `lyrics_str`:
145
  The lyrics of the song as a single string.
 
146
  Raises:
147
  ------
148
  `ValueError`: If the lyrics cannot be found or if there's an error fetching the webpage.
 
183
  def find_artist(artist_name: str) -> str:
184
  """
185
  Finds the link for the artist page on azlyrics.com.
 
186
  Parameters:
187
  ------
188
  `artist_name`: str
189
  The name of the artist.
 
190
  Returns:
191
  ------
192
  `url`: str
193
  The URL of the artist page on azlyrics.com.
 
194
  Raises:
195
  ------
196
  `ValueError`: If the artist page cannot be found.
 
235
  max_sim = -100
236
  for id, name in enumerate(artist_urls):
237
  # dist = Levenshtein.jaro(artist_name, name)
238
+ dist = jaro_distance(artist_name, name)
239
  if max_sim < dist:
240
  max_sim = dist
241
  min_id = id
 
246
  def flatten_list(lst: list):
247
  """
248
  Flattens all inner lists (all depths) of a list into a list of depth == 1.
 
249
  Parameters:
250
  ------
251
  `lst`: List
252
  The list to be flattened.
 
253
  Returns:
254
  ------
255
  `result`: List
 
268
  """
269
  Pre-processes the lyrics by replacing `\r` with an empty string, splitting the lyrics
270
  by `\n`, and removing consecutive whitespace list items.
 
271
  Parameters:
272
  ------
273
  `lyrics`: str
274
  The lyrics to be pre-processed.
 
275
  Returns:
276
  ------
277
  `cleaned_lines`: List
 
292
  def sectionize(lyrics: str):
293
  """
294
  Splits the pre-processed lyrics into sections.
 
295
  Parameters:
296
  ------
297
  `lyrics`: str
298
  The pre-processed lyrics.
 
299
  Returns:
300
  ------
301
  `all_sections`: List
 
331
  """
332
  Analyzes the sentiment of a text using the VADER (Valence Aware Dictionary and sEntiment Reasoner)
333
  sentiment analysis model.
 
334
  Parameters:
335
  ------
336
  `text`: str
337
  The text to be analyzed.
 
338
  Returns:
339
  ------
340
  `label`: str
 
360
  def get_sentiments(df):
361
  """
362
  Retrieves the sentiment analysis of all text items in the 'lyrics' column of a pandas DataFrame.
 
363
  This function applies the Vader sentiment analysis model from the Natural Language Toolkit (nltk)
364
  to each text item in the 'lyrics' column of the passed DataFrame. It assigns a sentiment label
365
  ('POSITIVE', 'NEGATIVE', or 'NEUTRAL') and a compound sentiment score (ranging from -1 to 1) to
366
  each text item. The sentiment analysis is added as new columns to the DataFrame and the modified
367
  DataFrame is returned.
 
368
  Parameters:
369
  -----------
370
  df : pandas DataFrame
371
  The DataFrame containing the 'lyrics' column to be analyzed.
 
372
  Returns:
373
  --------
374
  df : pandas DataFrame
375
  The modified DataFrame with sentiment analysis added as new columns.
 
376
  Raises:
377
  -------
378
  None.
 
429
  max_sim = -100
430
  for id, az_name in enumerate(az_titles):
431
  # dist = Levenshtein.jaro(title, az_name)
432
+ dist = jaro_distance(title, az_name)
433
  if max_sim < dist:
434
  max_sim = dist
435
  min_id = id