File size: 987 Bytes
4a584a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc8cc6e
4a584a3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from data_processing import load_data, spotify_data, path
import pandas 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

#---------------------------Download the required NLTK resources--------------------------------

#nltk.download('punkt')
#nltk.download('stopwords')

# Translation table that deletes every ASCII punctuation character. It is the
# same for every call, so it is built once instead of once per lyric.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the English stop-word set (see _english_stop_words).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Loaded on first use rather than at import time so that importing the
        # module does not require the corpus to be available yet.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_lyrics(lyrics):
    """Tokenize a lyric string and return its normalized, meaningful words.

    Each token produced by ``word_tokenize`` is lower-cased and stripped of
    ASCII punctuation. Tokens that end up empty (they consisted only of
    punctuation, e.g. "," or "!") and English stop words are discarded.

    Args:
        lyrics: The raw lyric text. Any non-string value (for example a NaN
            placeholder for missing lyrics) is treated as having no words.

    Returns:
        A list of cleaned tokens, in their original order.
    """
    # Missing or non-text cells cannot be tokenized; report "no words" instead
    # of letting the tokenizer raise in the middle of a bulk apply.
    if not isinstance(lyrics, str):
        return []

    stop_words = _english_stop_words()
    cleaned_tokens = []
    for token in word_tokenize(lyrics):
        word = token.lower().translate(_PUNCTUATION_TABLE)
        # `word` is empty when the token was pure punctuation; skip those so
        # the result does not contain '' entries.
        if word and word not in stop_words:
            cleaned_tokens.append(word)
    return cleaned_tokens

# Run clean_lyrics on every entry of the 'text' column and store the returned
# token lists in a new 'cleaned_text' column. The assignment adds the column to
# the spotify_data object imported from data_processing (it is modified in place).
spotify_data['cleaned_text'] = spotify_data['text'].apply(clean_lyrics)
# Write the table, including the new column, to a CSV file at a relative path;
# index=False leaves the row index out of the file.
# NOTE(review): each cell of 'cleaned_text' is a Python list, so it is presumably
# written as its string representation -- confirm how downstream readers parse it.
spotify_data.to_csv('spotify_data_processed.csv', index=False)

#print(spotify_data['cleaned_text'].head())