Diego-0121 committed on
Commit
4a584a3
1 Parent(s): eedbb45

Upload 5 files

Browse files
Recomendation.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.metrics.pairwise import cosine_similarity
2
+ import pandas as pd
3
+ import numpy as np
4
+ from vectorization import spotify_data
5
+ import json
6
+ import gradio as gr
7
+ from gradio.components import Textbox
8
+ from ast import literal_eval
9
# Load the song table; its 'song_vector' column is parsed into NumPy arrays
# further down before being used for similarity search.
spotify_data_processed = pd.read_csv(
    'C:\\Users\\34640\\Desktop\\Saturdays.ai\\spotify_dset\\dataset_modificado.csv'
)
10
+
11
# Single-pass clean-up applied to a serialized vector before splitting it:
# brackets and carriage returns are deleted, newlines and commas become spaces.
_VECTOR_TEXT_TABLE = str.maketrans(
    {'[': None, ']': None, '\r': None, '\n': ' ', ',': ' '}
)


def convert_string_to_array(str_vector, fallback_size=100):
    """Convert a serialized vector (e.g. a CSV cell) into a NumPy array.

    Args:
        str_vector: Value to convert. A ``numpy.ndarray`` is returned as is;
            a ``list``/``tuple`` of numbers is converted element by element;
            a string of numbers separated by whitespace and/or commas,
            optionally wrapped in square brackets, is parsed.
        fallback_size: Length of the zero vector returned when the value
            cannot be converted. Defaults to 100, the length that was
            previously hard-coded.

    Returns:
        A one-dimensional ``numpy.ndarray``. If the value is of an
        unsupported type (for example ``None`` or a NaN float coming from an
        empty CSV cell) or contains a non-numeric item, a message is printed
        and ``numpy.zeros((fallback_size,))`` is returned instead.
    """
    # Already converted: hand the array back untouched.
    if isinstance(str_vector, np.ndarray):
        return str_vector

    if isinstance(str_vector, str):
        items = str_vector.translate(_VECTOR_TEXT_TABLE).split()
    elif isinstance(str_vector, (list, tuple)):
        items = str_vector
    else:
        # Missing cells arrive as None/NaN; calling .replace()/.translate()
        # on them would raise, so fall back explicitly.
        print("Error:", f"unsupported vector value: {str_vector!r}")
        return np.zeros((fallback_size,))

    try:
        return np.array([float(item) for item in items])
    except (TypeError, ValueError) as e:
        print("Error:", e)
        return np.zeros((fallback_size,))
23
+
24
+
25
# Parse every serialized vector of the dataset into a NumPy array.
_serialized_vectors = spotify_data_processed['song_vector']
spotify_data_processed['song_vector'] = _serialized_vectors.apply(convert_string_to_array)


# Run the converter over the first rows and print them to inspect the result.
# The column was converted just above, so the converter receives arrays here.
sample_data = spotify_data_processed['song_vector'].head()
converted_vectors = sample_data.apply(convert_string_to_array)
print(converted_vectors)
32
+
33
+
34
+
35
def recommend_song(song_name, artist_name, spotify_data_processed, top_n=4):
    """Return the songs whose vectors are most similar to the requested song.

    Args:
        song_name: Title of the reference song.
        artist_name: Artist of the reference song.
        spotify_data_processed: DataFrame with ``'song'``, ``'artist'`` and
            ``'song_vector'`` columns.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the ``'song'`` and ``'artist'`` columns of the most
        similar rows, best match first. If the song is not found, a
        DataFrame with a single ``"Error"`` column holding the message.
    """
    songs = spotify_data_processed['song']
    artists = spotify_data_processed['artist']

    # Exact match first, so existing callers get the same row as before.
    matches = ((songs == song_name) & (artists == artist_name)).to_numpy()

    if not matches.any():
        # Fall back to a case- and whitespace-insensitive comparison so that
        # user-typed input still finds the song.
        wanted_song = str(song_name).strip().casefold()
        wanted_artist = str(artist_name).strip().casefold()
        matches = (
            (songs.astype(str).str.strip().str.casefold() == wanted_song)
            & (artists.astype(str).str.strip().str.casefold() == wanted_artist)
        ).to_numpy()

    if not matches.any():
        return pd.DataFrame({"Error": ["Canción no encontrada en la base de datos."]})

    # Positional index of the first matching row; used to read its vector
    # and to exclude it from the ranking below.
    query_pos = int(np.flatnonzero(matches)[0])

    song_vec = spotify_data_processed['song_vector'].iloc[query_pos]
    if isinstance(song_vec, str):
        song_vec = convert_string_to_array(song_vec)

    all_song_vectors = np.array(spotify_data_processed['song_vector'].tolist())
    similarities = cosine_similarity([song_vec], all_song_vectors)[0]

    # Rank by descending similarity and drop the query row explicitly.
    # Skipping "the first result" is not enough: with tied scores the query
    # song is not guaranteed to be ranked first.
    ranked = np.argsort(similarities)[::-1]
    top_indices = ranked[ranked != query_pos][:max(top_n, 0)]

    return spotify_data_processed.iloc[top_indices][['song', 'artist']]
63
+
64
+
65
+
66
+
67
+
68
def recommend_song_interface(song_name, artist_name):
    """Adapt ``recommend_song`` to the four text outputs of the Gradio UI.

    Args:
        song_name: Song title typed by the user.
        artist_name: Artist name typed by the user.

    Returns:
        A list of exactly four strings. Each recommendation is formatted as
        ``"<song> by <artist>"``. When the lookup fails, the error message is
        placed in the first slot. Unused slots are empty strings. If
        ``recommend_song`` returns something other than a DataFrame, that
        value is returned unchanged.
    """
    n_outputs = 4  # the interface declares four output text boxes

    recommendations_df = recommend_song(
        song_name, artist_name, spotify_data_processed, top_n=n_outputs
    )

    if not isinstance(recommendations_df, pd.DataFrame):
        return recommendations_df

    if "Error" in recommendations_df.columns:
        # The "not found" result has a single column, so it cannot be
        # unpacked as (song, artist) pairs; show its message instead.
        lines = [str(message) for message in recommendations_df["Error"]]
    else:
        pairs = recommendations_df[['song', 'artist']].values.tolist()
        lines = [f"{song} by {artist}" for song, artist in pairs]

    # Always hand back one value per output component.
    lines = lines[:n_outputs]
    return lines + [""] * (n_outputs - len(lines))
78
+
79
+ # Crear la interfaz con Gradio
80
# Gradio UI: two text inputs (song title, artist) wired to
# recommend_song_interface, and four text outputs, one per recommendation.
iface = gr.Interface(
    title="Recomendador de Canciones",
    description="Ingrese el título de una canción y el nombre del artista para obtener recomendaciones.",
    fn=recommend_song_interface,
    inputs=[
        gr.Textbox(placeholder="Ingrese el título de la canción", label="Título de la Canción"),
        gr.Textbox(placeholder="Ingrese el nombre del artista", label="Nombre del Artista"),
    ],
    outputs=[gr.Text(label=f"Recomendación {slot}") for slot in range(1, 5)],
    theme="dark",  # Comment out or remove if the dark theme is unavailable
    css="""
    body {font-family: Arial, sans-serif;}
    .input_text {background-color: #f0f0f0; border-radius: 5px;}
    .output_text {border: 2px solid #f0f0f0; border-radius: 5px; padding: 10px;}
    """
)

# Start the web app.
iface.launch()
data_processing.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import sys
4
+ import codecs
5
+ #-------------------Load_data, function that loads the Spotify Dataset 1921-2020, 600k+--------------------------
6
+ #-------------------Tracks and checks with an error check if the data has been loaded correctly.----------------
7
+
8
def load_data(path, **read_csv_kwargs):
    """Load a CSV file into a DataFrame, reporting failures instead of raising.

    Args:
        path: Location of the CSV file, passed to ``pandas.read_csv``.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``encoding`` or ``nrows``).

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file does not
        exist or cannot be read. A message describing the problem is printed
        in that case.
    """
    try:
        return pd.read_csv(path, **read_csv_kwargs)
    except FileNotFoundError:
        print(f"The document is not found in the directory: {path}")
    except Exception as e:
        # Deliberately broad: callers rely on getting None for any load
        # failure and check for it themselves.
        print(f"An error occurred loading the file: {e}")
    return None
18
path = 'C:\\Users\\34640\\Desktop\\Saturdays.ai\\spotify_dset\\spotify_millsongdata.csv\\spotify_millsongdata.csv'
spotify_data = load_data(path)

if spotify_data is not None:
    # Rename the columns only after confirming the load succeeded; doing it
    # before the check raises AttributeError when load_data returns None.
    spotify_data.columns = ['artist', 'song', 'link', 'text']
    print("-----------Suscessfully loaded-------------")

    # NOTE(review): the original indentation was not recoverable; the
    # lower-casing step is treated here as part of the per-column loop
    # because it reuses the loop variable. Confirm against the source file.
    for col in spotify_data.columns:
        column = spotify_data[col]

        # Fill missing values with the most frequent value of the column.
        # mode() is empty when the whole column is missing, so guard the
        # lookup instead of indexing blindly.
        most_frequent = column.mode()
        if not most_frequent.empty:
            column = column.fillna(most_frequent.iloc[0])

        # Lower-case and delete every character that is neither a word
        # character nor whitespace. Raw string: '\w' and '\s' are not valid
        # escapes in a normal string literal.
        spotify_data[col] = column.str.lower().str.replace(r'[^\w\s]', '', regex=True)

    # Delete duplicated rows.
    spotify_data = spotify_data.drop_duplicates()
else:
    print("No spotify data")
44
+
45
+
46
+
tokenizer.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from data_processing import load_data, spotify_data, path
2
+ import pandas
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk.tokenize import word_tokenize
6
+ import string
7
+
8
+ #---------------------------Download the requirements NLTK--------------------------------
9
+
10
+ #nltk.download('punkt')
11
+ #nltk.download('stopwords')
12
+
13
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stop words, loaded from NLTK on first use and then reused so the
# corpus is not re-read for every song.
_STOP_WORDS = None


def clean_lyrics(lyrics):
    """Tokenize lyrics and return the lower-cased, filtered tokens.

    Args:
        lyrics: Text of a song. Any non-string value (for example a missing
            cell) is treated as having no lyrics.

    Returns:
        A list of tokens in their original order, lower-cased, with ASCII
        punctuation removed. English stop words are dropped, as are tokens
        that become empty once punctuation is stripped.
    """
    global _STOP_WORDS

    if not isinstance(lyrics, str):
        return []

    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))

    stripped_tokens = (
        token.lower().translate(_PUNCTUATION_TABLE)
        for token in word_tokenize(lyrics)
    )

    # `word` is falsy for tokens that consisted only of punctuation; those
    # used to leak into the result as empty strings.
    return [word for word in stripped_tokens if word and word not in _STOP_WORDS]
29
+
30
# Clean the lyrics of every song and store the token lists in a new column.
_lyrics = spotify_data['text']
spotify_data['cleaned_text'] = _lyrics.apply(clean_lyrics)

# Write the table, including the new column, to CSV without the index.
spotify_data.to_csv(
    'C:\\Users\\34640\\Desktop\\Saturdays.ai\\spotify_dset\\spotify_data_processed.csv',
    index=False,
)
33
+
34
+ #print(spotify_data['cleaned_text'].head())
vectorial_representation.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from vectorization import model, spotify_data
3
+
4
+ # Función para convertir una canción en un vector promedio de sus palabras
5
def song_vector(tokens, model):
    """Represent a song as the average of the vectors of its known words.

    Args:
        tokens: Iterable of word tokens for one song. ``None`` or a float
            (how a missing cell shows up in a DataFrame) is treated as a
            song without tokens.
        model: Object exposing ``wv`` (supporting ``wv.key_to_index``
            membership tests and ``wv[word]`` lookups) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of the tokens present in the
        model vocabulary, or ``numpy.zeros(model.vector_size)`` when there
        are none.
    """
    # Missing token lists are not iterable; treat them as empty.
    if tokens is None or isinstance(tokens, float):
        return np.zeros(model.vector_size)

    vocabulary = model.wv.key_to_index
    known_words = [word for word in tokens if word in vocabulary]

    if not known_words:
        return np.zeros(model.vector_size)

    return np.mean([model.wv[word] for word in known_words], axis=0)
15
+
16
# Compute one vector per song from its token list.
_token_lists = spotify_data['cleaned_text']
spotify_data['song_vector'] = _token_lists.apply(lambda x: song_vector(x, model))

# Write the table, including the new column, to CSV without the index.
spotify_data.to_csv(
    'C:\\Users\\34640\\Desktop\\Saturdays.ai\\spotify_dset\\dataset_modificado.csv',
    index=False,
)
19
+
20
+
vectorization.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gensim.models import Word2Vec
2
+ import pandas as pd
3
+
4
+
5
from ast import literal_eval

# NOTE: everything below runs at import time, so importing `model` or
# `spotify_data` from this module reloads the CSV and retrains the model.
spotify_data = pd.read_csv('C:\\Users\\34640\\Desktop\\Saturdays.ai\\spotify_dset\\spotify_data_processed.csv')

# 'cleaned_text' is expected to hold the text form of Python lists of tokens.
# SECURITY: this used eval(), which executes whatever expression a cell
# contains; literal_eval only accepts Python literals.
spotify_data['cleaned_text'] = spotify_data['cleaned_text'].apply(literal_eval)

# Train Word2Vec on the token lists.
model = Word2Vec(sentences=spotify_data['cleaned_text'], vector_size=100, window=10, min_count=1, workers=5)

# Save the model.
model.save("word2vec_model.model")