"""Song recommendations from a table of Spotify audio features.

Songs are looked up in a caller-supplied DataFrame first and, when missing,
fetched from the Spotify Web API. Recommendations are the rows closest (by
cosine distance on scaled features) to the first requested song.
"""

import os
from collections import defaultdict

import numpy as np
import pandas as pd
import spotipy
from dotenv import load_dotenv
from scipy.spatial.distance import cdist
from sklearn.metrics import euclidean_distances
from sklearn.preprocessing import StandardScaler
from spotipy.oauth2 import SpotifyClientCredentials

from get_scaler import get_scaler

# Numeric feature columns used to build song vectors, in the order they are
# passed to the scaler.
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms',
               'energy', 'explicit', 'instrumentalness', 'key', 'liveness',
               'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

# Load environment variables from .env file
load_dotenv()

# Access the Spotify API credentials
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(client_id=client_id,
                                          client_secret=client_secret))


def find_song(name):
    """Look up a track by name through the Spotify Web API.

    Args:
        name: Query string passed to the track search; it is also stored
            verbatim in the ``name`` column of the result.

    Returns:
        A one-row ``pandas.DataFrame`` combining track metadata (``name``,
        ``year``, ``artist``, ``explicit``, ``duration_ms``, ``popularity``)
        with every key of the track's audio-features object, or ``None``
        when the search has no hit or no audio features are available.
    """
    results = sp.search(q=name, limit=1, type='track')
    items = results["tracks"]["items"]
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track["id"])[0]
    # The audio-features endpoint yields a null entry for tracks it has no
    # analysis for; treat that like "not found" instead of crashing below.
    if audio_features is None:
        return None

    song_data = {
        "name": [name],
        "year": [int(track['album']['release_date'].split("-")[0])],
        "artist": [track['artists'][0]['name']],
        "explicit": [int(track['explicit'])],
        "duration_ms": [track['duration_ms']],
        "popularity": [track['popularity']],
    }
    # Audio-feature values are scalars; pandas broadcasts them against the
    # length-1 lists above. Keys already present (duration_ms) are replaced.
    song_data.update(audio_features)
    return pd.DataFrame(song_data)


def get_song_data(song, spotify_data):
    """Fetch the feature record for ``song``, preferring the local table.

    Args:
        song: Mapping with at least a ``'name'`` key.
        spotify_data: DataFrame with a ``'name'`` column to search.

    Returns:
        The first row of ``spotify_data`` whose ``name`` equals
        ``song['name']`` (a ``pandas.Series``); otherwise whatever
        ``find_song`` returns (a one-row DataFrame or ``None``).
    """
    matches = spotify_data[spotify_data['name'] == song['name']]
    if matches.empty:
        print("Could not find song in the fitted data. Trying to fetch online now...")
        return find_song(song['name'])

    print("Finding the song in the fitted data.")
    return matches.iloc[0]


def _feature_vector(song_data):
    """Return the ``number_cols`` values of ``song_data`` as a flat float array.

    Accepts both shapes produced by ``get_song_data``: a Series (local hit)
    and a one-row DataFrame (online hit).
    """
    return np.asarray(song_data[number_cols], dtype=float).ravel()


def get_mean_vector(song_list, spotify_data):
    """Average the feature vectors of all songs that can be resolved.

    Songs that are found neither locally nor online are skipped with a
    printed warning.

    Args:
        song_list: Iterable of mappings with a ``'name'`` key.
        spotify_data: DataFrame searched by ``get_song_data``.

    Returns:
        A 1-D array with one mean per entry of ``number_cols``. If no song
        could be resolved, NumPy's mean of an empty array (NaN) is returned.
    """
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # Flatten so Series-backed and DataFrame-backed records stack into a
        # regular 2-D matrix instead of a ragged array.
        song_vectors.append(_feature_vector(song_data))

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists.

    Args:
        dict_list: Sequence of mappings; may be empty.

    Returns:
        A dict mapping each key seen in any input mapping to the list of its
        values, in input order. An empty input yields an empty dict.
    """
    flattened_dict = {}
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict.setdefault(key, []).append(value)
    return flattened_dict


def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend songs similar to the first entry of ``song_list``.

    Only ``song_list[0]`` is used as the query point; every name in
    ``song_list`` is excluded from the output. The exclusion happens after
    the ``n_songs`` nearest rows are selected, so fewer than ``n_songs``
    records may be returned.

    Args:
        song_list: List of mappings with a ``'name'`` key.
        spotify_data: DataFrame containing ``number_cols`` plus the
            ``'name'``, ``'year'`` and ``'artists'`` columns.
        n_songs: Number of nearest rows to consider.

    Returns:
        A list of ``{'name', 'year', 'artists'}`` dicts with lower-cased
        names; an empty list when ``song_list`` is empty or the first song
        cannot be resolved.
    """
    metadata_cols = ['name', 'year', 'artists']
    if not song_list:
        return []

    song_dict = flatten_dict_list(song_list)

    seed_song = get_song_data(song_list[0], spotify_data)
    if seed_song is None:
        print('Warning: {} does not exist in Spotify or in database'.format(song_list[0]['name']))
        return []
    song_center = _feature_vector(seed_song)

    # NOTE(review): the second element of get_scaler() is used as a fitted
    # transformer - presumably fitted on number_cols; confirm in get_scaler.
    scaler = get_scaler()[1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))

    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])

    # Copy before mutating so the caller's DataFrame is never written through
    # a slice (avoids pandas' SettingWithCopy behaviour).
    rec_songs = spotify_data.iloc[index].copy()
    rec_songs['name'] = rec_songs['name'].str.lower()
    song_dict['name'] = [x.lower() for x in song_dict['name']]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].to_dict(orient='records')