Amit Kumar
fit data using kmeans
history blame
No virus
3.62 kB
import os
import spotipy
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from get_scaler import get_scaler
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
# Numeric feature columns that make up the vector used to compare songs.
number_cols = [
    'valence', 'year', 'acousticness', 'danceability', 'duration_ms',
    'energy', 'explicit', 'instrumentalness', 'key', 'liveness',
    'loudness', 'mode', 'popularity', 'speechiness', 'tempo',
]

# Load environment variables from .env file
load_dotenv()

# Access the Spotify API credentials
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# Module-level Spotify client shared by the lookup helpers below.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
)
def find_song(name):
    """Fetch a single track from Spotify by name.

    Args:
        name: Track name used as the search query.

    Returns:
        A one-row ``pandas.DataFrame`` holding the track's name, release
        year, first artist, explicit flag, duration, popularity and every
        key of its audio-features record, or ``None`` when the search
        returns no track or Spotify has no audio features for it.
    """
    # NOTE(review): the search call was truncated in the source; querying by
    # the bare track name is assumed -- confirm against the original file.
    results = sp.search(q=name, limit=1, type='track')
    items = results["tracks"]["items"]
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track["id"])[0]
    if audio_features is None:
        # Spotify returns a null entry for tracks without audio features.
        return None

    song_data = {
        "name": [name],
        "year": [int(track['album']['release_date'].split("-")[0])],
        "artist": [track['artists'][0]['name']],
        "explicit": [int(track['explicit'])],
        "duration_ms": [track['duration_ms']],
        "popularity": [track['popularity']],
    }
    # Audio-feature values are scalars; pandas broadcasts them across the
    # single row. Keys that repeat (e.g. duration_ms) are overwritten here.
    song_data.update(audio_features)
    return pd.DataFrame(song_data)
def get_song_data(song, spotify_data):
    """Return the data for ``song``, preferring the local dataset.

    Args:
        song: Mapping with a ``'name'`` key identifying the song.
        spotify_data: DataFrame with a ``'name'`` column to search.

    Returns:
        The first row of ``spotify_data`` whose ``'name'`` equals
        ``song['name']`` (a ``pandas.Series``). When no row matches, the
        result of ``find_song(song['name'])`` is returned instead.
    """
    matches = spotify_data[spotify_data['name'] == song['name']]
    try:
        # .iloc[0] raises IndexError when the filter matched no rows.
        song_data = matches.iloc[0]
    except IndexError:
        print("Could not find song in the fitted data. Trying to fetch online now...")
        return find_song(song['name'])
    print("Finding the song in the fitted data.")
    return song_data
def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Args:
        song_list: Iterable of mappings, each with a ``'name'`` key.
        spotify_data: DataFrame passed through to ``get_song_data``.

    Returns:
        A 1-D NumPy array with the element-wise mean over ``number_cols``
        of every song that could be resolved. Songs that cannot be
        resolved are skipped with a warning; if none resolve, NumPy's mean
        of an empty array (NaN) is returned.
    """
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data may yield a Series (local hit) or a one-row
        # DataFrame (online hit); flatten so both give a 1-D vector.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)
def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists.

    Args:
        dict_list: Sequence of dictionaries.

    Returns:
        A ``defaultdict(list)`` mapping each key to the list of values it
        had across ``dict_list``, in input order. An empty input yields an
        empty mapping.
    """
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict
def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend songs from ``spotify_data`` similar to the seed song.

    Args:
        song_list: List of mappings, each with a ``'name'`` key. Only the
            first entry is used as the similarity seed; every entry's name
            is excluded from the result.
        spotify_data: DataFrame containing ``number_cols`` plus the
            ``'name'``, ``'year'`` and ``'artists'`` columns.
        n_songs: Maximum number of recommendations to return.

    Returns:
        A list of dicts with keys ``'name'`` (lower-cased), ``'year'`` and
        ``'artists'``, ordered from most to least similar by cosine
        distance. Empty when ``song_list`` is empty or the seed song
        cannot be resolved.
    """
    metadata_cols = ['name', 'year', 'artists']
    if not song_list:
        return []
    song_dict = flatten_dict_list(song_list)

    # NOTE: only the first seed song defines the query point; averaging
    # all seeds via get_mean_vector is the alternative.
    seed = get_song_data(song_list[0], spotify_data)
    if seed is None:
        return []
    song_center = np.asarray(seed[number_cols].values)

    # presumably element 1 of get_scaler()'s result is the fitted scaler
    # -- confirm against get_scaler.
    scaler = get_scaler()[1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')

    # Drop the seed songs *before* truncating so they do not use up
    # slots among the n_songs nearest neighbours.
    seed_names = [x.lower() for x in song_dict['name']]
    names_lower = spotify_data['name'].str.lower()
    is_seed = names_lower.isin(seed_names).to_numpy()
    order = np.argsort(distances[0])
    index = order[~is_seed[order]][:n_songs]

    # Copy so the lower-cased names are not written into spotify_data.
    rec_songs = spotify_data.iloc[index].copy()
    rec_songs['name'] = names_lower.iloc[index].to_numpy()
    return rec_songs[metadata_cols].to_dict(orient='records')