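# Non-code header text that preceded the source (kept verbatim, commented out
# so the module parses):
# Amit Kumar
# fit data using kmeans
# e7b83d9
# raw
# history blame
# No virus
# 3.62 kB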
import os
import spotipy
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from get_scaler import get_scaler
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
# Feature columns selected from song rows to build numeric vectors. Selecting
# with this list fixes the column order of every vector/matrix built from it.
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]
# Load environment variables from the .env file before they are read below.
load_dotenv()

# Spotify API credentials; either value is None when its variable is unset.
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')

# Module-wide Spotify client, authenticated with the credentials above.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    ),
)
def find_song(name):
    """Look up a track on Spotify and return its metadata and audio features.

    Runs a track search for ``name`` through the module-level ``sp`` client,
    keeps only the top hit, and merges that track's metadata with its audio
    features.

    Args:
        name: Search query sent to Spotify, normally the song title.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``name``, ``year``,
        ``artist``, ``explicit``, ``duration_ms``, ``popularity`` plus every
        key of the audio-features object, or ``None`` when the search has no
        hits or Spotify has no audio features for the track.
    """
    results = sp.search(q=name, limit=1, type='track')
    items = results["tracks"]["items"]
    if not items:
        return None

    track = items[0]

    # The audio-features response holds a null entry for a track Spotify has
    # no features for; without them the song cannot be turned into a vector.
    features = sp.audio_features(track["id"])
    audio_features = features[0] if features else None
    if audio_features is None:
        return None

    song_data = {
        # The stored name is the caller's query, not the title Spotify returned.
        "name": [name],
        # release_date starts with the year ("YYYY", "YYYY-MM" or "YYYY-MM-DD").
        "year": [int(track['album']['release_date'].split("-")[0])],
        "artist": [track['artists'][0]['name']],
        "explicit": [int(track['explicit'])],
        "duration_ms": [track['duration_ms']],
        "popularity": [track['popularity']],
    }
    # Audio-feature values are scalars; keys that repeat an entry above
    # (e.g. duration_ms) overwrite it, as in the original assignment loop.
    song_data.update(audio_features)
    return pd.DataFrame(song_data)
def get_song_data(song, spotify_data):
    """Return the data for ``song``, preferring the local dataset.

    Args:
        song: Mapping with at least a ``'name'`` entry.
        spotify_data: DataFrame with a ``'name'`` column; matching is an exact,
            case-sensitive comparison against ``song['name']``.

    Returns:
        The first matching row of ``spotify_data`` (a ``pandas.Series``) when
        the name is present. Otherwise whatever ``find_song`` returns for the
        name: a one-row DataFrame, or ``None`` if the online lookup fails.
        NOTE: the two paths therefore return different types.
    """
    wanted_name = song['name']
    name_matches = spotify_data[spotify_data['name'] == wanted_name]

    # No local row with this name: fall back to the online lookup.
    if len(name_matches) == 0:
        print("Could not find song in the fitted data. Trying to fetch online now...")
        return find_song(wanted_name)

    first_match = name_matches.iloc[0]
    print("Finding the song in the fitted data.")
    return first_match
def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Each song is resolved with ``get_song_data``; songs that cannot be
    resolved are reported with a warning and skipped.

    Args:
        song_list: Iterable of mappings, each with at least a ``'name'`` entry.
        spotify_data: DataFrame passed through to ``get_song_data``.

    Returns:
        A 1-D float ``numpy.ndarray`` with one entry per column in
        ``number_cols``: the element-wise mean over all resolved songs.

    Raises:
        ValueError: If none of the songs could be resolved, since a mean over
            zero vectors is undefined.
    """
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for local hits but a one-row DataFrame
        # for songs fetched online; flatten both to the same 1-D float shape so
        # the stacked matrix is rectangular.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found; cannot compute a mean vector')

    song_matrix = np.vstack(song_vectors)
    return np.mean(song_matrix, axis=0)
def flatten_dict_list(dict_list):
    """Turn a list of dicts into a single dict of lists.

    Args:
        dict_list: Iterable of mappings, e.g.
            ``[{'name': 'a', 'year': 1}, {'name': 'b', 'year': 2}]``.

    Returns:
        A dict mapping every key seen in any input dict to the list of values
        found under that key, in input order, e.g.
        ``{'name': ['a', 'b'], 'year': [1, 2]}``. An empty input gives ``{}``.
    """
    flattened_dict = {}
    for dictionary in dict_list:
        for key, value in dictionary.items():
            # setdefault creates the list on first sight of a key, so keys
            # missing from the first dict no longer raise KeyError.
            flattened_dict.setdefault(key, []).append(value)
    return flattened_dict
def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend the songs in ``spotify_data`` closest to the first input song.

    The numeric features (``number_cols``) of the first song in ``song_list``
    are scaled together with the whole dataset, and dataset rows are ranked by
    cosine distance to that song. Rows whose lower-cased name equals any input
    song's name are excluded before the top ``n_songs`` are taken.

    Args:
        song_list: List of mappings, each with at least a ``'name'`` entry.
        spotify_data: DataFrame containing ``number_cols`` plus the ``name``,
            ``year`` and ``artists`` columns.
        n_songs: Maximum number of recommendations to return.

    Returns:
        A list of ``{'name', 'year', 'artists'}`` dicts ordered from closest to
        farthest, with ``name`` lower-cased. Empty when ``song_list`` is empty
        or the first song cannot be found locally or online.
    """
    metadata_cols = ['name', 'year', 'artists']
    if not song_list:
        return []

    song_dict = flatten_dict_list(song_list)
    seed_names = [x.lower() for x in song_dict['name']]

    # NOTE: only the first song defines the query point; the other songs are
    # used solely to filter the output. get_mean_vector(song_list, spotify_data)
    # would average all of them instead.
    seed_song = get_song_data(song_list[0], spotify_data)
    if seed_song is None:
        print('Warning: {} does not exist in Spotify or in database'.format(song_list[0]['name']))
        return []
    # Series (local hit) and one-row DataFrame (online hit) both end up (1, n).
    song_center = np.asarray(seed_song[number_cols].values, dtype=float).reshape(1, -1)

    # Element 1 of get_scaler()'s result is used as an already-fitted scaler;
    # presumably it returns something like (model, scaler) - TODO confirm.
    scaler = get_scaler()[1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center)
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]

    # Rank every row, remove the input songs, and only then truncate, so that
    # filtering out the seed (distance 0 to itself) does not shrink the result.
    ranked = np.argsort(distances)
    is_seed = spotify_data['name'].str.lower().isin(seed_names).to_numpy()
    keep = ranked[~is_seed[ranked]][:n_songs]

    # Work on an explicit copy so lower-casing the names neither touches
    # spotify_data nor triggers pandas' chained-assignment warning.
    rec_songs = spotify_data.iloc[keep][metadata_cols].copy()
    rec_songs['name'] = rec_songs['name'].str.lower()
    return rec_songs.to_dict(orient='records')