|
|
|
import os |
|
import spotipy |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
from get_scaler import get_scaler |
|
from dotenv import load_dotenv |
|
from spotipy.oauth2 import SpotifyClientCredentials |
|
from collections import defaultdict |
|
from sklearn.metrics import euclidean_distances |
|
from scipy.spatial.distance import cdist |
|
|
|
# Names of the numeric feature columns that are selected from song records
# when building the vectors used for distance comparison.
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]
|
|
|
|
|
# Load variables from a .env file (if one is found) into the process
# environment before the credentials are read below.
load_dotenv()

# Spotify API credentials; os.getenv yields None for a variable that is unset.
client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

# Module-wide Spotify client, authenticated with the client-credentials manager.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    ),
)
|
|
|
def find_song(name):
    """Search Spotify for a track by name and return its features.

    Only the first search hit is used. Track metadata (year, first artist,
    explicit flag, duration, popularity) is combined with every field of the
    track's audio-features record.

    Args:
        name: Free-text query passed to the Spotify track search. It is also
            stored verbatim in the ``name`` column of the result.

    Returns:
        A one-row ``pandas.DataFrame`` describing the track, or ``None`` when
        the search has no hits or no audio-features record is available for
        the hit.
    """
    results = sp.search(q=name, limit=1, type='track')
    items = results["tracks"]["items"]
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track["id"])[0]
    if audio_features is None:
        # The audio-features entry can be None for a track; report that the
        # same way as "not found" instead of failing on None.items().
        return None

    # Metadata values are wrapped in one-element lists so the frame gets
    # exactly one row; the scalar audio features are broadcast to that row.
    song_data = {
        "name": [name],
        "year": [int(track['album']['release_date'].split("-")[0])],
        "artist": [track['artists'][0]['name']],
        "explicit": [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    # Keys present in both sources (e.g. duration_ms) take the
    # audio-features value, as before.
    song_data.update(audio_features)

    return pd.DataFrame(song_data)
|
|
|
|
|
def get_song_data(song, spotify_data):
    """Return the record for ``song``, preferring the local dataset.

    The first row of ``spotify_data`` whose ``name`` equals ``song['name']``
    is returned. When there is no such row, the song is looked up online via
    ``find_song``.

    Args:
        song: Mapping with at least a ``'name'`` entry.
        spotify_data: Table with a ``'name'`` column (presumably a pandas
            DataFrame -- confirm against callers).

    Returns:
        The first matching row of ``spotify_data``; otherwise whatever
        ``find_song`` returns for the name (which may be ``None``).
    """
    try:
        same_name = spotify_data['name'] == song['name']
        first_match = spotify_data[same_name].iloc[0]
    except IndexError:
        # No row carries this name, so positional access to row 0 failed.
        print("Could not find song in the fitted data. Trying to fetch online now...")
        return find_song(song['name'])
    else:
        print("Finding the song in the fitted data.")
        return first_match
|
|
|
|
|
def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Each song is resolved with ``get_song_data``; songs that cannot be
    resolved are reported with a warning and skipped.

    Args:
        song_list: Iterable of mappings, each with at least a ``'name'`` entry.
        spotify_data: Local dataset handed through to ``get_song_data``.

    Returns:
        A 1-D float ``numpy`` array with one entry per column in
        ``number_cols``, holding the element-wise mean over all resolved
        songs. If no song could be resolved, the mean of an empty array is
        returned (NaN).
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # A local hit is a single row (1-D values) while an online hit is a
        # one-row frame (2-D values). Flatten both to the same 1-D shape so
        # they can be stacked into a regular matrix.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)
|
|
|
|
|
def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists.

    Every key seen in any of the dicts maps to the list of values found under
    that key, in input order. Keys appear in order of first occurrence.

    Args:
        dict_list: Sequence of dictionaries; may be empty, and the
            dictionaries need not share the same keys.

    Returns:
        A ``defaultdict`` (with no default factory, so unknown keys still
        raise ``KeyError``) mapping each key to its list of values. Empty
        input yields an empty mapping.
    """
    # A list factory while building removes the need to pre-seed keys from
    # the first dict, which failed on empty input and on keys that only
    # occur in later dicts.
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    # Clear the factory so the returned object behaves like before:
    # looking up a missing key raises instead of silently inserting [].
    flattened_dict.default_factory = None
    return flattened_dict
|
|
|
|
|
def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend songs from ``spotify_data`` similar to the first input song.

    Only ``song_list[0]`` is used as the reference point: its numeric
    features are scaled and compared by cosine distance against every row of
    ``spotify_data``. The ``n_songs`` nearest rows are selected, and rows
    whose (case-insensitive) name equals any input song name are then
    removed, so fewer than ``n_songs`` results may be returned.

    Args:
        song_list: List of mappings, each with a ``'name'`` entry.
        spotify_data: Dataset containing the ``number_cols`` feature columns
            plus ``'name'``, ``'year'`` and ``'artists'``.
        n_songs: Number of nearest rows to select before filtering.

    Returns:
        A list of dicts with keys ``'name'`` (lower-cased), ``'year'`` and
        ``'artists'``. The list is empty when ``song_list`` is empty or the
        reference song cannot be found locally or online.
    """
    metadata_cols = ['name', 'year', 'artists']

    if not song_list:
        return []

    seed_song = get_song_data(song_list[0], spotify_data)
    if seed_song is None:
        # Neither the local data nor the online lookup knows this song;
        # without a reference vector there is nothing to compare against.
        print('Warning: {} does not exist in Spotify or in database'.format(song_list[0]['name']))
        return []
    song_center = seed_song[number_cols].values

    # get_scaler() is a project helper; its second element is used as the
    # scaler (presumably already fitted on this feature set -- confirm).
    scaler = get_scaler()[1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    # reshape(1, -1) yields one row whether the reference values are 1-D
    # (local hit) or already 2-D (online hit).
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    nearest = list(np.argsort(distances[0])[:n_songs])

    # Copy before editing so the caller's dataset is never written through
    # a slice (and pandas does not emit SettingWithCopyWarning).
    rec_songs = spotify_data.iloc[nearest].copy()
    rec_songs['name'] = rec_songs['name'].str.lower()

    input_names = {song['name'].lower() for song in song_list if 'name' in song}
    rec_songs = rec_songs[~rec_songs['name'].isin(input_names)]
    return rec_songs[metadata_cols].to_dict(orient='records')
|
|