# Spaces: amitca75/spotify_music_recommendation_system (Running)
# File size: 3,621 Bytes


import os
import spotipy
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from get_scaler import get_scaler
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

# Numeric feature columns that make up a song's vector. They are selected
# from the dataset and handed to the scaler as-is, so presumably this must
# stay in the order the scaler from get_scaler was fitted with -- confirm.
number_cols = [
    'valence', 'year', 'acousticness', 'danceability', 'duration_ms',
    'energy', 'explicit', 'instrumentalness', 'key', 'liveness',
    'loudness', 'mode', 'popularity', 'speechiness', 'tempo',
]

# Populate the process environment from a local .env file, if one exists
load_dotenv()

# Spotify API credentials; either value is None when its variable is unset
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')

# Shared Spotify client used by find_song for search and audio-feature lookups
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    ),
)

def find_song(name):
    """Look up a track on Spotify by name and return it as a one-row DataFrame.

    Searches Spotify for ``name`` (first hit only) and then fetches that
    track's audio features through the module-level ``sp`` client.

    Args:
        name: Search query, typically the song title.

    Returns:
        A single-row ``pandas.DataFrame`` with the columns ``name`` (the query
        string as passed in, not the matched track's title), ``year``,
        ``artist``, ``explicit``, ``duration_ms``, ``popularity`` plus one
        column per key of the audio-features payload. Returns ``None`` when
        the search has no hits or no audio features are available for the
        matched track.
    """
    results = sp.search(q=name, limit=1, type='track')
    items = results["tracks"]["items"]
    if not items:
        return None

    track = items[0]
    features = sp.audio_features(track["id"])
    # Spotify can report a null entry for a track it has no audio analysis
    # for; treat that like "not found" instead of crashing on None.items().
    if not features or features[0] is None:
        return None

    song_data = {
        "name": [name],
        # release_date is "YYYY", "YYYY-MM" or "YYYY-MM-DD"; keep the year only
        "year": [int(track['album']['release_date'].split("-")[0])],
        "artist": [track['artists'][0]['name']],
        "explicit": [int(track['explicit'])],
        "duration_ms": [track['duration_ms']],
        "popularity": [track['popularity']],
    }
    # Audio-feature values are scalars; pandas broadcasts them against the
    # length-1 lists above. The payload's own duration_ms replaces the one
    # taken from the track object.
    song_data.update(features[0])

    return pd.DataFrame(song_data)


def get_song_data(song, spotify_data):
    """Fetch one song's data, preferring the local dataset over Spotify.

    Args:
        song: Mapping with at least a ``'name'`` entry.
        spotify_data: DataFrame with a ``'name'`` column to search in.

    Returns:
        The first row of ``spotify_data`` whose ``name`` equals
        ``song['name']`` (a ``pandas.Series``). When no row matches, whatever
        ``find_song(song['name'])`` returns instead.
    """
    try:
        name_matches = spotify_data['name'] == song['name']
        first_hit = spotify_data[name_matches].iloc[0]
    except IndexError:
        # .iloc[0] on an empty selection: the song is not in the dataset
        print("Could not find song in the fitted data. Trying to fetch online now...")
        return find_song(song['name'])
    else:
        print("Finding the song in the fitted data.")
        return first_hit


def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Each song is resolved with ``get_song_data``; songs that cannot be
    resolved are reported with a printed warning and skipped.

    Args:
        song_list: Iterable of mappings, each with a ``'name'`` entry.
        spotify_data: DataFrame searched by ``get_song_data``.

    Returns:
        A 1-D array with one mean value per entry of ``number_cols``. If no
        song could be resolved the result is NaN (NumPy's mean of an empty
        array), as before.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data yields a Series for dataset hits, but find_song yields
        # a one-row DataFrame; flatten both to the same 1-D shape so that a
        # mix of the two still stacks into a regular matrix.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    """Turn a list of dicts into a single dict of lists.

    Values are collected per key in the order the dicts appear, e.g.
    ``[{'a': 1}, {'a': 2}]`` becomes ``{'a': [1, 2]}``.

    Args:
        dict_list: List of dictionaries. May be empty, and the dictionaries
            need not share the same keys.

    Returns:
        A ``defaultdict`` without a default factory (so looking up an unknown
        key still raises ``KeyError``) mapping every key seen to the list of
        its values. Empty for empty input.
    """
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    # The factory was only needed while collecting; drop it so callers do not
    # silently create empty entries by reading a key that was never present.
    flattened_dict.default_factory = None
    return flattened_dict


def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend songs from ``spotify_data`` that are close to a seed song.

    The first entry of ``song_list`` is used as the seed: its numeric features
    are scaled and compared, by cosine distance, with every scaled row of
    ``spotify_data``. The ``n_songs`` nearest rows are taken and any row whose
    lower-cased name equals one of the requested song names is then dropped,
    so fewer than ``n_songs`` results may come back.

    Args:
        song_list: List of mappings, each with a ``'name'`` entry.
        spotify_data: DataFrame holding the ``number_cols`` feature columns
            and the ``name``, ``year`` and ``artists`` columns.
        n_songs: Number of nearest neighbours to consider.

    Returns:
        A list of ``{'name', 'year', 'artists'}`` records with lower-cased
        names. Empty when ``song_list`` is empty or the seed song can be found
        neither in ``spotify_data`` nor online.
    """
    if not song_list:
        return []

    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    # NOTE: only the first requested song seeds the search; get_mean_vector
    # is available for averaging all of them but is not used here.
    seed_song = get_song_data(song_list[0], spotify_data)
    if seed_song is None:
        print('Warning: {} does not exist in Spotify or in database'.format(song_list[0]['name']))
        return []
    song_center = np.asarray(seed_song[number_cols].values, dtype=float)

    # The second element returned by get_scaler is used as the scaler;
    # presumably it is already fitted on number_cols -- confirm in get_scaler.
    scaler = get_scaler()[1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    # reshape copes with both a Series (15,) and a one-row DataFrame (1, 15)
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = np.argsort(distances[0])[:n_songs]

    # Copy before assigning so the column write neither touches spotify_data
    # nor triggers pandas' chained-assignment warning.
    rec_songs = spotify_data.iloc[index].copy()
    rec_songs['name'] = rec_songs['name'].str.lower()
    requested_names = {x.lower() for x in song_dict['name']}

    rec_songs = rec_songs[~rec_songs['name'].isin(requested_names)]
    return rec_songs[metadata_cols].to_dict(orient='records')