Spaces:

gsliwoski
/

Spotify_Artist_Recommender

Sleeping

File size: 10,192 Bytes

c0b0603

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import random
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.spatial.distance import cosine
import json
from tqdm import tqdm
import argparse
import sys
from datetime import datetime
pd.set_option('display.max_colwidth', None)

sp = None

MUSIC_FEATURES = ["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness", "liveness", "valence"]
SHEET_FEATURES = ["key", "mode", "tempo"]

def initialize_spotify_client(credentials_file, isfile=True):
    global sp
    if isfile:
        creds = json.load(open(credentials_file))
    else:
        creds = credentials_file
    SPOTIPY_CLIENT_ID = creds['SPOTIPY_CLIENT_ID']
    SPOTIPY_CLIENT_SECRET = creds['SPOTIPY_CLIENT_SECRET']
    SPOTIPY_REDIRECT_URI = creds['SPOTIPY_REDIRECT_URI']
    SCOPE = 'playlist-modify-public user-read-recently-played'
    sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=SPOTIPY_CLIENT_ID,
                                               client_secret=SPOTIPY_CLIENT_SECRET,
                                               redirect_uri=SPOTIPY_REDIRECT_URI,
                                               scope=SCOPE))

def find_artists_with_matching_genres(target_genres):
    matched_artists = []
    # Try searching for a perfect match of genres then randomly remove one genre at a time until get at least 100
    while len(target_genres) > 0 and len(matched_artists) < 10:
        print(f"Trying {target_genres}")
        query = " AND ".join([f"genre:\"{genre}\"" for genre in target_genres])
        artist_results = sp.search(q=query, type='artist', limit=10)
        matched_artists.extend([{'id':artist['id'], 'name':artist['name'], 'artist_url':artist['external_urls']['spotify']} for artist in artist_results['artists']['items'] if artist not in matched_artists])
        random_item = random.choice(target_genres)
        target_genres.remove(random_item)
    return matched_artists

def get_top_track_features(artist_id):
    try:
        top_tracks = sp.artist_top_tracks(artist_id, country='US')['tracks']
        if top_tracks:
            top_track_id = top_tracks[0]['id']
            top_track_name = top_tracks[0]['name']
            features = sp.audio_features([top_track_id])[0]
            features['track_id'] = top_track_id
            features['track_name'] = top_track_name
            if not features:
                print("Audio features could not be retrieved.")
                features = {}
        else:
            print("No top tracks found for this artist.")
            features = {}
    except Exception as e:
        print(f"Error retrieving top track for artist: {e}")
        features = {}
    return features

def average_cosine_distance(A, B):
    average_distances = []
    for b in B:
        distances = [cosine(b, a) for a in A]
        average_distances.append(np.mean(distances))
    return np.array(average_distances)

def get_recently_played(limit=50, selected_artists = ""):
    if len(selected_artists) == 0:
        print("Getting most recently played artists")
        results = sp.current_user_recently_played(limit=limit)
    else:
        selected_artists = [x.strip() for x in selected_artists.split(",")]
        print(f"Using supplied list of artists (first 10 artists only)")
        results = {'items':[]}
        for artist in selected_artists[:10]:
            artist_objects = sp.search(q=f"artist: {artist}", type='artist')
            try:
                artist_objects = sorted([x for x in artist_objects['artists']['items'] if x['name'].lower() == artist.lower()],
                                        key= lambda x: x['popularity'], reverse=True)
            except KeyError:
                artist_objects = []
            if len(artist_objects) == 0:
                print(f"{artist} not found")
                continue
            elif len(artist_objects) > 1:
                print(f"Multiple artist_id found for {artist}, selecting the most popular artist_id in the list.")
            artist_id = artist_objects[0]['id']
            try:
                artist_url = artist_objects[0]['external_urls']['spotify']
            except KeyError:
                artist_url = ""
            results['items'].append({
                'track': {
                    'artists': [{'id': artist_id,
                                  'name': artist,
                                  'external_urls': {'spotify': artist_url}}]}})
    tracks = []
    for idx, item in enumerate(results['items']):
        try:
            track = item['track']
            artist_id = track['artists'][0]['id']
        except KeyError as e:
            print(f"Failed index {idx}:")
            print(e)
            continue
        if artist_id in [x['artist_id'] for x in tracks]:
            continue
        elif len(tracks) >= 10:
            break
        features = get_top_track_features(artist_id)
        if len(features.keys()) == 0:
            print(f"No features found for artist: {artist_id}")
            continue
        try:
            features['name'] = track['artists'][0]['name']
        except KeyError as e:
            print(f"Failed to get artists name for {artist_id}:")
            print(e)
            features['name'] = np.nan
        features['artist_id'] = artist_id
        try:
            features['artist_url'] = track['artists'][0]['external_urls']['spotify']
        except KeyError as e:
            print(f"Failed to get URL for artist {artist_id}:")
            print(e)
            features['artist_url'] = np.nan
        tracks.append(features)
        print(f"{idx+1}: {features['name']} - {features['track_name']}")
    return tracks

def get_matching_artists(tracks):
    artist_features = []
    for idx, item in tqdm(enumerate(tracks)):
        artist_id = item['artist_id']
        artist = sp.artist(artist_id)
        genres = artist.get('genres',[])
        if len(genres)<1:
            continue
        matched_artists = find_artists_with_matching_genres([x for x in genres])
        if len(matched_artists) <= 1: #If it only managed to match itself
            continue
        for artist in matched_artists:
            artist_id = artist.get('id',"")
            if artist_id == "" or artist_id in [x['artist_id'] for x in tracks+artist_features]:
                continue
            #print(artist.get('name', 'UNKNOWN ARTIST'))
            features = get_top_track_features(artist_id)
            if len(features.keys())==0:
                continue
            features['artist_id'] = artist_id
            features['name'] = artist['name']
            features['artist_url'] = artist['artist_url']
            artist_features.append(features)
    return artist_features

def get_closest_artists(df, features):
    all_feats = StandardScaler().fit_transform(df[features])
    all_feats_ref = all_feats[df[df.source=="reference"].index]
    all_feats_new = all_feats[df[df.source=="artist"].index]
    result_distances = average_cosine_distance(all_feats_ref, all_feats_new)
    results = pd.DataFrame({"artist": df[df.source == "artist"]['name'].values, "artist_id": df[df.source == "artist"]['artist_id'].values,
                            "distance": result_distances,
                            'artist_url': df[df.source == "artist"]['artist_url']})
    return results.sort_values("distance").head(10)

def generate_playlist(closest_artists, artists = []):
    playlist_name = f"Recommended_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    playlist_desc = f"Top 10 recommended artists based on supplied artists: {', '.join(artists)}" if len(artists) > 0 else "Top 10 recommended artists from recently played."
    user_id = sp.current_user()['id']
    new_playlist = sp.user_playlist_create(user_id, playlist_name, description=playlist_desc)
    playlist_id = new_playlist['id']
    for i,r in closest_artists.iterrows():
        print(r)
        artist = r.artist
        artist_id = r.artist_id
        top_tracks = sp.artist_top_tracks(artist_id, country='US')['tracks']
        if len(top_tracks) == 0:
            continue
        else:
            top_track_uri = top_tracks[0]['uri']
        sp.playlist_add_items(playlist_id, [top_track_uri])

    if "name" in new_playlist and "external_urls" in new_playlist:
        print("Created playlist:", new_playlist['name'], "with URL:", new_playlist['external_urls']['spotify'])
        return True
    else:
        print("failed to create new playlist")
        return False

def main():
    parser = argparse.ArgumentParser(description="Spotify artist recommender. Requires a JSON with spotify credentials "
                                                 "(see credentials.json.example). Can also take a comma separated list "
                                                 "of artists instead of looking up last played.")
    parser.add_argument('--creds', type=str, help='Path to credentials json file', required=True)
    parser.add_argument('--artists', type=str, help='Comma separated list of artists', default="")
    parser.add_argument('--playlist', action='store_true', help='Create a Spotify playlist if set ("Recommended_timstamp")')
    args = parser.parse_args()

    print("Initializing Spotify")
    initialize_spotify_client(args.creds)
    try:
        _ = sp.current_user()
    except:
        print("Failed to initialize Spotify, are credentials correct?")
        sys.exit()
    tracks = get_recently_played(selected_artists=args.artists)
    print("Getting reference features")
    reference_df = pd.DataFrame.from_records(tracks)
    print("Getting matching artist features")
    artist_features = get_matching_artists(tracks)
    artist_df = pd.DataFrame(artist_features)
    reference_df['source'] = 'reference'
    artist_df['source'] = 'artist'
    df = pd.concat([reference_df, artist_df],ignore_index=True)
    closest_artists = get_closest_artists(df, MUSIC_FEATURES+SHEET_FEATURES)
    closest_artists.to_csv("closest_artists.csv", index=False)
    if args.playlist:
        generate_playlist(closest_artists, args.artists)

if __name__ == "__main__":
    main()