Spaces:
Sleeping
Sleeping
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
from sklearn.cluster import AffinityPropagation # Remove once model is saved | |
import scipy | |
import ast | |
################## PREPARATION ##################
# CSV exports that make up the dataset.
# NOTE: 'opm.csv' has to stay last - the OPM artist set is later built from dataframes[-1].
files = [
    '112023_df_tracks_metadata.csv',
    '112023_df_tracks_metadata2.csv',
    '122023_df_tracks_metadata.csv',
    '012024_df_tracks_metadata.csv',
    '022024_df_tracks_metadata.csv',
    '032024_df_tracks_metadata.csv',
    '042024_df_tracks_metadata.csv',
    'nonopm.csv',
    'opm.csv',
]

# load every CSV file into its own pandas dataframe
dataframes = [pd.read_csv(file) for file in files]

# merge all the dataframes to one dataframe, keeping a single row per track id
data = pd.concat(dataframes)
data.drop_duplicates(subset='track.id', inplace=True)

cols = [
    'track.id', 'track.name', 'track.artists',
    'danceability', 'energy', 'key', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]
# Explicit copy: new columns ('artists', 'hasFilipinoArtist') are assigned to
# data_lesscol later, and assigning into a plain slice of `data` triggers
# pandas' SettingWithCopyWarning.
data_lesscol = data[cols].copy()
################## ADDING HAS FILIPINO FEATURE ##################
setofartists = set()  # every artist name that appears in the dataset
artistlinks = {}  # artist name -> link to their Spotify artist page. helps with manual sorting
maxartists = 0  # largest number of artists credited on a single track

# each 'track.artists' cell is the string form of a list of artist dicts
artistslist = data_lesscol['track.artists'].to_list()
for raw_artists in artistslist:
    credited = ast.literal_eval(raw_artists)
    if len(credited) > maxartists:
        maxartists = len(credited)
    for artist_entry in credited:
        artist_name = artist_entry['name']
        setofartists.add(artist_name)
        artistlinks[artist_name] = artist_entry['external_urls']['spotify']
def returnArtistsList(row):
    '''
    Build the display string of a track's artists.

    row: mapping/row whose 'track.artists' entry is the string form of a list
         of artist dicts, each having a 'name' key
    returns a string of the artists' names, separated by "| "
    '''
    parsed = ast.literal_eval(row['track.artists'])
    return '| '.join(entry['name'] for entry in parsed)
# create an artists column, which is just the artists' names separated by "| "
data_lesscol['artists'] = data_lesscol.apply(returnArtistsList, axis=1)

# collect every artist credited in the OPM playlists dataset (the last file loaded)
opm = dataframes[-1]
OPMartists = set()
for artistobjectarr in opm['track.artists'].to_list():
    artistsarr = ast.literal_eval(artistobjectarr)
    for artistobject in artistsarr:
        OPMartists.add(artistobject['name'])

# some of these artists are not Filipino, but we can just remove them
toRemove = ['Kina Grannis']
for name in toRemove:
    # discard() rather than remove(): a blocklisted name that is missing from
    # the current playlist export must not abort start-up with a KeyError
    OPMartists.discard(name)
################## ALL FILIPINO LIST ##################
# manually retrieved list of OPM artists
# NOTE(review): a few entries (e.g. "Tom Odell", "Mitski", "Gabito Ballesteros")
# do not look like OPM artists - confirm before relying on the flag built from this list.
manualartists = [
    "Orange & Lemons",
    "SunKissed Lola",
    "YP",
    "E.J",
    "Dilaw",
    "Eros Tongco",
    "Teys",
    "Lola Amour",
    "slimedemidemislime",
    "SwKeith",
    "juan karlos",
    "Jed Baruelo",
    "Jthekidd",
    "P-Lo",
    "rhodessa",
    "Hev Abi",
    "Martti Franca",
    "TONEEJAY",
    "Denise Julia",
    "SB19",
    "Al James",
    "HELLMERRY",
    "Gabito Ballesteros",
    "Calein",
    # the comma after this entry was missing, which silently merged it with
    # the next one into the single string "syd harthaNateman"
    "syd hartha",
    "Nateman",
    "Tom Odell",
    "Realest Cram",
    "Janine",
    "Madman Stan",
    "Sugarcane",
    "DEMI",
    "gins&melodies",
    "Cean Jr.",
    "Arthur Miguel",
    "Cup of Joe",
    "CK YG",
    "Janine Berdin",
    "Adie",
    "Zack Tabudlo",
    "Maki",
    "NOBITA",
    "Arthur Nery",
    "Mitski",
    "Up Dharma Down",
    "Ben&Ben",
    "BINI",
    "Eraserheads",
    "Silent Sanctuary",
    "Moira Dela Torre",
    "Rivermaya",
    "Parokya Ni Edgar",
    "The Itchyworms",
    "Spongecola",
    "December Avenue",
    "Alfred Jasper",
    "Gico",
    "Kitchie Nadal",
    "PRETTYMF9INE",
    "CA$HMAN",
    "Sansette",
    "Just Hush",
    "Tera",
    "Kid Bando",
    "Jason Dhakal",
]
# fold the manually curated names into the set built from the playlists
OPMartists.update(manualartists)
def checkIfOPM(row):
    '''
    Flag whether a track credits at least one Filipino artist.

    row: mapping/row whose 'artists' entry is the "| "-separated artist names
    returns 1 if any of those names is in OPMartists, otherwise 0
    '''
    for artist_name in row['artists'].split('| '):
        if artist_name in OPMartists:
            return 1
    return 0
# flag each track via checkIfOPM, then write the enriched table to disk
data_lesscol['hasFilipinoArtist'] = data_lesscol.apply(checkIfOPM, axis=1)
data_lesscol.to_csv('RESULT.csv', index=False)

################## PREDICTION ##################
# the audio-feature columns are the only ones used for clustering
clustercols = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
X = data_lesscol[clustercols]

# fit the clustering algorithm on the whole dataset
ap_cluster = AffinityPropagation(random_state=5)
results = ap_cluster.fit(X)
################## SAMPLE ##################
# A sample song to test with - Never Gonna Give You Up by Rick Astley (Dance Pop)
sample_features = {
    'danceability': 0.73,
    'energy': 0.94,
    'key': 5,  # arbitrary placeholder; the song is in Ab major but its numeric key was not looked up
    'loudness': -11.82,
    'speechiness': 0.04,
    'acousticness': 0.14,
    'instrumentalness': 0.01,
    'liveness': 0.15,
    'valence': 0.92,
    'tempo': 113.0,  # tempo is in BPM
}
# single-row frame whose columns follow the clustering column order
sample_track = pd.DataFrame(
    {feature: [value] for feature, value in sample_features.items()},
    columns=clustercols,
)

################## PREDICTION ##################
# module-level state written by find_closest_song() and read by match()
current_closest_track = sample_track
current_track = sample_track
input_flag = False  # match() returns an empty table while this is False
# TODO: Modify to allow OPM tracks only
def find_closest_song(track_name):
    '''
    Recommend the Filipino track that is closest to the given track.

    track_name: "[TRACK NAME]" or "[TRACK NAME] by [ARTIST]" (case-sensitive),
        where [ARTIST] is the track's "| "-joined artists string
    returns "[TRACK NAME] by [ARTISTS]" of the closest track with a Filipino
        artist in the same cluster, or an explanatory message when the input
        track is unknown or the cluster holds no other Filipino track

    Side effects: on success stores the selected input row in current_track,
    the recommended row in current_closest_track and sets input_flag to True;
    on any failure input_flag is left False so match() shows an empty table.
    '''
    global current_closest_track, current_track, input_flag
    # imported here so the function does not depend on `import scipy`
    # having loaded the scipy.spatial subpackage
    from scipy.spatial.distance import cdist

    not_found = 'No songs found. Check if input matches the suggested songs. Input is case-sensitive.'
    input_flag = False  # only flipped once a recommendation has been produced
    if not track_name:
        return not_found

    names = data_lesscol['track.name']
    artist: str = ''
    # Treat the text as "title by artist" only when it is not itself a known
    # title, and split on the LAST ' by ' so titles containing ' by ' survive.
    if ' by ' in track_name and not (names == track_name).any():
        track_name, artist = track_name.rsplit(' by ', 1)

    candidates = data_lesscol[names == track_name]
    if candidates.empty:
        return not_found
    print(f"TRACK: {candidates}")

    # Several tracks can share a title: prefer the one by the requested artist,
    # falling back to the first title match when that artist matches no row.
    if artist:
        by_artist = candidates[candidates['artists'] == artist]
        if not by_artist.empty:
            candidates = by_artist
    selected = candidates.head(1)
    print(f"TRACK2: {selected}")
    track = selected[clustercols]

    # Predict the cluster for the selected track
    cluster_label = ap_cluster.predict(track)[0]
    # Exploiting predict with original dataset to see tracks in their respective clusters
    whole_cluster = ap_cluster.predict(X)
    # Filter the original dataset to find songs in the same cluster
    same_cluster_tracks = data_lesscol[whole_cluster == cluster_label]
    # Take all the filipino tracks under that cluster, excluding the input
    # title and anything credited to exactly the requested artist string
    filipino_tracks = same_cluster_tracks[
        (same_cluster_tracks['hasFilipinoArtist'] == 1)
        & (same_cluster_tracks['track.name'] != track_name)
        & (same_cluster_tracks['artists'] != artist)
    ]
    print("SAME CLUSTER TRACKS")
    print(same_cluster_tracks['track.name'])
    print("\n\nFILIPINO TRACKS")
    print(filipino_tracks['track.name'])

    # np.argmin raises on an empty array, so bail out when nothing qualifies
    if filipino_tracks.empty:
        return 'No similar OPM songs found for this track.'

    # Find the closest song in the same cluster
    distances = cdist(filipino_tracks[clustercols], track)
    closest_song = filipino_tracks.iloc[np.argmin(distances)]
    print("\nDISTANCES")
    print(distances)

    # Set current variables; current_track is the row actually selected above
    # (one-row DataFrame) so match() compares against the right track
    current_closest_track = closest_song
    current_track = selected
    input_flag = True
    return f"{closest_song['track.name']} by {closest_song['artists']}"
################## MATCH ##################
# Compares the globals current_closest_track and current_track feature by feature
def match():
    '''
    Build the table of match percentages between the searched track and the
    recommended track.

    Each feature scores 100 - |difference| / span * 100, where span is 1 for
    most features, 12 for key, 60 for loudness and the observed tempo range
    of X for tempo (presumably the spans of the feature scales - confirm).
    The overall match is the plain average of the ten feature scores.

    returns a DataFrame with 'Features' and 'Match Percentage' columns; it is
    empty while input_flag is False
    '''
    global current_closest_track, current_track, X, input_flag
    if not input_flag:
        return pd.DataFrame({'Features':[],'Match Percentage':[]})
    print(f"CLOSEST TRACK: {current_closest_track}")
    print(f"CURRENT TRACK: {current_track}")

    # feature -> width of its value range, in the order the scores are reported
    spans = {
        'danceability': 1,
        'energy': 1,
        'key': 12,
        'loudness': 60,
        'speechiness': 1,
        'acousticness': 1,
        'instrumentalness': 1,
        'liveness': 1,
        'valence': 1,
        'tempo': max(X['tempo']) - min(X['tempo']),
    }

    scores = {}
    running_total = None
    for feature, span in spans.items():
        gap = current_closest_track[feature] - current_track[feature]
        score = 100 - abs(gap / span) * 100
        scores[feature] = score
        running_total = score if running_total is None else running_total + score
    overall_match = running_total / 10

    labels = ['Overall Match']
    percentages = [overall_match.values[0]]
    for feature, score in scores.items():
        labels.append(feature.capitalize())
        percentages.append(score.values[0])
    return pd.DataFrame({
        'Features' : labels,
        'Match Percentage': percentages
    })
################## WEBSITE ##################
# "[track name] by [artists]" label for every row of data_lesscol.
# Duplicates are removed through a set; sorting makes the order identical on
# every run (plain set order varies between runs), so the suggestion list
# and the "first record" picked from it are deterministic.
all_tracks_and_artist = sorted({
    name + ' by ' + artists
    for name, artists in zip(data_lesscol['track.name'], data_lesscol['artists'])
})
def filter_and_select_track(filter_text: str):
    '''
    List the known tracks whose label contains the search text.

    filter_text: text typed by the user; matching is case-insensitive
    returns the matching entries of all_tracks_and_artist, one per line, or
        '' when filter_text is empty
    '''
    if not filter_text:
        return ''
    needle = filter_text.lower()
    return '\n'.join(label for label in all_tracks_and_artist if needle in label.lower())
def change_input(filter_text: str):
    '''
    Pick the first known track whose label contains the search text.

    filter_text: text typed by the user; matching is case-insensitive
    returns the first matching entry of all_tracks_and_artist, '' when
        filter_text is empty, or filter_text unchanged when nothing matches
    '''
    if not filter_text:
        return ''
    needle = filter_text.lower()
    # next() with a default replaces indexing [0] on a possibly empty list,
    # which raised IndexError whenever the search text matched no track
    return next(
        (label for label in all_tracks_and_artist if needle in label.lower()),
        filter_text,
    )
# Page layout: search box -> live suggestions -> search button -> result -> match table
with gr.Blocks(theme=gr.themes.Glass(), css=".gradio-container {background: url('https://undsgn.com/wp-content/uploads/2018/04/ltotbngnzzu-uai-1600x900.jpg'); opacity:10}") as demo:
    gr.Markdown(
"""
# OPM Song Recommender
#### Music is an integral part of human life. However, local artists tend to struggle financially due to the saturated market. Similarly, listeners may find it difficult to support their local artists, or even find local tracks in the first place. How is an interested listener going to find Filipino music that is similar to their tastes?
#### This application takes in a track (both OPM and non-OPM) and seeks to find OPM music with similar features. Try it out!
"""
    )
    input_text = gr.Textbox(label="Search Track Name (Case-sensitive) | Format: [TRACK NAME] or [TRACK NAME] by [ARTIST]", placeholder="Search for a song")
    suggestion = gr.Textbox('', label="Suggestions Available (Double-click to copy first record in input)", placeholder="Available song in dataset will be shown here.")
    # refresh the suggestions on every keystroke; selecting inside the
    # suggestion box copies the first suggestion into the search box
    input_text.change(filter_and_select_track, inputs=input_text, outputs=suggestion)
    suggestion.select(change_input, inputs=input_text, outputs=input_text)
    button = gr.Button(value="Search Similar OPM")
    result = gr.Textbox(label="Result", value="Result will be shown here.")
    button.click(find_closest_song, inputs=input_text, outputs=result)
    match_percentage = gr.DataFrame(pd.DataFrame({
        'Features':[],
        'Match Percentage':[]}))
    # recompute the match table whenever the result text changes
    result.change(match, outputs=match_percentage)
    # Display all tracks
    gr.Markdown (""" ## All Tracks (OPM and Non-OPM) """)
    # One Markdown component holding every track (one heading per line)
    # instead of one component per track, which created as many components
    # as there are tracks in the dataset.
    gr.Markdown('\n'.join(f""" ##### {track} """ for track in all_tracks_and_artist))
demo.launch(share=True)