import gradio as gr
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation  # Remove once model is saved
import scipy.spatial.distance  # needed for scipy.spatial.distance.cdist below
import ast


################## PREPARATION ##################
files = [
    '112023_df_tracks_metadata.csv',
    '112023_df_tracks_metadata2.csv',
    '122023_df_tracks_metadata.csv',
    '012024_df_tracks_metadata.csv',
    '022024_df_tracks_metadata.csv',
    '032024_df_tracks_metadata.csv',
    '042024_df_tracks_metadata.csv',
    'nonopm.csv',
    'opm.csv'
]
dataframes = []

# read each CSV file into a pandas DataFrame
for file in files:
  dataframes.append(pd.read_csv(file))

# merge all the dataframes into one and drop duplicate tracks
data = pd.concat(dataframes)
data.drop_duplicates(subset='track.id', inplace=True)

cols = ['track.id', 'track.name', 'track.artists', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
data_lesscol = data[cols].copy()  # copy() so new columns can be added below without SettingWithCopyWarning

################## ADDING HAS FILIPINO FEATURE ##################


setofartists = set()  # set of all artists in the dataset
artistlinks = {}  # links each artist to their respective artist page; helps with manual sorting
maxartists = 0  # maximum number of artists per track

# create a list of all the artist names in the dataset
artistslist = data_lesscol['track.artists'].to_list()
for i, x in enumerate(artistslist):
  dicts = ast.literal_eval(x)
  maxartists = max(maxartists, len(dicts))
  for y in dicts:
    setofartists.add(y['name'])
    artistlinks[y['name']] = y['external_urls']['spotify']

def returnArtistsList(row):
  '''
  Returns a single string of artist names separated by "| ",
  e.g. a track credited to artists A and B becomes "A| B".
  '''
  artistslist = row['track.artists']
  artists = []
  for y in ast.literal_eval(artistslist):
    artists.append(y['name'])
  return '| '.join(artists)

# create an artists column, which is just the artists' names separated by "| "
data_lesscol['artists'] = data_lesscol.apply(returnArtistsList, axis=1)

# get the set of OPM artists from the opm playlists dataset
opm = dataframes[-1]
OPMartists = set()
for artistobjectarr in opm['track.artists'].to_list():
  artistsarr = ast.literal_eval(artistobjectarr)
  for artistobject in artistsarr:
    OPMartists.add(artistobject['name'])

# some of the artists pulled from the OPM playlists are not actually Filipino, so remove them
toRemove = ['Kina Grannis']
for name in toRemove:
  OPMartists.discard(name)  # discard() does not raise if the name is absent

################## ALL FILIPINO LIST ##################

# manually retrieved list of OPM artists
manualartists = [
  "Orange & Lemons",
  "SunKissed Lola",
  "YP",
  "E.J",
  "Dilaw",
  "Eros Tongco",
  "Teys",
  "Lola Amour",
  "slimedemidemislime",
  "SwKeith",
  "juan karlos",
  "Jed Baruelo",
  "Jthekidd",
  "P-Lo",
  "rhodessa",
  "Hev Abi",
  "Martti Franca",
  "TONEEJAY",
  "Denise Julia",
  "SB19",
  "Al James",
  "HELLMERRY",
  "Gabito Ballesteros",
  "Calein",
  "syd hartha"
  "Nateman",
  "Tom Odell",
  "Realest Cram",
  "Janine",
  "Madman Stan",
  "Sugarcane",
  "DEMI",
  "gins&melodies",
  "Cean Jr.",
  "Arthur Miguel",
  "Cup of Joe",
  "CK YG",
  "Janine Berdin",
  "Adie",
  "Zack Tabudlo",
  "Maki",
  "NOBITA",
  "Arthur Nery",
  "Mitski",
  "Up Dharma Down",
  "Ben&Ben",
  "BINI",
  "Eraserheads",
  "Silent Sanctuary",
  "Moira Dela Torre",
  "Rivermaya",
  "Parokya Ni Edgar",
  "The Itchyworms",
  "Spongecola",
  "December Avenue",
  "Alfred Jasper",
  "Gico",
  "Kitchie Nadal",
  "PRETTYMF9INE",
  "CA$HMAN",
  "Sansette",
  "Just Hush",
  "Tera",
  "Kid Bando",
  "Jason Dhakal",
]

# add the manually retrieved artists to the set built from the playlists
OPMartists.update(manualartists)

def checkIfOPM(row):
  '''
  Returns 1 if any of the listed artists in the "artists" feature is a Filipino artist,
  otherwise returns 0.
  '''
  artists = row['artists'].split('| ')
  if any(artist in OPMartists for artist in artists):
    return 1
  else:
    return 0

data_lesscol['hasFilipinoArtist'] = data_lesscol.apply(checkIfOPM, axis=1)
data_lesscol.to_csv('RESULT.csv', index=False)

################## CLUSTERING ##################

# only use specific cols for clustering
clustercols = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
X = data_lesscol[clustercols]

# application of clustering algorithm
ap_cluster = AffinityPropagation(random_state=5)
results = ap_cluster.fit(X)
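
# If the fitted model were to be saved (as the AffinityPropagation import comment above
# suggests), one option is joblib, which scikit-learn documents for estimator persistence.
# This is only a sketch and is not wired into the rest of the app; the filename is arbitrary.
# import joblib
# joblib.dump(ap_cluster, 'ap_cluster.joblib')
# ap_cluster = joblib.load('ap_cluster.joblib')  # later runs could then skip fit()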

################## SAMPLE ##################

# Make a song sample to test - Never Gonna Give You Up by Rick Astley (Dance Pop)
sample_track = pd.DataFrame({
    'danceability': [0.73],
    'energy': [0.94],
    'key': [8], # Ab major; Spotify encodes key as a pitch class 0-11 with C = 0, so Ab is 8
    'loudness': [-11.82],
    'speechiness': [0.04],
    'acousticness': [0.14],
    'instrumentalness': [0.01],
    'liveness': [0.15],
    'valence': [0.92],
    'tempo': [113.0] # tempo is in BPM
}, columns=clustercols)
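
# Optional sanity check (sketch, not used by the app below): the fitted model can
# assign this sample to one of its learned clusters.
# print(ap_cluster.predict(sample_track)[0])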


################## PREDICTION ##################

current_closest_track = sample_track
current_track = sample_track
input_flag = False

def find_closest_song(track_name):
    global current_closest_track, current_track, data_lesscol, clustercols, ap_cluster, X, input_flag
    
    if track_name is None: 
      input_flag = False
      return 'No songs found. Check if input matches the suggested songs. Input is case-sensitive.'
    
    artist = ''
    if ' by ' in track_name:
        # dropdown entries have the form "<track name> by <artists>"
        track_name, artist = track_name.split(' by ', 1)

    if not track_name or track_name not in data_lesscol['track.name'].tolist():
        input_flag = False
        return 'No songs found. Check if input matches the suggested songs. Input is case-sensitive.'

    input_flag = True
    track = data_lesscol[data_lesscol['track.name'] == track_name]
    print(f"TRACK: {track}")
    if len(track) > 1:
        # disambiguate duplicate titles by artist when possible
        if artist:
            by_artist = track[track['artists'] == artist]
            track = by_artist.head(1) if len(by_artist) else track.head(1)
        else:
            track = track.head(1)
    print(f"TRACK2: {track}")

    track = track[clustercols]
    
    # Predict the cluster for the new music instance
    cluster_label = ap_cluster.predict(track)[0]

    # Predict clusters for the whole original dataset to recover each track's cluster assignment
    # (equivalently, ap_cluster.labels_ from fit() could be reused here)
    whole_cluster = ap_cluster.predict(X)

    # Filter the original dataset to find songs in the same cluster
    same_cluster_tracks = data_lesscol[whole_cluster == cluster_label]

    # Take all the Filipino tracks under that cluster, excluding the query track and its artist
    filipino_tracks = same_cluster_tracks[(same_cluster_tracks['hasFilipinoArtist'] == 1) & (same_cluster_tracks['track.name'] != track_name) & (same_cluster_tracks['artists'] != artist)]

    print("SAME CLUSTER TRACKS")
    print(same_cluster_tracks['track.name'])

    print("\n\nFILIPINO TRACKS")
    print(filipino_tracks['track.name'])

    if filipino_tracks.empty:
        input_flag = False
        return 'No Filipino tracks were found in the same cluster as this song.'

    # Find the closest song in the same cluster (Euclidean distance over the audio features)
    distances = scipy.spatial.distance.cdist(filipino_tracks[clustercols], track)
    closest_song_index = np.argmin(distances)
    closest_song = filipino_tracks.iloc[closest_song_index]
    
    print("\nDISTANCES")
    print(distances)

    # Set current variables
    current_closest_track = closest_song
    current_track = data_lesscol[data_lesscol['track.name'] == track_name]
    
    return f"{closest_song['track.name']} by {closest_song['artists']}"



################## MATCH ##################
# Builds a DataFrame of per-feature match percentages between the query track and its closest OPM match
def match():
    global current_closest_track, current_track, X, input_flag
    if not input_flag:
        return pd.DataFrame({'Features':[],'Match Percentage':[]})
    
    print(f"CLOSEST TRACK: {current_closest_track}")
    print(f"CURRENT TRACK: {current_track}")
    
    # feature ranges used to normalize the per-feature differences: these audio features are
    # already on a 0-1 scale, loudness spans roughly -60 to 0 dB, key has 12 pitch classes,
    # and tempo uses the range observed in the dataset
    range_danceability = range_energy = range_speechiness = range_acousticness = range_instrumentalness = range_liveness = range_valence = 1
    range_loudness = 60
    range_key = 12
    range_tempo = max(X['tempo']) - min(X['tempo'])
    
    # each per-feature match is 100% minus the normalized absolute difference
    match_danceability = 100 - abs((current_closest_track["danceability"] - current_track["danceability"])/range_danceability)*100
    match_energy = 100 - abs((current_closest_track["energy"] - current_track["energy"])/range_energy)*100
    match_key = 100 - abs((current_closest_track["key"] - current_track["key"])/range_key)*100
    match_loudness = 100 - abs((current_closest_track["loudness"] - current_track["loudness"])/range_loudness)*100
    match_speechiness = 100 - abs((current_closest_track["speechiness"] - current_track["speechiness"])/range_speechiness)*100
    match_acousticness = 100 - abs((current_closest_track["acousticness"] - current_track["acousticness"])/range_acousticness)*100
    match_instrumentalness = 100 - abs((current_closest_track["instrumentalness"] - current_track["instrumentalness"])/range_instrumentalness)*100
    match_liveness = 100 - abs((current_closest_track["liveness"] - current_track["liveness"])/range_liveness)*100
    match_valence = 100 - abs((current_closest_track["valence"] - current_track["valence"])/range_valence)*100
    match_tempo = 100 - abs((current_closest_track["tempo"] - current_track["tempo"])/range_tempo)*100

    # overall match is the simple average of the ten per-feature matches
    overall_match = match_danceability + match_energy + match_key + match_loudness + match_speechiness + match_acousticness + match_instrumentalness + match_liveness + match_valence + match_tempo
    overall_match = overall_match/10

    return pd.DataFrame({
        'Features' : ['Overall Match', 'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo'],
        'Match Percentage': [overall_match.values[0], match_danceability.values[0], match_energy.values[0], match_key.values[0], match_loudness.values[0], match_speechiness.values[0], match_acousticness.values[0], match_instrumentalness.values[0], match_liveness.values[0], match_valence.values[0], match_tempo.values[0]]
    })




################## WEBSITE ##################

# Build "track.name by artists" labels for every track in data_lesscol
all_tracks_and_artist = []
for i in data_lesscol[['track.name', 'artists']].values:
    all_tracks_and_artist.append(i[0] + ' by ' + i[1])
    
# Remove duplicates
all_tracks_and_artist = list(set(all_tracks_and_artist))

with gr.Blocks(theme=gr.themes.Base(), css=".gradio-container {background: url('https://undsgn.com/wp-content/uploads/2018/04/ltotbngnzzu-uai-1600x900.jpg'); opacity: 1}") as demo:
    with gr.Group():
        gr.Markdown(
        """
        # OPM Song Recommender
        #### Music is an integral part of human life, yet local artists often struggle financially in a saturated market, and listeners may find it hard to support local artists or even to discover local tracks in the first place. How can an interested listener find Filipino music that matches their taste?
        #### This application takes a track (OPM or non-OPM) and finds OPM music with similar audio features. Try it out!
        """
    )
    
    input_text = gr.Dropdown(label="Search Track Name (Case-sensitive)", choices=all_tracks_and_artist)

    button = gr.Button(value="Search Similar OPM")
    result = gr.Textbox(label="Result", value="Result will be shown here.")
    
    button.click(find_closest_song, inputs=input_text, outputs=result)

    match_percentage = gr.DataFrame(pd.DataFrame({
       'Features':[],
       'Match Percentage':[]}))

    result.change(match, outputs=match_percentage)
    
demo.launch(share=True)