ccolas committed on
Commit
337e77a
1 Parent(s): 903a62f

Upload utils.py

Browse files
Files changed (1) hide show
  1. utils.py +199 -0
utils.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import json
3
+ import os
4
+
5
# Field names accepted in the `which_info` argument of
# get_track_info_from_playlist_uri.
valid_track_infos = {
    'uri',
    'name',
    'artist_name',
    'popularity',
    'artist_genres',
    'album',
    'artist_popularity',
    'audio_features',
    'audio_analysis',
}
7
+
8
def get_all_tracks_from_playlist_uri(sp, playlist_uri):
    """Return every item of a playlist, paging through it 100 items at a time.

    Args:
        sp: client exposing ``playlist_tracks(playlist_uri, offset=, limit=)``
            whose result has an ``"items"`` list (presumably a spotipy
            client -- confirm against the caller).
        playlist_uri: identifier of the playlist, passed through to ``sp``.

    Returns:
        A list with the items of all pages, in the order they were returned.
    """
    page_size = 100
    collected = []
    offset = 0
    while True:
        page = sp.playlist_tracks(playlist_uri, offset=offset, limit=page_size)["items"]
        collected += page
        # A page shorter than the requested size is the last one.
        if len(page) < page_size:
            return collected
        offset += page_size
21
+
22
def update_data_with_audio_features(sp, uris, data):
    """Fetch audio features for ``uris`` and store them in ``data``.

    For each uri, the corresponding audio-features object is written to
    ``data[uri]['track']['audio_features']`` (``data`` is modified in place).

    Args:
        sp: client exposing ``audio_features(list_of_uris)`` returning one
            entry per requested uri, in the same order.
        uris: at most 100 track identifiers, all of them keys of ``data``.
        data: dict mapping track identifier to a dict with a ``'track'`` dict.

    Returns:
        ``(data, [])``: the updated dict and an empty list, so that callers can
        reset their buffer of pending uris in the same assignment.

    Raises:
        AssertionError: if more than 100 uris are given.
    """
    assert len(uris) <= 100
    # Nothing pending: skip the request instead of querying with no ids.
    if not uris:
        return data, []
    tracks_audio_features = sp.audio_features(uris)
    for uri, features in zip(uris, tracks_audio_features):
        data[uri]['track']['audio_features'] = features
    return data, []
28
+
29
def check_all_track_has_audio_features(data):
    """Assert that every entry of ``data`` has an ``'audio_features'`` key.

    Args:
        data: dict whose values are dicts containing a ``'track'`` dict.

    Raises:
        AssertionError: at the first entry whose ``'track'`` dict lacks the
            ``'audio_features'`` key.
    """
    for entry in data.values():
        assert 'audio_features' in entry['track']
32
+
33
def get_all_tracks_from_playlists(sp, playlist_uris, verbose=False):
    """Collect the distinct tracks of several playlists, with genres and audio features.

    Tracks are keyed by the last ``:``-separated field of their URI. The JSON
    cache ``./cache_track_features_tmp.json`` is loaded first (and created empty
    if missing); tracks already present in it are not fetched again. Each new
    track gets the genres of its first artist stored under
    ``['track']['genres']`` and its audio features under
    ``['track']['audio_features']``. The cache is rewritten every 1000 new
    tracks and once more at the end.

    Args:
        sp: client exposing ``playlist_tracks``, ``artist`` and
            ``audio_features`` (presumably a spotipy client -- confirm).
        playlist_uris: sequence of playlist identifiers.
        verbose: when true, print progress messages.

    Returns:
        The dict mapping track identifier to playlist item, including the
        entries that were loaded from the cache.

    Raises:
        AssertionError: if an entry is found without audio features when the
            cache is about to be written.
    """
    if verbose: print(f'Extracting all tracks from {len(playlist_uris)} playlists.')
    # load data
    cache_path = './cache_track_features_tmp.json'
    if not os.path.exists(cache_path):
        with open(cache_path, 'w') as f:
            json.dump(dict(), f)
    with open(cache_path, 'r') as f:
        data = json.load(f)
    if verbose: print(f'\t{len(data.keys())} tracks loaded from cache')

    # for each playlist, extract all tracks, remove doubles
    if verbose: print(f'\tScanning tracks for each playlist')
    new_additions = 0
    added_uris = []  # new tracks whose audio features have not been fetched yet
    for i_playlist, playlist_uri in enumerate(playlist_uris):
        new_tracks = get_all_tracks_from_playlist_uri(sp, playlist_uri)
        for new_track in new_tracks:
            uri = new_track['track']['uri'].split(':')[-1]
            # remove doubles (direct dict lookup, no per-track copy of the keys)
            if uri in data:
                continue
            genres = sp.artist(new_track['track']['artists'][0]['uri'])['genres']
            new_track['track']['genres'] = genres
            data[uri] = new_track
            added_uris.append(uri)
            new_additions += 1
            # when 100 new added uris, compute their audio features
            if len(added_uris) == 100:
                data, added_uris = update_data_with_audio_features(sp, added_uris, data)
            # checkpoint the cache every 1000 new tracks; tested only right
            # after an addition so that duplicates cannot re-trigger it
            if new_additions % 1000 == 0:
                if added_uris:
                    data, added_uris = update_data_with_audio_features(sp, added_uris, data)
                check_all_track_has_audio_features(data)
                with open(cache_path, 'w') as f:
                    json.dump(data, f)
        if verbose: print(f"\t\t{i_playlist + 1} playlists scanned ({new_additions} new tracks, total: {len(data.keys())} tracks)")
    if verbose: print('\tDone.')
    # fetch the features of the last, partial batch (if any) before saving
    if added_uris:
        data, _ = update_data_with_audio_features(sp, added_uris, data)
    check_all_track_has_audio_features(data)
    with open(cache_path, 'w') as f:
        json.dump(data, f)
    return data
74
+
75
+
76
def get_all_tracks_from_user(sp, user_id='bkayf', verbose=False):
    """Collect the distinct tracks of all playlists of a user, with audio features.

    Tracks are keyed by the last ``:``-separated field of their URI and get
    their audio features stored under ``['track']['audio_features']``.
    Only for ``user_id == 'bkayf'`` a JSON cache
    (``../data/bkayf/cache_track_features.json``) is loaded first, rewritten
    every 1000 new tracks and rewritten at the end; other users start from an
    empty dict and nothing is written to disk.

    Args:
        sp: client exposing ``user_playlists``, ``playlist_tracks`` and
            ``audio_features`` (presumably a spotipy client -- confirm).
        user_id: identifier of the user whose playlists are scanned.
        verbose: when true, print progress messages.

    Returns:
        The dict mapping track identifier to playlist item, including the
        entries loaded from the cache (if any).

    Raises:
        AssertionError: if an entry is found without audio features.
    """
    if verbose: print(f'Extracting all tracks from user {user_id}.')
    # load data (a cache file only exists for this specific user)
    cache_path = '../data/bkayf/cache_track_features.json' if user_id == 'bkayf' else None
    if cache_path is not None:
        if not os.path.exists(cache_path):
            with open(cache_path, 'w') as f:
                json.dump(dict(), f)
        with open(cache_path, 'r') as f:
            data = json.load(f)
    else:
        data = dict()
    if verbose: print(f'\t{len(data.keys())} tracks loaded from cache')

    # first get all playlists
    offset = 0
    done = False
    playlists = []
    if verbose: print(f'\tScanning playlists.')
    while not done:
        new_playlists = sp.user_playlists(user_id, offset=offset, limit=50)['items']
        playlists += new_playlists
        # a page shorter than the requested size is the last one
        if len(new_playlists) < 50:
            done = True
            if verbose: print(f'\t\tfrom {offset} to {offset + len(new_playlists)} (complete).')
        else:
            if verbose: print(f'\t\tfrom {offset} to {offset + len(new_playlists)},')
            offset += 50

    # for each playlist, extract all tracks, remove doubles
    if verbose: print(f'\tScanning tracks for each playlist')
    new_additions = 0
    added_uris = []  # new tracks whose audio features have not been fetched yet
    for i_playlist, playlist in enumerate(playlists):
        if (i_playlist + 1) % 5 == 0:
            if verbose: print(f"\t\t{i_playlist + 1} playlists scanned ({new_additions} new tracks, total: {len(data.keys())} tracks)")
        playlist_uri = playlist['uri'].split(':')[-1]
        new_tracks = get_all_tracks_from_playlist_uri(sp, playlist_uri)
        for new_track in new_tracks:
            uri = new_track['track']['uri'].split(':')[-1]
            # remove doubles (direct dict lookup, no per-track copy of the keys)
            if uri in data:
                continue
            data[uri] = new_track
            added_uris.append(uri)
            new_additions += 1
            # when 100 new added uris, compute their audio features
            if len(added_uris) == 100:
                data, added_uris = update_data_with_audio_features(sp, added_uris, data)
            # checkpoint the cache every 1000 new tracks; tested only right
            # after an addition so that duplicates cannot re-trigger it
            if cache_path is not None and new_additions % 1000 == 0:
                if added_uris:
                    data, added_uris = update_data_with_audio_features(sp, added_uris, data)
                check_all_track_has_audio_features(data)
                with open(cache_path, 'w') as f:
                    json.dump(data, f)
    if verbose: print('\tDone.')
    # The last, partial batch needs its audio features whatever the user is;
    # only the write to disk is specific to the cached user.
    if added_uris:
        data, _ = update_data_with_audio_features(sp, added_uris, data)
    check_all_track_has_audio_features(data)
    if cache_path is not None:
        with open(cache_path, 'w') as f:
            json.dump(data, f)
    return data
136
+
137
def get_audio_features_from_tracks(sp, data_tracks):
    """Fetch the audio features of every track of ``data_tracks``, 100 at a time.

    Args:
        sp: client exposing ``audio_features(list_of_uris)`` returning one
            entry per requested uri, in the same order.
        data_tracks: dict whose keys are the track identifiers to query; it is
            not modified.

    Returns:
        A dict mapping each key of ``data_tracks`` to its audio-features
        entry, in the iteration order of ``data_tracks``.
    """
    uris = list(data_tracks.keys())
    features_by_uri = dict()
    # extract audio features by groups of 100
    for offset in range(0, len(uris), 100):
        batch = uris[offset:offset + 100]
        features_by_uri.update(zip(batch, sp.audio_features(batch)))
    return features_by_uri
147
+
148
+
149
def get_uri_from_link(link):
    """Return the last ``/``-separated segment of ``link``, ignoring anything from the first ``?``."""
    without_query, _, _ = link.partition("?")
    _, _, last_segment = without_query.rpartition("/")
    return last_segment
151
+
152
+
153
+
154
+
155
def get_track_info_from_playlist_uri(sp, playlist_uri, which_info=('uri',), verbose=False):
    """Extract the requested pieces of information for every track of a playlist.

    Args:
        sp: client exposing ``playlist_tracks``, ``artist``, ``audio_features``
            and ``audio_analysis`` (presumably a spotipy client -- confirm).
        playlist_uri: identifier of the playlist.
        which_info: iterable of field names, all taken from
            ``valid_track_infos``:
            - ``'uri'``, ``'name'``, ``'album'``, ``'popularity'``: the value
              stored under that key in the track (``'album'`` is therefore
              the whole album entry, not only its name);
            - ``'artist_name'``, ``'artist_genres'``, ``'artist_popularity'``:
              the matching field of the track's first artist;
            - ``'audio_features'``: result of ``sp.audio_features`` called on
              each track URI;
            - ``'audio_analysis'``: result of ``sp.audio_analysis`` called on
              each track URI.
        verbose: when true, print the playlist size and the progress of the
            audio-features requests.

    Returns:
        A dict mapping each requested field name to a list with one value per
        track, in playlist order.

    Raises:
        AssertionError: if ``which_info`` contains an unknown field name.
        NotImplementedError: if a field passes validation but has no handler.
    """
    assert len(set(which_info) - valid_track_infos) == 0, f"Error which_info. Valid infos are: {valid_track_infos}"

    tracks = get_all_tracks_from_playlist_uri(sp, playlist_uri)
    if verbose: print(f'Playlist with {len(tracks)} tracks.')

    # requested field name -> key in the artist object
    artist_fields = {'artist_genres': 'genres', 'artist_popularity': 'popularity', 'artist_name': 'name'}

    # prepare artist info if needed; each distinct artist is requested once
    artist_infos = []
    if any(info in artist_fields for info in which_info):
        artist_cache = dict()
        for x in tracks:
            artist_uri = x["track"]["artists"][0]["uri"]
            if artist_uri not in artist_cache:
                artist_cache[artist_uri] = sp.artist(artist_uri)
            artist_infos.append(artist_cache[artist_uri])

    output = dict()
    for info in which_info:
        if info in ('uri', 'name', 'album', 'popularity'):
            output[info] = [x["track"][info] for x in tracks]
        elif info in artist_fields:
            output[info] = [artist_info[artist_fields[info]] for artist_info in artist_infos]
        elif info == 'audio_features':
            output[info] = []
            for i_t, x in enumerate(tracks):
                # one request per track: show progress only when asked to
                if verbose: print(i_t)
                output[info].append(sp.audio_features(x["track"]["uri"]))
        elif info == 'audio_analysis':
            output[info] = [sp.audio_analysis(x["track"]["uri"]) for x in tracks]
        else:
            raise NotImplementedError

    return output
191
+
192
def compute_progress_and_eta(times, iter, total, n_av=3000):
    """Format a progress percentage and an estimated remaining time.

    Args:
        times: sequence of per-iteration durations, in seconds.
        iter: index of the current iteration.
        total: total number of iterations.
        n_av: number of most recent durations averaged for the estimate.

    Returns:
        A string of the form ``"Progress: P%, ETA: hHmMsS."`` where the
        remaining time is the mean of the last ``n_av`` durations multiplied
        by ``total - iter``.
    """
    mean_duration = np.mean(times[-n_av:])
    percent = int(((iter + 1) / total) * 100)
    remaining = mean_duration * (total - iter)
    hours = int(remaining // 3600)
    minutes = int((remaining - (hours * 3600)) // 60)
    seconds = int((remaining - (hours * 3600) - minutes * 60))
    return f"Progress: {percent}%, ETA: {hours}H{minutes}M{seconds}S."