catiR committed on
Commit
779c244
·
1 Parent(s): fbb78e7
scripts/clusterprosody.py ADDED
@@ -0,0 +1,440 @@
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import soundfile as sf
+ from collections import defaultdict
+ from dtw import dtw
+ from sklearn_extra.cluster import KMedoids
+ from copy import deepcopy
+ import os, librosa, json
+
+
+ # based on the original implementation at
+ # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
+ # by Magnús Freyr Morthens 2023, supported by Rannís NSN
+
+
+ # will need:
+ # the whole sentence text (index, word) pairs
+ # the indices of the units the user wants
+ # human meta db of all human recordings
+ # tts dir, human wav + align + f0 dirs
+ # list of tts voices
+ # an actual wav file for each human rec, probably
+ # params like: use f0, use rmse, (use dur), [.....]
+ # .. check what I wrote anywhere about this.
+ # (a sketch of these inputs bundled as one config object follows)
+
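+ # A minimal sketch of how the inputs listed above might be bundled;
+ # every field name here is hypothetical, nothing else in this file
+ # defines or uses them yet.
+ from dataclasses import dataclass, field
+
+ @dataclass
+ class ClusterParams:
+     sentence_words: list          # (index, word) pairs for the whole sentence
+     unit_indices: list            # word-index spans the user selected
+     human_meta: dict              # meta db of all human recordings
+     tts_dir: str
+     wav_dir: str
+     align_dir: str
+     f0_dir: str
+     tts_voices: list = field(default_factory=list)
+     use_f0: bool = True
+     use_rmse: bool = True
+     use_dur: bool = False
+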
+
+ def z_score(x, mean, std):
+     return (x - mean) / std
+
+
+ # TODO ADJUST
+ # new input will be one Meta db
+ # output should probably be the same, e.g.
+ # {'013823-0457777': [('hvaða', 0.89, 1.35),
+ #                     ('sjúkdómar', 1.35, 2.17),
+ #                     ('geta', 2.17, 2.4),
+ #                     ('fylgt', 2.4, 2.83),
+ #                     ('óbeinum', 2.83, 3.29),
+ #                     ('reykingum', 3.29, 3.9)],
+ #  '014226-0508808': [('hvaða', 1.03, 1.45),
+ #                     ('sjúkdómar', 1.45, 2.28),
+ #                     ('geta', 2.41, 2.7),
+ #                     ('fylgt', 2.7, 3.09),
+ #                     ('óbeinum', 3.09, 3.74),
+ #                     ('reykingum', 3.74, 4.42)],
+ #  '013726-0843679': [('hvaða', 0.87, 1.14),
+ #                     ('sjúkdómar', 1.14, 1.75),
+ #                     ('geta', 1.75, 1.96),
+ #                     ('fylgt', 1.96, 2.27),
+ #                     ('óbeinum', 2.27, 2.73),
+ #                     ('reykingum', 2.73, 3.27)] }
+ def get_word_aligns(sentences, directory):
+     """
+     Returns a dictionary of word alignments for the given sentences.
+     """
+     word_aligns = defaultdict(list)
+
+     for sentence in sentences:
+         print(sentence)
+         slist = sentence.split(" ")
+
+         for filename in os.listdir(directory):
+             path = os.path.join(directory, filename)
+
+             with open(path) as f:
+                 lines = f.read().splitlines()[1:]  # skip the csv header
+                 lines = [line.split(",") for line in lines]
+                 # match this alignment file to the sentence, word by word
+                 if len(lines) >= len(slist) and all(lines[i][2] == word for i, word in enumerate(slist)):
+                     rec_id = filename.replace(".csv", "")
+                     word_al = [(lines[j][2], float(lines[j][0]), float(lines[j][1])) for j in range(len(slist))]
+                     # word_aligns[rec_id].append(word_al) # If one speaker has multiple sentences
+                     word_aligns[rec_id] = word_al
+
+             if len(word_aligns) >= 10 * len(sentences): break
+
+     return word_aligns
+
+
+ # TODO ADJUST
+ # or, honestly, it is possibly fine as is -
+ # well, what file format is it reading?
+ # either adjust my f0 file format or adjust this, a little.
+ def get_pitches(start_time, end_time, id, path):
+     """
+     Returns an array of pitch values for a given speech segment.
+     """
+
+     # appears to expect REAPER-style .f0 files: 7 header lines, then one
+     # "time voiced pitch" row per frame, with pitch -1 when unvoiced
+     f = os.path.join(path, id + ".f0")
+     with open(f) as fo:
+         lines = fo.read().splitlines()[7:]
+     lines = [[float(x) for x in line.split()] for line in lines]  # split lines into floats
+     pitches = []
+
+     # mean, std, and percentiles of all voiced pitches in the whole sentence
+     voiced = [line[2] for line in lines if line[2] != -1]
+     mean = np.mean(voiced)
+     std = np.std(voiced)
+
+     fifth_percentile = np.percentile(voiced, 5)
+     ninetyfifth_percentile = np.percentile(voiced, 95)
+
+     for line in lines:
+         time, is_pitch, pitch = line
+
+         if start_time <= time <= end_time:
+             if is_pitch:
+                 # clamp outliers to the 5th/95th percentile before z-scoring
+                 if fifth_percentile <= pitch <= ninetyfifth_percentile:
+                     pitches.append(z_score(pitch, mean, std))
+                 elif pitch < fifth_percentile:
+                     pitches.append(z_score(fifth_percentile, mean, std))
+                 elif pitch > ninetyfifth_percentile:
+                     pitches.append(z_score(ninetyfifth_percentile, mean, std))
+             else:
+                 # unvoiced frames get the floor value
+                 pitches.append(z_score(fifth_percentile, mean, std))
+
+     return pitches
+
+
+ # TODO adjust
+ # probably mainly for the assumption about the filepath,
+ # but also then, comprehend it
+ def get_rmse(start_time, end_time, id, path, pitch_len):
+     """
+     Returns an array of RMSE values for a given speech segment.
+     """
+
+     f = os.path.join(path, id + ".wav")
+     audio, sr = librosa.load(f, sr=16000)
+     segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
+     rmse = librosa.feature.rms(y=segment)  # rms takes the signal as keyword y
+     rmse = rmse[0]
+     # resample the RMSE track to the same length as the pitch track
+     idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
+     return rmse[idx]
+
+
+ TEMP_start_end_word_pairs = [
+     [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
+     [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
+ ]
+
+
+ # TODO !!!!!!!!!!!!!########
+ # make it take any list of (1st word, last word) or (word)
+ # units and do the thing for those units.
+ # make it work if the sentence has 2 of the same word -
+ # PROBABLY this means I actually need to display the sentence
+ # to the user with the words numbered,
+ # and make the user input word indices.
+ # (see the index-based sketch after this function)
+ def get_data(word_aligns, start_end_word_pairs):
+     """
+     Returns a dictionary of pitch, RMSE, and spectral centroid values
+     for the given sentence/word combinations.
+     """
+
+     data = defaultdict(list)
+     f0_dir = "aligned-reaper/samromur-queries/f0/"
+     wav_dir = "aligned-reaper/samromur-queries/wav/"
+
+     for id, word_al in word_aligns.items():
+         for sent in start_end_word_pairs:
+             for word_combs in sent:
+                 start, end = word_combs[0], word_combs[-1]
+
+                 if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
+                     start_time = [al[1] for al in word_al if al[0] == start][0]
+                     end_time = [al[2] for al in word_al if al[0] == end][0]
+
+                     pitches = get_pitches(start_time, end_time, id, f0_dir)
+                     rmses = get_rmse(start_time, end_time, id, wav_dir, len(pitches))
+                     # NOTE: get_spectral_centroids is not defined anywhere in this file yet
+                     spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
+                     pitches_cpy = np.array(deepcopy(pitches))
+                     rmses_cpy = np.array(deepcopy(rmses))
+                     d = [[p, r, s] for p, r, s in zip(pitches_cpy, rmses_cpy, spectral_centroids)]
+                     words = "-".join(word_combs)
+                     data[f"{words}-{id}"] = d
+
+     return data
+ # output -
+ # {'hvaða-sjúkdómar-013823-0457777': [[-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0033261522, -0.4428492071628255]],
+ #  'geta-fylgt-013823-0457777': [[x,x,x],[x,x,x]],
+ #  'hvaða-sjúkdómar-013726-0843679': [[],[]] }
+ # i.e. it seems to be a flat dict whose keys are unique speaker&unit tokens,
+ # where each entry is a list over timepoints with one value per feature (for me up to 2 feats, not 3)
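+
+ # A minimal sketch of the index-based selection the TODO above asks for:
+ # units are (first_word_index, last_word_index) spans into the numbered
+ # sentence, so duplicate words are no longer ambiguous. The function name
+ # and span format are hypothetical, not used elsewhere in this file.
+ def get_unit_times_by_index(word_al, index_spans):
+     """
+     word_al: [(word, start, end), ...] in sentence order, as from get_word_aligns.
+     index_spans: [(i, j), ...] 0-based word indices, with i == j for one-word units.
+     Returns [(label, start_time, end_time), ...].
+     """
+     units = []
+     for i, j in index_spans:
+         words = [w for w, _, _ in word_al[i:j + 1]]
+         label = "-".join(f"{k}_{w}" for k, w in zip(range(i, j + 1), words))
+         units.append((label, word_al[i][1], word_al[j][2]))
+     return units
+
+ # e.g. get_unit_times_by_index(word_aligns['013823-0457777'], [(0, 1), (4, 5)])
+ # -> [('0_hvaða-1_sjúkdómar', 0.89, 2.17), ('4_óbeinum-5_reykingum', 2.83, 3.9)]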
+
+
+ # up to here was forming the data
+ # -----------------------------------------------------
+ # from here down is probably clustering it
+
+
+ # TODO i have no idea how necessary this will be at all
+ def dtw_distance(x, y):
+     """
+     Returns the DTW distance between two pitch sequences.
+     """
+
+     alignment = dtw(x, y, keep_internals=True)
+     return alignment.normalizedDistance
+
+
+ # TODO idk but it looks pretty good
+ # HOWEVER consider excluding the 0 self-comparisons
+ # or see if there is something later that takes care of them
+ # (a sketch of the exclusion is below)
+ # NOTE: from here on, this module-level draft code assumes `data`
+ # from get_data(...) is in scope; it was ported from a notebook.
+ dtw_dists = defaultdict(list)
+
+ for key1, value1 in data.items():
+     d = key1.split("-")
+     words1 = d[:-2]
+     id1, id2 = d[-2], d[-1]
+     for key2, value2 in data.items():
+         d = key2.split("-")
+         words2 = d[:-2]
+         id3, id4 = d[-2], d[-1]
+         if words1 == words2:  # compare whole word lists; zip would truncate unequal lengths
+             dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
+
+ # dtw_dists ends up as the dict from units to lists of tuples
+ # {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
+ #                      ('013823-0457777_013698-0441666', 0.5999433281203399),
+ #                      ('013823-0457777_014675-0563760', 0.4695447105594414),
+ #                      ('014226-0508808_013823-0457777', 0.44080874425223393),
+ #                      ('014226-0508808_014226-0508808', 0.0),
+ #                      ('014226-0508808_013726-0843679', 0.5599404672667414),
+ #                      ('014226-0508808_013681-0442313', 0.6871330752342419)] }
+ # note that currently the 0 self-comparisons are still present here.
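+
+ # Sketch of the exclusion mentioned above: filter the 0.0 self-comparisons
+ # out after the fact (keys of the form "A_A"). Purely illustrative; the
+ # clustering below currently expects the full square matrix, self-pairs included.
+ dtw_dists_noself = {
+     words: [(ids, dist) for ids, dist in pairs if ids.split("_")[0] != ids.split("_")[1]]
+     for words, pairs in dtw_dists.items()
+ }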
+
+
+ # TODO
+ # a) do i need this?
+ def kmedoids_clustering(X, n_clusters=3):
+     kmedoids = KMedoids(n_clusters=n_clusters, random_state=0).fit(X)
+     y_km = kmedoids.labels_
+     return y_km, kmedoids
+
+
+ # TODO !!!!!!!!!!!! #########
+ # THIS IS LIKE THE MAIN THING, probably
+ # ok yes, it can probably use some restructuring,
+ # like something could produce the ids_dist2 format already earlier.
+ # also triple-check what kind of distance matrix is supposed to go into X
+ # and what it currently is -
+ # although ok i think it might be fine, and self-organising,
+ # and that is why it keeps the 0s and has symmetric doubles of everything.
+ # HOWEVER the 10 should possibly be replaced with an nspeakers param ?!?!??
+
+
+ # btw since i guess clustering strictly operates on X,
+ # once i reduce whatever duration thing down to pair-distances,
+ # it no longer matters that duration and pitch/energy had different dimensionality...
+ # .... in fact should i actually dtw the 3 feats pitch/energy/dur separately and cluster on a
+ # 3-dim distance mat? or can you not give it distances in multi-dim space, because distance doesn't do that,
+ # in which case i could still, you know, average the 3 distances into 1 x, although..
+ # (a precomputed-distance sketch follows)
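+
+ # One answer to the question above, sketched: KMedoids can take a square
+ # distance matrix directly with metric='precomputed', so per-feature DTW
+ # distances (pitch, energy, duration) can each become an n_tokens x n_tokens
+ # matrix and then be averaged (or weighted) into one matrix before clustering.
+ # All names here are illustrative, not used elsewhere in this file.
+ def cluster_precomputed(dist_mats, weights=None, n_clusters=3):
+     # dist_mats: list of square np arrays, one per feature, same token order
+     weights = weights or [1.0] * len(dist_mats)
+     D = sum(w * m for w, m in zip(weights, dist_mats)) / sum(weights)
+     km = KMedoids(n_clusters=n_clusters, metric="precomputed", random_state=0).fit(D)
+     return km.labels_, km
+
+ # usage: labels, km = cluster_precomputed([pitch_D, energy_D], weights=[1.0, 0.5])
+ # the diagonal of D is 0, so the self-comparisons are harmless here.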
+
+ kmedoids_cluster_dists = defaultdict(list)
+
+ for words, datas in dtw_dists.items():
+     ids_dist = {d[0]: d[1] for d in datas}  # (currently unused below)
+
+     ids_dist2 = defaultdict(list)
+
+     for d in datas:
+         id1, id2 = d[0].split("_")
+         ids_dist2[id1].append(d[1])
+
+     # reshape the flat distance list into one row of distances per token
+     X = [d[1] for d in datas]
+     X = [X[i:i+10] for i in range(0, len(X), 10)]
+     X = np.array(X)
+     y_km, kmedoids = kmedoids_clustering(X)
+     plot_clusters(X, y_km, words)  # NOTE: plot_clusters is defined at the bottom of this file
+
+     c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
+
+     result = zip(X, kmedoids.labels_)
+     sortedR = sorted(result, key=lambda x: x[1])
+
+     for dp in sortedR:
+         arr, label = dp
+         # recover which token this distance row belongs to
+         ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)
+
+         if ids is None:
+             print("ID is none")
+             continue
+
+         kmedoids_cluster_dists[words].append((label, ids, arr))
+
+ # TODO probably remember to make it RETURN kmedoids_cluster_dists ..
+
+
+ # ###############
+ # TTS and misc ------------------
+ #
+
+
+ # TODO rename this get_audio_part
+ # also maybe take that tmp wav-making out of reaper and put it somewhere general,
+ # so everything gets a wav.
+ # TODO do NOT specify SR,
+ # and CHECK if everything that depends on this is ok with arbitrary SR
+ def get_audio(start_time, end_time, id, path):
+     """
+     Returns the audio segment between start_time and end_time for a given recording.
+     """
+
+     f = os.path.join(path, id + ".wav")
+     audio, sr = librosa.load(f, sr=16000)
+     segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
+     return segment
+
+
+ # see near the end of the notebook for a very nice way to grab timespans of tts audio
+ # (or just the start/end timestamps to mark them) from the alignment json,
+ # based on word position index -
+ # so probably really do show the user the sentence with each word numbered.
+ # (a sketch of that index-based lookup follows)
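+
+ # Hedged sketch of that lookup. It assumes each sentence has a list of word
+ # marks like {"time": <ms>, "value": <word>}, roughly the Polly-style
+ # speech-marks shape - the real speech_marks.json may differ (see TODO below).
+ def tts_word_span(marks, i, j, total_dur_s):
+     """Start/end seconds of words i..j, given one sentence's word marks."""
+     start_s = marks[i]["time"] / 1000
+     # a word ends where the next word begins, or at the end of the audio
+     end_s = marks[j + 1]["time"] / 1000 if j + 1 < len(marks) else total_dur_s
+     return start_s, end_s
+
+ # usage: start_s, end_s = tts_word_span(word_marks, 0, 1, len(audio) / sr)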
+
+
+ # TODO the speech_marks.json is NOT EXACTLY what you get from tiro,
+ # but I don't remember how it is different, so.
+ with open("speech_marks.json") as f:
+     speech_marks_data = json.load(f)
+ alfur_sents = speech_marks_data["Alfur"]  # (load the json before indexing into it)
+
+
+ # TODO there IS something for making tts_data,
+ # but I'm probably pretty much on my own really for that.
+ # NOTE: tts_data is assumed to exist below; it is not built in this file yet.
+
+
+ # TODO this one is very helpful,
+ # but mind that I adjusted the dictionaries earlier.
+ speaker_to_tts_dtw_dists = defaultdict(list)
+
+ for key1, value1 in data.items():
+     d = key1.split("-")
+     words1 = d[:-2]
+     id1, id2 = d[-2], d[-1]
+     for key2, value2 in tts_data.items():
+         d = key2.split("-")
+         words2 = d[:-2]
+         id3, id4 = d[-2], d[-1]
+         if words1 == words2:
+             speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
+
+
+ # TODO i think this is also great,
+ # but figure out how it is doing it,
+ # because of the dict format and stuff:
+ # working keyed by word index instead of word text, ***********
+ # and for 1-word or 3+ word units...
+ tts_dist_to_cluster = defaultdict(list)
+
+ for words1, datas1 in kmedoids_cluster_dists.items():
+     for d1 in datas1:
+         cluster, sp_id1, arr = d1
+         for words2, datas2 in speaker_to_tts_dtw_dists.items():
+             for d2 in datas2:
+                 ids, dist = d2
+                 sp_id2, tts_alfur = ids.split("_")
+                 if sp_id1 == sp_id2 and words1 == words2:
+                     tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)
+
+ # mean distance from the TTS voice to each (unit, cluster); a usage sketch follows
+ tts_mean_dist_to_cluster = {
+     key: np.mean(value) for key, value in tts_dist_to_cluster.items()
+ }
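+
+ # Usage sketch: the cluster a TTS rendition is "closest to" is then just the
+ # argmin over that unit's keys. Illustrative only; the key parsing assumes
+ # the f"{words}-{cluster}" format built above.
+ def closest_cluster(unit_words, mean_dists=tts_mean_dist_to_cluster):
+     keys = [k for k in mean_dists if k.startswith(f"{unit_words}-")]
+     best = min(keys, key=lambda k: mean_dists[k])
+     return int(best.rsplit("-", 1)[1]), mean_dists[best]
+
+ # e.g. closest_cluster("hvaða-sjúkdómar") might return (2, 0.44)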
+
+
+ # THEN there is -
+ # "Plot pitch, rmse, and spectral centroid for each word combination for each speaker"
+ # - this is one person-token per graph and has a word division line - idk if it works for >2 wds.
+ # it might be good to do this for tts at least, eh
+
+
+ # Plot pitch values for each word combination for each speaker in each cluster (with word boundaries)
+ # - multiple speakers (one cluster) per graph - this will be good to show, with tts on top.
+ # i may want to recentre it around the word boundary, at least if only 2 wds.
+ # well i could just pick: it will be centred around the 1st word boundary & good luck if more.
+ # (a sketch of this cluster-plus-tts overlay is below)
+
+ # - the same as above, but rmse
+
+ # go all the way to the bottom of the notebook to see graphs with a tts added on to one cluster.
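+
+ # Minimal sketch of that overlay, assuming pitch contours are already extracted
+ # per token: all human contours of one cluster in grey on a normalised 0..1
+ # time axis, the tts contour on top, and a vertical line at the first word
+ # boundary (given as a fraction of the unit length). All names are illustrative.
+ def plot_cluster_with_tts(human_contours, tts_contour, boundary_frac, title):
+     for c in human_contours:
+         plt.plot(np.linspace(0, 1, len(c)), c, color="grey", alpha=0.5)
+     plt.plot(np.linspace(0, 1, len(tts_contour)), tts_contour, color="red", label="tts")
+     plt.axvline(boundary_frac, linestyle="--", color="black")  # 1st word boundary
+     plt.title(title)
+     plt.legend()
+     plt.show()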
+
+
+ # PLOTTING IS GOING TO BE A WHOLE NIGHTMARE
+ # that is just too bad
+
+ def plot_clusters(X, y, word):
+     u_labels = np.unique(y)
+
+     # plot the results
+     for i in u_labels:
+         plt.scatter(X[y == i, 0], X[y == i, 1], label=i)
+     plt.title(word)
+     plt.legend()
+     plt.show()
scripts/reaper2pass.py CHANGED
@@ -17,7 +17,7 @@ def reaper_soundfile(sound_path, orig_filetype):
  curdir = subprocess.run(["pwd"], capture_output=True, text=True)
  curdir = curdir.stdout.splitlines()[0]
  fname = sound_path.split('/')[-1].replace(orig_filetype,'')
- tmp_path = f'{curdir}/REAPER_TMP/{fname}_tmp.wav'
+ tmp_path = f'{curdir}/REAPER_TMP/{fname}tmp.wav'
  if not os.path.exists(f'{curdir}/REAPER_TMP'):
      os.mkdir(f'{curdir}/REAPER_TMP')
  aud_data.export(tmp_path, format="wav")
scripts/runSQ.py CHANGED
@@ -31,6 +31,9 @@ def run(sentence, voices):
  if meta:
      align_human(meta,speech_aligns,speech_dir,align_model_path)
      f0_human(meta, speech_f0, speech_dir)
+     #TODO cluster humans
+     # input - meta, speech dir, human aligns dir, human f0 dir, any cluster params.
+     # output maybe an object.
  if voices:
      temp_a_sample = get_tts(sentence,voices,tts_dir)
      f0_tts(sentence, voices, tts_dir)