catiR committed
Commit 366ecce · 1 Parent(s): 86daaba

force align tts, add voices

Files changed (6)
  1. app.py +2 -4
  2. requirements.txt +1 -0
  3. scripts/clusterprosody.py +88 -331
  4. scripts/reaper2pass.py +10 -14
  5. scripts/runSQ.py +128 -128
  6. scripts/tapi.py +26 -32
app.py CHANGED
@@ -54,10 +54,8 @@ with bl:
 
     #temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
 
-    voices = ['Alfur','Dilja']
-    # currently i only get json speech marks for those two.
-    # supposedly they also provided for Karl, Dora, but i dont even get their wavs
-    # i get everyone elses wavs tho
+    voices = ['Alfur_v2', 'Dilja_v2', 'Alfur','Dilja', 'Bjartur', 'Rosa', 'Karl', 'Dora']
+
 
     #with gr.Row():
     #with gr.Column(scale=4):
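The comments removed above explain why the list used to stop at Alfur and Dilja: only those voices returned JSON speech marks from the TTS API. Because this commit switches to local forced alignment of the TTS audio (see scripts/runSQ.py and scripts/tapi.py below), word timing no longer depends on the API and all eight voices can be offered. A minimal smoke test, not part of the commit, for checking that each listed voice synthesizes (assumes the Tiro endpoint is reachable; ./tts_check/ is a placeholder directory):

import os
from scripts.tapi import tiro

voices = ['Alfur_v2', 'Dilja_v2', 'Alfur', 'Dilja', 'Bjartur', 'Rosa', 'Karl', 'Dora']
os.makedirs('./tts_check/', exist_ok=True)
for v in voices:
    # tiro() writes <voice>.wav into the save directory and returns its absolute path
    print(v, tiro('Hæ hæ hæ hæ! Ég heiti Gervimaður Finnland, en þú?', v, save='./tts_check/'))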
requirements.txt CHANGED
@@ -5,5 +5,6 @@ librosa
 scipy
 dtw-python
 scikit-learn_extra
+secrets
 pydub
 
scripts/clusterprosody.py CHANGED
@@ -16,63 +16,40 @@ import os, librosa, json
16
 
17
 
18
 
19
-
20
  def z_score(x, mean, std):
21
  return (x - mean) / std
22
 
23
 
24
-
25
-
26
- # output
27
- # {'013823-0457777': [('hvaða', 0.89, 1.35),
28
- # ('sjúkdómar', 1.35, 2.17),
29
- # ('geta', 2.17, 2.4),
30
- # ('fylgt', 2.4, 2.83),
31
- # ('óbeinum', 2.83, 3.29),
32
- # ('reykingum', 3.29, 3.9)],
33
- # '014226-0508808': [('hvaða', 1.03, 1.45),
34
- # ('sjúkdómar', 1.45, 2.28),
35
- # ('geta', 2.41, 2.7),
36
- # ('fylgt', 2.7, 3.09),
37
- # ('óbeinum', 3.09, 3.74),
38
- # ('reykingum', 3.74, 4.42)],
39
- # '013726-0843679': [('hvaða', 0.87, 1.14),
40
- # ('sjúkdómar', 1.14, 1.75),
41
- # ('geta', 1.75, 1.96),
42
- # ('fylgt', 1.96, 2.27),
43
- # ('óbeinum', 2.27, 2.73),
44
- # ('reykingum', 2.73, 3.27)] }
45
-
46
- # takes a list of human SPEAKER IDS not the whole meta db
47
- def get_word_aligns(rec_ids, norm_sent, aln_dir):
48
  """
49
  Returns a dictionary of word alignments for a given sentence.
50
  """
51
  word_aligns = defaultdict(list)
 
52
 
53
- for rec in rec_ids:
54
- slist = norm_sent.split(" ")
55
- aln_path = os.path.join(aln_dir, f'{rec}.tsv')
56
  with open(aln_path) as f:
57
  lines = f.read().splitlines()
58
  lines = [l.split('\t') for l in lines]
59
  try:
60
  assert len(lines) == len(slist)
61
- word_aligns[rec] = [(w,float(s),float(e)) for w,s,e in lines]
62
  except:
63
  print(slist, lines, "<---- something didn't match")
64
  return word_aligns
65
 
66
 
67
-
68
- def get_pitches(start_time, end_time, id, path):
 
69
  """
70
  Returns an array of pitch values for a given speech.
71
  Reads from .f0 file of Time, F0, IsVoiced
72
  """
73
 
74
- f = os.path.join(path, id + ".f0")
75
- with open(f) as f:
76
  lines = f.read().splitlines()
77
  lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
78
  pitches = []
@@ -98,6 +75,7 @@ def get_pitches(start_time, end_time, id, path):
98
 
99
 
100
 
 
101
  # jcheng used energy from esps get_f0
102
  # get f0 says (?) :
103
  #The RMS value of each record is computed based on a 30 msec hanning
@@ -107,20 +85,20 @@ def get_pitches(start_time, end_time, id, path):
107
  # TODO: implement that. ?
108
  # not sure librosa provides hamming window in rms function directly
109
  # TODO handle audio that not originally .wav
110
- def get_rmse(start_time, end_time, id, path):
111
  """
112
  Returns an array of RMSE values for a given speech.
113
  """
114
 
115
- f = os.path.join(path, id + ".wav")
116
- audio, sr = librosa.load(f, sr=16000)
117
  segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
118
- rmse = librosa.feature.rms(y=segment)
119
  rmse = rmse[0]
120
  #idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
121
  return rmse#[idx]
122
 
123
 
 
124
  def downsample_rmse2pitch(rmse,pitch_len):
125
  idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
126
  return rmse[idx]
@@ -142,29 +120,31 @@ def parse_word_indices(start_end_word_index):
142
 
143
  # take any (1stword, lastword) or (word)
144
  # unit and prepare data for that unit
145
- def get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index):
146
  """
147
  Returns a dictionary of pitch, rmse, and spectral centroids values for a given sentence/word combinations.
148
  """
149
 
150
  s_ix, e_ix = parse_word_indices(start_end_word_index)
151
-
152
  words = '_'.join(norm_sent.split(' ')[s_ix:e_ix+1])
153
 
154
- word_aligns = get_word_aligns(h_spk_ids,norm_sent,h_align_dir)
 
 
155
  data = defaultdict(list)
156
  align_data = defaultdict(list)
157
 
158
- for id, word_al in word_aligns.items():
 
159
  start_time = word_al[s_ix][1]
160
  end_time = word_al[e_ix][2]
161
 
162
  seg_aligns = word_al[s_ix:e_ix+1]
163
  seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
164
 
165
- pitches = get_pitches(start_time, end_time, id, h_f0_dir)
166
 
167
- rmses = get_rmse(start_time, end_time, id, h_wav_dir)
168
  rmses = downsample_rmse2pitch(rmses,len(pitches))
169
  #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
170
 
@@ -172,13 +152,12 @@ def get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_wo
172
  rmses_cpy = np.array(deepcopy(rmses))
173
  d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
174
  #words = "-".join(word_combs)
175
- data[f"{words}**{id}"] = d
176
- align_data[f"{words}**{id}"] = seg_aligns
177
 
178
  return words, data, align_data
179
 
180
-
181
-
182
 
183
  def dtw_distance(x, y):
184
  """
@@ -190,7 +169,6 @@ def dtw_distance(x, y):
190
 
191
 
192
 
193
-
194
  # recs is a sorted list of rec IDs
195
  # all recs/data contain the same words
196
  # rec1 and rec2 can be the same
@@ -206,33 +184,7 @@ def pair_dists(data,words,recs):
206
  val2 = data[key2]
207
  dtw_dists.append((f"{rec1}**{rec2}", dtw_distance(val1, val2)))
208
 
209
- #for key1, value1 in data.items():
210
- # d1 = key1.split("**")
211
- # words1 = d1[0]
212
- # if not words:
213
- # words = words1
214
- # spk1 = d1[1]
215
- # for key2, value2 in data.items():
216
- # d2 = key2.split("**")
217
- # words2 = d2[0]
218
- # spk2 = d2[1]
219
- # if all([w0 == w2 for w0, w2 in zip(words.split('_'), words2.split('_'))]):
220
- #dtw_dists[words1].append((f"{spk1}**{spk2}", dtw_distance(value1, value2)))
221
- # dtw_dists.append((f"{spk1}**{spk2}", dtw_distance(value1, value2)))
222
  return dtw_dists
223
- # dtw dists is the dict from units to list of tuples
224
- # or: now just the list not labelled with the unit.
225
- # {'hvaða-sjúkdómar':
226
- # [('013823-0457777_013823-0457777', 0.0),
227
- # ('013823-0457777_013698-0441666', 0.5999433281203399),
228
- # ('013823-0457777_014675-0563760', 0.4695447105594414),
229
- # ('014226-0508808_013823-0457777', 0.44080874425223393),
230
- # ('014226-0508808_014226-0508808', 0.0),
231
- # ('014226-0508808_013726-0843679', 0.5599404672667414),
232
- # ('014226-0508808_013681-0442313', 0.6871330752342419)]
233
- # }
234
- # the 0-distance self-comparisons are present here
235
- # along with both copies of symmetric Speaker1**Speaker2, Speaker2**Speaker1
236
 
237
 
238
 
@@ -244,46 +196,9 @@ def kmedoids_clustering(X):
244
  return y_km, kmedoids
245
 
246
 
247
- def get_tts_data(tdir,voice,start_end_word_index):
248
- with open(f'{tdir}{voice}.json') as f:
249
- speechmarks = json.load(f)
250
- speechmarks = speechmarks['alignments']
251
-
252
- sr=16000
253
- tts_audio, _ = librosa.load(f'{tdir}{voice}.wav',sr=sr)
254
-
255
- # TODO
256
- # tts operates on punctuated version
257
- # so clean this up instead of assuming it will work
258
- s_ix, e_ix = parse_word_indices(start_end_word_index)
259
-
260
- # TODO
261
- # default speechmarks return word start time only -
262
- # this cannot describe pauses #######
263
- s_tts = speechmarks[s_ix]["time"]/1000
264
- if e_ix+1 < len(speechmarks): #if user doesn't want final word, which has no end time mark,
265
- e_tts = speechmarks[e_ix+1]["time"]/1000
266
- tts_segment = tts_audio[int(np.floor(s_tts * sr)):int(np.ceil(e_tts * sr))]
267
- else:
268
- tts_segment = tts_audio[int(np.floor(s_tts * sr)):]
269
- e_tts = len(tts_audio) / sr
270
- # TODO not ideal as probably silence padding on end file?
271
-
272
- tts_align = [(speechmarks[ix]["value"],speechmarks[ix]["time"]) for ix in range(s_ix,e_ix+1)]
273
- tts_align = [(w,s/1000) for w,s in tts_align]
274
- tts_align = [(w,round(s-s_tts,3)) for w,s in tts_align]
275
-
276
- tts_f0 = get_pitches(s_tts, e_tts, voice, tdir)
277
- tts_rmse = get_rmse(s_tts, e_tts, voice, tdir)
278
- tts_rmse = downsample_rmse2pitch(tts_rmse,len(tts_f0))
279
- t_pitches_cpy = np.array(deepcopy(tts_f0))
280
- t_rmses_cpy = np.array(deepcopy(tts_rmse))
281
- tts_data = [[p, r] for p, r in zip(t_pitches_cpy, t_rmses_cpy)]
282
- return tts_data, tts_align
283
-
284
-
285
 
286
  def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):
 
287
 
288
  tts_info = []
289
  for label in set([c for r,c in clusters]):
@@ -308,18 +223,31 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
308
  bad_cluster = tts_info[2][0]
309
  bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
310
 
311
- tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
312
- fig_mid_p = plot_pitch_cluster(mid_data,words,seg_aligns,mid_cluster)
313
- fig_bad_p = plot_pitch_cluster(bad_data,words,seg_aligns,bad_cluster)
 
 
314
 
315
- tts_fig_e = plot_rmse_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
316
- fig_mid_e = plot_rmse_cluster(mid_data,words,seg_aligns,mid_cluster)
317
- fig_bad_e = plot_rmse_cluster(bad_data,words,seg_aligns,bad_cluster)
318
 
319
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
320
 
321
 
 
 
323
 
324
  # since clustering strictly operates on X,
325
  # once reduce a duration metric down to pair-distances,
@@ -329,14 +257,16 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
329
  # or can it not take that input in multidimensional space
330
  # then the 3 dists can still be averaged to flatten, if appropriately scaled
331
 
332
- def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_dir, voices, start_end_word_index):
333
 
334
  h_spk_ids = sorted(h_spk_ids)
335
  nsents = len(h_spk_ids)
336
 
337
- words, data, seg_aligns = get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index)
 
 
338
 
339
- dtw_dists = pair_dists(data,words,h_spk_ids)
340
 
341
  kmedoids_cluster_dists = []
342
 
@@ -352,13 +282,17 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
352
  groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]
353
 
354
 
355
- # tts: assume the first 64 chars of sentence are enough
356
- tdir = f'{tts_dir}{orig_sent.replace(" ","_")[:65]}/'
 
357
  for v in voices:
358
- tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
 
 
 
359
 
360
  # match the data with a cluster -----
361
- best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
362
 
363
  # only supports one voice at a time currently
364
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
@@ -432,125 +366,39 @@ def get_audio_part(start_time, end_time, id, path):
432
 
433
 
434
 
435
-
436
- def plot_pitch_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, voice):
437
  colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
438
  cc = 0
439
  fig = plt.figure(figsize=(10, 5))
440
- plt.title(f"{words} - Pitch - Cluster {cluster_id}")
441
- for k,v in speech_data.items():
442
-
443
- spk = k.split('**')[1]
444
-
445
- word_times = seg_aligns[k]
446
-
447
- pitches = [p for p,e in v]
448
- # datapoint interval is 0.005 seconds
449
- pitch_xvals = [x*0.005 for x in range(len(pitches))]
450
-
451
- # centre around the first word boundary -
452
- # if 3+ words, too bad.
453
- if len(word_times)>1:
454
- realign = np.mean([word_times[0][2],word_times[1][1]])
455
- pitch_xvals = [x - realign for x in pitch_xvals]
456
- word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
457
- plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
458
-
459
- if len(word_times)>2:
460
- for i in range(1,len(word_times)-1):
461
- bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
462
- plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
463
-
464
- plt.scatter(pitch_xvals, pitches, color=colors[cc], label=f"Speaker {spk}")
465
- cc += 1
466
- if cc >= len(colors):
467
- cc=0
468
-
469
- tpitches = [p for p,e in tts_data]
470
- t_xvals = [x*0.005 for x in range(len(tpitches))]
471
 
472
- if len(tts_align)>1:
473
- realign = tts_align[1][1]
474
- t_xvals = [x - realign for x in t_xvals]
475
- tts_align = [(w,s-realign) for w,s in tts_align]
476
-
477
- if len(tts_align)>2:
478
- for i in range(2,len(tts_align)):
479
- bound_line = tts_align[i][1]
480
- plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i][0]}")
481
- plt.scatter(t_xvals, tpitches, color="black", label=f"TTS {voice}")
482
-
483
-
484
- #plt.legend()
485
- #plt.show()
486
-
487
-
488
- return fig
489
-
490
-
491
-
492
- def plot_pitch_cluster(speech_data,words,seg_aligns,cluster_id):
493
- colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
494
- cc = 0
495
- fig = plt.figure(figsize=(8, 4))
496
- plt.title(f"{words} - Pitch - Cluster {cluster_id}")
497
  for k,v in speech_data.items():
498
 
499
  spk = k.split('**')[1]
500
 
501
  word_times = seg_aligns[k]
502
 
503
- pitches = [p for p,e in v]
504
- # datapoint interval is 0.005 seconds
505
- pitch_xvals = [x*0.005 for x in range(len(pitches))]
506
-
507
- # centre around the first word boundary -
508
- # if 3+ words, too bad.
509
- if len(word_times)>1:
510
- realign = np.mean([word_times[0][2],word_times[1][1]])
511
- pitch_xvals = [x - realign for x in pitch_xvals]
512
- word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
513
- plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
514
-
515
- if len(word_times)>2:
516
- for i in range(1,len(word_times)-1):
517
- bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
518
- plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
519
-
520
- plt.scatter(pitch_xvals, pitches, color=colors[cc], label=f"Speaker {spk}")
521
- cc += 1
522
- if cc >= len(colors):
523
- cc=0
524
-
525
- #plt.legend()
526
- #plt.show()
527
-
528
-
529
- return fig
530
-
531
-
532
-
533
-
534
- def plot_rmse_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, voice):
535
- colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
536
- cc = 0
537
- fig = plt.figure(figsize=(10, 5))
538
- plt.title(f"{words} - Energy - Cluster {cluster_id}")
539
- for k,v in speech_data.items():
540
-
541
- spk = k.split('**')[1]
542
-
543
- word_times = seg_aligns[k]
544
 
545
- rmse = [e for p,e in v]
546
  # datapoint interval is 0.005 seconds
547
- rmse_xvals = [x*0.005 for x in range(len(rmse))]
548
 
549
  # centre around the first word boundary -
550
  # if 3+ words, too bad.
551
  if len(word_times)>1:
552
  realign = np.mean([word_times[0][2],word_times[1][1]])
553
- rmse_xvals = [x - realign for x in rmse_xvals]
554
  word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
555
  plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
556
 
@@ -559,24 +407,25 @@ def plot_rmse_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, v
559
  bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
560
  plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
561
 
562
- plt.plot(rmse_xvals, rmse, color=colors[cc], label=f"Speaker {spk}")
563
  cc += 1
564
  if cc >= len(colors):
565
  cc=0
566
 
567
- trmse = [e for p,e in tts_data]
568
- t_xvals = [x*0.005 for x in range(len(trmse))]
 
569
 
570
- if len(tts_align)>1:
571
- realign = tts_align[1][1]
572
- t_xvals = [x - realign for x in t_xvals]
573
- tts_align = [(w,s-realign) for w,s in tts_align]
574
 
575
- if len(tts_align)>2:
576
- for i in range(2,len(tts_align)):
577
- bound_line = tts_align[i][1]
578
- plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i][0]}")
579
- plt.plot(t_xvals, trmse, color="black", label=f"TTS {voice}")
580
 
581
 
582
  #plt.legend()
@@ -586,99 +435,7 @@ def plot_rmse_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, v
586
  return fig
587
 
588
 
589
- def plot_rmse_cluster(speech_data,words,seg_aligns,cluster_id):
590
- colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
591
- cc = 0
592
- fig = plt.figure(figsize=(10, 5))
593
- plt.title(f"{words} - Energy - Cluster {cluster_id}")
594
- for k,v in speech_data.items():
595
-
596
- spk = k.split('**')[1]
597
 
598
- word_times = seg_aligns[k]
599
-
600
- rmse = [e for p,e in v]
601
- # datapoint interval is 0.005 seconds
602
- rmse_xvals = [x*0.005 for x in range(len(rmse))]
603
-
604
- # centre around the first word boundary -
605
- # if 3+ words, too bad.
606
- if len(word_times)>1:
607
- realign = np.mean([word_times[0][2],word_times[1][1]])
608
- rmse_xvals = [x - realign for x in rmse_xvals]
609
- word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
610
- plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
611
-
612
- if len(word_times)>2:
613
- for i in range(1,len(word_times)-1):
614
- bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
615
- plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
616
-
617
- plt.plot(rmse_xvals, rmse, color=colors[cc], label=f"Speaker {spk}")
618
- cc += 1
619
- if cc >= len(colors):
620
- cc=0
621
-
622
- return fig
623
-
624
-
625
- # want to:
626
- # - find tts best cluster
627
- # - find avg dist for tts in that cluster
628
- # - find avg dist for any human to the rest of its cluster
629
-
630
-
631
-
632
- # see near end of notebook for v nice way to grab timespans of tts audio
633
- # (or just the start/end timestamps to mark them) from alignment json
634
- # based on word position index -
635
- # so probably really do show user the sentence with each word numbered.
636
-
637
-
638
-
639
- # THEN there is -
640
- # \# Plot pitch, rmse, and spectral centroid for each word combination for each speaker
641
- # - this is one persontoken per graph and has a word division line - idk if works >2 wds.
642
- # it might be good to do this for tts at least, eh
643
-
644
-
645
- # Plot pitch values for each word combination for each speaker in each cluster (with word boundaries)
646
- # - multi speakers (one cluster) per graph - this will be good to show, with tts on top.
647
- # i may want to recentre it around wd bound. at least if only 2 wds.
648
- # well i could just pick, like, it will be centred around the 1st wboundary & good luck if more.
649
-
650
- # - the same as above, but rmse
651
-
652
- # go all the way to the bottom to see gphs with a tts added on to one cluster.
653
-
654
-
655
-
656
-
657
- # will need:
658
- # the whole sentence text (index, word) pairs
659
- # the indices of units the user wants
660
- # human meta db of all human recordings
661
- # tts dir, human wav + align + f0 dirs
662
- # list of tts voices
663
- # an actual wav file for each human rec, probably
664
- # params like: use f0, use rmse, (use dur), [.....]
665
- # .. check.
666
-
667
-
668
-
669
-
670
-
671
- def plot_clusters(X, y, word):
672
- u_labels = np.unique(y)
673
-
674
- # plot the results
675
- for i in u_labels:
676
- plt.scatter(X[y == i, 0], X[y == i, 1], label=i)
677
- plt.title(word)
678
- plt.legend()
679
- plt.show()
680
-
681
-
682
 
683
 
684
 
 
16
 
17
 
18
 
 
19
  def z_score(x, mean, std):
20
  return (x - mean) / std
21
 
22
 
23
+ # given a sentence and list of its speakers + their alignment files,
24
+ # return a dictionary of word alignments
25
+ def get_word_aligns(norm_sent, aln_paths):
 
 
 
26
  """
27
  Returns a dictionary of word alignments for a given sentence.
28
  """
29
  word_aligns = defaultdict(list)
30
+ slist = norm_sent.split(" ")
31
 
32
+ for spk,aln_path in aln_paths:
 
 
33
  with open(aln_path) as f:
34
  lines = f.read().splitlines()
35
  lines = [l.split('\t') for l in lines]
36
  try:
37
  assert len(lines) == len(slist)
38
+ word_aligns[spk] = [(w,float(s),float(e)) for w,s,e in lines]
39
  except:
40
  print(slist, lines, "<---- something didn't match")
41
  return word_aligns
42
 
43
 
44
+
45
+ #TODO pass whole path
46
+ def get_pitches(start_time, end_time, fpath):
47
  """
48
  Returns an array of pitch values for a given speech.
49
  Reads from .f0 file of Time, F0, IsVoiced
50
  """
51
 
52
+ with open(fpath) as f:
 
53
  lines = f.read().splitlines()
54
  lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
55
  pitches = []
 
75
 
76
 
77
 
78
+ # TODO take whole path
79
  # jcheng used energy from esps get_f0
80
  # get f0 says (?) :
81
  #The RMS value of each record is computed based on a 30 msec hanning
 
85
  # TODO: implement that. ?
86
  # not sure librosa provides hamming window in rms function directly
87
  # TODO handle audio that not originally .wav
88
+ def get_rmse(start_time, end_time, wpath):
89
  """
90
  Returns an array of RMSE values for a given speech.
91
  """
92
 
93
+ audio, sr = librosa.load(wpath, sr=16000)
 
94
  segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
95
+ rmse = librosa.feature.rms(y=segment,frame_length=480,hop_length=80)#librosa.feature.rms(y=segment)
96
  rmse = rmse[0]
97
  #idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
98
  return rmse#[idx]
99
 
100
 
101
+ # may be unnecessary depending how rmse and pitch window/hop are calculated already
102
  def downsample_rmse2pitch(rmse,pitch_len):
103
  idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
104
  return rmse[idx]
 
120
 
121
  # take any (1stword, lastword) or (word)
122
  # unit and prepare data for that unit
123
+ def get_data(norm_sent,path_key,start_end_word_index):
124
  """
125
  Returns a dictionary of pitch, rmse, and spectral centroids values for a given sentence/word combinations.
126
  """
127
 
128
  s_ix, e_ix = parse_word_indices(start_end_word_index)
 
129
  words = '_'.join(norm_sent.split(' ')[s_ix:e_ix+1])
130
 
131
+ align_paths = [(spk,pdict['aln']) for spk,pdict in path_key]
132
+ word_aligns = get_word_aligns(norm_sent, align_paths)
133
+
134
  data = defaultdict(list)
135
  align_data = defaultdict(list)
136
 
137
+ for spk, pdict in path_key:
138
+ word_al = word_aligns[spk]
139
  start_time = word_al[s_ix][1]
140
  end_time = word_al[e_ix][2]
141
 
142
  seg_aligns = word_al[s_ix:e_ix+1]
143
  seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
144
 
145
+ pitches = get_pitches(start_time, end_time, pdict['f0'])
146
 
147
+ rmses = get_rmse(start_time, end_time, pdict['wav'])
148
  rmses = downsample_rmse2pitch(rmses,len(pitches))
149
  #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
150
 
 
152
  rmses_cpy = np.array(deepcopy(rmses))
153
  d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
154
  #words = "-".join(word_combs)
155
+ data[f"{words}**{spk}"] = d
156
+ align_data[f"{words}**{spk}"] = seg_aligns
157
 
158
  return words, data, align_data
159
 
160
+
 
161
 
162
  def dtw_distance(x, y):
163
  """
 
169
 
170
 
171
 
 
172
  # recs is a sorted list of rec IDs
173
  # all recs/data contain the same words
174
  # rec1 and rec2 can be the same
 
184
  val2 = data[key2]
185
  dtw_dists.append((f"{rec1}**{rec2}", dtw_distance(val1, val2)))
 
 
187
  return dtw_dists
 
 
 
188
 
189
 
190
 
 
196
  return y_km, kmedoids
197
 
 
 
199
 
200
  def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):
201
+
202
 
203
  tts_info = []
204
  for label in set([c for r,c in clusters]):
 
223
  bad_cluster = tts_info[2][0]
224
  bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
225
 
226
+ #tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
227
+ tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
228
+ fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,cluster)
229
+ fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,cluster)
230
+
231
 
232
+ tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
233
+ fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,cluster)
234
+ fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,cluster)
235
 
236
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
237
 
238
 
239
 
240
+ def gp(d,s,x):
241
+ return os.path.join(d, f'{s}.{x}')
242
+
243
+ def gen_tts_paths(tdir,voices):
244
+ plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0')}) for v in voices]
245
+ return plist
246
+
247
+ def gen_h_paths(wdir,adir,f0dir,spks):
248
+ plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0')}) for s in spks]
249
+ return plist
250
+
251
 
252
  # since clustering strictly operates on X,
253
  # once reduce a duration metric down to pair-distances,
 
257
  # or can it not take that input in multidimensional space
258
  # then the 3 dists can still be averaged to flatten, if appropriately scaled
259
 
260
+ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_sent_dir, voices, start_end_word_index):
261
 
262
  h_spk_ids = sorted(h_spk_ids)
263
  nsents = len(h_spk_ids)
264
 
265
+ h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
266
+
267
+ words, h_data, h_seg_aligns = get_data(norm_sent,h_all_paths,start_end_word_index)
268
 
269
+ dtw_dists = pair_dists(h_data,words,h_spk_ids)
270
 
271
  kmedoids_cluster_dists = []
272
 
 
282
  groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]
283
 
284
 
285
+ tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
286
+ _, tts_data, tts_seg_aligns = get_data(norm_sent,tts_all_paths,start_end_word_index)
287
+
288
  for v in voices:
289
+ voice_data = tts_data[f"{words}**{v}"]
290
+ voice_align = tts_seg_aligns[f"{words}**{v}"]
291
+
292
+ #tts_data, tts_align = get_one_tts_data(tts_sent_dir,v,norm_sent,start_end_word_index)
293
 
294
  # match the data with a cluster -----
295
+ best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
296
 
297
  # only supports one voice at a time currently
298
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
 
366
 
367
 
368
 
369
+ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=None,tts_align=None,voice=None):
370
+ #(speech_data, tts_data, tts_align, words, seg_aligns, cluster_id, voice):
371
  colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
372
  cc = 0
373
  fig = plt.figure(figsize=(10, 5))
 
 
 
374
 
375
+ if feature.lower() in ['pitch','f0']:
376
+ fname = 'Pitch'
377
+ ffunc = lambda x: [p for p,e in x]
378
+ elif feature.lower() in ['energy', 'rmse']:
379
+ fname = 'Energy'
380
+ ffunc = lambda x: [e for p,e in x]
381
+ else:
382
+ print('problem with the figure')
383
+ return fig
384
+
385
+ plt.title(f"{words} - {fname} - Cluster {cluster_id}")
 
 
 
386
  for k,v in speech_data.items():
387
 
388
  spk = k.split('**')[1]
389
 
390
  word_times = seg_aligns[k]
 
 
392
 
393
+ feats = ffunc(v)
394
  # datapoint interval is 0.005 seconds
395
+ feat_xvals = [x*0.005 for x in range(len(feats))]
396
 
397
  # centre around the first word boundary -
398
  # if 3+ words, too bad.
399
  if len(word_times)>1:
400
  realign = np.mean([word_times[0][2],word_times[1][1]])
401
+ feat_xvals = [x - realign for x in feat_xvals]
402
  word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
403
  plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
404
 
 
407
  bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
408
  plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
409
 
410
+ plt.scatter(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
411
  cc += 1
412
  if cc >= len(colors):
413
  cc=0
414
 
415
+ if voice:
416
+ tfeats = [p for p,e in tts_data]
417
+ t_xvals = [x*0.005 for x in range(len(tfeats))]
418
 
419
+ if len(tts_align)>1:
420
+ realign = np.mean([tts_align[0][2],tts_align[1][1]])
421
+ t_xvals = [x - realign for x in t_xvals]
422
+ tts_align = [(w,s-realign,e-realign) for w,s,e in tts_align]
423
 
424
+ if len(tts_align)>2:
425
+ for i in range(1,len(tts_align)-1):
426
+ bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
427
+ plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
428
+ plt.scatter(t_xvals, tfeats, color="black", label=f"TTS {voice}")
429
 
430
 
431
  #plt.legend()
 
435
  return fig
436
 
 
 
439
 
440
 
441
 
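The main refactor in scripts/clusterprosody.py replaces the separate speaker-id and directory arguments with lists of (id, path-dict) pairs built by the new gen_h_paths()/gen_tts_paths() helpers, so human recordings and TTS output flow through the same get_data() path. A minimal sketch, not part of the commit, showing the structure those helpers produce (directory names are placeholders; the speaker ids come from the example in the file's own comments):

from scripts.clusterprosody import gen_h_paths, gen_tts_paths

# each entry is (speaker_or_voice_id, {'wav': ..., 'aln': ..., 'f0': ...})
h_paths = gen_h_paths('human/wav/', 'human/aln/', 'human/f0/', ['013823-0457777', '014226-0508808'])
t_paths = gen_tts_paths('tts/some_sentence_dir/', ['Alfur'])

print(h_paths[0])
# ('013823-0457777', {'wav': 'human/wav/013823-0457777.wav',
#                     'aln': 'human/aln/013823-0457777.tsv',
#                     'f0': 'human/f0/013823-0457777.f0'})

These lists are what get_data() and cluster() now consume in place of the old per-directory arguments.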
scripts/reaper2pass.py CHANGED
@@ -5,11 +5,13 @@ from pydub import AudioSegment
 import subprocess
 import os
 
+# 2 pass f0 estimation
 # ref. Hirst The analysis by synthesis of speech melody: from data to models
+# python wrap for gradio app
 
 
 # reaper requires wav file path input,
-# not audio data itself.
+# not audio data.
 # reaper does NOT require 16khz mono audio.
 def reaper_soundfile(sound_path, orig_filetype):
 
@@ -17,9 +19,9 @@ def reaper_soundfile(sound_path, orig_filetype):
     curdir = subprocess.run(["pwd"], capture_output=True, text=True)
     curdir = curdir.stdout.splitlines()[0]
     fname = sound_path.split('/')[-1].replace(orig_filetype,'')
-    tmp_path = f'{curdir}/REAPER_TMP/{fname}tmp.wav'
-    if not os.path.exists(f'{curdir}/REAPER_TMP'):
-        os.mkdir(f'{curdir}/REAPER_TMP')
+    tmp_path = f'{curdir}/files_tmp/{fname}tmp.wav'
+    if not os.path.exists(f'{curdir}/files_tmp'):
+        os.mkdir(f'{curdir}/files_tmp')
     aud_data.export(tmp_path, format="wav")
     wav_path = tmp_path
 
@@ -31,11 +33,8 @@ def reaper_soundfile(sound_path, orig_filetype):
 def get_reaper(wav_path, reaper_path, maxf0='700', minf0='50'):
 
     f0_data = subprocess.run([reaper_path, "-i", wav_path, '-f', '/dev/stdout', '-x', maxf0, '-m', minf0, '-a'],capture_output=True).stdout
-    #print('PLAIN:',f0_data)
     f0_data = f0_data.decode()
-    #print('DECODE-PITCH:',f0_data)
     f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
-    #print(f0_data)
     f0_data = [l.split(' ') for l in f0_data]
     f0_data = [l for l in f0_data if len(l) == 3] # the last line or 2 lines are other info, different format
     f0_data = [ [float(t), float(f), float(v)] for t,v,f in f0_data]
@@ -43,12 +42,9 @@ def get_reaper(wav_path, reaper_path, maxf0='700', minf0='50'):
     return f0_data
 
 
-# currently,
-# take the simplified list data from get_reaper_data,
-# with format Time F0Val only at times with existing F0Val,
-# and write that to a text file.
-# alternate would be letting reaper write its own files
-# instead of capturing the stdout...
+
+# save simplified data format from get_reaper
+# instead of reaper's original output
 def save_pitch(f0_data, save_path,hed=False):
     with open(save_path,'w') as handle:
         if hed:
@@ -60,7 +56,7 @@ def save_pitch(f0_data, save_path,hed=False):
 def estimate_pitch(sound_path,reaper_path = "REAPER/build/reaper"):
 
     orig_ftype = sound_path.split('.')[-1]
-    if orig_ftype == '.wav':
+    if orig_ftype == 'wav':
         wav_path = sound_path
     else:
         tmp_path = reaper_soundfile(sound_path, orig_ftype)
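For reference, the rest of the repo consumes this module as a pair of calls: estimate_pitch() runs the REAPER binary (the 2-pass estimation named in the new comment) and returns rows of [time, f0, voicing], and save_pitch() writes them to the .f0 files that scripts/clusterprosody.py reads back. A minimal usage sketch, not part of the commit (file paths are placeholders; REAPER must be built at the default path):

from scripts.reaper2pass import estimate_pitch, save_pitch

f0_data = estimate_pitch('recordings/example.wav', reaper_path='REAPER/build/reaper')
save_pitch(f0_data, 'f0/example.f0')   # rows of Time, F0, IsVoiced, the format get_pitches() expects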
scripts/runSQ.py CHANGED
@@ -1,4 +1,4 @@
1
- import os, unicodedata
2
  from scripts.ctcalign import aligner, wav16m
3
  from scripts.tapi import tiro
4
  from scripts.reaper2pass import estimate_pitch, save_pitch
@@ -30,23 +30,24 @@ def run(sentence, voices, start_end_word_ix):
30
 
31
 
32
  norm_sentence = snorm(sentence)
 
33
 
34
- meta = get_recordings(norm_sentence, corpus_meta)
35
- if meta:
36
- align_human(meta,speech_aligns,speech_dir,align_model_path)
37
- f0_human(meta, speech_f0, speech_dir)
38
- human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
39
 
40
  if voices:
 
 
41
  voices = [voices[0]] # TODO. now limit one voice at a time.
42
- tts_sample, tts_speechmarks = get_tts(sentence,voices,tts_dir)
43
- f0_tts(sentence, voices, tts_dir)
44
-
45
- score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
46
 
47
  # also stop forgetting duration.
48
 
49
- return tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
 
 
 
 
 
50
 
51
 
52
  def snorm(s):
@@ -54,6 +55,7 @@ def snorm(s):
54
  while ' ' in s:
55
  s = s.replace(' ', ' ')
56
  return s
 
57
 
58
 
59
  def create_temp_sent_list():
@@ -66,156 +68,155 @@ def create_temp_sent_list():
66
 
67
 
 
69
 
70
  # find all the recordings of a given sentence
71
  # listed in the corpus metadata.
 
72
  # sentence should be provided lowercase without punctuation
73
- # TODO something not fatal to interface if <10
74
- def get_recordings(sentence, corpusdb):
 
 
 
75
  with open(corpusdb,'r') as handle:
76
  meta = handle.read().splitlines()
77
  meta = [l.split('\t') for l in meta[1:]]
78
 
 
79
  # column index 4 of db is normalised sentence text
80
- smeta = [l for l in meta if l[4] == sentence]
81
 
82
- if len(smeta) < 10:
83
- if len(smeta) < 1:
84
  print('This sentence does not exist in the corpus')
85
  else:
86
  print('Under 10 copies of the sentence: skipping.')
87
  return []
88
  else:
89
- print(f'{len(smeta)} recordings of sentence <{sentence}>')
90
- return smeta
91
-
92
-
93
-
94
- # check if word alignments exist for a set of human speech recordings
95
- # if not, warn, and make them with ctcalign.
96
- def align_human(meta,align_dir,speech_dir,model_path):
97
-
98
- model_word_sep = '|'
99
- model_blank_tk = '[PAD]'
100
 
101
- no_align = []
102
 
103
- for rec in meta:
104
- apath = align_dir + rec[2].replace('.wav','.tsv')
105
- if not os.path.exists(apath):
106
- no_align.append(rec)
107
-
108
- if no_align:
109
- print(f'Need to run alignment for {len(no_align)} files')
110
- if not os.path.exists(align_dir):
111
  os.makedirs(align_dir)
 
 
112
 
113
- caligner = aligner(model_path,model_word_sep,model_blank_tk)
114
- for rec in no_align:
115
- #wav_path = f'{speech_dir}{rec[1]}/{rec[2]}'
116
- wav_path = f'{speech_dir}{rec[2]}'
117
- word_aln = caligner(wav16m(wav_path),rec[4],is_normed=True)
118
- apath = align_dir + rec[2].replace('.wav','.tsv')
119
- word_aln = [[str(x) for x in l] for l in word_aln]
120
- with open(apath,'w') as handle:
121
- handle.write(''.join(['\t'.join(l)+'\n' for l in word_aln]))
122
- else:
123
- print('All alignments existed')
124
-
125
-
126
 
127
- # check if f0s exist for all of those files.
128
- # if not, warn, and make them with TODO reaper
129
- def f0_human(meta, f0_dir, speech_dir, reaper_path = "REAPER/build/reaper"):
130
- no_f0 = []
131
-
132
  for rec in meta:
 
 
 
 
 
133
  fpath = f0_dir + rec[2].replace('.wav','.f0')
134
  if not os.path.exists(fpath):
135
- no_f0.append(rec)
136
-
137
- if no_f0:
138
- print(f'Need to estimate pitch for {len(no_f0)} recordings')
139
- if not os.path.exists(f0_dir):
140
- os.makedirs(f0_dir)
141
- for rec in no_f0:
142
- wav_path = f'{speech_dir}{rec[2]}'
143
  fpath = f0_dir + rec[2].replace('.wav','.f0')
144
- f0_data = estimate_pitch(wav_path, reaper_path)
145
  save_pitch(f0_data,fpath)
146
-
147
-
148
- #print('2ND PASS PITCHES OF', fpath)
149
- #print(f0_data)
150
-
151
-
152
- else:
153
- print('All speech pitch trackings existed')
154
 
155
 
156
 
157
 
158
- # check if the TTS wavs + align jsons exist for this sentence
159
- # if not, warn and make them with TAPI ******
160
- def get_tts(sentence,voices,ttsdir):
161
 
162
- # assume the first 64 chars of sentence are enough
163
- dpath = sentence.replace(' ','_')[:65]
164
-
165
- no_voice = []
166
 
167
- temp_sample_path = ''
 
 
 
168
 
169
  for v in voices:
170
- wpath = f'{ttsdir}{dpath}/{v}.wav'
171
- jpath = f'{ttsdir}{dpath}/{v}.json'
172
- if not (os.path.exists(wpath) and os.path.exists(jpath)):
173
- no_voice.append(v)
174
- if not temp_sample_path:
175
- temp_sample_path = wpath
176
- temp_json_path = jpath
177
 
178
- if no_voice:
179
- print(f'Need to generate TTS for {len(no_voice)} voices')
180
- if not os.path.exists(f'{ttsdir}{dpath}'):
181
- os.makedirs(f'{ttsdir}{dpath}')
182
- for v in voices:
183
- wf, af = tiro(sentence,v,save=f'{ttsdir}{dpath}/')
 
 
 
184
 
185
- else:
186
- print('TTS for all voices existed')
187
 
188
- return temp_sample_path, temp_json_path
 
 
 
 
 
189
 
190
 
191
 
192
- # check if the TTS f0s exist
193
- # if not warn + make
194
- # TODO collapse functions
195
- def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
196
-
197
- # assume the first 64 chars of sentence are enough
198
- dpath = sentence.replace(' ','_')[:65]
199
 
200
- no_f0 = []
 
201
 
202
- for v in voices:
203
- fpath = f'{ttsdir}{dpath}/{v}.f0'
204
- if not os.path.exists(fpath):
205
- no_f0.append(v)
206
-
207
-
208
- if no_f0:
209
- print(f'Need to estimate pitch for {len(no_f0)} voices')
210
- for v in voices:
211
- wav_path = f'{ttsdir}{dpath}/{v}.wav'
212
- fpath = f'{ttsdir}{dpath}/{v}.f0'
213
 
214
- f0_data = estimate_pitch(wav_path, reaper_path)
215
- save_pitch(f0_data,fpath)
 
 
 
216
 
217
- else:
218
- print('All TTS pitch trackings existed')
 
 
219
 
220
 
221
 
@@ -239,21 +240,20 @@ def localtest():
239
  reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'
240
 
241
  norm_sentence = snorm(sentence)
242
- meta = get_recordings(norm_sentence, corpus_meta)
243
- #print(meta)
244
- if meta:
245
- align_human(meta,speech_aligns,speech_dir,align_model_path)
246
- f0_human(meta, speech_f0, speech_dir, reaper_path = reaper_exc )
247
 
248
- human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
 
249
 
250
  if voices:
251
- voices = [voices[0]] # TODO. now limit one voice at a time.
252
- audio_sample, speechmarks = get_tts(sentence,voices,tts_dir)
253
- f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
254
 
 
 
 
255
 
256
- score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
 
257
 
258
 
259
 
 
1
+ import os, unicodedata, string, secrets
2
  from scripts.ctcalign import aligner, wav16m
3
  from scripts.tapi import tiro
4
  from scripts.reaper2pass import estimate_pitch, save_pitch
 
30
 
31
 
32
  norm_sentence = snorm(sentence)
33
+ sentence = sentence.replace('\t', ' ')
34
 
35
+ human_rec_ids = get_samromur_queries(norm_sentence, corpus_meta, speech_dir, speech_aligns, align_model_path, speech_f0)
 
 
 
 
36
 
37
  if voices:
38
+ temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
39
+
40
  voices = [voices[0]] # TODO. now limit one voice at a time.
41
+ score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
42
 
43
  # also stop forgetting duration.
44
 
45
+ return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
46
+
47
+
48
+
49
+
50
+
51
 
52
 
53
  def snorm(s):
 
55
  while ' ' in s:
56
  s = s.replace(' ', ' ')
57
  return s
58
+
59
 
60
 
61
  def create_temp_sent_list():
 
68
 
69
 
70
 
71
+ def align_file(wav_path, output_path, norm_sentence, word_aligner = None, model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"):
72
+
73
+ model_word_sep = '|'
74
+ model_blank_tk = '[PAD]'
75
+
76
+ if not word_aligner:
77
+ print('initiating forced alignment, can take some time...')
78
+ word_aligner = aligner(model_path,model_word_sep,model_blank_tk)
79
+
80
+ word_aln = word_aligner(wav16m(wav_path),norm_sentence,is_normed=True)
81
+ word_aln = [[str(x) for x in l] for l in word_aln]
82
+
83
+ with open(output_path,'w') as handle:
84
+ handle.write(''.join(['\t'.join(l)+'\n' for l in word_aln]))
85
+
86
+ return word_aligner
87
+
88
+
89
+
90
 
91
  # find all the recordings of a given sentence
92
  # listed in the corpus metadata.
93
+ # find or create their alignments and f0 tracking.
94
  # sentence should be provided lowercase without punctuation
95
+ # TODO something not fatal to interface if <10 --
96
+ # metadata file for SQ is already filtered.
97
+ # TODO handle audio that is not originally .wav
98
+ # not an issue for SQ
99
+ def get_samromur_queries(sentence, corpusdb, speech_dir, align_dir, align_model_path, f0_dir, reaper_path = "REAPER/build/reaper"):
100
  with open(corpusdb,'r') as handle:
101
  meta = handle.read().splitlines()
102
  meta = [l.split('\t') for l in meta[1:]]
103
 
104
+
105
  # column index 4 of db is normalised sentence text
106
+ meta = [l for l in meta if l[4] == sentence]
107
 
108
+ if len(meta) < 10:
109
+ if len(meta) < 1:
110
  print('This sentence does not exist in the corpus')
111
  else:
112
  print('Under 10 copies of the sentence: skipping.')
113
  return []
114
  else:
115
+ print(f'{len(meta)} recordings of sentence <{sentence}>')
116
+ #return meta
117
+
 
 
 
 
 
 
 
 
118
 
119
+ word_aligner = None
120
 
121
+ if not os.path.exists(align_dir):
 
 
 
 
 
 
 
122
  os.makedirs(align_dir)
123
+ if not os.path.exists(f0_dir):
124
+ os.makedirs(f0_dir)
 
 
126
 
 
 
 
 
 
127
  for rec in meta:
128
+ wpath = f'{speech_dir}{rec[2]}'
129
+ apath = align_dir + rec[2].replace('.wav','.tsv')
130
+ if not os.path.exists(apath):
131
+ word_aligner = align_file(wpath,apath, rec[4], word_aligner = word_aligner, model_path = align_model_path)
132
+
133
  fpath = f0_dir + rec[2].replace('.wav','.f0')
134
  if not os.path.exists(fpath):
 
 
 
 
 
 
 
 
135
  fpath = f0_dir + rec[2].replace('.wav','.f0')
136
+ f0_data = estimate_pitch(wpath, reaper_path)
137
  save_pitch(f0_data,fpath)
138
+
139
+
140
+ human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
141
+ return human_rec_ids
142
+
 
 
 
143
 
144
 
145
 
146
 
147
+ # check if the TTS wavs, alignments, f0 exist for this sentence
148
+ # if not, make them
149
+ def get_tts(sentence,voices,ttsdir,align_model_path,reaper_path = "REAPER/build/reaper"):
150
 
151
+ dpath = setup_tts_sent(sentence,ttsdir)
 
 
 
152
 
153
+
154
+ sample_paths = []
155
+
156
+ word_aligner = None
157
 
158
  for v in voices:
159
+ wpath = f'{dpath}/{v}.wav'
160
+ apath = f'{dpath}/{v}.tsv'
161
+ fpath = f'{dpath}/{v}.f0'
162
+
163
+ if not os.path.exists(wpath):
164
+ wf = tiro(sentence,v,save=f'{dpath}/')
 
165
 
166
+ if not os.path.exists(apath):
167
+ word_aligner = align_file(wpath, apath, snorm(sentence), word_aligner = word_aligner, model_path = align_model_path)
168
+
169
+
170
+ if not os.path.exists(fpath):
171
+ f0_data = estimate_pitch(wpath, reaper_path)
172
+ save_pitch(f0_data,fpath)
173
+
174
+ sample_paths.append(wpath)
175
 
 
 
176
 
177
+ # TEMP
178
+ # return for single last voice
179
+ temp_sample_path = wpath
180
+
181
+ return temp_sample_path, dpath
182
+
183
 
184
 
185
 
186
+ # find if dir for this sentence exists yet
187
+ # or make one, and record it.
188
+ # punctuation can affect synthesis
189
+ # so index by original sentence, not normed text
190
+ def setup_tts_sent(sentence,ttsdir,meta_path = 'tts_meta.tsv'):
 
 
191
 
192
+ if not os.path.exists(f'{ttsdir}'):
193
+ os.makedirs(f'{ttsdir}')
194
 
195
+ sentence = sentence.replace('\n',' ')
 
 
 
 
 
 
 
 
 
 
196
 
197
+ with open(f'{ttsdir}{meta_path}','a+') as handle:
198
+ tts_meta = handle.read().splitlines()
199
+ tts_meta = [l.split('\t') for l in tts_meta]
200
+
201
+ tts_meta = {sent:s_id for s_id,sent in tts_meta}
202
 
203
+ if sentence not in tts_meta.keys():
204
+ sent_id = sentence.replace(' ','_')[:33]
205
+ rand_id = ''.join(secrets.choice(string.ascii_lowercase + string.digits) for i in range(6))
206
+ while f'{sent_id}_{rand_id}' in tts_meta.values():
207
+ rand_id = ''.join(secrets.choice(string.ascii_lowercase + string.digits) for i in range(6))
208
+ sent_id = f'{sent_id}_{rand_id}'
209
+
210
+ handle.write(f'{sent_id}\t{sentence}\n')
211
+
212
+ else:
213
+ sent_id = tts_meta[sentence]
214
+
215
+ sent_dir = f'{ttsdir}{sent_id}'
216
+ if not os.path.exists(f'{sent_dir}'):
217
+ os.makedirs(f'{sent_dir}')
218
+ return sent_dir
219
+
220
 
221
 
222
 
 
240
  reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'
241
 
242
  norm_sentence = snorm(sentence)
 
 
 
 
 
243
 
244
+
245
+ human_rec_ids = get_samromur_queries(norm_sentence, corpus_meta, speech_dir, speech_aligns, align_model_path, speech_f0, reaper_path = reaper_exc)
246
 
247
  if voices:
248
+
249
+ one_audio_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path,reaper_path = reaper_exc)
 
250
 
251
+ voices = [voices[0]] # TODO. now limit one voice at a time.
252
+
253
+ score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
254
 
255
+
256
+
257
 
258
 
259
 
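The new setup_tts_sent() above indexes TTS output by the original punctuated sentence (punctuation can affect synthesis), mapping it to a directory name built from the first 33 characters of the sentence plus a random 6-character suffix, and records the mapping in tts_meta.tsv so a repeated sentence reuses its directory. A standalone illustration of the id scheme, not part of the commit (example sentence taken from the commented list in app.py; the suffix is random):

import secrets, string

sentence = 'Eru maríuhænur á Íslandi?'
slug = sentence.replace(' ', '_')[:33]
rand_id = ''.join(secrets.choice(string.ascii_lowercase + string.digits) for i in range(6))
print(f'{slug}_{rand_id}')   # e.g. Eru_maríuhænur_á_Íslandi?_x7k2qd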
scripts/tapi.py CHANGED
@@ -2,12 +2,11 @@ import json, os, requests, warnings, wave
 warnings.filterwarnings("ignore")
 
 
-
 # synthesise speech
-# save 16khz mono wav file
-# and word-level timestamps
-# return paths to wave and alignment files
-def tiro(text,voice,save='./'):
+# save 16khz mono wav file
+# return path to wave file
+# saving word alignment timestamps is deprecating
+def tiro(text,voice,save='./',tiroalign = False):
 
     # endpoint working 2023
     url = 'https://tts.tiro.is/v0/speech'
@@ -24,7 +23,21 @@ def tiro(text,voice,save='./'):
         "VoiceId": voice
     }
 
+    wname = save+voice+'.wav'
+    tts_data = requests.post(url, headers=headers, json=payload_tts, verify=False)
+
+    with wave.open(wname,'wb') as f:
+        f.setnchannels(1)
+        f.setframerate(16000)
+        f.setsampwidth(2)
+        f.writeframes(tts_data.content)
+
+
+
     # word time alignments
+    # SKIP
+    # tiro no longer intends to support this
+    # and only does support it for 2 voices anyway
     payload_aln = {
         "Engine": "standard",
         "LanguageCode": "is-IS",
@@ -33,37 +46,18 @@ def tiro(text,voice,save='./'):
         "Text": text,
         "VoiceId": voice
     }
-
-
-    tts_data = requests.post(url, headers=headers, json=payload_tts, verify=False)
-    aln_data = requests.post(url, headers=headers, json=payload_aln, verify=False)
-
-
-    #fname = save+text.replace(':','').replace('/','-')
-    #wname = fname+'.wav'
-    #aname = fname+'.json'
-    wname = save+voice+'.wav'
     aname = save+voice+'.json'
 
-    with wave.open(wname,'wb') as f:
-        f.setnchannels(1)
-        f.setframerate(16000)
-        f.setsampwidth(2)
-        f.writeframes(tts_data.content)
-
-    with open(aname,'w') as f:
-        f.write('{"alignments": [')
-        f.write(aln_data.content.decode().replace('}\n{','},\n {'))
-        f.write(']}')
-
-    return(os.path.abspath(wname),os.path.abspath(aname))
+    if tiroalign:
+        aln_data = requests.post(url, headers=headers, json=payload_aln, verify=False)
+        with open(aname,'w') as f:
+            f.write('{"alignments": [')
+            f.write(aln_data.content.decode().replace('}\n{','},\n {'))
+            f.write(']}')
 
 
+    #return(os.path.abspath(wname),os.path.abspath(aname))
+    return os.path.abspath(wname)
 
-
-    #sentence = "Hæ hæ hæ hæ! Ég heiti Gervimaður Finnland, en þú?"
-    #voice = "Alfur"
 
-    #wf, af = tiro(sentence,voice)
 
-    #print(wf, af)
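With Tiro's speech marks effectively dropped, the pipeline now pairs tiro() with the local CTC forced aligner exposed through scripts/runSQ.py. A hedged end-to-end sketch, not part of the commit (the output directory is a placeholder, and align_file() loads the wav2vec2 alignment model on first use, which can take some time):

import os
from scripts.tapi import tiro
from scripts.runSQ import align_file, snorm

sentence = 'Hæ hæ hæ hæ! Ég heiti Gervimaður Finnland, en þú?'   # example from the old tapi.py comments
os.makedirs('./tts_out/', exist_ok=True)

wav_path = tiro(sentence, 'Bjartur', save='./tts_out/')                   # 16 kHz mono wav
align_file(wav_path, wav_path.replace('.wav', '.tsv'), snorm(sentence))   # word-level .tsv alignment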