catiR committed on
Commit 1095ae0
1 Parent(s): 307bcfb

appearance,tabs

Files changed (5)
  1. README.md +3 -3
  2. app.py +101 -40
  3. requirements.txt +1 -0
  4. scripts/clusterprosody.py +143 -110
  5. scripts/runSQ.py +6 -7
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
  title: Prosody clustering and evaluation
  emoji: ⚡
- colorFrom: pink
- colorTo: pink
  sdk: gradio
- sdk_version: 3.47.1
  app_file: app.py
  pinned: false
  ---

  ---
  title: Prosody clustering and evaluation
  emoji: ⚡
+ colorFrom: blue
+ colorTo: indigo
  sdk: gradio
+ sdk_version: 4.7.1
  app_file: app.py
  pinned: false
  ---
app.py CHANGED
@@ -2,18 +2,21 @@ import gradio as gr
  import subprocess, os
  import scripts.runSQ

-
  #https://huggingface.co/spaces/clr/prosalign/blob/main/app.py


  def setup():
  r0 = subprocess.run(["pwd"], capture_output=True, text=True)
  print('PWD::', r0.stdout)
- r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"], capture_output=True, text=True)
- print(r1.stdout)
- subprocess.run(["unzip", "./master.zip"])
  subprocess.run(["mv", "REAPER-master", "REAPER"])
- subprocess.run(["rm", "./master.zip"])
  os.chdir('./REAPER')
  subprocess.run(["mkdir", "build"])
  os.chdir('./build')
@@ -31,12 +34,6 @@ print('about to setup')
  setup()


- def f1(voices, sent, indices):
- #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
- tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html = scripts.runSQ.run(sent, [voices], indices)
- score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
- return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html)
-

  def label_indices(sentence):
  sentence = scripts.runSQ.snorm(sentence)
@@ -44,51 +41,112 @@ def label_indices(sentence):
  labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
  return labelled

-

  temp_sentences = scripts.runSQ.create_temp_sent_list()

  bl = gr.Blocks()
  with bl:

-
- #temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']

- voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa', 'Karl', 'Dora']

-
- #with gr.Row():
- #with gr.Column(scale=4):
- temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
- #voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value='Alfur')

- marked_sentence = gr.HighlightedText(interactive=False,label="Word selection key",color_map = {str(i):"#dcfce7" for i in range(333)})

- with gr.Row():
- spanselect = gr.Textbox(value='1-3',label="Select words",info='Enter the index of the word(s) to analyse, according to the key above. It can be a single word: 4 or a span of words separated by a dash: 2-3')
- voiceselect = gr.Radio(voices, label="TTS voice",value='Alfur_v2')

- #with gr.Column(scale=1):
- temp_button = gr.Button(value="Run with selected options")
-
-
- tts_output = gr.Audio(interactive=False)
- report_score = gr.Markdown('Difference from TTS to real speech:')

  with gr.Tabs():
  with gr.TabItem("Pitch"):

- pl1 = gr.Plot()
- with gr.Row():
- pl2 = gr.Plot()
- pl3 = gr.Plot()

  with gr.TabItem("Energy"):

- pl4 = gr.Plot()
- with gr.Row():
- pl5 = gr.Plot()
- pl6 = gr.Plot()

  with gr.TabItem("Audio"):

@@ -96,9 +154,12 @@ with bl:


-
  temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
- temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6,play])


  if __name__ == "__main__":
 
  import subprocess, os
  import scripts.runSQ

  #https://huggingface.co/spaces/clr/prosalign/blob/main/app.py


  def setup():
  r0 = subprocess.run(["pwd"], capture_output=True, text=True)
  print('PWD::', r0.stdout)
+ #r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/REAPER-master.zip", "-O", "./master.zip"], capture_output=True, text=True)
+ #print(r1.stdout)
+ r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True)
+ print('LS::', r9x.stdout)
+
+ subprocess.run(["unzip", "./REAPER-master.zip"])
+ subprocess.run(["rm", "./REAPER-master.zip"])
  subprocess.run(["mv", "REAPER-master", "REAPER"])
+
  os.chdir('./REAPER')
  subprocess.run(["mkdir", "build"])
  os.chdir('./build')

  setup()


  def label_indices(sentence):
  sentence = scripts.runSQ.snorm(sentence)

  labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
  return labelled

+
+ #gradio states dont like dicts
+ def d2l(d):
+ return [(k,v) for k,v in d.items()]
+ def l2d(l):
+ return {k:v for k,v in l}

  temp_sentences = scripts.runSQ.create_temp_sent_list()

  bl = gr.Blocks()
  with bl:

+ voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

+ with gr.Tabs():
+
+ with gr.TabItem("Options"):
+ temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
+ marked_sentence = gr.HighlightedText(interactive=False,label="Word selection key",color_map = {str(i):"#dcfce7" for i in range(333)})

+ with gr.Row():
+ spanselect = gr.Textbox(value='1-3',label="Select words",info='Enter the index of the word(s) to analyse, according to the key above. It can be a single word: 4 or a span of words separated by a dash: 2-3')
+ #voiceselect = gr.Radio(voices, label="TTS voice",value='Alfur_v2')
+ voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value=['Dilja_v2','Alfur_v2'])
+
+ #with gr.Column(scale=1):
+ temp_button = gr.Button(value="Run with selected options")
+
+ with gr.TabItem("About"):
+ docu = gr.Markdown("""
+ # Multi-target prosody evaluation
+ ### 1. Choose a sentence - they are from Samrómur Queries
+ ### 2. The words will be numbered by position - type the number or range you want to evaluate
+ ### 3. Choose a TTS voice - they come from Tiro's API https://tiro.is/talgerving
+ ### 4. Run
+
+ The evaluation automatically clusters human speakers according to prosodic features,
+ and then measures how different the synthesised speech is from each natural cluster.
+ Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence.
+ Close match to one cluster shows what prosodic act TTS might have achieved, in the selected words.
+ TTS whose prosody does not match any cluster might sound unnatural.
+
+ TTS output includes generated audio, pitch, energy, and scores for each cluster.
+ Output is only shown for the selected voice(s).
+ Below, human data shows pitch and energy of each cluster, along with original audio.
+
+ TTS often takes over 30 seconds per sentence/voice.
+ After you have done it once, re-running different word spans for the same sentence/voice is much faster.
+
+ See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
+ regarding multi-target prosody scoring. This version extends the implementation by Magnús Freyr Morthens
+ supported by Rannís student innovation fund.
+ """)
+
+ ttstabs = {v:{} for v in voices}
+ with gr.Tabs():
+ for v in voices:
+ with gr.TabItem(v):
+ ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
+ with gr.Row():
+ ttstabs[v]['ptts'] = gr.Plot()
+ ttstabs[v]['etts'] = gr.Plot()
+ ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')
+ #tts_output = gr.Audio(interactive=False)
+ #with gr.Row():
+ # ptts = gr.Plot()
+ # etts = gr.Plot()
+ #report_score = gr.Markdown('Difference from TTS to real speech:')
+
+ # cant store ttstabs in gradio state, use here
+ def f1(voices, sent, indices):
+ #tts_audio, tts_score, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts = scripts.runSQ.run(sent, [voices], indices)
+ f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, tts_results = scripts.runSQ.run(sent, voices, indices)
+ outputs = {pc0: f0_fig_0, pc1: f0_fig_1, pc2: f0_fig_2, ec0: en_fig_0, ec1: en_fig_1, ec2: en_fig_2, play: audio_html}
+
+ for v in voices:
+ outputs[ttstabs[v]['tts_output']] = tts_results[v]['audio']
+ outputs[ttstabs[v]['ptts']] = tts_results[v]['f0_fig_tts']
+ outputs[ttstabs[v]['etts']] = tts_results[v]['en_fig_tts']
+ outputs[ttstabs[v]['scorearea']] = tts_results[v]['scoreinfo']
+
+ clear = [v for v in ttstabs.keys() if v not in voices]
+ for v in clear:
+ outputs[ttstabs[v]['tts_output']] = None
+ outputs[ttstabs[v]['ptts']] = None
+ outputs[ttstabs[v]['etts']] = None
+ outputs[ttstabs[v]['scorearea']] = f'TTS results for **{v}** will appear here'
+
+ return outputs #(tts_audio, score_report, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts)


  with gr.Tabs():
  with gr.TabItem("Pitch"):

+ pc0 = gr.Plot()
+ pc1 = gr.Plot()
+ pc2 = gr.Plot()

  with gr.TabItem("Energy"):

+ ec0 = gr.Plot()
+ ec1 = gr.Plot()
+ ec2 = gr.Plot()

  with gr.TabItem("Audio"):


  temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
+ outputs_list = [pc0,pc1,pc2,ec0,ec1,ec2,play]
+ for v in voices:
+ outputs_list += [ttstabs[v]['tts_output'], ttstabs[v]['ptts'],ttstabs[v]['etts'],ttstabs[v]['scorearea']]
+ temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],outputs_list)
+ #[tts_output,report_score,pc0,pc1,pc2,ec0,ec1,ec2,play,ptts,etts])


  if __name__ == "__main__":
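
The new f1 handler relies on Gradio's dict-style returns: when a click callback returns a dictionary keyed by output components, only those components are updated, which is what lets the tabs of unselected voices be reset to None. Below is a minimal, stand-alone sketch of that pattern; the component and function names are illustrative, not taken from this repo.

```python
import gradio as gr

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    btn = gr.Button("Run")
    greeting = gr.Markdown()
    length = gr.Markdown()

    def handler(n):
        # returning a dict keyed by output components (instead of a tuple)
        # lets the callback decide per component what to update
        return {greeting: f"Hello **{n}**",
                length: f"{len(n)} characters"}

    # outputs still lists every component the handler may touch
    btn.click(handler, inputs=[name], outputs=[greeting, length])

if __name__ == "__main__":
    demo.launch()
```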
requirements.txt CHANGED
@@ -6,4 +6,5 @@ scipy
  dtw-python
  scikit-learn_extra
  pydub

  dtw-python
  scikit-learn_extra
  pydub
+ colorcet
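
The new colorcet dependency supplies the per-speaker line colours in scripts/clusterprosody.py, where evenly spaced entries are drawn from the cyclic CET_C8s and CET_C9s palettes. A rough sketch of that pattern; the helper name here is mine, not from the repo.

```python
import numpy as np
import colorcet as clc   # each palette is a plain list of 256 hex colour strings

def spaced_colors(palette, n):
    # pick n roughly evenly spaced colours from a colorcet palette,
    # so each speaker in a cluster gets a distinct hue
    idx = np.linspace(0, len(palette) - 1, n).astype(int)
    return [palette[i] for i in idx]

pitch_colors  = spaced_colors(clc.CET_C8s, 12)   # used for the pitch plots
energy_colors = spaced_colors(clc.CET_C9s, 12)   # used for the energy plots
```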
 
scripts/clusterprosody.py CHANGED
@@ -3,6 +3,7 @@ import matplotlib
  matplotlib.use('Agg')
  import matplotlib.pyplot as plt
  import soundfile as sf
  from collections import defaultdict
  from dtw import dtw
  from sklearn_extra.cluster import KMedoids
@@ -203,51 +204,28 @@ def kmedoids_clustering(X):
  return y_km, kmedoids


-
  def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):

-
- tts_info = []
  for label in set([c for r,c in clusters]):
  recs = [r for r,c in clusters if c==label]
  dists = []
  for rec in recs:
- key = f'{words}**{rec}'
- dists.append(dtw_distance(tts_data, speech_data[key]))
- tts_info.append((label,np.nanmean(dists)))

- tts_info = sorted(tts_info,key = lambda x: x[1])
- best_cluster = tts_info[0][0]
- best_cluster_score = tts_info[0][1]
-
- matched_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==best_cluster}

- # now do graphs of matched_data with tts_data
- # and report best_cluster_score
-
- mid_cluster = tts_info[1][0]
- mid_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==mid_cluster}
- bad_cluster = tts_info[2][0]
- bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
-
- #tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
- tts_fig_p, best_cc = plot_one_cluster(words,'pitch',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
- fig_mid_p, mid_cc = plot_one_cluster(words,'pitch',mid_data,seg_aligns,mid_cluster)
- fig_bad_p, bad_cc = plot_one_cluster(words,'pitch',bad_data,seg_aligns,bad_cluster)

- tts_fig_e, _ = plot_one_cluster(words,'rmse',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
- fig_mid_e, _ = plot_one_cluster(words,'rmse',mid_data,seg_aligns,mid_cluster)
- fig_bad_e, _ = plot_one_cluster(words,'rmse',bad_data,seg_aligns,bad_cluster)
-
-
- # TODO
- # not necessarily here, bc paths to audio files.
- spk_cc_map = [('Best',best_cluster,best_cc), ('Mid',mid_cluster,mid_cc), ('Last',bad_cluster,bad_cc)]
- #playable = audio_htmls(spk_cc_map)
-
- return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
-


  def gp(d,s,x):
@@ -261,7 +239,6 @@ def gen_h_paths(wdir,adir,f0dir,pldir,spks):
  plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0'), 'play': gp(pldir,s,'wav')}) for s in spks]
  return plist

-
  # since clustering strictly operates on X,
  # once reduce a duration metric down to pair-distances,
  # it no longer matters that duration and pitch/energy had different dimensionality
@@ -289,30 +266,38 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_p
  X = np.array(X)

  y_km, kmedoids = kmedoids_clustering(X)
- #plot_clusters(X, y_km, words)
- #c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
-
  result = zip(X, kmedoids.labels_)
  groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]

  tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
  _, tts_data, tts_seg_aligns, _, _ = get_data(norm_sent,tts_all_paths,start_end_word_index)
-
  for v in voices:
- voice_data = tts_data[f"{words}**{v}"]
- voice_align = tts_seg_aligns[f"{words}**{v}"]
-
- #tts_data, tts_align = get_one_tts_data(tts_sent_dir,v,norm_sent,start_end_word_index)
-
  # match the data with a cluster -----
- best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
-
-
- audio_html = clusters_audio(groups,h_playable)
-
- # only supports one voice at a time currently
- return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html
  #return words, kmedoids_cluster_dists, group


@@ -320,7 +305,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_p

  # generate html panel to play audios for each human cluster
  # audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
- def clusters_audio(clusters,audios):

  html = '''<html><body>'''

@@ -330,17 +315,19 @@ def clusters_audio(clusters,audios):
  html += '<div>'
  html += f'<h2>Cluster {label}</h2>'

- html += '<div>'
  html += '<table><tbody>'

  for rec in recs:
  html += f'<tr><td><audio controls id="{rec}">' #width="20%">

  html += f'<source src="{audios[rec][0]}#t={audios[rec][1]:.2f},{audios[rec][2]:.2f}" type="audio/wav">'
  #html += f'<source src="{audios[rec][0]}" type="audio/wav">'

  html += '</audio></td>'
- html += f'<td>{rec}</td></tr>'

  html += '</tbody></table>'
  html += '</div>'
@@ -352,9 +339,8 @@ def clusters_audio(clusters,audios):
  return html


-
  # find offsets to visually align start of each word for speakers in cluster
- def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
  words = words.split('_')

  retimes = [(words[0], 0.0)]
@@ -392,81 +378,128 @@ def retime_xs_feats(retimes, speaker_aligns, speaker_xvals, feats):
  xf.append((x,f))
  return [x for x,f in xf], [f for x,f in xf]


- def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=None,tts_align=None,voice=None):
- #(speech_data, tts_data, tts_align, words, seg_aligns, cluster_id, voice):
- colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
- cc = 0
- spk_ccs = [] # for external display
- fig = plt.figure(figsize=(10, 5))

  if feature.lower() in ['pitch','f0']:
  fname = 'Pitch'
- ffunc = lambda x: [p for p,e in x]
- pfunc = plt.scatter
  elif feature.lower() in ['energy', 'rmse']:
  fname = 'Energy'
  ffunc = lambda x: [e for p,e in x]
  pfunc = plt.plot
  else:
  print('problem with the figure')
  return fig, []
-
-
- # boundary for start of each word
- retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns,tts_align)
- if len(retimes)>1:
- for w,bound_line in retimes:
- plt.axvline(x=bound_line, color="gray", linestyle='--', linewidth=1, label=f'Start "{w}"')
-
- plt.title(f"{words} - {fname} - Cluster {cluster_id}")
-
- for k,v in speech_data.items():

- spk = k.split('**')[1]
- word_times = seg_aligns[k]

- feats = ffunc(v)
- # datapoint interval is 0.005 seconds
- feat_xvals = [x*0.005 for x in range(len(feats))]

- feat_xvals, feats = retime_xs_feats(retimes,word_times,feat_xvals,feats)
- pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")

- #feat_xvals = retime_speaker_xvals(retimes, word_times, feat_xvals)
- #for w, st in reversed(retimes):
- # w_xvals = [x for x in feat_xvals if x>= st]
- # w_feats = feats[-(len(w_xvals)):]
- # pfunc(w_xvals, w_feats, color=colors[cc])
- # feat_xvals = feat_xvals[:-(len(w_xvals))]
- # feats = feats[:-(len(w_xvals))]
-

- spk_ccs.append((spk,colors[cc]))
- cc += 1
- if cc >= len(colors):
- cc=0

- if voice:
- tfeats = ffunc(tts_data)
- t_xvals = [x*0.005 for x in range(len(tfeats))]
-
- t_xvals, tfeats = retime_xs_feats(retimes, tts_align, t_xvals, tfeats)
- pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")

- #t_xvals = retime_speaker_xvals(retimes, tts_align, t_xvals)
- #for w, st in reversed(retimes):
- # tw_xvals = [x for x in t_xvals if x>= st]
- # tw_feats = tfeats[-(len(tw_xvals)):]
- # pfunc(tw_xvals, tw_feats, color="black")
- # t_xvals = t_xvals[:-(len(tw_xvals))]
- # tfeats = tfeats[:-(len(tw_xvals))]
-
  #plt.legend()
  #plt.show()

-
  return fig, spk_ccs

 
  matplotlib.use('Agg')
  import matplotlib.pyplot as plt
  import soundfile as sf
+ import colorcet as clc
  from collections import defaultdict
  from dtw import dtw
  from sklearn_extra.cluster import KMedoids

  return y_km, kmedoids


  def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):

+ tts_info = defaultdict(list)
+
  for label in set([c for r,c in clusters]):
  recs = [r for r,c in clusters if c==label]
  dists = []
  for rec in recs:
+ dists.append(dtw_distance(tts_data[f'{words}**{voice}'], speech_data[f'{words}**{rec}']))
+ tts_info[voice].append((label,np.nanmean(dists)))

+ #tts_info[voice] = sorted(tts_info[voice],key = lambda x: x[1])
+ #best_cluster = tts_info[voice][0][0]
+ #best_cluster_score = tts_info[voice][0][1]

+ #tts_pldat = {f'{words}**{voice}': tts_data}

+ f0_fig_tts, _ = plot_one_cluster(words,'pitch',tts_data,tts_align,0,['#c97eb7'],gtype='tts',voice=voice)
+ en_fig_tts, _ = plot_one_cluster(words,'energy',tts_data,tts_align,0,['#9276d9'],gtype='tts',voice=voice)

+ return tts_info[voice], f0_fig_tts, en_fig_tts
+


  def gp(d,s,x):

  plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0'), 'play': gp(pldir,s,'wav')}) for s in spks]
  return plist

  # since clustering strictly operates on X,
  # once reduce a duration metric down to pair-distances,
  # it no longer matters that duration and pitch/energy had different dimensionality

  X = np.array(X)

  y_km, kmedoids = kmedoids_clustering(X)
  result = zip(X, kmedoids.labels_)
  groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]

+ f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, spk_cc_map = graph_humans(groups,h_data,words,h_seg_aligns)
+ audio_html = clusters_audio(groups,spk_cc_map,h_playable)
+
+
  tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
  _, tts_data, tts_seg_aligns, _, _ = get_data(norm_sent,tts_all_paths,start_end_word_index)
+
+ tts_results = defaultdict(dict)
  for v in voices:
+ #voice_data = tts_data[f"{words}**{v}"]
+ #voice_align = tts_seg_aligns[f"{words}**{v}"]
+
  # match the data with a cluster -----
+ cluster_scores, f0_fig_tts, en_fig_tts = match_tts(groups, h_data, tts_data, tts_seg_aligns, words, h_seg_aligns, v)
+ best_cluster = [c for c,s in cluster_scores if s == min([s for c,s in cluster_scores])]
+ scorestring = []
+ for c,s in cluster_scores:
+ if c== best_cluster:
+ scorestring.append(f' **Cluster {c}: {round(s,2)}** ')
+ else:
+ scorestring.append(f' Cluster {c}: {round(s,2)} ')
+ scorestring = ' - '.join(scorestring)
+
+ audiosample = [pdict['play'] for voic, pdict in tts_all_paths if voic == v][0]
+
+ tts_results[v] = {'audio': audiosample, 'f0_fig_tts': f0_fig_tts, 'en_fig_tts':en_fig_tts, 'scoreinfo': scorestring}
+
+ return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, audio_html, tts_results
  #return words, kmedoids_cluster_dists, group



  # generate html panel to play audios for each human cluster
  # audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
+ def clusters_audio(clusters,colormap,audios):

  html = '''<html><body>'''

  html += '<div>'
  html += f'<h2>Cluster {label}</h2>'

+ html += '<div style="font-size:130%;">'
  html += '<table><tbody>'

  for rec in recs:
+ cc = colormap[label][rec]
+
  html += f'<tr><td><audio controls id="{rec}">' #width="20%">

  html += f'<source src="{audios[rec][0]}#t={audios[rec][1]:.2f},{audios[rec][2]:.2f}" type="audio/wav">'
  #html += f'<source src="{audios[rec][0]}" type="audio/wav">'

  html += '</audio></td>'
+ html += f'<td style="color:{cc};">{rec}</td></tr>'

  html += '</tbody></table>'
  html += '</div>'

  return html


  # find offsets to visually align start of each word for speakers in cluster
+ def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align=None):
  words = words.split('_')

  retimes = [(words[0], 0.0)]

  xf.append((x,f))
  return [x for x,f in xf], [f for x,f in xf]

+
+
+ # TODO handle the ccmap in here not inside plot_one
+ def graph_humans(clusters,speech_data,words,seg_aligns):
+ c0,c1,c2 = (0,1,2)
+ nsents = len(speech_data)
+
+ c0_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c0}
+ c1_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c1}
+ c2_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c2}
+
+ colors = [(pc,ec) for pc,ec in zip(clc.CET_C8s,clc.CET_C9s)]
+ cix = [int(x) for x in np.linspace(0,len(colors)-1, nsents)]
+ pcolors = [colors[x][0] for x in cix]
+ ecolors= [colors[x][1] for x in cix]
+
+ f0_fig_c0, c0_cc = plot_one_cluster(words,'pitch',c0_data,seg_aligns,c0,pcolors)
+ f0_fig_c1, c1_cc= plot_one_cluster(words,'pitch',c1_data,seg_aligns,c1,pcolors[len(c0_data):])
+ f0_fig_c2, c2_cc = plot_one_cluster(words,'pitch',c2_data,seg_aligns,c2,pcolors[len(c0_data)+len(c1_data):])
+
+ en_fig_c0, _ = plot_one_cluster(words,'rmse',c0_data,seg_aligns,c0,ecolors)
+ en_fig_c1, _ = plot_one_cluster(words,'rmse',c1_data,seg_aligns,c1,ecolors[len(c0_data):])
+ en_fig_c2, _ = plot_one_cluster(words,'rmse',c2_data,seg_aligns,c2,ecolors[len(c0_data)+len(c1_data):])
+
+ # TODO
+ # not necessarily here, bc paths to audio files.
+ spk_cc_map = {c0 : c0_cc, c1 : c1_cc, c2 : c2_cc}
+ #playable = audio_htmls(spk_cc_map)
+
+ return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, spk_cc_map


+
+ #TODO handle the colour list OUTSIDE of this part....
+ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,colors,gtype='cluster',voice=None):
+
+ cc=0
+ gclr = "#909090"
+ spk_ccs = {} # for external display
+
+
+ #fig = plt.figure(figsize=(10, 5))
+ if voice:
+ fig, ax = plt.subplots(figsize=(7.5,4))
+ else:
+ fig, ax = plt.subplots(figsize=(10,5))
+ fig.patch.set_facecolor('none')
+ ax.patch.set_facecolor('none')
+ fig.patch.set_alpha(0)
+ ax.tick_params(color=gclr,labelcolor=gclr)
+ for spine in ['bottom','left']:
+ ax.spines[spine].set_color(gclr)
+ for spine in ['top','right']:
+ ax.spines[spine].set(visible=False)
+

  if feature.lower() in ['pitch','f0']:
  fname = 'Pitch'
+ def _ffunc(feats):
+ ps = [p for p,e in feats]
+ nv = min(ps)
+ ps = [np.nan if p == nv else p for p in ps]
+ return ps
+ ffunc = _ffunc
+ pfunc = plt.plot
+ ylab = "Mean-variance normalised F0"
  elif feature.lower() in ['energy', 'rmse']:
  fname = 'Energy'
  ffunc = lambda x: [e for p,e in x]
  pfunc = plt.plot
+ ylab = "Mean-variance normalised energy"
  else:
  print('problem with the figure')
  return fig, []

+
+ if gtype == 'cluster':
+ # boundary for start of each word
+ retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns)#,tts_align)
+ plt.title(f"{words} - {fname} - Cluster {cluster_id}", color=gclr, fontsize=16)
+ xmax = 0

+ for k,v in speech_data.items():

+ spk = k.split('**')[1]
+ word_times = seg_aligns[k]

+ feats = ffunc(v)
+ # datapoint interval is 0.005 seconds
+ feat_xvals = [x*0.005 for x in range(len(feats))]
+
+ feat_xvals, feats = retime_xs_feats(retimes,word_times,feat_xvals,feats)
+ pfunc(feat_xvals, feats, color=colors[cc], linewidth=2, label=f"Speaker {spk}")
+
+ xmax = max(xmax,max(feat_xvals))
+ spk_ccs[spk] = colors[cc]
+ cc += 1
+ if cc >= len(colors):
+ cc=0
+
+ elif gtype == 'tts':
+ # boundary for start of each word
+ retimes = reset_cluster_times(words,[f'{words}**{voice}'],seg_aligns)
+ word_times = seg_aligns[f'{words}**{voice}']
+ tfeats = ffunc(speech_data[f'{words}**{voice}'])
+ t_xvals = [x*0.005 for x in range(len(tfeats))]
+ t_xvals, tfeats = retime_xs_feats(retimes, word_times, t_xvals, tfeats)
+ pfunc(t_xvals, tfeats, color=colors[cc], label=f"TTS {voice}")
+ plt.title(f"{fname}", color=gclr, fontsize=14)
+ xmax = max(t_xvals)

+ if len(retimes)>1:
+ for w,bound_line in retimes:
+ plt.axvline(x=bound_line, color=gclr, linestyle='--', linewidth=1, label=f'Start "{w}"')
+ plt.xlim([0, xmax])
+ ax.set_xlabel("Time --->",fontsize=13,color=gclr)
+ ax.set_ylabel(ylab,fontsize=13,color=gclr)

  #plt.legend()
  #plt.show()

  return fig, spk_ccs
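
The scoring in match_tts follows the multi-target idea cited in the About tab (Cheng, ISCA 2011): k-medoids groups the human recordings into three prosodic clusters, and for each cluster the TTS gets the mean DTW distance between its feature sequence and the cluster's recordings; the lowest mean marks the best-matching cluster. The repo's own dtw_distance helper and feature extraction are not shown in this diff, so the sketch below is only a rough stand-alone version of that scoring loop, assuming each item is already a per-frame (pitch, energy) array.

```python
import numpy as np
from dtw import dtw                      # dtw-python
from sklearn_extra.cluster import KMedoids

def dtw_dist(a, b):
    # normalised DTW distance between two (n_frames, n_feats) arrays;
    # stand-in for the repo's dtw_distance helper, which this diff does not show
    return dtw(a, b).normalizedDistance

def score_tts_against_clusters(human_feats, tts_feats, n_clusters=3):
    # human_feats: dict {rec_id: np.ndarray}; tts_feats: np.ndarray for one voice
    ids = list(human_feats)
    # a pairwise-distance matrix lets KMedoids cluster variable-length sequences
    D = np.array([[dtw_dist(human_feats[a], human_feats[b]) for b in ids] for a in ids])
    labels = KMedoids(n_clusters=n_clusters, metric='precomputed',
                      random_state=0).fit_predict(D)
    # mean DTW distance from the TTS utterance to each human cluster
    scores = {c: np.nanmean([dtw_dist(tts_feats, human_feats[r])
                             for r, l in zip(ids, labels) if l == c])
              for c in range(n_clusters)}
    best = min(scores, key=scores.get)   # lowest mean distance = closest cluster
    return scores, best
```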
 
scripts/runSQ.py CHANGED
@@ -38,13 +38,13 @@ def run(sentence, voices, start_end_word_ix):
  if voices:
  temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)

- voices = [voices[0]] # TODO. now limit one voice at a time.
- score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
-
  # also stop forgetting duration.

- return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html
-


@@ -284,8 +284,7 @@ def localtest():

  voices = [voices[0]] # TODO. now limit one voice at a time.

- score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
-


  #localtest()

  if voices:
  temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)

+ #voices = [voices[0]] # TODO. now limit one voice at a time.
+ #score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
+ f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, tts_results = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
  # also stop forgetting duration.

+ #return temp_tts_sample, score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts
+ return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, tts_results



  voices = [voices[0]] # TODO. now limit one voice at a time.

+ score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)


  #localtest()
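
For reference when reading the app.py changes above, cl.cluster() and runSQ.run() now hand back one tts_results entry per requested voice, which f1 spreads across that voice's tab. A sketch of the shape implied by the diff; the values below are placeholders, not real outputs.

```python
# per-voice results dict returned by cl.cluster(...) / scripts.runSQ.run(...),
# as consumed by f1 in app.py; values here are placeholders
tts_results = {
    'Alfur_v2': {
        'audio': 'path/to/tts_playable_segment.wav',  # fed to that voice's gr.Audio
        'f0_fig_tts': None,   # matplotlib Figure for the pitch gr.Plot
        'en_fig_tts': None,   # matplotlib Figure for the energy gr.Plot
        'scoreinfo': ' **Cluster 0: 1.23**  -  Cluster 1: 2.34  -  Cluster 2: 3.45',
    },
}
```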