catiR committed · Commit 1095ae0 · Parent: 307bcfb

appearance, tabs

Browse files:
- README.md +3 -3
- app.py +101 -40
- requirements.txt +1 -0
- scripts/clusterprosody.py +143 -110
- scripts/runSQ.py +6 -7
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
 title: Prosody clustering and evaluation
 emoji: ⚡
-colorFrom:
-colorTo:
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 4.7.1
 app_file: app.py
 pinned: false
 ---
app.py
CHANGED
@@ -2,18 +2,21 @@ import gradio as gr
 import subprocess, os
 import scripts.runSQ

-
 #https://huggingface.co/spaces/clr/prosalign/blob/main/app.py


 def setup():
     r0 = subprocess.run(["pwd"], capture_output=True, text=True)
     print('PWD::', r0.stdout)
-    r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"], capture_output=True, text=True)
-    print(r1.stdout)
-    subprocess.run(["unzip", "./master.zip"])
+    #r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/REAPER-master.zip", "-O", "./master.zip"], capture_output=True, text=True)
+    #print(r1.stdout)
+    r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True)
+    print('LS::', r9x.stdout)
+
+    subprocess.run(["unzip", "./REAPER-master.zip"])
+    subprocess.run(["rm", "./REAPER-master.zip"])
     subprocess.run(["mv", "REAPER-master", "REAPER"])

     os.chdir('./REAPER')
     subprocess.run(["mkdir", "build"])
     os.chdir('./build')
@@ -31,12 +34,6 @@ print('about to setup')
 setup()


-def f1(voices, sent, indices):
-    #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
-    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html = scripts.runSQ.run(sent, [voices], indices)
-    score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html)
-

 def label_indices(sentence):
     sentence = scripts.runSQ.snorm(sentence)
@@ -44,51 +41,112 @@ def label_indices(sentence):
     labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
     return labelled

+
+#gradio states dont like dicts
+def d2l(d):
+    return [(k,v) for k,v in d.items()]
+def l2d(l):
+    return {k:v for k,v in l}

 temp_sentences = scripts.runSQ.create_temp_sent_list()

 bl = gr.Blocks()
 with bl:

-    #temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
-
-    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']
+    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

-    #with gr.Row():
-        #with gr.Column(scale=4):
-    temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
-    #voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value='Alfur')
-    …
-    #with gr.Column(scale=1):
-    temp_button = gr.Button(value="Run with selected options")
-
-    tts_output = gr.Audio(interactive=False)
-    report_score = gr.Markdown('Difference from TTS to real speech:')
+    with gr.Tabs():
+
+        with gr.TabItem("Options"):
+            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
+            marked_sentence = gr.HighlightedText(interactive=False,label="Word selection key",color_map = {str(i):"#dcfce7" for i in range(333)})
+
+            with gr.Row():
+                spanselect = gr.Textbox(value='1-3',label="Select words",info='Enter the index of the word(s) to analyse, according to the key above. It can be a single word: 4 or a span of words separated by a dash: 2-3')
+                #voiceselect = gr.Radio(voices, label="TTS voice",value='Alfur_v2')
+                voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value=['Dilja_v2','Alfur_v2'])
+
+            #with gr.Column(scale=1):
+            temp_button = gr.Button(value="Run with selected options")
+
+        with gr.TabItem("About"):
+            docu = gr.Markdown("""
+# Multi-target prosody evaluation
+### 1. Choose a sentence - they are from Samrómur Queries
+### 2. The words will be numbered by position - type the number or range you want to evaluate
+### 3. Choose a TTS voice - they come from Tiro's API https://tiro.is/talgerving
+### 4. Run
+
+The evaluation automatically clusters human speakers according to prosodic features,
+and then measures how different the synthesised speech is from each natural cluster.
+Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence.
+Close match to one cluster shows what prosodic act TTS might have achieved, in the selected words.
+TTS whose prosody does not match any cluster might sound unnatural.
+
+TTS output includes generated audio, pitch, energy, and scores for each cluster.
+Output is only shown for the selected voice(s).
+Below, human data shows pitch and energy of each cluster, along with original audio.
+
+TTS often takes over 30 seconds per sentence/voice.
+After you have done it once, re-running different word spans for the same sentence/voice is much faster.
+
+See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
+regarding multi-target prosody scoring. This version extends the implementation by Magnús Freyr Morthens
+supported by Rannís student innovation fund.
+""")
+
+    ttstabs = {v:{} for v in voices}
+    with gr.Tabs():
+        for v in voices:
+            with gr.TabItem(v):
+                ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
+                with gr.Row():
+                    ttstabs[v]['ptts'] = gr.Plot()
+                    ttstabs[v]['etts'] = gr.Plot()
+                ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')
+    #tts_output = gr.Audio(interactive=False)
+    #with gr.Row():
+    #    ptts = gr.Plot()
+    #    etts = gr.Plot()
+    #report_score = gr.Markdown('Difference from TTS to real speech:')
+
+    # cant store ttstabs in gradio state, use here
+    def f1(voices, sent, indices):
+        #tts_audio, tts_score, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts = scripts.runSQ.run(sent, [voices], indices)
+        f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, tts_results = scripts.runSQ.run(sent, voices, indices)
+        outputs = {pc0: f0_fig_0, pc1: f0_fig_1, pc2: f0_fig_2, ec0: en_fig_0, ec1: en_fig_1, ec2: en_fig_2, play: audio_html}

+        for v in voices:
+            outputs[ttstabs[v]['tts_output']] = tts_results[v]['audio']
+            outputs[ttstabs[v]['ptts']] = tts_results[v]['f0_fig_tts']
+            outputs[ttstabs[v]['etts']] = tts_results[v]['en_fig_tts']
+            outputs[ttstabs[v]['scorearea']] = tts_results[v]['scoreinfo']
+
+        clear = [v for v in ttstabs.keys() if v not in voices]
+        for v in clear:
+            outputs[ttstabs[v]['tts_output']] = None
+            outputs[ttstabs[v]['ptts']] = None
+            outputs[ttstabs[v]['etts']] = None
+            outputs[ttstabs[v]['scorearea']] = f'TTS results for **{v}** will appear here'
+
+        return outputs #(tts_audio, score_report, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts)

     with gr.Tabs():
         with gr.TabItem("Pitch"):

-            …
-            pl3 = gr.Plot()
+            pc0 = gr.Plot()
+            pc1 = gr.Plot()
+            pc2 = gr.Plot()

         with gr.TabItem("Energy"):

-            …
-            pl6 = gr.Plot()
+            ec0 = gr.Plot()
+            ec1 = gr.Plot()
+            ec2 = gr.Plot()

         with gr.TabItem("Audio"):
             …
@@ -96,9 +154,12 @@ with bl:



     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
-    …
+    outputs_list = [pc0,pc1,pc2,ec0,ec1,ec2,play]
+    for v in voices:
+        outputs_list += [ttstabs[v]['tts_output'], ttstabs[v]['ptts'],ttstabs[v]['etts'],ttstabs[v]['scorearea']]
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],outputs_list)
+    #[tts_output,report_score,pc0,pc1,pc2,ec0,ec1,ec2,play,ptts,etts])


 if __name__ == "__main__":
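Note on the f1 rewrite above: it leans on Gradio's dictionary-style returns. When an event listener is registered with a list of output components, the handler may return a dict keyed by those components, which is what lets one callback fill the tabs for the selected voices and blank the rest. A minimal self-contained sketch of the same pattern (the component names and placeholder strings here are illustrative, not taken from this Space):

```python
import gradio as gr

voices = ["Alfur_v2", "Dilja_v2"]  # illustrative subset of the Space's voice list

with gr.Blocks() as demo:
    picked = gr.CheckboxGroup(voices, label="TTS voice")
    # one Markdown slot per voice, mirroring the per-voice tabs in app.py
    slots = {v: gr.Markdown(f"TTS results for **{v}** will appear here") for v in voices}
    btn = gr.Button("Run")

    def fill(selected):
        # return a dict keyed by component: selected slots get content,
        # unselected slots are reset to their placeholder text
        out = {slots[v]: f"Scores for **{v}**: ..." for v in selected}
        out.update({slots[v]: f"TTS results for **{v}** will appear here"
                    for v in voices if v not in selected})
        return out

    # every component that may appear as a dict key must be in the outputs list
    btn.click(fill, picked, [slots[v] for v in voices])
```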
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ scipy
 dtw-python
 scikit-learn_extra
 pydub
+colorcet
 
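The new colorcet dependency supplies the CET_C8s and CET_C9s cyclic palettes that scripts/clusterprosody.py samples for per-speaker line colours. A small sketch of that sampling step, picking n evenly spaced entries from a 256-colour palette:

```python
import numpy as np
import colorcet as clc

def sample_palette(palette, n):
    # colorcet palettes are lists of 256 hex strings; take n evenly spaced entries
    idx = [int(x) for x in np.linspace(0, len(palette) - 1, n)]
    return [palette[i] for i in idx]

pitch_colors = sample_palette(clc.CET_C8s, 12)   # one colour per speaker's pitch line
energy_colors = sample_palette(clc.CET_C9s, 12)  # paired colour for the energy line
```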
scripts/clusterprosody.py
CHANGED
@@ -3,6 +3,7 @@ import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import soundfile as sf
+import colorcet as clc
 from collections import defaultdict
 from dtw import dtw
 from sklearn_extra.cluster import KMedoids
@@ -203,51 +204,28 @@ def kmedoids_clustering(X):
     return y_km, kmedoids


-
 def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):

-    tts_info = []
-
+    tts_info = defaultdict(list)
+
     for label in set([c for r,c in clusters]):
         recs = [r for r,c in clusters if c==label]
         dists = []
         for rec in recs:
-            …
-        tts_info.append((label,np.nanmean(dists)))
+            dists.append(dtw_distance(tts_data[f'{words}**{voice}'], speech_data[f'{words}**{rec}']))
+        tts_info[voice].append((label,np.nanmean(dists)))

-    tts_info = sorted(tts_info,key = lambda x: x[1])
-    best_cluster = tts_info[0][0]
-    best_cluster_score = tts_info[0][1]
-
-    matched_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==best_cluster}
-
-    #
-    # and report best_cluster_score
-
-    mid_cluster = tts_info[1][0]
-    mid_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==mid_cluster}
-    bad_cluster = tts_info[2][0]
-    bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
-
-    #tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
-    tts_fig_p, best_cc = plot_one_cluster(words,'pitch',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
-    fig_mid_p, mid_cc = plot_one_cluster(words,'pitch',mid_data,seg_aligns,mid_cluster)
-    fig_bad_p, bad_cc = plot_one_cluster(words,'pitch',bad_data,seg_aligns,bad_cluster)
-
-    …
-    fig_bad_e, _ = plot_one_cluster(words,'rmse',bad_data,seg_aligns,bad_cluster)
-
-    # TODO
-    # not necessarily here, bc paths to audio files.
-    spk_cc_map = [('Best',best_cluster,best_cc), ('Mid',mid_cluster,mid_cc), ('Last',bad_cluster,bad_cc)]
-    #playable = audio_htmls(spk_cc_map)
-
-    return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
-
+    #tts_info[voice] = sorted(tts_info[voice],key = lambda x: x[1])
+    #best_cluster = tts_info[voice][0][0]
+    #best_cluster_score = tts_info[voice][0][1]
+
+    #tts_pldat = {f'{words}**{voice}': tts_data}
+
+    f0_fig_tts, _ = plot_one_cluster(words,'pitch',tts_data,tts_align,0,['#c97eb7'],gtype='tts',voice=voice)
+    en_fig_tts, _ = plot_one_cluster(words,'energy',tts_data,tts_align,0,['#9276d9'],gtype='tts',voice=voice)
+
+    return tts_info[voice], f0_fig_tts, en_fig_tts
+

 def gp(d,s,x):
@@ -261,7 +239,6 @@ def gen_h_paths(wdir,adir,f0dir,pldir,spks):
     plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0'), 'play': gp(pldir,s,'wav')}) for s in spks]
     return plist

-
 # since clustering strictly operates on X,
 # once reduce a duration metric down to pair-distances,
 # it no longer matters that duration and pitch/energy had different dimensionality
@@ -289,30 +266,38 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_p
     X = np.array(X)

     y_km, kmedoids = kmedoids_clustering(X)
-    #plot_clusters(X, y_km, words)
-    #c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
-
     result = zip(X, kmedoids.labels_)
     groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]

+    f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, spk_cc_map = graph_humans(groups,h_data,words,h_seg_aligns)
+    audio_html = clusters_audio(groups,spk_cc_map,h_playable)
+
+
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
     _, tts_data, tts_seg_aligns, _, _ = get_data(norm_sent,tts_all_paths,start_end_word_index)
-
+
+    tts_results = defaultdict(dict)
     for v in voices:
-        voice_data = tts_data[f"{words}**{v}"]
-        voice_align = tts_seg_aligns[f"{words}**{v}"]
-
-        #tts_data, tts_align = get_one_tts_data(tts_sent_dir,v,norm_sent,start_end_word_index)
-
+        #voice_data = tts_data[f"{words}**{v}"]
+        #voice_align = tts_seg_aligns[f"{words}**{v}"]
+
         # match the data with a cluster -----
-        …
+        cluster_scores, f0_fig_tts, en_fig_tts = match_tts(groups, h_data, tts_data, tts_seg_aligns, words, h_seg_aligns, v)
+        best_cluster = [c for c,s in cluster_scores if s == min([s for c,s in cluster_scores])]
+        scorestring = []
+        for c,s in cluster_scores:
+            if c== best_cluster:
+                scorestring.append(f' **Cluster {c}: {round(s,2)}** ')
+            else:
+                scorestring.append(f' Cluster {c}: {round(s,2)} ')
+        scorestring = ' - '.join(scorestring)
+
+        audiosample = [pdict['play'] for voic, pdict in tts_all_paths if voic == v][0]
+
+        tts_results[v] = {'audio': audiosample, 'f0_fig_tts': f0_fig_tts, 'en_fig_tts':en_fig_tts, 'scoreinfo': scorestring}
+
+    return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, audio_html, tts_results
     #return words, kmedoids_cluster_dists, group


@@ -320,7 +305,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_p

 # generate html panel to play audios for each human cluster
 # audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
-def clusters_audio(clusters,audios):
+def clusters_audio(clusters,colormap,audios):

     html = '''<html><body>'''

@@ -330,17 +315,19 @@ def clusters_audio(clusters,audios):
         html += '<div>'
         html += f'<h2>Cluster {label}</h2>'

-        html += '<div>'
+        html += '<div style="font-size:130%;">'
         html += '<table><tbody>'

         for rec in recs:
+            cc = colormap[label][rec]
+
             html += f'<tr><td><audio controls id="{rec}">' #width="20%">

             html += f'<source src="{audios[rec][0]}#t={audios[rec][1]:.2f},{audios[rec][2]:.2f}" type="audio/wav">'
             #html += f'<source src="{audios[rec][0]}" type="audio/wav">'

             html += '</audio></td>'
-            html += f'<td>{rec}</td></tr>'
+            html += f'<td style="color:{cc};">{rec}</td></tr>'

         html += '</tbody></table>'
         html += '</div>'
@@ -352,9 +339,8 @@ def clusters_audio(clusters,audios):
     return html


-
 # find offsets to visually align start of each word for speakers in cluster
-def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
+def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align=None):
     words = words.split('_')

     retimes = [(words[0], 0.0)]
@@ -392,81 +378,128 @@ def retime_xs_feats(retimes, speaker_aligns, speaker_xvals, feats):
         xf.append((x,f))
     return [x for x,f in xf], [f for x,f in xf]

+
+# TODO handle the ccmap in here not inside plot_one
+def graph_humans(clusters,speech_data,words,seg_aligns):
+    c0,c1,c2 = (0,1,2)
+    nsents = len(speech_data)
+
+    c0_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c0}
+    c1_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c1}
+    c2_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c2}
+
+    colors = [(pc,ec) for pc,ec in zip(clc.CET_C8s,clc.CET_C9s)]
+    cix = [int(x) for x in np.linspace(0,len(colors)-1, nsents)]
+    pcolors = [colors[x][0] for x in cix]
+    ecolors= [colors[x][1] for x in cix]
+
+    f0_fig_c0, c0_cc = plot_one_cluster(words,'pitch',c0_data,seg_aligns,c0,pcolors)
+    f0_fig_c1, c1_cc= plot_one_cluster(words,'pitch',c1_data,seg_aligns,c1,pcolors[len(c0_data):])
+    f0_fig_c2, c2_cc = plot_one_cluster(words,'pitch',c2_data,seg_aligns,c2,pcolors[len(c0_data)+len(c1_data):])
+
+    en_fig_c0, _ = plot_one_cluster(words,'rmse',c0_data,seg_aligns,c0,ecolors)
+    en_fig_c1, _ = plot_one_cluster(words,'rmse',c1_data,seg_aligns,c1,ecolors[len(c0_data):])
+    en_fig_c2, _ = plot_one_cluster(words,'rmse',c2_data,seg_aligns,c2,ecolors[len(c0_data)+len(c1_data):])
+
+    # TODO
+    # not necessarily here, bc paths to audio files.
+    spk_cc_map = {c0 : c0_cc, c1 : c1_cc, c2 : c2_cc}
+    #playable = audio_htmls(spk_cc_map)
+
+    return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, spk_cc_map


-…
-    #
-…
+
+#TODO handle the colour list OUTSIDE of this part....
+def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,colors,gtype='cluster',voice=None):
+
+    cc=0
+    gclr = "#909090"
+    spk_ccs = {} # for external display
+
+    #fig = plt.figure(figsize=(10, 5))
+    if voice:
+        fig, ax = plt.subplots(figsize=(7.5,4))
+    else:
+        fig, ax = plt.subplots(figsize=(10,5))
+    fig.patch.set_facecolor('none')
+    ax.patch.set_facecolor('none')
+    fig.patch.set_alpha(0)
+    ax.tick_params(color=gclr,labelcolor=gclr)
+    for spine in ['bottom','left']:
+        ax.spines[spine].set_color(gclr)
+    for spine in ['top','right']:
+        ax.spines[spine].set(visible=False)

     if feature.lower() in ['pitch','f0']:
         fname = 'Pitch'
-        …
+        def _ffunc(feats):
+            ps = [p for p,e in feats]
+            nv = min(ps)
+            ps = [np.nan if p == nv else p for p in ps]
+            return ps
+        ffunc = _ffunc
+        pfunc = plt.plot
+        ylab = "Mean-variance normalised F0"
     elif feature.lower() in ['energy', 'rmse']:
         fname = 'Energy'
         ffunc = lambda x: [e for p,e in x]
         pfunc = plt.plot
+        ylab = "Mean-variance normalised energy"
     else:
         print('problem with the figure')
         return fig, []

-    # boundary for start of each word
-    retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns,tts_align)
-    if len(retimes)>1:
-        for w,bound_line in retimes:
-            plt.axvline(x=bound_line, color="gray", linestyle='--', linewidth=1, label=f'Start "{w}"')
-
-    plt.title(f"{words} - {fname} - Cluster {cluster_id}")
-
-    for k,v in speech_data.items():
-        …
-        # datapoint interval is 0.005 seconds
-        feat_xvals = [x*0.005 for x in range(len(feats))]
-        …
-        spk_ccs.append((spk,colors[cc]))
-        cc += 1
-        if cc >= len(colors):
-            cc=0
-
-    if …
-
-    #t_xvals = retime_speaker_xvals(retimes, tts_align, t_xvals)
-    #for w, st in reversed(retimes):
-    #    tw_xvals = [x for x in t_xvals if x>= st]
-    #    tw_feats = tfeats[-(len(tw_xvals)):]
-    #    pfunc(tw_xvals, tw_feats, color="black")
-    #    t_xvals = t_xvals[:-(len(tw_xvals))]
-    #    tfeats = tfeats[:-(len(tw_xvals))]
+    if gtype == 'cluster':
+        # boundary for start of each word
+        retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns)#,tts_align)
+        plt.title(f"{words} - {fname} - Cluster {cluster_id}", color=gclr, fontsize=16)
+        xmax = 0
+
+        for k,v in speech_data.items():
+            spk = k.split('**')[1]
+            word_times = seg_aligns[k]
+
+            feats = ffunc(v)
+            # datapoint interval is 0.005 seconds
+            feat_xvals = [x*0.005 for x in range(len(feats))]
+
+            feat_xvals, feats = retime_xs_feats(retimes,word_times,feat_xvals,feats)
+            pfunc(feat_xvals, feats, color=colors[cc], linewidth=2, label=f"Speaker {spk}")
+
+            xmax = max(xmax,max(feat_xvals))
+            spk_ccs[spk] = colors[cc]
+            cc += 1
+            if cc >= len(colors):
+                cc=0
+
+    elif gtype == 'tts':
+        # boundary for start of each word
+        retimes = reset_cluster_times(words,[f'{words}**{voice}'],seg_aligns)
+        word_times = seg_aligns[f'{words}**{voice}']
+        tfeats = ffunc(speech_data[f'{words}**{voice}'])
+        t_xvals = [x*0.005 for x in range(len(tfeats))]
+        t_xvals, tfeats = retime_xs_feats(retimes, word_times, t_xvals, tfeats)
+        pfunc(t_xvals, tfeats, color=colors[cc], label=f"TTS {voice}")
+        plt.title(f"{fname}", color=gclr, fontsize=14)
+        xmax = max(t_xvals)
+
+    if len(retimes)>1:
+        for w,bound_line in retimes:
+            plt.axvline(x=bound_line, color=gclr, linestyle='--', linewidth=1, label=f'Start "{w}"')
+    plt.xlim([0, xmax])
+    ax.set_xlabel("Time --->",fontsize=13,color=gclr)
+    ax.set_ylabel(ylab,fontsize=13,color=gclr)

     #plt.legend()
     #plt.show()

-
     return fig, spk_ccs
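The reworked match_tts above is the core of the multi-target scoring: for each human cluster it averages the DTW distance between the TTS feature track and every recording in that cluster, and the cluster with the lowest mean is reported as the best match. A minimal sketch of that loop, assuming a dtw_distance(a, b) helper like the one this module uses:

```python
import numpy as np

def score_tts_against_clusters(tts_track, cluster_tracks, dtw_distance):
    # cluster_tracks: {cluster_label: [feature track of each recording in that cluster]}
    scores = []
    for label, tracks in cluster_tracks.items():
        dists = [dtw_distance(tts_track, t) for t in tracks]
        scores.append((label, np.nanmean(dists)))
    # lowest mean DTW distance = the prosodic strategy the TTS came closest to
    return sorted(scores, key=lambda x: x[1])
```

Following Cheng (2011), the point of scoring against multiple targets is that a TTS rendition only has to land near one natural prosodic strategy, not near the average of all of them.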
scripts/runSQ.py
CHANGED
@@ -38,13 +38,13 @@ def run(sentence, voices, start_end_word_ix):
     if voices:
         temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)

-        voices = [voices[0]] # TODO. now limit one voice at a time.
-        score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
-
+        #voices = [voices[0]] # TODO. now limit one voice at a time.
+        #score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
+        f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, tts_results = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
         # also stop forgetting duration.

-        return temp_tts_sample, score, …
-
+        #return temp_tts_sample, score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts
+        return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, tts_results


@@ -284,8 +284,7 @@ def localtest():

     voices = [voices[0]] # TODO. now limit one voice at a time.

-    score, …
-
+    score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)


 #localtest()
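With this commit, run() no longer returns a single TTS sample plus score; it returns the six human-cluster figures, the audio HTML panel, and a per-voice tts_results dict. A hypothetical call showing the new contract (sentence and span taken from defaults visible elsewhere in the diff):

```python
import scripts.runSQ as runSQ

# '1-3' is a word-span string like the spanselect textbox default
f0_c0, f0_c1, f0_c2, en_c0, en_c1, en_c2, html, tts_results = runSQ.run(
    "Eru maríuhænur á Íslandi?", ["Alfur_v2", "Dilja_v2"], "1-3")

for voice, res in tts_results.items():
    # each entry holds 'audio' (a playable path), 'f0_fig_tts', 'en_fig_tts',
    # and 'scoreinfo' (markdown with one DTW score per cluster)
    print(voice, res['scoreinfo'])
```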