catiR committed · Commit 1095ae0 · Parent: 307bcfb

appearance, tabs

Browse files:
- README.md +3 -3
- app.py +101 -40
- requirements.txt +1 -0
- scripts/clusterprosody.py +143 -110
- scripts/runSQ.py +6 -7
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
 title: Prosody clustering and evaluation
 emoji: ⚡
-colorFrom:
-colorTo:
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 4.7.1
 app_file: app.py
 pinned: false
 ---
app.py
CHANGED
@@ -2,18 +2,21 @@ import gradio as gr
 import subprocess, os
 import scripts.runSQ

-
 #https://huggingface.co/spaces/clr/prosalign/blob/main/app.py


 def setup():
     r0 = subprocess.run(["pwd"], capture_output=True, text=True)
     print('PWD::', r0.stdout)
-    r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"], capture_output=True, text=True)
-    print(r1.stdout)
-    subprocess.run(["unzip", "./master.zip"])
+    #r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/REAPER-master.zip", "-O", "./master.zip"], capture_output=True, text=True)
+    #print(r1.stdout)
+    r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True)
+    print('LS::', r9x.stdout)
+
+    subprocess.run(["unzip", "./REAPER-master.zip"])
+    subprocess.run(["rm", "./REAPER-master.zip"])
     subprocess.run(["mv", "REAPER-master", "REAPER"])

     os.chdir('./REAPER')
     subprocess.run(["mkdir", "build"])
     os.chdir('./build')
@@ -31,12 +34,6 @@ print('about to setup')
 setup()


-def f1(voices, sent, indices):
-    #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
-    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html = scripts.runSQ.run(sent, [voices], indices)
-    score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html)
-

 def label_indices(sentence):
     sentence = scripts.runSQ.snorm(sentence)
@@ -44,51 +41,112 @@ def label_indices(sentence):
     labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
     return labelled

+
+#gradio states dont like dicts
+def d2l(d):
+    return [(k,v) for k,v in d.items()]
+def l2d(l):
+    return {k:v for k,v in l}

 temp_sentences = scripts.runSQ.create_temp_sent_list()

 bl = gr.Blocks()
 with bl:

-    #temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
-
-    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']
+    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

-    #with gr.Row():
-        #with gr.Column(scale=4):
-    temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
-    #voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value='Alfur')
-    …
-    #with gr.Column(scale=1):
-    temp_button = gr.Button(value="Run with selected options")
-
-    tts_output = gr.Audio(interactive=False)
-    report_score = gr.Markdown('Difference from TTS to real speech:')
+    with gr.Tabs():
+
+        with gr.TabItem("Options"):
+            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
+            marked_sentence = gr.HighlightedText(interactive=False,label="Word selection key",color_map = {str(i):"#dcfce7" for i in range(333)})
+
+            with gr.Row():
+                spanselect = gr.Textbox(value='1-3',label="Select words",info='Enter the index of the word(s) to analyse, according to the key above. It can be a single word: 4 or a span of words separated by a dash: 2-3')
+                #voiceselect = gr.Radio(voices, label="TTS voice",value='Alfur_v2')
+                voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value=['Dilja_v2','Alfur_v2'])
+
+            #with gr.Column(scale=1):
+            temp_button = gr.Button(value="Run with selected options")
+
+        with gr.TabItem("About"):
+            docu = gr.Markdown("""
+# Multi-target prosody evaluation
+### 1. Choose a sentence - they are from Samrómur Queries
+### 2. The words will be numbered by position - type the number or range you want to evaluate
+### 3. Choose a TTS voice - they come from Tiro's API https://tiro.is/talgerving
+### 4. Run
+
+The evaluation automatically clusters human speakers according to prosodic features,
+and then measures how different the synthesised speech is from each natural cluster.
+Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence.
+Close match to one cluster shows what prosodic act TTS might have achieved, in the selected words.
+TTS whose prosody does not match any cluster might sound unnatural.
+
+TTS output includes generated audio, pitch, energy, and scores for each cluster.
+Output is only shown for the selected voice(s).
+Below, human data shows pitch and energy of each cluster, along with original audio.
+
+TTS often takes over 30 seconds per sentence/voice.
+After you have done it once, re-running different word spans for the same sentence/voice is much faster.
+
+See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
+regarding multi-target prosody scoring. This version extends the implementation by Magnús Freyr Morthens
+supported by Rannís student innovation fund.
+""")
+
+    ttstabs = {v:{} for v in voices}
+    with gr.Tabs():
+        for v in voices:
+            with gr.TabItem(v):
+                ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
+                with gr.Row():
+                    ttstabs[v]['ptts'] = gr.Plot()
+                    ttstabs[v]['etts'] = gr.Plot()
+                ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')
+    #tts_output = gr.Audio(interactive=False)
+    #with gr.Row():
+    #    ptts = gr.Plot()
+    #    etts = gr.Plot()
+    #report_score = gr.Markdown('Difference from TTS to real speech:')
+
+    # cant store ttstabs in gradio state, use here
+    def f1(voices, sent, indices):
+        #tts_audio, tts_score, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts = scripts.runSQ.run(sent, [voices], indices)
+        f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, tts_results = scripts.runSQ.run(sent, voices, indices)
+        outputs = {pc0: f0_fig_0, pc1: f0_fig_1, pc2: f0_fig_2, ec0: en_fig_0, ec1: en_fig_1, ec2: en_fig_2, play: audio_html}

+        for v in voices:
+            outputs[ttstabs[v]['tts_output']] = tts_results[v]['audio']
+            outputs[ttstabs[v]['ptts']] = tts_results[v]['f0_fig_tts']
+            outputs[ttstabs[v]['etts']] = tts_results[v]['en_fig_tts']
+            outputs[ttstabs[v]['scorearea']] = tts_results[v]['scoreinfo']
+
+        clear = [v for v in ttstabs.keys() if v not in voices]
+        for v in clear:
+            outputs[ttstabs[v]['tts_output']] = None
+            outputs[ttstabs[v]['ptts']] = None
+            outputs[ttstabs[v]['etts']] = None
+            outputs[ttstabs[v]['scorearea']] = f'TTS results for **{v}** will appear here'
+
+        return outputs #(tts_audio, score_report, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts)

     with gr.Tabs():
         with gr.TabItem("Pitch"):

-            …
-            pl3 = gr.Plot()
+            pc0 = gr.Plot()
+            pc1 = gr.Plot()
+            pc2 = gr.Plot()

         with gr.TabItem("Energy"):

-            …
-            pl6 = gr.Plot()
+            ec0 = gr.Plot()
+            ec1 = gr.Plot()
+            ec2 = gr.Plot()

         with gr.TabItem("Audio"):
             …
@@ -96,9 +154,12 @@ with bl:



     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
-    …
+    outputs_list = [pc0,pc1,pc2,ec0,ec1,ec2,play]
+    for v in voices:
+        outputs_list += [ttstabs[v]['tts_output'], ttstabs[v]['ptts'],ttstabs[v]['etts'],ttstabs[v]['scorearea']]
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],outputs_list)
+    #[tts_output,report_score,pc0,pc1,pc2,ec0,ec1,ec2,play,ptts,etts])


 if __name__ == "__main__":
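Note on the f1 rewrite above: it leans on Gradio's dictionary-style returns. When an event listener is registered with a list of output components, the handler may return a dict keyed by those components, which is what lets one callback fill the tabs for the selected voices and blank the rest. A minimal self-contained sketch of the same pattern (the component names and placeholder strings here are illustrative, not taken from this Space):

```python
import gradio as gr

voices = ["Alfur_v2", "Dilja_v2"]  # illustrative subset of the Space's voice list

with gr.Blocks() as demo:
    picked = gr.CheckboxGroup(voices, label="TTS voice")
    # one Markdown slot per voice, mirroring the per-voice tabs in app.py
    slots = {v: gr.Markdown(f"TTS results for **{v}** will appear here") for v in voices}
    btn = gr.Button("Run")

    def fill(selected):
        # return a dict keyed by component: selected slots get content,
        # unselected slots are reset to their placeholder text
        out = {slots[v]: f"Scores for **{v}**: ..." for v in selected}
        out.update({slots[v]: f"TTS results for **{v}** will appear here"
                    for v in voices if v not in selected})
        return out

    # every component that may appear as a dict key must be in the outputs list
    btn.click(fill, picked, [slots[v] for v in voices])
```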
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ scipy
 dtw-python
 scikit-learn_extra
 pydub
+colorcet
 
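The new colorcet dependency supplies the CET_C8s and CET_C9s cyclic palettes that scripts/clusterprosody.py samples for per-speaker line colours. A small sketch of that sampling step, picking n evenly spaced entries from a 256-colour palette:

```python
import numpy as np
import colorcet as clc

def sample_palette(palette, n):
    # colorcet palettes are lists of 256 hex strings; take n evenly spaced entries
    idx = [int(x) for x in np.linspace(0, len(palette) - 1, n)]
    return [palette[i] for i in idx]

pitch_colors = sample_palette(clc.CET_C8s, 12)   # one colour per speaker's pitch line
energy_colors = sample_palette(clc.CET_C9s, 12)  # paired colour for the energy line
```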
scripts/clusterprosody.py
CHANGED
@@ -3,6 +3,7 @@ import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import soundfile as sf
+import colorcet as clc
 from collections import defaultdict
 from dtw import dtw
 from sklearn_extra.cluster import KMedoids
@@ -203,51 +204,28 @@ def kmedoids_clustering(X):
     return y_km, kmedoids


-
 def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):

-    tts_info = []
-
+    tts_info = defaultdict(list)
+
     for label in set([c for r,c in clusters]):
         recs = [r for r,c in clusters if c==label]
         dists = []
         for rec in recs:
-            …
-        tts_info.append((label,np.nanmean(dists)))
+            dists.append(dtw_distance(tts_data[f'{words}**{voice}'], speech_data[f'{words}**{rec}']))
+        tts_info[voice].append((label,np.nanmean(dists)))

-    tts_info = sorted(tts_info,key = lambda x: x[1])
-    best_cluster = tts_info[0][0]
-    best_cluster_score = tts_info[0][1]
-
-    matched_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==best_cluster}
-
-    #
-    # and report best_cluster_score
-
-    mid_cluster = tts_info[1][0]
-    mid_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==mid_cluster}
-    bad_cluster = tts_info[2][0]
-    bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
-
-    #tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
-    tts_fig_p, best_cc = plot_one_cluster(words,'pitch',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
-    fig_mid_p, mid_cc = plot_one_cluster(words,'pitch',mid_data,seg_aligns,mid_cluster)
-    fig_bad_p, bad_cc = plot_one_cluster(words,'pitch',bad_data,seg_aligns,bad_cluster)
-
-    …
-    fig_bad_e, _ = plot_one_cluster(words,'rmse',bad_data,seg_aligns,bad_cluster)
-
-    # TODO
-    # not necessarily here, bc paths to audio files.
-    spk_cc_map = [('Best',best_cluster,best_cc), ('Mid',mid_cluster,mid_cc), ('Last',bad_cluster,bad_cc)]
-    #playable = audio_htmls(spk_cc_map)
-
-    return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
-
+    #tts_info[voice] = sorted(tts_info[voice],key = lambda x: x[1])
+    #best_cluster = tts_info[voice][0][0]
+    #best_cluster_score = tts_info[voice][0][1]
+
+    #tts_pldat = {f'{words}**{voice}': tts_data}
+
+    f0_fig_tts, _ = plot_one_cluster(words,'pitch',tts_data,tts_align,0,['#c97eb7'],gtype='tts',voice=voice)
+    en_fig_tts, _ = plot_one_cluster(words,'energy',tts_data,tts_align,0,['#9276d9'],gtype='tts',voice=voice)
+
+    return tts_info[voice], f0_fig_tts, en_fig_tts
+

 def gp(d,s,x):
@@ -261,7 +239,6 @@ def gen_h_paths(wdir,adir,f0dir,pldir,spks):
     plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0'), 'play': gp(pldir,s,'wav')}) for s in spks]
     return plist

-
 # since clustering strictly operates on X,
 # once reduce a duration metric down to pair-distances,
 # it no longer matters that duration and pitch/energy had different dimensionality
@@ -289,30 +266,38 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_p
     X = np.array(X)

     y_km, kmedoids = kmedoids_clustering(X)
-    #plot_clusters(X, y_km, words)
-    #c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
-
     result = zip(X, kmedoids.labels_)
     groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]

+    f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, spk_cc_map = graph_humans(groups,h_data,words,h_seg_aligns)
+    audio_html = clusters_audio(groups,spk_cc_map,h_playable)
+
+
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
     _, tts_data, tts_seg_aligns, _, _ = get_data(norm_sent,tts_all_paths,start_end_word_index)
-
+
+    tts_results = defaultdict(dict)
     for v in voices:
-        voice_data = tts_data[f"{words}**{v}"]
-        voice_align = tts_seg_aligns[f"{words}**{v}"]
-
-        #tts_data, tts_align = get_one_tts_data(tts_sent_dir,v,norm_sent,start_end_word_index)
-
+        #voice_data = tts_data[f"{words}**{v}"]
+        #voice_align = tts_seg_aligns[f"{words}**{v}"]
+
         # match the data with a cluster -----
-        …
+        cluster_scores, f0_fig_tts, en_fig_tts = match_tts(groups, h_data, tts_data, tts_seg_aligns, words, h_seg_aligns, v)
+        best_cluster = [c for c,s in cluster_scores if s == min([s for c,s in cluster_scores])]
+        scorestring = []
+        for c,s in cluster_scores:
+            if c== best_cluster:
+                scorestring.append(f' **Cluster {c}: {round(s,2)}** ')
+            else:
+                scorestring.append(f' Cluster {c}: {round(s,2)} ')
+        scorestring = ' - '.join(scorestring)
+
+        audiosample = [pdict['play'] for voic, pdict in tts_all_paths if voic == v][0]
+
+        tts_results[v] = {'audio': audiosample, 'f0_fig_tts': f0_fig_tts, 'en_fig_tts':en_fig_tts, 'scoreinfo': scorestring}
+
+    return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, audio_html, tts_results
     #return words, kmedoids_cluster_dists, group


@@ -320,7 +305,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_p

 # generate html panel to play audios for each human cluster
 # audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
-def clusters_audio(clusters,audios):
+def clusters_audio(clusters,colormap,audios):

     html = '''<html><body>'''

@@ -330,17 +315,19 @@ def clusters_audio(clusters,audios):
         html += '<div>'
         html += f'<h2>Cluster {label}</h2>'

-        html += '<div>'
+        html += '<div style="font-size:130%;">'
         html += '<table><tbody>'

         for rec in recs:
+            cc = colormap[label][rec]
+
             html += f'<tr><td><audio controls id="{rec}">' #width="20%">

             html += f'<source src="{audios[rec][0]}#t={audios[rec][1]:.2f},{audios[rec][2]:.2f}" type="audio/wav">'
             #html += f'<source src="{audios[rec][0]}" type="audio/wav">'

             html += '</audio></td>'
-            html += f'<td>{rec}</td></tr>'
+            html += f'<td style="color:{cc};">{rec}</td></tr>'

         html += '</tbody></table>'
         html += '</div>'
@@ -352,9 +339,8 @@ def clusters_audio(clusters,audios):
     return html


-
 # find offsets to visually align start of each word for speakers in cluster
-def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
+def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align=None):
     words = words.split('_')

     retimes = [(words[0], 0.0)]
@@ -392,81 +378,128 @@ def retime_xs_feats(retimes, speaker_aligns, speaker_xvals, feats):
         xf.append((x,f))
     return [x for x,f in xf], [f for x,f in xf]

+
+# TODO handle the ccmap in here not inside plot_one
+def graph_humans(clusters,speech_data,words,seg_aligns):
+    c0,c1,c2 = (0,1,2)
+    nsents = len(speech_data)
+
+    c0_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c0}
+    c1_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c1}
+    c2_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==c2}
+
+    colors = [(pc,ec) for pc,ec in zip(clc.CET_C8s,clc.CET_C9s)]
+    cix = [int(x) for x in np.linspace(0,len(colors)-1, nsents)]
+    pcolors = [colors[x][0] for x in cix]
+    ecolors= [colors[x][1] for x in cix]
+
+    f0_fig_c0, c0_cc = plot_one_cluster(words,'pitch',c0_data,seg_aligns,c0,pcolors)
+    f0_fig_c1, c1_cc= plot_one_cluster(words,'pitch',c1_data,seg_aligns,c1,pcolors[len(c0_data):])
+    f0_fig_c2, c2_cc = plot_one_cluster(words,'pitch',c2_data,seg_aligns,c2,pcolors[len(c0_data)+len(c1_data):])
+
+    en_fig_c0, _ = plot_one_cluster(words,'rmse',c0_data,seg_aligns,c0,ecolors)
+    en_fig_c1, _ = plot_one_cluster(words,'rmse',c1_data,seg_aligns,c1,ecolors[len(c0_data):])
+    en_fig_c2, _ = plot_one_cluster(words,'rmse',c2_data,seg_aligns,c2,ecolors[len(c0_data)+len(c1_data):])
+
+    # TODO
+    # not necessarily here, bc paths to audio files.
+    spk_cc_map = {c0 : c0_cc, c1 : c1_cc, c2 : c2_cc}
+    #playable = audio_htmls(spk_cc_map)
+
+    return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, spk_cc_map


-…
-    #
-…
+
+#TODO handle the colour list OUTSIDE of this part....
+def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,colors,gtype='cluster',voice=None):
+
+    cc=0
+    gclr = "#909090"
+    spk_ccs = {} # for external display
+
+    #fig = plt.figure(figsize=(10, 5))
+    if voice:
+        fig, ax = plt.subplots(figsize=(7.5,4))
+    else:
+        fig, ax = plt.subplots(figsize=(10,5))
+    fig.patch.set_facecolor('none')
+    ax.patch.set_facecolor('none')
+    fig.patch.set_alpha(0)
+    ax.tick_params(color=gclr,labelcolor=gclr)
+    for spine in ['bottom','left']:
+        ax.spines[spine].set_color(gclr)
+    for spine in ['top','right']:
+        ax.spines[spine].set(visible=False)

     if feature.lower() in ['pitch','f0']:
         fname = 'Pitch'
-        …
+        def _ffunc(feats):
+            ps = [p for p,e in feats]
+            nv = min(ps)
+            ps = [np.nan if p == nv else p for p in ps]
+            return ps
+        ffunc = _ffunc
+        pfunc = plt.plot
+        ylab = "Mean-variance normalised F0"
     elif feature.lower() in ['energy', 'rmse']:
         fname = 'Energy'
         ffunc = lambda x: [e for p,e in x]
         pfunc = plt.plot
+        ylab = "Mean-variance normalised energy"
     else:
         print('problem with the figure')
         return fig, []

-    # boundary for start of each word
-    retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns,tts_align)
-    if len(retimes)>1:
-        for w,bound_line in retimes:
-            plt.axvline(x=bound_line, color="gray", linestyle='--', linewidth=1, label=f'Start "{w}"')
-
-    plt.title(f"{words} - {fname} - Cluster {cluster_id}")
-
-    for k,v in speech_data.items():
-        …
-        # datapoint interval is 0.005 seconds
-        feat_xvals = [x*0.005 for x in range(len(feats))]
-        …
-        spk_ccs.append((spk,colors[cc]))
-        cc += 1
-        if cc >= len(colors):
-            cc=0
-
-    if …
-
-    #t_xvals = retime_speaker_xvals(retimes, tts_align, t_xvals)
-    #for w, st in reversed(retimes):
-    #    tw_xvals = [x for x in t_xvals if x>= st]
-    #    tw_feats = tfeats[-(len(tw_xvals)):]
-    #    pfunc(tw_xvals, tw_feats, color="black")
-    #    t_xvals = t_xvals[:-(len(tw_xvals))]
-    #    tfeats = tfeats[:-(len(tw_xvals))]
+    if gtype == 'cluster':
+        # boundary for start of each word
+        retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns)#,tts_align)
+        plt.title(f"{words} - {fname} - Cluster {cluster_id}", color=gclr, fontsize=16)
+        xmax = 0
+
+        for k,v in speech_data.items():
+            spk = k.split('**')[1]
+            word_times = seg_aligns[k]
+
+            feats = ffunc(v)
+            # datapoint interval is 0.005 seconds
+            feat_xvals = [x*0.005 for x in range(len(feats))]
+
+            feat_xvals, feats = retime_xs_feats(retimes,word_times,feat_xvals,feats)
+            pfunc(feat_xvals, feats, color=colors[cc], linewidth=2, label=f"Speaker {spk}")
+
+            xmax = max(xmax,max(feat_xvals))
+            spk_ccs[spk] = colors[cc]
+            cc += 1
+            if cc >= len(colors):
+                cc=0
+
+    elif gtype == 'tts':
+        # boundary for start of each word
+        retimes = reset_cluster_times(words,[f'{words}**{voice}'],seg_aligns)
+        word_times = seg_aligns[f'{words}**{voice}']
+        tfeats = ffunc(speech_data[f'{words}**{voice}'])
+        t_xvals = [x*0.005 for x in range(len(tfeats))]
+        t_xvals, tfeats = retime_xs_feats(retimes, word_times, t_xvals, tfeats)
+        pfunc(t_xvals, tfeats, color=colors[cc], label=f"TTS {voice}")
+        plt.title(f"{fname}", color=gclr, fontsize=14)
+        xmax = max(t_xvals)
+
+    if len(retimes)>1:
+        for w,bound_line in retimes:
+            plt.axvline(x=bound_line, color=gclr, linestyle='--', linewidth=1, label=f'Start "{w}"')
+    plt.xlim([0, xmax])
+    ax.set_xlabel("Time --->",fontsize=13,color=gclr)
+    ax.set_ylabel(ylab,fontsize=13,color=gclr)

     #plt.legend()
     #plt.show()

-
     return fig, spk_ccs
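The reworked match_tts above is the core of the multi-target scoring: for each human cluster it averages the DTW distance between the TTS feature track and every recording in that cluster, and the cluster with the lowest mean is reported as the best match. A minimal sketch of that loop, assuming a dtw_distance(a, b) helper like the one this module uses:

```python
import numpy as np

def score_tts_against_clusters(tts_track, cluster_tracks, dtw_distance):
    # cluster_tracks: {cluster_label: [feature track of each recording in that cluster]}
    scores = []
    for label, tracks in cluster_tracks.items():
        dists = [dtw_distance(tts_track, t) for t in tracks]
        scores.append((label, np.nanmean(dists)))
    # lowest mean DTW distance = the prosodic strategy the TTS came closest to
    return sorted(scores, key=lambda x: x[1])
```

Following Cheng (2011), the point of scoring against multiple targets is that a TTS rendition only has to land near one natural prosodic strategy, not near the average of all of them.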
scripts/runSQ.py
CHANGED
@@ -38,13 +38,13 @@ def run(sentence, voices, start_end_word_ix):
     if voices:
         temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)

-        voices = [voices[0]] # TODO. now limit one voice at a time.
-        score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
-
+        #voices = [voices[0]] # TODO. now limit one voice at a time.
+        #score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
+        f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, tts_results = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
         # also stop forgetting duration.

-        return temp_tts_sample, score, …
-
+        #return temp_tts_sample, score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts
+        return f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, tts_results


@@ -284,8 +284,7 @@ def localtest():

     voices = [voices[0]] # TODO. now limit one voice at a time.

-    score, …
-
+    score, f0_fig_c0, f0_fig_c1, f0_fig_c2, en_fig_c0, en_fig_c1, en_fig_c2, html, f0_fig_tts, en_fig_tts = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)


 #localtest()
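With this commit, run() no longer returns a single TTS sample plus score; it returns the six human-cluster figures, the audio HTML panel, and a per-voice tts_results dict. A hypothetical call showing the new contract (sentence and span taken from defaults visible elsewhere in the diff):

```python
import scripts.runSQ as runSQ

# '1-3' is a word-span string like the spanselect textbox default
f0_c0, f0_c1, f0_c2, en_c0, en_c1, en_c2, html, tts_results = runSQ.run(
    "Eru maríuhænur á Íslandi?", ["Alfur_v2", "Dilja_v2"], "1-3")

for voice, res in tts_results.items():
    # each entry holds 'audio' (a playable path), 'f0_fig_tts', 'en_fig_tts',
    # and 'scoreinfo' (markdown with one DTW score per cluster)
    print(voice, res['scoreinfo'])
```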