catiR committed
Commit 0d67145 · Parent(s): a894787

run clustering

Files changed:
- app.py +3 -3
- scripts/clusterprosody.py +102 -7
- scripts/runSQ.py +3 -3
app.py
CHANGED
@@ -33,9 +33,9 @@ setup()
 
 def f1(voices, sent, indices):
     #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
-    tts_audio, tts_score,
+    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = scripts.runSQ.run(sent, [voices], indices)
     score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report,
+    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p)
 
 
 def label_indices(sentence):
@@ -46,7 +46,7 @@ def label_indices(sentence):
 
 
 
-temp_sentences = scripts.runSQ.
+temp_sentences = scripts.runSQ.create_temp_sent_list()
 
 bl = gr.Blocks()
 with bl:
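Note: as written, f1 unpacks all six figures from scripts.runSQ.run but returns only the three pitch plots, so the energy figures (tts_fig_e, fig_mid_e, fig_bad_e) are computed and then discarded. The Blocks layout that consumes these outputs is outside this diff; the sketch below shows one plausible wiring, and every component name and the voice list are assumptions, not code from the repo.

# A minimal sketch (not from the repo) of how f1's five outputs could be
# wired into the `with bl:` block shown above; names are placeholders.
with bl:
    voices = gr.Dropdown(["Voice1", "Voice2"], label="TTS voice")  # placeholder voice names
    sent = gr.Dropdown(temp_sentences, label="Sentence")
    indices = gr.Textbox(label="Word indices to inspect")
    btn = gr.Button("Run")
    tts_audio = gr.Audio(label="TTS sample")
    score_report = gr.Markdown()
    fig_best = gr.Plot(label="Pitch: cluster closest to TTS")
    fig_mid = gr.Plot(label="Pitch: middle cluster")
    fig_bad = gr.Plot(label="Pitch: cluster furthest from TTS")
    # f1 returns (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p)
    btn.click(f1, [voices, sent, indices],
              [tts_audio, score_report, fig_best, fig_mid, fig_bad])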
scripts/clusterprosody.py
CHANGED
@@ -302,16 +302,21 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
 
     # now do graphs of matched_data with tts_data
     # and report best_cluster_score
-    tts_fig = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
 
     mid_cluster = tts_info[1][0]
     mid_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==mid_cluster}
     bad_cluster = tts_info[2][0]
     bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
-
-
+
+    tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
+    fig_mid_p = plot_pitch_cluster(mid_data,words,seg_aligns,mid_cluster)
+    fig_bad_p = plot_pitch_cluster(bad_data,words,seg_aligns,bad_cluster)
+
+    tts_fig_e = plot_rmse_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
+    fig_mid_e = plot_rmse_cluster(mid_data,words,seg_aligns,mid_cluster)
+    fig_bad_e = plot_rmse_cluster(bad_data,words,seg_aligns,bad_cluster)
 
-    return best_cluster_score,
+    return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
 
 
 
@@ -353,11 +358,11 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
     tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
 
     # match the data with a cluster -----
-    best_cluster_score,
+    best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
 
     # only supports one voice at a time currently
-    return best_cluster_score,
-    #return words, kmedoids_cluster_dists,
+    return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
+    #return words, kmedoids_cluster_dists, group
 
 
 
@@ -526,6 +531,96 @@ def plot_pitch_cluster(speech_data,words,seg_aligns,cluster_id):
 
 
 
+def plot_rmse_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, voice):
+    colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
+    cc = 0
+    fig = plt.figure(figsize=(10, 5))
+    plt.title(f"{words} - Energy - Cluster {cluster_id}")
+    for k,v in speech_data.items():
+
+        spk = k.split('**')[1]
+
+        word_times = seg_aligns[k]
+
+        rmse = [e for p,e in v]
+        # datapoint interval is 0.005 seconds
+        rmse_xvals = [x*0.005 for x in range(len(rmse))]
+
+        # centre around the first word boundary -
+        # if 3+ words, too bad.
+        if len(word_times)>1:
+            realign = np.mean([word_times[0][2],word_times[1][1]])
+            rmse_xvals = [x - realign for x in rmse_xvals]
+            word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
+            plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
+
+        if len(word_times)>2:
+            for i in range(1,len(word_times)-1):
+                bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
+                plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
+
+        plt.scatter(rmse_xvals, rmse, color=colors[cc], label=f"Speaker {spk}")
+        cc += 1
+        if cc >= len(colors):
+            cc=0
+
+    trmse = [e for p,e in tts_data]
+    t_xvals = [x*0.005 for x in range(len(trmse))]
+
+    if len(tts_align)>1:
+        realign = tts_align[1][1]
+        t_xvals = [x - realign for x in t_xvals]
+        tts_align = [(w,s-realign) for w,s in tts_align]
+
+    if len(tts_align)>2:
+        for i in range(2,len(tts_align)):
+            bound_line = tts_align[i][1]
+            plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i][0]}")
+    plt.scatter(t_xvals, trmse, color="black", label=f"TTS {voice}")
+
+
+    #plt.legend()
+    #plt.show()
+
+
+    return fig
+
+
+def plot_rmse_cluster(speech_data,words,seg_aligns,cluster_id):
+    colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
+    cc = 0
+    fig = plt.figure(figsize=(10, 5))
+    plt.title(f"{words} - Energy - Cluster {cluster_id}")
+    for k,v in speech_data.items():
+
+        spk = k.split('**')[1]
+
+        word_times = seg_aligns[k]
+
+        rmse = [e for p,e in v]
+        # datapoint interval is 0.005 seconds
+        rmse_xvals = [x*0.005 for x in range(len(rmse))]
+
+        # centre around the first word boundary -
+        # if 3+ words, too bad.
+        if len(word_times)>1:
+            realign = np.mean([word_times[0][2],word_times[1][1]])
+            rmse_xvals = [x - realign for x in rmse_xvals]
+            word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
+            plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
+
+        if len(word_times)>2:
+            for i in range(1,len(word_times)-1):
+                bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
+                plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
+
+        plt.scatter(rmse_xvals, rmse, color=colors[cc], label=f"Speaker {spk}")
+        cc += 1
+        if cc >= len(colors):
+            cc=0
+
+    return fig
+
 
 # want to:
 # - find tts best cluster
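The two new plotting functions appear to assume the track and alignment shapes used elsewhere in this file: speech_data maps '{words}**{speaker_id}' keys to lists of (pitch, energy) pairs sampled every 5 ms, seg_aligns maps the same keys to (word, start_sec, end_sec) triples, tts_data is one (pitch, energy) track, and tts_align is a list of (word, start_sec) pairs. A toy smoke test under those assumptions (all values fabricated):

# Fabricated data matching the shapes inferred above; not repo data.
from scripts.clusterprosody import plot_rmse_tts

# One recording of "halló heimur" by speaker s01:
# (pitch_hz, energy) pairs at 5 ms intervals, 0.6 s total.
speech_data = {"halló heimur**s01": [(120.0, 0.5), (122.0, 0.6), (121.0, 0.4)] * 40}
# Word alignments as (word, start_sec, end_sec).
seg_aligns = {"halló heimur**s01": [("halló", 0.0, 0.28), ("heimur", 0.30, 0.60)]}
# TTS track in the same format; its alignment holds (word, start_sec) only.
tts_data = [(118.0, 0.45), (119.0, 0.55), (117.0, 0.50)] * 40
tts_align = [("halló", 0.0), ("heimur", 0.31)]

fig = plot_rmse_tts(speech_data, tts_data, tts_align, "halló heimur",
                    seg_aligns, cluster_id=0, voice="Voice1")  # placeholder voice
fig.savefig("energy_best_cluster.png")

Both functions re-centre the x-axis on the first word boundary, so tracks from different speakers line up at that boundary rather than at t=0; plot_rmse_cluster is the same plot minus the black TTS overlay.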
scripts/runSQ.py
CHANGED
@@ -42,11 +42,11 @@ def run(sentence, voices, start_end_word_ix):
     tts_sample, tts_speechmarks = get_tts(sentence,voices,tts_dir)
     f0_tts(sentence, voices, tts_dir)
 
-    score,
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
 
     # also stop forgetting duration.
 
-    return tts_sample, score,
+    return tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
 
 
 def snorm(s):
@@ -253,7 +253,7 @@ def localtest():
     f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
 
 
-    score,
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
 
 
 
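With these changes, run returns the TTS sample, the distance score, and six matplotlib figures. A hypothetical caller is sketched below; the exact format of start_end_word_ix is not shown in this diff, so the "0-1" value is only a guess, and the call assumes the data directories prepared by setup() exist.

# Hypothetical end-to-end call; argument values are illustrative only.
import scripts.runSQ

tts_audio, score, *figs = scripts.runSQ.run("halló heimur", ["Voice1"], "0-1")

print(f"Difference from TTS to real speech: {round(score, 2)}")
names = ["pitch_best", "pitch_mid", "pitch_bad",
         "energy_best", "energy_mid", "energy_bad"]
for name, fig in zip(names, figs):
    fig.savefig(f"{name}.png")  # each is a matplotlib Figure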