catiR committed
Commit 366ecce · 1 Parent(s): 86daaba

force align tts, add voices

Files changed (6)
  1. app.py +2 -4
  2. requirements.txt +1 -0
  3. scripts/clusterprosody.py +88 -331
  4. scripts/reaper2pass.py +10 -14
  5. scripts/runSQ.py +128 -128
  6. scripts/tapi.py +26 -32
app.py CHANGED
@@ -54,10 +54,8 @@ with bl:
 
     #temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
 
-    voices = ['Alfur','Dilja']
-    # currently i only get json speech marks for those two.
-    # supposedly they also provided for Karl, Dora, but i dont even get their wavs
-    # i get everyone elses wavs tho
+    voices = ['Alfur_v2', 'Dilja_v2', 'Alfur','Dilja', 'Bjartur', 'Rosa', 'Karl', 'Dora']
+
 
     #with gr.Row():
     #with gr.Column(scale=4):
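The comments removed above explain why the list used to stop at Alfur and Dilja: only those voices returned JSON speech marks from the TTS API. Because this commit switches to local forced alignment of the TTS audio (see scripts/runSQ.py and scripts/tapi.py below), word timing no longer depends on the API and all eight voices can be offered. A minimal smoke test, not part of the commit, for checking that each listed voice synthesizes (assumes the Tiro endpoint is reachable; ./tts_check/ is a placeholder directory):

import os
from scripts.tapi import tiro

voices = ['Alfur_v2', 'Dilja_v2', 'Alfur', 'Dilja', 'Bjartur', 'Rosa', 'Karl', 'Dora']
os.makedirs('./tts_check/', exist_ok=True)
for v in voices:
    # tiro() writes <voice>.wav into the save directory and returns its absolute path
    print(v, tiro('Hæ hæ hæ hæ! Ég heiti Gervimaður Finnland, en þú?', v, save='./tts_check/'))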
requirements.txt CHANGED
@@ -5,5 +5,6 @@ librosa
 scipy
 dtw-python
 scikit-learn_extra
+secrets
 pydub
 
scripts/clusterprosody.py CHANGED
@@ -16,63 +16,40 @@ import os, librosa, json
16
 
17
 
18
 
19
-
20
  def z_score(x, mean, std):
21
  return (x - mean) / std
22
 
23
 
24
-
25
-
26
- # output
27
- # {'013823-0457777': [('hvaða', 0.89, 1.35),
28
- # ('sjúkdómar', 1.35, 2.17),
29
- # ('geta', 2.17, 2.4),
30
- # ('fylgt', 2.4, 2.83),
31
- # ('óbeinum', 2.83, 3.29),
32
- # ('reykingum', 3.29, 3.9)],
33
- # '014226-0508808': [('hvaða', 1.03, 1.45),
34
- # ('sjúkdómar', 1.45, 2.28),
35
- # ('geta', 2.41, 2.7),
36
- # ('fylgt', 2.7, 3.09),
37
- # ('óbeinum', 3.09, 3.74),
38
- # ('reykingum', 3.74, 4.42)],
39
- # '013726-0843679': [('hvaða', 0.87, 1.14),
40
- # ('sjúkdómar', 1.14, 1.75),
41
- # ('geta', 1.75, 1.96),
42
- # ('fylgt', 1.96, 2.27),
43
- # ('óbeinum', 2.27, 2.73),
44
- # ('reykingum', 2.73, 3.27)] }
45
-
46
- # takes a list of human SPEAKER IDS not the whole meta db
47
- def get_word_aligns(rec_ids, norm_sent, aln_dir):
48
  """
49
  Returns a dictionary of word alignments for a given sentence.
50
  """
51
  word_aligns = defaultdict(list)
 
52
 
53
- for rec in rec_ids:
54
- slist = norm_sent.split(" ")
55
- aln_path = os.path.join(aln_dir, f'{rec}.tsv')
56
  with open(aln_path) as f:
57
  lines = f.read().splitlines()
58
  lines = [l.split('\t') for l in lines]
59
  try:
60
  assert len(lines) == len(slist)
61
- word_aligns[rec] = [(w,float(s),float(e)) for w,s,e in lines]
62
  except:
63
  print(slist, lines, "<---- something didn't match")
64
  return word_aligns
65
 
66
 
67
-
68
- def get_pitches(start_time, end_time, id, path):
 
69
  """
70
  Returns an array of pitch values for a given speech.
71
  Reads from .f0 file of Time, F0, IsVoiced
72
  """
73
 
74
- f = os.path.join(path, id + ".f0")
75
- with open(f) as f:
76
  lines = f.read().splitlines()
77
  lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
78
  pitches = []
@@ -98,6 +75,7 @@ def get_pitches(start_time, end_time, id, path):
98
 
99
 
100
 
 
101
  # jcheng used energy from esps get_f0
102
  # get f0 says (?) :
103
  #The RMS value of each record is computed based on a 30 msec hanning
@@ -107,20 +85,20 @@ def get_pitches(start_time, end_time, id, path):
107
  # TODO: implement that. ?
108
  # not sure librosa provides hamming window in rms function directly
109
  # TODO handle audio that not originally .wav
110
- def get_rmse(start_time, end_time, id, path):
111
  """
112
  Returns an array of RMSE values for a given speech.
113
  """
114
 
115
- f = os.path.join(path, id + ".wav")
116
- audio, sr = librosa.load(f, sr=16000)
117
  segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
118
- rmse = librosa.feature.rms(y=segment)
119
  rmse = rmse[0]
120
  #idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
121
  return rmse#[idx]
122
 
123
 
 
124
  def downsample_rmse2pitch(rmse,pitch_len):
125
  idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
126
  return rmse[idx]
@@ -142,29 +120,31 @@ def parse_word_indices(start_end_word_index):
142
 
143
  # take any (1stword, lastword) or (word)
144
  # unit and prepare data for that unit
145
- def get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index):
146
  """
147
  Returns a dictionary of pitch, rmse, and spectral centroids values for a given sentence/word combinations.
148
  """
149
 
150
  s_ix, e_ix = parse_word_indices(start_end_word_index)
151
-
152
  words = '_'.join(norm_sent.split(' ')[s_ix:e_ix+1])
153
 
154
- word_aligns = get_word_aligns(h_spk_ids,norm_sent,h_align_dir)
 
 
155
  data = defaultdict(list)
156
  align_data = defaultdict(list)
157
 
158
- for id, word_al in word_aligns.items():
 
159
  start_time = word_al[s_ix][1]
160
  end_time = word_al[e_ix][2]
161
 
162
  seg_aligns = word_al[s_ix:e_ix+1]
163
  seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
164
 
165
- pitches = get_pitches(start_time, end_time, id, h_f0_dir)
166
 
167
- rmses = get_rmse(start_time, end_time, id, h_wav_dir)
168
  rmses = downsample_rmse2pitch(rmses,len(pitches))
169
  #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
170
 
@@ -172,13 +152,12 @@ def get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_wo
172
  rmses_cpy = np.array(deepcopy(rmses))
173
  d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
174
  #words = "-".join(word_combs)
175
- data[f"{words}**{id}"] = d
176
- align_data[f"{words}**{id}"] = seg_aligns
177
 
178
  return words, data, align_data
179
 
180
-
181
-
182
 
183
  def dtw_distance(x, y):
184
  """
@@ -190,7 +169,6 @@ def dtw_distance(x, y):
190
 
191
 
192
 
193
-
194
  # recs is a sorted list of rec IDs
195
  # all recs/data contain the same words
196
  # rec1 and rec2 can be the same
@@ -206,33 +184,7 @@ def pair_dists(data,words,recs):
206
  val2 = data[key2]
207
  dtw_dists.append((f"{rec1}**{rec2}", dtw_distance(val1, val2)))
208
 
209
- #for key1, value1 in data.items():
210
- # d1 = key1.split("**")
211
- # words1 = d1[0]
212
- # if not words:
213
- # words = words1
214
- # spk1 = d1[1]
215
- # for key2, value2 in data.items():
216
- # d2 = key2.split("**")
217
- # words2 = d2[0]
218
- # spk2 = d2[1]
219
- # if all([w0 == w2 for w0, w2 in zip(words.split('_'), words2.split('_'))]):
220
- #dtw_dists[words1].append((f"{spk1}**{spk2}", dtw_distance(value1, value2)))
221
- # dtw_dists.append((f"{spk1}**{spk2}", dtw_distance(value1, value2)))
222
  return dtw_dists
223
- # dtw dists is the dict from units to list of tuples
224
- # or: now just the list not labelled with the unit.
225
- # {'hvaða-sjúkdómar':
226
- # [('013823-0457777_013823-0457777', 0.0),
227
- # ('013823-0457777_013698-0441666', 0.5999433281203399),
228
- # ('013823-0457777_014675-0563760', 0.4695447105594414),
229
- # ('014226-0508808_013823-0457777', 0.44080874425223393),
230
- # ('014226-0508808_014226-0508808', 0.0),
231
- # ('014226-0508808_013726-0843679', 0.5599404672667414),
232
- # ('014226-0508808_013681-0442313', 0.6871330752342419)]
233
- # }
234
- # the 0-distance self-comparisons are present here
235
- # along with both copies of symmetric Speaker1**Speaker2, Speaker2**Speaker1
236
 
237
 
238
 
@@ -244,46 +196,9 @@ def kmedoids_clustering(X):
244
  return y_km, kmedoids
245
 
246
 
247
- def get_tts_data(tdir,voice,start_end_word_index):
248
- with open(f'{tdir}{voice}.json') as f:
249
- speechmarks = json.load(f)
250
- speechmarks = speechmarks['alignments']
251
-
252
- sr=16000
253
- tts_audio, _ = librosa.load(f'{tdir}{voice}.wav',sr=sr)
254
-
255
- # TODO
256
- # tts operates on punctuated version
257
- # so clean this up instead of assuming it will work
258
- s_ix, e_ix = parse_word_indices(start_end_word_index)
259
-
260
- # TODO
261
- # default speechmarks return word start time only -
262
- # this cannot describe pauses #######
263
- s_tts = speechmarks[s_ix]["time"]/1000
264
- if e_ix+1 < len(speechmarks): #if user doesn't want final word, which has no end time mark,
265
- e_tts = speechmarks[e_ix+1]["time"]/1000
266
- tts_segment = tts_audio[int(np.floor(s_tts * sr)):int(np.ceil(e_tts * sr))]
267
- else:
268
- tts_segment = tts_audio[int(np.floor(s_tts * sr)):]
269
- e_tts = len(tts_audio) / sr
270
- # TODO not ideal as probably silence padding on end file?
271
-
272
- tts_align = [(speechmarks[ix]["value"],speechmarks[ix]["time"]) for ix in range(s_ix,e_ix+1)]
273
- tts_align = [(w,s/1000) for w,s in tts_align]
274
- tts_align = [(w,round(s-s_tts,3)) for w,s in tts_align]
275
-
276
- tts_f0 = get_pitches(s_tts, e_tts, voice, tdir)
277
- tts_rmse = get_rmse(s_tts, e_tts, voice, tdir)
278
- tts_rmse = downsample_rmse2pitch(tts_rmse,len(tts_f0))
279
- t_pitches_cpy = np.array(deepcopy(tts_f0))
280
- t_rmses_cpy = np.array(deepcopy(tts_rmse))
281
- tts_data = [[p, r] for p, r in zip(t_pitches_cpy, t_rmses_cpy)]
282
- return tts_data, tts_align
283
-
284
-
285
 
286
  def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):
 
287
 
288
  tts_info = []
289
  for label in set([c for r,c in clusters]):
@@ -308,18 +223,31 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
308
  bad_cluster = tts_info[2][0]
309
  bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
310
 
311
- tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
312
- fig_mid_p = plot_pitch_cluster(mid_data,words,seg_aligns,mid_cluster)
313
- fig_bad_p = plot_pitch_cluster(bad_data,words,seg_aligns,bad_cluster)
 
 
314
 
315
- tts_fig_e = plot_rmse_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
316
- fig_mid_e = plot_rmse_cluster(mid_data,words,seg_aligns,mid_cluster)
317
- fig_bad_e = plot_rmse_cluster(bad_data,words,seg_aligns,bad_cluster)
318
 
319
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
320
 
321
 
 
 
323
 
324
  # since clustering strictly operates on X,
325
  # once reduce a duration metric down to pair-distances,
@@ -329,14 +257,16 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
329
  # or can it not take that input in multidimensional space
330
  # then the 3 dists can still be averaged to flatten, if appropriately scaled
331
 
332
- def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_dir, voices, start_end_word_index):
333
 
334
  h_spk_ids = sorted(h_spk_ids)
335
  nsents = len(h_spk_ids)
336
 
337
- words, data, seg_aligns = get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index)
 
 
338
 
339
- dtw_dists = pair_dists(data,words,h_spk_ids)
340
 
341
  kmedoids_cluster_dists = []
342
 
@@ -352,13 +282,17 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
352
  groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]
353
 
354
 
355
- # tts: assume the first 64 chars of sentence are enough
356
- tdir = f'{tts_dir}{orig_sent.replace(" ","_")[:65]}/'
 
357
  for v in voices:
358
- tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
 
 
 
359
 
360
  # match the data with a cluster -----
361
- best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
362
 
363
  # only supports one voice at a time currently
364
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
@@ -432,125 +366,39 @@ def get_audio_part(start_time, end_time, id, path):
432
 
433
 
434
 
435
-
436
- def plot_pitch_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, voice):
437
  colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
438
  cc = 0
439
  fig = plt.figure(figsize=(10, 5))
440
- plt.title(f"{words} - Pitch - Cluster {cluster_id}")
441
- for k,v in speech_data.items():
442
-
443
- spk = k.split('**')[1]
444
-
445
- word_times = seg_aligns[k]
446
-
447
- pitches = [p for p,e in v]
448
- # datapoint interval is 0.005 seconds
449
- pitch_xvals = [x*0.005 for x in range(len(pitches))]
450
-
451
- # centre around the first word boundary -
452
- # if 3+ words, too bad.
453
- if len(word_times)>1:
454
- realign = np.mean([word_times[0][2],word_times[1][1]])
455
- pitch_xvals = [x - realign for x in pitch_xvals]
456
- word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
457
- plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
458
-
459
- if len(word_times)>2:
460
- for i in range(1,len(word_times)-1):
461
- bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
462
- plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
463
-
464
- plt.scatter(pitch_xvals, pitches, color=colors[cc], label=f"Speaker {spk}")
465
- cc += 1
466
- if cc >= len(colors):
467
- cc=0
468
-
469
- tpitches = [p for p,e in tts_data]
470
- t_xvals = [x*0.005 for x in range(len(tpitches))]
471
 
472
- if len(tts_align)>1:
473
- realign = tts_align[1][1]
474
- t_xvals = [x - realign for x in t_xvals]
475
- tts_align = [(w,s-realign) for w,s in tts_align]
476
-
477
- if len(tts_align)>2:
478
- for i in range(2,len(tts_align)):
479
- bound_line = tts_align[i][1]
480
- plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i][0]}")
481
- plt.scatter(t_xvals, tpitches, color="black", label=f"TTS {voice}")
482
-
483
-
484
- #plt.legend()
485
- #plt.show()
486
-
487
-
488
- return fig
489
-
490
-
491
-
492
- def plot_pitch_cluster(speech_data,words,seg_aligns,cluster_id):
493
- colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
494
- cc = 0
495
- fig = plt.figure(figsize=(8, 4))
496
- plt.title(f"{words} - Pitch - Cluster {cluster_id}")
497
  for k,v in speech_data.items():
498
 
499
  spk = k.split('**')[1]
500
 
501
  word_times = seg_aligns[k]
502
 
503
- pitches = [p for p,e in v]
504
- # datapoint interval is 0.005 seconds
505
- pitch_xvals = [x*0.005 for x in range(len(pitches))]
506
-
507
- # centre around the first word boundary -
508
- # if 3+ words, too bad.
509
- if len(word_times)>1:
510
- realign = np.mean([word_times[0][2],word_times[1][1]])
511
- pitch_xvals = [x - realign for x in pitch_xvals]
512
- word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
513
- plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
514
-
515
- if len(word_times)>2:
516
- for i in range(1,len(word_times)-1):
517
- bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
518
- plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
519
-
520
- plt.scatter(pitch_xvals, pitches, color=colors[cc], label=f"Speaker {spk}")
521
- cc += 1
522
- if cc >= len(colors):
523
- cc=0
524
-
525
- #plt.legend()
526
- #plt.show()
527
-
528
-
529
- return fig
530
-
531
-
532
-
533
-
534
- def plot_rmse_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, voice):
535
- colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
536
- cc = 0
537
- fig = plt.figure(figsize=(10, 5))
538
- plt.title(f"{words} - Energy - Cluster {cluster_id}")
539
- for k,v in speech_data.items():
540
-
541
- spk = k.split('**')[1]
542
-
543
- word_times = seg_aligns[k]
544
 
545
- rmse = [e for p,e in v]
546
  # datapoint interval is 0.005 seconds
547
- rmse_xvals = [x*0.005 for x in range(len(rmse))]
548
 
549
  # centre around the first word boundary -
550
  # if 3+ words, too bad.
551
  if len(word_times)>1:
552
  realign = np.mean([word_times[0][2],word_times[1][1]])
553
- rmse_xvals = [x - realign for x in rmse_xvals]
554
  word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
555
  plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
556
 
@@ -559,24 +407,25 @@ def plot_rmse_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, v
559
  bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
560
  plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
561
 
562
- plt.plot(rmse_xvals, rmse, color=colors[cc], label=f"Speaker {spk}")
563
  cc += 1
564
  if cc >= len(colors):
565
  cc=0
566
 
567
- trmse = [e for p,e in tts_data]
568
- t_xvals = [x*0.005 for x in range(len(trmse))]
 
569
 
570
- if len(tts_align)>1:
571
- realign = tts_align[1][1]
572
- t_xvals = [x - realign for x in t_xvals]
573
- tts_align = [(w,s-realign) for w,s in tts_align]
574
 
575
- if len(tts_align)>2:
576
- for i in range(2,len(tts_align)):
577
- bound_line = tts_align[i][1]
578
- plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i][0]}")
579
- plt.plot(t_xvals, trmse, color="black", label=f"TTS {voice}")
580
 
581
 
582
  #plt.legend()
@@ -586,99 +435,7 @@ def plot_rmse_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, v
586
  return fig
587
 
588
 
589
- def plot_rmse_cluster(speech_data,words,seg_aligns,cluster_id):
590
- colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
591
- cc = 0
592
- fig = plt.figure(figsize=(10, 5))
593
- plt.title(f"{words} - Energy - Cluster {cluster_id}")
594
- for k,v in speech_data.items():
595
-
596
- spk = k.split('**')[1]
597
 
598
- word_times = seg_aligns[k]
599
-
600
- rmse = [e for p,e in v]
601
- # datapoint interval is 0.005 seconds
602
- rmse_xvals = [x*0.005 for x in range(len(rmse))]
603
-
604
- # centre around the first word boundary -
605
- # if 3+ words, too bad.
606
- if len(word_times)>1:
607
- realign = np.mean([word_times[0][2],word_times[1][1]])
608
- rmse_xvals = [x - realign for x in rmse_xvals]
609
- word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
610
- plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
611
-
612
- if len(word_times)>2:
613
- for i in range(1,len(word_times)-1):
614
- bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
615
- plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
616
-
617
- plt.plot(rmse_xvals, rmse, color=colors[cc], label=f"Speaker {spk}")
618
- cc += 1
619
- if cc >= len(colors):
620
- cc=0
621
-
622
- return fig
623
-
624
-
625
- # want to:
626
- # - find tts best cluster
627
- # - find avg dist for tts in that cluster
628
- # - find avg dist for any human to the rest of its cluster
629
-
630
-
631
-
632
- # see near end of notebook for v nice way to grab timespans of tts audio
633
- # (or just the start/end timestamps to mark them) from alignment json
634
- # based on word position index -
635
- # so probably really do show user the sentence with each word numbered.
636
-
637
-
638
-
639
- # THEN there is -
640
- # \# Plot pitch, rmse, and spectral centroid for each word combination for each speaker
641
- # - this is one persontoken per graph and has a word division line - idk if works >2 wds.
642
- # it might be good to do this for tts at least, eh
643
-
644
-
645
- # Plot pitch values for each word combination for each speaker in each cluster (with word boundaries)
646
- # - multi speakers (one cluster) per graph - this will be good to show, with tts on top.
647
- # i may want to recentre it around wd bound. at least if only 2 wds.
648
- # well i could just pick, like, it will be centred around the 1st wboundary & good luck if more.
649
-
650
- # - the same as above, but rmse
651
-
652
- # go all the way to the bottom to see gphs with a tts added on to one cluster.
653
-
654
-
655
-
656
-
657
- # will need:
658
- # the whole sentence text (index, word) pairs
659
- # the indices of units the user wants
660
- # human meta db of all human recordings
661
- # tts dir, human wav + align + f0 dirs
662
- # list of tts voices
663
- # an actual wav file for each human rec, probably
664
- # params like: use f0, use rmse, (use dur), [.....]
665
- # .. check.
666
-
667
-
668
-
669
-
670
-
671
- def plot_clusters(X, y, word):
672
- u_labels = np.unique(y)
673
-
674
- # plot the results
675
- for i in u_labels:
676
- plt.scatter(X[y == i, 0], X[y == i, 1], label=i)
677
- plt.title(word)
678
- plt.legend()
679
- plt.show()
680
-
681
-
682
 
683
 
684
 
 
16
 
17
 
18
 
 
19
  def z_score(x, mean, std):
20
  return (x - mean) / std
21
 
22
 
23
+ # given a sentence and list of its speakers + their alignment files,
24
+ # return a dictionary of word alignments
25
+ def get_word_aligns(norm_sent, aln_paths):
 
 
 
26
  """
27
  Returns a dictionary of word alignments for a given sentence.
28
  """
29
  word_aligns = defaultdict(list)
30
+ slist = norm_sent.split(" ")
31
 
32
+ for spk,aln_path in aln_paths:
 
 
33
  with open(aln_path) as f:
34
  lines = f.read().splitlines()
35
  lines = [l.split('\t') for l in lines]
36
  try:
37
  assert len(lines) == len(slist)
38
+ word_aligns[spk] = [(w,float(s),float(e)) for w,s,e in lines]
39
  except:
40
  print(slist, lines, "<---- something didn't match")
41
  return word_aligns
42
 
43
 
44
+
45
+ #TODO pass whole path
46
+ def get_pitches(start_time, end_time, fpath):
47
  """
48
  Returns an array of pitch values for a given speech.
49
  Reads from .f0 file of Time, F0, IsVoiced
50
  """
51
 
52
+ with open(fpath) as f:
 
53
  lines = f.read().splitlines()
54
  lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
55
  pitches = []
 
75
 
76
 
77
 
78
+ # TODO take whole path
79
  # jcheng used energy from esps get_f0
80
  # get f0 says (?) :
81
  #The RMS value of each record is computed based on a 30 msec hanning
 
85
  # TODO: implement that. ?
86
  # not sure librosa provides hamming window in rms function directly
87
  # TODO handle audio that not originally .wav
88
+ def get_rmse(start_time, end_time, wpath):
89
  """
90
  Returns an array of RMSE values for a given speech.
91
  """
92
 
93
+ audio, sr = librosa.load(wpath, sr=16000)
 
94
  segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
95
+ rmse = librosa.feature.rms(y=segment,frame_length=480,hop_length=80)#librosa.feature.rms(y=segment)
96
  rmse = rmse[0]
97
  #idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
98
  return rmse#[idx]
99
 
100
 
101
+ # may be unnecessary depending how rmse and pitch window/hop are calculated already
102
  def downsample_rmse2pitch(rmse,pitch_len):
103
  idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
104
  return rmse[idx]
 
120
 
121
  # take any (1stword, lastword) or (word)
122
  # unit and prepare data for that unit
123
+ def get_data(norm_sent,path_key,start_end_word_index):
124
  """
125
  Returns a dictionary of pitch, rmse, and spectral centroids values for a given sentence/word combinations.
126
  """
127
 
128
  s_ix, e_ix = parse_word_indices(start_end_word_index)
 
129
  words = '_'.join(norm_sent.split(' ')[s_ix:e_ix+1])
130
 
131
+ align_paths = [(spk,pdict['aln']) for spk,pdict in path_key]
132
+ word_aligns = get_word_aligns(norm_sent, align_paths)
133
+
134
  data = defaultdict(list)
135
  align_data = defaultdict(list)
136
 
137
+ for spk, pdict in path_key:
138
+ word_al = word_aligns[spk]
139
  start_time = word_al[s_ix][1]
140
  end_time = word_al[e_ix][2]
141
 
142
  seg_aligns = word_al[s_ix:e_ix+1]
143
  seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
144
 
145
+ pitches = get_pitches(start_time, end_time, pdict['f0'])
146
 
147
+ rmses = get_rmse(start_time, end_time, pdict['wav'])
148
  rmses = downsample_rmse2pitch(rmses,len(pitches))
149
  #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
150
 
 
152
  rmses_cpy = np.array(deepcopy(rmses))
153
  d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
154
  #words = "-".join(word_combs)
155
+ data[f"{words}**{spk}"] = d
156
+ align_data[f"{words}**{spk}"] = seg_aligns
157
 
158
  return words, data, align_data
159
 
160
+
 
161
 
162
  def dtw_distance(x, y):
163
  """
 
169
 
170
 
171
 
 
172
  # recs is a sorted list of rec IDs
173
  # all recs/data contain the same words
174
  # rec1 and rec2 can be the same
 
184
  val2 = data[key2]
185
  dtw_dists.append((f"{rec1}**{rec2}", dtw_distance(val1, val2)))
 
 
187
  return dtw_dists
 
 
 
188
 
189
 
190
 
 
196
  return y_km, kmedoids
197
 
 
 
199
 
200
  def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):
201
+
202
 
203
  tts_info = []
204
  for label in set([c for r,c in clusters]):
 
223
  bad_cluster = tts_info[2][0]
224
  bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
225
 
226
+ #tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
227
+ tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
228
+ fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,cluster)
229
+ fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,cluster)
230
+
231
 
232
+ tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
233
+ fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,cluster)
234
+ fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,cluster)
235
 
236
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
237
 
238
 
239
 
240
+ def gp(d,s,x):
241
+ return os.path.join(d, f'{s}.{x}')
242
+
243
+ def gen_tts_paths(tdir,voices):
244
+ plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0')}) for v in voices]
245
+ return plist
246
+
247
+ def gen_h_paths(wdir,adir,f0dir,spks):
248
+ plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0')}) for s in spks]
249
+ return plist
250
+
251
 
252
  # since clustering strictly operates on X,
253
  # once reduce a duration metric down to pair-distances,
 
257
  # or can it not take that input in multidimensional space
258
  # then the 3 dists can still be averaged to flatten, if appropriately scaled
259
 
260
+ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_sent_dir, voices, start_end_word_index):
261
 
262
  h_spk_ids = sorted(h_spk_ids)
263
  nsents = len(h_spk_ids)
264
 
265
+ h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
266
+
267
+ words, h_data, h_seg_aligns = get_data(norm_sent,h_all_paths,start_end_word_index)
268
 
269
+ dtw_dists = pair_dists(h_data,words,h_spk_ids)
270
 
271
  kmedoids_cluster_dists = []
272
 
 
282
  groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]
283
 
284
 
285
+ tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
286
+ _, tts_data, tts_seg_aligns = get_data(norm_sent,tts_all_paths,start_end_word_index)
287
+
288
  for v in voices:
289
+ voice_data = tts_data[f"{words}**{v}"]
290
+ voice_align = tts_seg_aligns[f"{words}**{v}"]
291
+
292
+ #tts_data, tts_align = get_one_tts_data(tts_sent_dir,v,norm_sent,start_end_word_index)
293
 
294
  # match the data with a cluster -----
295
+ best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
296
 
297
  # only supports one voice at a time currently
298
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
 
366
 
367
 
368
 
369
+ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=None,tts_align=None,voice=None):
370
+ #(speech_data, tts_data, tts_align, words, seg_aligns, cluster_id, voice):
371
  colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
372
  cc = 0
373
  fig = plt.figure(figsize=(10, 5))
 
 
 
374
 
375
+ if feature.lower() in ['pitch','f0']:
376
+ fname = 'Pitch'
377
+ ffunc = lambda x: [p for p,e in x]
378
+ elif feature.lower() in ['energy', 'rmse']:
379
+ fname = 'Energy'
380
+ ffunc = lambda x: [e for p,e in x]
381
+ else:
382
+ print('problem with the figure')
383
+ return fig
384
+
385
+ plt.title(f"{words} - {fname} - Cluster {cluster_id}")
 
 
 
386
  for k,v in speech_data.items():
387
 
388
  spk = k.split('**')[1]
389
 
390
  word_times = seg_aligns[k]
 
 
392
 
393
+ feats = ffunc(v)
394
  # datapoint interval is 0.005 seconds
395
+ feat_xvals = [x*0.005 for x in range(len(feats))]
396
 
397
  # centre around the first word boundary -
398
  # if 3+ words, too bad.
399
  if len(word_times)>1:
400
  realign = np.mean([word_times[0][2],word_times[1][1]])
401
+ feat_xvals = [x - realign for x in feat_xvals]
402
  word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
403
  plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
404
 
 
407
  bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
408
  plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
409
 
410
+ plt.scatter(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
411
  cc += 1
412
  if cc >= len(colors):
413
  cc=0
414
 
415
+ if voice:
416
+ tfeats = [p for p,e in tts_data]
417
+ t_xvals = [x*0.005 for x in range(len(tfeats))]
418
 
419
+ if len(tts_align)>1:
420
+ realign = np.mean([tts_align[0][2],tts_align[1][1]])
421
+ t_xvals = [x - realign for x in t_xvals]
422
+ tts_align = [(w,s-realign,e-realign) for w,s,e in tts_align]
423
 
424
+ if len(tts_align)>2:
425
+ for i in range(1,len(tts_align)-1):
426
+ bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
427
+ plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
428
+ plt.scatter(t_xvals, tfeats, color="black", label=f"TTS {voice}")
429
 
430
 
431
  #plt.legend()
 
435
  return fig
436
 
 
 
439
 
440
 
441
 
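The main refactor in scripts/clusterprosody.py replaces the separate speaker-id and directory arguments with lists of (id, path-dict) pairs built by the new gen_h_paths()/gen_tts_paths() helpers, so human recordings and TTS output flow through the same get_data() path. A minimal sketch, not part of the commit, showing the structure those helpers produce (directory names are placeholders; the speaker ids come from the example in the file's own comments):

from scripts.clusterprosody import gen_h_paths, gen_tts_paths

# each entry is (speaker_or_voice_id, {'wav': ..., 'aln': ..., 'f0': ...})
h_paths = gen_h_paths('human/wav/', 'human/aln/', 'human/f0/', ['013823-0457777', '014226-0508808'])
t_paths = gen_tts_paths('tts/some_sentence_dir/', ['Alfur'])

print(h_paths[0])
# ('013823-0457777', {'wav': 'human/wav/013823-0457777.wav',
#                     'aln': 'human/aln/013823-0457777.tsv',
#                     'f0': 'human/f0/013823-0457777.f0'})

These lists are what get_data() and cluster() now consume in place of the old per-directory arguments.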
scripts/reaper2pass.py CHANGED
@@ -5,11 +5,13 @@ from pydub import AudioSegment
 import subprocess
 import os
 
+# 2 pass f0 estimation
 # ref. Hirst The analysis by synthesis of speech melody: from data to models
+# python wrap for gradio app
 
 
 # reaper requires wav file path input,
-# not audio data itself.
+# not audio data.
 # reaper does NOT require 16khz mono audio.
 def reaper_soundfile(sound_path, orig_filetype):
 
@@ -17,9 +19,9 @@ def reaper_soundfile(sound_path, orig_filetype):
     curdir = subprocess.run(["pwd"], capture_output=True, text=True)
     curdir = curdir.stdout.splitlines()[0]
     fname = sound_path.split('/')[-1].replace(orig_filetype,'')
-    tmp_path = f'{curdir}/REAPER_TMP/{fname}tmp.wav'
-    if not os.path.exists(f'{curdir}/REAPER_TMP'):
-        os.mkdir(f'{curdir}/REAPER_TMP')
+    tmp_path = f'{curdir}/files_tmp/{fname}tmp.wav'
+    if not os.path.exists(f'{curdir}/files_tmp'):
+        os.mkdir(f'{curdir}/files_tmp')
     aud_data.export(tmp_path, format="wav")
     wav_path = tmp_path
 
@@ -31,11 +33,8 @@ def reaper_soundfile(sound_path, orig_filetype):
 def get_reaper(wav_path, reaper_path, maxf0='700', minf0='50'):
 
     f0_data = subprocess.run([reaper_path, "-i", wav_path, '-f', '/dev/stdout', '-x', maxf0, '-m', minf0, '-a'],capture_output=True).stdout
-    #print('PLAIN:',f0_data)
     f0_data = f0_data.decode()
-    #print('DECODE-PITCH:',f0_data)
     f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
-    #print(f0_data)
     f0_data = [l.split(' ') for l in f0_data]
     f0_data = [l for l in f0_data if len(l) == 3] # the last line or 2 lines are other info, different format
     f0_data = [ [float(t), float(f), float(v)] for t,v,f in f0_data]
@@ -43,12 +42,9 @@ def get_reaper(wav_path, reaper_path, maxf0='700', minf0='50'):
     return f0_data
 
 
-# currently,
-# take the simplified list data from get_reaper_data,
-# with format Time F0Val only at times with existing F0Val,
-# and write that to a text file.
-# alternate would be letting reaper write its own files
-# instead of capturing the stdout...
+
+# save simplified data format from get_reaper
+# instead of reaper's original output
 def save_pitch(f0_data, save_path,hed=False):
     with open(save_path,'w') as handle:
         if hed:
@@ -60,7 +56,7 @@ def save_pitch(f0_data, save_path,hed=False):
 def estimate_pitch(sound_path,reaper_path = "REAPER/build/reaper"):
 
     orig_ftype = sound_path.split('.')[-1]
-    if orig_ftype == '.wav':
+    if orig_ftype == 'wav':
         wav_path = sound_path
     else:
         tmp_path = reaper_soundfile(sound_path, orig_ftype)
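For reference, the rest of the repo consumes this module as a pair of calls: estimate_pitch() runs the REAPER binary (the 2-pass estimation named in the new comment) and returns rows of [time, f0, voicing], and save_pitch() writes them to the .f0 files that scripts/clusterprosody.py reads back. A minimal usage sketch, not part of the commit (file paths are placeholders; REAPER must be built at the default path):

from scripts.reaper2pass import estimate_pitch, save_pitch

f0_data = estimate_pitch('recordings/example.wav', reaper_path='REAPER/build/reaper')
save_pitch(f0_data, 'f0/example.f0')   # rows of Time, F0, IsVoiced, the format get_pitches() expects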
scripts/runSQ.py CHANGED
@@ -1,4 +1,4 @@
1
- import os, unicodedata
2
  from scripts.ctcalign import aligner, wav16m
3
  from scripts.tapi import tiro
4
  from scripts.reaper2pass import estimate_pitch, save_pitch
@@ -30,23 +30,24 @@ def run(sentence, voices, start_end_word_ix):
30
 
31
 
32
  norm_sentence = snorm(sentence)
 
33
 
34
- meta = get_recordings(norm_sentence, corpus_meta)
35
- if meta:
36
- align_human(meta,speech_aligns,speech_dir,align_model_path)
37
- f0_human(meta, speech_f0, speech_dir)
38
- human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
39
 
40
  if voices:
 
 
41
  voices = [voices[0]] # TODO. now limit one voice at a time.
42
- tts_sample, tts_speechmarks = get_tts(sentence,voices,tts_dir)
43
- f0_tts(sentence, voices, tts_dir)
44
-
45
- score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
46
 
47
  # also stop forgetting duration.
48
 
49
- return tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
 
 
 
 
 
50
 
51
 
52
  def snorm(s):
@@ -54,6 +55,7 @@ def snorm(s):
54
  while ' ' in s:
55
  s = s.replace(' ', ' ')
56
  return s
 
57
 
58
 
59
  def create_temp_sent_list():
@@ -66,156 +68,155 @@ def create_temp_sent_list():
66
 
67
 
 
69
 
70
  # find all the recordings of a given sentence
71
  # listed in the corpus metadata.
 
72
  # sentence should be provided lowercase without punctuation
73
- # TODO something not fatal to interface if <10
74
- def get_recordings(sentence, corpusdb):
 
 
 
75
  with open(corpusdb,'r') as handle:
76
  meta = handle.read().splitlines()
77
  meta = [l.split('\t') for l in meta[1:]]
78
 
 
79
  # column index 4 of db is normalised sentence text
80
- smeta = [l for l in meta if l[4] == sentence]
81
 
82
- if len(smeta) < 10:
83
- if len(smeta) < 1:
84
  print('This sentence does not exist in the corpus')
85
  else:
86
  print('Under 10 copies of the sentence: skipping.')
87
  return []
88
  else:
89
- print(f'{len(smeta)} recordings of sentence <{sentence}>')
90
- return smeta
91
-
92
-
93
-
94
- # check if word alignments exist for a set of human speech recordings
95
- # if not, warn, and make them with ctcalign.
96
- def align_human(meta,align_dir,speech_dir,model_path):
97
-
98
- model_word_sep = '|'
99
- model_blank_tk = '[PAD]'
100
 
101
- no_align = []
102
 
103
- for rec in meta:
104
- apath = align_dir + rec[2].replace('.wav','.tsv')
105
- if not os.path.exists(apath):
106
- no_align.append(rec)
107
-
108
- if no_align:
109
- print(f'Need to run alignment for {len(no_align)} files')
110
- if not os.path.exists(align_dir):
111
  os.makedirs(align_dir)
 
 
112
 
113
- caligner = aligner(model_path,model_word_sep,model_blank_tk)
114
- for rec in no_align:
115
- #wav_path = f'{speech_dir}{rec[1]}/{rec[2]}'
116
- wav_path = f'{speech_dir}{rec[2]}'
117
- word_aln = caligner(wav16m(wav_path),rec[4],is_normed=True)
118
- apath = align_dir + rec[2].replace('.wav','.tsv')
119
- word_aln = [[str(x) for x in l] for l in word_aln]
120
- with open(apath,'w') as handle:
121
- handle.write(''.join(['\t'.join(l)+'\n' for l in word_aln]))
122
- else:
123
- print('All alignments existed')
124
-
125
-
126
 
127
- # check if f0s exist for all of those files.
128
- # if not, warn, and make them with TODO reaper
129
- def f0_human(meta, f0_dir, speech_dir, reaper_path = "REAPER/build/reaper"):
130
- no_f0 = []
131
-
132
  for rec in meta:
 
 
 
 
 
133
  fpath = f0_dir + rec[2].replace('.wav','.f0')
134
  if not os.path.exists(fpath):
135
- no_f0.append(rec)
136
-
137
- if no_f0:
138
- print(f'Need to estimate pitch for {len(no_f0)} recordings')
139
- if not os.path.exists(f0_dir):
140
- os.makedirs(f0_dir)
141
- for rec in no_f0:
142
- wav_path = f'{speech_dir}{rec[2]}'
143
  fpath = f0_dir + rec[2].replace('.wav','.f0')
144
- f0_data = estimate_pitch(wav_path, reaper_path)
145
  save_pitch(f0_data,fpath)
146
-
147
-
148
- #print('2ND PASS PITCHES OF', fpath)
149
- #print(f0_data)
150
-
151
-
152
- else:
153
- print('All speech pitch trackings existed')
154
 
155
 
156
 
157
 
158
- # check if the TTS wavs + align jsons exist for this sentence
159
- # if not, warn and make them with TAPI ******
160
- def get_tts(sentence,voices,ttsdir):
161
 
162
- # assume the first 64 chars of sentence are enough
163
- dpath = sentence.replace(' ','_')[:65]
164
-
165
- no_voice = []
166
 
167
- temp_sample_path = ''
 
 
 
168
 
169
  for v in voices:
170
- wpath = f'{ttsdir}{dpath}/{v}.wav'
171
- jpath = f'{ttsdir}{dpath}/{v}.json'
172
- if not (os.path.exists(wpath) and os.path.exists(jpath)):
173
- no_voice.append(v)
174
- if not temp_sample_path:
175
- temp_sample_path = wpath
176
- temp_json_path = jpath
177
 
178
- if no_voice:
179
- print(f'Need to generate TTS for {len(no_voice)} voices')
180
- if not os.path.exists(f'{ttsdir}{dpath}'):
181
- os.makedirs(f'{ttsdir}{dpath}')
182
- for v in voices:
183
- wf, af = tiro(sentence,v,save=f'{ttsdir}{dpath}/')
 
 
 
184
 
185
- else:
186
- print('TTS for all voices existed')
187
 
188
- return temp_sample_path, temp_json_path
 
 
 
 
 
189
 
190
 
191
 
192
- # check if the TTS f0s exist
193
- # if not warn + make
194
- # TODO collapse functions
195
- def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
196
-
197
- # assume the first 64 chars of sentence are enough
198
- dpath = sentence.replace(' ','_')[:65]
199
 
200
- no_f0 = []
 
201
 
202
- for v in voices:
203
- fpath = f'{ttsdir}{dpath}/{v}.f0'
204
- if not os.path.exists(fpath):
205
- no_f0.append(v)
206
-
207
-
208
- if no_f0:
209
- print(f'Need to estimate pitch for {len(no_f0)} voices')
210
- for v in voices:
211
- wav_path = f'{ttsdir}{dpath}/{v}.wav'
212
- fpath = f'{ttsdir}{dpath}/{v}.f0'
213
 
214
- f0_data = estimate_pitch(wav_path, reaper_path)
215
- save_pitch(f0_data,fpath)
 
 
 
216
 
217
- else:
218
- print('All TTS pitch trackings existed')
 
 
219
 
220
 
221
 
@@ -239,21 +240,20 @@ def localtest():
239
  reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'
240
 
241
  norm_sentence = snorm(sentence)
242
- meta = get_recordings(norm_sentence, corpus_meta)
243
- #print(meta)
244
- if meta:
245
- align_human(meta,speech_aligns,speech_dir,align_model_path)
246
- f0_human(meta, speech_f0, speech_dir, reaper_path = reaper_exc )
247
 
248
- human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
 
249
 
250
  if voices:
251
- voices = [voices[0]] # TODO. now limit one voice at a time.
252
- audio_sample, speechmarks = get_tts(sentence,voices,tts_dir)
253
- f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
254
 
 
 
 
255
 
256
- score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
 
257
 
258
 
259
 
 
1
+ import os, unicodedata, string, secrets
2
  from scripts.ctcalign import aligner, wav16m
3
  from scripts.tapi import tiro
4
  from scripts.reaper2pass import estimate_pitch, save_pitch
 
30
 
31
 
32
  norm_sentence = snorm(sentence)
33
+ sentence = sentence.replace('\t', ' ')
34
 
35
+ human_rec_ids = get_samromur_queries(norm_sentence, corpus_meta, speech_dir, speech_aligns, align_model_path, speech_f0)
 
 
 
 
36
 
37
  if voices:
38
+ temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
39
+
40
  voices = [voices[0]] # TODO. now limit one voice at a time.
41
+ score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
42
 
43
  # also stop forgetting duration.
44
 
45
+ return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
46
+
47
+
48
+
49
+
50
+
51
 
52
 
53
  def snorm(s):
 
55
  while ' ' in s:
56
  s = s.replace(' ', ' ')
57
  return s
58
+
59
 
60
 
61
  def create_temp_sent_list():
 
68
 
69
 
70
 
71
+ def align_file(wav_path, output_path, norm_sentence, word_aligner = None, model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"):
72
+
73
+ model_word_sep = '|'
74
+ model_blank_tk = '[PAD]'
75
+
76
+ if not word_aligner:
77
+ print('initiating forced alignment, can take some time...')
78
+ word_aligner = aligner(model_path,model_word_sep,model_blank_tk)
79
+
80
+ word_aln = word_aligner(wav16m(wav_path),norm_sentence,is_normed=True)
81
+ word_aln = [[str(x) for x in l] for l in word_aln]
82
+
83
+ with open(output_path,'w') as handle:
84
+ handle.write(''.join(['\t'.join(l)+'\n' for l in word_aln]))
85
+
86
+ return word_aligner
87
+
88
+
89
+
90
 
91
  # find all the recordings of a given sentence
92
  # listed in the corpus metadata.
93
+ # find or create their alignments and f0 tracking.
94
  # sentence should be provided lowercase without punctuation
95
+ # TODO something not fatal to interface if <10 --
96
+ # metadata file for SQ is already filtered.
97
+ # TODO handle audio that is not originally .wav
98
+ # not an issue for SQ
99
+ def get_samromur_queries(sentence, corpusdb, speech_dir, align_dir, align_model_path, f0_dir, reaper_path = "REAPER/build/reaper"):
100
  with open(corpusdb,'r') as handle:
101
  meta = handle.read().splitlines()
102
  meta = [l.split('\t') for l in meta[1:]]
103
 
104
+
105
  # column index 4 of db is normalised sentence text
106
+ meta = [l for l in meta if l[4] == sentence]
107
 
108
+ if len(meta) < 10:
109
+ if len(meta) < 1:
110
  print('This sentence does not exist in the corpus')
111
  else:
112
  print('Under 10 copies of the sentence: skipping.')
113
  return []
114
  else:
115
+ print(f'{len(meta)} recordings of sentence <{sentence}>')
116
+ #return meta
117
+
 
 
 
 
 
 
 
 
118
 
119
+ word_aligner = None
120
 
121
+ if not os.path.exists(align_dir):
 
 
 
 
 
 
 
122
  os.makedirs(align_dir)
123
+ if not os.path.exists(f0_dir):
124
+ os.makedirs(f0_dir)
 
 
126
 
 
 
 
 
 
127
  for rec in meta:
128
+ wpath = f'{speech_dir}{rec[2]}'
129
+ apath = align_dir + rec[2].replace('.wav','.tsv')
130
+ if not os.path.exists(apath):
131
+ word_aligner = align_file(wpath,apath, rec[4], word_aligner = word_aligner, model_path = align_model_path)
132
+
133
  fpath = f0_dir + rec[2].replace('.wav','.f0')
134
  if not os.path.exists(fpath):
 
 
 
 
 
 
 
 
135
  fpath = f0_dir + rec[2].replace('.wav','.f0')
136
+ f0_data = estimate_pitch(wpath, reaper_path)
137
  save_pitch(f0_data,fpath)
138
+
139
+
140
+ human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
141
+ return human_rec_ids
142
+
 
 
 
143
 
144
 
145
 
146
 
147
+ # check if the TTS wavs, alignments, f0 exist for this sentence
148
+ # if not, make them
149
+ def get_tts(sentence,voices,ttsdir,align_model_path,reaper_path = "REAPER/build/reaper"):
150
 
151
+ dpath = setup_tts_sent(sentence,ttsdir)
 
 
 
152
 
153
+
154
+ sample_paths = []
155
+
156
+ word_aligner = None
157
 
158
  for v in voices:
159
+ wpath = f'{dpath}/{v}.wav'
160
+ apath = f'{dpath}/{v}.tsv'
161
+ fpath = f'{dpath}/{v}.f0'
162
+
163
+ if not os.path.exists(wpath):
164
+ wf = tiro(sentence,v,save=f'{dpath}/')
 
165
 
166
+ if not os.path.exists(apath):
167
+ word_aligner = align_file(wpath, apath, snorm(sentence), word_aligner = word_aligner, model_path = align_model_path)
168
+
169
+
170
+ if not os.path.exists(fpath):
171
+ f0_data = estimate_pitch(wpath, reaper_path)
172
+ save_pitch(f0_data,fpath)
173
+
174
+ sample_paths.append(wpath)
175
 
 
 
176
 
177
+ # TEMP
178
+ # return for single last voice
179
+ temp_sample_path = wpath
180
+
181
+ return temp_sample_path, dpath
182
+
183
 
184
 
185
 
186
+ # find if dir for this sentence exists yet
187
+ # or make one, and record it.
188
+ # punctuation can affect synthesis
189
+ # so index by original sentence, not normed text
190
+ def setup_tts_sent(sentence,ttsdir,meta_path = 'tts_meta.tsv'):
 
 
191
 
192
+ if not os.path.exists(f'{ttsdir}'):
193
+ os.makedirs(f'{ttsdir}')
194
 
195
+ sentence = sentence.replace('\n',' ')
 
 
 
 
 
 
 
 
 
 
196
 
197
+ with open(f'{ttsdir}{meta_path}','a+') as handle:
198
+ tts_meta = handle.read().splitlines()
199
+ tts_meta = [l.split('\t') for l in tts_meta]
200
+
201
+ tts_meta = {sent:s_id for s_id,sent in tts_meta}
202
 
203
+ if sentence not in tts_meta.keys():
204
+ sent_id = sentence.replace(' ','_')[:33]
205
+ rand_id = ''.join(secrets.choice(string.ascii_lowercase + string.digits) for i in range(6))
206
+ while f'{sent_id}_{rand_id}' in tts_meta.values():
207
+ rand_id = ''.join(secrets.choice(string.ascii_lowercase + string.digits) for i in range(6))
208
+ sent_id = f'{sent_id}_{rand_id}'
209
+
210
+ handle.write(f'{sent_id}\t{sentence}\n')
211
+
212
+ else:
213
+ sent_id = tts_meta[sentence]
214
+
215
+ sent_dir = f'{ttsdir}{sent_id}'
216
+ if not os.path.exists(f'{sent_dir}'):
217
+ os.makedirs(f'{sent_dir}')
218
+ return sent_dir
219
+
220
 
221
 
222
 
 
240
  reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'
241
 
242
  norm_sentence = snorm(sentence)
 
 
 
 
 
243
 
244
+
245
+ human_rec_ids = get_samromur_queries(norm_sentence, corpus_meta, speech_dir, speech_aligns, align_model_path, speech_f0, reaper_path = reaper_exc)
246
 
247
  if voices:
248
+
249
+ one_audio_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path,reaper_path = reaper_exc)
 
250
 
251
+ voices = [voices[0]] # TODO. now limit one voice at a time.
252
+
253
+ score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
254
 
255
+
256
+
257
 
258
 
259
 
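The new setup_tts_sent() above indexes TTS output by the original punctuated sentence (punctuation can affect synthesis), mapping it to a directory name built from the first 33 characters of the sentence plus a random 6-character suffix, and records the mapping in tts_meta.tsv so a repeated sentence reuses its directory. A standalone illustration of the id scheme, not part of the commit (example sentence taken from the commented list in app.py; the suffix is random):

import secrets, string

sentence = 'Eru maríuhænur á Íslandi?'
slug = sentence.replace(' ', '_')[:33]
rand_id = ''.join(secrets.choice(string.ascii_lowercase + string.digits) for i in range(6))
print(f'{slug}_{rand_id}')   # e.g. Eru_maríuhænur_á_Íslandi?_x7k2qd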
scripts/tapi.py CHANGED
@@ -2,12 +2,11 @@ import json, os, requests, warnings, wave
 warnings.filterwarnings("ignore")
 
 
-
 # synthesise speech
-# save 16khz mono wav file
-# and word-level timestamps
-# return paths to wave and alignment files
-def tiro(text,voice,save='./'):
+# save 16khz mono wav file
+# return path to wave file
+# saving word alignment timestamps is deprecating
+def tiro(text,voice,save='./',tiroalign = False):
 
     # endpoint working 2023
     url = 'https://tts.tiro.is/v0/speech'
@@ -24,7 +23,21 @@ def tiro(text,voice,save='./'):
         "VoiceId": voice
     }
 
+    wname = save+voice+'.wav'
+    tts_data = requests.post(url, headers=headers, json=payload_tts, verify=False)
+
+    with wave.open(wname,'wb') as f:
+        f.setnchannels(1)
+        f.setframerate(16000)
+        f.setsampwidth(2)
+        f.writeframes(tts_data.content)
+
+
+
     # word time alignments
+    # SKIP
+    # tiro no longer intends to support this
+    # and only does support it for 2 voices anyway
     payload_aln = {
         "Engine": "standard",
         "LanguageCode": "is-IS",
@@ -33,37 +46,18 @@ def tiro(text,voice,save='./'):
         "Text": text,
         "VoiceId": voice
     }
-
-
-    tts_data = requests.post(url, headers=headers, json=payload_tts, verify=False)
-    aln_data = requests.post(url, headers=headers, json=payload_aln, verify=False)
-
-
-    #fname = save+text.replace(':','').replace('/','-')
-    #wname = fname+'.wav'
-    #aname = fname+'.json'
-    wname = save+voice+'.wav'
     aname = save+voice+'.json'
 
-    with wave.open(wname,'wb') as f:
-        f.setnchannels(1)
-        f.setframerate(16000)
-        f.setsampwidth(2)
-        f.writeframes(tts_data.content)
-
-    with open(aname,'w') as f:
-        f.write('{"alignments": [')
-        f.write(aln_data.content.decode().replace('}\n{','},\n {'))
-        f.write(']}')
-
-    return(os.path.abspath(wname),os.path.abspath(aname))
+    if tiroalign:
+        aln_data = requests.post(url, headers=headers, json=payload_aln, verify=False)
+        with open(aname,'w') as f:
+            f.write('{"alignments": [')
+            f.write(aln_data.content.decode().replace('}\n{','},\n {'))
+            f.write(']}')
 
 
+    #return(os.path.abspath(wname),os.path.abspath(aname))
+    return os.path.abspath(wname)
 
-
-    #sentence = "Hæ hæ hæ hæ! Ég heiti Gervimaður Finnland, en þú?"
-    #voice = "Alfur"
 
-    #wf, af = tiro(sentence,voice)
 
-    #print(wf, af)
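With Tiro's speech marks effectively dropped, the pipeline now pairs tiro() with the local CTC forced aligner exposed through scripts/runSQ.py. A hedged end-to-end sketch, not part of the commit (the output directory is a placeholder, and align_file() loads the wav2vec2 alignment model on first use, which can take some time):

import os
from scripts.tapi import tiro
from scripts.runSQ import align_file, snorm

sentence = 'Hæ hæ hæ hæ! Ég heiti Gervimaður Finnland, en þú?'   # example from the old tapi.py comments
os.makedirs('./tts_out/', exist_ok=True)

wav_path = tiro(sentence, 'Bjartur', save='./tts_out/')                   # 16 kHz mono wav
align_file(wav_path, wav_path.replace('.wav', '.tsv'), snorm(sentence))   # word-level .tsv alignment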