Commit 67be6d3 by catiR
Parent(s): 971211e

    audios, skip bad f0

Files changed:
- scripts/clusterprosody.py (+40 -39)
- scripts/runSQ.py (+9 -7)
scripts/clusterprosody.py
CHANGED
@@ -48,30 +48,27 @@ def get_pitches(start_time, end_time, fpath):
     Returns an array of pitch values for a given speech.
     Reads from .f0 file of Time, F0, IsVoiced
     """
-
     with open(fpath) as f:
         lines = f.read().splitlines()
     lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
     pitches = []
 
-
     # find the mean of all pitches in the whole sentence
     mean = np.mean([line[1] for line in lines if line[2] == 1])
     # find the std of all pitches in the whole sentence
     std = np.std([line[1] for line in lines if line[2] == 1])
 
-
-
-
-
-
-
-
-
-
-
-
-
+    tracked = [p for t,p,v in lines if v == 1]
+    if tracked:
+        low = min(tracked) - 1
+        for line in lines:
+            time, pitch, is_pitch = line
+            if start_time <= time <= end_time:
+                if is_pitch == 1:
+                    pitches.append(z_score(pitch, mean, std))
+                else:
+                    pitches.append(z_score(low, mean, std))
+                    #pitches.append(-0.99)
     return pitches
 
 
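Note: voiced frames (IsVoiced == 1) are z-scored against the whole-sentence mean and std, and unvoiced frames are filled with a value pinned just below the lowest tracked pitch, keeping the contour continuous. The new `if tracked:` guard is the "skip bad f0" of the commit message: a file with no voiced frames now yields an empty pitch list (which get_data treats as grounds for exclusion below) rather than calling min() on an empty sequence. The z_score helper is defined elsewhere in this module; a minimal sketch consistent with how it is called here, as an assumption rather than the module's actual definition:

    def z_score(x, mean, std):
        # assumed helper: standard score of x against the sentence-level statistics
        return (x - mean) / std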
@@ -138,6 +135,7 @@ def get_data(norm_sent,path_key,start_end_word_index):
     data = defaultdict(list)
     align_data = defaultdict(list)
     playable_audio = {}
+    exclude = []
 
     for spk, pdict in path_key:
         word_al = word_aligns[spk]
@@ -146,22 +144,25 @@ def get_data(norm_sent,path_key,start_end_word_index):
 
         seg_aligns = word_al[s_ix:e_ix+1]
         seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
-
+
         pitches = get_pitches(start_time, end_time, pdict['f0'])
 
         rmses = get_rmse(start_time, end_time, pdict['wav'])
         rmses = downsample_rmse2pitch(rmses,len(pitches))
         #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
 
-
-
-
-
-
-
-
-
-
+        if pitches and seg_aligns:
+            pitches_cpy = np.array(deepcopy(pitches))
+            rmses_cpy = np.array(deepcopy(rmses))
+            d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
+            #words = "-".join(word_combs)
+            data[f"{words}**{spk}"] = d
+            align_data[f"{words}**{spk}"] = seg_aligns
+            playable_audio[spk] = (pdict['play'], start_time, end_time)
+        else:
+            exclude.append(spk)
+
+    return words, data, align_data, exclude, playable_audio
 
 
 
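The widened return tuple is how the exclusion propagates: any speaker whose segment produces no pitch values or no alignments lands in exclude instead of contributing an empty feature vector to DTW. Callers unpack the 5-tuple and filter their speaker lists, as cluster() does below:

    words, data, align_data, exclude, playable_audio = get_data(norm_sent, all_paths, start_end_word_index)
    spk_ids = [spk for spk in spk_ids if spk not in exclude]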
@@ -253,11 +254,11 @@ def gp(d,s,x):
     return os.path.join(d, f'{s}.{x}')
 
 def gen_tts_paths(tdir,voices):
-    plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0')}) for v in voices]
+    plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0'), 'play': gp(tdir,v,'wav')}) for v in voices]
     return plist
 
-def gen_h_paths(wdir,adir,f0dir,spks):
-    plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0')}) for s in spks]
+def gen_h_paths(wdir,adir,f0dir,pldir,spks):
+    plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0'), 'play': gp(pldir,s,'wav')}) for s in spks]
     return plist
 
 
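Both path builders now carry a 'play' entry alongside 'wav', 'aln' and 'f0': for TTS it is the same local wav, while for human recordings it can point at a separate base (in practice a URL, see runSQ.py below). An illustration with hypothetical arguments, relying only on gp() joining base and '{id}.{ext}':

    >>> gen_h_paths('wav/', 'aln/', 'f0/', 'https://example.org/audio/', ['spk1'])
    [('spk1', {'wav': 'wav/spk1.wav', 'aln': 'aln/spk1.tsv', 'f0': 'f0/spk1.f0', 'play': 'https://example.org/audio/spk1.wav'})]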
@@ -269,15 +270,16 @@ def gen_h_paths(wdir,adir,f0dir,spks):
 # or can it not take that input in multidimensional space
 # then the 3 dists can still be averaged to flatten, if appropriately scaled
 
-def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_sent_dir, voices, start_end_word_index):
+def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_play_dir, tts_sent_dir, voices, start_end_word_index):
 
     h_spk_ids = sorted(h_spk_ids)
-
-
-    h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
-
-    words, h_data, h_seg_aligns, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
+    h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_play_dir,h_spk_ids)
+
+    words, h_data, h_seg_aligns, drop_spk, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
+    h_spk_ids = [spk for spk in h_spk_ids if spk not in drop_spk]
+    h_all_paths = [pinfo for pinfo in h_all_paths if pinfo[0] not in drop_spk]
+    nsents = len(h_spk_ids)
+
     dtw_dists = pair_dists(h_data,words,h_spk_ids)
 
     kmedoids_cluster_dists = []
@@ -295,7 +297,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
 
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
-    _, tts_data, tts_seg_aligns,
+    _, tts_data, tts_seg_aligns, _, _ = get_data(norm_sent,tts_all_paths,start_end_word_index)
 
     for v in voices:
         voice_data = tts_data[f"{words}**{v}"]
@@ -333,14 +335,13 @@ def clusters_audio(clusters,audios):
 
         for rec in recs:
             html += f'<tr><td><audio controls id="{rec}">' #width="20%">
-
-            html += f'<source src="{audios[rec][0]}" type="audio/wav">'
+
+            html += f'<source src="{audios[rec][0]}#t={audios[rec][1]:.2f},{audios[rec][2]:.2f}" type="audio/wav">'
+            #html += f'<source src="{audios[rec][0]}" type="audio/wav">'
+
             html += '</audio></td>'
             html += f'<td>{rec}</td></tr>'
 
-        print(f'{audios[rec][0]}')
-        print(f'{audios[rec][0]}#t={audios[rec][1]*60:.2f},{audios[rec][2]*60:.2f}')
-
         html += '</tbody></table>'
         html += '</div>'
         #html += '<div style="height:2%;background:#e7fefc"></div>'
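The rewritten source tag appends a media-fragment suffix (#t=start,end, times in seconds) to the playable URL, so the browser itself restricts playback to the selected words; the start and end come from playable_audio, i.e. the word-alignment times stored by get_data. A sketch of the emitted value, with hypothetical numbers:

    rec_url, seg_start, seg_end = audios[rec]            # e.g. ('https://example.org/audio/spk1.wav', 1.52, 3.87)
    src = f'{rec_url}#t={seg_start:.2f},{seg_end:.2f}'
    # -> 'https://example.org/audio/spk1.wav#t=1.52,3.87'
    # a media-fragment-aware player starts at 1.52 s and stops at 3.87 s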
scripts/runSQ.py
CHANGED
@@ -22,6 +22,7 @@ def run(sentence, voices, start_end_word_ix):
 
     corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
     speech_dir = '/home/user/app/human_data/audio/squeries/'
+    playable_dir = 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/'
     speech_aligns = '/home/user/app/human_data/align/squeries/'
     speech_f0 = '/home/user/app/human_data/f0/squeries/'
     align_model_path ="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
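playable_dir points at the Space's resolve/main URL rather than a container path, so the <audio> elements emitted by clusters_audio() reference files a visitor's browser can actually fetch. Through gp(), each recording resolves to a direct URL (hypothetical recording ID):

    os.path.join(playable_dir, 'some_rec.wav')
    # -> 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/some_rec.wav'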
@@ -38,7 +39,7 @@ def run(sentence, voices, start_end_word_ix):
     temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
 
     # also stop forgetting duration.
 
@@ -249,11 +250,13 @@ def precompute(corpusdb, speech_dir, align_dir, align_model_path, f0_dir, reaper
 
     return max(toi,len(meta))
 
-
-
 
 def localtest():
-
+
+    # TODO
+    # En hvað veldur þá þessari miklu fjölgun snjógæsa?
+    sentence= "Hann spyr: Hvað get ég vitað?"
+    #sentence = 'En er hægt að taka orðalagið bókstaflega?'#'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
     voices = ['Alfur_v2'] #,'Dilja']
     # make for now the interface allows max one voice
 
@@ -262,6 +265,7 @@ def localtest():
     locl = '/home/caitlinr/work/peval/pce/'
     corpus_meta = locl+'human_data/SQL1adult10s_metadata.tsv'
     speech_dir = locl+'human_data/audio/squeries/'
+    playable_dir = 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/'
     speech_aligns = locl+'human_data/align/squeries/'
     speech_f0 = locl+'human_data/f0/squeries/'
     align_model_path ="/home/caitlinr/work/models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
@@ -272,7 +276,6 @@ def localtest():
 
     norm_sentence = snorm(sentence)
 
-
    human_rec_ids = get_samromur_queries(norm_sentence, corpus_meta, speech_dir, speech_aligns, align_model_path, speech_f0, reaper_path = reaper_exc)
 
    if voices:
@@ -281,11 +284,10 @@ def localtest():
 
         voices = [voices[0]] # TODO. now limit one voice at a time.
 
-        score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+        score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
-
 #localtest()
 # torch matplotlib librosa sklearn_extra pydub
 # env pclustr