Commit 67be6d3 by catiR
Parent(s): 971211e

    audios, skip bad f0

Files changed:
- scripts/clusterprosody.py (+40 -39)
- scripts/runSQ.py (+9 -7)
scripts/clusterprosody.py
CHANGED
@@ -48,30 +48,27 @@ def get_pitches(start_time, end_time, fpath):
     Returns an array of pitch values for a given speech.
     Reads from .f0 file of Time, F0, IsVoiced
     """
-
     with open(fpath) as f:
         lines = f.read().splitlines()
     lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
     pitches = []
 
-
     # find the mean of all pitches in the whole sentence
     mean = np.mean([line[1] for line in lines if line[2] == 1])
     # find the std of all pitches in the whole sentence
     std = np.std([line[1] for line in lines if line[2] == 1])
 
-
-
-
-
-
-
-
-
-
-
-
-
+    tracked = [p for t,p,v in lines if v == 1]
+    if tracked:
+        low = min(tracked) - 1
+        for line in lines:
+            time, pitch, is_pitch = line
+            if start_time <= time <= end_time:
+                if is_pitch == 1:
+                    pitches.append(z_score(pitch, mean, std))
+                else:
+                    pitches.append(z_score(low, mean, std))
+                    #pitches.append(-0.99)
     return pitches
 
 
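Note: voiced frames (IsVoiced == 1) are z-scored against the whole-sentence mean and std, and unvoiced frames are filled with a value pinned just below the lowest tracked pitch, keeping the contour continuous. The new `if tracked:` guard is the "skip bad f0" of the commit message: a file with no voiced frames now yields an empty pitch list (which get_data treats as grounds for exclusion below) rather than calling min() on an empty sequence. The z_score helper is defined elsewhere in this module; a minimal sketch consistent with how it is called here, as an assumption rather than the module's actual definition:

    def z_score(x, mean, std):
        # assumed helper: standard score of x against the sentence-level statistics
        return (x - mean) / std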
@@ -138,6 +135,7 @@ def get_data(norm_sent,path_key,start_end_word_index):
     data = defaultdict(list)
     align_data = defaultdict(list)
     playable_audio = {}
+    exclude = []
 
     for spk, pdict in path_key:
         word_al = word_aligns[spk]
@@ -146,22 +144,25 @@ def get_data(norm_sent,path_key,start_end_word_index):
 
         seg_aligns = word_al[s_ix:e_ix+1]
         seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
-
+
         pitches = get_pitches(start_time, end_time, pdict['f0'])
 
         rmses = get_rmse(start_time, end_time, pdict['wav'])
         rmses = downsample_rmse2pitch(rmses,len(pitches))
         #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
 
-
-
-
-
-
-
-
-
-
+        if pitches and seg_aligns:
+            pitches_cpy = np.array(deepcopy(pitches))
+            rmses_cpy = np.array(deepcopy(rmses))
+            d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
+            #words = "-".join(word_combs)
+            data[f"{words}**{spk}"] = d
+            align_data[f"{words}**{spk}"] = seg_aligns
+            playable_audio[spk] = (pdict['play'], start_time, end_time)
+        else:
+            exclude.append(spk)
+
+    return words, data, align_data, exclude, playable_audio
 
 
 
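The widened return tuple is how the exclusion propagates: any speaker whose segment produces no pitch values or no alignments lands in exclude instead of contributing an empty feature vector to DTW. Callers unpack the 5-tuple and filter their speaker lists, as cluster() does below:

    words, data, align_data, exclude, playable_audio = get_data(norm_sent, all_paths, start_end_word_index)
    spk_ids = [spk for spk in spk_ids if spk not in exclude]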
@@ -253,11 +254,11 @@ def gp(d,s,x):
     return os.path.join(d, f'{s}.{x}')
 
 def gen_tts_paths(tdir,voices):
-    plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0')}) for v in voices]
+    plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0'), 'play': gp(tdir,v,'wav')}) for v in voices]
     return plist
 
-def gen_h_paths(wdir,adir,f0dir,spks):
-    plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0')}) for s in spks]
+def gen_h_paths(wdir,adir,f0dir,pldir,spks):
+    plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0'), 'play': gp(pldir,s,'wav')}) for s in spks]
     return plist
 
 
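Both path builders now carry a 'play' entry alongside 'wav', 'aln' and 'f0': for TTS it is the same local wav, while for human recordings it can point at a separate base (in practice a URL, see runSQ.py below). An illustration with hypothetical arguments, relying only on gp() joining base and '{id}.{ext}':

    >>> gen_h_paths('wav/', 'aln/', 'f0/', 'https://example.org/audio/', ['spk1'])
    [('spk1', {'wav': 'wav/spk1.wav', 'aln': 'aln/spk1.tsv', 'f0': 'f0/spk1.f0', 'play': 'https://example.org/audio/spk1.wav'})]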
@@ -269,15 +270,16 @@ def gen_h_paths(wdir,adir,f0dir,spks):
 # or can it not take that input in multidimensional space
 # then the 3 dists can still be averaged to flatten, if appropriately scaled
 
-def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_sent_dir, voices, start_end_word_index):
+def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_play_dir, tts_sent_dir, voices, start_end_word_index):
 
     h_spk_ids = sorted(h_spk_ids)
-
-
-    h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
-
-    words, h_data, h_seg_aligns, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
+    h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_play_dir,h_spk_ids)
+
+    words, h_data, h_seg_aligns, drop_spk, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
+    h_spk_ids = [spk for spk in h_spk_ids if spk not in drop_spk]
+    h_all_paths = [pinfo for pinfo in h_all_paths if pinfo[0] not in drop_spk]
+    nsents = len(h_spk_ids)
+
     dtw_dists = pair_dists(h_data,words,h_spk_ids)
 
     kmedoids_cluster_dists = []
@@ -295,7 +297,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
 
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
-    _, tts_data, tts_seg_aligns,
+    _, tts_data, tts_seg_aligns, _, _ = get_data(norm_sent,tts_all_paths,start_end_word_index)
 
     for v in voices:
         voice_data = tts_data[f"{words}**{v}"]
@@ -333,14 +335,13 @@ def clusters_audio(clusters,audios):
 
         for rec in recs:
             html += f'<tr><td><audio controls id="{rec}">' #width="20%">
-
-            html += f'<source src="{audios[rec][0]}" type="audio/wav">'
+
+            html += f'<source src="{audios[rec][0]}#t={audios[rec][1]:.2f},{audios[rec][2]:.2f}" type="audio/wav">'
+            #html += f'<source src="{audios[rec][0]}" type="audio/wav">'
+
             html += '</audio></td>'
             html += f'<td>{rec}</td></tr>'
 
-        print(f'{audios[rec][0]}')
-        print(f'{audios[rec][0]}#t={audios[rec][1]*60:.2f},{audios[rec][2]*60:.2f}')
-
         html += '</tbody></table>'
         html += '</div>'
         #html += '<div style="height:2%;background:#e7fefc"></div>'
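The rewritten source tag appends a media-fragment suffix (#t=start,end, times in seconds) to the playable URL, so the browser itself restricts playback to the selected words; the start and end come from playable_audio, i.e. the word-alignment times stored by get_data. A sketch of the emitted value, with hypothetical numbers:

    rec_url, seg_start, seg_end = audios[rec]            # e.g. ('https://example.org/audio/spk1.wav', 1.52, 3.87)
    src = f'{rec_url}#t={seg_start:.2f},{seg_end:.2f}'
    # -> 'https://example.org/audio/spk1.wav#t=1.52,3.87'
    # a media-fragment-aware player starts at 1.52 s and stops at 3.87 s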
scripts/runSQ.py
CHANGED
@@ -22,6 +22,7 @@ def run(sentence, voices, start_end_word_ix):
 
     corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
     speech_dir = '/home/user/app/human_data/audio/squeries/'
+    playable_dir = 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/'
     speech_aligns = '/home/user/app/human_data/align/squeries/'
     speech_f0 = '/home/user/app/human_data/f0/squeries/'
     align_model_path ="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
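playable_dir points at the Space's resolve/main URL rather than a container path, so the <audio> elements emitted by clusters_audio() reference files a visitor's browser can actually fetch. Through gp(), each recording resolves to a direct URL (hypothetical recording ID):

    os.path.join(playable_dir, 'some_rec.wav')
    # -> 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/some_rec.wav'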
@@ -38,7 +39,7 @@ def run(sentence, voices, start_end_word_ix):
     temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
 
     # also stop forgetting duration.
 
@@ -249,11 +250,13 @@ def precompute(corpusdb, speech_dir, align_dir, align_model_path, f0_dir, reaper
 
     return max(toi,len(meta))
 
-
-
 
 def localtest():
-
+
+    # TODO
+    # En hvað veldur þá þessari miklu fjölgun snjógæsa?
+    sentence= "Hann spyr: Hvað get ég vitað?"
+    #sentence = 'En er hægt að taka orðalagið bókstaflega?'#'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
     voices = ['Alfur_v2'] #,'Dilja']
     # make for now the interface allows max one voice
 
@@ -262,6 +265,7 @@ def localtest():
     locl = '/home/caitlinr/work/peval/pce/'
     corpus_meta = locl+'human_data/SQL1adult10s_metadata.tsv'
     speech_dir = locl+'human_data/audio/squeries/'
+    playable_dir = 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/'
     speech_aligns = locl+'human_data/align/squeries/'
     speech_f0 = locl+'human_data/f0/squeries/'
     align_model_path ="/home/caitlinr/work/models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
@@ -272,7 +276,6 @@ def localtest():
 
     norm_sentence = snorm(sentence)
 
-
    human_rec_ids = get_samromur_queries(norm_sentence, corpus_meta, speech_dir, speech_aligns, align_model_path, speech_f0, reaper_path = reaper_exc)
 
    if voices:
@@ -281,11 +284,10 @@ def localtest():
 
         voices = [voices[0]] # TODO. now limit one voice at a time.
 
-        score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+        score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
-
 #localtest()
 # torch matplotlib librosa sklearn_extra pydub
 # env pclustr