catiR committed on
Commit
67be6d3
1 Parent(s): 971211e

audios, skip bad f0

Files changed (2)
  1. scripts/clusterprosody.py +40 -39
  2. scripts/runSQ.py +9 -7
scripts/clusterprosody.py CHANGED
@@ -48,30 +48,27 @@ def get_pitches(start_time, end_time, fpath):
     Returns an array of pitch values for a given speech.
     Reads from .f0 file of Time, F0, IsVoiced
     """
-
     with open(fpath) as f:
         lines = f.read().splitlines()
     lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
     pitches = []
 
-
     # find the mean of all pitches in the whole sentence
     mean = np.mean([line[1] for line in lines if line[2] == 1])
     # find the std of all pitches in the whole sentence
     std = np.std([line[1] for line in lines if line[2] == 1])
 
-    low = min([p for t,p,v in lines if v == 1]) - 1
-
-    for line in lines:
-        time, pitch, is_pitch = line
-
-        if start_time <= time <= end_time:
-            if is_pitch == 1:
-                pitches.append(z_score(pitch, mean, std))
-            else:
-                pitches.append(z_score(low, mean, std))
-                #pitches.append(-0.99)
-
+    tracked = [p for t,p,v in lines if v == 1]
+    if tracked:
+        low = min(tracked) - 1
+        for line in lines:
+            time, pitch, is_pitch = line
+            if start_time <= time <= end_time:
+                if is_pitch == 1:
+                    pitches.append(z_score(pitch, mean, std))
+                else:
+                    pitches.append(z_score(low, mean, std))
+                    #pitches.append(-0.99)
     return pitches
 
 
@@ -138,6 +135,7 @@ def get_data(norm_sent,path_key,start_end_word_index):
     data = defaultdict(list)
     align_data = defaultdict(list)
     playable_audio = {}
+    exclude = []
 
     for spk, pdict in path_key:
         word_al = word_aligns[spk]
@@ -146,22 +144,25 @@ def get_data(norm_sent,path_key,start_end_word_index):
 
         seg_aligns = word_al[s_ix:e_ix+1]
         seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
-
+
         pitches = get_pitches(start_time, end_time, pdict['f0'])
 
         rmses = get_rmse(start_time, end_time, pdict['wav'])
        rmses = downsample_rmse2pitch(rmses,len(pitches))
         #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
 
-        pitches_cpy = np.array(deepcopy(pitches))
-        rmses_cpy = np.array(deepcopy(rmses))
-        d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
-        #words = "-".join(word_combs)
-        data[f"{words}**{spk}"] = d
-        align_data[f"{words}**{spk}"] = seg_aligns
-        playable_audio[spk] = (pdict['wav'], start_time, end_time)
-
-    return words, data, align_data, playable_audio
+        if pitches and seg_aligns:
+            pitches_cpy = np.array(deepcopy(pitches))
+            rmses_cpy = np.array(deepcopy(rmses))
+            d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
+            #words = "-".join(word_combs)
+            data[f"{words}**{spk}"] = d
+            align_data[f"{words}**{spk}"] = seg_aligns
+            playable_audio[spk] = (pdict['play'], start_time, end_time)
+        else:
+            exclude.append(spk)
+
+    return words, data, align_data, exclude, playable_audio
 
 
 
@@ -253,11 +254,11 @@ def gp(d,s,x):
     return os.path.join(d, f'{s}.{x}')
 
 def gen_tts_paths(tdir,voices):
-    plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0')}) for v in voices]
+    plist = [(v, {'wav': gp(tdir,v,'wav'), 'aln': gp(tdir,v,'tsv'), 'f0': gp(tdir,v,'f0'), 'play': gp(tdir,v,'wav')}) for v in voices]
     return plist
 
-def gen_h_paths(wdir,adir,f0dir,spks):
-    plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0')}) for s in spks]
+def gen_h_paths(wdir,adir,f0dir,pldir,spks):
+    plist = [(s, {'wav': gp(wdir,s,'wav'), 'aln': gp(adir,s,'tsv'), 'f0': gp(f0dir,s,'f0'), 'play': gp(pldir,s,'wav')}) for s in spks]
     return plist
 
 
@@ -269,15 +270,16 @@ def gen_h_paths(wdir,adir,f0dir,spks):
 # or can it not take that input in multidimensional space
 # then the 3 dists can still be averaged to flatten, if appropriately scaled
 
-def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_sent_dir, voices, start_end_word_index):
+def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, h_play_dir, tts_sent_dir, voices, start_end_word_index):
 
     h_spk_ids = sorted(h_spk_ids)
-    nsents = len(h_spk_ids)
-
-    h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
-
-    words, h_data, h_seg_aligns, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
+    h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_play_dir,h_spk_ids)
 
+    words, h_data, h_seg_aligns, drop_spk, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
+    h_spk_ids = [spk for spk in h_spk_ids if spk not in drop_spk]
+    h_all_paths = [pinfo for pinfo in h_all_paths if pinfo[0] not in drop_spk]
+    nsents = len(h_spk_ids)
+
     dtw_dists = pair_dists(h_data,words,h_spk_ids)
 
     kmedoids_cluster_dists = []
@@ -295,7 +297,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
 
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
-    _, tts_data, tts_seg_aligns, tts_playable_segment = get_data(norm_sent,tts_all_paths,start_end_word_index)
+    _, tts_data, tts_seg_aligns, _, _ = get_data(norm_sent,tts_all_paths,start_end_word_index)
 
     for v in voices:
         voice_data = tts_data[f"{words}**{v}"]
@@ -333,14 +335,13 @@ def clusters_audio(clusters,audios):
 
     for rec in recs:
         html += f'<tr><td><audio controls id="{rec}">' #width="20%">
-        #html += f'<source src="{audios[rec][0]}#t={audios[rec][1]*60:.2f},{audios[rec][2]*60:.2f}" type="audio/wav">'
-        html += f'<source src="{audios[rec][0]}" type="audio/wav">'
+
+        html += f'<source src="{audios[rec][0]}#t={audios[rec][1]:.2f},{audios[rec][2]:.2f}" type="audio/wav">'
+        #html += f'<source src="{audios[rec][0]}" type="audio/wav">'
+
         html += '</audio></td>'
         html += f'<td>{rec}</td></tr>'
 
-        print(f'{audios[rec][0]}')
-        print(f'{audios[rec][0]}#t={audios[rec][1]*60:.2f},{audios[rec][2]*60:.2f}')
-
     html += '</tbody></table>'
     html += '</div>'
     #html += '<div style="height:2%;background:#e7fefc"></div>'
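
The substantive change in get_pitches and get_data is the guard against unusable pitch tracks: the old code took min() over the voiced frames unconditionally, which fails when an .f0 file contains no voiced frames, and every recording went into clustering regardless. Now a recording with an empty pitch track or empty segment alignment is collected in exclude and dropped by cluster() before distances are computed. A minimal standalone sketch of that guard, assuming the same Time/F0/IsVoiced file layout, with a hypothetical inline z-score standing in for the module's z_score helper:

import numpy as np

def get_pitches_sketch(start_time, end_time, fpath):
    # .f0 rows are: time, f0, is_voiced (1 = voiced frame)
    with open(fpath) as f:
        rows = [[float(x) for x in line.split()] for line in f.read().splitlines()]

    voiced = [f0 for t, f0, v in rows if v == 1]
    if not voiced:
        return []                      # bad f0 track: caller should exclude this recording

    mean, std = np.mean(voiced), np.std(voiced)
    low = min(voiced) - 1              # filler value for unvoiced frames inside the segment
    z = lambda x: (x - mean) / std     # hypothetical stand-in for z_score(x, mean, std)

    return [z(f0 if v == 1 else low)
            for t, f0, v in rows
            if start_time <= t <= end_time]
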
scripts/runSQ.py CHANGED
@@ -22,6 +22,7 @@ def run(sentence, voices, start_end_word_ix):
 
     corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
     speech_dir = '/home/user/app/human_data/audio/squeries/'
+    playable_dir = 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/'
     speech_aligns = '/home/user/app/human_data/align/squeries/'
     speech_f0 = '/home/user/app/human_data/f0/squeries/'
     align_model_path ="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
@@ -38,7 +39,7 @@ def run(sentence, voices, start_end_word_ix):
     temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
 
     # also stop forgetting duration.
 
@@ -249,11 +250,13 @@ def precompute(corpusdb, speech_dir, align_dir, align_model_path, f0_dir, reaper
 
     return max(toi,len(meta))
 
-
-
 
 def localtest():
-    sentence = 'En er hægt að taka orðalagið bókstaflega?'#'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
+
+    # TODO
+    # En hvað veldur þá þessari miklu fjölgun snjógæsa?
+    sentence= "Hann spyr: Hvað get ég vitað?"
+    #sentence = 'En er hægt að taka orðalagið bókstaflega?'#'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
     voices = ['Alfur_v2'] #,'Dilja']
     # make for now the interface allows max one voice
 
@@ -262,6 +265,7 @@ def localtest():
     locl = '/home/caitlinr/work/peval/pce/'
     corpus_meta = locl+'human_data/SQL1adult10s_metadata.tsv'
     speech_dir = locl+'human_data/audio/squeries/'
+    playable_dir = 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/'
     speech_aligns = locl+'human_data/align/squeries/'
     speech_f0 = locl+'human_data/f0/squeries/'
     align_model_path ="/home/caitlinr/work/models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
@@ -272,7 +276,6 @@ def localtest():
 
     norm_sentence = snorm(sentence)
 
-
     human_rec_ids = get_samromur_queries(norm_sentence, corpus_meta, speech_dir, speech_aligns, align_model_path, speech_f0, reaper_path = reaper_exc)
 
     if voices:
@@ -281,11 +284,10 @@ def localtest():
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
 
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, playable_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
-
 #localtest()
 # torch matplotlib librosa sklearn_extra pydub
 # env pclustr
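
Both run() and localtest() now pass a playable_dir pointing at the Space's resolve/main copy of the query audio; cluster() threads it through gen_h_paths as the 'play' path, so the embedded player fetches the file over HTTP rather than from a container-local path. clusters_audio then re-enables the #t=start,end media fragment on the source URL, with the times in plain seconds (the older commented-out variant multiplied by 60, which is not what the fragment syntax expects). A tiny sketch of the URL this produces, with a hypothetical recording id and segment bounds:

# Hypothetical recording id and segment bounds (seconds), to show the produced <source> URL.
playable_dir = 'https://huggingface.co/spaces/clr/pce/resolve/main/human_data/audio/squeries/'
rec, start_time, end_time = 'example_rec', 1.25, 3.8

src = f'{playable_dir}{rec}.wav#t={start_time:.2f},{end_time:.2f}'
html = f'<audio controls><source src="{src}" type="audio/wav"></audio>'
print(src)
# .../human_data/audio/squeries/example_rec.wav#t=1.25,3.80
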