KevinGeng committed on
Commit
8dacb0a
1 Parent(s): 403b1ea

Disable PPM

Browse files
Files changed (1) hide show
  1. app.py +10 -7
app.py CHANGED
@@ -23,8 +23,8 @@ transformation = jiwer.Compose([
23
 
24
  # WPM part
25
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
26
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
27
- phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
28
  # phoneme_model = pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
29
  class ChangeSampleRate(nn.Module):
30
  def __init__(self, input_rate: int, output_rate: int):
@@ -79,10 +79,13 @@ def calc_mos(audio_path, ref):
79
  MOS_fig = Naturalness_Plot(AVA_MOS)
80
 
81
  # Phonemes per minute (PPM)
82
- with torch.no_grad():
83
- logits = phoneme_model(out_wavs).logits
84
- phone_predicted_ids = torch.argmax(logits, dim=-1)
85
- phone_transcription = processor.batch_decode(phone_predicted_ids)
 
 
 
86
  lst_phonemes = phone_transcription[0].split(" ")
87
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
88
 
@@ -92,7 +95,7 @@ def calc_mos(audio_path, ref):
92
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
93
 
94
  # pdb.set_trace()
95
- return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm , f0_db_fig
96
 
97
 
98
  with open("local/description.md") as f:
 
23
 
24
  # WPM part
25
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
26
+ # processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
27
+ # phoneme_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
28
  # phoneme_model = pipeline(model="facebook/wav2vec2-xlsr-53-espeak-cv-ft")
29
  class ChangeSampleRate(nn.Module):
30
  def __init__(self, input_rate: int, output_rate: int):
 
79
  MOS_fig = Naturalness_Plot(AVA_MOS)
80
 
81
  # Phonemes per minute (PPM)
82
+ # with torch.no_grad():
83
+ # logits = phoneme_model(out_wavs).logits
84
+ # phone_predicted_ids = torch.argmax(logits, dim=-1)
85
+ # phone_transcription = processor.batch_decode(phone_predicted_ids)
86
+
87
+ # Disable PPM for now
88
+ phone_transcription = None
89
  lst_phonemes = phone_transcription[0].split(" ")
90
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
91
 
 
95
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
96
 
97
  # pdb.set_trace()
98
+ return AVA_MOS, MOS_fig, INTELI_score, INT_fig, trans, phone_transcription, ppm, f0_db_fig
99
 
100
 
101
  with open("local/description.md") as f: