KevinGeng commited on
Commit
4ea1ce4
1 Parent(s): f3b6079

Update app.py

Browse files

1. Support multi-channel input (average stereo down to mono)
2. Better recognition rate (fine-tuned Whisper model).

Files changed (1) hide show
  1. app.py +15 -11
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  from random import sample
3
  import gradio as gr
4
  import torchaudio
@@ -10,8 +9,12 @@ import jiwer
10
 
11
  # ASR part
12
  from transformers import pipeline
13
- p = pipeline("automatic-speech-recognition")
14
-
 
 
 
 
15
  # WER part
16
  transformation = jiwer.Compose([
17
  jiwer.ToLowerCase(),
@@ -44,7 +47,9 @@ class ChangeSampleRate(nn.Module):
44
  model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()
45
 
46
  def calc_mos(audio_path, ref):
47
- wav, sr = torchaudio.load(audio_path)
 
 
48
  osr = 16_000
49
  batch = wav.unsqueeze(0).repeat(10, 1, 1)
50
  csr = ChangeSampleRate(sr, osr)
@@ -73,6 +78,7 @@ def calc_mos(audio_path, ref):
73
 
74
  return predic_mos, trans, wer, phone_transcription, ppm
75
 
 
76
  description ="""
77
  MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
78
  This demo only accepts .wav format. Best at 16 kHz sampling rate.
@@ -86,15 +92,13 @@ Add WER interface.
86
 
87
  iface = gr.Interface(
88
  fn=calc_mos,
89
- inputs=[gr.Audio(source="microphone", type='filepath', label="Audio to evaluate"),
90
- gr.Textbox(value="Once upon a time there was a young rat named Arthur who couldn’t make up his mind.",
91
- placeholder="Input reference here",
92
- label="Reference")],
93
- outputs=[gr.Textbox(placeholder="Predicted MOS", label="Predicted MOS"),
94
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
95
- gr.Textbox(placeholder="Word Error Rate", label = "WER"),
96
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
97
- gr.Textbox(placeholder="Phonemes per minutes", label="PPM")],
98
  title="Laronix's Voice Quality Checking System Demo",
99
  description=description,
100
  allow_flagging="auto",
 
 
1
  from random import sample
2
  import gradio as gr
3
  import torchaudio
 
9
 
10
  # ASR part
11
  from transformers import pipeline
12
+ # p = pipeline("automatic-speech-recognition")
13
+ p = pipeline(
14
+ "automatic-speech-recognition",
15
+ model="KevinGeng/whipser_medium_en_PAL300_step25",
16
+ device=0,
17
+ )
18
  # WER part
19
  transformation = jiwer.Compose([
20
  jiwer.ToLowerCase(),
 
47
  model = lightning_module.BaselineLightningModule.load_from_checkpoint("epoch=3-step=7459.ckpt").eval()
48
 
49
  def calc_mos(audio_path, ref):
50
+ wav, sr = torchaudio.load(audio_path, channels_first=True)
51
+ if wav.shape[0] > 1:
52
+ wav = wav.mean(dim=0, keepdim=True) # Mono channel
53
  osr = 16_000
54
  batch = wav.unsqueeze(0).repeat(10, 1, 1)
55
  csr = ChangeSampleRate(sr, osr)
 
78
 
79
  return predic_mos, trans, wer, phone_transcription, ppm
80
 
81
+
82
  description ="""
83
  MOS prediction demo using UTMOS-strong w/o phoneme encoder model, which is trained on the main track dataset.
84
  This demo only accepts .wav format. Best at 16 kHz sampling rate.
 
92
 
93
  iface = gr.Interface(
94
  fn=calc_mos,
95
+ inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
96
+ gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
97
+ outputs=[gr.Textbox(placeholder="Naturalness evaluation, ranged 1 to 5, the higher the better.", label="Predicted MOS"),
 
 
98
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
99
+ gr.Textbox(placeholder="Word Error Rate: Only valid when Reference is given", label = "WER"),
100
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
101
+ gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
102
  title="Laronix's Voice Quality Checking System Demo",
103
  description=description,
104
  allow_flagging="auto",