Update app.py
app.py CHANGED
@@ -29,14 +29,15 @@ models_ru = {
 }
 
 models_en = {
-    name: onnx_asr.load_model(name
+    name: onnx_asr.load_model(name)
     for name in [
         "nemo-parakeet-ctc-0.6b",
         "nemo-parakeet-rnnt-0.6b",
+        "nemo-parakeet-tdt-0.6b-v2",
     ]
 }
 
-models_vad =
+models_vad = whisper | models_ru | models_en
 
 
 def recognize(audio: tuple[int, np.ndarray], models, language):
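The new `models_vad` line relies on the PEP 584 dict union operator (`|`, Python 3.9+), which merges the per-language registries left to right. A minimal sketch of the registry pattern, assuming the `whisper` and `models_ru` dicts (defined above this hunk and not shown in the diff) are built with the same comprehension as `models_en`:

```python
import onnx_asr

# Hypothetical reconstruction of the registries defined earlier in app.py;
# only models_en is visible in this hunk, the other two are assumptions.
whisper = {name: onnx_asr.load_model(name) for name in ["whisper-base"]}
models_ru = {
    name: onnx_asr.load_model(name)
    for name in [
        "gigaam-v2-ctc",
        "gigaam-v2-rnnt",
        "alphacep/vosk-model-ru",
        "alphacep/vosk-model-small-ru",
    ]
}
models_en = {
    name: onnx_asr.load_model(name)
    for name in [
        "nemo-parakeet-ctc-0.6b",
        "nemo-parakeet-rnnt-0.6b",
        "nemo-parakeet-tdt-0.6b-v2",
    ]
}

# PEP 584 dict union: right-hand operands win on duplicate keys, so every
# model stays addressable by its original name in the VAD tab's dropdown.
models_vad = whisper | models_ru | models_en
```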
@@ -44,7 +45,8 @@ def recognize(audio: tuple[int, np.ndarray], models, language):
         return None
 
     sample_rate, waveform = audio
-
+    length = waveform.shape[-1] / sample_rate
+    logger.debug("recognize: length %.3f, sample_rate %s, waveform.shape %s.", length, sample_rate, waveform.shape)
     try:
         waveform = waveform.astype(np.float32) / 2 ** (8 * waveform.itemsize - 1)
         if waveform.ndim == 2:
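The normalization line in this hunk converts whatever integer PCM Gradio hands over into float32 in [-1.0, 1.0): `waveform.itemsize` is the sample width in bytes, so the divisor `2 ** (8 * itemsize - 1)` equals 32768 for int16 and 2147483648 for int32. A quick worked example with hypothetical sample values:

```python
import numpy as np

# int16 samples: itemsize == 2, divisor == 2 ** 15 == 32768
pcm = np.array([0, 16384, -32768], dtype=np.int16)
waveform = pcm.astype(np.float32) / 2 ** (8 * pcm.itemsize - 1)
print(waveform)  # [ 0.   0.5 -1. ]
```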
@@ -52,6 +54,8 @@
 
     results = []
     for name, model in models.items():
+        if length > 20 and name == "alphacep/vosk-model-small-ru":
+            continue
         start = timer()
         result = model.recognize(waveform, sample_rate=sample_rate, language=language)
         time = timer() - start
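The guard added here uses the new `length` value to keep `alphacep/vosk-model-small-ru` out of the comparison for clips longer than 20 seconds; every other model is still timed individually. A sketch of the resulting loop, assuming `timer` is `timeit.default_timer` (the visible hunks do not show the import):

```python
from timeit import default_timer as timer

results = []
for name, model in models.items():
    # Skip the small Vosk model on long clips (> 20 s).
    if length > 20 and name == "alphacep/vosk-model-small-ru":
        continue
    start = timer()  # wall-clock timing of a single recognize() call
    result = model.recognize(waveform, sample_rate=sample_rate, language=language)
    results.append((name, result, timer() - start))
```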
@@ -95,7 +99,7 @@ def recognize_with_vad(audio: tuple[int, np.ndarray], name: str):
 
 
 with gr.Blocks() as recognize_short:
-    audio = gr.Audio(min_length=1, max_length=
+    audio = gr.Audio(min_length=1, max_length=30)
     with gr.Row():
         gr.ClearButton(audio)
         btn_ru = gr.Button("Recognize (ru)", variant="primary")
@@ -104,15 +108,28 @@ with gr.Blocks() as recognize_short:
     btn_ru.click(fn=recognize_ru, inputs=audio, outputs=output)
     btn_en.click(fn=recognize_en, inputs=audio, outputs=output)
 
+
 with gr.Blocks() as recognize_long:
     name = gr.Dropdown(models_vad.keys(), label="Model")
+    # lang = gr.Label()
     audio = gr.Audio(min_length=1, max_length=300)
     with gr.Row():
         gr.ClearButton(audio)
         btn = gr.Button("Recognize", variant="primary")
-    output = gr.TextArea(label="result")
+    output = gr.TextArea(label="result")
     btn.click(fn=recognize_with_vad, inputs=[audio, name], outputs=output)
 
+    def on_model_change(name: str):
+        if name in models_ru:
+            label = f"Model {name} support only Russian language"
+        elif name in models_en:
+            label = f"Model {name} support only English language"
+        else:
+            label = None
+        return gr.Audio(min_length=1, max_length=300, label=label)
+
+    name.change(on_model_change, inputs=name, outputs=audio)
+
 with gr.Blocks() as demo:
     gr.Markdown("""
     # ASR demo using onnx-asr
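The `on_model_change` handler added above uses Gradio's component-update pattern: an event callback can return a fresh component instance for its output, and Gradio applies the changed properties (here, the label) to the widget already on the page. A standalone sketch of the idea, with the model list shortened and the labels hypothetical:

```python
import gradio as gr

with gr.Blocks() as demo:
    name = gr.Dropdown(["gigaam-v2-ctc", "nemo-parakeet-ctc-0.6b"], label="Model")
    audio = gr.Audio(min_length=1, max_length=300)

    def on_model_change(name: str):
        # Returning a new gr.Audio re-renders the existing one with this label.
        label = "English-only model" if name.startswith("nemo-parakeet") else "Russian-only model"
        return gr.Audio(min_length=1, max_length=300, label=label)

    name.change(on_model_change, inputs=name, outputs=audio)
```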
@@ -122,22 +139,25 @@ with gr.Blocks() as demo:
     gr.TabbedInterface(
         [recognize_short, recognize_long],
         [
-            "Recognition of a short phrase (up to
+            "Recognition of a short phrase (up to 30 sec.)",
             "Recognition of a long phrase with VAD (up to 5 min.)",
         ],
     )
-    with gr.Accordion("Models used in this demo
+    with gr.Accordion("Models used in this demo:", open=False):
         gr.Markdown("""
-        ## ASR models
+        ## Russian ASR models
         * `gigaam-v2-ctc` - Sber GigaAM v2 CTC ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
         * `gigaam-v2-rnnt` - Sber GigaAM v2 RNN-T ([origin](https://github.com/salute-developers/GigaAM), [onnx](https://huggingface.co/istupakov/gigaam-v2-onnx))
         * `nemo-fastconformer-ru-ctc` - Nvidia FastConformer-Hybrid Large (ru) with CTC decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
         * `nemo-fastconformer-ru-rnnt` - Nvidia FastConformer-Hybrid Large (ru) with RNN-T decoder ([origin](https://huggingface.co/nvidia/stt_ru_fastconformer_hybrid_large_pc), [onnx](https://huggingface.co/istupakov/stt_ru_fastconformer_hybrid_large_pc_onnx))
-        * `nemo-parakeet-ctc-0.6b` - Nvidia Parakeet CTC 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-ctc-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-ctc-0.6b-onnx))
-        * `nemo-parakeet-rnnt-0.6b` - Nvidia Parakeet RNNT 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-rnnt-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-rnnt-0.6b-onnx))
         * `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
         * `alphacep/vosk-model-ru` - Alpha Cephei Vosk 0.54-ru ([origin](https://huggingface.co/alphacep/vosk-model-ru))
         * `alphacep/vosk-model-small-ru` - Alpha Cephei Vosk 0.52-small-ru ([origin](https://huggingface.co/alphacep/vosk-model-small-ru))
+        ## English ASR models
+        * `nemo-parakeet-ctc-0.6b` - Nvidia Parakeet CTC 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-ctc-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-ctc-0.6b-onnx))
+        * `nemo-parakeet-rnnt-0.6b` - Nvidia Parakeet RNNT 0.6B (en) ([origin](https://huggingface.co/nvidia/parakeet-rnnt-0.6b), [onnx](https://huggingface.co/istupakov/parakeet-rnnt-0.6b-onnx))
+        * `nemo-parakeet-tdt-0.6b-v2` - Nvidia Parakeet TDT 0.6B V2 (en) ([origin](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2), [onnx](https://huggingface.co/istupakov/parakeet-tdt-0.6b-v2-onnx))
+        * `whisper-base` - OpenAI Whisper Base exported with onnxruntime ([origin](https://huggingface.co/openai/whisper-base), [onnx](https://huggingface.co/istupakov/whisper-base-onnx))
         ## VAD models
         * `silero` - Silero VAD ([origin](https://github.com/snakers4/silero-vad), [onnx](https://huggingface.co/onnx-community/silero-vad))
         """)