Spaces:

mizoru
/

minority-asr

Sleeping

App Files Files Community

mizoru committed on May 22, 2022

Commit

2b10872

•

1 Parent(s): f67f7fd

Create app.py

Browse files

Files changed (1) hide show

app.py +62 -0

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import gradio as gr
+from transformers import pipeline, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
+MODELS = {
+    "Tatar": {"model_id": "sammy786/wav2vec2-xlsr-tatar", "has_lm": False},
+    "Chuvash": {"model_id": "sammy786/wav2vec2-xlsr-chuvash", "has_lm": False}
+    }
+CACHED_MODELS_BY_ID = {}
+LANGUAGES = sorted(MODELS.keys())
+def run(input_file, language, decoding_type, history):
+    #logger.info(f"Running ASR {language}-{model_size}-{decoding_type} for {input_file}")
+    model = MODELS.get(language, None)
+    if decoding_type == "LM" and not model["has_lm"]:
+        history.append({
+            "error_message": f"LM not available for {language} language :("
+        })
+    else:
+        # model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
+        model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
+        if model_instance is None:
+            model_instance = AutoModelForCTC.from_pretrained(model["model_id"])
+            CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
+        if decoding_type == "LM":
+            processor = Wav2Vec2ProcessorWithLM.from_pretrained(model["model_id"])
+            asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
+                           feature_extractor=processor.feature_extractor, decoder=processor.decoder)
+        else:
+            processor = Wav2Vec2Processor.from_pretrained(model["model_id"])
+            asr = pipeline("automatic-speech-recognition", model=model_instance, tokenizer=processor.tokenizer,
+                           feature_extractor=processor.feature_extractor, decoder=None)
+        transcription = asr(input_file, chunk_length_s=5, stride_length_s=1)["text"]
+    return transcription
+gr.Interface(
+    run,
+    inputs=[
+        gr.Audio(source="microphone", type="filepath", label="Record something..."),
+        gr.Radio(label="Language", choices=LANGUAGES),
+        gr.Radio(label="Decoding type", choices=["greedy", "LM"]),
+        # gr.inputs.Radio(label="Model size", choices=["300M", "1B"]),
+        "state"
+    ],
+    outputs=[
+        gr.TextBox
+    ],
+    allow_screenshot=False,
+    allow_flagging="never",
+    theme="grass"
+).launch(enable_queue=True)