"""Gradio demo: automatic speech recognition with Vosk.

Records audio from the microphone, converts it to 16 kHz mono s16le PCM
with ffmpeg, and transcribes it with a per-language Vosk model.  Results
(and errors) are accumulated in Gradio "state" and rendered as HTML.
"""
import json
import logging
import subprocess
import sys

import gradio as gr
import vosk

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Vosk model id to download/load for each supported language.
LARGE_MODEL_BY_LANGUAGE = {
    "Russian": {"model_id": "vosk-model-ru-0.42"},
    "Chinese": {"model_id": "vosk-model-cn-0.22"},
    "English": {"model_id": "vosk-model-en-us-0.22"},
    "French": {"model_id": "vosk-model-fr-0.22"},
    "German": {"model_id": "vosk-model-de-0.22"},
    "Italian": {"model_id": "vosk-model-it-0.22"},
    "Japanese": {"model_id": "vosk-model-ja-0.22"},
    "Hindi": {"model_id": "vosk-model-hi-0.22"},
    "Persian": {"model_id": "vosk-model-fa-0.5"},
    "Uzbek": {"model_id": "vosk-model-small-uz-0.22"},
}

LANGUAGES = sorted(LARGE_MODEL_BY_LANGUAGE.keys())

# Loaded vosk.Model instances keyed by model_id; models are large and slow
# to load, so each is created at most once per process.
CACHED_MODELS_BY_ID = {}

# Sample rate the recognizer expects; ffmpeg resamples the input to match.
SAMPLE_RATE = 16000


def asr(model, input_file):
    """Transcribe the audio file at *input_file* using *model*.

    ffmpeg decodes/resamples the file to 16 kHz mono raw PCM on stdout,
    which is streamed into a KaldiRecognizer in 4000-byte chunks.

    Returns the full transcription as a single space-joined string.
    """
    rec = vosk.KaldiRecognizer(model, SAMPLE_RATE)
    results = []
    # Pass the command as an argument list (shell=False): the original
    # f-string + .split() broke on paths containing spaces, which Gradio
    # temp file paths may contain.
    process = subprocess.Popen(
        [
            "ffmpeg",
            "-loglevel", "quiet",
            "-i", input_file,
            "-ar", str(SAMPLE_RATE),
            "-ac", "1",
            "-f", "s16le",
            "-",
        ],
        stdout=subprocess.PIPE,
    )
    try:
        while True:
            data = process.stdout.read(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                jres = json.loads(rec.Result())
                results.append(jres["text"])
        # Flush whatever audio remains buffered in the recognizer.
        jres = json.loads(rec.FinalResult())
        results.append(jres["text"])
    finally:
        # Close the pipe and reap the child so no fd/zombie is leaked,
        # even if the recognizer raises mid-stream.
        process.stdout.close()
        process.wait()
    return " ".join(results)


def _render_history(history):
    """Render the accumulated results as HTML for the output pane.

    The div class names match the CSS passed to gr.Interface below
    (.result, .result_item, .result_item_error, .result_item_success).
    """
    html_output = "<div class='result'>"
    for item in history:
        if item["error_message"] is not None:
            html_output += (
                "<div class='result_item result_item_error'>"
                f"{item['error_message']}</div>"
            )
        else:
            html_output += "<div class='result_item result_item_success'>"
            html_output += f'{item["transcription"]}'
            html_output += "</div>"
    html_output += "</div>"
    return html_output


def run(input_file, language, history):
    """Gradio callback: transcribe *input_file* in *language*.

    *history* is the list of prior result dicts carried in the "state"
    component (None on the first call).  Appends either a transcription
    record or an error record, then returns (html, history).
    """
    logger.info(f"Running ASR for {language} for {input_file}")
    history = history or []
    model = LARGE_MODEL_BY_LANGUAGE.get(language, None)
    if model is None:
        history.append(
            {"error_message": f"Failed to find a model for {language} language :("}
        )
    elif input_file is None:
        history.append({"error_message": "Record input audio first"})
    else:
        model_instance = CACHED_MODELS_BY_ID.get(model["model_id"], None)
        if model_instance is None:
            # First use of this language: vosk downloads/loads the model
            # by name; cache it so later requests are fast.
            model_instance = vosk.Model(model_name=model["model_id"])
            CACHED_MODELS_BY_ID[model["model_id"]] = model_instance
        transcription = asr(model_instance, input_file)
        logger.info(f"Transcription for {input_file}: {transcription}")
        history.append(
            {
                "model_id": model["model_id"],
                "language": language,
                "transcription": transcription,
                "error_message": None,
            }
        )
    return _render_history(history), history


gr.Interface(
    run,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something..."),
        gr.inputs.Radio(label="Language", choices=LANGUAGES),
        "state",
    ],
    outputs=[
        gr.outputs.HTML(label="Outputs"),
        "state",
    ],
    title="Automatic Speech Recognition",
    description="",
    css="""
    .result {display:flex;flex-direction:column}
    .result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
    .result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
    .result_item_error {background-color:#ff7070;color:white;align-self:start}
    """,
    allow_flagging="never",
    theme="default",
).launch(enable_queue=True)