"""Gradio demo: speech-to-text with NVIDIA NeMo followed by T5 grammar correction.

Running this file loads both models and launches a Gradio interface that
records from the microphone and shows the raw transcript next to a
grammar-corrected version of it.
"""

from typing import Optional, Tuple

import gradio as gr
import torch
from happytransformer import HappyTextToText, TTSettings
from nemo.collections.asr.models import ASRModel

# Always bind `device` (CPU fallback) so the name exists on hosts without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Speech-recognition model; placed explicitly on the selected device.
asr_model = ASRModel.from_pretrained(model_name="stt_en_citrinet_1024")
asr_model = asr_model.to(device)

# Grammar-correction model and its generation settings.
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
args = TTSettings(num_beams=5, min_length=1)


def transcribe(audio: Optional[str]) -> Tuple[str, str]:
    """Transcribe an audio file and grammar-correct the transcript.

    Speech to text uses Nvidia NeMo; the correction step uses the
    HappyTransformer T5 model configured above.

    Args:
        audio: Path to the recorded audio file (the Gradio input is declared
            with ``type="filepath"``). May be ``None`` or empty when the user
            submits without recording anything.

    Returns:
        A ``(transcript, corrected_transcript)`` tuple. Both are empty strings
        when no audio was supplied; the corrected part is empty when the
        transcript is blank.
    """
    # Nothing was recorded: avoid handing an invalid path to the ASR model.
    if not audio:
        return "", ""

    text = asr_model.transcribe(paths2audio_files=[audio])[0]

    # A blank transcript would send only the bare prefix to the grammar
    # model, so skip correction instead of showing unrelated generated text.
    if not text.strip():
        return text, ""

    # Add the prefix "grammar: " before each input
    correct = happy_tt.generate_text("grammar: " + text, args=args)
    return text, correct.text


demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs=[
        "textbox",
        "textbox",
    ],
)

# Launched at import time, exactly as the original script did.
demo.launch()