"""HKU Canteen VA — Gradio chat demo with ASR-based input.

Loads an NVIDIA NeMo Parakeet CTC model at startup, runs a smoke-test
transcription on a sample wav, and serves a simple echo chatbot UI.
"""

import nemo.collections.asr as nemo_asr
import numpy as np
import gradio as gr

from transformers import pipeline


def respond(message, chat_history):
    """Placeholder chat handler: echoes the user's message back.

    Args:
        message: the text the user submitted.
        chat_history: list of (user, bot) tuples maintained by gr.Chatbot.

    Returns:
        ("", updated_history) — empty string clears the input textbox.
    """
    bot_message = message  # TODO: replace echo with a real VA backend
    chat_history.append((message, bot_message))
    return "", chat_history


def transcribe(audio):
    """Transcribe a Gradio microphone recording to text.

    Args:
        audio: (sample_rate, samples) tuple as produced by gr.Microphone.

    Returns:
        The recognized text.
    """
    sr, y = audio
    y = y.astype(np.float32)
    # Peak-normalize, guarding against all-zero (silent) input which
    # would otherwise divide by zero and produce NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    # NOTE(review): this dict-based call is the Hugging Face `pipeline`
    # API (see the commented-out whisper setup below). The active
    # `asr_model` is a NeMo EncDecCTCBPEModel, which does NOT accept this
    # input — re-enabling the microphone path requires switching this to
    # the NeMo transcribe API (or back to the HF pipeline). TODO confirm.
    result = asr_model({"sampling_rate": sr, "raw": y})["text"]
    return result


# Alternative HF-pipeline backend, kept for reference:
# asr_model_id = "openai/whisper-small.en"
# asr_model = pipeline("automatic-speech-recognition", model=asr_model_id)

# NeMo Parakeet CTC model; downloaded/loaded eagerly at import time.
asr_model = nemo_asr.models.EncDecCTCBPEModel.from_pretrained(
    model_name="nvidia/parakeet-ctc-0.6b"
)
# Smoke-test transcription shown in the UI header below.
text = asr_model.transcribe(["./Samples/Sample_audios/test.wav"])

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown(
            """
            # HKU Canteen VA
            """)
        gr.Markdown(
            f"{text}")
        va = gr.Chatbot(container=False)

        with gr.Row():  # text input
            text_input = gr.Textbox(placeholder="Ask me anything...", container=False, scale=1)
            submit_btn = gr.Button("Submit", scale=0)

        # with gr.Row():  # audio input
        #     recording = gr.Microphone(show_download_button=False, container=False)

        with gr.Row():  # button toolbar
            clear = gr.ClearButton([text_input, va])

    # Both Enter-in-textbox and the Submit button route to respond().
    text_input.submit(respond, [text_input, va], [text_input, va], queue=False)
    submit_btn.click(respond, [text_input, va], [text_input, va], queue=False)
    # recording.stop_recording(transcribe, [recording], [text_input]).then(respond, [text_input, va], [text_input, va], queue=False)

if __name__ == "__main__":
    demo.launch()