"""Gradio demo for the HKU Canteen voice assistant.

The page offers a chat window that echoes the user's message, plus a
microphone input whose recording is transcribed with the NVIDIA Parakeet CTC
model (via NeMo) and placed into the text box.
"""
import importlib
import os
import subprocess
import sys
import tempfile
from typing import List, Optional, Tuple

try:
    import nemo.collections.asr as nemo_asr
except ImportError:
    # NeMo is not available in the base image, so install it on first start.
    # Use the running interpreter's pip with an argv list (no shell) so the
    # package lands in the environment that is actually executing this file.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "nemo_toolkit[all]"],
        check=True,
    )
    # Make the import system notice the freshly installed packages.
    importlib.invalidate_caches()
    import nemo.collections.asr as nemo_asr

from transformers import pipeline  # noqa: F401  (not used here; kept from the original file)
import numpy as np
import gradio as gr
import librosa
from scipy.io.wavfile import write

# Pretrained checkpoint used for speech recognition.
ASR_MODEL_NAME = "nvidia/parakeet-ctc-0.6b"
# Sample rate (Hz) the recording is resampled to before transcription.
TARGET_SAMPLE_RATE = 16000


def respond(message: str, chat_history: List) -> Tuple[str, List]:
    """Echo *message* back into the chat.

    Args:
        message: Text currently in the input box.
        chat_history: List of ``(user, bot)`` pairs shown by the chatbot;
            it is extended in place.

    Returns:
        An empty string (clears the text box) and the updated history.
    """
    # Do not add empty chat rows when the user submits a blank message.
    if not message or not message.strip():
        return "", chat_history
    bot_message = message
    chat_history.append((message, bot_message))
    return "", chat_history


def _to_mono_float32(samples) -> np.ndarray:
    """Return *samples* as a 1-D float32 array scaled to roughly [-1, 1]."""
    data = np.asarray(samples)
    if np.issubdtype(data.dtype, np.signedinteger):
        # Integer PCM: divide by the dtype's full scale (32768 for int16).
        full_scale = -float(np.iinfo(data.dtype).min)
        data = data.astype(np.float32) / full_scale
    else:
        data = data.astype(np.float32)
    if data.ndim > 1:
        # Multi-channel audio is laid out as (samples, channels); average the
        # channels so the time axis is the only (and last) axis.
        data = data.mean(axis=1)
    return data


def transcribe(audio: Optional[Tuple[int, np.ndarray]]):
    """Transcribe a microphone recording to text.

    Args:
        audio: ``(sample_rate, samples)`` as delivered by the microphone
            component, or ``None`` when there is no recording.

    Returns:
        The transcription of the recording, or ``""`` when there is nothing
        to transcribe.
    """
    if audio is None:
        return ""
    sr, y = audio
    mono = _to_mono_float32(y)
    if mono.size == 0:
        return ""

    resampled_audio = librosa.resample(
        y=mono, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE
    )

    # Write to a per-call temporary directory so that simultaneous requests
    # never overwrite each other's audio file; it is removed on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = os.path.join(tmp_dir, "resampled_audio.wav")
        write(audio_path, TARGET_SAMPLE_RATE, resampled_audio)
        result = asr_model.transcribe([audio_path])

    first = result[0]
    # NOTE(review): depending on the NeMo version the entries are either plain
    # strings or hypothesis objects carrying a ``text`` attribute - confirm
    # against the installed release.
    return getattr(first, "text", first)


asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=ASR_MODEL_NAME)

with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("# HKU Canteen VA")
        va = gr.Chatbot(container=False)

    with gr.Row():  # text input
        text_input = gr.Textbox(
            placeholder="Ask me anything...", container=False, scale=1
        )
        submit_btn = gr.Button("Submit", scale=0)

    with gr.Row():  # audio input
        recording = gr.Microphone(show_download_button=False, container=False)

    with gr.Row():  # button toolbar
        clear = gr.ClearButton([text_input, va])

    text_input.submit(respond, [text_input, va], [text_input, va], queue=False)
    submit_btn.click(respond, [text_input, va], [text_input, va], queue=False)
    # The transcription is only placed into the text box so the user can
    # review it before sending. To send it automatically, chain
    # ``.then(respond, [text_input, va], [text_input, va], queue=False)``
    # onto the call below.
    recording.stop_recording(transcribe, [recording], [text_input])

if __name__ == "__main__":
    demo.launch()