import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from nemo.collections.asr.models import EncDecMultiTaskModel
from transformers import VitsModel, VitsTokenizer, pipeline, set_seed

# Load the ASR model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Update decoding parameters (beam size 1 = greedy decoding)
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# Initialize the LLM pipeline
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)

# Initialize the TTS tokenizer and model
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model = VitsModel.from_pretrained("facebook/mms-tts-eng")

def transcribe_generate_and_speak(audio):
    sr, y = audio
    # Gradio may deliver stereo; downmix to mono, then peak-normalize
    # (guarding against an all-silence clip to avoid division by zero)
    y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Transcribe the audio. transcribe() takes file paths (its second
    # positional argument is batch_size, not a sample rate), so write the
    # clip to a temporary WAV file; NeMo resamples it to the model's
    # expected 16 kHz rate when loading.
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        sf.write(tmp.name, y, sr)
        asr_text = canary_model.transcribe([tmp.name])[0]

    # Generate text based on the ASR transcript
    generated_text = generator(asr_text, max_new_tokens=100)[0]['generated_text']

    # Generate audio from the text
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)  # VITS uses a stochastic duration predictor; seed for reproducibility
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]

    waveform_path = "output.wav"
    sf.write(waveform_path, waveform.numpy(), model.config.sampling_rate)

    return waveform_path

# Define the Gradio interface
demo = gr.Interface(
    transcribe_generate_and_speak,
    gr.Audio(sources=["microphone"], label="Speak Here"),
    "audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Launch the interface
demo.launch()
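# Usage sketch (assumptions: the package names below are the usual PyPI ones,
# and the large checkpoints benefit from a GPU; pin versions as needed):
#   pip install gradio soundfile torch transformers "nemo_toolkit[asr]"
# Running the script starts a local Gradio server (by default at
# http://127.0.0.1:7860); speak into the microphone widget, and the app plays
# back the synthesized response.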