# HuggingFace Space demo: speech -> ASR (Canary) -> LLM (Phi-3) -> TTS (MMS).
import io

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from nemo.collections.asr.models import EncDecMultiTaskModel
# These transformers names were used below but never imported (NameError).
from transformers import VitsModel, VitsTokenizer, pipeline, set_seed

# Load the Canary multi-task ASR model.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Use greedy decoding (beam size 1) for faster transcription.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# LLM pipeline that generates a text reply from the transcript.
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)

# MMS English VITS text-to-speech tokenizer and model.
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
def transcribe_generate_and_speak(audio):
    """Transcribe microphone audio, generate an LLM reply, and speak it.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        ``(sample_rate, samples)`` pair as delivered by ``gr.Audio``.

    Returns
    -------
    str
        Path to the synthesized WAV file.
    """
    sr, y = audio
    y = y.astype(np.float32)
    # Peak-normalize to [-1, 1]. Guard against all-zero (silent) input,
    # which would otherwise cause a divide-by-zero and produce NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Transcribe audio with the Canary ASR model.
    asr_output = canary_model.transcribe([y], [sr])

    # Generate an LLM continuation/response from the transcript.
    generated_text = generator(asr_output[0])[0]['generated_text']

    # Synthesize speech from the generated text.
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)  # make the stochastic VITS synthesis reproducible
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]

    # NOTE(review): 16000 Hz is assumed for facebook/mms-tts-eng output;
    # confirm against model.config.sampling_rate.
    waveform_path = "output.wav"
    sf.write(waveform_path, waveform.numpy(), 16000, format='wav')
    return waveform_path
# Define the Gradio interface: microphone audio in, synthesized audio out.
audio_input = gr.Interface(
    transcribe_generate_and_speak,
    gr.Audio(sources=["microphone"], label="Speak Here"),
    "audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Launch the web app.
audio_input.launch()