import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from nemo.collections.asr.models import EncDecMultiTaskModel
from transformers import VitsModel, VitsTokenizer, pipeline, set_seed

# Load the ASR model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
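# nvidia/canary-1b operates on 16 kHz mono audio; NeMo resamples file-based inputs on load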

# Update decoding parameters (beam_size=1 makes beam search effectively greedy)
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# Initialize the LLM pipeline (trust_remote_code is needed for Phi-3's custom model code)
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)

# Initialize TTS tokenizer and model
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
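# facebook/mms-tts-eng is a VITS model; it synthesizes speech at 16 kHz (model.config.sampling_rate)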

def transcribe_generate_and_speak(audio):
    sr, y = audio

    # Normalize microphone input to mono float32 in [-1, 1]
    y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Transcribe audio: writing a temporary WAV lets NeMo handle loading and resampling
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        sf.write(tmp.name, y, sr)
        asr_output = canary_model.transcribe([tmp.name], batch_size=1)

    # Generate text from the transcript (max_new_tokens caps the reply; 100 is an arbitrary choice)
    generated_text = generator(asr_output[0], max_new_tokens=100)[0]['generated_text']

    # Synthesize speech from the generated text
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)  # fixed seed so VITS's stochastic duration predictor is reproducible
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]

    # Write the waveform at the model's native sampling rate
    waveform_path = "output.wav"
    sf.write(waveform_path, waveform.numpy(), model.config.sampling_rate)
    return waveform_path

# Define the Gradio interface
demo = gr.Interface(
    fn=transcribe_generate_and_speak,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

# Launch the interface
demo.launch()