|
import gradio as gr |
|
from transformers import pipeline, VitsTokenizer, VitsModel, set_seed |
|
import numpy as np |
|
import torch |
|
import io |
|
import soundfile as sf |
|
|
|
|
|
# Speech-to-text: small S2T model trained on LibriSpeech (English read speech).
transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")




# Text generation: GPT-2 continues the transcribed utterance.
generator = pipeline("text-generation", model="gpt2")




# Text-to-speech: Meta MMS VITS model for English.
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")

model = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
|
|
def transcribe_and_generate_audio(audio):
    """Run the speech -> text -> text -> speech pipeline.

    Args:
        audio: Tuple ``(sample_rate, samples)`` as produced by ``gr.Audio``;
            ``samples`` is a numpy array, possibly 2-D (samples, channels)
            when the microphone records in stereo.

    Returns:
        str: Path to a WAV file containing the synthesized continuation.
    """
    sr, y = audio

    y = y.astype(np.float32)

    # Down-mix stereo to mono: the ASR pipeline's "raw" input expects a
    # 1-D waveform.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalize, guarding against silent (all-zero) input which
    # would otherwise produce a divide-by-zero and NaN samples.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Speech -> text.
    asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]

    # Continue the transcript with the language model.
    generated_text = generator(asr_output)[0]['generated_text']

    # Text -> speech with VITS; fixed seed makes the stochastic
    # synthesis deterministic across calls.
    inputs = tokenizer(text=generated_text, return_tensors="pt")
    set_seed(555)
    with torch.no_grad():
        outputs = model(**inputs)

    waveform = outputs.waveform[0]
    waveform_path = "output.wav"
    # Use the TTS model's own output rate (16 kHz for mms-tts-eng)
    # rather than a hard-coded constant.
    sf.write(waveform_path, waveform.numpy(), model.config.sampling_rate, format='wav')

    return waveform_path
|
|
|
|
|
# Wire the pipeline into a simple Gradio UI: record speech from the
# microphone, play back the model-generated continuation as audio.
demo = gr.Interface(
    fn=transcribe_and_generate_audio,
    inputs=gr.Audio(sources=["microphone"], label="Speak Here"),
    outputs="audio",
    title="ASR -> LLM -> TTS",
    description="Speak into the microphone and hear the generated audio.",
)

demo.launch()