|
import gradio as gr |
|
import torch |
|
from transformers import VitsModel, VitsTokenizer |
|
|
|
|
|
|
|
# Identifier passed to from_pretrained() for both the model and its tokenizer.
model_id = "facebook/mms-tts-eng"

# Loaded once at module level so synthesize_speech() reuses the same objects
# on every call instead of reloading them.
print("Loading facebook/mms-tts-eng model and tokenizer...")
model = VitsModel.from_pretrained(model_id)
tokenizer = VitsTokenizer.from_pretrained(model_id)
print("Model and tokenizer loaded successfully.")
|
|
|
|
|
def synthesize_speech(text):
    """Convert text to speech with the module-level TTS model.

    Args:
        text: The text to synthesize.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``sampling_rate`` is read
        from ``model.config.sampling_rate`` and ``waveform`` is the model
        output converted to a squeezed NumPy array. Returns ``None`` when
        ``text`` is ``None``, empty, or contains only whitespace.
    """
    # Blank input has nothing to synthesize, so skip tokenization and the
    # model call entirely instead of running inference on it.
    if not text or not text.strip():
        return None

    # Tokenize into PyTorch tensors ("pt") that are unpacked into the model call.
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only: gradient tracking is not needed.
    with torch.no_grad():
        waveform = model(**encoded).waveform

    # Move to CPU, convert to NumPy, and drop singleton dimensions.
    samples = waveform.cpu().numpy().squeeze()

    return model.config.sampling_rate, samples
|
|
|
|
|
# Input component: a textbox pre-filled with a sample sentence.
_text_input = gr.Textbox(
    label="Text to Synthesize",
    info="Enter the text you want to convert to speech.",
    value="Hello, this is a demonstration of the Facebook MMS text to speech model.",
)

# Output component: type="numpy" matches the (sampling_rate, array) tuple
# returned by synthesize_speech().
_audio_output = gr.Audio(
    label="Synthesized Audio",
    type="numpy",
)

# Each example is a one-element list because the interface has a single input.
_example_prompts = [
    ["The quick brown fox jumps over the lazy dog."],
    ["To be, or not to be, that is the question."],
    ["Artificial intelligence will shape our future in profound ways."],
]

# Wire the synthesis function to the UI components.
demo = gr.Interface(
    fn=synthesize_speech,
    inputs=_text_input,
    outputs=_audio_output,
    title="🗣️ MMS Text-to-Speech (English)",
    description="A Gradio app to run the `facebook/mms-tts-eng` model for text-to-speech conversion.",
    examples=_example_prompts,
    cache_examples=True,
)
|
|
|
|
|
# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()