import gradio as gr
import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer
# Map display names to the corresponding MMS TTS checkpoints on the Hugging Face Hub.
LANG_MODEL_MAP = {
    "English": "facebook/mms-tts-eng",
    "Hindi": "facebook/mms-tts-hin",
    "Tamil": "facebook/mms-tts-tam",
    "Malayalam": "facebook/mms-tts-mal",
    "Kannada": "facebook/mms-tts-kan",
    "Telugu": "facebook/mms-tts-tel",
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cache of already-loaded (tokenizer, model) pairs so switching languages
# does not re-download or re-initialize a model on every request.
cache = {}
def load_model_and_tokenizer(language):
    """Lazily load and cache the tokenizer and VITS model for the selected language."""
    model_name = LANG_MODEL_MAP[language]
    if model_name not in cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = VitsModel.from_pretrained(model_name).to(device)
        cache[model_name] = (tokenizer, model)
    return cache[model_name]
def tts(language, text):
    if not text.strip():
        return 16000, np.zeros(1)  # return a silent sample when no text is given
    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**inputs)
    waveform = output.waveform.squeeze().cpu().numpy()
    return 16000, waveform  # MMS VITS models generate 16 kHz audio
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Dropdown(choices=list(LANG_MODEL_MAP.keys()), label="Select Language"),
        gr.Textbox(label="Enter Text"),
    ],
    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech from text using Meta's MMS models for English, Hindi, Tamil, Malayalam, Kannada and Telugu.",
)
if __name__ == "__main__":
    iface.launch()
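
# Optional: the Space can also be queried programmatically with gradio_client.
# A minimal sketch, assuming the app is published as a public Space; the repo id
# below is hypothetical and should be replaced with the actual owner/space name.
#
#     from gradio_client import Client
#
#     client = Client("your-username/multilingual-mms-tts")  # hypothetical Space id
#     # gr.Interface exposes the function under the default "/predict" endpoint;
#     # the call returns a filepath to the synthesized audio.
#     audio = client.predict("Hindi", "नमस्ते दुनिया", api_name="/predict")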