import gradio as gr
import torch
from transformers import VitsTokenizer, VitsModel, set_seed
import tempfile
import numpy as np
from scipy.io.wavfile import write
from lib.normalize_dv import normalize_dv
models = {
    "MMS TTS Base": "Dhivehi/mms-tts-div",
    "Female F01 👩🏽 (CV)": "alakxender/mms-tts-div-finetuned-md-f01",
    "Female F02 👩🏽 (CV, pitch/tempo changed)": "alakxender/mms-tts-div-finetuned-md-f02",
    "Female F03 👩🏽 (CV, pitch/tempo changed)": "alakxender/mms-tts-div-finetuned-md-f03",
    "Female F04 👩🏽 (CV, rvc-test)": "alakxender/mms-tts-speak-f01",
    #"Female Unknown 👩🏽 (🤷♀️)": "alakxender/mms-tts-div-finetuned-sm-fu01",
    "Male M01 (CV) 👨🏽": "alakxender/mms-tts-div-finetuned-md-m01",
    "Male M02 (javaabu/shaafiu)": "alakxender/mms-tts-div-finetuned-sm-mu01"
}
def tts(text: str, model_name: str):
    if len(text) > 2000:
        raise gr.Error(f"This Space runs on a free CPU, so please keep the input under 2000 characters. Yours is {len(text)}.")
    if model_name is None:
        raise gr.Error("No model selected. Please choose a model from the dropdown.")
    print(f"Loading {models[model_name]}...")
    # Load the selected MMS-TTS Dhivehi model
    tokenizer = VitsTokenizer.from_pretrained(models[model_name])
    model = VitsModel.from_pretrained(models[model_name])
    print("Model loaded.")
    # Normalize the Dhivehi text from written to spoken form
    print(f"Normalizing: {text}")
    text = normalize_dv(text)
    print(f"Normalized: {text}")
    # Preprocess the input text
    inputs = tokenizer(text=text, return_tensors="pt")
    print("Preprocessing done.")
    # Make the speech synthesis deterministic
    set_seed(555)
    # Generate the audio waveform
    print("Generating audio...")
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]
    sample_rate = model.config.sampling_rate
    #write("test.wav", rate=sample_rate, data=waveform.numpy().T)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        # Save the waveform to the temporary file
        write(f.name, sample_rate, waveform.numpy().T)
        # Keep the file name to return to the Audio component
        waveform_file = f.name
    print("Done.")
    return waveform_file
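
# A minimal alternative (sketch, not wired into the UI): gr.Audio also accepts
# an in-memory (sample_rate, numpy_array) tuple, which avoids writing a
# temporary .wav file. The int16 conversion assumes the VITS waveform is
# roughly in the [-1, 1] range.
def waveform_to_audio_tuple(waveform, sample_rate):
    # Convert the float waveform tensor to 16-bit PCM for Gradio playback
    data = (waveform.numpy() * 32767).astype(np.int16)
    return (sample_rate, data)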
css = """ | |
.textbox1 textarea { | |
font-size: 18px !important; | |
font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important; | |
line-height: 1.8 !important; | |
} | |
""" | |
with gr.Blocks(css=css) as demo:
    gr.Markdown("# <center> DV Text-To-Speech </center>")
    gr.Markdown("This interface converts Dhivehi text into natural-sounding speech using fine-tuned text-to-speech models built on Massively Multilingual Speech (MMS) and VITS. Text normalization is also incorporated to convert written Dhivehi into its spoken form before synthesis.")
    text = gr.TextArea(label="Input text",
                       placeholder="ދިވެހި ބަހުން ކޮންމެވެސް އެއްޗެކޭ މިތާ ލިޔެބަލަ",
                       rtl=True, elem_classes="textbox1")
    model_name = gr.Dropdown(choices=list(models.keys()), label="Select TTS Model")
    btn = gr.Button("Text-To-Speech")
    output_audio = gr.Audio(label="Speech Output")
    #examples =[["2023 ވަނަ އަހަރު އިބްރާހިމް ވަނީ ބޮޑު 3 މަޝްރޫއެއް ކާމިޔާބުކަމާއެކު ނިންމައި، އިންވެސްޓަރެއްގެ އަތުން 100000 ޑޮލަރު ހޯދައި، 1000000 ޑޮލަރުގެ އާމްދަނީއެއް ހޯދުމުގެ ބޮޑު އަމާޒެއް ކަނޑައަޅާފައެވެ. މުޅި އަހަރު ދުވަހު 45 ގޮންޖެހުމަކާ ކުރިމަތިލާން ޖެހުނު ނަމަވެސް އޭނާގެ އަޒުމުގައި ދުވަހަކުވެސް ފަހަތަށް ޖެހިފައެއް ނުވެއެވެ. ޑިސެމްބަރު މަހުގެ ނިޔަލަށް އޭނާ ވަނީ 8 ޚާއްޞަ ޕްރޮފެޝަނަލުންގެ ޓީމެއް ހޯދައި، އޭނާގެ މާލީ ކުރިއެރުން ދައްކުވައިދޭ ގޮތަށް އޭނާގެ ބޭންކް އެކައުންޓް ނަންބަރު 876999954321 އަށް 100000 ޑޮލަރު ޖަމާކޮށްފައެވެ."]]
    text.submit(fn=tts, inputs=[text, model_name], outputs=output_audio)
    btn.click(fn=tts, inputs=[text, model_name], outputs=output_audio)

# Launch the Gradio app
demo.launch()

#tts("އަހަރެން")
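
# Quick local check without launching the UI (a sketch; "MMS TTS Base" is one
# of the keys in the models dict above, and tts() returns the path to a .wav file):
# print(tts("އަހަރެން", "MMS TTS Base"))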