Spaces:

alakxender
/

tts-dhivehi-demo-mms

Runtime error

App Files Files Community

tts-dhivehi-demo-mms / app.py

alakxender

update

0d01a94 22 days ago

raw

history blame contribute delete

4.41 kB


	import gradio as gr
	import torch
	from transformers import VitsTokenizer, VitsModel, set_seed
	import tempfile
	import numpy as np
	from scipy.io.wavfile import write
	from lib.normalize_dv import normalize_dv

	# Voices offered in the UI: dropdown label -> Hugging Face checkpoint id.
	# Insertion order is the order in which the dropdown lists them.
	models = {
	    # Base checkpoint
	    "MMS TTS Base": "Dhivehi/mms-tts-div",
	    # Female voices
	    "Female F01 👩🏽 (CV)": "alakxender/mms-tts-div-finetuned-md-f01",
	    "Female F02 👩🏽 (CV, pitch/tempo changed)": "alakxender/mms-tts-div-finetuned-md-f02",
	    "Female F03 👩🏽 (CV, pitch/tempo changed)": "alakxender/mms-tts-div-finetuned-md-f03",
	    "Female F04 👩🏽 (CV, rvc-test)": "alakxender/mms-tts-speak-f01",
	    # NOTE: the "alakxender/mms-tts-div-finetuned-sm-fu01" checkpoint (a female
	    # voice of unknown origin) is intentionally not listed.
	    # Male voices
	    "Male M01 (CV) 👨🏽": "alakxender/mms-tts-div-finetuned-md-m01",
	    "Male M02 (javaabu/shaafiu)": "alakxender/mms-tts-div-finetuned-sm-mu01",
	}

	# Tokenizer/model pairs already loaded in this process, keyed by checkpoint id.
	# tts() only ever asks for values of `models`, so this holds at most one entry
	# per configured voice.
	_voice_cache = {}


	def _load_voice(checkpoint: str):
	    """Return the (tokenizer, model) pair for *checkpoint*, loading it only once per process."""
	    if checkpoint not in _voice_cache:
	        print(f"Loading...{checkpoint}")
	        tokenizer = VitsTokenizer.from_pretrained(checkpoint)
	        model = VitsModel.from_pretrained(checkpoint)
	        _voice_cache[checkpoint] = (tokenizer, model)
	        print("Model loaded.")
	    return _voice_cache[checkpoint]


	def tts(text: str, model_name: str) -> str:
	    """Synthesize speech for *text* with the selected voice and return a WAV file path.

	    Args:
	        text: Text to speak; must be non-blank and at most 2000 characters.
	        model_name: A key of ``models`` selecting which checkpoint to use.

	    Returns:
	        Path of a temporary ``.wav`` file holding the generated waveform. The
	        file is created with ``delete=False`` and is not removed by this function.

	    Raises:
	        gr.Error: If the text is blank, the text is longer than 2000
	            characters, or ``model_name`` is missing / not a key of ``models``.
	    """
	    # Reject blank input up front instead of letting the tokenizer/model fail
	    # later with an opaque error (this also covers text being None).
	    if not text or not text.strip():
	        raise gr.Error("huh! nothing to read here. type some text first.")

	    if len(text) > 2000:
	        raise gr.Error(f"huh! using free cpu here!, try a small chunk of data. Yours is {len(text)}. try to fit to 2000 chars.")

	    # Covers both "nothing selected" (None) and any name that is not configured,
	    # which previously surfaced as a raw KeyError.
	    if model_name not in models:
	        raise gr.Error("huh! not sure what to do without a model. select a model.")

	    # Load (or reuse) the tokenizer and VITS model of the selected voice.
	    tokenizer, model = _load_voice(models[model_name])

	    # Normalize the dv text from written to spoken form.
	    print(f"Normalizing: {text}")
	    text = normalize_dv(text)
	    print(f"Normalized: {text}")

	    # Tokenize into PyTorch tensors.
	    inputs = tokenizer(text=text, return_tensors="pt")
	    print("Preprocess done.")

	    # Fixed seed so the same text and voice give the same audio on every call.
	    set_seed(555)

	    print("Generating audio...")
	    with torch.no_grad():  # inference only, no gradients needed
	        outputs = model(**inputs)

	    waveform = outputs.waveform[0]
	    sample_rate = model.config.sampling_rate

	    # Write through the already-open handle rather than re-opening the file by
	    # name: opening a NamedTemporaryFile a second time while it is still open
	    # is not portable (it fails on Windows).
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	        write(f, sample_rate, waveform.numpy().T)
	        waveform_file = f.name
	    print("done.")
	    return waveform_file

	# Custom stylesheet handed to gr.Blocks(css=...) below. It targets the textarea
	# inside any element carrying the "textbox1" class (the input box sets
	# elem_classes="textbox1") and gives it a larger font size, a taller line
	# height, and the MV_Faseyha / Faruma / A_Faruma font families
	# (presumably Thaana-script fonts; whether they are installed is up to the browser).
	css = """
	.textbox1 textarea {
	font-size: 18px !important;
	font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
	line-height: 1.8 !important;
	}
	"""

	# Page layout: title, description, text input, voice picker, trigger button
	# and audio player, created in the order they appear on the page.
	with gr.Blocks(css=css) as demo:
	    gr.Markdown("# <center> DV Text-To-Speech </center>")
	    gr.Markdown(
	        "This interface convert Divehi text into natural-sounding speech "
	        "using a fine-tuned Text-to-Speech model. "
	        "Leveraging the capabilities of Massively Multilingual Speech (MMS) and VITS models. "
	        "Text normalization is also incorporated to handle various input formats effectively."
	    )

	    text = gr.TextArea(
	        label="Input text",
	        placeholder="ދިވެހި ބަހުން ކޮންމެވެސް އެއްޗެކޭ މިތާ ލިޔެބަލަ",
	        rtl=True,
	        elem_classes="textbox1",
	    )
	    model_name = gr.Dropdown(choices=list(models), label="Select TTS Model")
	    btn = gr.Button("Text-To-Speech")
	    output_audio = gr.Audio(label="Speech Output")

	    # NOTE: a sample `examples` list (one long Dhivehi paragraph containing
	    # several numbers) used to be defined here; it is not wired into the UI.

	    # The text box's submit event and the button's click event both run tts()
	    # on the current text and selected voice and send the result to the player.
	    for _trigger in (text.submit, btn.click):
	        _trigger(fn=tts, inputs=[text, model_name], outputs=output_audio)

	# Start the Gradio app.
	demo.launch()
	# NOTE: for a quick check without the UI, call tts(<text>, <model name>) directly.