Spaces:

Aumkeshchy2003
/

Italian_TTS

Running

App Files Files Community

Italian_TTS / app.py

Aumkeshchy2003

Update app.py

c209c53 verified about 1 year ago

raw

history blame contribute delete

3.79 kB


	import re
	import unicodedata

	import gradio as gr
	import torch
	from datasets import load_dataset
	from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech

	# Hub ID of the fine-tuned Italian checkpoint; the processor and the
	# text-to-speech model are both loaded from it.
	model_id = "Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts"
	processor = SpeechT5Processor.from_pretrained(model_id)
	model = SpeechT5ForTextToSpeech.from_pretrained(model_id)

	# The HiFi-GAN vocoder is loaded from a separate checkpoint.
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

	# Speaker embedding: the "xvector" field of entry 7440 in the validation
	# split, converted to a tensor with an extra leading dimension.
	embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	_speaker_xvector = embeddings_dataset[7440]["xvector"]
	speaker_embeddings = torch.tensor(_speaker_xvector).unsqueeze(0)


	# (accented character, ASCII respelling) pairs. synthesize_speech applies
	# them in order with str.replace before the text reaches the processor.
	# Every source is a single precomposed code point and every respelling is
	# plain ASCII, so the order of the pairs does not affect the result.
	replacements = [
	    ('à', 'ah'),
	    ('è', 'eh'),
	    ('é', 'eh'),  # acute e ("perché", "né") was missing from the table
	    ('ì', 'ee'),
	    ('í', 'ee'),
	    ('ï', 'ee'),
	    ('ò', 'aw'),
	    ('ó', 'oh'),
	    ('ù', 'oo'),
	    ('ú', 'oo'),
	]

	# Lookup table used by number_to_words: 0-19, the round tens 20-90, 100
	# and 1000. Some entries (e.g. "oo-noh", "doo-eh") are respelled rather
	# than written in standard orthography -- presumably pronunciation hints
	# for the TTS model; confirm before "correcting" any of them.
	number_words = {
	    # 0-19, keyed by their position in the tuple
	    **dict(enumerate((
	        "zero", "oo-noh", "doo-eh", "tre", "quattro",
	        "chinque", "sei", "sette", "otto", "nove",
	        "decei", "undici", "dodici", "tredici", "quattordici",
	        "quindici", "sedici", "diciassette", "diciotto", "diciannove",
	    ))),
	    # 20, 30, ..., 90
	    **dict(zip(range(20, 100, 10), (
	        "venti", "trenta", "quaranta", "cinquanta",
	        "sessanta", "settanta", "ottanta", "novanta",
	    ))),
	    100: "cento",
	    1000: "mille",
	}

	def number_to_words(number):
	    """Spell out an integer as space-separated Italian words.

	    Words for 0-19, the round tens, 100 and 1000 are taken from the
	    module-level ``number_words`` table; larger values are composed
	    recursively from those pieces.

	    Args:
	        number: Integer to spell out. Negative values are prefixed with
	            "meno".

	    Returns:
	        The spelled-out number with no leading or trailing space. Values
	        whose magnitude is 10**12 or more are returned as decimal digits.
	    """
	    if number < 0:
	        return "meno " + number_to_words(-number)
	    if number < 20:
	        return number_words[number]

	    if number < 100:
	        quotient, remainder = divmod(number, 10)
	        head = number_words[quotient * 10]
	    elif number < 1000:
	        quotient, remainder = divmod(number, 100)
	        # Exactly one hundred is just "cento"; no multiplier in front.
	        if quotient == 1:
	            head = number_words[100]
	        else:
	            head = number_words[quotient] + " " + number_words[100]
	    elif number < 1000000:
	        quotient, remainder = divmod(number, 1000)
	        # "mille" is only the singular; multiples of a thousand use "mila".
	        if quotient == 1:
	            head = number_words[1000]
	        else:
	            head = number_to_words(quotient) + " mila"
	    elif number < 1000000000:
	        quotient, remainder = divmod(number, 1000000)
	        if quotient == 1:
	            head = "un milione"
	        else:
	            head = number_to_words(quotient) + " milioni"
	    elif number < 1000000000000:
	        quotient, remainder = divmod(number, 1000000000)
	        if quotient == 1:
	            head = "un miliardo"
	        else:
	            head = number_to_words(quotient) + " miliardi"
	    else:
	        # Beyond the supported range: hand the digits back unchanged.
	        return str(number)

	    if remainder:
	        return head + " " + number_to_words(remainder)
	    return head

	# Runs of digits delimited by word boundaries, compiled once at import.
	_NUMBER_PATTERN = re.compile(r'\b\d+\b')


	def replace_numbers_with_words(text):
	    """Replace every standalone run of digits in *text* with its spelling.

	    Each match is converted with ``number_to_words``. Digits glued to
	    letters (e.g. "abc123def") have no word boundary around them and are
	    left untouched.

	    Args:
	        text: Input string.

	    Returns:
	        A copy of *text* with the matched numbers spelled out.
	    """
	    def spell(match):
	        digits = match.group()
	        try:
	            return number_to_words(int(digits))
	        except ValueError:
	            # Recent CPython versions refuse int/str conversions of extremely long
	            # digit strings; keep such a run as typed instead of crashing.
	            return digits

	    return _NUMBER_PATTERN.sub(spell, text)

	# Text-to-speech synthesis function
	def synthesize_speech(text):
	    """Synthesize speech for *text* with the module-level model and vocoder.

	    The text is normalized, tokenized with ``processor`` and passed to
	    ``model.generate_speech`` together with ``speaker_embeddings``.

	    Args:
	        text: Text to speak.

	    Returns:
	        A ``(sample_rate, audio_array)`` tuple with the sample rate fixed
	        at 16000 and the audio as a NumPy array.
	    """
	    # Compose decomposed accents (base letter + combining mark) into single
	    # code points; the keys in ``replacements`` only match that form.
	    text = unicodedata.normalize("NFC", text)

	    # Spell out digits. The helper was defined in this file but never
	    # called, so numbers used to reach the tokenizer as raw digits.
	    text = replace_numbers_with_words(text)

	    # Swap accented characters for their ASCII respellings.
	    for src, dst in replacements:
	        text = text.replace(src, dst)

	    # Tokenize, then generate the waveform through the vocoder.
	    inputs = processor(text=text, return_tensors="pt")
	    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

	    return (16000, speech.cpu().numpy())

	# Heading and blurb passed to the Gradio interface below.
	title = "Fine-tuning TTS for a Italian Language Using SpeechT5"
	description = """
	Enter Italian text, and listen to the generated speech
	"""

	# Build the input and output widgets first, then wire them to the
	# synthesis function.
	_text_input = gr.Textbox(label="Input Text", placeholder="Enter Italian text")
	_audio_output = gr.Audio(label="Generated Speech")

	interface = gr.Interface(
	    fn=synthesize_speech,
	    inputs=_text_input,
	    outputs=_audio_output,
	    title=title,
	    description=description,
	    examples=["Buongiorno, come sta? Buona giornata"],
	)

	# Start the app.
	interface.launch()