Spaces:
Sleeping
Sleeping
File size: 3,794 Bytes
27c27c1 405ddc5 27c27c1 405ddc5 33baaf9 405ddc5 27c27c1 c209c53 405ddc5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import re

import gradio as gr
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech
# Hub repository of the SpeechT5 checkpoint fine-tuned for Italian. Both the
# model weights and the processor are fetched from this same repository.
model_id = "Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts"
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)

# HiFi-GAN vocoder, taken from the stock Microsoft checkpoint rather than the
# fine-tuned repository.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker identity: the "xvector" field of row 7440 in the validation split,
# converted to a tensor and given a leading axis of size one.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7440]["xvector"]),
    0,
)

# Processor that turns raw text into model inputs for the Italian checkpoint.
processor = SpeechT5Processor.from_pretrained(model_id)
# Accented vowels and the ASCII respelling substituted for each one.
# Consumed as an ordered list of (source, target) pairs by synthesize_speech,
# so the mapping is flattened back into a list of tuples.
replacements = list(
    {
        "à": "ah",
        "è": "eh",
        "ì": "ee",
        "í": "ee",
        "ï": "ee",
        "ò": "aw",
        "ó": "oh",
        "ù": "oo",
        "ú": "oo",
    }.items()
)
# Spoken forms for the building-block numbers used by number_to_words.
# NOTE(review): several entries are respelled rather than written in standard
# Italian (e.g. "oo-noh" for 1, "chinque" for 5) -- presumably to steer the
# TTS pronunciation; confirm with the model author before normalizing them.
number_words = {
    0: "zero", 1: "oo-noh", 2: "doo-eh", 3: "tre", 4: "quattro",
    5: "chinque", 6: "sei", 7: "sette", 8: "otto", 9: "nove",
    10: "decei", 11: "undici", 12: "dodici", 13: "tredici",
    14: "quattordici", 15: "quindici", 16: "sedici", 17: "diciassette",
    18: "diciotto", 19: "diciannove",
    20: "venti", 30: "trenta", 40: "quaranta", 50: "cinquanta",
    60: "sessanta", 70: "settanta", 80: "ottanta", 90: "novanta",
    100: "cento", 1000: "mille",
}


def number_to_words(number: int) -> str:
    """Spell out an integer as space-separated Italian number words.

    Words for 0-19, the tens, 100 and 1000 come from ``number_words``; larger
    values are composed recursively from those pieces.

    Args:
        number: The integer to spell out.

    Returns:
        The spoken form, with no leading or trailing whitespace. Negative
        values are prefixed with "meno". Values whose magnitude is
        1,000,000,000,000 or more are returned as their decimal digits.
    """
    if number < 0:
        return "meno " + number_to_words(-number)
    if number >= 1_000_000_000_000:
        # Beyond the largest supported group: fall back to plain digits.
        return str(number)

    if number < 20:
        return number_words[number]
    if number < 100:
        tens, unit = divmod(number, 10)
        head = number_words[tens * 10]
        return head + " " + number_words[unit] if unit else head

    if number < 1000:
        count, remainder = divmod(number, 100)
        # "cento" is invariable and takes no leading "one": 100 is just
        # "cento", 200 is "<two> cento".
        if count == 1:
            head = number_words[100]
        else:
            head = number_words[count] + " " + number_words[100]
    elif number < 1_000_000:
        count, remainder = divmod(number, 1000)
        # Exactly one thousand is "mille"; multiples use the plural "mila".
        if count == 1:
            head = number_words[1000]
        else:
            head = number_to_words(count) + " mila"
    elif number < 1_000_000_000:
        count, remainder = divmod(number, 1_000_000)
        head = number_to_words(count) + (" milione" if count == 1 else " milioni")
    else:
        count, remainder = divmod(number, 1_000_000_000)
        head = number_to_words(count) + (" miliardo" if count == 1 else " miliardi")

    if remainder:
        return head + " " + number_to_words(remainder)
    return head
# Runs of digits that sit between word boundaries, i.e. not glued to letters,
# other digits or underscores. Compiled once instead of on every call.
_DIGIT_RUN = re.compile(r'\b\d+\b')


def replace_numbers_with_words(text: str) -> str:
    """Replace every standalone run of digits in *text* with its spoken form.

    Each match is converted with ``int`` and passed to ``number_to_words``;
    everything else in the string is left unchanged.

    Args:
        text: Input text, possibly containing numbers written as digits.

    Returns:
        The text with digit runs substituted by words.
    """

    def replace(match):
        digits = match.group()
        try:
            number = int(digits)
        except ValueError:
            # Recent CPython versions refuse to convert extremely long digit
            # strings (integer string conversion length limit). Leave such a
            # run as-is rather than failing on user-supplied text.
            return digits
        return number_to_words(number)

    return _DIGIT_RUN.sub(replace, text)
# Text-to-speech synthesis function
def synthesize_speech(text):
    """Synthesize speech for *text* with the module-level model and vocoder.

    The text is normalized first: digit runs are spelled out with
    ``replace_numbers_with_words`` and the accented characters listed in
    ``replacements`` are swapped for their respellings. The result is then
    passed through the processor and the model.

    Args:
        text: Italian text to speak.

    Returns:
        A ``(sample_rate, audio_array)`` tuple with the sample rate fixed at
        16000 and the audio as a NumPy array.
    """
    # Spell out numbers before anything else. The helper was defined in this
    # file but never called, so digits used to reach the tokenizer unchanged.
    # NOTE(review): assumes the tokenizer has no digit tokens -- confirm.
    text = replace_numbers_with_words(text)

    # Clean up text for Italian-specific accents
    for src, dst in replacements:
        text = text.replace(src, dst)

    # Process input text
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech using the model and vocoder
    speech = model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=vocoder,
    )

    # Return the generated speech as (sample_rate, audio_array)
    return (16000, speech.cpu().numpy())
# Title and description shown at the top of the Gradio page
title = "Fine-tuning TTS for a Italian Language Using SpeechT5"
description = """
Enter Italian text, and listen to the generated speech
"""

# Build the Gradio interface: one text box in, one audio player out, with
# synthesize_speech as the callback and a single example sentence.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(label="Input Text", placeholder="Enter Italian text"),
    outputs=gr.Audio(label="Generated Speech"),
    title=title,
    description=description,
    examples=["Buongiorno, come sta? Buona giornata"],
)

# Launch the interface (the stray trailing "|" that followed this call was a
# syntax error and has been dropped).
interface.launch()