Spaces:
Running
Running
import re

import gradio as gr
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech
# Checkpoint that supplies both the fine-tuned Italian model and its processor.
model_id = "Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts"

# The text-to-speech model and the processor are loaded from the same checkpoint.
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
processor = SpeechT5Processor.from_pretrained(model_id)

# HiFi-GAN vocoder handed to model.generate_speech() during synthesis.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# One fixed speaker x-vector (entry 7440 of the validation split); unsqueeze(0)
# adds a leading dimension of size 1.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_speaker_xvector = embeddings_dataset[7440]["xvector"]
speaker_embeddings = torch.tensor(_speaker_xvector).unsqueeze(0)
# Accented characters and the ASCII spellings substituted for them, in this
# order, by synthesize_speech() before the text reaches the processor.
replacements = list(
    {
        'à': 'ah',
        'è': 'eh',
        'ì': 'ee',
        'í': 'ee',
        'ï': 'ee',
        'ò': 'aw',
        'ó': 'oh',
        'ù': 'oo',
        'ú': 'oo',
    }.items()
)
# Italian number vocabulary used by number_to_words().
# NOTE(review): some entries are spelled phonetically ("oo-noh", "doo-eh",
# "chinque", "decei") -- presumably to steer the model's pronunciation; the
# spellings are kept exactly as they were. TODO confirm with the model author.
number_words = {
    0: "zero", 1: "oo-noh", 2: "doo-eh", 3: "tre", 4: "quattro", 5: "chinque",
    6: "sei", 7: "sette", 8: "otto", 9: "nove",
    10: "decei", 11: "undici", 12: "dodici", 13: "tredici", 14: "quattordici",
    15: "quindici", 16: "sedici", 17: "diciassette",
    18: "diciotto", 19: "diciannove", 20: "venti", 30: "trenta", 40: "quaranta",
    50: "cinquanta", 60: "sessanta", 70: "settanta",
    80: "ottanta", 90: "novanta", 100: "cento", 1000: "mille",
}

# Scales above one thousand as (value, singular, plural), largest first.
_LARGE_SCALES = (
    (10 ** 9, "miliardo", "miliardi"),
    (10 ** 6, "milione", "milioni"),
)


def _join_words(head, remainder):
    """Append the spelled-out remainder to ``head`` when it is non-zero."""
    if remainder:
        return head + " " + number_to_words(remainder)
    return head


def number_to_words(number):
    """Spell out an integer as space-separated Italian words.

    Args:
        number: Integer to convert.

    Returns:
        The words for ``number`` with no leading or trailing whitespace.
        Negative values are prefixed with "meno". Magnitudes of 10**12 and
        above are not spelled out and are returned as digits via ``str()``.
    """
    if number < 0:
        return "meno " + number_to_words(-number)
    if number < 20:
        return number_words[number]
    if number < 100:
        tens, unit = divmod(number, 10)
        return _join_words(number_words[tens * 10], unit)
    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # "cento" never changes form; exactly one hundred takes no multiplier.
        head = number_words[100]
        if hundreds > 1:
            head = number_words[hundreds] + " " + head
        return _join_words(head, remainder)
    if number < 10 ** 6:
        thousands, remainder = divmod(number, 1000)
        # One thousand is "mille"; every multiple uses the plural "mila".
        if thousands == 1:
            head = number_words[1000]
        else:
            head = number_to_words(thousands) + " mila"
        return _join_words(head, remainder)
    if number >= 10 ** 12:
        # Beyond the supported range: fall back to the plain digits.
        return str(number)
    for value, singular, plural in _LARGE_SCALES:
        if number >= value:
            count, remainder = divmod(number, value)
            # The scale words are nouns: "un milione", but "tre milioni".
            if count == 1:
                head = "un " + singular
            else:
                head = number_to_words(count) + " " + plural
            return _join_words(head, remainder)
    # Not reachable for integers (every value below 10**12 is handled above);
    # kept as a defensive fallback.
    return str(number)
def replace_numbers_with_words(text):
    """Replace each standalone run of digits in ``text`` with its words.

    Only digit runs delimited by word boundaries on both sides are converted,
    so digits attached to letters (for example "3kg") are left unchanged.

    Args:
        text: Input string that may contain numbers written as digits.

    Returns:
        A copy of ``text`` in which every matched digit run has been replaced
        by the result of ``number_to_words``.
    """
    def spell_out(match):
        # int() discards leading zeros, so "007" is spelled like 7.
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', spell_out, text)
# Text-to-speech synthesis function
def synthesize_speech(text):
    """Synthesize speech for ``text`` with the fine-tuned SpeechT5 model.

    Args:
        text: Text entered by the user.

    Returns:
        A ``(sample_rate, audio_array)`` tuple: the fixed sample rate 16000
        and the generated speech converted to a NumPy array.
    """
    # Spell out digits so they are spoken as words instead of being passed to
    # the processor as raw digit characters.
    text = replace_numbers_with_words(text)

    # Swap accented characters for their ASCII spellings.
    for src, dst in replacements:
        text = text.replace(src, dst)

    # Tokenize the cleaned text.
    inputs = processor(text=text, return_tensors="pt")

    # Generate the waveform with the model, the fixed speaker embedding and
    # the vocoder.
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Gradio's Audio output takes (sample_rate, audio_array).
    return (16000, speech.cpu().numpy())
# Title and description shown in the Gradio interface
title = "Fine-tuning TTS for the Italian Language Using SpeechT5"
description = """
Enter Italian text, and listen to the generated speech
"""
# Widgets: a text box for the input and an audio player for the result.
_text_input = gr.Textbox(label="Input Text", placeholder="Enter Italian text")
_audio_output = gr.Audio(label="Generated Speech")

# Connect synthesize_speech() to the widgets, with one example sentence.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=_text_input,
    outputs=_audio_output,
    title=title,
    description=description,
    examples=["Buongiorno, come sta? Buona giornata"],
)

# Launch the interface
interface.launch()