File size: 3,794 Bytes
27c27c1
 
 
 
405ddc5
 
 
 
 
 
 
 
 
 
 
 
 
27c27c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405ddc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33baaf9
405ddc5
 
 
 
 
 
 
 
 
 
27c27c1
c209c53
405ddc5
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

import re

import gradio as gr
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech

# Checkpoint that provides both the fine-tuned Italian TTS model and its processor.
model_id = "Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts"

# Processor and acoustic model are loaded from the same checkpoint; the
# HiFi-GAN vocoder is loaded from the separate Microsoft checkpoint.
processor = SpeechT5Processor.from_pretrained(model_id)
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# One fixed speaker x-vector (row 7440 of the validation split) is used for
# every request. unsqueeze(0) adds a leading dimension -- presumably the batch
# axis generate_speech() expects; confirm against the transformers docs.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_xvector = embeddings_dataset[7440]["xvector"]
speaker_embeddings = torch.tensor(speaker_xvector).unsqueeze(0)


# Accented vowels and the ASCII respelling substituted for each one before the
# text is handed to the processor (see the str.replace loop in
# synthesize_speech). Exposed as an ordered list of (source, target) tuples.
_ACCENT_RESPELLINGS = {
    'à': 'ah',
    'è': 'eh',
    'ì': 'ee',
    'í': 'ee',
    'ï': 'ee',
    'ò': 'aw',
    'ó': 'oh',
    'ù': 'oo',
    'ú': 'oo',
}
replacements = list(_ACCENT_RESPELLINGS.items())

# Spoken forms that number_to_words() looks up directly: 0-19 and the tens.
# The 100 and 1000 entries are not read by number_to_words().
# NOTE(review): several entries ("oo-noh", "doo-eh", "chinque", "decei") are
# respelled rather than written in standard Italian -- presumably to steer the
# TTS model's pronunciation. Confirm with the model author before changing them.
number_words = {
     0: "zero", 1: "oo-noh", 2: "doo-eh", 3: "tre", 4: "quattro", 5: "chinque", 6: "sei", 7: "sette", 8: "otto", 9: "nove",
    10: "decei", 11: "undici", 12: "dodici", 13: "tredici", 14: "quattordici", 15: "quindici", 16: "sedici", 17: "diciassette",
    18: "diciotto", 19: "diciannove", 20: "venti", 30: "trenta", 40: "quaranta", 50: "cinquanta", 60: "sessanta", 70: "settanta",
    80: "ottanta", 90: "novanta", 100: "cento", 1000: "mille"
}


def number_to_words(number: int) -> str:
    """Spell out a non-negative integer as space-separated words.

    The number is split recursively into billions, millions, thousands,
    hundreds, tens and units; each group is spelled using ``number_words``
    plus a scale word, and the pieces are joined with single spaces. The
    result never starts or ends with whitespace.

    Args:
        number: The integer to spell out.

    Returns:
        The spelled-out form for ``0 <= number < 10**12``. Values outside
        that range are returned unchanged as ``str(number)``.
    """
    # Negative values have no entry in number_words; fall back to the digits,
    # exactly like the branch for values that are too large.
    if number < 0:
        return str(number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        parts = [number_words[tens * 10]]
        if unit:
            parts.append(number_words[unit])
        return " ".join(parts)

    # NOTE(review): the scale words below are kept exactly as originally
    # emitted. Standard Italian would be "cento", "mila" (plural thousands),
    # "milione"/"milioni" and "miliardo"/"miliardi" -- confirm whether these
    # spellings are intentional before changing them.
    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # A single hundred is spoken without a leading "one".
        parts = ["centi"] if hundreds == 1 else [number_words[hundreds], "centi"]
    elif number < 1000000:
        thousands, remainder = divmod(number, 1000)
        # A single thousand is spoken without a leading "one".
        parts = ["mille"] if thousands == 1 else [number_to_words(thousands), "mille"]
    elif number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        parts = [number_to_words(millions), "millione"]
    elif number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        parts = [number_to_words(billions), "milliardo"]
    else:
        return str(number)

    if remainder:
        parts.append(number_to_words(remainder))
    # Joining a list (instead of concatenating " word" fragments) guarantees
    # exactly one space between groups and none at the start.
    return " ".join(parts)

def replace_numbers_with_words(text: str) -> str:
    """Replace each standalone run of digits in ``text`` with its spoken form.

    A run of digits is only matched when it is delimited by word boundaries,
    so digits glued to letters (e.g. ``"a1b"``) are left alone. Separators
    split a number into independent matches: ``"1.000"`` is treated as ``1``
    and ``000``, not as one thousand.

    Args:
        text: Input text, possibly containing digits.

    Returns:
        ``text`` with every matched digit run replaced by the result of
        ``number_to_words``; text without digits is returned unchanged.
    """
    def spell_out(match):
        # int() drops leading zeros, so "007" is spoken like "7".
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', spell_out, text)

# Text-to-speech synthesis function
def synthesize_speech(text):
    """Turn input text into audio for the Gradio ``Audio`` output.

    The text is normalised in two steps before it reaches the processor:
    digit runs are spelled out with ``replace_numbers_with_words``, then each
    accented character listed in ``replacements`` is swapped for its ASCII
    respelling.

    Args:
        text: The text typed into the interface.

    Returns:
        A ``(sample_rate, audio_array)`` tuple: the constant 16000 and the
        generated waveform as a NumPy array.
    """
    # Spell out digits first. The inserted words contain none of the accented
    # characters handled below, so the order of the two steps does not matter.
    text = replace_numbers_with_words(text)

    # Clean up text for Italian-specific accents
    for src, dst in replacements:
        text = text.replace(src, dst)

    # Tokenise the normalised text into model inputs.
    inputs = processor(text=text, return_tensors="pt")

    # Generate the waveform with the fixed speaker embedding; passing the
    # vocoder makes generate_speech return audio directly.
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Return the generated speech as (sample_rate, audio_array)
    return (16000, speech.cpu().numpy())

# Text shown at the top of the demo page.
title = "Fine-tuning TTS for a Italian Language Using SpeechT5"
description = """
Enter Italian text, and listen to the generated speech
"""

# Input / output widgets, built separately so the Interface call stays short.
text_input = gr.Textbox(label="Input Text", placeholder="Enter Italian text")
audio_output = gr.Audio(label="Generated Speech")
example_prompts = ["Buongiorno, come sta? Buona giornata"]

# Wire the synthesis function to the widgets.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=text_input,
    outputs=audio_output,
    title=title,
    description=description,
    examples=example_prompts,
)

# Start serving the demo.
interface.launch()