Aumkeshchy2003 committed on
Commit
27c27c1
·
verified ·
1 Parent(s): 4dd6eb3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import torch
4
+ import soundfile as sf
5
+ import spaces
6
+ import os
7
+ import numpy as np
8
+ import re
9
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
10
+ from speechbrain.pretrained import EncoderClassifier
11
+ from datasets import load_dataset
12
+
13
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
14
+
15
def load_models_and_data():
    """Load every pretrained component the demo needs, plus one reference row.

    Returns:
        A 5-tuple ``(model, processor, vocoder, speaker_model, example)``:
        the fine-tuned SpeechT5 text-to-speech model, the SpeechT5 processor,
        the HiFi-GAN vocoder, the SpeechBrain encoder loaded from
        ``speechbrain/spkrec-xvect-voxceleb``, and row 14 of the training
        split of ``freds0/cml_tts_dataset_italian``.
    """
    # The processor comes from the base checkpoint; the acoustic model is the
    # fine-tuned Italian checkpoint.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "Aumkeshchy2003/speecht5_finetuned_AumkeshChy_italian_tts"
    ).to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    # The encoder's files are stored under /tmp, keyed by the hub id.
    encoder_source = "speechbrain/spkrec-xvect-voxceleb"
    speaker_model = EncoderClassifier.from_hparams(
        source=encoder_source,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", encoder_source),
    )

    # A single dataset row supplies the audio for the default speaker embedding.
    train_split = load_dataset("freds0/cml_tts_dataset_italian", split="train")
    reference_row = train_split[14]

    return tts_model, processor, vocoder, speaker_model, reference_row
33
+
34
# Load everything once at import time so each request reuses the same objects.
(
    model,
    processor,
    vocoder,
    speaker_model,
    default_example,
) = load_models_and_data()
35
+
36
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``. The caller in
            this file passes the ``array`` field of a dataset audio entry.

    Returns:
        The output of ``speaker_model.encode_batch`` for a batch of one,
        L2-normalized along dim 2 and with singleton dimensions squeezed out.
    """
    with torch.no_grad():
        # encode_batch is fed a batch, so add a leading batch axis of size 1.
        batch = torch.tensor(waveform).unsqueeze(0).to(device)
        encoded = speaker_model.encode_batch(batch)
        unit_length = torch.nn.functional.normalize(encoded, dim=2)
        embedding = unit_length.squeeze()
    return embedding
42
+
43
def prepare_default_embedding(example):
    """Build the speaker embedding for one dataset row.

    Args:
        example: Mapping with an ``"audio"`` entry whose ``"array"`` field
            holds the audio samples.

    Returns:
        Whatever ``create_speaker_embedding`` returns for those samples.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)
46
+
47
# Speaker embedding used for every request, computed once from the reference row.
default_embedding = prepare_default_embedding(example=default_example)
48
+
49
# Accented vowels paired with the ASCII respelling that normalize_text
# substitutes for them, applied in this order via str.replace.
# NOTE(review): "é" has no entry here — confirm that is intentional.
replacements = [
    ("à", "ah"),
    ("è", "eh"),
    ("ì", "ee"), ("í", "ee"), ("ï", "ee"),
    ("ò", "aw"), ("ó", "oh"),
    ("ù", "oo"), ("ú", "oo"),
]
60
+
61
# Spoken forms that number_to_words looks up directly.
# NOTE(review): several entries are respellings rather than standard Italian
# orthography ("oo-noh", "doo-eh", "chinque", "decei") — presumably chosen for
# how the model pronounces them; confirm before "correcting" them.
number_words = {
    0: "zero", 1: "oo-noh", 2: "doo-eh", 3: "tre", 4: "quattro",
    5: "chinque", 6: "sei", 7: "sette", 8: "otto", 9: "nove",
    10: "decei", 11: "undici", 12: "dodici", 13: "tredici", 14: "quattordici",
    15: "quindici", 16: "sedici", 17: "diciassette", 18: "diciotto", 19: "diciannove",
    20: "venti", 30: "trenta", 40: "quaranta", 50: "cinquanta",
    60: "sessanta", 70: "settanta", 80: "ottanta", 90: "novanta",
    100: "cento", 1000: "mille",
}


def _append_remainder(head, remainder):
    """Return *head* followed by the spelled-out *remainder*, if it is non-zero."""
    if remainder:
        return head + " " + number_to_words(remainder)
    return head


def number_to_words(number):
    """Spell out a non-negative integer using the entries of ``number_words``.

    Args:
        number: Integer to convert. Values of 10**12 or more are returned as
            their plain decimal string. Negative values are not supported and
            raise ``KeyError`` from the table lookup.

    Returns:
        Space-separated words with no leading or trailing whitespace.
    """
    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        head = number_words[tens * 10]
        return head + " " + number_words[unit] if unit else head

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # A single hundred carries no count word. The bare form must not start
        # with a space, or the substituted text ends up with stray whitespace.
        head = number_words[hundreds] + " centi" if hundreds > 1 else "centi"
        return _append_remainder(head, remainder)

    if number < 1000000:
        thousands, remainder = divmod(number, 1000)
        # Same rule as for hundreds: a single thousand is just "mille".
        head = number_to_words(thousands) + " mille" if thousands > 1 else "mille"
        return _append_remainder(head, remainder)

    if number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        return _append_remainder(number_to_words(millions) + " millione", remainder)

    if number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        return _append_remainder(number_to_words(billions) + " milliardo", remainder)

    # Too large for the table: fall back to the digits themselves.
    return str(number)
88
+
89
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in *text* with its spoken form.

    Args:
        text: Input string.

    Returns:
        *text* with each match of ``\\b\\d+\\b`` replaced by the result of
        ``number_to_words`` on its integer value.
    """
    return re.sub(
        r'\b\d+\b',
        lambda digits: number_to_words(int(digits.group())),
        text,
    )
98
+
99
def normalize_text(text):
    """Prepare raw user text for the TTS processor.

    Steps, in order: lowercase, spell out digit runs, apply the accented-vowel
    substitutions from ``replacements``, then drop every character that is
    neither a word character nor whitespace.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    # Lowercasing first means upper-case accented vowels are folded before the
    # substitution table, which lists lower-case forms only, is applied.
    cleaned = replace_numbers_with_words(text.lower())

    for accented, respelling in replacements:
        cleaned = cleaned.replace(accented, respelling)

    # This pass also strips the hyphens inside number words such as "oo-noh".
    return re.sub(r'[^\w\s]', '', cleaned)
114
+
115
@spaces.GPU(duration=60)
def text_to_speech(text, audio_file=None):
    """Synthesize speech for *text* using the default speaker embedding.

    Args:
        text: Text entered by the user; it is passed through
            ``normalize_text`` before tokenization.
        audio_file: Unused; kept so existing callers that pass it keep working.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is a NumPy array,
        matching the ``type="numpy"`` audio output of the interface.

    Raises:
        gr.Error: If nothing is left to speak after normalization (empty,
            whitespace-only, or punctuation-only input).
    """
    # Treat a missing value like an empty string so the check below covers it.
    normalized_text = normalize_text(text or "")
    if not normalized_text.strip():
        raise gr.Error("Please enter some text to convert to speech.")

    # Tokenize the normalized text and move the tensors to the inference device.
    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    # The embedding is passed as a batch of one, hence the unsqueeze.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            default_embedding.unsqueeze(0),
            vocoder=vocoder,
        )

    speech_np = speech.cpu().numpy()

    # NOTE(review): 24000 is hard-coded. The stock microsoft/speecht5_hifigan
    # vocoder is documented as producing 16 kHz audio — confirm which rate
    # matches how this checkpoint was fine-tuned.
    return (24000, speech_np)
133
+
134
# Gradio front end: one text box in, one audio player out, backed by text_to_speech.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[gr.Textbox(label="Enter Italian text to convert to speech")],
    outputs=[gr.Audio(label="Generated Speech", type="numpy")],
    title="Italian SpeechT5 Text-to-Speech Demo",
    description="Enter Italian text, and listen to the generated speech.",
)

# Start the web app and request a public share link.
iface.launch(share=True)