Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,28 @@ model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
|
|
13 |
tokenizer = processor.tokenizer
|
14 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
# Buckwalter to Unicode mapping
|
18 |
buck2uni = {
|
@@ -88,11 +110,13 @@ def generate_audio(text):
|
|
88 |
roman_urdu = transString(text)
|
89 |
|
90 |
# Tokenize the input text
|
91 |
-
inputs =
|
92 |
|
93 |
# Generate audio from the SpeechT5 model
|
94 |
-
|
95 |
-
|
|
|
|
|
96 |
|
97 |
return speech
|
98 |
|
@@ -101,10 +125,11 @@ def text_to_speech(text):
|
|
101 |
audio_output = generate_audio(text)
|
102 |
|
103 |
# Save audio as a .wav file
|
104 |
-
|
105 |
-
|
|
|
106 |
|
107 |
-
return
|
108 |
|
109 |
# Define the Gradio interface
|
110 |
inputs = gr.inputs.Textbox(label="Enter text in Urdu")
|
|
|
13 |
tokenizer = processor.tokenizer
|
14 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
15 |
|
16 |
+
def prepare_dataset(example):
|
17 |
+
audio = example["audio"]
|
18 |
+
|
19 |
+
example = processor(
|
20 |
+
text=transString(example["sentence"]),
|
21 |
+
audio_target=audio["array"],
|
22 |
+
sampling_rate=audio["sampling_rate"],
|
23 |
+
return_attention_mask=False,
|
24 |
+
)
|
25 |
+
|
26 |
+
# strip off the batch dimension
|
27 |
+
example["labels"] = example["labels"][0]
|
28 |
+
|
29 |
+
# use SpeechBrain to obtain x-vector
|
30 |
+
example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
|
31 |
+
|
32 |
+
return example
|
33 |
+
|
34 |
+
test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "ur", split="test")
|
35 |
+
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
36 |
+
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)
|
37 |
+
|
38 |
|
39 |
# Buckwalter to Unicode mapping
|
40 |
buck2uni = {
|
|
|
110 |
roman_urdu = transString(text)
|
111 |
|
112 |
# Tokenize the input text
|
113 |
+
inputs = processor(text=roman_urdu, return_tensors="pt")
|
114 |
|
115 |
# Generate audio from the SpeechT5 model
|
116 |
+
example = test_dataset[22]
|
117 |
+
speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
|
118 |
+
|
119 |
+
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
|
120 |
|
121 |
return speech
|
122 |
|
|
|
125 |
audio_output = generate_audio(text)
|
126 |
|
127 |
# Save audio as a .wav file
|
128 |
+
from IPython.display import Audio
|
129 |
+
|
130 |
+
audio = Audio(audio_output.numpy(), rate=16000)
|
131 |
|
132 |
+
return audio
|
133 |
|
134 |
# Define the Gradio interface
|
135 |
inputs = gr.inputs.Textbox(label="Enter text in Urdu")
|