Abdullah-Habib committed on
Commit
e3a594e
1 Parent(s): 9c93632

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +187 -4
app.py CHANGED
@@ -1,7 +1,190 @@
 
 
 
1
  import gradio as gr
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
3
+ import soundfile as sf
4
  import gradio as gr
5
+ import scipy.io.wavfile as wav
6
+ import numpy as np
7
+ import wave
8
+ from datasets import load_dataset, Audio, config
9
+ from IPython.display import Audio
10
 
11
# Hub checkpoint id shared by the processor and the text-to-speech model.
checkpoint = "Abdullah-Habib/urdu_speech_tt"

# Text front end: the processor and the tokenizer it wraps.
processor = SpeechT5Processor.from_pretrained(checkpoint)
tokenizer = processor.tokenizer

# Acoustic model, loaded from the same checkpoint as the processor.
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# The vocoder handed to generate_speech comes from its own checkpoint.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
17
 
18
+
19
# Urdu (Arabic-script) character -> Roman (Latin) transliteration table.
# NOTE: despite the historical name, keys are Unicode characters and values
# are their Roman spellings. The name is kept for backward compatibility.
# Entry order matters for reverse transliteration: when several characters
# share a Roman spelling, the first one listed is used.
buck2uni = {
    "\u0627": "a",
    "\u0675": "a",
    "\u0673": "a",
    "\u0622": "aa",
    "\u0628": "b",
    "\u067E": "p",
    "\u062A": "t",
    "\u0637": "t",
    "\u0679": "t",
    "\u062C": "j",
    "\u0633": "s",
    "\u062B": "s",
    "\u0635": "s",
    "\u0686": "ch",
    "\u062D": "h",
    "\u0647": "h",
    "\u0629": "h",
    "\u06DF": "h",
    "\u062E": "kh",
    "\u062F": "d",
    "\u0688": "d",
    # U+0630 was previously also listed as "a"; the later "z" entry always
    # won, so only the effective mapping is kept.
    "\u0630": "z",
    "\u0632": "z",
    "\u0636": "z",
    "\u0638": "z",
    "\u068E": "z",
    "\u0631": "r",
    "\u0691": "r",
    "\u0634": "sh",
    "\u063A": "gh",
    "\u0641": "f",
    "\u06A9": "k",
    "\u0642": "k",
    "\u06AF": "g",
    "\u0644": "l",
    "\u0645": "m",
    "\u0646": "n",
    "\u06BA": "n",
    "\u0648": "o",
    "\u0649": "y",
    "\u0626": "y",
    "\u06CC": "y",
    "\u06D2": "e",
    "\u06C1": "h",
    "\u064A": "e",
    "\u06C2": "ah",
    "\u06BE": "h",
    "\u0639": "a",
    "\u0643": "k",
    "\u0621": "a",
    "\u0624": "o",
    "\u060C": "",  # Arabic comma (separator) is dropped from the output
}


def _build_reverse_pairs(table):
    """Return (roman, urdu) pairs for reverse lookup, longest roman first."""
    first_seen = {}
    for urdu, roman in table.items():
        # An empty roman string cannot be reversed: str.replace("", x) would
        # insert x between every character of the input.
        if roman:
            first_seen.setdefault(roman, urdu)
    # sorted() is stable, so equal-length spellings keep table order.
    return sorted(first_seen.items(), key=lambda pair: len(pair[0]), reverse=True)


# Lookup tables derived once from buck2uni.
_URDU_TO_ROMAN = str.maketrans(buck2uni)
_ROMAN_TO_URDU = _build_reverse_pairs(buck2uni)


def transString(string, reverse=0):
    """Transliterate *string* between Urdu script and Roman letters.

    Args:
        string: Text to convert. Characters absent from ``buck2uni`` are
            passed through unchanged.
        reverse: Falsy (default) converts Urdu script to Roman letters.
            Truthy converts Roman letters back to Urdu script.

    Returns:
        The converted string.

    The reverse direction is lossy: several Urdu characters share one Roman
    spelling, so the first character listed in ``buck2uni`` is chosen, and
    characters that map to the empty string cannot be restored.
    """
    if not reverse:
        # All keys are single characters, so one translate pass is enough.
        return string.translate(_URDU_TO_ROMAN)

    # Digraphs ("sh", "kh", ...) are replaced before single letters so they
    # are not broken apart by the shorter matches.
    for roman, urdu in _ROMAN_TO_URDU:
        string = string.replace(roman, urdu)
    return string
85
+
86
+
87
# Fixed 1 x 512 speaker embedding passed to the model for every request.
# It is built once at import time instead of on each call.
_SPEAKER_EMBEDDING = torch.tensor([[
    -0.0917, -0.0461, 0.0347, 0.0341, 0.0197, -0.0438, -0.0377, -0.0212,
    0.0361, 0.0220, -0.0676, -0.0731, 0.0827, 0.0132, 0.0187, 0.0577,
    -0.0026, 0.0618, 0.0088, 0.0159, 0.0344, 0.0243, -0.0164, -0.0430,
    -0.0556, -0.0044, -0.0413, -0.0003, 0.0310, 0.0369, -0.0034, 0.0424,
    0.0474, 0.0102, 0.0392, -0.0611, 0.0405, 0.0652, -0.0386, -0.0638,
    0.0255, -0.0411, 0.0398, 0.0490, 0.0297, -0.1218, -0.0206, 0.0146,
    -0.0649, 0.0550, 0.0177, 0.0407, 0.0017, -0.0113, -0.0990, -0.0015,
    0.0158, 0.0481, 0.0286, 0.0300, 0.0346, -0.0104, -0.0142, -0.0005,
    0.0264, 0.0412, 0.0227, -0.0389, -0.0489, -0.0750, 0.0238, 0.0101,
    0.0171, 0.0141, 0.0224, 0.0344, 0.0402, 0.0336, -0.0641, -0.0818,
    -0.0731, -0.0470, -0.0512, -0.0602, -0.0344, -0.0442, -0.0541, 0.0097,
    0.0198, 0.0482, 0.0323, -0.0885, 0.0210, -0.0798, 0.0417, -0.0436,
    0.0402, 0.0256, -0.0641, -0.0668, -0.0023, -0.0706, -0.0928, 0.0121,
    0.0355, -0.0376, 0.0522, 0.0482, 0.0200, 0.0290, -0.0698, -0.0232,
    0.0878, 0.0044, 0.0559, 0.0581, -0.0718, 0.0095, -0.0538, 0.0125,
    0.0023, -0.0562, 0.0424, 0.0261, -0.0498, 0.0255, -0.0840, 0.0331,
    0.0406, 0.0162, -0.0522, 0.0218, 0.0323, 0.0359, 0.0128, -0.0891,
    -0.0569, 0.0031, -0.0694, -0.0102, 0.0118, 0.0033, 0.0127, 0.0589,
    -0.0783, 0.0179, 0.0200, -0.0371, 0.0325, -0.1033, 0.0483, -0.0343,
    -0.0714, 0.0102, 0.0665, 0.0278, 0.0285, -0.0653, -0.0834, 0.0196,
    0.0399, 0.0085, 0.0246, -0.0400, 0.0215, 0.0083, 0.0302, 0.0204,
    0.0360, 0.0309, -0.0306, -0.0828, 0.0142, -0.0614, -0.0103, 0.0372,
    -0.0456, 0.0291, 0.0565, -0.0271, 0.0518, -0.0671, 0.0012, -0.0048,
    -0.0565, -0.0092, 0.0336, 0.0476, -0.0351, -0.0698, 0.0487, 0.0313,
    -0.0491, 0.0401, 0.0246, 0.0178, 0.0405, 0.0012, 0.0311, -0.0041,
    0.0367, 0.0330, -0.0609, 0.0099, -0.0097, 0.0173, 0.0494, -0.0305,
    0.0272, -0.0349, 0.0025, -0.0697, -0.0414, 0.0604, -0.0707, 0.0420,
    0.0380, -0.0731, 0.0546, 0.0339, -0.0758, 0.0365, -0.0712, -0.0140,
    0.0365, 0.0477, 0.0796, 0.0572, 0.0212, 0.0098, 0.0133, 0.0261,
    0.0329, -0.0269, 0.0437, -0.0359, 0.0296, 0.0180, -0.0008, 0.0668,
    -0.0448, 0.0269, -0.0734, 0.0194, -0.0494, 0.0432, 0.0449, 0.0442,
    0.0389, 0.0530, 0.0420, 0.0021, 0.0084, -0.0820, -0.0081, 0.0326,
    0.0265, 0.0536, -0.0714, 0.0188, 0.0298, -0.0737, 0.0110, 0.0340,
    0.0016, 0.0262, 0.0179, 0.0109, 0.0426, -0.0538, 0.0649, 0.0160,
    0.0146, -0.0419, -0.0851, 0.0138, 0.0399, 0.0445, -0.0849, -0.0425,
    0.0293, 0.0477, 0.0108, -0.0941, -0.0386, 0.0600, 0.0089, 0.0557,
    -0.0892, 0.0026, 0.0192, 0.0136, -0.0207, -0.0023, 0.0163, 0.0263,
    -0.0112, 0.0245, 0.0411, 0.0285, 0.0267, 0.0297, 0.0213, -0.0577,
    0.0169, 0.0592, 0.0227, 0.0290, 0.0074, 0.0197, 0.0282, 0.0368,
    0.0064, 0.0092, -0.0896, -0.0693, -0.0295, 0.0316, -0.0674, 0.0645,
    -0.0655, 0.0355, -0.0389, 0.0134, 0.0299, -0.0534, 0.0537, 0.0900,
    -0.0770, -0.0666, -0.0600, -0.0019, 0.0276, 0.0590, -0.0705, 0.0222,
    0.0517, -0.0089, 0.0063, -0.0270, 0.0185, -0.0626, -0.0065, 0.0187,
    -0.0670, 0.0216, 0.0356, 0.0384, -0.0268, -0.0628, -0.0443, -0.0195,
    -0.0495, 0.1405, 0.0274, -0.0455, -0.0068, 0.0686, -0.0756, -0.0073,
    -0.0981, 0.0025, 0.0383, 0.0157, 0.0651, 0.0252, -0.0665, 0.0054,
    0.0223, 0.0509, 0.0101, 0.0454, -0.0527, 0.0252, -0.0157, -0.0022,
    0.0526, 0.0224, 0.0494, 0.0293, -0.0808, -0.1220, 0.0196, 0.0135,
    0.0303, -0.0467, 0.0411, -0.0639, 0.0358, 0.0499, 0.0425, 0.0169,
    -0.0579, 0.0388, 0.0414, -0.0101, 0.0490, -0.0773, 0.0478, -0.0238,
    -0.0142, -0.0508, 0.0018, -0.0085, 0.0198, 0.0126, 0.0133, -0.0554,
    -0.0583, -0.0699, -0.0167, 0.0131, 0.0288, -0.0132, 0.0343, -0.0476,
    -0.0039, -0.0825, -0.1180, -0.0570, -0.0590, 0.0233, 0.0500, -0.0328,
    -0.0426, 0.0241, 0.0441, 0.0372, 0.0488, -0.0366, -0.0233, -0.0118,
    -0.0256, 0.0254, 0.0041, 0.0119, 0.0423, 0.0178, -0.0245, -0.0769,
    0.0056, 0.0428, 0.0341, -0.0009, -0.0197, 0.0395, 0.0247, 0.0090,
    0.0098, -0.0083, 0.0346, 0.0411, 0.0416, 0.0413, 0.0312, 0.0054,
    0.0390, -0.0571, -0.0403, 0.0441, -0.0132, 0.0117, 0.0467, 0.0516,
    -0.0639, 0.0296, 0.0337, -0.0557, 0.0110, 0.0277, -0.0026, 0.0347,
    0.0301, 0.0056, -0.0572, -0.0663, 0.0124, -0.0065, 0.0222, 0.0441,
    -0.0570, -0.0519, 0.0132, 0.0323, 0.0401, 0.0357, -0.0555, 0.0310,
    0.0028, -0.0102, -0.0598, 0.0153, -0.0438, 0.0268, -0.0097, 0.0388,
    -0.0330, -0.0277, -0.0581, -0.0389, 0.0099, 0.0371, -0.0455, 0.0553,
    0.0753, -0.0154, -0.0385, 0.0359, 0.0403, 0.0464, 0.0499, -0.0365,
]])


def generate_audio(text):
    """Synthesize speech for Urdu *text* and return the waveform.

    The text is romanized with ``transString``, tokenized by the module-level
    ``processor``, and passed to ``model.generate_speech`` together with the
    fixed speaker embedding and the module-level ``vocoder``.

    Args:
        text: Urdu-script input string.

    Returns:
        Whatever ``model.generate_speech`` returns when a vocoder is given
        (presumably a 1-D waveform tensor; the caller invokes ``.numpy()``
        on it).
    """
    # The model is fed Roman Urdu rather than Urdu script.
    roman_urdu = transString(text)

    # The former ``type="numpy"`` keyword was dropped: it is not a documented
    # processor/tokenizer argument and contradicted ``return_tensors="pt"``.
    inputs = processor(text=roman_urdu, return_tensors="pt")

    return model.generate_speech(
        inputs["input_ids"], _SPEAKER_EMBEDDING, vocoder=vocoder
    )
171
+
172
def text_to_speech(text):
    """Synthesize *text* and return the path of the resulting WAV file.

    Args:
        text: Urdu-script input string, forwarded to ``generate_audio``.

    Returns:
        Filesystem path (str) of a 16-bit PCM WAV file holding the audio.
    """
    import tempfile  # function-scope import: no change to the file's import block

    audio_output = generate_audio(text)

    # Each call gets its own file. A single shared "output.wav" could be
    # overwritten by an overlapping request before it was served.
    # NOTE(review): files are created with delete=False and are not removed
    # here; confirm the host cleans its temp directory.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    # 16000 Hz is the sample rate the original code wrote with
    # (presumably the vocoder's output rate - confirm against the model card).
    sf.write(output_path, audio_output.numpy(), 16000, "PCM_16")

    return output_path
180
+
181
+
182
# Sample inputs shown under the text box; each inner list is one example row.
examples = [
    ['اگر رشتے داری ہے تو پیسے کی'],
    ['مجھے کھانے سے لطف نہیں آیا۔'],
]

# Text in, audio file out. The deprecated ``verbose`` argument was removed
# because newer Gradio releases may reject it.
interface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Urdu TTS",
    description=(
        "A simple Urdu Text to Speech Application. It is not by any means "
        "perfect and will not work for all text. You can sometimes expect it "
        "to generate random noise on an input of your choice. Right now it "
        "works successfully on very basic urdu text, such as the ones in the "
        "examples."
    ),
    examples=examples,
)

# Start the web server; this blocks when the file is run as a script.
interface.launch()