Chithekitale committed on
Commit
5f93243
·
verified ·
1 Parent(s): 352be2c

Upload 12 files

Browse files
README (2).md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Rule-Integrated Chichewa TTS
3
+ emoji: 👩‍🎤
4
+ colorFrom: yellow
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 6.9.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app (1).py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import torch
4
+ import tempfile
5
+ import os
6
+ from scipy.io.wavfile import write
7
+ from transformers import (
8
+ SpeechT5Processor,
9
+ SpeechT5ForTextToSpeech,
10
+ SpeechT5HifiGan
11
+ )
12
+
13
+ # =========================
14
+ # Model loading
15
+ # =========================
16
# Acoustic-model weights are read from `checkpoint`; the text processor and
# the HiFi-GAN vocoder are loaded from the Microsoft SpeechT5 repositories.
checkpoint = "Chithekitale/Chichewa_tts_v2"

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker key -> path of the saved embedding (.npy) for that voice.
# NOTE(review): SPK1 maps to speaker_2.npy and SPK2 to speaker_1.npy;
# presumably deliberate -- confirm against the embedding files.
speaker_embeddings = dict(
    SPK1="spkemb/speaker_2.npy",
    SPK2="spkemb/speaker_1.npy",
    SPK3="spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    SPK4="spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    SPK5="spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
)

# Labels shown in the UI. The first whitespace-separated token of each label
# is the key used to look up `speaker_embeddings`.
SPEAKER_CHOICES = [
    "SPK1 (female)",
    "SPK2 (male)",
    "SPK3 (male)",
    "SPK4 (male)",
    "SPK5 (female)",
]

# Example rows for the UI: [input text, speaker label].
EXAMPLES = [
    ["Ndapita, koma ndibweranso pompano.", "SPK1 (female)"],
    ["Koma apapa zikuoneka kuti ziyenda bwino.", "SPK2 (male)"],
    ["Ineyo ndikuona kuti sizizasithanso.", "SPK3 (male)"],
    [
        "Mwina kusogolo kuno anthu ena azalimba mtima, koma panopana ndakaika.",
        "SPK4 (male)",
    ],
    ["Simungasankhe munthu oti bola linamukana.", "SPK5 (female)"],
    [
        "Kodi chimanga panopa chikugulisidwa zingati, kapena nanunso simukudziwa?",
        "SPK5 (female)",
    ],
]

# Sample rate (Hz) attached to the returned audio and written to the WAV file.
SAMPLE_RATE = 16000
49
+
50
+ # =========================
51
+ # Helpers
52
+ # =========================
53
def get_speaker_key(speaker_label: str) -> str:
    """Return the speaker key at the start of a UI label.

    The key is the first whitespace-separated token of the label, e.g.
    ``"SPK1 (female)"`` -> ``"SPK1"``.

    Args:
        speaker_label: Label text such as ``"SPK1 (female)"``.

    Returns:
        The first token of ``speaker_label``.

    Raises:
        ValueError: If ``speaker_label`` is not a string or contains no
            non-whitespace characters (for example when nothing is selected).
    """
    if not isinstance(speaker_label, str) or not speaker_label.strip():
        raise ValueError(f"Invalid speaker label: {speaker_label!r}")

    # Only the first token is needed, so stop splitting after it.
    return speaker_label.split(maxsplit=1)[0]
56
+
57
def load_speaker_embedding(speaker: str) -> np.ndarray:
    """Load the speaker embedding that belongs to a UI speaker label.

    Args:
        speaker: Label such as ``"SPK1 (female)"``. Its key is extracted with
            ``get_speaker_key`` and looked up in ``speaker_embeddings``.

    Returns:
        A ``float32`` array of shape ``(512,)``.

    Raises:
        ValueError: If the speaker key is unknown, or the array does not have
            shape ``(512,)`` after processing.
        FileNotFoundError: If the embedding file cannot be loaded. The
            underlying error is attached as ``__cause__``.
    """
    speaker_key = get_speaker_key(speaker)

    try:
        path = speaker_embeddings[speaker_key]
    except KeyError:
        raise ValueError(f"Unknown speaker key: {speaker_key}") from None

    try:
        speaker_embedding = np.load(path).astype(np.float32)
    except Exception as e:
        # Kept broad on purpose: every load failure is reported to callers as
        # FileNotFoundError, but the original exception is chained so the
        # real cause (missing file, corrupt data, ...) stays visible.
        raise FileNotFoundError(
            f"Could not load speaker embedding file: {path}. Error: {e}"
        ) from e

    # A 2-D array is collapsed to a single vector by averaging over axis 0.
    if speaker_embedding.ndim == 2:
        speaker_embedding = speaker_embedding.mean(axis=0)

    # Drop any remaining singleton dimensions before the shape check.
    speaker_embedding = np.squeeze(speaker_embedding)

    if speaker_embedding.shape != (512,):
        raise ValueError(
            f"Unexpected speaker embedding shape after processing: "
            f"{speaker_embedding.shape}. Expected (512,)"
        )

    return speaker_embedding
84
+
85
def save_audio_to_wav(audio: np.ndarray, sample_rate: int = SAMPLE_RATE) -> str:
    """Write audio samples to a new temporary WAV file and return its path.

    Args:
        audio: Sample array handed to ``scipy.io.wavfile.write`` unchanged.
        sample_rate: Sample rate in Hz stored in the WAV file.

    Returns:
        Path of the created ``.wav`` file. The file is not deleted
        automatically; removing it is the caller's responsibility.

    Raises:
        Exception: Whatever ``scipy.io.wavfile.write`` raises. In that case
            the temporary file is removed before the error propagates.
    """
    # Reserve a unique path, then close the handle so the writer can reopen
    # the file by name (delete=False keeps it on disk after closing).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        wav_path = temp_file.name

    try:
        write(wav_path, sample_rate, audio)
    except Exception:
        # Do not leave an orphaned file behind when writing fails.
        try:
            os.remove(wav_path)
        except OSError:
            pass
        raise

    return wav_path
93
+
94
+ # =========================
95
+ # Inference
96
+ # =========================
97
def _to_int16_pcm(waveform: np.ndarray) -> np.ndarray:
    """Scale a waveform so its peak is full-scale and convert it to int16."""
    peak = np.max(np.abs(waveform))
    # An all-zero waveform is left as is to avoid dividing by zero.
    if peak > 0:
        waveform = waveform / peak
    return (waveform * 32767).astype(np.int16)


def predict(text, speaker):
    """Synthesize speech for ``text`` using the selected speaker voice.

    Args:
        text: Text typed into the input box.
        speaker: Speaker label such as ``"SPK1 (female)"``.

    Returns:
        A 3-tuple ``(audio, wav_path, status)``:
        ``audio`` is ``(SAMPLE_RATE, int16_samples)`` or ``None``,
        ``wav_path`` is the path of the saved WAV file or ``None``, and
        ``status`` is a human-readable message. Failures never raise; they are
        reported through ``status`` with the other two values set to ``None``.
    """
    try:
        if not text or not text.strip():
            return None, None, "Please enter some Chichewa text."

        # Without a selected speaker the label lookup below would fail with
        # an opaque internal error, so report it directly.
        if not speaker:
            return None, None, "Please select a speaker voice."

        inputs = processor(text=text, return_tensors="pt")
        # Clip the token sequence to the model's configured max_text_positions.
        input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

        speaker_embedding = torch.tensor(
            load_speaker_embedding(speaker), dtype=torch.float32
        ).unsqueeze(0)  # add a leading batch dimension

        with torch.no_grad():
            speech = model.generate_speech(
                input_ids,
                speaker_embedding,
                vocoder=vocoder,
            )

        speech = speech.cpu().numpy()
        if speech.size == 0:
            return None, None, "Error during generation: the model produced no audio."

        speech = _to_int16_pcm(speech)

        # Save a WAV file so the result can be downloaded.
        wav_path = save_audio_to_wav(speech, SAMPLE_RATE)

        status = f"Generated speech successfully using speaker: {speaker}"
        return (SAMPLE_RATE, speech), wav_path, status

    except Exception as e:
        # UI boundary: show the failure in the status box instead of raising,
        # but keep the traceback in the logs for debugging.
        import logging

        logging.getLogger(__name__).exception("Speech generation failed")
        return None, None, f"Error during generation: {str(e)}"
134
+
135
def clear_all():
    """Return the reset values for the form.

    The 5-tuple holds, in order: empty input text, the default speaker label,
    no audio, no download file, and the idle status message.
    """
    empty_text = ""
    default_speaker = "SPK1 (female)"
    idle_status = "Ready."
    return empty_text, default_speaker, None, None, idle_status
137
+
138
+ # =========================
139
+ # UI
140
+ # =========================
141
# Page-level styling: centered, width-limited layout plus the header classes
# used by the HTML banner below.
custom_css = """
.gradio-container {
    max-width: 1100px !important;
    margin: 0 auto;
}
.hero {
    text-align: center;
    padding: 10px 0 0 0;
}
.section-note {
    font-size: 0.95rem;
    opacity: 0.9;
}
"""

with gr.Blocks(title="Chichewa Speech Synthesis Demo") as demo:
    gr.HTML(
        """
        <div class="hero">
            <h1>Rule-Integrated Chichewa Speech Synthesis</h1>
            <p class="section-note">
                Enter Chichewa text, choose a speaker voice, and generate speech audio.
            </p>
        </div>
        """
    )

    with gr.Row():
        # Left column: inputs, action buttons and status line.
        with gr.Column(scale=5):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Type Chichewa text here...",
                lines=6,
            )

            speaker_input = gr.Radio(
                label="Speaker Voice",
                choices=SPEAKER_CHOICES,
                value="SPK1 (female)",
            )

            with gr.Row():
                generate_btn = gr.Button("Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear")

            status_box = gr.Textbox(
                label="System Status",
                value="Ready.",
                interactive=False,
            )

        # Right column: playback and downloadable file.
        with gr.Column(scale=5):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=False,
            )

            download_file = gr.File(
                label="Download Audio File",
            )

    gr.Markdown("### Example Inputs")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[text_input, speaker_input],
    )

    # predict returns (audio, wav_path, status) in this output order.
    generate_btn.click(
        fn=predict,
        inputs=[text_input, speaker_input],
        outputs=[audio_output, download_file, status_box],
        show_progress="full",
    )

    # clear_all resets every input and output component.
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[text_input, speaker_input, audio_output, download_file, status_box],
    )

# The Space pins Gradio 6.x, where app-level options such as `css` are passed
# to launch() rather than to the gr.Blocks constructor.
demo.launch(css=custom_css)
cmu_us_awb_arctic-wav-arctic_a0002 (1).npy ADDED
Binary file (2.18 kB). View file
 
cmu_us_bdl_arctic-wav-arctic_a0009.npy ADDED
Binary file (2.18 kB). View file
 
cmu_us_clb_arctic-wav-arctic_a0144.npy ADDED
Binary file (2.18 kB). View file
 
cmu_us_ksp_arctic-wav-arctic_b0087.npy ADDED
Binary file (2.18 kB). View file
 
cmu_us_rms_arctic-wav-arctic_b0353.npy ADDED
Binary file (2.18 kB). View file
 
cmu_us_slt_arctic-wav-arctic_a0508.npy ADDED
Binary file (2.18 kB). View file
 
gitignore (1) ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.pyc
2
+ __pycache__/
3
+ .DS_Store
4
+
one (1).npy ADDED
Binary file (2.18 kB). View file
 
requirements (1).txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers.git
2
+ torch
3
+ torchaudio
4
+ soundfile
5
+ librosa
6
+ samplerate
7
+ resampy
8
+ sentencepiece
two (1).npy ADDED
Binary file (2.18 kB). View file