Yamanedz committed
Commit 660a8ce · verified · 1 Parent(s): a846017

Deploy Gradio app with multiple files

Files changed (5)
  1. app.py +348 -0
  2. config.py +82 -0
  3. models.py +188 -0
  4. requirements.txt +7 -0
  5. utils.py +66 -0
app.py ADDED
@@ -0,0 +1,348 @@
+ import gradio as gr
+ import os
+ from typing import Dict, Any, List, Tuple
+ from config import LOCAL_STRINGS, TEMP_DIR, DEFAULT_RVC_MODEL_PATH, LANGUAGES
+ from utils import get_localized_strings, clean_file_paths
+ from models import rvc_training_mock, rvc_conversion_mock, tts_inference
+
+ # Session state (selected language, trained model path) is created inside the
+ # gr.Blocks context below: gr.State only works when attached to a Blocks app.
+ def update_ui_labels(lang: str) -> List[Any]:
+     """Returns one gr.update() per localized component.
+
+     The order here must match `localization_outputs` defined further down.
+     """
+     strings = get_localized_strings(lang)
+     return [
+         gr.update(label=strings["lang_select"]),                                  # lang_radio
+         gr.update(value=f"[{strings['subtitle']}]({strings['subtitle_link']})"),  # subtitle
+         # Tabs
+         gr.update(label=strings["tab_train"]),
+         gr.update(label=strings["tab_convert"]),
+         gr.update(label=strings["tab_tts"]),
+         # Training tab
+         gr.update(value=strings["train_desc"]),
+         gr.update(label=strings["train_input_audio"]),
+         gr.update(label=strings["train_input_name"]),
+         gr.update(value=strings["train_btn"]),
+         gr.update(label=strings["train_status"]),
+         gr.update(label=strings["train_output_file"]),
+         # Conversion tab
+         gr.update(value=strings["convert_desc"]),
+         gr.update(label=strings["convert_input_singer"]),
+         gr.update(label=strings["convert_input_model"]),
+         gr.update(label=strings["convert_pitch"]),
+         gr.update(label=strings["convert_index_rate"]),
+         gr.update(value=strings["convert_btn"]),
+         gr.update(label=strings["convert_output"]),
+         # TTS tab
+         gr.update(value=strings["tts_desc"]),
+         gr.update(label=strings["tts_input"]),
+         gr.update(label=strings["tts_speed"]),
+         gr.update(value=strings["tts_btn"]),
+         gr.update(label=strings["tts_output"]),
+     ]
+
+ # --- Gradio Application ---
+
+ with gr.Blocks(theme=gr.themes.Base(), css="""
+ .container {
+     max-width: 1000px;
+     margin: auto;
+ }
+ .rtl {
+     direction: rtl;
+     text-align: right;
+ }
+ .rtl label {
+     float: right !important;
+ }
+ .rtl .markdown {
+     text-align: right;
+ }
+ """) as demo:
+
+     # Session state: selected language and path of the last trained model file
+     current_lang_state = gr.State("en")
+     trained_model_file_path = gr.State(DEFAULT_RVC_MODEL_PATH)
+
+     # Title container; set_language() refills it (with an RTL class for Arabic)
+     app_title_html = gr.HTML(
+         f"<div class='container' id='app_title_container'><h1>{LOCAL_STRINGS['en']['title']}</h1></div>"
+     )
+
+     # Language selector row
+     with gr.Row(elem_id="lang_row"):
+         lang_radio = gr.Radio(
+             LANGUAGES,
+             value="en",
+             label=LOCAL_STRINGS["en"]["lang_select"],
+             elem_id="lang_radio",
+             scale=0
+         )
+         subtitle = gr.Markdown(
+             f"[{LOCAL_STRINGS['en']['subtitle']}]({LOCAL_STRINGS['en']['subtitle_link']})",
+             elem_classes=["subtitle-link"]
+         )
+
+     # Main content area (a Column, not a nested Blocks: Blocks cannot be nested)
+     with gr.Column(elem_id="main_content") as content_block:
+
+         # Training Tab
+         with gr.Tab(label=LOCAL_STRINGS["en"]["tab_train"], elem_id="tab_train") as tab_train:
+             train_desc = gr.Markdown(LOCAL_STRINGS["en"]["train_desc"])
+
+             with gr.Row():
+                 train_input_audio = gr.Audio(
+                     sources=["upload", "microphone"],
+                     type="filepath",
+                     label=LOCAL_STRINGS["en"]["train_input_audio"]
+                 )
+                 train_input_name = gr.Textbox(
+                     label=LOCAL_STRINGS["en"]["train_input_name"],
+                     value="MyVoiceModel"
+                 )
+
+             train_btn = gr.Button(LOCAL_STRINGS["en"]["train_btn"], variant="primary")
+             train_status = gr.Textbox(
+                 label=LOCAL_STRINGS["en"]["train_status"],
+                 interactive=False
+             )
+             train_output_file = gr.File(
+                 label=LOCAL_STRINGS["en"]["train_output_file"],
+                 visible=False,
+                 type="filepath",
+                 file_count="single"
+             )
+
+         # Conversion Tab
+         with gr.Tab(label=LOCAL_STRINGS["en"]["tab_convert"], elem_id="tab_convert") as tab_convert:
+             convert_desc = gr.Markdown(LOCAL_STRINGS["en"]["convert_desc"])
+
+             with gr.Row():
+                 convert_input_singer = gr.Audio(
+                     sources=["upload", "microphone"],
+                     type="filepath",
+                     label=LOCAL_STRINGS["en"]["convert_input_singer"]
+                 )
+                 convert_input_model_file = gr.File(
+                     label=LOCAL_STRINGS["en"]["convert_input_model"],
+                     file_types=[".pth"],
+                     type="filepath"  # gr.File accepts "filepath" or "binary"; "file" is not valid
+                 )
+
+             with gr.Row():
+                 convert_pitch = gr.Slider(
+                     minimum=-12,
+                     maximum=12,
+                     step=1,
+                     value=0,
+                     label=LOCAL_STRINGS["en"]["convert_pitch"]
+                 )
+                 convert_index_rate = gr.Slider(
+                     minimum=0.0,
+                     maximum=1.0,
+                     step=0.05,
+                     value=0.7,
+                     label=LOCAL_STRINGS["en"]["convert_index_rate"]
+                 )
+
+             convert_btn = gr.Button(LOCAL_STRINGS["en"]["convert_btn"], variant="primary")
+             convert_output = gr.Audio(
+                 label=LOCAL_STRINGS["en"]["convert_output"],
+                 interactive=False
+             )
+
+         # TTS Tab
+         with gr.Tab(label=LOCAL_STRINGS["en"]["tab_tts"], elem_id="tab_tts") as tab_tts:
+             tts_desc = gr.Markdown(LOCAL_STRINGS["en"]["tts_desc"])
+
+             tts_input = gr.Textbox(
+                 label=LOCAL_STRINGS["en"]["tts_input"],
+                 placeholder="The quick brown fox jumps over the lazy dog.",
+                 lines=3
+             )
+
+             with gr.Row():
+                 tts_speed = gr.Slider(
+                     minimum=0.5,
+                     maximum=1.5,
+                     step=0.1,
+                     value=1.0,
+                     label=LOCAL_STRINGS["en"]["tts_speed"]
+                 )
+
+             tts_btn = gr.Button(LOCAL_STRINGS["en"]["tts_btn"], variant="primary")
+             tts_output = gr.Audio(
+                 label=LOCAL_STRINGS["en"]["tts_output"],
+                 interactive=False
+             )
+
+     # --- Event Handlers ---
+
+     # 1. Localization Handler
+     def set_language(lang: str) -> list:
+         """Builds the full update list: RTL-aware title, all labels, then the state."""
+         strings = get_localized_strings(lang)
+         # Apply the RTL class when Arabic is selected
+         rtl_class = " rtl" if lang == "ar" else ""
+         title_update = gr.update(
+             value=f"<div class='container{rtl_class}' id='app_title_container'>"
+                   f"<h1>{strings['title']}</h1></div>"
+         )
+         # The language value itself is returned last, into current_lang_state
+         return [title_update] + update_ui_labels(lang) + [lang]
+
+     # Outputs for the language-change event; order must match set_language()
+     localization_outputs = [
+         app_title_html,
+         lang_radio, subtitle,
+         tab_train, tab_convert, tab_tts,
+         train_desc, train_input_audio, train_input_name,
+         train_btn, train_status, train_output_file,
+         convert_desc, convert_input_singer, convert_input_model_file,
+         convert_pitch, convert_index_rate, convert_btn, convert_output,
+         tts_desc, tts_input, tts_speed, tts_btn, tts_output,
+         current_lang_state,
+     ]
+
+     lang_radio.change(
+         set_language,
+         inputs=[lang_radio],
+         outputs=localization_outputs,
+         queue=False,
+         show_progress="hidden"
+     )
+
+     # 2. Training Handler
+     def handle_training_output(model_path: str):
+         """Reveals the download link once a trained model file exists on disk."""
+         if not model_path or not os.path.exists(model_path):
+             return gr.update(visible=False)
+         return gr.update(value=model_path, visible=True)
+
+     train_btn.click(
+         fn=rvc_training_mock,
+         inputs=[train_input_audio, train_input_name, current_lang_state],
+         outputs=[trained_model_file_path, train_status]  # (model_path, log) from the mock
+     ).then(
+         fn=handle_training_output,
+         inputs=[trained_model_file_path],
+         outputs=[train_output_file]
+     )
+
+     # 3. Conversion Handler
+     convert_btn.click(
+         fn=rvc_conversion_mock,
+         inputs=[
+             convert_input_singer,
+             convert_input_model_file,  # with type="filepath" Gradio passes a path string
+             convert_pitch,
+             convert_index_rate,
+             current_lang_state
+         ],
+         outputs=convert_output
+     )
+
+     # 4. TTS Handler
+     tts_btn.click(
+         fn=tts_inference,
+         inputs=[tts_input, current_lang_state, tts_speed],
+         outputs=tts_output
+     )
+
+     # Initial UI setup: run the language setter once on page load so the title
+     # and labels are populated (components must already be defined at this point)
+     demo.load(
+         set_language,
+         inputs=[lang_radio],
+         outputs=localization_outputs,
+         queue=False,
+         show_progress="hidden"
+     )
+
+     # Cleanup when the session closes. demo.unload() only accepts a
+     # zero-argument callback, so sweep the temp directory instead of
+     # passing per-session component values.
+     demo.unload(
+         lambda: clean_file_paths(
+             [os.path.join(TEMP_DIR, f) for f in os.listdir(TEMP_DIR)]
+             if os.path.isdir(TEMP_DIR) else []
+         )
+     )
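+
+     # Alternative worth noting (an assumption, needs a recent Gradio 4.x):
+     # gr.Blocks(..., delete_cache=(3600, 3600)) can purge cached temp files
+     # automatically, which would make the manual sweep above unnecessary.
+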
+ if __name__ == "__main__":
+     demo.launch()
config.py ADDED
@@ -0,0 +1,82 @@
+ import os
+ from typing import Dict, Any, List
+
+ # --- Paths and Constants ---
+ TEMP_DIR = "temp_models"
+ os.makedirs(TEMP_DIR, exist_ok=True)
+
+ DEFAULT_RVC_MODEL_PATH = os.path.join(TEMP_DIR, "placeholder_rvc_model.pth")
+ DEFAULT_SR = 48000
+
+ # --- Localization Data (English and Arabic) ---
+
+ LOCAL_STRINGS: Dict[str, Dict[str, Any]] = {
+     "en": {
+         "title": "RVC Voice Cloning and Conversion Suite",
+         "subtitle": "Built with anycoder",
+         "subtitle_link": "https://huggingface.co/spaces/akhaliq/anycoder",
+         "lang_select": "Select Language",
+         "tab_train": "1. Voice Cloning (Training)",
+         "tab_convert": "2. Voice Conversion (Singing)",
+         "tab_tts": "3. Text-to-Speech",
+         "tts_desc": "Generate speech using a general AI voice model.",
+         "tts_input": "Text Input",
+         "tts_output": "Generated Speech",
+         "tts_btn": "Generate Speech",
+         "train_desc": "Upload 1-5 minutes of clear voice audio to create your clone. Output file is downloadable.",
+         "train_input_audio": "Upload Voice Sample Audio (WAV/MP3)",
+         "train_input_name": "Model Name (e.g., my_voice)",
+         "train_btn": "Start Voice Training (Mock)",
+         "train_output_file": "Download Trained Model (.pth)",
+         "train_status": "Training Status/Log",
+         "convert_desc": "Convert a singer's voice in an audio file to your cloned voice.",
+         "convert_input_singer": "Upload Singer Audio (WAV/MP3) to Convert",
+         "convert_input_model": "Load Trained Voice Model (.pth file)",
+         "convert_pitch": "Pitch Change (Semitones)",
+         "convert_index_rate": "Index Rate (Higher = More Fidelity to Target Voice)",
+         "convert_btn": "Perform Voice Conversion",
+         "convert_output": "Converted Audio Output",
+         "voice_select": "Select Target Voice Model",
+         "tts_speed": "Speech Speed (1.0 = Normal)",
+         "tts_voice": "TTS Voice Speaker (Default)",
+     },
+     "ar": {
+         "title": "حزمة تحويل واستنساخ الصوت RVC",
+         "subtitle": "مبني بواسطة anycoder",
+         "subtitle_link": "https://huggingface.co/spaces/akhaliq/anycoder",
+         "lang_select": "اختر اللغة",
+         "tab_train": "1. استنساخ الصوت (التدريب)",
+         "tab_convert": "2. تحويل الصوت (الغناء)",
+         "tab_tts": "3. تحويل النص إلى كلام",
+         "tts_desc": "إنشاء كلام باستخدام نموذج صوتي عام للذكاء الاصطناعي.",
+         "tts_input": "إدخال النص",
+         "tts_output": "الكلام الناتج",
+         "tts_btn": "توليد الكلام",
+         "train_desc": "قم بتحميل 1-5 دقائق من الصوت الواضح لإنشاء نسختك. يمكن تحميل الملف الناتج مباشرة.",
+         "train_input_audio": "تحميل عينة صوتية للتدريب (WAV/MP3)",
+         "train_input_name": "اسم النموذج (مثال: صوتي)",
+         "train_btn": "بدء تدريب الصوت (محاكاة)",
+         "train_output_file": "تحميل النموذج المدرب (.pth)",
+         "train_status": "حالة / سجل التدريب",
+         "convert_desc": "تحويل صوت المغني في ملف صوتي إلى صوتك المستنسخ.",
+         "convert_input_singer": "تحميل صوت المغني المراد تحويله (WAV/MP3)",
+         "convert_input_model": "تحميل نموذج الصوت المدرب (ملف .pth)",
+         "convert_pitch": "تغيير حدة الصوت (نغمات نصفية)",
+         "convert_index_rate": "معدل الفهرس (أعلى = ولاء أكبر للصوت الهدف)",
+         "convert_btn": "تنفيذ تحويل الصوت",
+         "convert_output": "إخراج الصوت المحول",
+         "voice_select": "اختيار نموذج الصوت الهدف",
+         "tts_speed": "سرعة الكلام (1.0 = عادي)",
+         "tts_voice": "المتحدث (افتراضي)",
+     },
+ }
+
+ # Supported languages
+ LANGUAGES = ["en", "ar"]
+
+ # TTS configuration: the SpeechT5 checkpoints actually loaded in models.py
+ TTS_MODEL_ID = "microsoft/speecht5_tts"
+ TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
+
+ # Audio normalization factor for simulation (16-bit PCM max)
+ MAX_WAV_VALUE = 32767
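+
+ # For reference, the convention used throughout this app: Gradio audio tuples
+ # carry int16 PCM, so a float waveform in [-1.0, 1.0] converts via
+ #   (audio * MAX_WAV_VALUE).astype(np.int16)
+ # and back via audio_int16.astype(np.float32) / MAX_WAV_VALUE.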
models.py ADDED
@@ -0,0 +1,188 @@
+ import torch
+ import numpy as np
+ import time
+ import os
+ import gradio as gr
+ import spaces
+ from typing import Tuple
+ from datasets import load_dataset
+ from transformers import AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from utils import (
+     get_localized_strings, generate_mock_pth, log_status,
+     load_audio_from_path, get_rvc_model_path
+ )
+ from config import (
+     TTS_MODEL_ID, TTS_VOCODER_ID, DEFAULT_SR,
+     MAX_WAV_VALUE, TEMP_DIR, DEFAULT_RVC_MODEL_PATH
+ )
+
+ # --- TTS Setup ---
+ try:
+     tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_ID)
+     tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL_ID)
+     tts_vocoder = SpeechT5HifiGan.from_pretrained(TTS_VOCODER_ID)
+
+     # Load a speaker x-vector embedding. (The previous torchaudio LJSPEECH
+     # lookup returned a sample rate, not an embedding; cmu-arctic-xvectors is
+     # the dataset commonly paired with SpeechT5, index 7306 being the usual
+     # US-English speaker from the documentation examples.)
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+ except Exception as e:
+     print(f"Warning: Could not load full TTS models. Falling back to dummy functions. Error: {e}")
+     # Define placeholder variables if models fail to load
+     tts_model = None
+     tts_processor = None
+     tts_vocoder = None
+     speaker_embeddings = None
+
+ if torch.cuda.is_available():
+     device = "cuda"
+     if tts_model:
+         tts_model.to(device)
+         tts_vocoder.to(device)
+ else:
+     device = "cpu"
+
+ # --- Core Functions ---
+
+ @spaces.GPU(duration=120)
+ def tts_inference(text: str, lang: str, speed: float) -> Tuple[int, np.ndarray]:
+     """Performs Text-to-Speech using the loaded SpeechT5 model."""
+     if not tts_model:
+         # Dummy output (2 s of low-level noise) if the models are not available
+         dummy_audio = np.random.randint(-1000, 1000, size=int(DEFAULT_SR * 2), dtype=np.int16)
+         return DEFAULT_SR, dummy_audio
+
+     try:
+         inputs = tts_processor(text=text, return_tensors="pt")
+
+         # Note: the `speed` slider is currently cosmetic. True speed control would
+         # require modifying the generation step or post-processing the waveform;
+         # we rely on the model's default pacing here.
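+         # One post-hoc approach (a sketch, not wired in): librosa's time_stretch
+         # changes duration without shifting pitch, so the slider could be honored
+         # after `audio_data` is computed below:
+         #   if speed and speed != 1.0:
+         #       import librosa
+         #       audio_data = librosa.effects.time_stretch(audio_data, rate=speed)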
+
+         inputs = inputs.to(device)
+
+         with torch.no_grad():
+             speech = tts_model.generate_speech(
+                 inputs["input_ids"],
+                 speaker_embeddings.to(device) if speaker_embeddings is not None else None,
+                 vocoder=tts_vocoder
+             )
+
+         # Convert tensor to numpy array and scale to 16-bit PCM
+         audio_data = speech.cpu().numpy()
+
+         # Rescale float audio (-1.0 to 1.0) to int16 format
+         audio_int16 = (audio_data * MAX_WAV_VALUE).astype(np.int16)
+
+         return 16000, audio_int16  # SpeechT5's HiFi-GAN vocoder outputs 16 kHz audio
+
+     except Exception as e:
+         print(f"TTS Inference Error: {e}")
+         raise gr.Error(f"TTS failed: {str(e)}")
+
+
+ @spaces.GPU(duration=180)  # Real RVC training takes far longer; this is a sped-up simulation
+ def rvc_training_mock(audio_file_path: str, model_name: str, lang: str, progress=gr.Progress()) -> Tuple[str, str]:
+     """Simulates RVC model training and creates a downloadable file.
+
+     Returns (model_path, log_message); model_path is None on failure.
+     """
+     strings = get_localized_strings(lang)
+
+     if not audio_file_path:
+         raise gr.Error(log_status(lang, "No audio file provided: ") + strings["train_input_audio"])
+
+     progress(0, desc=log_status(lang, "Starting audio analysis..."))
+
+     # 1. Simulate data preparation and feature extraction
+     try:
+         sr, audio_data = load_audio_from_path(audio_file_path)
+         total_duration = len(audio_data) / sr
+     except Exception as e:
+         raise gr.Error(f"Audio file error: {e}")
+
+     if total_duration < 30:
+         return None, log_status(lang, "Audio duration too short for training (Min 30s recommended)")
+
+     # 2. Simulate training steps (e.g., 5 epochs)
+     progress(0.1, desc=log_status(lang, "Analyzing input features..."))
+     time.sleep(2)
+
+     for i in range(1, 6):
+         progress(0.1 + i * 0.15, desc=log_status(lang, "Simulating training epoch {i}/5...", i=i))
+         time.sleep(3)
+
+     # 3. Generate mock .pth file
+     progress(0.9, desc=log_status(lang, "Finalizing model and generating file..."))
+     model_path = generate_mock_pth(model_name, TEMP_DIR)
+
+     if not model_path:
+         raise gr.Error(log_status(lang, "Error creating model file."))
+
+     final_log = log_status(lang, "Training complete. Model saved to: {path}", path=model_path)
+     progress(1.0, desc=final_log)
+
+     # Return the path (stored in app state; the gr.File download link is shown afterwards)
+     return model_path, final_log
+
+
+ @spaces.GPU(duration=60)
+ def rvc_conversion_mock(
+     singer_audio_file: str,
+     model_file: str,  # with gr.File(type="filepath") this is a path string (or None)
+     pitch_change: int,
+     index_rate: float,
+     lang: str,
+     progress=gr.Progress()
+ ) -> Tuple[int, np.ndarray]:
+     """Simulates RVC voice conversion of a singer track using the cloned model."""
+     strings = get_localized_strings(lang)
+
+     if not singer_audio_file:
+         raise gr.Error(log_status(lang, "No audio file provided: ") + strings["convert_input_singer"])
+
+     model_path = get_rvc_model_path(model_file, "Simulated Model")
+
+     progress(0, desc=log_status(lang, "Starting conversion process..."))
+
+     # 1. Load input audio
+     try:
+         sr, input_audio = load_audio_from_path(singer_audio_file)
+     except Exception as e:
+         raise gr.Error(f"Audio file error: {e}")
+
+     # 2. Simulate conversion steps
+     progress(0.2, desc=log_status(lang, "Extracting source features and pitch ({pitch} ST)", pitch=pitch_change))
+     time.sleep(3)
+
+     progress(0.5, desc=log_status(lang, "Applying RVC Index (Rate: {rate})", rate=index_rate))
+     time.sleep(4)
+
+     # 3. Generate simulated converted audio. A real RVC pipeline would
+     # synthesize from model features; here we mix the original audio with
+     # noise and modulate it based on the parameters.
+     input_audio_float = input_audio.astype(np.float32) / MAX_WAV_VALUE
+
+     noise = np.random.normal(0, 0.1, len(input_audio_float))
+     converted_audio_float = input_audio_float * (1 + 0.1 * index_rate) + noise
+
+     # Crude pitch stand-in: raise amplitude slightly for positive pitch shifts
+     if pitch_change > 0:
+         converted_audio_float *= (1 + pitch_change / 30.0)
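+
+     # A more faithful simulation (a sketch, assuming librosa is acceptable
+     # here): an actual shift of `pitch_change` semitones instead of the
+     # amplitude tweak above:
+     #   import librosa
+     #   converted_audio_float = librosa.effects.pitch_shift(
+     #       converted_audio_float, sr=sr, n_steps=pitch_change)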
+
+     # Normalize output
+     max_val = np.max(np.abs(converted_audio_float))
+     if max_val > 1.0:
+         converted_audio_float /= max_val
+
+     converted_audio_int16 = (converted_audio_float * MAX_WAV_VALUE).astype(np.int16)
+
+     progress(1.0, desc=log_status(lang, "Conversion complete."))
+
+     # Return the converted audio as a (sample_rate, int16 array) Gradio tuple
+     return sr, converted_audio_int16
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio
+ torch
+ torchaudio
+ numpy
+ librosa
+ transformers
+ accelerate
+ datasets        # speaker x-vectors for SpeechT5 (see models.py)
+ sentencepiece   # required by the SpeechT5 tokenizer
+ spaces          # @spaces.GPU decorator used in models.py
utils.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import time
+ import numpy as np
+ import gradio as gr
+ from typing import Literal, Dict, Any, Tuple, List, Optional
+ from config import LOCAL_STRINGS, DEFAULT_SR, DEFAULT_RVC_MODEL_PATH, MAX_WAV_VALUE
+
+ def get_localized_strings(lang: Literal["en", "ar"]) -> Dict[str, Any]:
+     """Retrieves the localized strings dictionary (falls back to English)."""
+     return LOCAL_STRINGS.get(lang, LOCAL_STRINGS["en"])
+
+ def log_status(lang: Literal["en", "ar"], message_key: str, **kwargs) -> str:
+     """Formats a timestamped, localized status message.
+
+     Falls back to the raw message text when the key is not in LOCAL_STRINGS.
+     """
+     strings = get_localized_strings(lang)
+     message = strings.get(message_key, message_key).format(**kwargs)
+     return f"[{time.strftime('%H:%M:%S')}] {message}"
+
+ def generate_mock_pth(model_name: str, temp_dir: str) -> Optional[str]:
+     """Simulates RVC model creation and returns the path to the dummy .pth file."""
+     # Ensure the temporary directory exists
+     os.makedirs(temp_dir, exist_ok=True)
+
+     # Create a unique, descriptive path for the model file
+     filename = f"{model_name}_{int(time.time())}.pth"
+     model_path = os.path.join(temp_dir, filename)
+
+     # Simulate writing a small placeholder model file (real models are MBs/GBs)
+     try:
+         with open(model_path, 'w') as f:
+             f.write(f"RVC Model Data: {model_name}, Training Simulated at {time.ctime()}")
+         return model_path
+     except IOError:
+         # Handle potential permission issues during file writing
+         return None
+
+ def clean_file_paths(paths: List[str]):
+     """Cleans up the temporary files created during the session."""
+     for path in paths:
+         if path and os.path.exists(path):
+             try:
+                 os.remove(path)
+             except Exception as e:
+                 print(f"Error cleaning up file {path}: {e}")
+
+ def get_rvc_model_path(model_file_data, model_name: str) -> str:
+     """Resolves the uploaded model to a usable path.
+
+     Depending on component settings, Gradio passes either a filepath string
+     or a FileData dict on upload; handle both, and fall back to the demo
+     placeholder when nothing was uploaded.
+     """
+     if isinstance(model_file_data, str) and model_file_data:
+         return model_file_data
+     if isinstance(model_file_data, dict) and 'path' in model_file_data:
+         return model_file_data['path']
+
+     # Fallback placeholder if no file is explicitly uploaded (for demo purposes)
+     return DEFAULT_RVC_MODEL_PATH
+
+ def load_audio_from_path(file_path: str) -> Tuple[int, np.ndarray]:
+     """Loads an audio file with librosa, resampled to DEFAULT_SR, as int16 PCM."""
+     import librosa  # imported lazily to keep module import light
+     try:
+         audio, sr = librosa.load(file_path, sr=DEFAULT_SR, mono=True)
+         # Convert to 16-bit PCM integer format for the standard Gradio audio tuple
+         audio_int16 = (audio * MAX_WAV_VALUE).astype(np.int16)
+         return DEFAULT_SR, audio_int16
+     except Exception as e:
+         print(f"Error loading audio file {file_path}: {e}")
+         raise gr.Error(f"Failed to load audio: {str(e)}")
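+
+ # --- Minimal smoke test (an illustrative addition, not used by the app) ---
+ # Assumes config.py is importable; run `python utils.py` to sanity-check helpers.
+ if __name__ == "__main__":
+     print(log_status("en", "Testing helpers..."))
+     mock_path = generate_mock_pth("demo_voice", "temp_models")
+     print(f"Mock model written to: {mock_path}")
+     clean_file_paths([mock_path])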