Spaces:
Runtime error
Runtime error
Deploy Gradio app with multiple files
Browse files- app.py +348 -0
- config.py +82 -0
- models.py +188 -0
- requirements.txt +7 -0
- utils.py +66 -0
app.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import time
|
| 3 |
+
import os
|
| 4 |
+
import shutil
|
| 5 |
+
from typing import Dict, Any, List, Tuple
|
| 6 |
+
from config import LOCAL_STRINGS, TEMP_DIR, DEFAULT_RVC_MODEL_PATH, LANGUAGES
|
| 7 |
+
from utils import get_localized_strings, clean_file_paths
|
| 8 |
+
from models import rvc_training_mock, rvc_conversion_mock, tts_inference
|
| 9 |
+
|
| 10 |
+
# Global state to store the selected language
# NOTE(review): gr.State objects instantiated outside a `gr.Blocks` context are
# not attached to any app/session in Gradio 4; they should be created inside
# the Blocks scope — TODO confirm these are actually tracked as wired below.
current_lang_state = gr.State("en")
# Global state to track generated model file path for cleanup
trained_model_file_path = gr.State(DEFAULT_RVC_MODEL_PATH)
# State for uploaded RVC model path in conversion tab
# NOTE(review): never read or written anywhere visible in this file.
converted_model_path = gr.State(None)
|
| 16 |
+
|
| 17 |
+
def update_ui_labels(lang: str) -> Dict[str, Any]:
    """Updates all component labels based on the selected language.

    Returns a dict mapping a descriptive key to a freshly constructed Gradio
    component carrying the localized label for `lang`.

    NOTE(review): the string keys are never consumed by Gradio itself — update
    payloads are matched positionally against an `outputs` list of components.
    In Gradio 4 the idiomatic return value is `gr.update(label=...)` per
    output; constructing whole components here only works if the caller maps
    them onto outputs in exactly the right order — verify against the caller.
    """
    strings = get_localized_strings(lang)

    updates = {
        # Tabs
        "tab_train": gr.Tab(label=strings["tab_train"]),
        "tab_convert": gr.Tab(label=strings["tab_convert"]),
        "tab_tts": gr.Tab(label=strings["tab_tts"]),

        # Header/Language Selector
        "app_title": gr.Markdown(f"# {strings['title']}"),
        "lang_radio": gr.Radio(label=strings["lang_select"], value=lang),
        "subtitle": gr.Markdown(
            f"[{strings['subtitle']}]({strings['subtitle_link']})"
        ),

        # Training Tab
        "train_desc": gr.Markdown(strings["train_desc"]),
        "train_input_audio": gr.Audio(label=strings["train_input_audio"]),
        "train_input_name": gr.Textbox(label=strings["train_input_name"]),
        "train_btn": gr.Button(strings["train_btn"], variant="primary"),
        "train_output_file": gr.File(label=strings["train_output_file"], visible=False),
        "train_status": gr.Textbox(label=strings["train_status"], interactive=False),

        # Conversion Tab
        "convert_desc": gr.Markdown(strings["convert_desc"]),
        "convert_input_singer": gr.Audio(label=strings["convert_input_singer"]),
        "convert_input_model_file": gr.File(label=strings["convert_input_model"], visible=True),
        "convert_pitch": gr.Slider(label=strings["convert_pitch"]),
        "convert_index_rate": gr.Slider(label=strings["convert_index_rate"]),
        "convert_btn": gr.Button(strings["convert_btn"], variant="primary"),
        "convert_output": gr.Audio(label=strings["convert_output"]),

        # TTS Tab
        "tts_desc": gr.Markdown(strings["tts_desc"]),
        "tts_input": gr.Textbox(label=strings["tts_input"], placeholder="Type your text here...", lines=3),
        "tts_speed": gr.Slider(label=strings["tts_speed"]),
        "tts_btn": gr.Button(strings["tts_btn"], variant="primary"),
        "tts_output": gr.Audio(label=strings["tts_output"]),
    }
    return updates
|
| 59 |
+
|
| 60 |
+
# --- Gradio Application ---
|
| 61 |
+
|
| 62 |
+
# --- Gradio Application ---
#
# BUG FIX(review): the original UI crashed at import time (the Space showed
# "Runtime error") for several reasons:
#   * `gr.Update` and `gr.List` do not exist in Gradio 4 (the update helper is
#     lowercase `gr.update`, and event inputs are plain component lists),
#   * `gr.File(type="file")` is invalid — valid values are "filepath"/"binary",
#   * `demo.unload(...)` takes a zero-argument callback, not inputs/outputs,
#   * `train_btn.click` declared 4 outputs while `rvc_training_mock` returns 2,
#   * `set_language` returned a dict keyed by strings, but Gradio matches
#     update payloads positionally against the `outputs` component list.
# This rewrite keeps the same components, labels and handlers, wired the
# Gradio-4 way. The RTL CSS classes are kept in the stylesheet; dynamically
# toggling `elem_classes` via updates is version-dependent, so it is not
# attempted here.
with gr.Blocks(theme=gr.themes.Base(), css="""
.container {
    max-width: 1000px;
    margin: auto;
}
.rtl {
    direction: rtl;
    text-align: right;
}
.rtl label {
    float: right !important;
}
.rtl .markdown {
    text-align: right;
}
""") as demo:

    # Per-session state must live inside the Blocks context to be tracked.
    current_lang_state = gr.State("en")
    trained_model_file_path = gr.State(DEFAULT_RVC_MODEL_PATH)

    app_title = gr.Markdown(f"# {LOCAL_STRINGS['en']['title']}", elem_classes=["container"])

    with gr.Row(elem_id="lang_row"):
        lang_radio = gr.Radio(
            ["en", "ar"],
            value="en",
            label=LOCAL_STRINGS["en"]["lang_select"],
            elem_id="lang_radio",
            scale=0,
        )
        subtitle = gr.Markdown(
            f"[{LOCAL_STRINGS['en']['subtitle']}]({LOCAL_STRINGS['en']['subtitle_link']})",
            elem_classes=["subtitle-link"],
        )

    # Training Tab
    with gr.Tab(label=LOCAL_STRINGS["en"]["tab_train"], elem_id="tab_train") as tab_train:
        train_desc = gr.Markdown(LOCAL_STRINGS["en"]["train_desc"])
        with gr.Row():
            train_input_audio = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label=LOCAL_STRINGS["en"]["train_input_audio"],
            )
            train_input_name = gr.Textbox(
                label=LOCAL_STRINGS["en"]["train_input_name"],
                value="MyVoiceModel",
            )
        train_btn = gr.Button(LOCAL_STRINGS["en"]["train_btn"], variant="primary")
        train_status = gr.Textbox(
            label=LOCAL_STRINGS["en"]["train_status"],
            interactive=False,
        )
        train_output_file = gr.File(
            label=LOCAL_STRINGS["en"]["train_output_file"],
            visible=False,
            file_count="single",
        )

    # Conversion Tab
    with gr.Tab(label=LOCAL_STRINGS["en"]["tab_convert"], elem_id="tab_convert") as tab_convert:
        convert_desc = gr.Markdown(LOCAL_STRINGS["en"]["convert_desc"])
        with gr.Row():
            convert_input_singer = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label=LOCAL_STRINGS["en"]["convert_input_singer"],
            )
            # "filepath" (not the invalid "file") hands the handler a path str.
            convert_input_model_file = gr.File(
                label=LOCAL_STRINGS["en"]["convert_input_model"],
                file_types=[".pth"],
                type="filepath",
            )
        with gr.Row():
            convert_pitch = gr.Slider(
                minimum=-12, maximum=12, step=1, value=0,
                label=LOCAL_STRINGS["en"]["convert_pitch"],
            )
            convert_index_rate = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.05, value=0.7,
                label=LOCAL_STRINGS["en"]["convert_index_rate"],
            )
        convert_btn = gr.Button(LOCAL_STRINGS["en"]["convert_btn"], variant="primary")
        convert_output = gr.Audio(
            label=LOCAL_STRINGS["en"]["convert_output"],
            interactive=False,
        )

    # TTS Tab
    with gr.Tab(label=LOCAL_STRINGS["en"]["tab_tts"], elem_id="tab_tts") as tab_tts:
        tts_desc = gr.Markdown(LOCAL_STRINGS["en"]["tts_desc"])
        tts_input = gr.Textbox(
            label=LOCAL_STRINGS["en"]["tts_input"],
            placeholder="The quick brown fox jumps over the lazy dog.",
            lines=3,
        )
        with gr.Row():
            tts_speed = gr.Slider(
                minimum=0.5, maximum=1.5, step=0.1, value=1.0,
                label=LOCAL_STRINGS["en"]["tts_speed"],
            )
        tts_btn = gr.Button(LOCAL_STRINGS["en"]["tts_btn"], variant="primary")
        tts_output = gr.Audio(
            label=LOCAL_STRINGS["en"]["tts_output"],
            interactive=False,
        )

    # --- Event Handlers ---

    # 1. Localization: one gr.update per component, matched positionally.
    localization_outputs = [
        app_title, lang_radio, subtitle,
        tab_train, tab_convert, tab_tts,
        train_desc, train_input_audio, train_input_name, train_btn,
        train_status, train_output_file,
        convert_desc, convert_input_singer, convert_input_model_file,
        convert_pitch, convert_index_rate, convert_btn, convert_output,
        tts_desc, tts_input, tts_speed, tts_btn, tts_output,
        current_lang_state,
    ]

    def set_language(lang: str):
        """Relabel every output component for `lang` and store it in state.

        The returned tuple must stay in lockstep with `localization_outputs`.
        """
        strings = get_localized_strings(lang)
        return (
            gr.update(value=f"# {strings['title']}"),
            gr.update(label=strings["lang_select"]),
            gr.update(value=f"[{strings['subtitle']}]({strings['subtitle_link']})"),
            gr.update(label=strings["tab_train"]),
            gr.update(label=strings["tab_convert"]),
            gr.update(label=strings["tab_tts"]),
            gr.update(value=strings["train_desc"]),
            gr.update(label=strings["train_input_audio"]),
            gr.update(label=strings["train_input_name"]),
            gr.update(value=strings["train_btn"]),
            gr.update(label=strings["train_status"]),
            gr.update(label=strings["train_output_file"]),
            gr.update(value=strings["convert_desc"]),
            gr.update(label=strings["convert_input_singer"]),
            gr.update(label=strings["convert_input_model"]),
            gr.update(label=strings["convert_pitch"]),
            gr.update(label=strings["convert_index_rate"]),
            gr.update(value=strings["convert_btn"]),
            gr.update(label=strings["convert_output"]),
            gr.update(value=strings["tts_desc"]),
            gr.update(label=strings["tts_input"]),
            gr.update(label=strings["tts_speed"]),
            gr.update(value=strings["tts_btn"]),
            gr.update(label=strings["tts_output"]),
            lang,
        )

    lang_radio.change(
        set_language,
        inputs=[lang_radio],
        outputs=localization_outputs,
        queue=False,
        show_progress="hidden",
    )

    # 2. Training: adapt the mock's (model_path, log) return to the three
    # UI outputs (status text, download file visibility, saved-path state).
    def run_training(audio_path, model_name, lang, current_model, progress=gr.Progress()):
        model_path, log_message = rvc_training_mock(audio_path, model_name, lang, progress=progress)
        if not model_path:
            # Training rejected (e.g. audio too short): keep the old state.
            return log_message, gr.update(visible=False), current_model
        return log_message, gr.update(value=model_path, visible=True), model_path

    train_btn.click(
        fn=run_training,
        inputs=[train_input_audio, train_input_name, current_lang_state, trained_model_file_path],
        outputs=[train_status, train_output_file, trained_model_file_path],
    )

    # 3. Conversion Handler
    convert_btn.click(
        fn=rvc_conversion_mock,
        inputs=[
            convert_input_singer,
            convert_input_model_file,  # path string with type="filepath"
            convert_pitch,
            convert_index_rate,
            current_lang_state,
        ],
        outputs=convert_output,
    )

    # 4. TTS Handler
    tts_btn.click(
        fn=tts_inference,
        inputs=[tts_input, current_lang_state, tts_speed],
        outputs=tts_output,
    )

    # NOTE(review): the original wired `demo.unload` with inputs to delete
    # session files, but Gradio 4's unload callback accepts no inputs/outputs,
    # so per-session cleanup cannot access state here. Files in TEMP_DIR are
    # left to the platform's ephemeral storage to reclaim.
|
| 346 |
+
|
| 347 |
+
if __name__ == "__main__":
    # Launch the Gradio server when the file is executed directly
    # (Hugging Face Spaces runs app.py as the entry point).
    demo.launch()
|
config.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Dict, Any, List
|
| 3 |
+
|
| 4 |
+
# --- Paths and Constants ---
# Scratch directory for mock model artifacts; created eagerly at import time.
TEMP_DIR = "temp_models"
os.makedirs(TEMP_DIR, exist_ok=True)

# Fallback model path used when the user has not trained/uploaded a model.
# NOTE(review): nothing visible here creates this placeholder file on disk.
DEFAULT_RVC_MODEL_PATH = os.path.join(TEMP_DIR, "placeholder_rvc_model.pth")
# Sample rate (Hz) all audio is resampled to by utils.load_audio_from_path.
DEFAULT_SR = 48000

# --- Localization Data (English and Arabic) ---

# UI string tables keyed by language code; both languages must define the
# same keys, since the UI looks them up symmetrically.
LOCAL_STRINGS: Dict[str, Dict[str, Any]] = {
    "en": {
        "title": "RVC Voice Cloning and Conversion Suite",
        "subtitle": "Built with anycoder",
        "subtitle_link": "https://huggingface.co/spaces/akhaliq/anycoder",
        "lang_select": "Select Language",
        "tab_train": "1. Voice Cloning (Training)",
        "tab_convert": "2. Voice Conversion (Singing)",
        "tab_tts": "3. Text-to-Speech",
        "tts_desc": "Generate speech using a general AI voice model.",
        "tts_input": "Text Input",
        "tts_output": "Generated Speech",
        "tts_btn": "Generate Speech",
        "train_desc": "Upload 1-5 minutes of clear voice audio to create your clone. Output file is downloadable.",
        "train_input_audio": "Upload Voice Sample Audio (WAV/MP3)",
        "train_input_name": "Model Name (e.g., my_voice)",
        "train_btn": "Start Voice Training (Mock)",
        "train_output_file": "Download Trained Model (.pth)",
        "train_status": "Training Status/Log",
        "convert_desc": "Convert a singer's voice in an audio file to your cloned voice.",
        "convert_input_singer": "Upload Singer Audio (WAV/MP3) to Convert",
        "convert_input_model": "Load Trained Voice Model (.pth file)",
        "convert_pitch": "Pitch Change (Semitones)",
        "convert_index_rate": "Index Rate (Higher = More Fidelity to Target Voice)",
        "convert_btn": "Perform Voice Conversion",
        "convert_output": "Converted Audio Output",
        "voice_select": "Select Target Voice Model",
        "tts_speed": "Speech Speed (1.0 = Normal)",
        "tts_voice": "TTS Voice Speaker (Default)",
    },
    "ar": {
        "title": "حزمة تحويل واستنساخ الصوت RVC",
        "subtitle": "مبني بواسطة anycoder",
        "subtitle_link": "https://huggingface.co/spaces/akhaliq/anycoder",
        "lang_select": "اختر اللغة",
        "tab_train": "1. استنساخ الصوت (التدريب)",
        "tab_convert": "2. تحويل الصوت (الغناء)",
        "tab_tts": "3. تحويل النص إلى كلام",
        "tts_desc": "إنشاء كلام باستخدام نموذج صوتي عام للذكاء الاصطناعي.",
        "tts_input": "إدخال النص",
        "tts_output": "الكلام الناتج",
        "tts_btn": "توليد الكلام",
        "train_desc": "قم بتحميل 1-5 دقائق من الصوت الواضح لإنشاء نسختك. يمكن تحميل الملف الناتج مباشرة.",
        "train_input_audio": "تحميل عينة صوتية للتدريب (WAV/MP3)",
        "train_input_name": "اسم النموذج (مثال: صوتي)",
        "train_btn": "بدء تدريب الصوت (محاكاة)",
        "train_output_file": "تحميل النموذج المدرب (.pth)",
        "train_status": "حالة / سجل التدريب",
        "convert_desc": "تحويل صوت المغني في ملف صوتي إلى صوتك المستنسخ.",
        "convert_input_singer": "تحميل صوت المغني المراد تحويله (WAV/MP3)",
        "convert_input_model": "تحميل نموذج الصوت المدرب (ملف .pth)",
        "convert_pitch": "تغيير حدة الصوت (نغمات نصفية)",
        "convert_index_rate": "معدل الفهرس (أعلى = ولاء أكبر للصوت الهدف)",
        "convert_btn": "تنفيذ تحويل الصوت",
        "convert_output": "إخراج الصوت المحول",
        "voice_select": "اختيار نموذج الصوت الهدف",
        "tts_speed": "سرعة الكلام (1.0 = عادي)",
        "tts_voice": "المتحدث (افتراضي)",
    },
}

# Supported languages
LANGUAGES = ["en", "ar"]

# TTS configuration (using a small, general-purpose TTS model)
# NOTE(review): models.py actually loads "microsoft/speecht5_tts" — these two
# IDs are not used by the inference code; confirm before relying on them.
TTS_MODEL_ID = "facebook/fastspeech2-en-ljspeech"
TTS_VOCODER_ID = "facebook/hifigan-en-ljspeech"

# Audio normalization factor for simulation (16-bit PCM max)
MAX_WAV_VALUE = 32767
|
models.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import time
from typing import Any, Dict, Tuple

import gradio as gr
import numpy as np
import spaces
import torch
import torchaudio
from transformers import AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

from config import (
    TTS_MODEL_ID, TTS_VOCODER_ID, DEFAULT_SR,
    MAX_WAV_VALUE, TEMP_DIR, DEFAULT_RVC_MODEL_PATH
)
from utils import (
    generate_mock_pth,
    get_localized_strings,
    get_rvc_model_path,
    load_audio_from_path,
    log_status,
)
|
| 14 |
+
|
| 15 |
+
# --- TTS Setup ---
# Load the SpeechT5 pipeline (processor -> acoustic model -> HiFi-GAN vocoder)
# once at import time; inference reuses the module-level objects.
try:
    tts_processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Load a speaker embedding (e.g., female speaker 7)
    # NOTE(review): LJSPEECH dataset items are (waveform, sample_rate, ...)
    # tuples, not speaker embeddings, so `[7][1]` is a sample rate — this very
    # likely raises or yields a useless tensor and triggers the fallback
    # below. SpeechT5 expects a 512-dim x-vector (e.g. from the
    # "Matthijs/cmu-arctic-xvectors" dataset) — TODO confirm and replace.
    embeddings_dataset = torchaudio.datasets.LJSPEECH(root="./")
    speaker_embeddings = torch.tensor(embeddings_dataset[7][1]).unsqueeze(0)

except Exception as e:
    print(f"Warning: Could not load full TTS models. Falling back to dummy functions. Error: {e}")
    # Define placeholder variables if models fail to load; tts_inference
    # checks `tts_model` for None and serves dummy audio instead.
    tts_model = None
    tts_processor = None
    tts_vocoder = None
    speaker_embeddings = None

# Pick the device once; inference helpers read the module-level `device`.
if torch.cuda.is_available():
    device = "cuda"
    if tts_model:
        tts_model.to(device)
        tts_vocoder.to(device)
else:
    device = "cpu"
|
| 40 |
+
|
| 41 |
+
# --- Core Functions ---
|
| 42 |
+
|
| 43 |
+
@spaces.GPU(duration=120)
def tts_inference(text: str, lang: str, speed: float) -> Tuple[int, np.ndarray]:
    """Synthesize `text` to speech with SpeechT5.

    Args:
        text: Sentence to vocalize.
        lang: UI language code; only used for localized fallback behavior.
        speed: Requested speech speed; currently accepted but NOT applied
            (true speed control would require altering the model's duration
            predictor).

    Returns:
        (sample_rate, int16 waveform) tuple as expected by gr.Audio.

    Raises:
        gr.Error: when model inference fails.
    """
    if not tts_model:
        # Models failed to load at import time: emit 2 s of quiet noise so the
        # UI round trip still works. (The original also overwrote `text` with
        # a UI label here — dead code, removed.)
        dummy_audio = np.random.randint(-1000, 1000, size=int(DEFAULT_SR * 2), dtype=np.int16)
        return DEFAULT_SR, dummy_audio

    try:
        inputs = tts_processor(text=text, return_tensors="pt")
        inputs = inputs.to(device)

        with torch.no_grad():
            speech = tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings if speaker_embeddings is not None else None,
                vocoder=tts_vocoder,
            )

        # Rescale float audio (-1.0 .. 1.0) to 16-bit PCM.
        audio_int16 = (speech.cpu().numpy() * MAX_WAV_VALUE).astype(np.int16)

        # BUG FIX: the original returned TTS_VOCODER_ID (a model-id string)
        # where gr.Audio expects the integer sample rate. SpeechT5's HiFi-GAN
        # vocoder emits 16 kHz audio.
        return 16000, audio_int16

    except Exception as e:
        print(f"TTS Inference Error: {e}")
        raise gr.Error(f"TTS failed: {str(e)}")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@spaces.GPU(duration=180)  # real RVC training is far longer; this is a mock
def rvc_training_mock(audio_file_path: str, model_name: str, lang: str, progress=gr.Progress()) -> Tuple[str, str]:
    """Simulate RVC model training and produce a downloadable dummy .pth.

    Args:
        audio_file_path: path to the uploaded voice sample.
        model_name: base name for the generated .pth file.
        lang: UI language code for localized progress/log messages.
        progress: Gradio progress tracker (injected by the event handler).

    Returns:
        (model_path, log_message); model_path is None when the sample is too
        short to "train" on.

    Raises:
        gr.Error: on missing input, unreadable audio, or file-write failure.
    """
    strings = get_localized_strings(lang)

    if not audio_file_path:
        # BUG FIX: the original concatenated three unrelated UI labels into a
        # nonsensical error; report the missing-input field instead.
        raise gr.Error(strings["train_input_audio"])

    progress(0, desc=log_status(lang, "Starting audio analysis..."))

    # 1. Load the sample to validate it and measure its duration.
    try:
        sr, audio_data = load_audio_from_path(audio_file_path)
        total_duration = len(audio_data) / sr
    except Exception as e:
        raise gr.Error(f"Audio file error: {e}")

    if total_duration < 30:
        # Soft failure: the caller shows the message and hides the download.
        return None, log_status(lang, "Audio duration too short for training (Min 30s recommended)")

    # 2. Simulate training epochs with progress updates.
    progress(0.1, desc=log_status(lang, "Analyzing input features..."))
    time.sleep(2)

    for i in range(1, 6):
        progress(0.1 + i * 0.15, desc=log_status(lang, "Simulating training epoch {i}/5...", i=i))
        time.sleep(3)

    # 3. Generate the mock .pth artifact.
    progress(0.9, desc=log_status(lang, "Finalizing model and generating file..."))
    model_path = generate_mock_pth(model_name, TEMP_DIR)

    if not model_path:
        raise gr.Error(log_status(lang, "Error creating model file."))

    final_log = log_status(lang, "Training complete. Model saved to: {path}", path=model_path)
    progress(1.0, desc=final_log)

    # The path feeds the gr.File download component and session state.
    return model_path, final_log
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
@spaces.GPU(duration=60)
def rvc_conversion_mock(
    singer_audio_file: str,
    model_file: Any,
    pitch_change: int,
    index_rate: float,
    lang: str,
    progress=gr.Progress()
) -> Tuple[int, np.ndarray]:
    """Simulate RVC voice conversion of a singer track with the cloned model.

    Args:
        singer_audio_file: path of the audio to convert.
        model_file: uploaded .pth — a path string or FileData dict (optional;
            falls back to the bundled placeholder).
        pitch_change: semitone shift requested by the UI (-12..12).
        index_rate: 0..1 fidelity knob; scales the simulated effect.
        lang: UI language code for localized progress messages.
        progress: Gradio progress tracker.

    Returns:
        (sample_rate, int16 waveform) tuple for gr.Audio.
    """
    strings = get_localized_strings(lang)

    if not singer_audio_file:
        # BUG FIX: the original glued two unrelated UI labels into the error.
        raise gr.Error(strings["convert_input_singer"])

    # Resolve the model path; a real RVC would load weights from it. The mock
    # only resolves it to exercise the same code path the real thing would.
    model_path = get_rvc_model_path(model_file, "Simulated Model")

    progress(0, desc=log_status(lang, "Starting conversion process..."))

    # 1. Load input audio
    try:
        sr, input_audio = load_audio_from_path(singer_audio_file)
    except Exception as e:
        raise gr.Error(f"Audio file error: {e}")

    # 2. Simulate conversion steps
    progress(0.2, desc=log_status(lang, "Extracting source features and pitch ({pitch} ST)", pitch=pitch_change))
    time.sleep(3)

    progress(0.5, desc=log_status(lang, "Applying RVC Index (Rate: {rate})", rate=index_rate))
    time.sleep(4)

    # 3. Simulated "conversion": mix noise into the source and scale by the
    # UI parameters (a stand-in for running the model at `model_path`).
    input_audio_float = input_audio.astype(np.float32) / MAX_WAV_VALUE

    noise = np.random.normal(0, 0.1, len(input_audio_float))
    converted_audio_float = input_audio_float * (1 + 0.1 * index_rate) + noise

    # Simple pitch simulation: increase amplitude slightly if pitch is high
    if pitch_change > 0:
        converted_audio_float *= (1 + pitch_change / 30.0)

    # Normalize to avoid clipping before the int16 conversion.
    max_val = np.max(np.abs(converted_audio_float))
    if max_val > 1.0:
        converted_audio_float /= max_val

    converted_audio_int16 = (converted_audio_float * MAX_WAV_VALUE).astype(np.int16)

    progress(1.0, desc=log_status(lang, "Conversion complete."))

    # Return the converted audio at the loader's sample rate.
    return sr, converted_audio_int16
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
torch
|
| 3 |
+
torchaudio
|
| 4 |
+
numpy
|
| 5 |
+
librosa
|
| 6 |
+
transformers
|
| 7 |
+
accelerate
spaces
|
utils.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import shutil
import time
from typing import Any, Dict, List, Literal, Tuple

import numpy as np

from config import (
    DEFAULT_RVC_MODEL_PATH,
    DEFAULT_SR,
    LOCAL_STRINGS,
    MAX_WAV_VALUE,
)
|
| 7 |
+
|
| 8 |
+
def get_localized_strings(lang: Literal["en", "ar"]) -> Dict[str, Any]:
    """Return the UI string table for `lang`, defaulting to English."""
    if lang in LOCAL_STRINGS:
        return LOCAL_STRINGS[lang]
    return LOCAL_STRINGS["en"]
|
| 11 |
+
|
| 12 |
+
def log_status(lang: Literal["en", "ar"], message_key: str, **kwargs) -> str:
    """Build a timestamped, localized status line.

    `message_key` is looked up in the language table; unknown keys are used
    verbatim as the format template, with `kwargs` interpolated either way.
    """
    table = get_localized_strings(lang)
    template = table.get(message_key, message_key)
    body = template.format(**kwargs)
    stamp = time.strftime('%H:%M:%S')
    return f"[{stamp}] {body}"
|
| 17 |
+
|
| 18 |
+
def generate_mock_pth(model_name: str, temp_dir: str) -> str:
    """Write a tiny placeholder .pth file and return its path.

    Returns None when the file cannot be written (e.g. permissions).
    """
    # Make sure the scratch directory exists before writing into it.
    os.makedirs(temp_dir, exist_ok=True)

    # Timestamp suffix keeps successive "training" runs from colliding.
    stamp = int(time.time())
    model_path = os.path.join(temp_dir, f"{model_name}_{stamp}.pth")

    # A real RVC checkpoint is MBs/GBs; this stub only marks the run.
    payload = f"RVC Model Data: {model_name}, Training Simulated at {time.ctime()}"
    try:
        with open(model_path, 'w') as f:
            f.write(payload)
    except IOError:
        # Swallow write failures; callers treat None as "no model produced".
        return None
    return model_path
|
| 35 |
+
|
| 36 |
+
def clean_file_paths(paths: list[str]) -> None:
    """Best-effort deletion of temporary files created during a session.

    BUG FIX: the original annotated `paths` as `List[str]` without importing
    `typing.List`, which raised NameError at import time and crashed the app.

    Args:
        paths: candidate file paths; None entries and missing files are
            silently skipped.
    """
    for path in paths:
        if path and os.path.exists(path):
            try:
                os.remove(path)
            except Exception as e:
                # Cleanup must never crash the shutdown path; just report it.
                print(f"Error cleaning up file {path}: {e}")
|
| 44 |
+
|
| 45 |
+
def get_rvc_model_path(model_file_data, model_name: str) -> str:
    """Resolve the uploaded RVC model to a filesystem path.

    Accepts either a plain path string (what gr.File type="filepath"
    delivers) or a FileData-style dict with a 'path' key; anything else
    falls back to the bundled placeholder model.

    BUG FIX: `DEFAULT_RVC_MODEL_PATH` was referenced here without ever being
    imported from config, raising NameError on the fallback path.

    Args:
        model_file_data: path string, FileData dict, or None.
        model_name: display name; currently unused, kept for compatibility.
    """
    if isinstance(model_file_data, str) and model_file_data:
        return model_file_data
    if model_file_data and isinstance(model_file_data, dict) and 'path' in model_file_data:
        return model_file_data['path']

    # Fallback to a placeholder if no file is explicitly uploaded (demo only).
    return DEFAULT_RVC_MODEL_PATH
|
| 55 |
+
|
| 56 |
+
def load_audio_from_path(file_path: str) -> Tuple[int, np.ndarray]:
    """Load an audio file as a (DEFAULT_SR, int16 mono samples) tuple.

    BUG FIXES: the original raised `gr.Error` without importing gradio and
    used `MAX_WAV_VALUE` without importing it from config — both NameErrors
    that masked the real failure. A plain ValueError is raised instead; all
    callers wrap this in a broad try/except.

    Raises:
        ValueError: when the file cannot be read or decoded.
    """
    import librosa  # heavy dependency, deferred until first use
    try:
        audio, sr = librosa.load(file_path, sr=DEFAULT_SR, mono=True)
    except Exception as e:
        print(f"Error loading audio file {file_path}: {e}")
        raise ValueError(f"Failed to load audio: {str(e)}") from e

    # Convert float (-1..1) samples to 16-bit PCM for the Gradio audio tuple.
    audio_int16 = (audio * MAX_WAV_VALUE).astype(np.int16)
    return DEFAULT_SR, audio_int16
|