Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -120,45 +120,57 @@ tokenizer.save_pretrained(merged_model_path)
|
|
| 120 |
print(f"Merged model saved to {merged_model_path}")
|
| 121 |
|
| 122 |
# Initialize both models
|
| 123 |
-
print("Initializing
|
| 124 |
-
|
| 125 |
-
model_path=
|
| 126 |
mimo_audio_tokenizer_path=tokenizer_path
|
| 127 |
)
|
| 128 |
-
print("
|
| 129 |
|
| 130 |
-
print("Initializing
|
| 131 |
-
|
| 132 |
-
model_path=
|
| 133 |
mimo_audio_tokenizer_path=tokenizer_path
|
| 134 |
)
|
| 135 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
@spaces.GPU
|
| 138 |
-
def generate_speech(emotion, text
|
| 139 |
-
"""Generate
|
| 140 |
-
if not emotion or not emotion.strip():
|
| 141 |
-
return None, "Please enter an emotion description."
|
| 142 |
if not text or not text.strip():
|
| 143 |
return None, "Please enter text to convert to speech."
|
| 144 |
-
|
| 145 |
-
# Select
|
| 146 |
-
selected_model =
|
| 147 |
|
| 148 |
print(f"Using model: {model_choice}")
|
| 149 |
print("Generating:", text)
|
| 150 |
-
|
|
|
|
|
|
|
| 151 |
try:
|
| 152 |
# Create temporary file for output
|
| 153 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
| 154 |
output_path = tmp_file.name
|
| 155 |
|
| 156 |
-
# Generate TTS with emotion instruction
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
return output_path, "✅ Speech generated successfully!"
|
| 164 |
|
|
@@ -166,38 +178,36 @@ def generate_speech(emotion, text, model_choice):
|
|
| 166 |
return None, f"❌ Error: {str(e)}"
|
| 167 |
|
| 168 |
# Create Gradio interface
|
| 169 |
-
with gr.Blocks(title="
|
| 170 |
gr.Markdown("""
|
| 171 |
-
# 🎭
|
| 172 |
-
|
| 173 |
-
Generate
|
| 174 |
-
|
| 175 |
-
This is still a very early experiment and is very early in the training run, I need to change a few settings and retrain. But the model turned out quite nicely!
|
| 176 |
-
|
| 177 |
-
It may hallucinate, try a few times to get good results.
|
| 178 |
-
|
| 179 |
-
Voice cloning is not supported yet.
|
| 180 |
""")
|
| 181 |
|
| 182 |
with gr.Row():
|
| 183 |
with gr.Column():
|
| 184 |
model_selector = gr.Dropdown(
|
| 185 |
-
choices=[
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
| 189 |
)
|
|
|
|
| 190 |
emotion_input = gr.Textbox(
|
| 191 |
-
label="Emotion",
|
| 192 |
-
placeholder="e.g., 'intense anger, rage, fury
|
| 193 |
lines=3
|
| 194 |
)
|
| 195 |
text_input = gr.Textbox(
|
| 196 |
label="Text",
|
| 197 |
-
placeholder="Enter the text to speak
|
| 198 |
lines=5
|
| 199 |
)
|
| 200 |
-
generate_btn = gr.Button("Generate
|
| 201 |
|
| 202 |
with gr.Column():
|
| 203 |
audio_output = gr.Audio(
|
|
@@ -209,47 +219,79 @@ with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
|
|
| 209 |
interactive=False
|
| 210 |
)
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
],
|
| 235 |
-
[
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
],
|
| 240 |
-
[
|
| 241 |
-
|
| 242 |
-
"Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
|
| 243 |
-
"EmoAct-MiMo v1.1 (Beta)"
|
| 244 |
-
]
|
| 245 |
-
],
|
| 246 |
-
inputs=[emotion_input, text_input, model_selector]
|
| 247 |
-
)
|
| 248 |
|
| 249 |
# Event handler
|
| 250 |
generate_btn.click(
|
| 251 |
fn=generate_speech,
|
| 252 |
-
inputs=[emotion_input, text_input
|
| 253 |
outputs=[audio_output, status_output]
|
| 254 |
)
|
| 255 |
|
|
|
|
| 120 |
print(f"Merged model saved to {merged_model_path}")
|
| 121 |
|
| 122 |
# Initialize both models
|
| 123 |
+
print("Initializing base model...")
|
| 124 |
+
base_mimo = MimoAudio(
|
| 125 |
+
model_path=base_model_path,
|
| 126 |
mimo_audio_tokenizer_path=tokenizer_path
|
| 127 |
)
|
| 128 |
+
print("Base model ready!")
|
| 129 |
|
| 130 |
+
print("Initializing EmoAct model...")
|
| 131 |
+
emoact_mimo = MimoAudio(
|
| 132 |
+
model_path=merged_model_path,
|
| 133 |
mimo_audio_tokenizer_path=tokenizer_path
|
| 134 |
)
|
| 135 |
+
print("EmoAct model ready!")
|
| 136 |
+
|
| 137 |
+
# Store models in a dict for easy access
|
| 138 |
+
models = {
|
| 139 |
+
"Base Model (MiMo-Audio-7B-Instruct)": base_mimo,
|
| 140 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)": emoact_mimo
|
| 141 |
+
}
|
| 142 |
|
| 143 |
@spaces.GPU
|
| 144 |
+
def generate_speech(model_choice, emotion, text):
|
| 145 |
+
"""Generate speech from text using selected model"""
|
|
|
|
|
|
|
| 146 |
if not text or not text.strip():
|
| 147 |
return None, "Please enter text to convert to speech."
|
| 148 |
+
|
| 149 |
+
# Select the appropriate model
|
| 150 |
+
selected_model = models[model_choice]
|
| 151 |
|
| 152 |
print(f"Using model: {model_choice}")
|
| 153 |
print("Generating:", text)
|
| 154 |
+
if emotion and emotion.strip():
|
| 155 |
+
print("With emotion:", emotion)
|
| 156 |
+
|
| 157 |
try:
|
| 158 |
# Create temporary file for output
|
| 159 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
| 160 |
output_path = tmp_file.name
|
| 161 |
|
| 162 |
+
# Generate TTS with or without emotion instruction
|
| 163 |
+
if emotion and emotion.strip():
|
| 164 |
+
selected_model.tts_sft(
|
| 165 |
+
text=text.strip(),
|
| 166 |
+
output_path=output_path,
|
| 167 |
+
instruct=emotion.strip()
|
| 168 |
+
)
|
| 169 |
+
else:
|
| 170 |
+
selected_model.tts_sft(
|
| 171 |
+
text=text.strip(),
|
| 172 |
+
output_path=output_path
|
| 173 |
+
)
|
| 174 |
|
| 175 |
return output_path, "✅ Speech generated successfully!"
|
| 176 |
|
|
|
|
| 178 |
return None, f"❌ Error: {str(e)}"
|
| 179 |
|
| 180 |
# Create Gradio interface
|
| 181 |
+
with gr.Blocks(title="MiMo-Audio TTS") as demo:
|
| 182 |
gr.Markdown("""
|
| 183 |
+
# 🎭 MiMo-Audio Text-to-Speech
|
| 184 |
+
|
| 185 |
+
Generate speech using MiMo-Audio models with optional emotion control.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
""")
|
| 187 |
|
| 188 |
with gr.Row():
|
| 189 |
with gr.Column():
|
| 190 |
model_selector = gr.Dropdown(
|
| 191 |
+
choices=[
|
| 192 |
+
"Base Model (MiMo-Audio-7B-Instruct)",
|
| 193 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)"
|
| 194 |
+
],
|
| 195 |
+
value="Base Model (MiMo-Audio-7B-Instruct)",
|
| 196 |
+
label="Model Selection",
|
| 197 |
+
info="Base model is stable. EmoAct is a beta model for intense emotional speech."
|
| 198 |
)
|
| 199 |
+
|
| 200 |
emotion_input = gr.Textbox(
|
| 201 |
+
label="Emotion (Optional - works best with EmoAct model)",
|
| 202 |
+
placeholder="e.g., 'intense anger, rage, fury' or leave empty for neutral",
|
| 203 |
lines=3
|
| 204 |
)
|
| 205 |
text_input = gr.Textbox(
|
| 206 |
label="Text",
|
| 207 |
+
placeholder="Enter the text to speak...",
|
| 208 |
lines=5
|
| 209 |
)
|
| 210 |
+
generate_btn = gr.Button("Generate Speech", variant="primary")
|
| 211 |
|
| 212 |
with gr.Column():
|
| 213 |
audio_output = gr.Audio(
|
|
|
|
| 219 |
interactive=False
|
| 220 |
)
|
| 221 |
|
| 222 |
+
gr.Markdown("""
|
| 223 |
+
### Model Information
|
| 224 |
+
|
| 225 |
+
**Base Model (MiMo-Audio-7B-Instruct)**: The original stable model from Xiaomi. Best for general text-to-speech.
|
| 226 |
+
|
| 227 |
+
**EmoAct-MiMo v1.1 (Beta)**: An experimental emotional model fine-tuned for intense emotional expressions.
|
| 228 |
+
- ⚠️ **Beta warning**: This is an early experiment and may hallucinate or produce unexpected results
|
| 229 |
+
- Works best with detailed emotion descriptions
|
| 230 |
+
- Currently does not support voice cloning
|
| 231 |
+
- Try multiple times for best results
|
| 232 |
+
""")
|
| 233 |
+
|
| 234 |
+
# Examples for both models
|
| 235 |
+
gr.Markdown("### Examples")
|
| 236 |
+
|
| 237 |
+
with gr.Tab("Base Model Examples"):
|
| 238 |
+
gr.Examples(
|
| 239 |
+
examples=[
|
| 240 |
+
[
|
| 241 |
+
"Base Model (MiMo-Audio-7B-Instruct)",
|
| 242 |
+
"",
|
| 243 |
+
"Hello, welcome to MiMo Audio text to speech. This is the base model speaking in a neutral tone."
|
| 244 |
+
],
|
| 245 |
+
[
|
| 246 |
+
"Base Model (MiMo-Audio-7B-Instruct)",
|
| 247 |
+
"",
|
| 248 |
+
"The quick brown fox jumps over the lazy dog. This is a test of the text to speech system."
|
| 249 |
+
],
|
| 250 |
],
|
| 251 |
+
inputs=[model_selector, emotion_input, text_input]
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
with gr.Tab("EmoAct Emotional Examples"):
|
| 255 |
+
gr.Examples(
|
| 256 |
+
examples=[
|
| 257 |
+
[
|
| 258 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)",
|
| 259 |
+
"intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
|
| 260 |
+
"You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
|
| 261 |
+
],
|
| 262 |
+
[
|
| 263 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)",
|
| 264 |
+
"overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
|
| 265 |
+
"I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
|
| 266 |
+
],
|
| 267 |
+
[
|
| 268 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)",
|
| 269 |
+
"extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
|
| 270 |
+
"(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
|
| 271 |
+
],
|
| 272 |
+
[
|
| 273 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)",
|
| 274 |
+
"intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
|
| 275 |
+
"YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
|
| 276 |
+
],
|
| 277 |
+
[
|
| 278 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)",
|
| 279 |
+
"crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
|
| 280 |
+
"(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
|
| 281 |
+
],
|
| 282 |
+
[
|
| 283 |
+
"EmoAct-MiMo v1.1 (Beta - Emotional)",
|
| 284 |
+
"bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
|
| 285 |
+
"Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
|
| 286 |
+
],
|
| 287 |
],
|
| 288 |
+
inputs=[model_selector, emotion_input, text_input]
|
| 289 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
# Event handler
|
| 292 |
generate_btn.click(
|
| 293 |
fn=generate_speech,
|
| 294 |
+
inputs=[model_selector, emotion_input, text_input],
|
| 295 |
outputs=[audio_output, status_output]
|
| 296 |
)
|
| 297 |
|