Spaces:
Running
on
Zero
Running
on
Zero
1.1 Beta
Browse files
app.py
CHANGED
|
@@ -119,22 +119,33 @@ merged_model.save_pretrained(merged_model_path)
|
|
| 119 |
tokenizer.save_pretrained(merged_model_path)
|
| 120 |
print(f"Merged model saved to {merged_model_path}")
|
| 121 |
|
| 122 |
-
# Initialize
|
| 123 |
-
print("Initializing
|
| 124 |
-
|
| 125 |
model_path=merged_model_path,
|
| 126 |
mimo_audio_tokenizer_path=tokenizer_path
|
| 127 |
)
|
| 128 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
@spaces.GPU
|
| 131 |
-
def generate_speech(emotion, text):
|
| 132 |
-
"""Generate emotional speech from text
|
| 133 |
if not emotion or not emotion.strip():
|
| 134 |
return None, "Please enter an emotion description."
|
| 135 |
if not text or not text.strip():
|
| 136 |
return None, "Please enter text to convert to speech."
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
print("Generating:", text)
|
| 139 |
print("With emotion:", emotion)
|
| 140 |
try:
|
|
@@ -142,11 +153,8 @@ def generate_speech(emotion, text):
|
|
| 142 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
| 143 |
output_path = tmp_file.name
|
| 144 |
|
| 145 |
-
# Format the instruction with emotion and text
|
| 146 |
-
full_instruction = f"Emotion: {emotion.strip()}\nText: {text.strip()}"
|
| 147 |
-
|
| 148 |
# Generate TTS with emotion instruction
|
| 149 |
-
|
| 150 |
text=text.strip(),
|
| 151 |
output_path=output_path,
|
| 152 |
instruct=emotion.strip()
|
|
@@ -173,6 +181,12 @@ with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
|
|
| 173 |
|
| 174 |
with gr.Row():
|
| 175 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
emotion_input = gr.Textbox(
|
| 177 |
label="Emotion",
|
| 178 |
placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
|
|
@@ -200,38 +214,44 @@ with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
|
|
| 200 |
examples=[
|
| 201 |
[
|
| 202 |
"intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
|
| 203 |
-
"You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
|
|
|
|
| 204 |
],
|
| 205 |
[
|
| 206 |
"overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
|
| 207 |
-
"I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
|
|
|
|
| 208 |
],
|
| 209 |
[
|
| 210 |
"extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
|
| 211 |
-
"(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
|
|
|
|
| 212 |
],
|
| 213 |
[
|
| 214 |
"intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
|
| 215 |
-
"YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
|
|
|
|
| 216 |
],
|
| 217 |
[
|
| 218 |
"crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
|
| 219 |
-
"(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
|
|
|
|
| 220 |
],
|
| 221 |
[
|
| 222 |
"bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
|
| 223 |
-
"Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
|
|
|
|
| 224 |
]
|
| 225 |
],
|
| 226 |
-
inputs=[emotion_input, text_input]
|
| 227 |
)
|
| 228 |
|
| 229 |
# Event handler
|
| 230 |
generate_btn.click(
|
| 231 |
fn=generate_speech,
|
| 232 |
-
inputs=[emotion_input, text_input],
|
| 233 |
outputs=[audio_output, status_output]
|
| 234 |
)
|
| 235 |
|
| 236 |
if __name__ == "__main__":
|
| 237 |
-
demo.launch()
|
|
|
|
| 119 |
tokenizer.save_pretrained(merged_model_path)
|
| 120 |
print(f"Merged model saved to {merged_model_path}")
|
| 121 |
|
| 122 |
+
# Initialize both models
|
| 123 |
+
print("Initializing EmoAct-MiMo (merged) model...")
|
| 124 |
+
emoact_model = MimoAudio(
|
| 125 |
model_path=merged_model_path,
|
| 126 |
mimo_audio_tokenizer_path=tokenizer_path
|
| 127 |
)
|
| 128 |
+
print("EmoAct-MiMo model ready!")
|
| 129 |
+
|
| 130 |
+
print("Initializing base MiMo-Audio model...")
|
| 131 |
+
base_mimo_model = MimoAudio(
|
| 132 |
+
model_path=base_model_path,
|
| 133 |
+
mimo_audio_tokenizer_path=tokenizer_path
|
| 134 |
+
)
|
| 135 |
+
print("Base MiMo-Audio model ready!")
|
| 136 |
|
| 137 |
@spaces.GPU
|
| 138 |
+
def generate_speech(emotion, text, model_choice):
|
| 139 |
+
"""Generate emotional speech from text"""
|
| 140 |
if not emotion or not emotion.strip():
|
| 141 |
return None, "Please enter an emotion description."
|
| 142 |
if not text or not text.strip():
|
| 143 |
return None, "Please enter text to convert to speech."
|
| 144 |
|
| 145 |
+
# Select model based on choice
|
| 146 |
+
selected_model = emoact_model if model_choice == "EmoAct-MiMo v1.1 (Beta)" else base_mimo_model
|
| 147 |
+
|
| 148 |
+
print(f"Using model: {model_choice}")
|
| 149 |
print("Generating:", text)
|
| 150 |
print("With emotion:", emotion)
|
| 151 |
try:
|
|
|
|
| 153 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
| 154 |
output_path = tmp_file.name
|
| 155 |
|
|
|
|
|
|
|
|
|
|
| 156 |
# Generate TTS with emotion instruction
|
| 157 |
+
selected_model.tts_sft(
|
| 158 |
text=text.strip(),
|
| 159 |
output_path=output_path,
|
| 160 |
instruct=emotion.strip()
|
|
|
|
| 181 |
|
| 182 |
with gr.Row():
|
| 183 |
with gr.Column():
|
| 184 |
+
model_selector = gr.Dropdown(
|
| 185 |
+
choices=["MiMo-Audio 7B (Default)", "EmoAct-MiMo v1.1 (Beta)"],
|
| 186 |
+
value="MiMo-Audio 7B (Default)",
|
| 187 |
+
label="Model",
|
| 188 |
+
info="EmoAct-MiMo v1.1 is a beta fine-tune with enhanced emotion control but may be less stable"
|
| 189 |
+
)
|
| 190 |
emotion_input = gr.Textbox(
|
| 191 |
label="Emotion",
|
| 192 |
placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
|
|
|
|
| 214 |
examples=[
|
| 215 |
[
|
| 216 |
"intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
|
| 217 |
+
"You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again.",
|
| 218 |
+
"MiMo-Audio 7B (Default)"
|
| 219 |
],
|
| 220 |
[
|
| 221 |
"overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
|
| 222 |
+
"I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?",
|
| 223 |
+
"MiMo-Audio 7B (Default)"
|
| 224 |
],
|
| 225 |
[
|
| 226 |
"extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
|
| 227 |
+
"(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive.",
|
| 228 |
+
"MiMo-Audio 7B (Default)"
|
| 229 |
],
|
| 230 |
[
|
| 231 |
"intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
|
| 232 |
+
"YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!",
|
| 233 |
+
"EmoAct-MiMo v1.1 (Beta)"
|
| 234 |
],
|
| 235 |
[
|
| 236 |
"crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
|
| 237 |
+
"(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside.",
|
| 238 |
+
"EmoAct-MiMo v1.1 (Beta)"
|
| 239 |
],
|
| 240 |
[
|
| 241 |
"bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
|
| 242 |
+
"Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
|
| 243 |
+
"EmoAct-MiMo v1.1 (Beta)"
|
| 244 |
]
|
| 245 |
],
|
| 246 |
+
inputs=[emotion_input, text_input, model_selector]
|
| 247 |
)
|
| 248 |
|
| 249 |
# Event handler
|
| 250 |
generate_btn.click(
|
| 251 |
fn=generate_speech,
|
| 252 |
+
inputs=[emotion_input, text_input, model_selector],
|
| 253 |
outputs=[audio_output, status_output]
|
| 254 |
)
|
| 255 |
|
| 256 |
if __name__ == "__main__":
|
| 257 |
+
demo.launch()
|