Spaces:

mrfakename
/

EmoAct-MiMo

Running on Zero

App Files Community

mrfakename committed 18 days ago

Commit

e6aa5b4

verified ·

1 Parent(s): 20ca5a7

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -75

app.py CHANGED Viewed

@@ -120,45 +120,57 @@ tokenizer.save_pretrained(merged_model_path)
 print(f"Merged model saved to {merged_model_path}")
 # Initialize both models
-print("Initializing EmoAct-MiMo (merged) model...")
-emoact_model = MimoAudio(
-    model_path=merged_model_path,
     mimo_audio_tokenizer_path=tokenizer_path
 )
-print("EmoAct-MiMo model ready!")
-print("Initializing base MiMo-Audio model...")
-base_mimo_model = MimoAudio(
-    model_path=base_model_path,
     mimo_audio_tokenizer_path=tokenizer_path
 )
-print("Base MiMo-Audio model ready!")
 @spaces.GPU
-def generate_speech(emotion, text, model_choice):
-    """Generate emotional speech from text"""
-    if not emotion or not emotion.strip():
-        return None, "Please enter an emotion description."
     if not text or not text.strip():
         return None, "Please enter text to convert to speech."
-    # Select model based on choice
-    selected_model = emoact_model if model_choice == "EmoAct-MiMo v1.1 (Beta)" else base_mimo_model
     print(f"Using model: {model_choice}")
     print("Generating:", text)
-    print("With emotion:", emotion)
     try:
         # Create temporary file for output
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
-        # Generate TTS with emotion instruction
-        selected_model.tts_sft(
-            text=text.strip(),
-            output_path=output_path,
-            instruct=emotion.strip()
-        )
         return output_path, "✅ Speech generated successfully!"
@@ -166,38 +178,36 @@ def generate_speech(emotion, text, model_choice):
         return None, f"❌ Error: {str(e)}"
 # Create Gradio interface
-with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
     gr.Markdown("""
-    # 🎭 EmoAct-MiMo: Emotion-Controllable Text-to-Speech
-    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).
-    This is still a very early experiment and is very early in the training run, I need to change a few settings and retrain. But the model turned out quite nicely!
-    It may hallucinate, try a few times to get good results.
-    Voice cloning is not supported yet.
     """)
     with gr.Row():
         with gr.Column():
             model_selector = gr.Dropdown(
-                choices=["MiMo-Audio 7B (Default)", "EmoAct-MiMo v1.1 (Beta)"],
-                value="MiMo-Audio 7B (Default)",
-                label="Model",
-                info="EmoAct-MiMo v1.1 is a beta fine-tune with enhanced emotion control but may be less stable"
             )
             emotion_input = gr.Textbox(
-                label="Emotion",
-                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
                 lines=3
             )
             text_input = gr.Textbox(
                 label="Text",
-                placeholder="Enter the text to speak with emotion...",
                 lines=5
             )
-            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(
@@ -209,47 +219,79 @@ with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
                 interactive=False
             )
-    # Intense emotion examples
-    gr.Examples(
-        examples=[
-            [
-                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
-                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again.",
-                "MiMo-Audio 7B (Default)"
-            ],
-            [
-                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
-                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?",
-                "MiMo-Audio 7B (Default)"
-            ],
-            [
-                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
-                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive.",
-                "MiMo-Audio 7B (Default)"
-            ],
-            [
-                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
-                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!",
-                "EmoAct-MiMo v1.1 (Beta)"
             ],
-            [
-                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
-                "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside.",
-                "EmoAct-MiMo v1.1 (Beta)"
             ],
-            [
-                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
-                "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
-                "EmoAct-MiMo v1.1 (Beta)"
-            ]
-        ],
-        inputs=[emotion_input, text_input, model_selector]
-    )
     # Event handler
     generate_btn.click(
         fn=generate_speech,
-        inputs=[emotion_input, text_input, model_selector],
         outputs=[audio_output, status_output]
     )

 print(f"Merged model saved to {merged_model_path}")
 # Initialize both models
+print("Initializing base model...")
+base_mimo = MimoAudio(
+    model_path=base_model_path,
     mimo_audio_tokenizer_path=tokenizer_path
 )
+print("Base model ready!")
+print("Initializing EmoAct model...")
+emoact_mimo = MimoAudio(
+    model_path=merged_model_path,
     mimo_audio_tokenizer_path=tokenizer_path
 )
+print("EmoAct model ready!")
+# Store models in a dict for easy access
+models = {
+    "Base Model (MiMo-Audio-7B-Instruct)": base_mimo,
+    "EmoAct-MiMo v1.1 (Beta - Emotional)": emoact_mimo
+}
 @spaces.GPU
+def generate_speech(model_choice, emotion, text):
+    """Generate speech from text using selected model"""
     if not text or not text.strip():
         return None, "Please enter text to convert to speech."
+    # Select the appropriate model
+    selected_model = models[model_choice]
     print(f"Using model: {model_choice}")
     print("Generating:", text)
+    if emotion and emotion.strip():
+        print("With emotion:", emotion)
     try:
         # Create temporary file for output
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
+        # Generate TTS with or without emotion instruction
+        if emotion and emotion.strip():
+            selected_model.tts_sft(
+                text=text.strip(),
+                output_path=output_path,
+                instruct=emotion.strip()
+            )
+        else:
+            selected_model.tts_sft(
+                text=text.strip(),
+                output_path=output_path
+            )
         return output_path, "✅ Speech generated successfully!"
         return None, f"❌ Error: {str(e)}"
 # Create Gradio interface
+with gr.Blocks(title="MiMo-Audio TTS") as demo:
     gr.Markdown("""
+    # 🎭 MiMo-Audio Text-to-Speech
+    Generate speech using MiMo-Audio models with optional emotion control.
     """)
     with gr.Row():
         with gr.Column():
             model_selector = gr.Dropdown(
+                choices=[
+                    "Base Model (MiMo-Audio-7B-Instruct)",
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)"
+                ],
+                value="Base Model (MiMo-Audio-7B-Instruct)",
+                label="Model Selection",
+                info="Base model is stable. EmoAct is a beta model for intense emotional speech."
             )
             emotion_input = gr.Textbox(
+                label="Emotion (Optional - works best with EmoAct model)",
+                placeholder="e.g., 'intense anger, rage, fury' or leave empty for neutral",
                 lines=3
             )
             text_input = gr.Textbox(
                 label="Text",
+                placeholder="Enter the text to speak...",
                 lines=5
             )
+            generate_btn = gr.Button("Generate Speech", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(
                 interactive=False
             )
+    gr.Markdown("""
+    ### Model Information
+    **Base Model (MiMo-Audio-7B-Instruct)**: The original stable model from Xiaomi. Best for general text-to-speech.
+    **EmoAct-MiMo v1.1 (Beta)**: An experimental emotional model fine-tuned for intense emotional expressions.
+    - ⚠️ **Beta warning**: This is an early experiment and may hallucinate or produce unexpected results
+    - Works best with detailed emotion descriptions
+    - Currently does not support voice cloning
+    - Try multiple times for best results
+    """)
+    # Examples for both models
+    gr.Markdown("### Examples")
+    with gr.Tab("Base Model Examples"):
+        gr.Examples(
+            examples=[
+                [
+                    "Base Model (MiMo-Audio-7B-Instruct)",
+                    "",
+                    "Hello, welcome to MiMo Audio text to speech. This is the base model speaking in a neutral tone."
+                ],
+                [
+                    "Base Model (MiMo-Audio-7B-Instruct)",
+                    "",
+                    "The quick brown fox jumps over the lazy dog. This is a test of the text to speech system."
+                ],
             ],
+            inputs=[model_selector, emotion_input, text_input]
+        )
+    with gr.Tab("EmoAct Emotional Examples"):
+        gr.Examples(
+            examples=[
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
+                    "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
+                    "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
+                    "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
+                    "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
+                    "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
+                    "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
+                ],
             ],
+            inputs=[model_selector, emotion_input, text_input]
+        )
     # Event handler
     generate_btn.click(
         fn=generate_speech,
+        inputs=[model_selector, emotion_input, text_input],
         outputs=[audio_output, status_output]
     )