Spaces:

mrfakename
/

EmoAct-MiMo

Running on Zero

File size: 11,339 Bytes

ae34825
de8adeb
c6fc372
de8adeb
 
 
 
 
 
 
a38af7a
de8adeb
 
 
 
a38af7a
 
 
 
 
de8adeb
a38af7a
 
 
 
de8adeb
 
 
 
a38af7a
de8adeb
a38af7a
 
 
de8adeb
a38af7a
de8adeb
 
 
 
 
 
a38af7a
de8adeb
 
 
 
 
 
 
 
ae34825
ef96930
 
 
 
d7cad10
 
 
ef96930
 
 
 
d7cad10
 
 
ef96930
d7cad10
ef96930
da189fe
d7cad10
 
da189fe
 
 
573876e
 
 
d7cad10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da189fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573876e
 
da189fe
d7cad10
 
 
 
 
573876e
da189fe
573876e
 
da189fe
573876e
da189fe
573876e
da189fe
573876e
 
da189fe
 
 
 
573876e
da189fe
 
 
 
 
124e60c
ef96930
da189fe
 
20ca5a7
 
da189fe
e6aa5b4
da189fe
e6aa5b4
da189fe
573876e
e6aa5b4
ef96930
 
e6aa5b4
da189fe
 
 
ef96930
d7cad10
da189fe
20ca5a7
c8eead4
da189fe
e6aa5b4
ef96930
da189fe
 
 
ef96930
 
 
 
da189fe
 
ef96930
da189fe
 
 
 
 
 
 
 
ef96930
 
 
 
 
da189fe
ef96930
da189fe
 
 
 
 
 
 
 
 
ef96930
 
 
 
20ca5a7
573876e
da189fe
e6aa5b4
573876e
20ca5a7
d7cad10
da189fe
 
d7cad10
 
ef96930
d7cad10
da189fe
ef96930
 
da189fe
ef96930
 
 
 
 
 
 
 
 
 
 
da189fe
 
 
 
 
 
 
d7cad10
da189fe
 
 
 
d7cad10
da189fe
 
 
 
 
 
 
 
 
 
 
573876e
da189fe
c7f28af
da189fe
 
573876e
da189fe
c7f28af
da189fe
 
 
 
ef96930
 
 
 
e6aa5b4
ef96930
 
 
 
20ca5a7

import subprocess
import sys
import os
import torch
import platform

def install_flash_attention():
    # --- Step 1: Detect system info ---
    py_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
    torch_version = torch.__version__.split("+")[0]  # e.g., '2.6.0'
    cuda_version = torch.version.cuda or "cpu"
    cxx11abi = "FALSE" if torch._C._GLIBCXX_USE_CXX11_ABI == 0 else "TRUE"
    system = platform.system().lower()
    arch = platform.machine()

    # --- Step 2: Normalize CUDA and torch version formatting ---
    if cuda_version != "cpu":
        # Extract only major.minor (e.g., 12.1 -> 12)
        cuda_major = cuda_version.split(".")[0]
        cuda_tag = f"cu{cuda_major}"
    else:
        cuda_tag = "cpu"

    # Use only torch major.minor (e.g., 2.6.0 -> 2.6)
    torch_tag = torch_version[:3]

    # --- Step 3: Build the wheel URL ---
    base_url = "https://github.com/Dao-AILab/flash-attention/releases/download"
    release_tag = "v2.7.4.post1"

    wheel_name = (
        f"flash_attn-2.7.4.post1+{cuda_tag}torch{torch_tag}"
        f"cxx11abi{cxx11abi}-"
        f"{py_version}-{py_version}-linux_x86_64.whl"
    )

    wheel_url = f"{base_url}/{release_tag}/{wheel_name}"

    print(f"🔥 Installing FlashAttention wheel:\n{wheel_url}\n")

    # --- Step 4: Install it ---
    env = dict(**os.environ, FLASH_ATTENTION_SKIP_CUDA_BUILD="TRUE")

    subprocess.run(
        ["pip", "install", wheel_url, "--no-build-isolation"],
        env=env,
        check=True,
    )

install_flash_attention()


import gradio as gr
import spaces
import torch
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from src.mimo_audio.modeling_mimo_audio import MiMoAudioArguments, MiMoAudioForCausalLM
from peft import PeftModel
from src.mimo_audio.mimo_audio import MimoAudio
import tempfile
import os

# Download base models from Hugging Face
print("Downloading MiMo-Audio base models from Hugging Face...")
base_model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
print(f"Base models downloaded to: {base_model_path}")

# Download both LoRA weights
print("Downloading EmoAct-MiMo LoRA weights...")
hf_token = os.environ.get("HF_TOKEN")
lora_v1_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo", token=hf_token)
print(f"LoRA v1.0 weights downloaded to: {lora_v1_path}")

print("Downloading EmoAct-MiMo v1.2 (Beta) LoRA weights...")
lora_v1_1_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo-v1.2", token=hf_token)
print(f"LoRA v1.2 (Beta) weights downloaded to: {lora_v1_1_path}")

# Load tokenizer and get special tokens
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
sosp_idx = tokenizer.convert_tokens_to_ids("<|sosp|>")
eosp_idx = tokenizer.convert_tokens_to_ids("<|eosp|>")
empty_token = tokenizer.convert_tokens_to_ids("<|empty|>")
sostm_idx = tokenizer.convert_tokens_to_ids("<|sostm|>")
eostm_idx = tokenizer.convert_tokens_to_ids("<|eostm|>")
eot_idx = tokenizer.convert_tokens_to_ids("<|eot|>")

# Create model args
model_args = MiMoAudioArguments(
    model_name_or_path=base_model_path,
    sosp_idx=sosp_idx,
    eosp_idx=eosp_idx,
    empty_idx=empty_token,
    sostm_idx=sostm_idx,
    eostm_idx=eostm_idx,
    eot_idx=eot_idx,
)

# Load base model for v1.0
print("Loading base MiMo-Audio model for v1.0...")
base_model_v1 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.0 loaded")

# Load and merge LoRA v1.0
print("Loading LoRA v1.0 adapter...")
model_with_lora_v1 = PeftModel.from_pretrained(base_model_v1, lora_v1_path)
print("Merging LoRA v1.0 weights...")
merged_model_v1 = model_with_lora_v1.merge_and_unload()
print("LoRA v1.0 weights merged!")

# Save merged model v1.0 to temporary directory
print("Saving merged model v1.0...")
merged_model_v1_path = "/tmp/merged_mimo_audio_v1"
os.makedirs(merged_model_v1_path, exist_ok=True)
merged_model_v1.save_pretrained(merged_model_v1_path)
tokenizer.save_pretrained(merged_model_v1_path)
print(f"Merged model v1.0 saved to {merged_model_v1_path}")

# Load base model for v1.2
print("Loading base MiMo-Audio model for v1.2...")
base_model_v1_1 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.2 loaded")

# Load and merge LoRA v1.2
print("Loading LoRA v1.2 (Beta) adapter...")
model_with_lora_v1_1 = PeftModel.from_pretrained(base_model_v1_1, lora_v1_1_path)
print("Merging LoRA v1.2 (Beta) weights...")
merged_model_v1_1 = model_with_lora_v1_1.merge_and_unload()
print("LoRA v1.2 (Beta) weights merged!")

# Save merged model v1.2 to temporary directory
print("Saving merged model v1.2...")
merged_model_v1_1_path = "/tmp/merged_mimo_audio_v1_1"
os.makedirs(merged_model_v1_1_path, exist_ok=True)
merged_model_v1_1.save_pretrained(merged_model_v1_1_path)
tokenizer.save_pretrained(merged_model_v1_1_path)
print(f"Merged model v1.2 (Beta) saved to {merged_model_v1_1_path}")

# Initialize both MimoAudio models
print("Initializing MimoAudio wrappers...")
model_v1 = MimoAudio(
    model_path=merged_model_v1_path,
    mimo_audio_tokenizer_path=tokenizer_path
)
model_v1_1 = MimoAudio(
    model_path=merged_model_v1_1_path,
    mimo_audio_tokenizer_path=tokenizer_path
)
print("Both models ready!")

# Dictionary to store models
models = {
    "EmoAct-MiMo v1.0 (Stable)": model_v1,
    "EmoAct-MiMo v1.2 (Beta - Experimental)": model_v1_1
}

@spaces.GPU
def generate_speech(model_choice, emotion, text):
    """Generate emotional speech from text using selected EmoAct-MiMo model"""
    if not emotion or not emotion.strip():
        return None, "Please enter an emotion description."
    if not text or not text.strip():
        return None, "Please enter text to convert to speech."

    print(f"Using model: {model_choice}")
    print("Generating:", text)
    print("With emotion:", emotion)
    
    try:
        # Select the appropriate model
        model = models[model_choice]
        
        # Create temporary file for output
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Format the instruction with emotion and text
        full_instruction = f"Emotion: {emotion.strip()}\nText: {text.strip()}"

        # Generate TTS with emotion instruction
        model.tts_sft(
            text=text.strip(),
            output_path=output_path,
            instruct=emotion.strip()
        )

        return output_path, f"✅ Speech generated successfully using {model_choice}!"

    except Exception as e:
        return None, f"❌ Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
    gr.Markdown("""
    # 🎭 EmoAct-MiMo: Emotion-Controllable Text-to-Speech

    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).

    This is still a very early experiment and is very early in the training run, I need to change a few settings and retrain. But the model turned out quite nicely!

    It may hallucinate, try a few times to get good results.

    Voice cloning is not supported yet.
    """)

    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=["EmoAct-MiMo v1.0 (Stable)", "EmoAct-MiMo v1.2 (Beta - Experimental)"],
                value="EmoAct-MiMo v1.0 (Stable)",
                label="Model Selection",
                info="v1.0 is the current stable model. v1.2 is a beta experimental version with potentially different characteristics."
            )
            emotion_input = gr.Textbox(
                label="Emotion",
                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
                lines=3
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to speak with emotion...",
                lines=5
            )
            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Intense emotion examples
    gr.Examples(
        examples=[
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
                "What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
                "Of course they chose you. They always choose you. <laugh> Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
            ]
        ],
        inputs=[model_selector, emotion_input, text_input]
    )

    # Event handler
    generate_btn.click(
        fn=generate_speech,
        inputs=[model_selector, emotion_input, text_input],
        outputs=[audio_output, status_output]
    )

if __name__ == "__main__":
    demo.launch()