File size: 11,339 Bytes
ae34825
de8adeb
c6fc372
de8adeb
 
 
 
 
 
 
a38af7a
de8adeb
 
 
 
a38af7a
 
 
 
 
de8adeb
a38af7a
 
 
 
de8adeb
 
 
 
a38af7a
de8adeb
a38af7a
 
 
de8adeb
a38af7a
de8adeb
 
 
 
 
 
a38af7a
de8adeb
 
 
 
 
 
 
 
ae34825
ef96930
 
 
 
d7cad10
 
 
ef96930
 
 
 
d7cad10
 
 
ef96930
d7cad10
ef96930
da189fe
d7cad10
 
da189fe
 
 
573876e
 
 
d7cad10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da189fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573876e
 
da189fe
d7cad10
 
 
 
 
573876e
da189fe
573876e
 
da189fe
573876e
da189fe
573876e
da189fe
573876e
 
da189fe
 
 
 
573876e
da189fe
 
 
 
 
124e60c
ef96930
da189fe
 
20ca5a7
 
da189fe
e6aa5b4
da189fe
e6aa5b4
da189fe
573876e
e6aa5b4
ef96930
 
e6aa5b4
da189fe
 
 
ef96930
d7cad10
da189fe
20ca5a7
c8eead4
da189fe
e6aa5b4
ef96930
da189fe
 
 
ef96930
 
 
 
da189fe
 
ef96930
da189fe
 
 
 
 
 
 
 
ef96930
 
 
 
 
da189fe
ef96930
da189fe
 
 
 
 
 
 
 
 
ef96930
 
 
 
20ca5a7
573876e
da189fe
e6aa5b4
573876e
20ca5a7
d7cad10
da189fe
 
d7cad10
 
ef96930
d7cad10
da189fe
ef96930
 
da189fe
ef96930
 
 
 
 
 
 
 
 
 
 
da189fe
 
 
 
 
 
 
d7cad10
da189fe
 
 
 
d7cad10
da189fe
 
 
 
 
 
 
 
 
 
 
573876e
da189fe
c7f28af
da189fe
 
573876e
da189fe
c7f28af
da189fe
 
 
 
ef96930
 
 
 
e6aa5b4
ef96930
 
 
 
20ca5a7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
import subprocess
import sys
import os
import torch
import platform

def install_flash_attention():
    # --- Step 1: Detect system info ---
    py_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
    torch_version = torch.__version__.split("+")[0]  # e.g., '2.6.0'
    cuda_version = torch.version.cuda or "cpu"
    cxx11abi = "FALSE" if torch._C._GLIBCXX_USE_CXX11_ABI == 0 else "TRUE"
    system = platform.system().lower()
    arch = platform.machine()

    # --- Step 2: Normalize CUDA and torch version formatting ---
    if cuda_version != "cpu":
        # Extract only major.minor (e.g., 12.1 -> 12)
        cuda_major = cuda_version.split(".")[0]
        cuda_tag = f"cu{cuda_major}"
    else:
        cuda_tag = "cpu"

    # Use only torch major.minor (e.g., 2.6.0 -> 2.6)
    torch_tag = torch_version[:3]

    # --- Step 3: Build the wheel URL ---
    base_url = "https://github.com/Dao-AILab/flash-attention/releases/download"
    release_tag = "v2.7.4.post1"

    wheel_name = (
        f"flash_attn-2.7.4.post1+{cuda_tag}torch{torch_tag}"
        f"cxx11abi{cxx11abi}-"
        f"{py_version}-{py_version}-linux_x86_64.whl"
    )

    wheel_url = f"{base_url}/{release_tag}/{wheel_name}"

    print(f"πŸ”₯ Installing FlashAttention wheel:\n{wheel_url}\n")

    # --- Step 4: Install it ---
    env = dict(**os.environ, FLASH_ATTENTION_SKIP_CUDA_BUILD="TRUE")

    subprocess.run(
        ["pip", "install", wheel_url, "--no-build-isolation"],
        env=env,
        check=True,
    )

install_flash_attention()


import gradio as gr
import spaces
import torch
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from src.mimo_audio.modeling_mimo_audio import MiMoAudioArguments, MiMoAudioForCausalLM
from peft import PeftModel
from src.mimo_audio.mimo_audio import MimoAudio
import tempfile
import os

# Download base models from Hugging Face
print("Downloading MiMo-Audio base models from Hugging Face...")
base_model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
print(f"Base models downloaded to: {base_model_path}")

# Download both LoRA weights
print("Downloading EmoAct-MiMo LoRA weights...")
hf_token = os.environ.get("HF_TOKEN")
lora_v1_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo", token=hf_token)
print(f"LoRA v1.0 weights downloaded to: {lora_v1_path}")

print("Downloading EmoAct-MiMo v1.2 (Beta) LoRA weights...")
lora_v1_1_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo-v1.2", token=hf_token)
print(f"LoRA v1.2 (Beta) weights downloaded to: {lora_v1_1_path}")

# Load tokenizer and get special tokens
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
sosp_idx = tokenizer.convert_tokens_to_ids("<|sosp|>")
eosp_idx = tokenizer.convert_tokens_to_ids("<|eosp|>")
empty_token = tokenizer.convert_tokens_to_ids("<|empty|>")
sostm_idx = tokenizer.convert_tokens_to_ids("<|sostm|>")
eostm_idx = tokenizer.convert_tokens_to_ids("<|eostm|>")
eot_idx = tokenizer.convert_tokens_to_ids("<|eot|>")

# Create model args
model_args = MiMoAudioArguments(
    model_name_or_path=base_model_path,
    sosp_idx=sosp_idx,
    eosp_idx=eosp_idx,
    empty_idx=empty_token,
    sostm_idx=sostm_idx,
    eostm_idx=eostm_idx,
    eot_idx=eot_idx,
)

# Load base model for v1.0
print("Loading base MiMo-Audio model for v1.0...")
base_model_v1 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.0 loaded")

# Load and merge LoRA v1.0
print("Loading LoRA v1.0 adapter...")
model_with_lora_v1 = PeftModel.from_pretrained(base_model_v1, lora_v1_path)
print("Merging LoRA v1.0 weights...")
merged_model_v1 = model_with_lora_v1.merge_and_unload()
print("LoRA v1.0 weights merged!")

# Save merged model v1.0 to temporary directory
print("Saving merged model v1.0...")
merged_model_v1_path = "/tmp/merged_mimo_audio_v1"
os.makedirs(merged_model_v1_path, exist_ok=True)
merged_model_v1.save_pretrained(merged_model_v1_path)
tokenizer.save_pretrained(merged_model_v1_path)
print(f"Merged model v1.0 saved to {merged_model_v1_path}")

# Load base model for v1.2
print("Loading base MiMo-Audio model for v1.2...")
base_model_v1_1 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.2 loaded")

# Load and merge LoRA v1.2
print("Loading LoRA v1.2 (Beta) adapter...")
model_with_lora_v1_1 = PeftModel.from_pretrained(base_model_v1_1, lora_v1_1_path)
print("Merging LoRA v1.2 (Beta) weights...")
merged_model_v1_1 = model_with_lora_v1_1.merge_and_unload()
print("LoRA v1.2 (Beta) weights merged!")

# Save merged model v1.2 to temporary directory
print("Saving merged model v1.2...")
merged_model_v1_1_path = "/tmp/merged_mimo_audio_v1_1"
os.makedirs(merged_model_v1_1_path, exist_ok=True)
merged_model_v1_1.save_pretrained(merged_model_v1_1_path)
tokenizer.save_pretrained(merged_model_v1_1_path)
print(f"Merged model v1.2 (Beta) saved to {merged_model_v1_1_path}")

# Initialize both MimoAudio models
print("Initializing MimoAudio wrappers...")
model_v1 = MimoAudio(
    model_path=merged_model_v1_path,
    mimo_audio_tokenizer_path=tokenizer_path
)
model_v1_1 = MimoAudio(
    model_path=merged_model_v1_1_path,
    mimo_audio_tokenizer_path=tokenizer_path
)
print("Both models ready!")

# Dictionary to store models
models = {
    "EmoAct-MiMo v1.0 (Stable)": model_v1,
    "EmoAct-MiMo v1.2 (Beta - Experimental)": model_v1_1
}

@spaces.GPU
def generate_speech(model_choice, emotion, text):
    """Generate emotional speech from text using selected EmoAct-MiMo model"""
    if not emotion or not emotion.strip():
        return None, "Please enter an emotion description."
    if not text or not text.strip():
        return None, "Please enter text to convert to speech."

    print(f"Using model: {model_choice}")
    print("Generating:", text)
    print("With emotion:", emotion)
    
    try:
        # Select the appropriate model
        model = models[model_choice]
        
        # Create temporary file for output
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Format the instruction with emotion and text
        full_instruction = f"Emotion: {emotion.strip()}\nText: {text.strip()}"

        # Generate TTS with emotion instruction
        model.tts_sft(
            text=text.strip(),
            output_path=output_path,
            instruct=emotion.strip()
        )

        return output_path, f"βœ… Speech generated successfully using {model_choice}!"

    except Exception as e:
        return None, f"❌ Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
    gr.Markdown("""
    # 🎭 EmoAct-MiMo: Emotion-Controllable Text-to-Speech

    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).

    This is still a very early experiment and is very early in the training run, I need to change a few settings and retrain. But the model turned out quite nicely!

    It may hallucinate, try a few times to get good results.

    Voice cloning is not supported yet.
    """)

    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=["EmoAct-MiMo v1.0 (Stable)", "EmoAct-MiMo v1.2 (Beta - Experimental)"],
                value="EmoAct-MiMo v1.0 (Stable)",
                label="Model Selection",
                info="v1.0 is the current stable model. v1.2 is a beta experimental version with potentially different characteristics."
            )
            emotion_input = gr.Textbox(
                label="Emotion",
                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
                lines=3
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to speak with emotion...",
                lines=5
            )
            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Intense emotion examples
    gr.Examples(
        examples=[
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
                "What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
                "Of course they chose you. They always choose you. <laugh> Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
            ]
        ],
        inputs=[model_selector, emotion_input, text_input]
    )

    # Event handler
    generate_btn.click(
        fn=generate_speech,
        inputs=[model_selector, emotion_input, text_input],
        outputs=[audio_output, status_output]
    )

if __name__ == "__main__":
    demo.launch()