import subprocess
import sys
import os
import torch
import platform
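# Building flash-attn from source needs nvcc and a long compile, which is not
# practical at Space startup, so we instead pull a prebuilt wheel matching the
# current Python / torch / CUDA / C++ ABI combination from the official
# flash-attention GitHub releases.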
def install_flash_attention():
    # --- Step 1: Detect system info ---
    py_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
    torch_version = torch.__version__.split("+")[0]  # e.g., '2.6.0'
    cuda_version = torch.version.cuda or "cpu"
    cxx11abi = "TRUE" if torch._C._GLIBCXX_USE_CXX11_ABI else "FALSE"
    system = platform.system().lower()
    arch = platform.machine()

    # --- Step 2: Normalize CUDA and torch version formatting ---
    if cuda_version != "cpu":
        # Keep only the CUDA major version (e.g., 12.1 -> cu12)
        cuda_major = cuda_version.split(".")[0]
        cuda_tag = f"cu{cuda_major}"
    else:
        cuda_tag = "cpu"

    # Keep only torch major.minor (e.g., 2.6.0 -> 2.6); split on dots rather
    # than slicing so a two-digit minor version (e.g., 2.10.0) is handled too
    torch_tag = ".".join(torch_version.split(".")[:2])

    # --- Step 3: Build the wheel URL ---
    base_url = "https://github.com/Dao-AILab/flash-attention/releases/download"
    release_tag = "v2.7.4.post1"
    wheel_name = (
        f"flash_attn-2.7.4.post1+{cuda_tag}torch{torch_tag}"
        f"cxx11abi{cxx11abi}-"
        f"{py_version}-{py_version}-{system}_{arch}.whl"
    )
    wheel_url = f"{base_url}/{release_tag}/{wheel_name}"
    print(f"Installing FlashAttention wheel:\n{wheel_url}\n")

    # --- Step 4: Install it into the running interpreter ---
    env = dict(**os.environ, FLASH_ATTENTION_SKIP_CUDA_BUILD="TRUE")
    subprocess.run(
        [sys.executable, "-m", "pip", "install", wheel_url, "--no-build-isolation"],
        env=env,
        check=True,
    )
install_flash_attention()
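# flash_attn has to be installed before the MiMo-Audio modules below are
# imported, since they presumably import it at module load time.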
import tempfile

import gradio as gr
import spaces
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from peft import PeftModel

from src.mimo_audio.modeling_mimo_audio import MiMoAudioArguments, MiMoAudioForCausalLM
from src.mimo_audio.mimo_audio import MimoAudio
# Download base models from Hugging Face
print("Downloading MiMo-Audio base models from Hugging Face...")
base_model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
print(f"Base models downloaded to: {base_model_path}")
# Download both LoRA weights
print("Downloading EmoAct-MiMo LoRA weights...")
# HF_TOKEN is read from the environment so private or gated adapter repos can be downloaded
hf_token = os.environ.get("HF_TOKEN")
lora_v1_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo", token=hf_token)
print(f"LoRA v1.0 weights downloaded to: {lora_v1_path}")
print("Downloading EmoAct-MiMo v1.2 (Beta) LoRA weights...")
lora_v1_2_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo-v1.2", token=hf_token)
print(f"LoRA v1.2 (Beta) weights downloaded to: {lora_v1_2_path}")
# Load tokenizer and get special tokens
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
sosp_idx = tokenizer.convert_tokens_to_ids("<|sosp|>")
eosp_idx = tokenizer.convert_tokens_to_ids("<|eosp|>")
empty_idx = tokenizer.convert_tokens_to_ids("<|empty|>")
sostm_idx = tokenizer.convert_tokens_to_ids("<|sostm|>")
eostm_idx = tokenizer.convert_tokens_to_ids("<|eostm|>")
eot_idx = tokenizer.convert_tokens_to_ids("<|eot|>")
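# These special tokens presumably delimit the speech and text segments of
# MiMo-Audio's interleaved token stream (e.g., <|sosp|>/<|eosp|> marking the
# start/end of speech spans); their ids are handed to the model via
# MiMoAudioArguments below.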
# Create model args
model_args = MiMoAudioArguments(
    model_name_or_path=base_model_path,
    sosp_idx=sosp_idx,
    eosp_idx=eosp_idx,
    empty_idx=empty_idx,
    sostm_idx=sostm_idx,
    eostm_idx=eostm_idx,
    eot_idx=eot_idx,
)
# Load base model for v1.0
print("Loading base MiMo-Audio model for v1.0...")
base_model_v1 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.0 loaded")
# Load and merge LoRA v1.0
print("Loading LoRA v1.0 adapter...")
model_with_lora_v1 = PeftModel.from_pretrained(base_model_v1, lora_v1_path)
print("Merging LoRA v1.0 weights...")
merged_model_v1 = model_with_lora_v1.merge_and_unload()
print("LoRA v1.0 weights merged!")
# Save merged model v1.0 to temporary directory
print("Saving merged model v1.0...")
merged_model_v1_path = "/tmp/merged_mimo_audio_v1"
os.makedirs(merged_model_v1_path, exist_ok=True)
merged_model_v1.save_pretrained(merged_model_v1_path)
tokenizer.save_pretrained(merged_model_v1_path)
print(f"Merged model v1.0 saved to {merged_model_v1_path}")
# Load base model for v1.2
print("Loading base MiMo-Audio model for v1.2...")
base_model_v1_2 = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model v1.2 loaded")

# Load and merge LoRA v1.2
print("Loading LoRA v1.2 (Beta) adapter...")
model_with_lora_v1_2 = PeftModel.from_pretrained(base_model_v1_2, lora_v1_2_path)
print("Merging LoRA v1.2 (Beta) weights...")
merged_model_v1_2 = model_with_lora_v1_2.merge_and_unload()
print("LoRA v1.2 (Beta) weights merged!")

# Save merged model v1.2 to a temporary directory
print("Saving merged model v1.2...")
merged_model_v1_2_path = "/tmp/merged_mimo_audio_v1_2"
os.makedirs(merged_model_v1_2_path, exist_ok=True)
merged_model_v1_2.save_pretrained(merged_model_v1_2_path)
tokenizer.save_pretrained(merged_model_v1_2_path)
print(f"Merged model v1.2 (Beta) saved to {merged_model_v1_2_path}")
# Initialize both MimoAudio models
print("Initializing MimoAudio wrappers...")
model_v1 = MimoAudio(
    model_path=merged_model_v1_path,
    mimo_audio_tokenizer_path=tokenizer_path,
)
model_v1_2 = MimoAudio(
    model_path=merged_model_v1_2_path,
    mimo_audio_tokenizer_path=tokenizer_path,
)
print("Both models ready!")
# Dictionary to store models
models = {
    "EmoAct-MiMo v1.0 (Stable)": model_v1,
    "EmoAct-MiMo v1.2 (Beta - Experimental)": model_v1_2,
}
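# On ZeroGPU Spaces, @spaces.GPU requests a GPU for the duration of each
# decorated call and releases it afterwards.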
@spaces.GPU
def generate_speech(model_choice, emotion, text):
    """Generate emotional speech from text using the selected EmoAct-MiMo model."""
    if not emotion or not emotion.strip():
        return None, "Please enter an emotion description."
    if not text or not text.strip():
        return None, "Please enter text to convert to speech."

    print(f"Using model: {model_choice}")
    print("Generating:", text)
    print("With emotion:", emotion)

    try:
        # Select the appropriate model
        model = models[model_choice]

        # Create a temporary output file; delete=False so it persists after
        # the with-block, letting Gradio read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Generate TTS, passing the emotion description as the instruction
        model.tts_sft(
            text=text.strip(),
            output_path=output_path,
            instruct=emotion.strip(),
        )

        return output_path, f"Speech generated successfully using {model_choice}!"
    except Exception as e:
        return None, f"Error: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
    gr.Markdown("""
    # EmoAct-MiMo: Emotion-Controllable Text-to-Speech

    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).

    This is still a very early experiment from early in the training run; I need to change a few settings and retrain, but the model already turned out quite nicely!

    The model may hallucinate, so try a few times to get good results.

    Voice cloning is not supported yet.
    """)
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=["EmoAct-MiMo v1.0 (Stable)", "EmoAct-MiMo v1.2 (Beta - Experimental)"],
                value="EmoAct-MiMo v1.0 (Stable)",
                label="Model Selection",
                info="v1.0 is the current stable model. v1.2 is a beta experimental version with potentially different characteristics.",
            )
            emotion_input = gr.Textbox(
                label="Emotion",
                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
                lines=3,
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to speak with emotion...",
                lines=5,
            )
            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
            )
    # Intense emotion examples
    gr.Examples(
        examples=[
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again.",
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?",
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive.",
            ],
            [
                "EmoAct-MiMo v1.0 (Stable)",
                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!",
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
                "What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside.",
            ],
            [
                "EmoAct-MiMo v1.2 (Beta - Experimental)",
                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
                "Of course they chose you. They always choose you. <laugh> Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
            ],
        ],
        inputs=[model_selector, emotion_input, text_input],
    )
    # Event handler
    generate_btn.click(
        fn=generate_speech,
        inputs=[model_selector, emotion_input, text_input],
        outputs=[audio_output, status_output],
    )
if __name__ == "__main__":
    demo.launch()