mrfakename committed on
Commit
20ca5a7
·
verified ·
1 Parent(s): a38af7a
Files changed (1) hide show
  1. app.py +39 -19
app.py CHANGED
@@ -119,22 +119,33 @@ merged_model.save_pretrained(merged_model_path)
119
  tokenizer.save_pretrained(merged_model_path)
120
  print(f"Merged model saved to {merged_model_path}")
121
 
122
- # Initialize MimoAudio with merged model
123
- print("Initializing MimoAudio wrapper...")
124
- model = MimoAudio(
125
  model_path=merged_model_path,
126
  mimo_audio_tokenizer_path=tokenizer_path
127
  )
128
- print("Model ready!")
 
 
 
 
 
 
 
129
 
130
  @spaces.GPU
131
- def generate_speech(emotion, text):
132
- """Generate emotional speech from text using EmoAct-MiMo"""
133
  if not emotion or not emotion.strip():
134
  return None, "Please enter an emotion description."
135
  if not text or not text.strip():
136
  return None, "Please enter text to convert to speech."
137
 
 
 
 
 
138
  print("Generating:", text)
139
  print("With emotion:", emotion)
140
  try:
@@ -142,11 +153,8 @@ def generate_speech(emotion, text):
142
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
143
  output_path = tmp_file.name
144
 
145
- # Format the instruction with emotion and text
146
- full_instruction = f"Emotion: {emotion.strip()}\nText: {text.strip()}"
147
-
148
  # Generate TTS with emotion instruction
149
- model.tts_sft(
150
  text=text.strip(),
151
  output_path=output_path,
152
  instruct=emotion.strip()
@@ -173,6 +181,12 @@ with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
173
 
174
  with gr.Row():
175
  with gr.Column():
 
 
 
 
 
 
176
  emotion_input = gr.Textbox(
177
  label="Emotion",
178
  placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
@@ -200,38 +214,44 @@ with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
200
  examples=[
201
  [
202
  "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
203
- "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
 
204
  ],
205
  [
206
  "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
207
- "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
 
208
  ],
209
  [
210
  "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
211
- "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
 
212
  ],
213
  [
214
  "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
215
- "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
 
216
  ],
217
  [
218
  "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
219
- "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
 
220
  ],
221
  [
222
  "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
223
- "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
 
224
  ]
225
  ],
226
- inputs=[emotion_input, text_input]
227
  )
228
 
229
  # Event handler
230
  generate_btn.click(
231
  fn=generate_speech,
232
- inputs=[emotion_input, text_input],
233
  outputs=[audio_output, status_output]
234
  )
235
 
236
  if __name__ == "__main__":
237
- demo.launch()
 
119
  tokenizer.save_pretrained(merged_model_path)
120
  print(f"Merged model saved to {merged_model_path}")
121
 
122
+ # Initialize both models
123
+ print("Initializing EmoAct-MiMo (merged) model...")
124
+ emoact_model = MimoAudio(
125
  model_path=merged_model_path,
126
  mimo_audio_tokenizer_path=tokenizer_path
127
  )
128
+ print("EmoAct-MiMo model ready!")
129
+
130
+ print("Initializing base MiMo-Audio model...")
131
+ base_mimo_model = MimoAudio(
132
+ model_path=base_model_path,
133
+ mimo_audio_tokenizer_path=tokenizer_path
134
+ )
135
+ print("Base MiMo-Audio model ready!")
136
 
137
  @spaces.GPU
138
+ def generate_speech(emotion, text, model_choice):
139
+ """Generate emotional speech from text"""
140
  if not emotion or not emotion.strip():
141
  return None, "Please enter an emotion description."
142
  if not text or not text.strip():
143
  return None, "Please enter text to convert to speech."
144
 
145
+ # Select model based on choice
146
+ selected_model = emoact_model if model_choice == "EmoAct-MiMo v1.1 (Beta)" else base_mimo_model
147
+
148
+ print(f"Using model: {model_choice}")
149
  print("Generating:", text)
150
  print("With emotion:", emotion)
151
  try:
 
153
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
154
  output_path = tmp_file.name
155
 
 
 
 
156
  # Generate TTS with emotion instruction
157
+ selected_model.tts_sft(
158
  text=text.strip(),
159
  output_path=output_path,
160
  instruct=emotion.strip()
 
181
 
182
  with gr.Row():
183
  with gr.Column():
184
+ model_selector = gr.Dropdown(
185
+ choices=["MiMo-Audio 7B (Default)", "EmoAct-MiMo v1.1 (Beta)"],
186
+ value="MiMo-Audio 7B (Default)",
187
+ label="Model",
188
+ info="EmoAct-MiMo v1.1 is a beta fine-tune with enhanced emotion control but may be less stable"
189
+ )
190
  emotion_input = gr.Textbox(
191
  label="Emotion",
192
  placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
 
214
  examples=[
215
  [
216
  "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
217
+ "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again.",
218
+ "MiMo-Audio 7B (Default)"
219
  ],
220
  [
221
  "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
222
+ "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?",
223
+ "MiMo-Audio 7B (Default)"
224
  ],
225
  [
226
  "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
227
+ "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive.",
228
+ "MiMo-Audio 7B (Default)"
229
  ],
230
  [
231
  "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
232
+ "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!",
233
+ "EmoAct-MiMo v1.1 (Beta)"
234
  ],
235
  [
236
  "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
237
+ "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside.",
238
+ "EmoAct-MiMo v1.1 (Beta)"
239
  ],
240
  [
241
  "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
242
+ "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
243
+ "EmoAct-MiMo v1.1 (Beta)"
244
  ]
245
  ],
246
+ inputs=[emotion_input, text_input, model_selector]
247
  )
248
 
249
  # Event handler
250
  generate_btn.click(
251
  fn=generate_speech,
252
+ inputs=[emotion_input, text_input, model_selector],
253
  outputs=[audio_output, status_output]
254
  )
255
 
256
  if __name__ == "__main__":
257
+ demo.launch()