mrfakename committed
Commit e6aa5b4 · verified · parent: 20ca5a7

Update app.py

Files changed (1)
  1. app.py +117 -75
app.py CHANGED
@@ -120,45 +120,57 @@ tokenizer.save_pretrained(merged_model_path)
 print(f"Merged model saved to {merged_model_path}")
 
 # Initialize both models
-print("Initializing EmoAct-MiMo (merged) model...")
-emoact_model = MimoAudio(
-    model_path=merged_model_path,
+print("Initializing base model...")
+base_mimo = MimoAudio(
+    model_path=base_model_path,
     mimo_audio_tokenizer_path=tokenizer_path
 )
-print("EmoAct-MiMo model ready!")
+print("Base model ready!")
 
-print("Initializing base MiMo-Audio model...")
-base_mimo_model = MimoAudio(
-    model_path=base_model_path,
+print("Initializing EmoAct model...")
+emoact_mimo = MimoAudio(
+    model_path=merged_model_path,
     mimo_audio_tokenizer_path=tokenizer_path
 )
-print("Base MiMo-Audio model ready!")
+print("EmoAct model ready!")
+
+# Store models in a dict for easy access
+models = {
+    "Base Model (MiMo-Audio-7B-Instruct)": base_mimo,
+    "EmoAct-MiMo v1.1 (Beta - Emotional)": emoact_mimo
+}
 
 @spaces.GPU
-def generate_speech(emotion, text, model_choice):
-    """Generate emotional speech from text"""
-    if not emotion or not emotion.strip():
-        return None, "Please enter an emotion description."
+def generate_speech(model_choice, emotion, text):
+    """Generate speech from text using selected model"""
     if not text or not text.strip():
         return None, "Please enter text to convert to speech."
-
-    # Select model based on choice
-    selected_model = emoact_model if model_choice == "EmoAct-MiMo v1.1 (Beta)" else base_mimo_model
+
+    # Select the appropriate model
+    selected_model = models[model_choice]
 
     print(f"Using model: {model_choice}")
     print("Generating:", text)
-    print("With emotion:", emotion)
+    if emotion and emotion.strip():
+        print("With emotion:", emotion)
+
     try:
         # Create temporary file for output
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
 
-        # Generate TTS with emotion instruction
-        selected_model.tts_sft(
-            text=text.strip(),
-            output_path=output_path,
-            instruct=emotion.strip()
-        )
+        # Generate TTS with or without emotion instruction
+        if emotion and emotion.strip():
+            selected_model.tts_sft(
+                text=text.strip(),
+                output_path=output_path,
+                instruct=emotion.strip()
+            )
+        else:
+            selected_model.tts_sft(
+                text=text.strip(),
+                output_path=output_path
+            )
 
         return output_path, "✅ Speech generated successfully!"
 
@@ -166,38 +178,36 @@ def generate_speech(emotion, text, model_choice):
         return None, f"❌ Error: {str(e)}"
 
 # Create Gradio interface
-with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
+with gr.Blocks(title="MiMo-Audio TTS") as demo:
     gr.Markdown("""
-    # 🎭 EmoAct-MiMo: Emotion-Controllable Text-to-Speech
-
-    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).
-
-    This is still a very early experiment and is very early in the training run, I need to change a few settings and retrain. But the model turned out quite nicely!
-
-    It may hallucinate, try a few times to get good results.
-
-    Voice cloning is not supported yet.
+    # 🎭 MiMo-Audio Text-to-Speech
+
+    Generate speech using MiMo-Audio models with optional emotion control.
     """)
 
     with gr.Row():
         with gr.Column():
             model_selector = gr.Dropdown(
-                choices=["MiMo-Audio 7B (Default)", "EmoAct-MiMo v1.1 (Beta)"],
-                value="MiMo-Audio 7B (Default)",
-                label="Model",
-                info="EmoAct-MiMo v1.1 is a beta fine-tune with enhanced emotion control but may be less stable"
+                choices=[
+                    "Base Model (MiMo-Audio-7B-Instruct)",
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)"
+                ],
+                value="Base Model (MiMo-Audio-7B-Instruct)",
+                label="Model Selection",
+                info="Base model is stable. EmoAct is a beta model for intense emotional speech."
             )
+
             emotion_input = gr.Textbox(
-                label="Emotion",
-                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
+                label="Emotion (Optional - works best with EmoAct model)",
+                placeholder="e.g., 'intense anger, rage, fury' or leave empty for neutral",
                 lines=3
             )
             text_input = gr.Textbox(
                 label="Text",
-                placeholder="Enter the text to speak with emotion...",
+                placeholder="Enter the text to speak...",
                 lines=5
             )
-            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")
+            generate_btn = gr.Button("Generate Speech", variant="primary")
 
         with gr.Column():
             audio_output = gr.Audio(
@@ -209,47 +219,79 @@ with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
                 interactive=False
             )
 
-    # Intense emotion examples
-    gr.Examples(
-        examples=[
-            [
-                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
-                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again.",
-                "MiMo-Audio 7B (Default)"
-            ],
-            [
-                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
-                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?",
-                "MiMo-Audio 7B (Default)"
-            ],
-            [
-                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
-                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive.",
-                "MiMo-Audio 7B (Default)"
-            ],
-            [
-                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
-                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!",
-                "EmoAct-MiMo v1.1 (Beta)"
+    gr.Markdown("""
+    ### Model Information
+
+    **Base Model (MiMo-Audio-7B-Instruct)**: The original stable model from Xiaomi. Best for general text-to-speech.
+
+    **EmoAct-MiMo v1.1 (Beta)**: An experimental emotional model fine-tuned for intense emotional expressions.
+    - ⚠️ **Beta warning**: This is an early experiment and may hallucinate or produce unexpected results
+    - Works best with detailed emotion descriptions
+    - Currently does not support voice cloning
+    - Try multiple times for best results
+    """)
+
+    # Examples for both models
+    gr.Markdown("### Examples")
+
+    with gr.Tab("Base Model Examples"):
+        gr.Examples(
+            examples=[
+                [
+                    "Base Model (MiMo-Audio-7B-Instruct)",
+                    "",
+                    "Hello, welcome to MiMo Audio text to speech. This is the base model speaking in a neutral tone."
+                ],
+                [
+                    "Base Model (MiMo-Audio-7B-Instruct)",
+                    "",
+                    "The quick brown fox jumps over the lazy dog. This is a test of the text to speech system."
+                ],
             ],
-            [
-                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
-                "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside.",
-                "EmoAct-MiMo v1.1 (Beta)"
+            inputs=[model_selector, emotion_input, text_input]
+        )
+
+    with gr.Tab("EmoAct Emotional Examples"):
+        gr.Examples(
+            examples=[
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
+                    "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again."
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
+                    "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?"
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
+                    "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive."
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
+                    "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!"
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
+                    "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside."
+                ],
+                [
+                    "EmoAct-MiMo v1.1 (Beta - Emotional)",
+                    "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
+                    "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have."
+                ],
             ],
-            [
-                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
-                "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
-                "EmoAct-MiMo v1.1 (Beta)"
-            ]
-        ],
-        inputs=[emotion_input, text_input, model_selector]
-    )
+            inputs=[model_selector, emotion_input, text_input]
+        )
 
     # Event handler
     generate_btn.click(
         fn=generate_speech,
-        inputs=[emotion_input, text_input, model_selector],
+        inputs=[model_selector, emotion_input, text_input],
         outputs=[audio_output, status_output]
    )
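
For readers skimming the hunks above, the functional change can be condensed into a small standalone sketch: model selection now goes through a dict keyed by the dropdown label, and the emotion instruction is only forwarded to tts_sft when the field is non-empty. The FakeModel class below is a hypothetical stand-in for MimoAudio so the snippet runs on its own; the names models, generate_speech, tts_sft, and instruct are taken from the diff, everything else is illustrative.

import tempfile


class FakeModel:
    """Hypothetical stand-in for MimoAudio, matching the tts_sft call shape used in app.py."""

    def tts_sft(self, text, output_path, instruct=None):
        print(f"tts_sft(text={text!r}, output_path={output_path!r}, instruct={instruct!r})")


# Dropdown label -> model instance, mirroring the new module-level `models` dict.
models = {
    "Base Model (MiMo-Audio-7B-Instruct)": FakeModel(),
    "EmoAct-MiMo v1.1 (Beta - Emotional)": FakeModel(),
}


def generate_speech(model_choice, emotion, text):
    selected_model = models[model_choice]

    # delete=False keeps the .wav path valid after the with-block exits.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        output_path = tmp_file.name

    # The emotion instruction is now optional: it is only forwarded when non-empty.
    if emotion and emotion.strip():
        selected_model.tts_sft(text=text.strip(), output_path=output_path, instruct=emotion.strip())
    else:
        selected_model.tts_sft(text=text.strip(), output_path=output_path)
    return output_path


generate_speech("Base Model (MiMo-Audio-7B-Instruct)", "", "Hello world.")        # no instruct passed
generate_speech("EmoAct-MiMo v1.1 (Beta - Emotional)", "intense joy", "Hello!")   # instruct passed

Because delete=False is used, the handler never removes the generated .wav files itself; on a Space they presumably accumulate in ephemeral temp storage until the instance restarts.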
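
On the UI side, the commit splits examples into per-model tabs while keeping a single set of inputs. The sketch below is a pared-down illustration of that wiring, not the full app: it assumes gradio is installed, and the example strings are shortened placeholders.

import gradio as gr

with gr.Blocks(title="MiMo-Audio TTS (layout sketch)") as demo:
    # Shared inputs, in the same order the click handler and examples expect.
    model_selector = gr.Dropdown(
        choices=[
            "Base Model (MiMo-Audio-7B-Instruct)",
            "EmoAct-MiMo v1.1 (Beta - Emotional)",
        ],
        value="Base Model (MiMo-Audio-7B-Instruct)",
        label="Model Selection",
    )
    emotion_input = gr.Textbox(label="Emotion (Optional)", lines=3)
    text_input = gr.Textbox(label="Text", lines=5)

    # One gr.Examples block per tab; both write into the same three components,
    # so clicking an example also switches the selected model.
    with gr.Tab("Base Model Examples"):
        gr.Examples(
            examples=[["Base Model (MiMo-Audio-7B-Instruct)", "", "A neutral test sentence."]],
            inputs=[model_selector, emotion_input, text_input],
        )
    with gr.Tab("EmoAct Emotional Examples"):
        gr.Examples(
            examples=[["EmoAct-MiMo v1.1 (Beta - Emotional)", "intense anger", "A short angry line."]],
            inputs=[model_selector, emotion_input, text_input],
        )

if __name__ == "__main__":
    demo.launch()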