ceymox committed
Commit f47559b · verified · 1 parent: 2ab0984

Create app.py

Files changed (1): app.py (+606, -0)
app.py ADDED
@@ -0,0 +1,606 @@
import os
import io
import time
import torch
import librosa
import requests
import tempfile
import threading
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import AutoModel, logging as trf_logging
from huggingface_hub import login

# Enable verbose logging for transformers
trf_logging.set_verbosity_info()

# Login (optional)
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    print("🔐 Logging into Hugging Face with token...")
    login(token=hf_token)
else:
    print("⚠️ HF_TOKEN not found. Proceeding without login...")

# Select GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Using device: {device}")

# Initialize model variable
model = None

# Define the repository ID
repo_id = "ai4bharat/IndicF5"

# Improved model loading with error handling
try:
    print(f"Loading {repo_id} model...")
    # Try direct loading first
    model = AutoModel.from_pretrained(
        repo_id,
        trust_remote_code=True,
        revision="main"
    ).to(device)
    print(f"Model loaded successfully! Type: {type(model)}")

    # Check model attributes
    model_methods = [method for method in dir(model) if not method.startswith('_') and callable(getattr(model, method))]
    print(f"Available model methods: {model_methods[:10]}...")

except Exception as e:
    print(f"⚠️ Error loading model directly: {e}")

    try:
        # Try loading with local_files_only if the model is cached
        model = AutoModel.from_pretrained(
            repo_id,
            trust_remote_code=True,
            local_files_only=True
        ).to(device)
        print("Model loaded from cache!")
    except Exception as e2:
        print(f"❌ All attempts to load model failed: {e2}")

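# Note: trust_remote_code=True executes the custom modeling code shipped in the
# Hugging Face repository (required for architectures like IndicF5 that are not
# built into transformers), so it should only be enabled for repositories you trust.
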
# Advanced audio processing functions
def remove_noise(audio_data, threshold=0.01):
    """Apply a simple noise gate to remove low-level noise"""
    if audio_data is None:
        return np.zeros(1000)

    # Convert to numpy if needed
    if isinstance(audio_data, torch.Tensor):
        audio_data = audio_data.detach().cpu().numpy()
    if isinstance(audio_data, list):
        audio_data = np.array(audio_data)

    # Apply noise gate
    noise_mask = np.abs(audio_data) < threshold
    clean_audio = audio_data.copy()
    clean_audio[noise_mask] = 0

    return clean_audio

def apply_smoothing(audio_data, window_size=5):
    """Apply gentle smoothing to reduce artifacts"""
    if audio_data is None or len(audio_data) < window_size * 2:
        return audio_data

    # Simple moving average filter
    kernel = np.ones(window_size) / window_size
    smoothed = np.convolve(audio_data, kernel, mode='same')

    # Keep original at the edges
    smoothed[:window_size] = audio_data[:window_size]
    smoothed[-window_size:] = audio_data[-window_size:]

    return smoothed

def enhance_audio(audio_data):
    """Process audio to improve quality and reduce noise"""
    if audio_data is None:
        return np.zeros(1000)

    # Ensure numpy array
    if isinstance(audio_data, torch.Tensor):
        audio_data = audio_data.detach().cpu().numpy()
    if isinstance(audio_data, list):
        audio_data = np.array(audio_data)

    # Ensure correct shape and dtype
    if len(audio_data.shape) > 1:
        audio_data = audio_data.flatten()
    if audio_data.dtype != np.float32:
        audio_data = audio_data.astype(np.float32)

    # Skip processing if audio is empty or too short
    if audio_data.size < 100:
        return audio_data

    # Check whether the audio has a reasonable amplitude
    rms = np.sqrt(np.mean(audio_data**2))
    print(f"Initial RMS: {rms}")

    # Apply gain if needed
    if rms < 0.05:  # Very quiet
        target_rms = 0.2
        gain = target_rms / max(rms, 0.0001)
        print(f"Applying gain factor: {gain}")
        audio_data = audio_data * gain

    # Remove DC offset
    audio_data = audio_data - np.mean(audio_data)

    # Apply noise gate to remove low-level noise
    audio_data = remove_noise(audio_data, threshold=0.01)

    # Apply gentle smoothing to reduce artifacts
    audio_data = apply_smoothing(audio_data, window_size=3)

    # Apply soft limiting to prevent clipping
    max_amp = np.max(np.abs(audio_data))
    if max_amp > 0.95:
        audio_data = 0.95 * audio_data / max_amp

    # Apply subtle compression for better audibility
    audio_data = np.tanh(audio_data * 1.1) * 0.9

    return audio_data
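
# Note on the final stage above: np.tanh(x * 1.1) * 0.9 acts as a soft limiter.
# It bounds every sample to (-0.9, 0.9) while staying close to unity gain for
# small signals (the slope at zero is 1.1 * 0.9 = 0.99), so quiet material is
# almost untouched while peaks are rounded off instead of clipped.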

# Load audio from URL with improved error handling
def load_audio_from_url(url):
    print(f"Downloading reference audio from {url}")
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            try:
                # Save content to a temp file
                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
                temp_file.write(response.content)
                temp_file.close()
                print(f"Saved reference audio to temp file: {temp_file.name}")

                # Try different methods to read the audio file
                audio_data = None
                sample_rate = None

                # Try SoundFile first
                try:
                    audio_data, sample_rate = sf.read(temp_file.name)
                    print(f"Audio loaded with SoundFile: {sample_rate}Hz, {len(audio_data)} samples")
                except Exception as sf_error:
                    print(f"SoundFile failed: {sf_error}")

                    # Try librosa as a fallback
                    try:
                        audio_data, sample_rate = librosa.load(temp_file.name, sr=None)
                        print(f"Audio loaded with librosa: {sample_rate}Hz, shape={audio_data.shape}")
                    except Exception as lr_error:
                        print(f"Librosa also failed: {lr_error}")

                # Clean up temp file
                os.unlink(temp_file.name)

                if audio_data is not None:
                    # Apply audio enhancement to the reference
                    audio_data = enhance_audio(audio_data)
                    return sample_rate, audio_data

            except Exception as e:
                print(f"Failed to process audio data: {e}")
        else:
            print(f"Failed to download audio: status code {response.status_code}")
    except Exception as e:
        print(f"Error downloading audio: {e}")

    # Return default values as fallback
    print("⚠️ Returning default silence as reference audio")
    return 24000, np.zeros(int(24000))  # 1 second of silence at 24 kHz
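
# Note: load_audio_from_url never returns None; on any failure it falls back to
# one second of silence at 24 kHz, so callers always get a (sample_rate, audio) tuple.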

# Split text into chunks for streaming
def split_into_chunks(text, max_length=30):
    """Split text into smaller chunks based on punctuation and length"""
    # First split by sentences
    sentence_markers = ['.', '?', '!', ';', ':', '।', '॥']
    chunks = []
    current = ""

    # Initial coarse splitting by sentence markers
    for char in text:
        current += char
        if char in sentence_markers and current.strip():
            chunks.append(current.strip())
            current = ""

    if current.strip():
        chunks.append(current.strip())

    # Further break down long sentences
    final_chunks = []
    for chunk in chunks:
        if len(chunk) <= max_length:
            final_chunks.append(chunk)
        else:
            # Try splitting by commas for long sentences
            comma_splits = chunk.split(',')
            current_part = ""

            for part in comma_splits:
                if len(current_part) + len(part) <= max_length:
                    if current_part:
                        current_part += ","
                    current_part += part
                else:
                    if current_part:
                        final_chunks.append(current_part.strip())
                    current_part = part

            if current_part:
                final_chunks.append(current_part.strip())

    print(f"Split text into {len(final_chunks)} chunks")
    return final_chunks
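
# Illustrative example with the default max_length=30:
#   split_into_chunks("Hello there. How are you today, my friend?")
#   -> ["Hello there.", "How are you today, my friend?"]
# The second sentence is 29 characters, so it escapes the comma-splitting pass.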

# Improved model wrapper
class ModelWrapper:
    def __init__(self, model):
        self.model = model
        print(f"Model wrapper initialized with model type: {type(model)}")

        # Discover the appropriate generation method
        self.generation_method = self._find_generation_method()

    def _find_generation_method(self):
        """Find the appropriate method to generate speech"""
        if self.model is None:
            return None

        # Look for plausible generation methods
        candidates = [
            "generate_speech", "tts", "generate_audio", "synthesize",
            "generate", "forward", "__call__"
        ]

        # Check for methods containing these keywords
        for name in dir(self.model):
            if any(candidate in name.lower() for candidate in candidates):
                print(f"Found potential generation method: {name}")
                return name

        # If nothing specific is found, default to __call__
        print("No specific generation method found, will use __call__")
        return "__call__"

    def generate(self, text, ref_audio_path, ref_text, **kwargs):
        """Generate speech with improved error handling and preprocessing"""
        print("\n==== MODEL INFERENCE ====")
        print(f"Text input: '{text}'")
        print(f"Reference audio path: {ref_audio_path}")

        # Check that the reference audio file exists
        if not os.path.exists(ref_audio_path):
            print("⚠️ Reference audio file not found")
            return None

        # Try different calling approaches
        result = None
        method_name = self.generation_method if self.generation_method else "__call__"

        # Set up different parameter combinations to try
        param_combinations = [
            # First try: standard keyword parameters
            {"text": text, "ref_audio_path": ref_audio_path, "ref_text": ref_text},
            # Second try: alternative parameter names
            {"text": text, "reference_audio": ref_audio_path, "speaker_text": ref_text},
            # Third try: just text and audio
            {"text": text, "reference_audio": ref_audio_path},
            # Fourth try: just text
            {"text": text},
            # Fifth try: positional arguments (handled below)
            {}
        ]

        # Try each parameter combination
        for i, params in enumerate(param_combinations):
            try:
                method = getattr(self.model, method_name)
                print(f"Attempt {i+1}: Calling model.{method_name} with {list(params.keys())} parameters")

                # The positional-arguments case
                if not params:
                    result = method(text, ref_audio_path, ref_text, **kwargs)
                else:
                    result = method(**params, **kwargs)

                print(f"✓ Call succeeded with parameters: {list(params.keys())}")
                break  # Exit the loop on success

            except Exception as e:
                print(f"✗ Attempt {i+1} failed: {str(e)[:100]}...")
                continue

        # Process the result
        if result is not None:
            # Handle tuple results (might be (audio, sample_rate))
            if isinstance(result, tuple):
                result = result[0]  # Extract the first element, assuming it is the audio

            # Convert a torch tensor to numpy if needed
            if isinstance(result, torch.Tensor):
                result = result.detach().cpu().numpy()

            # Ensure the array is 1D
            if hasattr(result, 'shape') and len(result.shape) > 1:
                result = result.flatten()

            # Apply advanced audio processing to improve quality
            result = enhance_audio(result)

            return result
        else:
            print("❌ All inference attempts failed")
            return np.zeros(int(24000))  # Return 1 second of silence as fallback
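
# Illustrative direct use of the wrapper (assuming the model loaded and, as the
# rest of this script does, a 24 kHz output sample rate):
#   wrapper = ModelWrapper(model)
#   audio = wrapper.generate("text to speak", "ref_audio.wav", "reference transcript")
#   sf.write("out.wav", audio, 24000)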

# Create model wrapper
model_wrapper = ModelWrapper(model) if model is not None else None

# Streaming TTS class with improved audio quality
class StreamingTTS:
    def __init__(self):
        self.is_generating = False
        self.should_stop = False
        self.ref_audio_path = None
        self.output_file = None
        self.all_chunks = []
        self.sample_rate = 24000  # Default sample rate

        # Create temp directory
        self.temp_dir = tempfile.mkdtemp()
        print(f"Created temp directory: {self.temp_dir}")

    def prepare_ref_audio(self, ref_audio, ref_sr):
        """Prepare reference audio with enhanced quality"""
        try:
            if self.ref_audio_path is None:
                self.ref_audio_path = os.path.join(self.temp_dir, "ref_audio.wav")

            # Process the reference audio to ensure clean quality
            ref_audio = enhance_audio(ref_audio)

            # Save the reference audio
            sf.write(self.ref_audio_path, ref_audio, ref_sr, format='WAV', subtype='FLOAT')
            print(f"Saved reference audio to: {self.ref_audio_path}")

            # Verify the file was created
            if os.path.exists(self.ref_audio_path):
                print(f"Reference audio saved successfully: {os.path.getsize(self.ref_audio_path)} bytes")
            else:
                print("⚠️ Failed to create reference audio file!")

            # Create output file
            if self.output_file is None:
                self.output_file = os.path.join(self.temp_dir, "output.wav")
                print(f"Output will be saved to: {self.output_file}")
        except Exception as e:
            print(f"Error preparing reference audio: {e}")

    def cleanup(self):
        """Clean up temporary files"""
        if self.temp_dir:
            try:
                # Guard against paths that were never created
                if self.ref_audio_path and os.path.exists(self.ref_audio_path):
                    os.remove(self.ref_audio_path)
                if self.output_file and os.path.exists(self.output_file):
                    os.remove(self.output_file)
                os.rmdir(self.temp_dir)
                self.temp_dir = None
                print("Cleaned up temporary files")
            except Exception as e:
                print(f"Error cleaning up: {e}")

    def generate(self, text, ref_audio, ref_sr, ref_text):
        """Start generation in a new thread"""
        if self.is_generating:
            print("Already generating speech, please wait")
            return

        # Check that the model is loaded
        if model_wrapper is None:
            print("⚠️ Model is not loaded. Cannot generate speech.")
            return

        self.is_generating = True
        self.should_stop = False
        self.all_chunks = []

        # Start in a new thread
        threading.Thread(
            target=self._process_streaming,
            args=(text, ref_audio, ref_sr, ref_text),
            daemon=True
        ).start()

    def _process_streaming(self, text, ref_audio, ref_sr, ref_text):
        """Process text in chunks with high-quality audio generation"""
        try:
            # Prepare reference audio
            self.prepare_ref_audio(ref_audio, ref_sr)

            # Split text into smaller chunks for faster processing
            chunks = split_into_chunks(text)
            print(f"Processing {len(chunks)} chunks")

            combined_audio = None
            total_start_time = time.time()

            # Process each chunk
            for i, chunk in enumerate(chunks):
                if self.should_stop:
                    print("Stopping generation as requested")
                    break

                chunk_start = time.time()
                print(f"Processing chunk {i+1}/{len(chunks)}: {chunk}")

                # Generate speech for this chunk
                try:
                    with torch.inference_mode():
                        chunk_audio = model_wrapper.generate(
                            chunk,
                            self.ref_audio_path,
                            ref_text
                        )

                    if chunk_audio is None or (hasattr(chunk_audio, 'size') and chunk_audio.size == 0):
                        print("⚠️ Empty audio returned for this chunk")
                        chunk_audio = np.zeros(int(24000 * 0.5))  # 0.5 s silence

                    # Process the audio to improve quality
                    chunk_audio = enhance_audio(chunk_audio)

                    chunk_time = time.time() - chunk_start
                    print(f"✓ Chunk {i+1} processed in {chunk_time:.2f}s")

                    # Add a small silence between chunks
                    silence = np.zeros(int(24000 * 0.1))  # 0.1 s silence
                    chunk_audio = np.concatenate([chunk_audio, silence])

                    # Add to our collection
                    self.all_chunks.append(chunk_audio)

                    # Combine all chunks so far
                    if combined_audio is None:
                        combined_audio = chunk_audio
                    else:
                        combined_audio = np.concatenate([combined_audio, chunk_audio])

                    # Process combined audio for consistent quality
                    processed_audio = enhance_audio(combined_audio)

                    # Write intermediate output
                    sf.write(self.output_file, processed_audio, 24000, format='WAV', subtype='FLOAT')

                except Exception as e:
                    print(f"Error processing chunk {i+1}: {str(e)[:100]}")
                    continue

            total_time = time.time() - total_start_time
            print(f"Total generation time: {total_time:.2f}s")

        except Exception as e:
            print(f"Error in streaming TTS: {str(e)[:100]}")
        finally:
            self.is_generating = False
            print("Generation complete")

    def get_current_audio(self):
        """Get the current audio file path for Gradio"""
        if self.output_file and os.path.exists(self.output_file):
            file_size = os.path.getsize(self.output_file)
            if file_size > 0:
                return self.output_file
        return None

    def stop(self):
        """Stop generation"""
        self.should_stop = True
        print("Stop request received")
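
# Typical flow (illustrative):
#   tts = StreamingTTS()
#   tts.generate(text, ref_audio, ref_sr, ref_text)  # returns immediately; work runs in a daemon thread
#   tts.get_current_audio()                          # poll for the latest intermediate output.wav
#   tts.stop()                                       # request a stop after the current chunk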

# Load reference example (Malayalam)
EXAMPLES = [{
    "audio_url": "https://raw.githubusercontent.com/Aparna0112/voicerecording-_TTS/main/KC%20Voice.wav",
    "ref_text": "ഹലോ ഇത് അപരനെ അല്ലേ ഞാൻ ജഗദീപ് ആണ് വിളിക്കുന്നത് ഇപ്പോൾ ഫ്രീയാണോ സംസാരിക്കാമോ ",
    "synth_text": "ഞാൻ മലയാളം സംസാരിക്കാൻ കഴിയുന്നു."
}]

print("\nPreloading reference audio...")
ref_sr, ref_audio = load_audio_from_url(EXAMPLES[0]["audio_url"])

if ref_audio is None:
    print("⚠️ Failed to load reference audio. Using silence instead.")
    ref_audio = np.zeros(int(24000))
    ref_sr = 24000

# Initialize streaming TTS
streaming_tts = StreamingTTS()

# Stop button callback
def stop_generation():
    streaming_tts.stop()
    return "Generation stopped"

# Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("## 🚀 IndicF5 Malayalam TTS")

    with gr.Row():
        gr.Markdown("### System Status:")
        system_status = gr.Markdown(
            f"- Device: {device}\n"
            f"- Model loaded: {'Yes' if model is not None else 'No'}\n"
            f"- Reference audio: {'Loaded' if ref_audio is not None else 'Not loaded'}"
        )

    with gr.Row():
        text_input = gr.Textbox(
            label="Malayalam Text",
            placeholder="Enter text here...",
            lines=3,
            value=EXAMPLES[0]["synth_text"] if EXAMPLES else "ഹലോ, എന്തൊക്കെ ഉണ്ട് വിശേഷം?"
        )

    with gr.Row():
        generate_btn = gr.Button("🎤 Generate Speech", variant="primary")
        stop_btn = gr.Button("🛑 Stop Generation", variant="secondary")

    # Status indicator
    status_text = gr.Textbox(label="Status", value="Ready", interactive=False)

    # Audio output
    output_audio = gr.Audio(
        label="Generated Speech",
        type="filepath",
        autoplay=True
    )

    # Debug information (hidden by default)
    with gr.Accordion("Advanced", open=False):
        debug_output = gr.Textbox(label="Debug Log", value="", lines=5)

    def start_generation(text):
        if not text.strip():
            return None, "Please enter some text", "Error: Empty text input"

        if model is None:
            return None, "⚠️ Model not loaded. Cannot generate speech.", "Error: Model not loaded"

        if ref_audio is None:
            return None, "⚠️ Reference audio not loaded. Cannot generate speech.", "Error: Reference audio not loaded"

        # Capture stdout so generation logs show up in the debug panel
        from contextlib import redirect_stdout
        f = io.StringIO()
        with redirect_stdout(f):
            streaming_tts.generate(text, ref_audio, ref_sr, EXAMPLES[0]["ref_text"] if EXAMPLES else "")

        debug_log = f.getvalue()

        # Give the background thread a moment to write the first audio
        time.sleep(1.5)

        audio_path = streaming_tts.get_current_audio()
        if audio_path and os.path.exists(audio_path) and os.path.getsize(audio_path) > 0:
            return audio_path, "Generation started - audio playing", debug_log
        else:
            return None, "Starting generation... please wait", debug_log

    generate_btn.click(start_generation, inputs=text_input, outputs=[output_audio, status_text, debug_output])
    stop_btn.click(stop_generation, inputs=None, outputs=status_text)

# Cleanup on exit
import atexit

def exit_handler():
    streaming_tts.cleanup()

atexit.register(exit_handler)

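# To run locally (assumed typical setup): install gradio, torch, transformers,
# librosa, soundfile, requests and numpy, then launch with `python app.py`.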
print("Starting Gradio interface...")
iface.launch()