Spaces:

grgsaliba
/

voice-denoising

Sleeping

App Files Files Community

grgsaliba committed on Oct 12

Commit

f7fb413

verified ·

1 Parent(s): 92858ca

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +262 -93

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ import soundfile as sf
 import tempfile
 import os
 from scipy import signal
 # Note: In production, you would load a trained model
 # For this demo, we'll use a simple spectral subtraction approach
@@ -17,33 +19,33 @@ def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10):
     """
     Simple spectral subtraction for demonstration
     In production, this would use the trained DTLN model
     Args:
         audio: Input audio array
         sample_rate: Sampling rate
         noise_reduction_db: Amount of noise reduction in dB
     Returns:
         Denoised audio array
     """
     # Compute STFT
     f, t, Zxx = signal.stft(audio, fs=sample_rate, nperseg=512)
     # Estimate noise from first 0.3 seconds
     noise_frames = int(0.3 * len(t))
     noise_estimate = np.mean(np.abs(Zxx[:, :noise_frames]), axis=1, keepdims=True)
     # Spectral subtraction
     magnitude = np.abs(Zxx)
     phase = np.angle(Zxx)
     # Subtract noise estimate (with floor)
     alpha = 10 ** (noise_reduction_db / 20)
     magnitude_cleaned = np.maximum(magnitude - alpha * noise_estimate, 0.1 * magnitude)
     # Reconstruct complex spectrum
     Zxx_cleaned = magnitude_cleaned * np.exp(1j * phase)
     # Inverse STFT
     _, audio_cleaned = signal.istft(Zxx_cleaned, fs=sample_rate)
@@ -62,63 +64,63 @@ def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10):
 def process_audio(audio_file, noise_reduction):
     """
     Load an audio file, denoise it, and build a human-readable report.
     The file is read with soundfile, down-mixed to mono, resampled to
     16 kHz when necessary, peak-normalized to 0.95 and passed through
     spectral_subtraction_denoise.
     Args:
         audio_file: Path to uploaded audio file (None when nothing was uploaded)
         noise_reduction: Noise reduction strength (0-20 dB)
     Returns:
         Tuple of (audio, info):
         - audio: (sample_rate, denoised float32 array) on success, None otherwise
         - info: Report text on success, otherwise a prompt or error message
     """
     if audio_file is None:
         return None, "Please upload an audio file"
     # This is a UI handler boundary: any failure is reported as text
     # instead of propagating into the web framework.
     try:
         # Load audio
         audio, sample_rate = sf.read(audio_file)
         # Convert to mono if the file has a channel axis
         if audio.ndim > 1:
             audio = np.mean(audio, axis=1)
         # Resample to 16kHz if needed (DTLN's native sample rate)
         if sample_rate != 16000:
             num_samples = int(len(audio) * 16000 / sample_rate)
             audio = signal.resample(audio, num_samples)
             sample_rate = 16000
         # Normalize input; the epsilon keeps all-zero input from dividing by zero
         audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95
         # Apply denoising
         # Note: In production, this would use the trained DTLN model
         # NOTE(review): assumes the helper returns an array with the same
         # length as its input; confirm, otherwise the subtraction below fails.
         denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)
         # Ratio of input power to the power of what was removed, in dB.
         # Both terms carry an epsilon so silent input reports 0 dB, not -inf.
         noise = audio - denoised
         signal_power = np.mean(audio ** 2)
         noise_power = np.mean(noise ** 2)
         snr_improvement = 10 * np.log10((signal_power + 1e-10) / (noise_power + 1e-10))
         info = f"""
         ✅ Processing Complete!
         📊 Audio Info:
         - Duration: {len(audio)/sample_rate:.2f}s
         - Sample Rate: {sample_rate} Hz
         - Length: {len(audio):,} samples
         📈 Quality Metrics:
         - SNR Improvement: {snr_improvement:.2f} dB
         - Noise Reduction: {noise_reduction} dB
         ⚠️ Note: This demo uses spectral subtraction for demonstration.
         The actual DTLN model provides superior quality when trained!
         """
         return (sample_rate, denoised.astype(np.float32)), info
     except Exception as e:
         return None, f"❌ Error processing audio: {str(e)}"
@@ -128,32 +130,125 @@ def generate_demo_audio():
     sample_rate = 16000
     duration = 3.0
     t = np.linspace(0, duration, int(duration * sample_rate))
     # Generate synthetic speech
     speech = (
         0.3 * np.sin(2 * np.pi * 200 * t) +
         0.2 * np.sin(2 * np.pi * 400 * t) +
         0.15 * np.sin(2 * np.pi * 600 * t)
     )
     # Add speech-like envelope
     envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
     speech = speech * envelope
     # Add noise
     noise = np.random.randn(len(t)) * 0.2
     noisy = speech + noise
     # Normalize
     noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95
     # Save to temporary file
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
     sf.write(temp_file.name, noisy.astype(np.float32), sample_rate)
     return temp_file.name
 # Custom CSS
 custom_css = """
 .gradio-container {
@@ -167,7 +262,7 @@ custom_css = """
     background: linear-gradient(90deg, #45a049, #4CAF50);
 }
 #component-0 {
-    max-width: 900px;
     margin: auto;
     padding: 20px;
 }
@@ -188,53 +283,121 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     ---
     """)
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("### 📤 Input")
-            audio_input = gr.Audio(
-                label="Upload Noisy Audio",
-                type="filepath"
-            )
-            noise_reduction = gr.Slider(
-                minimum=0,
-                maximum=20,
-                value=10,
-                step=1,
-                label="Noise Reduction Strength (dB)",
-                info="Higher values remove more noise but may affect speech quality"
-            )
             with gr.Row():
-                process_btn = gr.Button("🔄 Denoise Audio", variant="primary", size="lg")
-                demo_btn = gr.Button("🎵 Try Demo Audio", variant="secondary")
-        with gr.Column():
-            gr.Markdown("### 📥 Output")
-            audio_output = gr.Audio(
-                label="Denoised Audio",
-                type="numpy"
-            )
-            info_output = gr.Textbox(
-                label="Processing Info",
-                lines=12,
-                max_lines=12
             )
     # About section
     with gr.Accordion("📖 About This Model", open=False):
         gr.Markdown("""
         ### DTLN Architecture
         **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:
         - **Two-stage processing**: Magnitude estimation → Final enhancement
         - **LSTM-based**: Captures temporal dependencies in speech
         - **<1M parameters**: Lightweight for edge deployment
         - **Frequency + Time domain**: Processes both domains for better quality
         ### Edge Hardware Acceleration
         Compatible with various edge AI accelerators:
@@ -242,78 +405,78 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         - **CPU**: ARM Cortex-M series
         - **Quantization**: 8-bit and 16-bit integer operations
         - **Memory**: Optimized for constrained devices
         ### Performance Targets
         | Metric | Value |
         |--------|-------|
         | Model Size | ~100 KB (INT8) |
         | Latency | 3-6 ms |
         | Power | 30-40 mW |
         | SNR Improvement | 10-15 dB |
         ---
         ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
         Download the full implementation to train and deploy the actual DTLN model!
         """)
-    # Training guide section
     with gr.Accordion("🛠️ Training & Deployment Guide", open=False):
         gr.Markdown("""
         ### Quick Start
         ```bash
         # 1. Install dependencies
         pip install -r requirements.txt
         # 2. Train model
         python train_dtln.py \\
             --clean-dir ./data/clean_speech \\
             --noise-dir ./data/noise \\
             --epochs 50 \\
             --batch-size 16
         # 3. Convert to TFLite INT8
         python convert_to_tflite.py \\
             --model ./models/best_model.h5 \\
             --output ./models/dtln_ethos_u55.tflite \\
             --calibration-dir ./data/clean_speech
         # 4. (Optional) Optimize for hardware accelerator
         vela --accelerator-config ethos-u55-256 \\
              --system-config Ethos_U55_High_End_Embedded \\
              ./models/dtln_ethos_u55.tflite
         ```
         ### Download Full Implementation
         The complete training and deployment code is available in the Files tab →
         Includes:
         - `dtln_ethos_u55.py` - Model architecture
         - `train_dtln.py` - Training with QAT
         - `convert_to_tflite.py` - TFLite conversion
         - `alif_e7_voice_denoising_guide.md` - Complete guide
         - `example_usage.py` - Usage examples
         ### Resources
         - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
         - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
         - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
         """)
     # Tech specs section
     with gr.Accordion("⚙️ Technical Specifications", open=False):
         gr.Markdown("""
         ### Model Architecture Details
         **Input**: Raw audio waveform @ 16kHz
         - Frame length: 512 samples (32ms)
         - Frame shift: 128 samples (8ms)
         - Frequency bins: 257 (FFT size 512)
         **Network Structure**:
         ```
         Input Audio (16kHz)
@@ -334,20 +497,20 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
             ↓
         Output Audio (16kHz)
         ```
         **Training Configuration**:
         - Loss: Combined time + frequency domain MSE
         - Optimizer: Adam (lr=0.001)
         - Batch size: 16
         - Epochs: 50
         - Quantization: INT8 post-training quantization
         **Memory Footprint**:
         - Model weights: ~80 KB (INT8)
         - Tensor arena: ~100 KB
         - Audio buffers: ~2 KB
         - **Total**: ~200 KB
         ### Edge Device Deployment
         **Hardware Utilization**:
@@ -355,12 +518,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         - CPU: For FFT operations (CMSIS-DSP)
         - Memory: Optimized buffer management
         - Peripherals: I2S/PDM for audio I/O
         **Power Profile**:
         - Active inference: 30-40 mW
         - Idle: <1 mW
         - Average (50% duty): ~15-20 mW
         **Real-time Constraints**:
         - Frame processing: 8ms available
         - FFT: ~1ms
@@ -368,7 +531,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         - IFFT + overhead: ~2ms
         - **Margin**: ~1ms
         """)
     # Event handlers
     process_btn.click(
         fn=process_audio,
@@ -381,15 +544,21 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         inputs=[],
         outputs=[audio_input]
     )
     # Footer
     gr.Markdown("""
     ---
     ### 📚 Citation
     If you use this model in your research, please cite:
     ```bibtex
     @inproceedings{westhausen2020dtln,
       title={Dual-signal transformation LSTM network for real-time noise suppression},
@@ -398,9 +567,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
       year={2020}
     }
     ```
     ---
     <div style="text-align: center; color: #666;">
         Built for <b>Edge AI</b> • Optimized for <b>Microcontrollers</b> •
         <a href="https://github.com/breizhn/DTLN">Original DTLN</a>

 import tempfile
 import os
 from scipy import signal
+import zipfile
+from pathlib import Path
 # Note: In production, you would load a trained model
 # For this demo, we'll use a simple spectral subtraction approach
     """
     Simple spectral subtraction for demonstration
     In production, this would use the trained DTLN model
     Args:
         audio: Input audio array
         sample_rate: Sampling rate
         noise_reduction_db: Amount of noise reduction in dB
     Returns:
         Denoised audio array
     """
     # Compute STFT
     f, t, Zxx = signal.stft(audio, fs=sample_rate, nperseg=512)
     # Estimate noise from first 0.3 seconds
     noise_frames = int(0.3 * len(t))
     noise_estimate = np.mean(np.abs(Zxx[:, :noise_frames]), axis=1, keepdims=True)
     # Spectral subtraction
     magnitude = np.abs(Zxx)
     phase = np.angle(Zxx)
     # Subtract noise estimate (with floor)
     alpha = 10 ** (noise_reduction_db / 20)
     magnitude_cleaned = np.maximum(magnitude - alpha * noise_estimate, 0.1 * magnitude)
     # Reconstruct complex spectrum
     Zxx_cleaned = magnitude_cleaned * np.exp(1j * phase)
     # Inverse STFT
     _, audio_cleaned = signal.istft(Zxx_cleaned, fs=sample_rate)
 def process_audio(audio_file, noise_reduction):
     """
     Load an audio file, denoise it, and build a human-readable report.
     The file is read with soundfile, down-mixed to mono, resampled to
     16 kHz when necessary, peak-normalized to 0.95 and passed through
     spectral_subtraction_denoise.
     Args:
         audio_file: Path to uploaded audio file (None when nothing was uploaded)
         noise_reduction: Noise reduction strength (0-20 dB)
     Returns:
         Tuple of (audio, info):
         - audio: (sample_rate, denoised float32 array) on success, None otherwise
         - info: Report text on success, otherwise a prompt or error message
     """
     if audio_file is None:
         return None, "Please upload an audio file"
     # This is a UI handler boundary: any failure is reported as text
     # instead of propagating into the web framework.
     try:
         # Load audio
         audio, sample_rate = sf.read(audio_file)
         # Convert to mono if the file has a channel axis
         if audio.ndim > 1:
             audio = np.mean(audio, axis=1)
         # Resample to 16kHz if needed (DTLN's native sample rate)
         if sample_rate != 16000:
             num_samples = int(len(audio) * 16000 / sample_rate)
             audio = signal.resample(audio, num_samples)
             sample_rate = 16000
         # Normalize input; the epsilon keeps all-zero input from dividing by zero
         audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95
         # Apply denoising
         # Note: In production, this would use the trained DTLN model
         # NOTE(review): assumes the helper returns an array with the same
         # length as its input; confirm, otherwise the subtraction below fails.
         denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)
         # Ratio of input power to the power of what was removed, in dB.
         # Both terms carry an epsilon so silent input reports 0 dB, not -inf.
         noise = audio - denoised
         signal_power = np.mean(audio ** 2)
         noise_power = np.mean(noise ** 2)
         snr_improvement = 10 * np.log10((signal_power + 1e-10) / (noise_power + 1e-10))
         info = f"""
         ✅ Processing Complete!
         📊 Audio Info:
         - Duration: {len(audio)/sample_rate:.2f}s
         - Sample Rate: {sample_rate} Hz
         - Length: {len(audio):,} samples
         📈 Quality Metrics:
         - SNR Improvement: {snr_improvement:.2f} dB
         - Noise Reduction: {noise_reduction} dB
         ⚠️ Note: This demo uses spectral subtraction for demonstration.
         The actual DTLN model provides superior quality when trained!
         """
         return (sample_rate, denoised.astype(np.float32)), info
     except Exception as e:
         return None, f"❌ Error processing audio: {str(e)}"
     sample_rate = 16000
     duration = 3.0
     t = np.linspace(0, duration, int(duration * sample_rate))
     # Generate synthetic speech
     speech = (
         0.3 * np.sin(2 * np.pi * 200 * t) +
         0.2 * np.sin(2 * np.pi * 400 * t) +
         0.15 * np.sin(2 * np.pi * 600 * t)
     )
     # Add speech-like envelope
     envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
     speech = speech * envelope
     # Add noise
     noise = np.random.randn(len(t)) * 0.2
     noisy = speech + noise
     # Normalize
     noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95
     # Save to temporary file
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
     sf.write(temp_file.name, noisy.astype(np.float32), sample_rate)
     return temp_file.name
def _count_wav_files(zip_path):
    """Extract a ZIP archive into a scratch directory and count its .wav files.

    The scratch directory is removed before returning, so repeated uploads
    do not accumulate extracted datasets on disk.
    """
    with tempfile.TemporaryDirectory() as extract_dir:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        # Materialize the count before the directory is cleaned up
        return len(list(Path(extract_dir).glob('**/*.wav')))


def start_training(clean_zip, noise_zip, epochs, batch_size, lstm_units):
    """
    Inspect the uploaded datasets and return training instructions.
    No training is performed: both archives are unpacked only to count
    their .wav files, and the counts plus the chosen hyper-parameters are
    embedded in a text guide for running training elsewhere.
    Args:
        clean_zip: Uploaded ZIP file with clean speech
        noise_zip: Uploaded ZIP file with noise samples
        epochs: Number of training epochs
        batch_size: Batch size
        lstm_units: Number of LSTM units
    Returns:
        Status message (instructions on success, an error line otherwise)
    """
    if clean_zip is None or noise_zip is None:
        return "❌ Please upload both clean speech and noise datasets as ZIP files"
    # UI handler boundary: unreadable or corrupt archives are reported as
    # text rather than raised into the web framework.
    try:
        clean_count = _count_wav_files(clean_zip)
        noise_count = _count_wav_files(noise_zip)
    except Exception as e:
        return f"❌ Error processing datasets: {str(e)}"
    return f"""
        📦 Dataset Extracted Successfully!
        📊 Dataset Info:
        - Clean speech files: {clean_count}
        - Noise files: {noise_count}
        - Training epochs: {epochs}
        - Batch size: {batch_size}
        - LSTM units: {lstm_units}
        ⚠️ Training on Hugging Face Spaces:
        Due to the computational requirements and limited resources on Hugging Face Spaces,
        training cannot be run directly in this demo environment.
        📥 To train your own model:
        1. Download the training files from the "Files" tab:
           - train_dtln.py
           - dtln_ethos_u55.py
           - convert_to_tflite.py
        2. Run training locally or on a GPU instance:
           ```bash
           python train_dtln.py \\
               --clean-dir ./data/clean_speech \\
               --noise-dir ./data/noise \\
               --epochs {epochs} \\
               --batch-size {batch_size} \\
               --lstm-units {lstm_units}
           ```
        3. Convert to TFLite INT8:
           ```bash
           python convert_to_tflite.py \\
               --model ./models/best_model.h5 \\
               --output ./models/dtln.tflite \\
               --calibration-dir ./data/clean_speech
           ```
        💡 Recommended Training Environment:
        - GPU: NVIDIA RTX 3060 or better
        - RAM: 16GB+
        - Storage: 10GB+ for datasets
        - Time: 2-4 hours for 50 epochs
        For detailed instructions, see the deployment guide in the Files tab!
        """
 # Custom CSS
 custom_css = """
 .gradio-container {
     background: linear-gradient(90deg, #45a049, #4CAF50);
 }
 #component-0 {
+    max-width: 1200px;
     margin: auto;
     padding: 20px;
 }
     ---
     """)
+    with gr.Tabs():
+        # Demo Tab
+        with gr.Tab("🎵 Demo"):
             with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### 📤 Input")
+                    audio_input = gr.Audio(
+                        label="Upload Noisy Audio",
+                        type="filepath"
+                    )
+                    noise_reduction = gr.Slider(
+                        minimum=0,
+                        maximum=20,
+                        value=10,
+                        step=1,
+                        label="Noise Reduction Strength (dB)",
+                        info="Higher values remove more noise but may affect speech quality"
+                    )
+                    with gr.Row():
+                        process_btn = gr.Button("🔄 Denoise Audio", variant="primary", size="lg")
+                        demo_btn = gr.Button("🎵 Try Demo Audio", variant="secondary")
+                with gr.Column():
+                    gr.Markdown("### 📥 Output")
+                    audio_output = gr.Audio(
+                        label="Denoised Audio",
+                        type="numpy"
+                    )
+                    info_output = gr.Textbox(
+                        label="Processing Info",
+                        lines=12,
+                        max_lines=12
+                    )
+        # Training Tab
+        with gr.Tab("🔬 Training"):
+            gr.Markdown("""
+            ### Train Your Own DTLN Model
+            Upload your datasets and configure training parameters.
+            ⚠️ **Note**: Training requires significant computational resources and cannot run
+            directly on Hugging Face Spaces. This interface helps you prepare your data and
+            provides the exact commands to run training locally.
+            """)
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("#### 📦 Datasets")
+                    clean_upload = gr.File(
+                        label="Clean Speech Dataset (ZIP)",
+                        file_types=[".zip"],
+                        type="filepath"
+                    )
+                    gr.Markdown("*Upload a ZIP file containing clean speech WAV files*")
+                    noise_upload = gr.File(
+                        label="Noise Dataset (ZIP)",
+                        file_types=[".zip"],
+                        type="filepath"
+                    )
+                    gr.Markdown("*Upload a ZIP file containing noise WAV files*")
+                with gr.Column():
+                    gr.Markdown("#### ⚙️ Training Parameters")
+                    epochs_slider = gr.Slider(
+                        minimum=10,
+                        maximum=200,
+                        value=50,
+                        step=10,
+                        label="Training Epochs"
+                    )
+                    batch_slider = gr.Slider(
+                        minimum=4,
+                        maximum=64,
+                        value=16,
+                        step=4,
+                        label="Batch Size"
+                    )
+                    lstm_slider = gr.Slider(
+                        minimum=64,
+                        maximum=256,
+                        value=128,
+                        step=32,
+                        label="LSTM Units"
+                    )
+                    train_btn = gr.Button("📊 Prepare Training", variant="primary", size="lg")
+            training_output = gr.Textbox(
+                label="Training Instructions",
+                lines=25,
+                max_lines=30
             )
     # About section
     with gr.Accordion("📖 About This Model", open=False):
         gr.Markdown("""
         ### DTLN Architecture
         **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:
         - **Two-stage processing**: Magnitude estimation → Final enhancement
         - **LSTM-based**: Captures temporal dependencies in speech
         - **<1M parameters**: Lightweight for edge deployment
         - **Frequency + Time domain**: Processes both domains for better quality
         ### Edge Hardware Acceleration
         Compatible with various edge AI accelerators:
         - **CPU**: ARM Cortex-M series
         - **Quantization**: 8-bit and 16-bit integer operations
         - **Memory**: Optimized for constrained devices
         ### Performance Targets
         | Metric | Value |
         |--------|-------|
         | Model Size | ~100 KB (INT8) |
         | Latency | 3-6 ms |
         | Power | 30-40 mW |
         | SNR Improvement | 10-15 dB |
         ---
         ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
         Download the full implementation to train and deploy the actual DTLN model!
         """)
+    # Deployment guide section
     with gr.Accordion("🛠️ Training & Deployment Guide", open=False):
         gr.Markdown("""
         ### Quick Start
         ```bash
         # 1. Install dependencies
         pip install -r requirements.txt
         # 2. Train model
         python train_dtln.py \\
             --clean-dir ./data/clean_speech \\
             --noise-dir ./data/noise \\
             --epochs 50 \\
             --batch-size 16
         # 3. Convert to TFLite INT8
         python convert_to_tflite.py \\
             --model ./models/best_model.h5 \\
             --output ./models/dtln_ethos_u55.tflite \\
             --calibration-dir ./data/clean_speech
         # 4. (Optional) Optimize for hardware accelerator
         vela --accelerator-config ethos-u55-256 \\
              --system-config Ethos_U55_High_End_Embedded \\
              ./models/dtln_ethos_u55.tflite
         ```
         ### Download Full Implementation
         The complete training and deployment code is available in the Files tab →
         Includes:
         - `dtln_ethos_u55.py` - Model architecture
         - `train_dtln.py` - Training with QAT
         - `convert_to_tflite.py` - TFLite conversion
         - `alif_e7_voice_denoising_guide.md` - Complete guide
         - `example_usage.py` - Usage examples
         ### Resources
         - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
         - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
         - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
         """)
     # Tech specs section
     with gr.Accordion("⚙️ Technical Specifications", open=False):
         gr.Markdown("""
         ### Model Architecture Details
         **Input**: Raw audio waveform @ 16kHz
         - Frame length: 512 samples (32ms)
         - Frame shift: 128 samples (8ms)
         - Frequency bins: 257 (FFT size 512)
         **Network Structure**:
         ```
         Input Audio (16kHz)
             ↓
         Output Audio (16kHz)
         ```
         **Training Configuration**:
         - Loss: Combined time + frequency domain MSE
         - Optimizer: Adam (lr=0.001)
         - Batch size: 16
         - Epochs: 50
         - Quantization: INT8 post-training quantization
         **Memory Footprint**:
         - Model weights: ~80 KB (INT8)
         - Tensor arena: ~100 KB
         - Audio buffers: ~2 KB
         - **Total**: ~200 KB
         ### Edge Device Deployment
         **Hardware Utilization**:
         - CPU: For FFT operations (CMSIS-DSP)
         - Memory: Optimized buffer management
         - Peripherals: I2S/PDM for audio I/O
         **Power Profile**:
         - Active inference: 30-40 mW
         - Idle: <1 mW
         - Average (50% duty): ~15-20 mW
         **Real-time Constraints**:
         - Frame processing: 8ms available
         - FFT: ~1ms
         - IFFT + overhead: ~2ms
         - **Margin**: ~1ms
         """)
     # Event handlers
     process_btn.click(
         fn=process_audio,
         inputs=[],
         outputs=[audio_input]
     )
+    train_btn.click(
+        fn=start_training,
+        inputs=[clean_upload, noise_upload, epochs_slider, batch_slider, lstm_slider],
+        outputs=[training_output]
+    )
     # Footer
     gr.Markdown("""
     ---
     ### 📚 Citation
     If you use this model in your research, please cite:
     ```bibtex
     @inproceedings{westhausen2020dtln,
       title={Dual-signal transformation LSTM network for real-time noise suppression},
       year={2020}
     }
     ```
     ---
     <div style="text-align: center; color: #666;">
         Built for <b>Edge AI</b> • Optimized for <b>Microcontrollers</b> •
         <a href="https://github.com/breizhn/DTLN">Original DTLN</a>