"""
Hugging Face Space: DTLN Voice Denoising

Real-time speech denoising optimized for edge deployment.
"""

import os
import tempfile
import zipfile
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
from scipy import signal

# Note: In production, you would load a trained model.
# For this demo, we use a simple spectral subtraction approach.
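
# Hypothetical production inference path (a sketch only, not used by this demo):
# load the trained DTLN TFLite model and run it frame by frame with overlap-add.
# The model path, the single float32 input/output tensor layout, and the frame
# size are assumptions about the converted model; a stateful streaming conversion
# would also expose LSTM state tensors that must be fed back between frames.
def run_dtln_tflite(audio, model_path="models/dtln.tflite", frame_len=512, hop=128):
    import tensorflow as tf  # imported lazily; the demo itself does not require TF

    interpreter = tf.lite.Interpreter(model_path=model_path)
    interpreter.allocate_tensors()
    input_detail = interpreter.get_input_details()[0]
    output_detail = interpreter.get_output_details()[0]

    window = np.hanning(frame_len).astype(np.float32)
    out = np.zeros(len(audio), dtype=np.float32)
    norm = np.zeros(len(audio), dtype=np.float32)

    for start in range(0, len(audio) - frame_len + 1, hop):
        frame = (audio[start:start + frame_len] * window).astype(np.float32)
        interpreter.set_tensor(input_detail["index"], frame[np.newaxis, :])
        interpreter.invoke()
        enhanced = interpreter.get_tensor(output_detail["index"]).reshape(-1)
        out[start:start + frame_len] += enhanced * window  # overlap-add
        norm[start:start + frame_len] += window ** 2

    return out / np.maximum(norm, 1e-8)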

def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10):
    """
    Simple spectral subtraction for demonstration.
    In production, this would use the trained DTLN model.

    Args:
        audio: Input audio array
        sample_rate: Sampling rate
        noise_reduction_db: Amount of noise reduction in dB

    Returns:
        Denoised audio array
    """
    # Compute STFT
    f, t, Zxx = signal.stft(audio, fs=sample_rate, nperseg=512)

    # Estimate noise from the first 0.3 seconds
    noise_frames = max(1, int(np.sum(t < 0.3)))
    noise_estimate = np.mean(np.abs(Zxx[:, :noise_frames]), axis=1, keepdims=True)

    # Spectral subtraction
    magnitude = np.abs(Zxx)
    phase = np.angle(Zxx)

    # Subtract the scaled noise estimate, keeping a spectral floor
    alpha = 10 ** (noise_reduction_db / 20)
    magnitude_cleaned = np.maximum(magnitude - alpha * noise_estimate, 0.1 * magnitude)

    # Reconstruct complex spectrum
    Zxx_cleaned = magnitude_cleaned * np.exp(1j * phase)

    # Inverse STFT
    _, audio_cleaned = signal.istft(Zxx_cleaned, fs=sample_rate)

    # Ensure output length matches input (trim or pad if needed)
    if len(audio_cleaned) > len(audio):
        audio_cleaned = audio_cleaned[:len(audio)]
    elif len(audio_cleaned) < len(audio):
        audio_cleaned = np.pad(audio_cleaned, (0, len(audio) - len(audio_cleaned)), mode='constant')

    # Normalize
    audio_cleaned = audio_cleaned / (np.max(np.abs(audio_cleaned)) + 1e-8) * 0.95

    return audio_cleaned

def process_audio(audio_file, noise_reduction):
    """
    Process an uploaded audio file.

    Args:
        audio_file: Path to uploaded audio file
        noise_reduction: Noise reduction strength (0-20 dB)

    Returns:
        Tuple of (sample_rate, denoised_audio) and an info string
    """
    if audio_file is None:
        return None, "Please upload an audio file"

    try:
        # Load audio
        audio, sample_rate = sf.read(audio_file)

        # Convert to mono if stereo
        if len(audio.shape) > 1:
            audio = np.mean(audio, axis=1)

        # Resample to 16 kHz if needed (DTLN's native sample rate)
        if sample_rate != 16000:
            num_samples = int(len(audio) * 16000 / sample_rate)
            audio = signal.resample(audio, num_samples)
            sample_rate = 16000

        # Normalize input
        audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95

        # Apply denoising
        # Note: in production, this would use the trained DTLN model
        denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)

        # Estimate improvement from the removed residual
        residual = audio - denoised
        signal_power = np.mean(audio ** 2)
        residual_power = np.mean(residual ** 2)
        snr_improvement = 10 * np.log10(signal_power / (residual_power + 1e-10))

        info = f"""
✅ Processing Complete!

📊 Audio Info:
- Duration: {len(audio)/sample_rate:.2f}s
- Sample Rate: {sample_rate} Hz
- Length: {len(audio):,} samples

📈 Quality Metrics:
- Estimated SNR Improvement: {snr_improvement:.2f} dB
- Noise Reduction Setting: {noise_reduction} dB

⚠️ Note: This demo uses spectral subtraction for demonstration.
The actual DTLN model provides superior quality when trained!
"""
        return (sample_rate, denoised.astype(np.float32)), info

    except Exception as e:
        return None, f"❌ Error processing audio: {str(e)}"

def generate_demo_audio():
    """Generate demo noisy audio"""
    sample_rate = 16000
    duration = 3.0
    t = np.linspace(0, duration, int(duration * sample_rate))

    # Generate synthetic speech (a few harmonically related tones)
    speech = (
        0.3 * np.sin(2 * np.pi * 200 * t) +
        0.2 * np.sin(2 * np.pi * 400 * t) +
        0.15 * np.sin(2 * np.pi * 600 * t)
    )

    # Add speech-like envelope
    envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
    speech = speech * envelope

    # Add noise
    noise = np.random.randn(len(t)) * 0.2
    noisy = speech + noise

    # Normalize
    noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95

    # Save to a temporary file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
    sf.write(temp_file.name, noisy.astype(np.float32), sample_rate)
    temp_file.close()

    return temp_file.name

def start_training(clean_zip, noise_zip, epochs, batch_size, lstm_units):
    """
    Prepare training with the uploaded datasets.

    Args:
        clean_zip: Uploaded ZIP file with clean speech
        noise_zip: Uploaded ZIP file with noise samples
        epochs: Number of training epochs
        batch_size: Batch size
        lstm_units: Number of LSTM units

    Returns:
        Status message
    """
    if clean_zip is None or noise_zip is None:
        return "❌ Please upload both clean speech and noise datasets as ZIP files"

    try:
        # Create temporary directories
        temp_dir = tempfile.mkdtemp()
        clean_dir = os.path.join(temp_dir, 'clean')
        noise_dir = os.path.join(temp_dir, 'noise')
        os.makedirs(clean_dir, exist_ok=True)
        os.makedirs(noise_dir, exist_ok=True)

        # Extract ZIP files
        with zipfile.ZipFile(clean_zip, 'r') as zip_ref:
            zip_ref.extractall(clean_dir)
        with zipfile.ZipFile(noise_zip, 'r') as zip_ref:
            zip_ref.extractall(noise_dir)

        # Count files
        clean_files = list(Path(clean_dir).glob('**/*.wav'))
        noise_files = list(Path(noise_dir).glob('**/*.wav'))

        status = f"""
📦 Dataset Extracted Successfully!

📊 Dataset Info:
- Clean speech files: {len(clean_files)}
- Noise files: {len(noise_files)}
- Training epochs: {epochs}
- Batch size: {batch_size}
- LSTM units: {lstm_units}

⚠️ Training on Hugging Face Spaces:
Due to the computational requirements and limited resources on Hugging Face Spaces,
training cannot be run directly in this demo environment.

📥 To train your own model:

1. Download the training files from the "Files" tab:
   - train_dtln.py
   - dtln_ethos_u55.py
   - convert_to_tflite.py

2. Run training locally or on a GPU instance:
```bash
python train_dtln.py \\
    --clean-dir ./data/clean_speech \\
    --noise-dir ./data/noise \\
    --epochs {epochs} \\
    --batch-size {batch_size} \\
    --lstm-units {lstm_units}
```

3. Convert to TFLite INT8:
```bash
python convert_to_tflite.py \\
    --model ./models/best_model.h5 \\
    --output ./models/dtln.tflite \\
    --calibration-dir ./data/clean_speech
```

💡 Recommended Training Environment:
- GPU: NVIDIA RTX 3060 or better
- RAM: 16GB+
- Storage: 10GB+ for datasets
- Time: 2-4 hours for 50 epochs

For detailed instructions, see the deployment guide in the Files tab!
"""
        return status

    except Exception as e:
        return f"❌ Error processing datasets: {str(e)}"

# Custom CSS
custom_css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    background: linear-gradient(90deg, #4CAF50, #45a049);
    border: none;
}
.gr-button:hover {
    background: linear-gradient(90deg, #45a049, #4CAF50);
}
#component-0 {
    max-width: 1200px;
    margin: auto;
    padding: 20px;
}
"""

# Build Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ DTLN Voice Denoising

    Real-time speech enhancement optimized for edge deployment with **TensorFlow Lite**.

    ### 🚀 Features:
    - **Optimized for Edge AI**: Lightweight model with <100KB size
    - **Real-time Processing**: Low latency for streaming audio
    - **INT8 Quantization**: Efficient deployment with 8-bit precision
    - **TensorFlow Lite**: Ready for microcontroller deployment

    ---
    """)

    with gr.Tabs():
        # Demo Tab
        with gr.Tab("🎵 Demo"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 🎤 Input")
                    audio_input = gr.Audio(
                        label="Upload Noisy Audio",
                        type="filepath"
                    )
                    noise_reduction = gr.Slider(
                        minimum=0,
                        maximum=20,
                        value=10,
                        step=1,
                        label="Noise Reduction Strength (dB)",
                        info="Higher values remove more noise but may affect speech quality"
                    )
                    with gr.Row():
                        process_btn = gr.Button("🚀 Denoise Audio", variant="primary", size="lg")
                        demo_btn = gr.Button("🎵 Try Demo Audio", variant="secondary")

                with gr.Column():
                    gr.Markdown("### 📥 Output")
                    audio_output = gr.Audio(
                        label="Denoised Audio",
                        type="numpy"
                    )
                    info_output = gr.Textbox(
                        label="Processing Info",
                        lines=12,
                        max_lines=12
                    )

        # Training Tab
        with gr.Tab("🔬 Training"):
            gr.Markdown("""
            ### Train Your Own DTLN Model

            Upload your datasets and configure training parameters.

            ⚠️ **Note**: Training requires significant computational resources and cannot run
            directly on Hugging Face Spaces. This interface helps you prepare your data and
            provides the exact commands to run training locally.
            """)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### 📦 Datasets")
                    clean_upload = gr.File(
                        label="Clean Speech Dataset (ZIP)",
                        file_types=[".zip"],
                        type="filepath"
                    )
                    gr.Markdown("*Upload a ZIP file containing clean speech WAV files*")

                    noise_upload = gr.File(
                        label="Noise Dataset (ZIP)",
                        file_types=[".zip"],
                        type="filepath"
                    )
                    gr.Markdown("*Upload a ZIP file containing noise WAV files*")

                with gr.Column():
                    gr.Markdown("#### ⚙️ Training Parameters")
                    epochs_slider = gr.Slider(
                        minimum=10,
                        maximum=200,
                        value=50,
                        step=10,
                        label="Training Epochs"
                    )
                    batch_slider = gr.Slider(
                        minimum=4,
                        maximum=64,
                        value=16,
                        step=4,
                        label="Batch Size"
                    )
                    lstm_slider = gr.Slider(
                        minimum=64,
                        maximum=256,
                        value=128,
                        step=32,
                        label="LSTM Units"
                    )

            train_btn = gr.Button("🚀 Prepare Training", variant="primary", size="lg")
            training_output = gr.Textbox(
                label="Training Instructions",
                lines=25,
                max_lines=30
            )

    # About section
    with gr.Accordion("📖 About This Model", open=False):
        gr.Markdown("""
        ### DTLN Architecture

        **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:

        - **Two-stage processing**: Magnitude estimation → Final enhancement
        - **LSTM-based**: Captures temporal dependencies in speech
        - **<1M parameters**: Lightweight for edge deployment
        - **Frequency + Time domain**: Processes both domains for better quality

        ### Edge Hardware Acceleration

        Compatible with various edge AI accelerators:

        - **NPU**: Arm Ethos-U series
        - **CPU**: Arm Cortex-M series
        - **Quantization**: 8-bit and 16-bit integer operations
        - **Memory**: Optimized for constrained devices

        ### Performance Targets

        | Metric | Value |
        |--------|-------|
        | Model Size | ~100 KB (INT8) |
        | Latency | 3-6 ms |
        | Power | 30-40 mW |
        | SNR Improvement | 10-15 dB |

        ---

        ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
        Download the full implementation to train and deploy the actual DTLN model!
        """)

    # Deployment guide section
    with gr.Accordion("🛠️ Training & Deployment Guide", open=False):
        gr.Markdown("""
        ### Quick Start

        ```bash
        # 1. Install dependencies
        pip install -r requirements.txt

        # 2. Train model
        python train_dtln.py \\
            --clean-dir ./data/clean_speech \\
            --noise-dir ./data/noise \\
            --epochs 50 \\
            --batch-size 16

        # 3. Convert to TFLite INT8
        python convert_to_tflite.py \\
            --model ./models/best_model.h5 \\
            --output ./models/dtln_ethos_u55.tflite \\
            --calibration-dir ./data/clean_speech

        # 4. (Optional) Optimize for hardware accelerator
        vela --accelerator-config ethos-u55-256 \\
            --system-config Ethos_U55_High_End_Embedded \\
            ./models/dtln_ethos_u55.tflite
        ```

        ### Download Full Implementation

        The complete training and deployment code is available in the Files tab ↑

        Includes:
        - `dtln_ethos_u55.py` - Model architecture
        - `train_dtln.py` - Training with QAT
        - `convert_to_tflite.py` - TFLite conversion
        - `alif_e7_voice_denoising_guide.md` - Complete guide
        - `example_usage.py` - Usage examples

        ### Resources
        - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
        - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
        - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
        """)

    # Tech specs section
    with gr.Accordion("⚙️ Technical Specifications", open=False):
        gr.Markdown("""
        ### Model Architecture Details

        **Input**: Raw audio waveform @ 16kHz
        - Frame length: 512 samples (32ms)
        - Frame shift: 128 samples (8ms)
        - Frequency bins: 257 (FFT size 512)

        **Network Structure**:
        ```
        Input Audio (16kHz)
              ↓
        STFT (512-point)
              ↓
          [Stage 1]
        LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 1
              ↓
        Enhanced Magnitude 1
              ↓
          [Stage 2]
        LSTM (128 units) → Dense (sigmoid) → Magnitude Mask 2
              ↓
        Enhanced Magnitude
              ↓
            ISTFT
              ↓
        Output Audio (16kHz)
        ```
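
        A minimal Keras sketch of this two-stage masking structure (illustrative only and
        simplified: names, shapes, and the magnitude-domain second stage are assumptions;
        the trained DTLN applies its second stage to learned time-domain features):

        ```python
        import tensorflow as tf

        def build_dtln_sketch(freq_bins=257, lstm_units=128):
            # Input: a sequence of STFT magnitude frames
            mag = tf.keras.Input(shape=(None, freq_bins))
            # Stage 1: LSTM -> sigmoid mask applied to the magnitudes
            x = tf.keras.layers.LSTM(lstm_units, return_sequences=True)(mag)
            mask1 = tf.keras.layers.Dense(freq_bins, activation="sigmoid")(x)
            enhanced1 = tf.keras.layers.Multiply()([mag, mask1])
            # Stage 2: a second LSTM -> second mask refines the stage-1 output
            y = tf.keras.layers.LSTM(lstm_units, return_sequences=True)(enhanced1)
            mask2 = tf.keras.layers.Dense(freq_bins, activation="sigmoid")(y)
            enhanced2 = tf.keras.layers.Multiply()([enhanced1, mask2])
            return tf.keras.Model(mag, enhanced2)
        ```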

        **Training Configuration**:
        - Loss: Combined time + frequency domain MSE
        - Optimizer: Adam (lr=0.001)
        - Batch size: 16
        - Epochs: 50
        - Quantization: INT8 post-training quantization
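
        One plausible form of that combined loss, shown for illustration (the exact terms
        and weighting used in `train_dtln.py` are an assumption here):

        ```python
        import tensorflow as tf

        def combined_mse_loss(clean, enhanced, fft_len=512, hop=128):
            # Time-domain MSE on the waveforms
            time_mse = tf.reduce_mean(tf.square(clean - enhanced))
            # Frequency-domain MSE on STFT magnitudes
            clean_mag = tf.abs(tf.signal.stft(clean, frame_length=fft_len, frame_step=hop))
            enh_mag = tf.abs(tf.signal.stft(enhanced, frame_length=fft_len, frame_step=hop))
            freq_mse = tf.reduce_mean(tf.square(clean_mag - enh_mag))
            return time_mse + freq_mse
        ```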

        **Memory Footprint**:
        - Model weights: ~80 KB (INT8)
        - Tensor arena: ~100 KB
        - Audio buffers: ~2 KB
        - **Total**: ~200 KB

        ### Edge Device Deployment

        **Hardware Utilization**:
        - NPU/CPU: For LSTM inference
        - CPU: For FFT operations (CMSIS-DSP)
        - Memory: Optimized buffer management
        - Peripherals: I2S/PDM for audio I/O

        **Power Profile**:
        - Active inference: 30-40 mW
        - Idle: <1 mW
        - Average (50% duty): ~15-20 mW

        **Real-time Constraints**:
        - Frame processing: 8ms available
        - FFT: ~1ms
        - NPU inference: ~4ms
        - IFFT + overhead: ~2ms
        - **Margin**: ~1ms
        """)

    # Event handlers
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input, noise_reduction],
        outputs=[audio_output, info_output]
    )

    demo_btn.click(
        fn=generate_demo_audio,
        inputs=[],
        outputs=[audio_input]
    )

    train_btn.click(
        fn=start_training,
        inputs=[clean_upload, noise_upload, epochs_slider, batch_slider, lstm_slider],
        outputs=[training_output]
    )

    # Footer
    gr.Markdown("""
    ---

    ### 📚 Citation

    If you use this model in your research, please cite:

    ```bibtex
    @inproceedings{westhausen2020dtln,
      title={Dual-signal transformation LSTM network for real-time noise suppression},
      author={Westhausen, Nils L and Meyer, Bernd T},
      booktitle={Interspeech},
      year={2020}
    }
    ```

    ---

    <div style="text-align: center; color: #666;">
        Built for <b>Edge AI</b> • Optimized for <b>Microcontrollers</b> •
        <a href="https://github.com/breizhn/DTLN">Original DTLN</a>
    </div>
    """)

# Launch configuration
if __name__ == "__main__":
    demo.launch()