grgsaliba committed on
Commit
f7fb413
·
verified ·
1 Parent(s): 92858ca

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +262 -93
app.py CHANGED
@@ -9,6 +9,8 @@ import soundfile as sf
9
  import tempfile
10
  import os
11
  from scipy import signal
 
 
12
 
13
  # Note: In production, you would load a trained model
14
  # For this demo, we'll use a simple spectral subtraction approach
@@ -17,33 +19,33 @@ def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10):
17
  """
18
  Simple spectral subtraction for demonstration
19
  In production, this would use the trained DTLN model
20
-
21
  Args:
22
  audio: Input audio array
23
  sample_rate: Sampling rate
24
  noise_reduction_db: Amount of noise reduction in dB
25
-
26
  Returns:
27
  Denoised audio array
28
  """
29
  # Compute STFT
30
  f, t, Zxx = signal.stft(audio, fs=sample_rate, nperseg=512)
31
-
32
  # Estimate noise from first 0.3 seconds
33
  noise_frames = int(0.3 * len(t))
34
  noise_estimate = np.mean(np.abs(Zxx[:, :noise_frames]), axis=1, keepdims=True)
35
-
36
  # Spectral subtraction
37
  magnitude = np.abs(Zxx)
38
  phase = np.angle(Zxx)
39
-
40
  # Subtract noise estimate (with floor)
41
  alpha = 10 ** (noise_reduction_db / 20)
42
  magnitude_cleaned = np.maximum(magnitude - alpha * noise_estimate, 0.1 * magnitude)
43
-
44
  # Reconstruct complex spectrum
45
  Zxx_cleaned = magnitude_cleaned * np.exp(1j * phase)
46
-
47
  # Inverse STFT
48
  _, audio_cleaned = signal.istft(Zxx_cleaned, fs=sample_rate)
49
 
@@ -62,63 +64,63 @@ def spectral_subtraction_denoise(audio, sample_rate, noise_reduction_db=10):
62
  def process_audio(audio_file, noise_reduction):
63
  """
64
  Process uploaded audio file
65
-
66
  Args:
67
  audio_file: Path to uploaded audio file
68
  noise_reduction: Noise reduction strength (0-20 dB)
69
-
70
  Returns:
71
  Tuple of (sample_rate, denoised_audio)
72
  """
73
  if audio_file is None:
74
  return None, "Please upload an audio file"
75
-
76
  try:
77
  # Load audio
78
  audio, sample_rate = sf.read(audio_file)
79
-
80
  # Convert to mono if stereo
81
  if len(audio.shape) > 1:
82
  audio = np.mean(audio, axis=1)
83
-
84
  # Resample to 16kHz if needed (DTLN's native sample rate)
85
  if sample_rate != 16000:
86
  import scipy.signal as scipy_signal
87
  num_samples = int(len(audio) * 16000 / sample_rate)
88
  audio = scipy_signal.resample(audio, num_samples)
89
  sample_rate = 16000
90
-
91
  # Normalize input
92
  audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95
93
-
94
  # Apply denoising
95
  # Note: In production, this would use the trained DTLN model
96
  denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)
97
-
98
  # Calculate improvement metrics
99
  noise = audio - denoised
100
  signal_power = np.mean(audio ** 2)
101
  noise_power = np.mean(noise ** 2)
102
  snr_improvement = 10 * np.log10(signal_power / (noise_power + 1e-10))
103
-
104
  info = f"""
105
  βœ… Processing Complete!
106
-
107
  πŸ“Š Audio Info:
108
  - Duration: {len(audio)/sample_rate:.2f}s
109
  - Sample Rate: {sample_rate} Hz
110
  - Length: {len(audio):,} samples
111
-
112
  πŸ“ˆ Quality Metrics:
113
  - SNR Improvement: {snr_improvement:.2f} dB
114
  - Noise Reduction: {noise_reduction} dB
115
-
116
  ⚠️ Note: This demo uses spectral subtraction for demonstration.
117
  The actual DTLN model provides superior quality when trained!
118
  """
119
-
120
  return (sample_rate, denoised.astype(np.float32)), info
121
-
122
  except Exception as e:
123
  return None, f"❌ Error processing audio: {str(e)}"
124
 
@@ -128,32 +130,125 @@ def generate_demo_audio():
128
  sample_rate = 16000
129
  duration = 3.0
130
  t = np.linspace(0, duration, int(duration * sample_rate))
131
-
132
  # Generate synthetic speech
133
  speech = (
134
  0.3 * np.sin(2 * np.pi * 200 * t) +
135
  0.2 * np.sin(2 * np.pi * 400 * t) +
136
  0.15 * np.sin(2 * np.pi * 600 * t)
137
  )
138
-
139
  # Add speech-like envelope
140
  envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
141
  speech = speech * envelope
142
-
143
  # Add noise
144
  noise = np.random.randn(len(t)) * 0.2
145
  noisy = speech + noise
146
-
147
  # Normalize
148
  noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95
149
-
150
  # Save to temporary file
151
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
152
  sf.write(temp_file.name, noisy.astype(np.float32), sample_rate)
153
-
154
  return temp_file.name
155
 
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  # Custom CSS
158
  custom_css = """
159
  .gradio-container {
@@ -167,7 +262,7 @@ custom_css = """
167
  background: linear-gradient(90deg, #45a049, #4CAF50);
168
  }
169
  #component-0 {
170
- max-width: 900px;
171
  margin: auto;
172
  padding: 20px;
173
  }
@@ -188,53 +283,121 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
188
 
189
  ---
190
  """)
191
-
192
- with gr.Row():
193
- with gr.Column():
194
- gr.Markdown("### πŸ“€ Input")
195
- audio_input = gr.Audio(
196
- label="Upload Noisy Audio",
197
- type="filepath"
198
- )
199
-
200
- noise_reduction = gr.Slider(
201
- minimum=0,
202
- maximum=20,
203
- value=10,
204
- step=1,
205
- label="Noise Reduction Strength (dB)",
206
- info="Higher values remove more noise but may affect speech quality"
207
- )
208
-
209
  with gr.Row():
210
- process_btn = gr.Button("πŸ”„ Denoise Audio", variant="primary", size="lg")
211
- demo_btn = gr.Button("🎡 Try Demo Audio", variant="secondary")
212
-
213
- with gr.Column():
214
- gr.Markdown("### πŸ“₯ Output")
215
- audio_output = gr.Audio(
216
- label="Denoised Audio",
217
- type="numpy"
218
- )
219
-
220
- info_output = gr.Textbox(
221
- label="Processing Info",
222
- lines=12,
223
- max_lines=12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  )
225
-
226
  # About section
227
  with gr.Accordion("πŸ“– About This Model", open=False):
228
  gr.Markdown("""
229
  ### DTLN Architecture
230
-
231
  **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:
232
-
233
  - **Two-stage processing**: Magnitude estimation β†’ Final enhancement
234
  - **LSTM-based**: Captures temporal dependencies in speech
235
  - **<1M parameters**: Lightweight for edge deployment
236
  - **Frequency + Time domain**: Processes both domains for better quality
237
-
238
  ### Edge Hardware Acceleration
239
 
240
  Compatible with various edge AI accelerators:
@@ -242,78 +405,78 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
242
  - **CPU**: ARM Cortex-M series
243
  - **Quantization**: 8-bit and 16-bit integer operations
244
  - **Memory**: Optimized for constrained devices
245
-
246
  ### Performance Targets
247
-
248
  | Metric | Value |
249
  |--------|-------|
250
  | Model Size | ~100 KB (INT8) |
251
  | Latency | 3-6 ms |
252
  | Power | 30-40 mW |
253
  | SNR Improvement | 10-15 dB |
254
-
255
  ---
256
-
257
  ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
258
  Download the full implementation to train and deploy the actual DTLN model!
259
  """)
260
-
261
- # Training guide section
262
  with gr.Accordion("πŸ› οΈ Training & Deployment Guide", open=False):
263
  gr.Markdown("""
264
  ### Quick Start
265
-
266
  ```bash
267
  # 1. Install dependencies
268
  pip install -r requirements.txt
269
-
270
  # 2. Train model
271
  python train_dtln.py \\
272
  --clean-dir ./data/clean_speech \\
273
  --noise-dir ./data/noise \\
274
  --epochs 50 \\
275
  --batch-size 16
276
-
277
  # 3. Convert to TFLite INT8
278
  python convert_to_tflite.py \\
279
  --model ./models/best_model.h5 \\
280
  --output ./models/dtln_ethos_u55.tflite \\
281
  --calibration-dir ./data/clean_speech
282
-
283
  # 4. (Optional) Optimize for hardware accelerator
284
  vela --accelerator-config ethos-u55-256 \\
285
  --system-config Ethos_U55_High_End_Embedded \\
286
  ./models/dtln_ethos_u55.tflite
287
  ```
288
-
289
  ### Download Full Implementation
290
-
291
  The complete training and deployment code is available in the Files tab β†’
292
-
293
  Includes:
294
  - `dtln_ethos_u55.py` - Model architecture
295
  - `train_dtln.py` - Training with QAT
296
  - `convert_to_tflite.py` - TFLite conversion
297
  - `alif_e7_voice_denoising_guide.md` - Complete guide
298
  - `example_usage.py` - Usage examples
299
-
300
  ### Resources
301
 
302
  - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
303
  - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
304
  - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
305
  """)
306
-
307
  # Tech specs section
308
  with gr.Accordion("βš™οΈ Technical Specifications", open=False):
309
  gr.Markdown("""
310
  ### Model Architecture Details
311
-
312
  **Input**: Raw audio waveform @ 16kHz
313
  - Frame length: 512 samples (32ms)
314
  - Frame shift: 128 samples (8ms)
315
  - Frequency bins: 257 (FFT size 512)
316
-
317
  **Network Structure**:
318
  ```
319
  Input Audio (16kHz)
@@ -334,20 +497,20 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
334
  ↓
335
  Output Audio (16kHz)
336
  ```
337
-
338
  **Training Configuration**:
339
  - Loss: Combined time + frequency domain MSE
340
  - Optimizer: Adam (lr=0.001)
341
  - Batch size: 16
342
  - Epochs: 50
343
  - Quantization: INT8 post-training quantization
344
-
345
  **Memory Footprint**:
346
  - Model weights: ~80 KB (INT8)
347
  - Tensor arena: ~100 KB
348
  - Audio buffers: ~2 KB
349
  - **Total**: ~200 KB
350
-
351
  ### Edge Device Deployment
352
 
353
  **Hardware Utilization**:
@@ -355,12 +518,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
355
  - CPU: For FFT operations (CMSIS-DSP)
356
  - Memory: Optimized buffer management
357
  - Peripherals: I2S/PDM for audio I/O
358
-
359
  **Power Profile**:
360
  - Active inference: 30-40 mW
361
  - Idle: <1 mW
362
  - Average (50% duty): ~15-20 mW
363
-
364
  **Real-time Constraints**:
365
  - Frame processing: 8ms available
366
  - FFT: ~1ms
@@ -368,7 +531,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
368
  - IFFT + overhead: ~2ms
369
  - **Margin**: ~1ms
370
  """)
371
-
372
  # Event handlers
373
  process_btn.click(
374
  fn=process_audio,
@@ -381,15 +544,21 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
381
  inputs=[],
382
  outputs=[audio_input]
383
  )
384
-
 
 
 
 
 
 
385
  # Footer
386
  gr.Markdown("""
387
  ---
388
-
389
  ### πŸ“š Citation
390
-
391
  If you use this model in your research, please cite:
392
-
393
  ```bibtex
394
  @inproceedings{westhausen2020dtln,
395
  title={Dual-signal transformation LSTM network for real-time noise suppression},
@@ -398,9 +567,9 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
398
  year={2020}
399
  }
400
  ```
401
-
402
  ---
403
-
404
  <div style="text-align: center; color: #666;">
405
  Built for <b>Edge AI</b> β€’ Optimized for <b>Microcontrollers</b> β€’
406
  <a href="https://github.com/breizhn/DTLN">Original DTLN</a>
 
9
  import tempfile
10
  import os
11
  from scipy import signal
12
+ import zipfile
13
+ from pathlib import Path
14
 
15
  # Note: In production, you would load a trained model
16
  # For this demo, we'll use a simple spectral subtraction approach
 
19
  """
20
  Simple spectral subtraction for demonstration
21
  In production, this would use the trained DTLN model
22
+
23
  Args:
24
  audio: Input audio array
25
  sample_rate: Sampling rate
26
  noise_reduction_db: Amount of noise reduction in dB
27
+
28
  Returns:
29
  Denoised audio array
30
  """
31
  # Compute STFT
32
  f, t, Zxx = signal.stft(audio, fs=sample_rate, nperseg=512)
33
+
34
  # Estimate noise from first 0.3 seconds
35
  noise_frames = int(0.3 * len(t))
36
  noise_estimate = np.mean(np.abs(Zxx[:, :noise_frames]), axis=1, keepdims=True)
37
+
38
  # Spectral subtraction
39
  magnitude = np.abs(Zxx)
40
  phase = np.angle(Zxx)
41
+
42
  # Subtract noise estimate (with floor)
43
  alpha = 10 ** (noise_reduction_db / 20)
44
  magnitude_cleaned = np.maximum(magnitude - alpha * noise_estimate, 0.1 * magnitude)
45
+
46
  # Reconstruct complex spectrum
47
  Zxx_cleaned = magnitude_cleaned * np.exp(1j * phase)
48
+
49
  # Inverse STFT
50
  _, audio_cleaned = signal.istft(Zxx_cleaned, fs=sample_rate)
51
 
 
64
def process_audio(audio_file, noise_reduction):
    """
    Process uploaded audio file

    Loads the file, mixes it down to mono, resamples it to 16 kHz,
    peak-normalizes it and runs spectral_subtraction_denoise on it.

    Args:
        audio_file: Path to uploaded audio file (None if nothing was uploaded)
        noise_reduction: Noise reduction strength (0-20 dB)

    Returns:
        Tuple of (audio_output, info) where audio_output is
        (sample_rate, denoised_audio as float32) on success or None on
        failure, and info is the text shown to the user
    """
    if audio_file is None:
        return None, "Please upload an audio file"

    try:
        # Load audio
        audio, sample_rate = sf.read(audio_file)

        # Convert to mono if stereo
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)

        # An empty file would otherwise fail inside np.max with a message
        # that means nothing to the user.
        if audio.size == 0:
            return None, "❌ Error processing audio: the file contains no samples"

        # Resample to 16kHz if needed (DTLN's native sample rate)
        if sample_rate != 16000:
            num_samples = int(len(audio) * 16000 / sample_rate)
            audio = signal.resample(audio, num_samples)
            sample_rate = 16000

        # Normalize input (epsilon avoids division by zero on pure silence)
        audio = audio / (np.max(np.abs(audio)) + 1e-8) * 0.95

        # Apply denoising
        # Note: In production, this would use the trained DTLN model
        denoised = spectral_subtraction_denoise(audio, sample_rate, noise_reduction)

        # Calculate improvement metrics.
        # The inverse STFT is not guaranteed to return exactly as many
        # samples as went in, so compare only the overlapping part instead
        # of failing on a shape mismatch.
        overlap = min(len(audio), len(denoised))
        noise = audio[:overlap] - denoised[:overlap]
        signal_power = np.mean(audio ** 2)
        noise_power = np.mean(noise ** 2)
        snr_improvement = 10 * np.log10(signal_power / (noise_power + 1e-10))

        info = f"""
✅ Processing Complete!

📊 Audio Info:
- Duration: {len(audio)/sample_rate:.2f}s
- Sample Rate: {sample_rate} Hz
- Length: {len(audio):,} samples

📈 Quality Metrics:
- SNR Improvement: {snr_improvement:.2f} dB
- Noise Reduction: {noise_reduction} dB

⚠️ Note: This demo uses spectral subtraction for demonstration.
The actual DTLN model provides superior quality when trained!
"""

        return (sample_rate, denoised.astype(np.float32)), info

    except Exception as e:
        # UI boundary: report the failure in the info box instead of crashing
        return None, f"❌ Error processing audio: {str(e)}"
126
 
 
130
  sample_rate = 16000
131
  duration = 3.0
132
  t = np.linspace(0, duration, int(duration * sample_rate))
133
+
134
  # Generate synthetic speech
135
  speech = (
136
  0.3 * np.sin(2 * np.pi * 200 * t) +
137
  0.2 * np.sin(2 * np.pi * 400 * t) +
138
  0.15 * np.sin(2 * np.pi * 600 * t)
139
  )
140
+
141
  # Add speech-like envelope
142
  envelope = 0.5 + 0.5 * np.sin(2 * np.pi * 2 * t)
143
  speech = speech * envelope
144
+
145
  # Add noise
146
  noise = np.random.randn(len(t)) * 0.2
147
  noisy = speech + noise
148
+
149
  # Normalize
150
  noisy = noisy / (np.max(np.abs(noisy)) + 1e-8) * 0.95
151
+
152
  # Save to temporary file
153
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
154
  sf.write(temp_file.name, noisy.astype(np.float32), sample_rate)
155
+
156
  return temp_file.name
157
 
158
 
159
def _extract_and_count_wavs(zip_path, dest_dir):
    """
    Extract a ZIP archive and count the WAV files it contains

    Args:
        zip_path: Path to the ZIP archive
        dest_dir: Directory to extract into (created if missing)

    Returns:
        Number of *.wav files found anywhere under dest_dir
    """
    os.makedirs(dest_dir, exist_ok=True)
    # NOTE(review): the archives are user uploads. zipfile attempts to
    # sanitize member paths, but nothing here caps the uncompressed size.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(dest_dir)
    return sum(1 for _ in Path(dest_dir).glob('**/*.wav'))


def start_training(clean_zip, noise_zip, epochs, batch_size, lstm_units):
    """
    Inspect the uploaded datasets and return training instructions

    No training is run here: the archives are extracted into a scratch
    directory only to count their WAV files, and the scratch directory is
    removed again before returning.

    Args:
        clean_zip: Uploaded ZIP file with clean speech
        noise_zip: Uploaded ZIP file with noise samples
        epochs: Number of training epochs
        batch_size: Batch size
        lstm_units: Number of LSTM units

    Returns:
        Status message
    """
    if clean_zip is None or noise_zip is None:
        return "❌ Please upload both clean speech and noise datasets as ZIP files"

    try:
        # TemporaryDirectory deletes the extracted data on exit, including
        # when extraction fails; only the counts are needed afterwards.
        with tempfile.TemporaryDirectory() as temp_dir:
            clean_count = _extract_and_count_wavs(
                clean_zip, os.path.join(temp_dir, 'clean')
            )
            noise_count = _extract_and_count_wavs(
                noise_zip, os.path.join(temp_dir, 'noise')
            )
    except Exception as e:
        # UI boundary: report the failure in the output box instead of crashing
        return f"❌ Error processing datasets: {str(e)}"

    return f"""
📦 Dataset Extracted Successfully!

📊 Dataset Info:
- Clean speech files: {clean_count}
- Noise files: {noise_count}
- Training epochs: {epochs}
- Batch size: {batch_size}
- LSTM units: {lstm_units}

⚠️ Training on Hugging Face Spaces:

Due to the computational requirements and limited resources on Hugging Face Spaces,
training cannot be run directly in this demo environment.

📥 To train your own model:

1. Download the training files from the "Files" tab:
   - train_dtln.py
   - dtln_ethos_u55.py
   - convert_to_tflite.py

2. Run training locally or on a GPU instance:

```bash
python train_dtln.py \\
    --clean-dir ./data/clean_speech \\
    --noise-dir ./data/noise \\
    --epochs {epochs} \\
    --batch-size {batch_size} \\
    --lstm-units {lstm_units}
```

3. Convert to TFLite INT8:

```bash
python convert_to_tflite.py \\
    --model ./models/best_model.h5 \\
    --output ./models/dtln.tflite \\
    --calibration-dir ./data/clean_speech
```

💡 Recommended Training Environment:
- GPU: NVIDIA RTX 3060 or better
- RAM: 16GB+
- Storage: 10GB+ for datasets
- Time: 2-4 hours for 50 epochs

For detailed instructions, see the deployment guide in the Files tab!
"""
250
+
251
+
252
  # Custom CSS
253
  custom_css = """
254
  .gradio-container {
 
262
  background: linear-gradient(90deg, #45a049, #4CAF50);
263
  }
264
  #component-0 {
265
+ max-width: 1200px;
266
  margin: auto;
267
  padding: 20px;
268
  }
 
283
 
284
  ---
285
  """)
286
+
287
+ with gr.Tabs():
288
+ # Demo Tab
289
+ with gr.Tab("🎡 Demo"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  with gr.Row():
291
+ with gr.Column():
292
+ gr.Markdown("### πŸ“€ Input")
293
+ audio_input = gr.Audio(
294
+ label="Upload Noisy Audio",
295
+ type="filepath"
296
+ )
297
+
298
+ noise_reduction = gr.Slider(
299
+ minimum=0,
300
+ maximum=20,
301
+ value=10,
302
+ step=1,
303
+ label="Noise Reduction Strength (dB)",
304
+ info="Higher values remove more noise but may affect speech quality"
305
+ )
306
+
307
+ with gr.Row():
308
+ process_btn = gr.Button("πŸ”„ Denoise Audio", variant="primary", size="lg")
309
+ demo_btn = gr.Button("🎡 Try Demo Audio", variant="secondary")
310
+
311
+ with gr.Column():
312
+ gr.Markdown("### πŸ“₯ Output")
313
+ audio_output = gr.Audio(
314
+ label="Denoised Audio",
315
+ type="numpy"
316
+ )
317
+
318
+ info_output = gr.Textbox(
319
+ label="Processing Info",
320
+ lines=12,
321
+ max_lines=12
322
+ )
323
+
324
+ # Training Tab
325
+ with gr.Tab("πŸ”¬ Training"):
326
+ gr.Markdown("""
327
+ ### Train Your Own DTLN Model
328
+
329
+ Upload your datasets and configure training parameters.
330
+
331
+ ⚠️ **Note**: Training requires significant computational resources and cannot run
332
+ directly on Hugging Face Spaces. This interface helps you prepare your data and
333
+ provides the exact commands to run training locally.
334
+ """)
335
+
336
+ with gr.Row():
337
+ with gr.Column():
338
+ gr.Markdown("#### πŸ“¦ Datasets")
339
+
340
+ clean_upload = gr.File(
341
+ label="Clean Speech Dataset (ZIP)",
342
+ file_types=[".zip"],
343
+ type="filepath"
344
+ )
345
+ gr.Markdown("*Upload a ZIP file containing clean speech WAV files*")
346
+
347
+ noise_upload = gr.File(
348
+ label="Noise Dataset (ZIP)",
349
+ file_types=[".zip"],
350
+ type="filepath"
351
+ )
352
+ gr.Markdown("*Upload a ZIP file containing noise WAV files*")
353
+
354
+ with gr.Column():
355
+ gr.Markdown("#### βš™οΈ Training Parameters")
356
+
357
+ epochs_slider = gr.Slider(
358
+ minimum=10,
359
+ maximum=200,
360
+ value=50,
361
+ step=10,
362
+ label="Training Epochs"
363
+ )
364
+
365
+ batch_slider = gr.Slider(
366
+ minimum=4,
367
+ maximum=64,
368
+ value=16,
369
+ step=4,
370
+ label="Batch Size"
371
+ )
372
+
373
+ lstm_slider = gr.Slider(
374
+ minimum=64,
375
+ maximum=256,
376
+ value=128,
377
+ step=32,
378
+ label="LSTM Units"
379
+ )
380
+
381
+ train_btn = gr.Button("πŸ“Š Prepare Training", variant="primary", size="lg")
382
+
383
+ training_output = gr.Textbox(
384
+ label="Training Instructions",
385
+ lines=25,
386
+ max_lines=30
387
  )
388
+
389
  # About section
390
  with gr.Accordion("πŸ“– About This Model", open=False):
391
  gr.Markdown("""
392
  ### DTLN Architecture
393
+
394
  **Dual-signal Transformation LSTM Network** is a real-time speech enhancement model:
395
+
396
  - **Two-stage processing**: Magnitude estimation β†’ Final enhancement
397
  - **LSTM-based**: Captures temporal dependencies in speech
398
  - **<1M parameters**: Lightweight for edge deployment
399
  - **Frequency + Time domain**: Processes both domains for better quality
400
+
401
  ### Edge Hardware Acceleration
402
 
403
  Compatible with various edge AI accelerators:
 
405
  - **CPU**: ARM Cortex-M series
406
  - **Quantization**: 8-bit and 16-bit integer operations
407
  - **Memory**: Optimized for constrained devices
408
+
409
  ### Performance Targets
410
+
411
  | Metric | Value |
412
  |--------|-------|
413
  | Model Size | ~100 KB (INT8) |
414
  | Latency | 3-6 ms |
415
  | Power | 30-40 mW |
416
  | SNR Improvement | 10-15 dB |
417
+
418
  ---
419
+
420
  ⚠️ **Demo Note**: This Space uses spectral subtraction for demonstration.
421
  Download the full implementation to train and deploy the actual DTLN model!
422
  """)
423
+
424
+ # Deployment guide section
425
  with gr.Accordion("πŸ› οΈ Training & Deployment Guide", open=False):
426
  gr.Markdown("""
427
  ### Quick Start
428
+
429
  ```bash
430
  # 1. Install dependencies
431
  pip install -r requirements.txt
432
+
433
  # 2. Train model
434
  python train_dtln.py \\
435
  --clean-dir ./data/clean_speech \\
436
  --noise-dir ./data/noise \\
437
  --epochs 50 \\
438
  --batch-size 16
439
+
440
  # 3. Convert to TFLite INT8
441
  python convert_to_tflite.py \\
442
  --model ./models/best_model.h5 \\
443
  --output ./models/dtln_ethos_u55.tflite \\
444
  --calibration-dir ./data/clean_speech
445
+
446
  # 4. (Optional) Optimize for hardware accelerator
447
  vela --accelerator-config ethos-u55-256 \\
448
  --system-config Ethos_U55_High_End_Embedded \\
449
  ./models/dtln_ethos_u55.tflite
450
  ```
451
+
452
  ### Download Full Implementation
453
+
454
  The complete training and deployment code is available in the Files tab β†’
455
+
456
  Includes:
457
  - `dtln_ethos_u55.py` - Model architecture
458
  - `train_dtln.py` - Training with QAT
459
  - `convert_to_tflite.py` - TFLite conversion
460
  - `alif_e7_voice_denoising_guide.md` - Complete guide
461
  - `example_usage.py` - Usage examples
462
+
463
  ### Resources
464
 
465
  - [TensorFlow Lite Micro](https://www.tensorflow.org/lite/microcontrollers)
466
  - [Arm Ethos-U NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u)
467
  - [DTLN Paper (Interspeech 2020)](https://arxiv.org/abs/2005.07551)
468
  """)
469
+
470
  # Tech specs section
471
  with gr.Accordion("βš™οΈ Technical Specifications", open=False):
472
  gr.Markdown("""
473
  ### Model Architecture Details
474
+
475
  **Input**: Raw audio waveform @ 16kHz
476
  - Frame length: 512 samples (32ms)
477
  - Frame shift: 128 samples (8ms)
478
  - Frequency bins: 257 (FFT size 512)
479
+
480
  **Network Structure**:
481
  ```
482
  Input Audio (16kHz)
 
497
  ↓
498
  Output Audio (16kHz)
499
  ```
500
+
501
  **Training Configuration**:
502
  - Loss: Combined time + frequency domain MSE
503
  - Optimizer: Adam (lr=0.001)
504
  - Batch size: 16
505
  - Epochs: 50
506
  - Quantization: INT8 post-training quantization
507
+
508
  **Memory Footprint**:
509
  - Model weights: ~80 KB (INT8)
510
  - Tensor arena: ~100 KB
511
  - Audio buffers: ~2 KB
512
  - **Total**: ~200 KB
513
+
514
  ### Edge Device Deployment
515
 
516
  **Hardware Utilization**:
 
518
  - CPU: For FFT operations (CMSIS-DSP)
519
  - Memory: Optimized buffer management
520
  - Peripherals: I2S/PDM for audio I/O
521
+
522
  **Power Profile**:
523
  - Active inference: 30-40 mW
524
  - Idle: <1 mW
525
  - Average (50% duty): ~15-20 mW
526
+
527
  **Real-time Constraints**:
528
  - Frame processing: 8ms available
529
  - FFT: ~1ms
 
531
  - IFFT + overhead: ~2ms
532
  - **Margin**: ~1ms
533
  """)
534
+
535
  # Event handlers
536
  process_btn.click(
537
  fn=process_audio,
 
544
  inputs=[],
545
  outputs=[audio_input]
546
  )
547
+
548
+ train_btn.click(
549
+ fn=start_training,
550
+ inputs=[clean_upload, noise_upload, epochs_slider, batch_slider, lstm_slider],
551
+ outputs=[training_output]
552
+ )
553
+
554
  # Footer
555
  gr.Markdown("""
556
  ---
557
+
558
  ### πŸ“š Citation
559
+
560
  If you use this model in your research, please cite:
561
+
562
  ```bibtex
563
  @inproceedings{westhausen2020dtln,
564
  title={Dual-signal transformation LSTM network for real-time noise suppression},
 
567
  year={2020}
568
  }
569
  ```
570
+
571
  ---
572
+
573
  <div style="text-align: center; color: #666;">
574
  Built for <b>Edge AI</b> β€’ Optimized for <b>Microcontrollers</b> β€’
575
  <a href="https://github.com/breizhn/DTLN">Original DTLN</a>