Spaces:

Ceva-IP
/

DPDFNetDemo

Running

App Files Community

danielr-ceva committed 29 days ago

Commit

c4d3070

verified ·

1 Parent(s): cba8d01

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -73

app.py CHANGED Viewed

@@ -52,6 +52,7 @@ def discover_model_presets() -> Dict[str, ModelSpec]:
         "dpdfnet4",
         "dpdfnet8",
         "dpdfnet2_48khz_hr",
     ]
     found_paths = {p.stem: p for p in ONNX_DIR.glob("*.onnx") if p.is_file()}
@@ -125,35 +126,34 @@ def get_ort_session(model_key: str) -> ort.InferenceSession:
     return sess
-def _resolve_state_path(model_key: str) -> Path:
-    spec = MODEL_PRESETS[model_key]
-    model_path = Path(spec.onnx_path)
-    state_path = model_path.with_name(f"{model_path.stem}_state.npz")
-    if not state_path.is_file():
-        raise gr.Error(f"State file not found: {state_path}")
-    return state_path
 def _load_initial_state(model_key: str, session: ort.InferenceSession) -> np.ndarray:
     if model_key in _INIT_STATES:
         return _INIT_STATES[model_key]
-    state_path = _resolve_state_path(model_key)
-    with np.load(state_path) as data:
-        if "init_state" not in data:
-            raise gr.Error(f"Missing 'init_state' key in state file: {state_path}")
-        init_state = np.ascontiguousarray(data["init_state"].astype(np.float32, copy=False))
-    expected_shape = session.get_inputs()[1].shape
-    if len(expected_shape) != init_state.ndim:
         raise gr.Error(
-            f"Initial state rank mismatch for {state_path.name}: expected={expected_shape}, got={tuple(init_state.shape)}"
         )
-    for exp_dim, act_dim in zip(expected_shape, init_state.shape):
-        if isinstance(exp_dim, int) and exp_dim != act_dim:
-            raise gr.Error(
-                f"Initial state shape mismatch for {state_path.name}: expected={expected_shape}, got={tuple(init_state.shape)}"
-            )
     _INIT_STATES[model_key] = init_state
     return init_state
@@ -170,11 +170,7 @@ def vorbis_window(window_len: int) -> np.ndarray:
     return window.astype(np.float32)
-def get_wnorm(window_len: int, frame_size: int) -> float:
-    return 1.0 / (window_len ** 2 / (2 * frame_size))
-def _infer_stft_params(model_key: str, session: ort.InferenceSession) -> Tuple[int, int, float, np.ndarray]:
     # ONNX spec input is [B, T, F, 2] (or dynamic variants).
     spec_shape = session.get_inputs()[0].shape
     freq_bins = spec_shape[-2] if len(spec_shape) >= 2 else None
@@ -188,11 +184,10 @@ def _infer_stft_params(model_key: str, session: ort.InferenceSession) -> Tuple[i
     hop = win_len // 2
     win = vorbis_window(win_len)
-    wnorm = get_wnorm(win_len, hop)
-    return win_len, hop, wnorm, win
-def _preprocess_waveform(waveform: np.ndarray, win_len: int, hop: int, wnorm: float, win: np.ndarray) -> np.ndarray:
     audio = np.asarray(waveform, dtype=np.float32).reshape(-1)
     audio_pad = np.pad(audio, (0, win_len), mode="constant")
@@ -205,12 +200,12 @@ def _preprocess_waveform(waveform: np.ndarray, win_len: int, hop: int, wnorm: fl
         center=True,
         pad_mode="reflect",
     )
-    spec = (spec.T * wnorm).astype(np.complex64, copy=False)  # [T, F]
     spec_ri = np.stack([spec.real, spec.imag], axis=-1).astype(np.float32, copy=False)  # [T, F, 2]
-    return spec_ri[None, ...]  # [1, T, F, 2]
-def _postprocess_spec(spec_e: np.ndarray, win_len: int, hop: int, wnorm: float, win: np.ndarray) -> np.ndarray:
     spec_c = np.asarray(spec_e[0], dtype=np.float32)  # [T, F, 2]
     spec = (spec_c[..., 0] + 1j * spec_c[..., 1]).T.astype(np.complex64, copy=False)  # [F, T]
@@ -223,12 +218,10 @@ def _postprocess_spec(spec_e: np.ndarray, win_len: int, hop: int, wnorm: float,
         length=None,
     ).astype(np.float32, copy=False)
-    waveform_e = waveform_e / wnorm
-    waveform_e = np.concatenate(
         [waveform_e[win_len * 2 :], np.zeros(win_len * 2, dtype=np.float32)],
         axis=0,
     )
-    return waveform_e
 # -----------------------------
@@ -253,8 +246,8 @@ def enhance_audio_onnx(
     out_state_name = outputs[1].name
     waveform = np.asarray(audio_mono, dtype=np.float32).reshape(-1)
-    win_len, hop, wnorm, win = _infer_stft_params(model_key, sess)
-    spec_r_np = _preprocess_waveform(waveform, win_len=win_len, hop=hop, wnorm=wnorm, win=win)
     state = _load_initial_state(model_key, sess).copy()
     spec_e_frames = []
@@ -272,7 +265,7 @@ def enhance_audio_onnx(
         return waveform
     spec_e_np = np.concatenate(spec_e_frames, axis=1)
-    waveform_e = _postprocess_spec(spec_e_np, win_len=win_len, hop=hop, wnorm=wnorm, win=win)
     return np.asarray(waveform_e, dtype=np.float32).reshape(-1)
@@ -428,18 +421,26 @@ CSS = """
   border-radius: 16px;
   border: 1px solid rgba(0,0,0,0.08);
   background: linear-gradient(135deg, rgba(255,152,0,0.14), rgba(255,152,0,0.04));
 }
 #header h1{
-  margin: 0;
   font-size: 24px;
   font-weight: 800;
   letter-spacing: -0.2px;
 }
 #header p{
-  margin: 6px 0 0 0;
   color: var(--body-text-color-subdued);
-  font-size: 13.5px;
-  line-height: 1.35;
 }
 .spec img { border-radius: 14px; }
@@ -454,36 +455,14 @@ CSS = """
 """
 with gr.Blocks(theme=THEME, css=CSS, title="DPDFNet Speech Enhancement") as demo:
-    gr.HTML(
-        # """
-        # <div id="header">
-        #   <h1>DPDFNet Speech Enhancement</h1>
-        #   <p>
-        #     Upload or record up to 10 seconds. Multi-channel inputs are averaged to mono.
-        #     Choose any local ONNX model from <code>./onnx</code>.
-        #     Pre/postprocessing uses the same non-streaming STFT/iSTFT flow as <code>streaming/infer_dpdfnet_onnx.py</code>.
-        #   </p>
-        # </div>
-        # """
-        """
-        <div id="header" style="text-align: center; margin-bottom: 25px;">
-          <h1 style="margin-bottom: 6px;">DPDFNet Speech Enhancement</h1>
-          <p style="font-size: 14px; letter-spacing: 1px; margin-bottom: 14px; color: #555;">
-            Causal • Real-Time • Edge-Ready
-          </p>
-          <p style="max-width: 720px; margin: 0 auto; font-size: 15px; line-height: 1.6;">
-            DPDFNet extends DeepFilterNet2 with Dual-Path RNN blocks to improve
-            long-range temporal and cross-band modeling while preserving low latency.
-            Designed for single-channel streaming speech enhancement under challenging noise conditions.
-          </p>
-          <hr style="margin-top: 22px; border: none; height: 1px; background: linear-gradient(to right, transparent, #ddd, transparent);">
-        </div>
-        """
     )
     with gr.Row():

         "dpdfnet4",
         "dpdfnet8",
         "dpdfnet2_48khz_hr",
+        "dpdfnet8_48khz_hr",
     ]
     found_paths = {p.stem: p for p in ONNX_DIR.glob("*.onnx") if p.is_file()}
     return sess
 def _load_initial_state(model_key: str, session: ort.InferenceSession) -> np.ndarray:
     """Build, cache and return the initial streaming state for a model.

     The state is a flat float32 vector of ``state_size`` zeros whose leading
     entries are overwritten with the ERB-norm initial values, immediately
     followed by the spec-norm initial values. All sizes and initial values
     are read from the ONNX model's custom metadata map; the two ``*_init``
     entries are parsed as comma-separated floats.

     Args:
         model_key: Cache key; the built state is stored in ``_INIT_STATES``
             under this key and returned directly on later calls.
         session: ONNX Runtime session whose inputs and metadata are inspected.

     Returns:
         The cached 1-D float32 initial state. The same array object is
         returned on every call, so callers should copy it before mutating.

     Raises:
         gr.Error: If the model has fewer than two inputs, a required metadata
             key is missing, a metadata value cannot be parsed as a number, or
             the declared sizes are inconsistent with each other.
     """
     if model_key in _INIT_STATES:
         return _INIT_STATES[model_key]

     if len(session.get_inputs()) < 2:
         raise gr.Error("Expected streaming ONNX model with two inputs: (spec, state).")

     meta = session.get_modelmeta().custom_metadata_map
     try:
         state_size = int(meta["state_size"])
         erb_norm_state_size = int(meta["erb_norm_state_size"])
         spec_norm_state_size = int(meta["spec_norm_state_size"])
         erb_norm_init = np.array(
             [float(x) for x in meta["erb_norm_init"].split(",")], dtype=np.float32
         )
         spec_norm_init = np.array(
             [float(x) for x in meta["spec_norm_init"].split(",")], dtype=np.float32
         )
     except KeyError as exc:
         raise gr.Error(
             f"ONNX model is missing required metadata key: {exc}. "
             "Re-export the model to embed state initialisation metadata."
         ) from exc
     except ValueError as exc:
         # int()/float() reject non-numeric metadata; report it like a missing
         # key instead of letting a raw traceback reach the UI.
         raise gr.Error(
             f"ONNX model has malformed state metadata: {exc}. "
             "Re-export the model to embed state initialisation metadata."
         ) from exc

     norm_end = erb_norm_state_size + spec_norm_state_size
     if erb_norm_state_size < 0 or spec_norm_state_size < 0 or norm_end > state_size:
         # Without this check the slices below would be silently clamped (or
         # np.zeros would fail on a negative size) and the state would be wrong.
         raise gr.Error(
             "ONNX model has inconsistent state metadata: "
             f"state_size={state_size}, erb_norm_state_size={erb_norm_state_size}, "
             f"spec_norm_state_size={spec_norm_state_size}."
         )

     init_state = np.zeros(state_size, dtype=np.float32)
     try:
         init_state[0:erb_norm_state_size] = erb_norm_init
         init_state[erb_norm_state_size:norm_end] = spec_norm_init
     except ValueError as exc:
         # NumPy raises ValueError when an init vector cannot be broadcast into
         # its declared segment length.
         raise gr.Error(
             f"ONNX model state metadata does not match the declared sizes: {exc}."
         ) from exc

     init_state = np.ascontiguousarray(init_state)
     _INIT_STATES[model_key] = init_state
     return init_state
     return window.astype(np.float32)
+def _infer_stft_params(model_key: str, session: ort.InferenceSession) -> Tuple[int, int, np.ndarray]:
     # ONNX spec input is [B, T, F, 2] (or dynamic variants).
     spec_shape = session.get_inputs()[0].shape
     freq_bins = spec_shape[-2] if len(spec_shape) >= 2 else None
     hop = win_len // 2
     win = vorbis_window(win_len)
+    return win_len, hop, win
+def _preprocess_waveform(waveform: np.ndarray, win_len: int, hop: int, win: np.ndarray) -> np.ndarray:
     audio = np.asarray(waveform, dtype=np.float32).reshape(-1)
     audio_pad = np.pad(audio, (0, win_len), mode="constant")
         center=True,
         pad_mode="reflect",
     )
+    spec = spec.T.astype(np.complex64, copy=False)  # [T, F]
     spec_ri = np.stack([spec.real, spec.imag], axis=-1).astype(np.float32, copy=False)  # [T, F, 2]
+    return np.ascontiguousarray(spec_ri[None, ...], dtype=np.float32)  # [1, T, F, 2]
+def _postprocess_spec(spec_e: np.ndarray, win_len: int, hop: int, win: np.ndarray) -> np.ndarray:
     spec_c = np.asarray(spec_e[0], dtype=np.float32)  # [T, F, 2]
     spec = (spec_c[..., 0] + 1j * spec_c[..., 1]).T.astype(np.complex64, copy=False)  # [F, T]
         length=None,
     ).astype(np.float32, copy=False)
+    return np.concatenate(
         [waveform_e[win_len * 2 :], np.zeros(win_len * 2, dtype=np.float32)],
         axis=0,
     )
 # -----------------------------
     out_state_name = outputs[1].name
     waveform = np.asarray(audio_mono, dtype=np.float32).reshape(-1)
+    win_len, hop, win = _infer_stft_params(model_key, sess)
+    spec_r_np = _preprocess_waveform(waveform, win_len=win_len, hop=hop, win=win)
     state = _load_initial_state(model_key, sess).copy()
     spec_e_frames = []
         return waveform
     spec_e_np = np.concatenate(spec_e_frames, axis=1)
+    waveform_e = _postprocess_spec(spec_e_np, win_len=win_len, hop=hop, win=win)
     return np.asarray(waveform_e, dtype=np.float32).reshape(-1)
   border-radius: 16px;
   border: 1px solid rgba(0,0,0,0.08);
   background: linear-gradient(135deg, rgba(255,152,0,0.14), rgba(255,152,0,0.04));
+  text-align: center;
 }
 #header h1{
+  margin: 0 0 6px 0;
   font-size: 24px;
   font-weight: 800;
   letter-spacing: -0.2px;
 }
 #header p{
+  margin: 6px auto 0 auto;
+  max-width: 720px;
   color: var(--body-text-color-subdued);
+  font-size: 14px;
+  line-height: 1.6;
+}
+#header hr{
+  margin-top: 18px;
+  border: none;
+  height: 1px;
+  background: linear-gradient(to right, transparent, #ddd, transparent);
 }
 .spec img { border-radius: 14px; }
 """
 with gr.Blocks(theme=THEME, css=CSS, title="DPDFNet Speech Enhancement") as demo:
+    gr.Markdown(
+        "# DPDFNet Speech Enhancement\n\n"
+        "Causal · Real-Time · Edge-Ready\n\n"
+        "DPDFNet extends DeepFilterNet2 with Dual-Path RNN blocks to improve "
+        "long-range temporal and cross-band modeling while preserving low latency. "
+        "Designed for single-channel streaming speech enhancement under challenging noise conditions.\n\n"
+        "---",
+        elem_id="header",
     )
     with gr.Row():