AIvry committed
Commit 1832e16 · verified · 1 Parent(s): 226ddaf

Upload 12 files

Files changed (12)
  1. argshield.py +144 -0
  2. audio.py +61 -0
  3. config.py +33 -0
  4. distortions.py +339 -0
  5. engine.py +455 -0
  6. hf_readme.md +136 -0
  7. hf_requirements.txt +26 -0
  8. __init__.py +4 -0
  9. main.py +24 -0
  10. metrics.py +549 -0
  11. models.py +333 -0
  12. utils.py +231 -0
argshield.py ADDED
@@ -0,0 +1,144 @@
+ from __future__ import annotations
+ import argparse
+ import json
+ from pathlib import Path, PurePath
+ import importlib.util
+ 
+ from config import DEFAULT_ALPHA
+ from models import get_model_config
+ 
+ # Central table for default layers per model (kept identical to original table)
+ MODEL_DEFAULT_LAYER = {
+     "raw": None,
+     "wavlm": 24,
+     "wav2vec2": 24,
+     "hubert": 24,
+     "wavlm_base": 12,
+     "wav2vec2_base": 12,
+     "hubert_base": 12,
+     "wav2vec2_xlsr": 24,
+     "ast": 12,
+ }
+ 
+ def _read_manifest_json(path: Path):
+     text = Path(path).read_text(encoding="utf-8")
+     try:
+         return json.loads(text)
+     except json.JSONDecodeError as e:
+         raise SystemExit(f"Manifest must be JSON. Failed to parse: {e}")
+ 
+ def _read_manifest_py(path: Path):
+     spec = importlib.util.spec_from_file_location("manifest_mod", str(path))
+     if spec is None or spec.loader is None:
+         raise SystemExit(f"Could not load Python manifest: {path}")
+     mod = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(mod)  # executes the .py file
+ 
+     if not hasattr(mod, "MANIFEST"):
+         raise SystemExit(f"Python manifest {path} must define a top-level variable MANIFEST")
+ 
+     manifest = mod.MANIFEST
+ 
+     def _to_str(p):
+         if isinstance(p, (Path, PurePath)):
+             return str(p)
+         if isinstance(p, str):
+             return p
+         raise TypeError(f"Path entry must be str or Path, got {type(p)}: {p}")
+ 
+     normalized = []
+     try:
+         for item in manifest:
+             mix_id = item["mixture_id"]
+             refs = [_to_str(x) for x in item["references"]]
+             systems = {}
+             for sys_name, lst in item["systems"].items():
+                 systems[sys_name] = [_to_str(x) for x in lst]
+             normalized.append({
+                 "mixture_id": mix_id,
+                 "references": refs,
+                 "systems": systems,
+             })
+     except (KeyError, TypeError, ValueError) as e:
+         raise SystemExit(f"Malformed MANIFEST in {path}: {e}")
+     return normalized
+ 
+ def _read_manifest(path: Path):
+     suffix = path.suffix.lower()
+     if suffix in {".py"}:
+         return _read_manifest_py(path)
+     elif suffix in {".json", ".txt"}:
+         return _read_manifest_json(path)
+     else:
+         raise SystemExit(f"Unsupported manifest type '{suffix}'. Use .py, .json, or .txt")
+ 
+ def _parse_args():
+     parser = argparse.ArgumentParser(
+         description="Run PS/PM experiment from a manifest file."
+     )
+     parser.add_argument(
+         "--manifest",
+         type=Path,
+         required=True,
+         help="Path to manifest (.py with MANIFEST or .json/.txt with JSON).",
+     )
+     parser.add_argument(
+         "--model",
+         type=str,
+         required=True,
+         help=("Embedding model. Choices: "
+               "raw, wavlm, wav2vec2, hubert, wavlm_base, wav2vec2_base, "
+               "hubert_base, wav2vec2_xlsr, ast"),
+     )
+     parser.add_argument(
+         "--layer",
+         type=int,
+         default=None,
+         help="Optional layer (validated per model). Omit to use the model default.",
+     )
+     parser.add_argument(
+         "--alpha",
+         type=float,
+         default=None,
+         help="Optional diffusion-maps alpha in [0,1] (default: config DEFAULT_ALPHA).",
+     )
+     parser.add_argument("--verbose", action="store_true", help="Verbose logging.")
+     parser.add_argument("--max-gpus", type=int, default=None, help="Limit GPUs to use (must be >= 0).")
+     return parser.parse_args()
+ 
+ def _validate_and_resolve(model: str, layer_opt: int | None, alpha_opt: float | None):
+     allowed_models = set(get_model_config(0).keys())
+     if model not in allowed_models:
+         raise SystemExit(f"Unknown --model '{model}'. Allowed: {sorted(allowed_models)}")
+ 
+     max_layer = MODEL_DEFAULT_LAYER.get(model)
+     if model == "raw":
+         layer_final = 0 if layer_opt is None else int(layer_opt)
+     else:
+         if layer_opt is None:
+             if max_layer is None:
+                 raise SystemExit(f"--layer must be provided for model '{model}'.")
+             layer_final = max_layer
+         else:
+             layer_final = int(layer_opt)
+             if max_layer is not None and not (0 <= layer_final <= max_layer):
+                 raise SystemExit(
+                     f"--layer {layer_final} is out of range for '{model}'. "
+                     f"Expected 0..{max_layer} (or omit to use default {max_layer})."
+                 )
+ 
+     alpha_final = DEFAULT_ALPHA if alpha_opt is None else float(alpha_opt)
+     if not (0.0 <= alpha_final <= 1.0):
+         raise SystemExit("--alpha must be in [0, 1].")
+     return layer_final, alpha_final
+ 
+ def _validate_gpus(max_gpus_opt):
+     if max_gpus_opt is None:
+         return None
+     try:
+         mg = int(max_gpus_opt)
+     except Exception:
+         raise SystemExit("--max-gpus must be an integer >= 0.")
+     if mg < 0:
+         raise SystemExit("--max-gpus must be >= 0.")
+     return mg
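
For reference, a minimal Python manifest satisfying the contract `_read_manifest_py` enforces could look like the sketch below. File names, the mixture id, and the system name are placeholders; the only requirement is a top-level `MANIFEST` list whose entries carry `mixture_id`, `references`, and `systems` (path entries may be `str` or `pathlib.Path`):

```python
# manifest.py -- minimal sketch of the MANIFEST contract (placeholder paths)
from pathlib import Path

MANIFEST = [
    {
        "mixture_id": "mix_001",
        "references": [Path("refs/speaker1.wav"), Path("refs/speaker2.wav")],
        "systems": {
            "my_separator": ["outs/sep1.wav", "outs/sep2.wav"],
        },
    },
]
```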
audio.py ADDED
@@ -0,0 +1,61 @@
+ import librosa
+ import numpy as np
+ import pyloudnorm as pyln
+ import torch
+ 
+ from config import SILENCE_RATIO, SR
+ from utils import hungarian, safe_corr_np
+ import warnings
+ warnings.filterwarnings("ignore", message="Possible clipped samples in output.")
+ 
+ 
+ def loudness_normalize(wav, sr=SR, target_lufs=-23.0):
+     meter = pyln.Meter(sr)
+     loudness = meter.integrated_loudness(wav)
+     normalized_wav = pyln.normalize.loudness(wav, loudness, target_lufs)
+     peak = np.max(np.abs(normalized_wav))
+     if peak > 1.0:
+         normalized_wav = normalized_wav / max(peak, 1e-12)
+     return np.clip(normalized_wav, -1.0, 1.0)
+ 
+ 
+ def frame_rms_torch(sig, win, hop):
+     dev = sig.device
+     frames = sig.unfold(0, win, hop)
+     if frames.size(0) and (frames.size(0) - 1) * hop == sig.numel() - win:
+         frames = frames[:-1]
+     rms = torch.sqrt((frames**2).mean(1) + 1e-12)
+     return rms.to(dev)
+ 
+ 
+ def make_union_voiced_mask(refs_tensors, win, hop):
+     device = refs_tensors[0].device
+     rms_vecs = [frame_rms_torch(r, win, hop) for r in refs_tensors]
+     lengths = [v.numel() for v in rms_vecs]
+     L_max = max(lengths)
+     silent_union = torch.zeros(L_max, dtype=torch.bool, device=device)
+     for idx, (rms, L) in enumerate(zip(rms_vecs, lengths)):
+         thr = SILENCE_RATIO * torch.sqrt((refs_tensors[idx] ** 2).mean())
+         sil = rms <= thr
+         silent_union[:L] |= sil
+     return ~silent_union
+ 
+ 
+ def assign_outputs_to_refs_by_corr(ref_paths, out_paths):
+     if not out_paths:
+         return [None] * len(ref_paths)
+     refs = [loudness_normalize(librosa.load(str(p), sr=SR)[0]) for p in ref_paths]
+     outs = [loudness_normalize(librosa.load(str(p), sr=SR)[0]) for p in out_paths]
+     n, m = len(refs), len(outs)
+     K = max(n, m)
+     C = np.ones((K, K), dtype=np.float64)
+     for i in range(n):
+         for j in range(m):
+             r = safe_corr_np(refs[i], outs[j])
+             C[i, j] = 1.0 - (r + 1.0) * 0.5  # lower = better
+     ri, cj = hungarian(C)
+     mapping = [None] * n
+     for i, j in zip(ri, cj):
+         if i < n and j < m:
+             mapping[i] = int(j)
+     return mapping
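
Since system outputs can arrive in arbitrary order, `assign_outputs_to_refs_by_corr` pairs each reference with its best-correlated output via the Hungarian assignment. A minimal usage sketch (paths are placeholders; `mapping[i]` is the index of the output matched to reference `i`, or `None` if nothing was assigned):

```python
# Sketch: align arbitrarily ordered separation outputs with references.
from audio import assign_outputs_to_refs_by_corr

ref_paths = ["refs/speaker1.wav", "refs/speaker2.wav"]
out_paths = ["outs/sep_b.wav", "outs/sep_a.wav"]

mapping = assign_outputs_to_refs_by_corr(ref_paths, out_paths)
for i, j in enumerate(mapping):
    if j is not None:
        print(f"{ref_paths[i]} <- {out_paths[j]}")
```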
config.py ADDED
@@ -0,0 +1,33 @@
+ import os
+ import torch
+ 
+ import warnings
+ warnings.filterwarnings(
+     "ignore",
+     category=UserWarning,
+     message=r"^expandable_segments not supported on this platform"
+ )
+ 
+ SR = 16_000
+ RESULTS_ROOT = "results"
+ BATCH_SIZE = 2
+ ENERGY_WIN_MS = 20
+ ENERGY_HOP_MS = 20
+ SILENCE_RATIO = 0.1
+ EPS = 1e-4
+ COV_TOL = 1e-6
+ 
+ DEFAULT_LAYER = 2
+ DEFAULT_ADD_CI = True
+ DEFAULT_DELTA_CI = 0.05
+ DEFAULT_ALPHA = 1.0
+ 
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.6"
+ os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
+ 
+ torch.backends.cudnn.benchmark = True
+ torch.backends.cudnn.deterministic = False
+ torch.backends.cudnn.enabled = True
+ 
+ if torch.cuda.is_available():
+     torch.cuda.set_per_process_memory_fraction(0.8)
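
The energy-windowing constants are given in milliseconds and converted to sample counts downstream (as engine.py does further down). As a quick sanity check, at the 16 kHz rate both the window and hop come out to 320 samples, i.e. non-overlapping frames:

```python
# Sanity check: millisecond constants -> sample counts at SR = 16 kHz.
SR = 16_000
ENERGY_WIN_MS = 20
ENERGY_HOP_MS = 20

win = int(ENERGY_WIN_MS * SR / 1000)  # 320 samples
hop = int(ENERGY_HOP_MS * SR / 1000)  # 320 samples -> non-overlapping frames
print(win, hop)
```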
distortions.py ADDED
@@ -0,0 +1,339 @@
+ import librosa
+ import numpy as np
+ from numpy.fft import irfft, rfft, rfftfreq
+ from scipy.signal import butter, filtfilt, lfilter
+ 
+ from config import ENERGY_WIN_MS, EPS, SR
+ 
+ 
+ def sig_stats(x):
+     A_pk = max(np.max(np.abs(x)), EPS)
+     A_rms = max(np.sqrt(np.mean(x**2)), EPS)
+     A_95 = max(np.percentile(np.abs(x), 95), EPS)
+     return A_pk, A_rms, A_95
+ 
+ 
+ def frame_distortions(
+     frame,
+     sr,
+     distortion_keys,
+     notch_freqs=None,
+     low_cutoffs=None,
+     high_cutoffs=None,
+     frame_start=0,
+ ):
+     notch_freqs = [] if notch_freqs is None else notch_freqs
+     low_cutoffs = [] if low_cutoffs is None else low_cutoffs
+     high_cutoffs = [] if high_cutoffs is None else high_cutoffs
+     distortions = {}
+ 
+     A_pk, A_rms, A_95 = sig_stats(frame)
+     frame_len = len(frame)
+     X = rfft(frame)
+     freqs = rfftfreq(frame_len, 1 / sr)
+     t = np.arange(frame_len) / sr
+ 
+     if ("notch" in distortion_keys) or distortion_keys == "all":
+         bw = 60.0
+         for f0 in notch_freqs:
+             Y = X.copy()
+             band = (freqs > f0 - bw) & (freqs < f0 + bw)
+             Y[band] = 0
+             distortions[f"Notch_{int(round(f0))}Hz"] = irfft(Y, n=len(frame))
+ 
+     if ("comb" in distortion_keys) or distortion_keys == "all":
+         # NOTE: zip truncates to the shorter list, so the 15 ms delay is never produced.
+         for d_ms, decay in zip([2.5, 5, 7.5, 10, 12.5, 15], [0.4, 0.5, 0.6, 0.7, 0.9]):
+             D = int(sr * d_ms / 1000)
+             if D >= frame_len:
+                 continue
+             out = frame.copy()
+             out[:-D] += decay * frame[D:]
+             distortions[f"Comb_{int(d_ms)}ms"] = out
+ 
+     if ("tremolo" in distortion_keys) or distortion_keys == "all":
+         depth = 1.0
+         t_centre = (frame_start + 0.5 * len(frame)) / sr
+         for r_hz in [1, 2, 4, 6]:
+             mod = (1 - depth) + depth * 0.5 * (1 + np.sin(2 * np.pi * r_hz * t_centre))
+             distortions[f"Tremolo_{r_hz}Hz"] = frame * mod
+ 
+     if ("noise" in distortion_keys) or distortion_keys == "all":
+         nyq = sr / 2
+         low_norm = 20 / nyq
+         high_freq = min(20_000, 0.45 * sr)
+         high_norm = min(high_freq / nyq, 0.99)
+         b_band, a_band = butter(5, [low_norm, high_norm], btype="band")
+ 
+         def add_noise(sig, snr_db, color="white"):
+             nl_target = 10 ** (snr_db / 10)
+             n = np.random.randn(len(sig))
+             if color == "pink":
+                 n = np.cumsum(n)
+                 n /= max(np.max(np.abs(n)), 1e-12)
+             elif color == "brown":
+                 n = np.cumsum(np.cumsum(n))
+                 n /= max(np.max(np.abs(n)), 1e-12)
+             n = lfilter(b_band, a_band, n)
+             rms_sig = np.sqrt(np.mean(sig**2))
+             rms_n = np.sqrt(np.mean(n**2)) + 1e-12
+             noise_rms = rms_sig / np.sqrt(nl_target)
+             noise_rms = max(noise_rms, rms_sig / np.sqrt(10 ** (15 / 10)))
+             n *= noise_rms / rms_n
+             return sig + n
+ 
+         for snr in [-15, -10, -5, 0, 5, 10, 15, 20, 25]:
+             for clr in ["white", "pink", "brown"]:
+                 if (snr in [-15, -10, -5]) and (clr == "white"):
+                     continue
+                 distortions[f"{clr.capitalize()}Noise_{snr}dB"] = add_noise(
+                     frame, snr, clr
+                 )
+ 
+     if ("harmonic" in distortion_keys) or distortion_keys == "all":
+         for f_h, rel_amp in zip([100, 500, 1000, 4000], [0.4, 0.6, 0.8, 1.0]):
+             tone = (rel_amp * A_rms) * np.sin(2 * np.pi * f_h * t)
+             distortions[f"Harmonic_{f_h}Hz"] = frame + tone
+ 
+     if ("reverb" in distortion_keys) or distortion_keys == "all":
+         for tail_ms, decay in zip([50, 100, 200, 400], [0.3, 0.5, 0.7, 0.9]):
+             L = int(sr * tail_ms / 1000)
+             if L >= frame_len:
+                 continue
+             irv = np.exp(-np.linspace(0, 6, L)) * decay
+             reverbed = np.convolve(frame, irv)[:frame_len]
+             distortions[f"Reverb_{tail_ms}ms"] = reverbed
+ 
+     if ("noisegate" in distortion_keys) or distortion_keys == "all":
+         for pct in [0.05, 0.10, 0.20, 0.40]:
+             thr = pct * A_95
+             g = frame.copy()
+             g[np.abs(g) < thr] = 0
+             distortions[f"NoiseGate_{int(pct * 100)}pct"] = g
+ 
+     if ("pitch_shift" in distortion_keys) or distortion_keys == "all":
+         n_fft = min(2048, frame_len // 2)
+         for shift in [-4, -2, 2, 4]:
+             y = librosa.effects.pitch_shift(frame, sr=sr, n_steps=shift, n_fft=n_fft)
+             distortions[f"PitchShift_{shift}st"] = y[:frame_len]
+ 
+     if ("lowpass" in distortion_keys) or distortion_keys == "all":
+         for fc in low_cutoffs:
+             if fc >= sr / 2 * 0.99:
+                 continue
+             b, a = butter(6, fc / (sr / 2), btype="low")
+             distortions[f"Lowpass_{fc}Hz"] = filtfilt(b, a, frame)
+ 
+     if ("highpass" in distortion_keys) or distortion_keys == "all":
+         for fc in high_cutoffs:
+             if fc <= 20:
+                 continue
+             b, a = butter(6, fc / (sr / 2), btype="high")
+             distortions[f"Highpass_{fc}Hz"] = filtfilt(b, a, frame)
+ 
+     if ("echo" in distortion_keys) or distortion_keys == "all":
+         for delay_ms, amp in zip([50, 100, 150], [0.4, 0.5, 0.7]):
+             D = int(sr * delay_ms / 1000)
+             if D >= frame_len:
+                 continue
+             echo = np.pad(frame, (D, 0), "constant")[:-D] * amp
+             distortions[f"Echo_{delay_ms}ms"] = frame + echo
+ 
+     if ("clipping" in distortion_keys) or distortion_keys == "all":
+         for frac in [0.70, 0.50, 0.30]:
+             thr = frac * A_95
+             distortions[f"Clipping_{frac:.2f}p95"] = np.clip(frame, -thr, thr)
+ 
+     if ("vibrato" in distortion_keys) or distortion_keys == "all":
+         n_fft = min(2048, frame_len // 2)
+         base_depth = 0.03 * (A_rms / A_pk)
+         for rate_hz, scale in zip([3, 5, 7], [1.0, 1.3, 1.6]):
+             depth = np.clip(base_depth * scale, 0.01, 0.05)
+             y = librosa.effects.time_stretch(frame, rate=1 + depth, n_fft=n_fft)
+             distortions[f"Vibrato_{rate_hz}Hz"] = librosa.util.fix_length(
+                 y, size=frame_len
+             )
+ 
+     return distortions
+ 
+ 
+ def apply_adv_distortions(ref, distortion_keys, sr=SR):
+     frame_len = int(ENERGY_WIN_MS * sr / 1000)
+     n_frames = int(np.ceil(len(ref) / frame_len))
+     pad_len = n_frames * frame_len - len(ref)
+     ref_padded = (
+         np.concatenate([ref, np.zeros(pad_len, dtype=ref.dtype)]) if pad_len else ref
+     )
+ 
+     X_full = rfft(ref_padded)
+     freqs_f = rfftfreq(len(ref_padded), 1 / sr)
+     mag_full = np.abs(X_full)
+ 
+     valid = (freqs_f > 80) & (freqs_f < 0.45 * sr)
+     cand_indices = np.argsort(mag_full[valid])[-60:]
+     cand_freqs = freqs_f[valid][cand_indices]
+     cand_freqs = cand_freqs[np.argsort(mag_full[valid][cand_indices])[::-1]]
+ 
+     selected_notch_freqs = []
+     for f0 in cand_freqs:
+         if all(abs(f0 - f_sel) > 300 for f_sel in selected_notch_freqs):
+             selected_notch_freqs.append(float(f0))
+         if len(selected_notch_freqs) >= 20:
+             break
+ 
+     mag2 = np.abs(X_full) ** 2
+     total_p = mag2.sum()
+     cum_low = np.cumsum(mag2)
+     q_low = [0.50, 0.70, 0.85, 0.95]
+     lowpass_cutoffs = []
+     for q in q_low:
+         idx = np.searchsorted(cum_low, q * total_p)
+         f_c = float(freqs_f[idx])
+         lowpass_cutoffs.append(round(f_c / 100.0) * 100)
+ 
+     cum_high = np.cumsum(mag2[::-1])
+     q_high = [0.05, 0.15, 0.30, 0.50]
+     highpass_cutoffs = []
+     for q in q_high:
+         idx = np.searchsorted(cum_high, q * total_p)
+         f_c = float(freqs_f[-1 - idx])
+         highpass_cutoffs.append(round(f_c / 100.0) * 100)
+ 
+     lowpass_cutoffs = sorted(set(lowpass_cutoffs))
+     highpass_cutoffs = sorted(set(highpass_cutoffs))
+ 
+     out = {}
+     for f in range(n_frames):
+         start, end = f * frame_len, (f + 1) * frame_len
+         frame = ref_padded[start:end]
+         frame_dists = frame_distortions(
+             frame,
+             sr,
+             distortion_keys,
+             notch_freqs=selected_notch_freqs,
+             low_cutoffs=lowpass_cutoffs,
+             high_cutoffs=highpass_cutoffs,
+             frame_start=start,
+         )
+         for lbl, sig in frame_dists.items():
+             if lbl not in out:
+                 out[lbl] = np.zeros_like(ref_padded)
+             out[lbl][start:end] = sig
+ 
+     return list(out.values())
+ 
+ 
+ def apply_distortions(ref, distortion_keys, sr=SR):
+     distortions = {}
+     X = rfft(ref)
+     freqs = rfftfreq(len(ref), 1 / sr)
+     t = np.arange(len(ref)) / sr
+ 
+     if ("notch" in distortion_keys) or distortion_keys == "all":
+         for c in [500, 1000, 2000, 4000, 8000]:
+             Y = X.copy()
+             Y[(freqs > c - 50) & (freqs < c + 50)] = 0
+             distortions[f"Notch_{c}Hz"] = irfft(Y, n=len(ref))
+ 
+     if ("comb" in distortion_keys) or distortion_keys == "all":
+         # NOTE: zip truncates to the shorter list, so the 15 ms delay is never produced.
+         for d, decay in zip([2.5, 5, 7.5, 10, 12.5, 15], [0.4, 0.5, 0.6, 0.7, 0.9]):
+             D = int(sr * d / 1000)
+             if D >= len(ref):
+                 continue
+             cpy = ref.copy()
+             if len(ref) > D:
+                 cpy[:-D] += decay * ref[D:]
+             distortions[f"Comb_{int(d)}ms"] = cpy
+ 
+     if ("tremolo" in distortion_keys) or distortion_keys == "all":
+         for r, depth in zip([1, 2, 4, 6], [0.3, 0.5, 0.8, 1.0]):
+             mod = (1 - depth) + depth * 0.5 * (1 + np.sin(2 * np.pi * r * t))
+             distortions[f"Tremolo_{r}Hz"] = ref * mod
+ 
+     if ("noise" in distortion_keys) or distortion_keys == "all":
+ 
+         def add_noise(signal, snr_db, color):
+             rms = np.sqrt(np.mean(signal**2))
+             nl = 10 ** (snr_db / 10)
+             noise_rms = rms / np.sqrt(nl)
+             n = np.random.randn(len(signal))
+             if color == "pink":
+                 n = np.cumsum(n)
+                 n /= max(np.max(np.abs(n)), 1e-12)
+             elif color == "brown":
+                 n = np.cumsum(np.cumsum(n))
+                 n /= max(np.max(np.abs(n)), 1e-12)
+             return signal + noise_rms * n
+ 
+         for snr in [-15, -10, -5, 0, 5, 10, 15, 20, 25]:
+             for clr in ["white", "pink", "brown"]:
+                 if snr in [-15, -10, -5] and clr in ["white"]:
+                     continue
+                 distortions[f"{clr.capitalize()}Noise_{snr}dB"] = add_noise(
+                     ref, snr, clr
+                 )
+ 
+     if ("harmonic" in distortion_keys) or distortion_keys == "all":
+         for f_h, amp in zip([100, 500, 1000, 4000], [0.02, 0.03, 0.05, 0.08]):
+             tone = amp * np.sin(2 * np.pi * f_h * t)
+             distortions[f"Harmonic_{f_h}Hz"] = ref + tone
+ 
+     if ("reverb" in distortion_keys) or distortion_keys == "all":
+         # NOTE: zip truncates to four pairs; the 1.1 decay is never used.
+         for tail_ms, decay in zip([5, 10, 15, 20], [0.3, 0.5, 0.7, 0.9, 1.1]):
+             L = int(sr * tail_ms / 1000)
+             if L >= len(ref):
+                 continue
+             irv = np.exp(-np.linspace(0, 3, L)) * decay
+             reverbed = np.convolve(ref, irv)[: len(ref)]
+             distortions[f"Reverb_{tail_ms}ms"] = reverbed
+ 
+     if ("noisegate" in distortion_keys) or distortion_keys == "all":
+         for thr in [0.005, 0.01, 0.02, 0.04]:
+             g = ref.copy()
+             g[np.abs(g) < thr] = 0
+             distortions[f"NoiseGate_{thr}"] = g
+ 
+     if ("pitch_shift" in distortion_keys) or distortion_keys == "all":
+         n_fft = min(2048, len(ref) // 2)
+         for shift in [-4, -2, 2, 4]:
+             shifted = librosa.effects.pitch_shift(
+                 y=ref, sr=sr, n_steps=shift, n_fft=n_fft
+             )
+             distortions[f"PitchShift_{shift}st"] = shifted[: len(ref)]
+ 
+     if ("lowpass" in distortion_keys) or distortion_keys == "all":
+         for freq in [2000, 3000, 4000, 6000]:
+             if freq >= (sr / 2):
+                 continue
+             b, a = butter(4, freq / (sr / 2), "low")
+             distortions[f"Lowpass_{freq}Hz"] = filtfilt(b, a, ref)
+ 
+     if ("highpass" in distortion_keys) or distortion_keys == "all":
+         for freq in [100, 300, 500, 800]:
+             if freq >= (sr / 2):
+                 continue
+             b, a = butter(4, freq / (sr / 2), "high")
+             distortions[f"Highpass_{freq}Hz"] = filtfilt(b, a, ref)
+ 
+     if ("echo" in distortion_keys) or distortion_keys == "all":
+         # NOTE: zip truncates to three pairs; the 20 ms delay is never used.
+         for delay_ms, amp in zip([5, 10, 15, 20], [0.3, 0.5, 0.7]):
+             delay = int(sr * delay_ms / 1000)
+             if delay >= len(ref):
+                 continue
+             echo = np.pad(ref, (delay, 0), "constant")[:-delay] * amp
+             distortions[f"Echo_{delay_ms}ms"] = ref + echo
+ 
+     if ("clipping" in distortion_keys) or distortion_keys == "all":
+         for thr in [0.3, 0.5, 0.7]:
+             distortions[f"Clipping_{thr}"] = np.clip(ref, -thr, thr)
+ 
+     if ("vibrato" in distortion_keys) or distortion_keys == "all":
+         # NOTE: the sine averages to ~0 over the signal, so the stretch rate stays ~1 (near-identity).
+         for rate, depth in zip([3, 5, 7], [0.001, 0.002, 0.003]):
+             vibrato = np.sin(2 * np.pi * rate * t) * depth
+             vibrato_signal = librosa.effects.time_stretch(
+                 ref, rate=1 + float(vibrato.mean()), n_fft=min(2048, len(ref) // 2)
+             )
+             distortions[f"Vibrato_{rate}Hz"] = librosa.util.fix_length(
+                 vibrato_signal, size=len(ref)
+             )
+ 
+     return list(distortions.values())
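
A minimal sketch of generating the PS distortion bank for one signal (the input here is a synthetic placeholder tone; in the pipeline it is a loudness-normalized 16 kHz reference waveform):

```python
# Sketch: build the "all" distortion bank for a synthetic reference.
import numpy as np
from distortions import apply_distortions

sr = 16_000
t = np.arange(sr) / sr                     # one second of audio
ref = 0.1 * np.sin(2 * np.pi * 440 * t)    # placeholder 440 Hz tone

bank = apply_distortions(ref, "all", sr=sr)
print(f"{len(bank)} distorted variants of {len(ref)} samples each")
```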
engine.py ADDED
@@ -0,0 +1,455 @@
+ import json
+ import random
+ from concurrent.futures import ThreadPoolExecutor
+ from datetime import datetime
+ import librosa
+ import pandas as pd
+ from audio import (
+     assign_outputs_to_refs_by_corr,
+     loudness_normalize,
+     make_union_voiced_mask,
+ )
+ from config import *
+ from distortions import apply_adv_distortions, apply_distortions
+ from metrics import (
+     compute_pm,
+     compute_ps,
+     diffusion_map_torch,
+     pm_ci_components_full,
+     ps_ci_components_full,
+ )
+ from models import embed_batch, load_model
+ from utils import *
+ 
+ 
+ def compute_mapss_measures(
+     models,
+     mixtures,
+     *,
+     systems=None,
+     algos=None,
+     experiment_id=None,
+     layer=DEFAULT_LAYER,
+     add_ci=DEFAULT_ADD_CI,
+     alpha=DEFAULT_ALPHA,
+     seed=42,
+     on_missing="skip",
+     verbose=False,
+     max_gpus=None,
+ ):
+     gpu_distributor = GPUWorkDistributor(max_gpus)
+     ngpu = get_gpu_count(max_gpus)
+ 
+     if on_missing not in {"skip", "error"}:
+         raise ValueError("on_missing must be 'skip' or 'error'.")
+ 
+     torch.manual_seed(seed)
+     random.seed(seed)
+     np.random.seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+ 
+     canon_mix = canonicalize_mixtures(mixtures, systems=systems)
+ 
+     mixture_entries = []
+     for m in canon_mix:
+         entries = []
+         for i, refp in enumerate(m.refs):
+             sid = m.speaker_ids[i]
+             entries.append(
+                 {"id": sid, "ref": Path(refp), "mixture": m.mixture_id, "outs": {}}
+             )
+         mixture_entries.append(entries)
+ 
+     for m, mix_entries in zip(canon_mix, mixture_entries):
+         for algo, out_list in (m.systems or {}).items():
+             mapping = assign_outputs_to_refs_by_corr(
+                 [e["ref"] for e in mix_entries], out_list
+             )
+             for idx, e in enumerate(mix_entries):
+                 j = mapping[idx]
+                 if j is not None:
+                     e["outs"][algo] = out_list[j]
+ 
+     if algos is None:
+         algos_to_run = sorted(
+             {algo for m in canon_mix for algo in (m.systems or {}).keys()}
+         )
+     else:
+         algos_to_run = list(algos)
+ 
+     exp_id = experiment_id or datetime.now().strftime("%Y%m%d_%H%M%S")
+     exp_root = os.path.join(RESULTS_ROOT, f"experiment_{exp_id}")
+     os.makedirs(exp_root, exist_ok=True)
+ 
+     params = {
+         "models": models,
+         "layer": layer,
+         "add_ci": add_ci,
+         "alpha": alpha,
+         "seed": seed,
+         "batch_size": BATCH_SIZE,
+         "ngpu": ngpu,
+         "max_gpus": max_gpus,
+     }
+ 
+     with open(os.path.join(exp_root, "params.json"), "w") as f:
+         json.dump(params, f, indent=2)
+ 
+     canon_struct = [
+         {
+             "mixture_id": m.mixture_id,
+             "references": [str(p) for p in m.refs],
+             "systems": {
+                 a: [str(p) for p in outs] for a, outs in (m.systems or {}).items()
+             },
+             "speaker_ids": m.speaker_ids,
+         }
+         for m in canon_mix
+     ]
+ 
+     with open(os.path.join(exp_root, "manifest_canonical.json"), "w") as f:
+         json.dump(canon_struct, f, indent=2)
+ 
+     print(f"Starting experiment {exp_id} with {ngpu} GPUs")
+     print(f"Results will be saved to: {exp_root}")
+ 
+     clear_gpu_memory()
+     get_gpu_memory_info(verbose)
+ 
+     flat_entries = [e for mix in mixture_entries for e in mix]
+     all_refs = {}
+ 
+     if verbose:
+         print("Loading reference signals...")
+     for e in flat_entries:
+         wav, _ = librosa.load(str(e["ref"]), sr=SR)
+         all_refs[e["id"]] = torch.from_numpy(loudness_normalize(wav))
+ 
+     if verbose:
+         print("Computing voiced masks...")
+ 
+     win = int(ENERGY_WIN_MS * SR / 1000)
+     hop = int(ENERGY_HOP_MS * SR / 1000)
+     voiced_mask_mix = []
+ 
+     for i, mix in enumerate(mixture_entries):
+         if verbose:
+             print(f"  Computing mask for mixture {i + 1}/{len(mixture_entries)}")
+ 
+         if ngpu > 0:
+             with torch.cuda.device(0):
+                 refs_for_mix = [all_refs[e["id"]].cuda() for e in mix]
+                 mask = make_union_voiced_mask(refs_for_mix, win, hop)
+                 voiced_mask_mix.append(mask.cpu())
+                 # Explicitly delete GPU tensors
+                 for ref in refs_for_mix:
+                     del ref
+                 torch.cuda.empty_cache()
+         else:
+             refs_for_mix = [all_refs[e["id"]].cpu() for e in mix]
+             mask = make_union_voiced_mask(refs_for_mix, win, hop)
+             voiced_mask_mix.append(mask.cpu())
+ 
+     ordered_speakers = [e["id"] for e in flat_entries]
+ 
+     for algo_idx, algo in enumerate(algos_to_run):
+         if verbose:
+             print(f"\nProcessing Algorithm {algo_idx + 1}/{len(algos_to_run)}: {algo}")
+ 
+         algo_dir = os.path.join(exp_root, algo)
+         os.makedirs(algo_dir, exist_ok=True)
+ 
+         all_outs = {}
+         missing = []
+ 
+         for mix_idx, mix in enumerate(mixture_entries):
+             for e in mix:
+                 assigned_path = e.get("outs", {}).get(algo)
+                 if assigned_path is None:
+                     missing.append((e["mixture"], e["id"]))
+                     continue
+ 
+                 wav, _ = librosa.load(str(assigned_path), sr=SR)
+                 all_outs[e["id"]] = torch.from_numpy(loudness_normalize(wav))
+ 
+         if missing:
+             msg = f"[{algo}] missing outputs for {len(missing)} speaker(s)"
+             if on_missing == "error":
+                 raise FileNotFoundError(msg)
+             else:
+                 if verbose:
+                     warnings.warn(msg + " Skipping those speakers.")
+ 
+         if not all_outs:
+             if verbose:
+                 warnings.warn(f"[{algo}] No outputs provided. Skipping algorithm.")
+             continue
+ 
+         ps_ts = {m: {s: [] for s in ordered_speakers} for m in models}
+         pm_ts = {m: {s: [] for s in ordered_speakers} for m in models}
+         ps_bias_ts = {m: {s: [] for s in ordered_speakers} for m in models}
+         ps_prob_ts = {m: {s: [] for s in ordered_speakers} for m in models}
+         pm_bias_ts = {m: {s: [] for s in ordered_speakers} for m in models}
+         pm_prob_ts = {m: {s: [] for s in ordered_speakers} for m in models}
+ 
+         for model_idx, mname in enumerate(models):
+             if verbose:
+                 print(f"  Processing Model {model_idx + 1}/{len(models)}: {mname}")
+ 
+             for metric_type in ["PS", "PM"]:
+                 clear_gpu_memory()
+                 gc.collect()
+ 
+                 model_wrapper, layer_eff = load_model(mname, layer, max_gpus)
+                 get_gpu_memory_info(verbose)
+ 
+                 embs_by_mix = {}
+                 labels_by_mix = {}
+ 
+                 for k, mix in enumerate(mixture_entries):
+                     speakers_this_mix = [e for e in mix if e["id"] in all_outs]
+                     if not speakers_this_mix:
+                         continue
+ 
+                     if verbose:
+                         print(
+                             f"Processing mixture {k + 1}/{len(mixture_entries)} for {metric_type}"
+                         )
+ 
+                     all_signals_mix = []
+                     all_masks_mix = []
+                     all_labels_mix = []
+ 
+                     for e in speakers_this_mix:
+                         s = e["id"]
+ 
+                         if metric_type == "PS":
+                             dists = [
+                                 loudness_normalize(d)
+                                 for d in apply_distortions(all_refs[s].numpy(), "all")
+                             ]
+                         else:
+                             dists = [
+                                 loudness_normalize(d)
+                                 for d in apply_adv_distortions(
+                                     all_refs[s].numpy(), "all"
+                                 )
+                             ]
+ 
+                         sigs = [all_refs[s].numpy(), all_outs[s].numpy()] + dists
+                         lbls = ["ref", "out"] + [f"d{i}" for i in range(len(dists))]
+ 
+                         masks = [voiced_mask_mix[k]] * len(sigs)
+                         all_signals_mix.extend(sigs)
+                         all_masks_mix.extend(masks)
+                         all_labels_mix.extend([f"{s}-{l}" for l in lbls])
+ 
+                     try:
+                         # Process in smaller batches
+                         batch_size = min(2, BATCH_SIZE)
+                         embeddings_list = []
+ 
+                         for i in range(0, len(all_signals_mix), batch_size):
+                             batch_sigs = all_signals_mix[i:i + batch_size]
+                             batch_masks = all_masks_mix[i:i + batch_size]
+ 
+                             batch_embs = embed_batch(
+                                 batch_sigs,
+                                 batch_masks,
+                                 model_wrapper,
+                                 layer_eff,
+                                 use_mlm=False,
+                             )
+ 
+                             if batch_embs.numel() > 0:
+                                 embeddings_list.append(batch_embs.cpu())
+ 
+                             torch.cuda.empty_cache()
+ 
+                         if embeddings_list:
+                             embeddings = torch.cat(embeddings_list, dim=0)
+                             embs_by_mix[k] = embeddings
+                             labels_by_mix[k] = all_labels_mix
+ 
+                     except Exception as ex:
+                         if verbose:
+                             print(f"    ERROR processing mixture {k + 1}: {ex}")
+                         continue
+                     finally:
+                         # Always clean up after processing a mixture
+                         del all_signals_mix, all_masks_mix
+                         if 'embeddings_list' in locals():
+                             del embeddings_list
+                         clear_gpu_memory()
+                         gc.collect()
+ 
+                 if verbose:
+                     print(f"  Computing {metric_type} scores for {mname}...")
+ 
+                 # Process mixtures with their stored embeddings and labels
+                 with ThreadPoolExecutor(
+                     max_workers=min(2, ngpu if ngpu > 0 else 1)
+                 ) as executor:
+                     for k in range(len(mixture_entries)):
+                         if k not in embs_by_mix:
+                             continue
+ 
+                         E, L, D = embs_by_mix[k].shape
+                         if L == 0:
+                             if verbose:
+                                 print(f"    WARNING: mixture {k + 1} produced 0 frames after masking; skipping.")
+                             continue
+ 
+                         # Get the labels for this mixture
+                         labels_for_mix = labels_by_mix[k]
+ 
+                         def process_frame(f, embeddings_mix, labels_mix):
+                             try:
+                                 frame_emb = embeddings_mix[:, f, :].detach().cpu().numpy()
+ 
+                                 if add_ci:
+                                     coords_d, coords_c, eigvals, k_sub_gauss = (
+                                         gpu_distributor.execute_on_gpu(
+                                             diffusion_map_torch,
+                                             frame_emb,
+                                             labels_mix,
+                                             alpha=alpha,
+                                             eig_solver="full",
+                                             return_eigs=True,
+                                             return_complement=True,
+                                             return_cval=add_ci,
+                                         )
+                                     )
+                                 else:
+                                     coords_d = gpu_distributor.execute_on_gpu(
+                                         diffusion_map_torch,
+                                         frame_emb,
+                                         labels_mix,
+                                         alpha=alpha,
+                                         eig_solver="full",
+                                         return_eigs=False,
+                                         return_complement=False,
+                                         return_cval=False,
+                                     )
+                                     coords_c = None
+                                     eigvals = None
+                                     k_sub_gauss = 1
+ 
+                                 if metric_type == "PS":
+                                     score = compute_ps(
+                                         coords_d, labels_mix, max_gpus
+                                     )
+                                     bias = prob = None
+                                     if add_ci:
+                                         bias, prob = ps_ci_components_full(
+                                             coords_d,
+                                             coords_c,
+                                             eigvals,
+                                             labels_mix,
+                                             delta=DEFAULT_DELTA_CI,
+                                         )
+                                     return f, "PS", score, bias, prob
+                                 else:
+                                     score = compute_pm(
+                                         coords_d, labels_mix, "gamma", max_gpus
+                                     )
+                                     bias = prob = None
+                                     if add_ci:
+                                         bias, prob = pm_ci_components_full(
+                                             coords_d,
+                                             coords_c,
+                                             eigvals,
+                                             labels_mix,
+                                             delta=DEFAULT_DELTA_CI,
+                                             K=k_sub_gauss,
+                                         )
+                                     return f, "PM", score, bias, prob
+ 
+                             except Exception as ex:
+                                 if verbose:
+                                     print(f"    ERROR frame {f + 1}: {ex}")
+                                 return None
+ 
+                         futures = [
+                             executor.submit(process_frame, f, embs_by_mix[k], labels_for_mix)
+                             for f in range(L)
+                         ]
+                         for fut in futures:
+                             result = fut.result()
+                             if result is None:
+                                 continue
+ 
+                             f, metric, score, bias, prob = result
+ 
+                             if metric == "PS":
+                                 for sp in score:
+                                     ps_ts[mname][sp].append(score[sp])
+                                     if add_ci and bias is not None:
+                                         ps_bias_ts[mname][sp].append(bias[sp])
+                                         ps_prob_ts[mname][sp].append(prob[sp])
+                             else:
+                                 for sp in score:
+                                     pm_ts[mname][sp].append(score[sp])
+                                     if add_ci and bias is not None:
+                                         pm_bias_ts[mname][sp].append(bias[sp])
+                                         pm_prob_ts[mname][sp].append(prob[sp])
+ 
+                 # Clean up after processing all mixtures for this metric
+                 del embs_by_mix, labels_by_mix
+                 clear_gpu_memory()
+                 gc.collect()
+ 
+                 del model_wrapper
+                 clear_gpu_memory()
+                 gc.collect()
+ 
+         if verbose:
+             print(f"  Saving results for {algo}...")
+ 
+         for m in models:
+ 
+             def _pad(vec, n):
+                 return vec + [np.nan] * (n - len(vec))
+ 
+             max_len = 0
+             for s in ordered_speakers:
+                 max_len = max(max_len, len(ps_ts[m][s]), len(pm_ts[m][s]))
+ 
+             pd.DataFrame(
+                 {s: _pad(ps_ts[m][s], max_len) for s in ordered_speakers}
+             ).to_csv(os.path.join(algo_dir, f"ps_scores_{m}.csv"), index=False)
+ 
+             pd.DataFrame(
+                 {s: _pad(pm_ts[m][s], max_len) for s in ordered_speakers}
+             ).to_csv(os.path.join(algo_dir, f"pm_scores_{m}.csv"), index=False)
+ 
+             if add_ci:
+                 ci_cols = {}
+                 for s in ordered_speakers:
+                     ci_cols[f"{s}_ps_bias"] = _pad(ps_bias_ts[m][s], max_len)
+                     ci_cols[f"{s}_ps_prob"] = _pad(ps_prob_ts[m][s], max_len)
+                     ci_cols[f"{s}_pm_bias"] = _pad(pm_bias_ts[m][s], max_len)
+                     ci_cols[f"{s}_pm_prob"] = _pad(pm_prob_ts[m][s], max_len)
+                 pd.DataFrame(ci_cols).to_csv(
+                     os.path.join(algo_dir, f"ci_{m}.csv"), index=False
+                 )
+ 
+         del all_outs
+         clear_gpu_memory()
+         gc.collect()
+ 
+     print(f"\nEXPERIMENT COMPLETED")
+     print(f"Results saved to: {exp_root}")
+ 
+     del all_refs, voiced_mask_mix
+ 
+     # Import and call the cleanup function
+     from models import cleanup_all_models
+     cleanup_all_models()
+ 
+     clear_gpu_memory()
+     get_gpu_memory_info(verbose)
+     gc.collect()
+ 
+     return exp_root
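
A minimal sketch of invoking the engine directly, mirroring what main.py does. Paths, the mixture id, and the system name are placeholders; the `mixtures` argument is assumed to follow the normalized structure produced by `argshield._read_manifest`:

```python
# Sketch: programmatic use of the engine (placeholder paths and names).
from engine import compute_mapss_measures

manifest = [{
    "mixture_id": "mix_001",
    "references": ["refs/speaker1.wav", "refs/speaker2.wav"],
    "systems": {"my_separator": ["outs/sep1.wav", "outs/sep2.wav"]},
}]

results_dir = compute_mapss_measures(
    models=["wavlm_base"],
    mixtures=manifest,
    layer=12,
    alpha=1.0,
    verbose=True,
)
print(f"Results saved to: {results_dir}")
```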
hf_readme.md ADDED
@@ -0,0 +1,136 @@
+ ---
+ title: MAPSS Multi Source Audio Perceptual Separation Scores
+ emoji: 🎵
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.0.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+ 
+ # MAPSS: Multi-source Audio Perceptual Separation Scores
+ 
+ Evaluate audio source separation quality using Perceptual Similarity (PS) and Perceptual Matching (PM) metrics.
+ 
+ ## Features
+ 
+ - **Perceptual Similarity (PS)**: Measures how similar separated outputs are to reference sources in perceptual embedding space
+ - **Perceptual Matching (PM)**: Evaluates robustness against a comprehensive set of audio distortions
+ - **Multiple embedding models**: Support for WavLM, Wav2Vec2, HuBERT, AST, and more
+ - **Automatic output-to-reference matching**: Uses correlation-based Hungarian algorithm
+ - **GPU-optimized processing**: Efficient batch processing with memory management
+ - **Diffusion maps**: Advanced dimensionality reduction for perceptual space analysis
+ 
+ ## Input Format
+ 
+ Upload a ZIP file containing:
+ ```
+ your_mixture.zip
+ ├── references/          # Original clean sources
+ │   ├── speaker1.wav
+ │   ├── speaker2.wav
+ │   └── ...
+ └── outputs/             # Separated outputs from your algorithm
+     ├── separated1.wav
+     ├── separated2.wav
+     └── ...
+ ```
+ 
+ ### Audio Requirements
+ - Format: WAV files
+ - Sample rate: Any (automatically resampled to 16 kHz)
+ - Channels: Mono or stereo (converted to mono)
+ - Number of files: Equal number of references and outputs
+ 
+ ## Output Format
+ 
+ The tool generates a ZIP file containing:
+ - `ps_scores_{model}.csv`: PS scores for each speaker/source (0-1, higher is better)
+ - `pm_scores_{model}.csv`: PM scores for each speaker/source (0-1, higher is better)
+ - `params.json`: Experiment parameters used
+ - `manifest_canonical.json`: File mapping and processing details
+ 
+ ### Score Interpretation
+ - **PS Score**: Perceptual Similarity
+   - 1.0 = Perfect separation (output identical to reference)
+   - 0.5 = Moderate separation quality
+   - 0.0 = Poor separation (output closer to other sources)
+ 
+ - **PM Score**: Perceptual Matching (robustness)
+   - 1.0 = Highly robust to distortions
+   - 0.5 = Moderate robustness
+   - 0.0 = Not robust (easily confused with distorted versions)
+ 
+ ## Available Models
+ 
+ | Model | Description | Default Layer | Use Case |
+ |-------|-------------|---------------|----------|
+ | `raw` | Raw waveform features | N/A | Baseline comparison |
+ | `wavlm` | WavLM Large | 24 | Best overall performance |
+ | `wav2vec2` | Wav2Vec2 Large | 24 | Strong performance |
+ | `hubert` | HuBERT Large | 24 | Good for speech |
+ | `wavlm_base` | WavLM Base | 12 | Faster, good quality |
+ | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster processing |
+ | `hubert_base` | HuBERT Base | 12 | Faster for speech |
+ | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
+ | `ast` | Audio Spectrogram Transformer | 12 | General audio |
+ 
+ ## Parameters
+ 
+ - **Model**: Select the embedding model for feature extraction
+ - **Layer**: Which transformer layer to use (auto-selected by default)
+ - **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
+   - 0.0 = No normalization
+   - 1.0 = Full normalization (recommended)
+ 
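+ ### Example CLI Usage
+ 
+ The same parameters are exposed by the repository's command-line entry point (file names below are illustrative):
+ 
+ ```bash
+ python main.py --manifest manifest.json --model wavlm_base --alpha 1.0 --verbose
+ ```
+ 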
+ ## How It Works
+ 
+ 1. **Feature Extraction**: Audio signals are processed through pre-trained self-supervised models to extract perceptual embeddings
+ 2. **Voice Activity Detection**: Automatic detection of voiced segments using energy-based masking
+ 3. **Diffusion Maps**: Embeddings are projected using diffusion maps for robust dimensionality reduction
+ 4. **PS Computation**: Measures Mahalanobis distance between separated outputs and references vs. other sources
+ 5. **PM Computation**: Evaluates against comprehensive distortions including:
+    - Noise (white, pink, brown at various SNRs)
+    - Filtering (lowpass, highpass, notch, comb)
+    - Effects (reverb, echo, tremolo, vibrato)
+    - Distortions (clipping, pitch shift, time stretch)
+ 6. **Scoring**: Frame-level scores are computed and aggregated
+ 
+ ## Technical Details
+ 
+ - **Loudness normalization**: ITU-R BS.1770 standard (-23 LUFS)
+ - **Frame-based processing**: 20 ms windows with 20 ms hop
+ - **Correlation-based assignment**: Hungarian algorithm for optimal matching
+ - **Memory optimization**: Batch processing with automatic GPU memory management
+ - **Robust statistics**: Covariance regularization and outlier handling
+ 
+ ## Citation
+ 
+ If you use MAPSS in your research, please cite:
+ 
+ ```bibtex
+ @article{mapss2024,
+   title={MAPSS: Multi-source Audio Perceptual Separation Scores},
+   author={Your Name},
+   journal={arXiv preprint},
+   year={2024}
+ }
+ ```
+ 
+ ## Limitations
+ 
+ - Processing time scales with audio length and model size
+ - Memory requirements depend on number of sources and audio length
+ - Currently optimized for speech separation (music separation support in development)
+ - Maximum recommended sources: 10 per mixture
+ 
+ ## License
+ 
+ Code: MIT License
+ Paper: CC-BY-4.0
+ 
+ ## Support
+ 
+ For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/yourusername/mapss).
hf_requirements.txt ADDED
@@ -0,0 +1,26 @@
+ # Core dependencies
+ gradio>=4.0.0
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ transformers>=4.35.0
+ accelerate>=0.24.0
+ 
+ # Audio processing
+ librosa>=0.10.0
+ soundfile>=0.12.0
+ pyloudnorm>=0.1.0
+ scipy>=1.11.0
+ numpy>=1.24.0
+ 
+ # Data handling
+ pandas>=2.0.0
+ 
+ # Model specific
+ safetensors>=0.4.0
+ sentencepiece>=0.1.99  # For some tokenizers
+ 
+ # Optional optimizations
+ triton>=2.1.0  # For faster attention if available
+ 
+ # Memory management
+ psutil>=5.9.0
__init__.py ADDED
@@ -0,0 +1,4 @@
+ from engine import compute_mapss_measures
+ 
+ __version__ = "1.0.0"
+ __all__ = ["compute_mapss_measures"]
main.py ADDED
@@ -0,0 +1,24 @@
+ from __future__ import annotations
+ from pathlib import Path
+ from engine import compute_mapss_measures
+ from argshield import _parse_args, _read_manifest, _validate_and_resolve, _validate_gpus
+ 
+ def main():
+     args = _parse_args()
+ 
+     manifest = _read_manifest(Path(args.manifest))
+     layer_final, alpha_final = _validate_and_resolve(args.model, args.layer, args.alpha)
+     max_gpus_final = _validate_gpus(args.max_gpus)
+ 
+     results_dir = compute_mapss_measures(
+         models=[args.model],
+         mixtures=manifest,
+         verbose=args.verbose,
+         max_gpus=max_gpus_final,
+         layer=layer_final,
+         alpha=alpha_final,
+     )
+     print(f"Results saved to: {results_dir}")
+ 
+ if __name__ == "__main__":
+     main()
metrics.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ from scipy.special import gammaincc
7
+ from scipy.stats import gamma
8
+
9
+ from config import COV_TOL, DEFAULT_DELTA_CI
10
+ from utils import get_gpu_count, mahalanobis_torch, safe_cov_torch
11
+
12
+
13
+ def pm_tail_gamma(d_out_sq, sq_dists):
14
+ """PM tail gamma exactly as original."""
15
+ mu = sq_dists.mean().item()
16
+ var = sq_dists.var(unbiased=True).item()
17
+ if var == 0.0:
18
+ return 1.0
19
+ k = (mu**2) / var
20
+ theta = var / mu
21
+ return float(1.0 - gamma.cdf(d_out_sq, a=k, scale=theta))
22
+
23
+
24
+ def pm_tail_rank(d_out_sq, sq_dists):
25
+ """PM tail rank exactly as original."""
26
+ rank = int((sq_dists < d_out_sq).sum().item())
27
+ n = sq_dists.numel()
28
+ return 1.0 - (rank + 0.5) / (n + 1.0)
29
+
30
+
31
+ def diffusion_map_torch(
32
+ X_np,
33
+ labels_by_mix,
34
+ *,
35
+ cutoff=0.99,
36
+ tol=1e-3,
37
+ diffusion_time=1,
38
+ alpha=0.0,
39
+ eig_solver="lobpcg",
40
+ k=None,
41
+ device=None,
42
+ return_eigs=False,
43
+ return_complement=False,
44
+ return_cval=False,
45
+ ):
46
+ """Diffusion map computation exactly as original."""
47
+ device = device or ("cuda:0" if torch.cuda.is_available() else "cpu")
48
+ X = torch.as_tensor(X_np, dtype=torch.float32, device=device)
49
+ N = X.shape[0]
50
+
51
+ if device != "cpu" and torch.cuda.is_available():
52
+ stream = torch.cuda.Stream(device=device)
53
+ ctx_dev = torch.cuda.device(device)
54
+ ctx_stream = torch.cuda.stream(stream)
55
+ else:
56
+ from contextlib import nullcontext
57
+
58
+ stream = None
59
+ ctx_dev = nullcontext()
60
+ ctx_stream = nullcontext()
61
+
62
+ with ctx_dev:
63
+ with ctx_stream:
64
+ if N > 1000:
65
+ chunk = min(500, N)
66
+ D2 = torch.zeros(N, N, device=device)
67
+ for i in range(0, N, chunk):
68
+ ei = min(i + chunk, N)
69
+ for j in range(0, N, chunk):
70
+ ej = min(j + chunk, N)
71
+ D2[i:ei, j:ej] = torch.cdist(X[i:ei], X[j:ej]).pow_(2)
72
+ else:
73
+ D2 = torch.cdist(X, X).pow_(2)
74
+
75
+ i, j = torch.triu_indices(
76
+ N, N, offset=1, device=None if device == "cpu" else device
77
+ )
78
+ eps = torch.median(D2[i, j])
79
+ K = torch.exp(-D2 / (2 * eps))
80
+ d = K.sum(dim=1)
81
+
82
+ if alpha != 0.0:
83
+ d_alpha_inv = d.pow(-alpha)
84
+ K *= d_alpha_inv[:, None] * d_alpha_inv[None, :]
85
+ d = K.sum(dim=1)
86
+
87
+ D_half_inv = torch.diag(torch.rsqrt(d))
88
+ K_sym = D_half_inv @ K @ D_half_inv
89
+
90
+ if eig_solver == "lobpcg":
91
+ m = k if k is not None else min(N - 1, 50)
92
+ init = torch.randn(N, m, device=device)
93
+ vals, vecs = torch.lobpcg(
94
+ K_sym, k=m, X=init, niter=200, tol=tol, largest=True
95
+ )
96
+ elif eig_solver == "full":
97
+ vals, vecs = torch.linalg.eigh(K_sym)
98
+ vals, vecs = vals.flip(0), vecs.flip(1)
99
+ if k is not None:
100
+ vecs = vecs[:, : k + 1]
101
+ vals = vals[: k + 1]
102
+ else:
103
+ raise ValueError(f"Unknown eig_solver '{eig_solver}'")
104
+
105
+ psi = vecs[:, 1:]
106
+ lam = vals[1:]
107
+ cum = torch.cumsum(lam, dim=0)
108
+ L = int((cum / cum[-1] < cutoff).sum().item()) + 1
109
+ lam_pow = lam.pow(diffusion_time)
110
+ psi_all = psi * lam_pow
111
+ Psi = psi_all[:, :L]
112
+ Psi_rest = psi_all[:, L:]
113
+
114
+ if return_cval:
115
+ indices_with_out = [
116
+ ii for ii, name in enumerate(labels_by_mix) if "out" in name
117
+ ]
118
+ valid_idx = torch.tensor(
119
+ [ii for ii in range(N) if ii not in indices_with_out], device=device
120
+ )
121
+ pi_min = d[valid_idx].min() / d[valid_idx].sum()
122
+ c_val = lam_pow[0] * pi_min.rsqrt() / math.log(2.0)
123
+
124
+ if stream is not None:
125
+ stream.synchronize()
126
+
127
+ if return_complement and return_eigs and return_cval:
128
+ return (
129
+ Psi.cpu().numpy(),
130
+ Psi_rest.cpu().numpy(),
131
+ lam.cpu().numpy(),
132
+ float(c_val),
133
+ )
134
+ if return_complement and return_eigs:
135
+ return Psi.cpu().numpy(), Psi_rest.cpu().numpy(), lam.cpu().numpy()
136
+ if return_complement:
137
+ return Psi.cpu().numpy(), Psi_rest.cpu().numpy()
138
+ if return_eigs:
139
+ return Psi.cpu().numpy(), lam.cpu().numpy()
140
+ return Psi.cpu().numpy()
141
+
142
+
143
+ def compute_ps(coords, labels, max_gpus=None):
144
+ ngpu = get_gpu_count(max_gpus)
145
+
146
+ if ngpu == 0:
147
+ coords_t = torch.tensor(coords)
148
+ spks_here = sorted({l.split("-")[0] for l in labels})
149
+ out = {}
150
+ for s in spks_here:
151
+ idxs = [i for i, l in enumerate(labels) if l.startswith(s)]
152
+ out_i = labels.index(f"{s}-out")
153
+ ref_is = [i for i in idxs if i != out_i]
154
+ mu = coords_t[ref_is].mean(0)
155
+ cov = safe_cov_torch(coords_t[ref_is])
156
+ inv = torch.linalg.inv(cov)
157
+ A = mahalanobis_torch(coords_t[out_i], mu, inv)
158
+ B_list = []
159
+ for o in spks_here:
160
+ if o == s:
161
+ continue
162
+ o_idxs = [
163
+ i
164
+ for i, l in enumerate(labels)
165
+ if l.startswith(o) and not l.endswith("-out")
166
+ ]
167
+ mu_o = coords_t[o_idxs].mean(0)
168
+ inv_o = torch.linalg.inv(safe_cov_torch(coords_t[o_idxs]))
169
+ B_list.append(mahalanobis_torch(coords_t[out_i], mu_o, inv_o))
170
+ B_min = torch.min(torch.stack(B_list)) if B_list else torch.tensor(0.0)
171
+ out[s] = (1 - A / (A + B_min + 1e-6)).item()
172
+ return out
173
+
174
+ # GPU version
175
+ device = min(ngpu - 1, 1) # Use second GPU if available
176
+ device_str = f"cuda:{device}"
177
+ coords_t = torch.tensor(coords, device=device_str)
178
+ spks_here = sorted({l.split("-")[0] for l in labels})
179
+ out = {}
180
+
181
+ stream = torch.cuda.Stream(device=device_str)
182
+ with torch.cuda.device(device):
183
+ with torch.cuda.stream(stream):
184
+ for s in spks_here:
185
+ idxs = [i for i, l in enumerate(labels) if l.startswith(s)]
186
+ out_i = labels.index(f"{s}-out")
187
+ ref_is = [i for i in idxs if i != out_i]
188
+ mu = coords_t[ref_is].mean(0)
189
+ cov = safe_cov_torch(coords_t[ref_is])
190
+ inv = torch.linalg.inv(cov)
191
+ A = mahalanobis_torch(coords_t[out_i], mu, inv)
192
+ B_list = []
193
+ for o in spks_here:
194
+ if o == s:
195
+ continue
196
+ o_idxs = [
197
+ i
198
+ for i, l in enumerate(labels)
199
+ if l.startswith(o) and not l.endswith("-out")
200
+ ]
201
+ mu_o = coords_t[o_idxs].mean(0)
202
+ inv_o = torch.linalg.inv(safe_cov_torch(coords_t[o_idxs]))
203
+ B_list.append(mahalanobis_torch(coords_t[out_i], mu_o, inv_o))
204
+ B_min = (
205
+ torch.min(torch.stack(B_list))
206
+ if B_list
207
+ else torch.tensor(0.0, device=device_str)
208
+ )
209
+ out[s] = (1 - A / (A + B_min + 1e-6)).item()
210
+ stream.synchronize()
211
+ return out
212
+
213
+
214
+ def compute_pm(coords, labels, pm_method, max_gpus=None):
215
+ ngpu = get_gpu_count(max_gpus)
216
+
217
+ if ngpu == 0:
218
+ coords_t = torch.tensor(coords)
219
+ spks_here = sorted({l.split("-")[0] for l in labels})
220
+ out = {}
221
+ for s in spks_here:
222
+ idxs = [i for i, l in enumerate(labels) if l.startswith(s)]
223
+ ref_i = labels.index(f"{s}-ref")
224
+ out_i = labels.index(f"{s}-out")
225
+ d_idx = [i for i in idxs if i not in {ref_i, out_i}]
226
+ if len(d_idx) < 2:
227
+ out[s] = 0.0
228
+ continue
229
+ ref_v = coords_t[ref_i]
230
+ dist = coords_t[d_idx] - ref_v
231
+ N, D = dist.shape
232
+ cov = dist.T @ dist / (N - 1)
233
+ if torch.linalg.matrix_rank(cov) < D:
234
+ cov += torch.eye(D) * COV_TOL
235
+ inv = torch.linalg.inv(cov)
236
+ sq_dists = torch.stack(
237
+ [mahalanobis_torch(coords_t[i], ref_v, inv) ** 2 for i in d_idx]
238
+ )
239
+ d_out_sq = float(mahalanobis_torch(coords_t[out_i], ref_v, inv) ** 2)
240
+ pm_score = (
241
+ pm_tail_rank(d_out_sq, sq_dists)
242
+ if pm_method == "rank"
243
+ else pm_tail_gamma(d_out_sq, sq_dists)
244
+ )
245
+ out[s] = float(np.clip(pm_score, 0.0, 1.0))
246
+ return out
247
+
248
+ # GPU version
249
+ device = min(ngpu - 1, 1)
250
+ device_str = f"cuda:{device}"
251
+ coords_t = torch.tensor(coords, device=device_str)
252
+ spks_here = sorted({l.split("-")[0] for l in labels})
253
+ out = {}
254
+
255
+ stream = torch.cuda.Stream(device=device_str)
256
+ with torch.cuda.device(device):
257
+ with torch.cuda.stream(stream):
258
+ for s in spks_here:
259
+ idxs = [i for i, l in enumerate(labels) if l.startswith(s)]
260
+ ref_i = labels.index(f"{s}-ref")
261
+ out_i = labels.index(f"{s}-out")
262
+ d_idx = [i for i in idxs if i not in {ref_i, out_i}]
263
+ if len(d_idx) < 2:
264
+ out[s] = 0.0
265
+ continue
266
+ ref_v = coords_t[ref_i]
267
+ dist = coords_t[d_idx] - ref_v
268
+ N, D = dist.shape
269
+ cov = dist.T @ dist / (N - 1)
270
+ if torch.linalg.matrix_rank(cov) < D:
271
+ cov += torch.eye(D, device=device_str) * COV_TOL
272
+ inv = torch.linalg.inv(cov)
273
+ sq_dists = torch.stack(
274
+ [mahalanobis_torch(coords_t[i], ref_v, inv) ** 2 for i in d_idx]
275
+ )
276
+ d_out_sq = float(mahalanobis_torch(coords_t[out_i], ref_v, inv) ** 2)
277
+ pm_score = (
278
+ pm_tail_rank(d_out_sq, sq_dists)
279
+ if pm_method == "rank"
280
+ else pm_tail_gamma(d_out_sq, sq_dists)
281
+ )
282
+ out[s] = float(np.clip(pm_score, 0.0, 1.0))
283
+ stream.synchronize()
284
+ return out
285
+
286
+
def pm_ci_components_full(
    coords_d, coords_rest, eigvals, labels, *, delta=0.05, K=1.0, C1=1.0, C2=1.0
):
    """Per-speaker PM confidence-interval components.

    Returns (bias_ci, prob_ci): the deterministic bias half-width and the
    probabilistic half-width of the PM score for each speaker. K, C1 and C2
    are retained for API compatibility.
    """
    _EPS = 1e-12

    def _safe_x(a, theta):
        return a / max(theta, _EPS)

    D = coords_d.shape[1]
    m = coords_rest.shape[1]
    if m == 0:
        z = {s: 0.0 for s in {l.split("-")[0] for l in labels}}
        return z.copy(), z.copy()

    X_d = torch.tensor(
        coords_d, device="cuda:0" if torch.cuda.is_available() else "cpu"
    )
    X_c = torch.tensor(
        coords_rest, device="cuda:0" if torch.cuda.is_available() else "cpu"
    )
    spk_ids = sorted({l.split("-")[0] for l in labels})
    bias_ci = {}
    prob_ci = {}

    for s in spk_ids:
        idxs = [i for i, l in enumerate(labels) if l.startswith(s)]
        ref_i = labels.index(f"{s}-ref")
        out_i = labels.index(f"{s}-out")
        dist_is = [i for i in idxs if i not in {ref_i, out_i}]
        n_p = len(dist_is)

        if n_p < 2:
            bias_ci[s] = 0.0
            prob_ci[s] = 0.0
            continue

        ref_d = X_d[ref_i]
        ref_c = X_c[ref_i]
        D_mat = X_d[dist_is] - ref_d
        C_mat = X_c[dist_is] - ref_c
        Sigma_d = safe_cov_torch(D_mat)
        Sigma_c = safe_cov_torch(C_mat)
        C_dc = D_mat.T @ C_mat / (n_p - 1)
        inv_Sigma_d = torch.linalg.inv(Sigma_d)

        # Schur complement of the dominant block, ridge-stabilised.
        S_i = (
            Sigma_c
            - C_dc.T @ inv_Sigma_d @ C_dc
            + torch.eye(X_c.shape[1], device=X_c.device) * 1e-9
        )
        S_inv = torch.linalg.inv(S_i)

        diff_out_d = X_d[out_i] - ref_d
        diff_out_c = X_c[out_i] - ref_c
        r_out = diff_out_c - C_dc.T @ inv_Sigma_d @ diff_out_d
        delta_Gi_a = float(r_out @ S_inv @ r_out)

        r_list = []
        for p in dist_is:
            d_p = X_d[p] - ref_d
            c_p = X_c[p] - ref_c
            r_p = c_p - C_dc.T @ inv_Sigma_d @ d_p
            r_list.append(r_p)
        R_p = torch.stack(r_list, dim=0)
        delta_Gi_p = torch.sum(R_p @ S_inv * R_p, dim=1)
        delta_Gi_mu_max = float(delta_Gi_p.max())

        mah_sq = torch.stack(
            [(X_d[i] - ref_d) @ inv_Sigma_d @ (X_d[i] - ref_d) for i in dist_is]
        )
        mu_g = float(mah_sq.mean())
        sigma2_g = float(mah_sq.var(unbiased=True) + 1e-12)
        sigma_g = math.sqrt(sigma2_g)

        full_sq = mah_sq + delta_Gi_p
        mu_full = float(full_sq.mean())
        sigma2_full = float(full_sq.var(unbiased=True) + 1e-12)

        if sigma2_g == 0.0:
            delta_Gi_k = delta_Gi_theta = 0.0
        else:
            factor = delta_Gi_mu_max * n_p / (n_p - 1)
            delta_Gi_k = factor * (mu_full + mu_g) / sigma2_g
            delta_Gi_theta = factor * (sigma2_full + sigma2_g) / (mu_g**2 + 1e-9)

        # Method-of-moments Gamma fit to the squared distances.
        k_d = (mu_g**2) / max(sigma2_g, 1e-12)
        theta_d = sigma2_g / max(mu_g, 1e-12)
        a_d = float(diff_out_d @ inv_Sigma_d @ diff_out_d)

        pm_center = gammaincc(k_d, _safe_x(a_d, theta_d))

        corner_vals = []
        for s_k in (-1, 1):
            for s_theta in (-1, 1):
                for s_a in (-1, 1):
                    k_c = max(k_d + s_k * delta_Gi_k, 1e-6)
                    theta_c = max(theta_d + s_theta * delta_Gi_theta, 1e-6)
                    a_c = max(a_d + s_a * delta_Gi_a, 1e-8)
                    corner_vals.append(gammaincc(k_c, _safe_x(a_c, theta_c)))

        bias_ci[s] = max(abs(v - pm_center) for v in corner_vals)

        # Probabilistic half-width (Bernstein-type concentration terms).
        R_sq = float(mah_sq.max()) + 1e-12
        log_term = math.log(6.0 / delta)
        eps_mu = math.sqrt(2 * sigma2_g * log_term / n_p) + 3 * R_sq * log_term / n_p
        eps_sigma = (
            math.sqrt(2 * R_sq**2 * log_term / n_p) + 3 * R_sq**2 * log_term / n_p
        )

        g1_x = 2.0 * mu_g / (sigma2_g + 1e-9)
        g1_y = -2.0 * mu_g**2 / (sigma_g**3 + 1e-9)
        g2_x = -sigma2_g / (mu_g**2 + 1e-9)
        g2_y = 2.0 * sigma_g / (mu_g + 1e-9)

        delta_k = min(abs(g1_x) * eps_mu + abs(g1_y) * eps_sigma, 0.5 * k_d)
        delta_theta = min(abs(g2_x) * eps_mu + abs(g2_y) * eps_sigma, 0.5 * theta_d)
        delta_a = min(R_sq * math.sqrt(2 * log_term / n_p), 0.5 * a_d + 1e-12)

        pm_corners = []
        for s_k in (-1, 1):
            for s_theta in (-1, 1):
                for s_a in (-1, 1):
                    k_c = k_d + s_k * delta_k
                    theta_c = theta_d + s_theta * delta_theta
                    a_c = max(a_d + s_a * delta_a, 1e-8)
                    pm_corners.append(gammaincc(k_c, _safe_x(a_c, theta_c)))

        prob_ci[s] = max(abs(pm - pm_center) for pm in pm_corners)

    return bias_ci, prob_ci


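# Usage sketch (hypothetical shapes): rows of coords_d / coords_rest are
# embedding coordinates, and labels mark each row as "<spk>-ref", "<spk>-out",
# or a distortion of <spk>; every speaker needs >= 2 distortion rows for a
# non-degenerate interval:
#   labels = ["a-ref", "a-d1", "a-d2", "a-d3", "a-out"]
#   bias_ci, prob_ci = pm_ci_components_full(
#       coords_d, coords_rest, eigvals, labels, delta=0.05
#   )

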
def ps_ci_components_full(coords_d, coords_rest, eigvals, labels, *, delta=0.05):
    """Per-speaker PS confidence-interval components.

    Returns (bias, prob): the deterministic bias half-width and the
    probabilistic half-width of the PS score for each speaker.
    """

    def _mean_dev(lam_max, delta, n_eff):
        return math.sqrt(2 * lam_max * math.log(2 / delta) / n_eff)

    def _rel_cov_dev(lam_max, trace, delta, n_eff, C=1.0):
        r = trace / lam_max
        abs_dev = (
            C * lam_max * (math.sqrt(r / n_eff) + (r + math.log(2 / delta)) / n_eff)
        )
        return abs_dev / lam_max

    def _maha_eps_m(a_hat, lam_min, lam_max, mean_dev, rel_cov_dev):
        term1 = 2 * math.sqrt(a_hat) * mean_dev * math.sqrt(lam_max / lam_min)
        term2 = a_hat * rel_cov_dev
        return term1 + term2

    D = coords_d.shape[1]
    m = coords_rest.shape[1]
    if m == 0:
        z = {s: 0.0 for s in {l.split("-")[0] for l in labels}}
        return z.copy(), z.copy()

    X_d = torch.tensor(
        coords_d, device="cuda:0" if torch.cuda.is_available() else "cpu"
    )
    X_c = torch.tensor(
        coords_rest, device="cuda:0" if torch.cuda.is_available() else "cpu"
    )
    spk_ids = sorted({l.split("-")[0] for l in labels})
    bias = {}
    prob = {}

    for s in spk_ids:
        idxs = [i for i, l in enumerate(labels) if l.startswith(s)]
        out_i = labels.index(f"{s}-out")
        ref_is = [i for i in idxs if i != out_i]

        mu_d = X_d[ref_is].mean(0)
        mu_c = X_c[ref_is].mean(0)
        Sigma_d = safe_cov_torch(X_d[ref_is])
        Sigma_c = safe_cov_torch(X_c[ref_is])
        C_dc = (X_d[ref_is] - mu_d).T @ (X_c[ref_is] - mu_c) / (len(ref_is) - 1)
        inv_Sd = torch.linalg.inv(Sigma_d)

        lam_min = torch.linalg.eigvalsh(Sigma_d).min().clamp_min(1e-9).item()
        lam_max = torch.linalg.eigvalsh(Sigma_d).max()
        trace = torch.trace(Sigma_d).item()

        diff_d = X_d[out_i] - mu_d
        diff_c = X_c[out_i] - mu_c
        A_d = float(mahalanobis_torch(X_d[out_i], mu_d, inv_Sd))

        r_i = diff_c - C_dc.T @ inv_Sd @ diff_d
        S_i = (
            Sigma_c
            - C_dc.T @ inv_Sd @ C_dc
            + torch.eye(X_c.shape[1], device=X_c.device) * 1e-9
        )
        term_i = math.sqrt(float(r_i @ torch.linalg.solve(S_i, r_i)))

        # Nearest competing speaker under the Mahalanobis metric.
        B_d, term_j = float("inf"), 0.0
        Sig_o = None
        for o in spk_ids:
            if o == s:
                continue
            o_idxs = [
                i
                for i, l in enumerate(labels)
                if l.startswith(o) and not l.endswith("-out")
            ]
            muo_d = X_d[o_idxs].mean(0)
            muo_c = X_c[o_idxs].mean(0)
            Sig_o_tmp = safe_cov_torch(X_d[o_idxs])
            inv_So = torch.linalg.inv(Sig_o_tmp)
            this_B = float(mahalanobis_torch(X_d[out_i], muo_d, inv_So))

            if this_B < B_d:
                B_d = this_B
                Sig_o = Sig_o_tmp
                diff_do = X_d[out_i] - muo_d
                diff_co = X_c[out_i] - muo_c
                C_oc = (
                    (X_d[o_idxs] - muo_d).T @ (X_c[o_idxs] - muo_c) / (len(o_idxs) - 1)
                )
                r_j = diff_co - C_oc.T @ inv_So @ diff_do
                S_j = (
                    safe_cov_torch(X_c[o_idxs])
                    - C_oc.T @ inv_So @ C_oc
                    + torch.eye(X_c.shape[1], device=X_c.device) * 1e-9
                )
                term_j = math.sqrt(float(r_j @ torch.linalg.solve(S_j, r_j)))

        denom = A_d + B_d
        bias[s] = (B_d * term_i + A_d * term_j) / (denom**2)

        if Sig_o is not None:
            lam_min_o = torch.linalg.eigvalsh(Sig_o).min().clamp_min(1e-9).item()
            lam_max_o = torch.linalg.eigvalsh(Sig_o).max().item()
            trace_o = torch.trace(Sig_o).item()

            n_eff = max(int(0.7 * len(ref_is)), 3)
            RIDGE = 0.05
            lam_min_eff = max(lam_min, RIDGE * lam_max.item())
            lam_min_o_eff = max(lam_min_o, RIDGE * lam_max_o)

            eps_i_sg = _maha_eps_m(
                A_d,
                lam_min_eff,
                lam_max.item(),
                _mean_dev(lam_max.item(), delta / 2, n_eff),
                _rel_cov_dev(lam_max.item(), trace, delta / 2, n_eff),
            )
            eps_j_sg = _maha_eps_m(
                B_d,
                lam_min_o_eff,
                lam_max_o,
                _mean_dev(lam_max_o, delta / 2, n_eff),
                _rel_cov_dev(lam_max_o, trace_o, delta / 2, n_eff),
            )

            grad_l2 = math.hypot(A_d, B_d) / (A_d + B_d) ** 2
            ps_radius = grad_l2 * math.hypot(eps_i_sg, eps_j_sg)
            prob[s] = min(1.0, ps_radius)
        else:
            prob[s] = 0.0

    return bias, prob
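

# Usage sketch for the PS counterpart (assumes >= 2 speakers so a competing
# class exists; arrays are NumPy, shapes (n_rows, D) and (n_rows, m)):
#   labels = ["a-ref", "a-d1", "a-d2", "a-out",
#             "b-ref", "b-d1", "b-d2", "b-out"]
#   bias, prob = ps_ci_components_full(coords_d, coords_rest, eigvals, labels)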
models.py ADDED
@@ -0,0 +1,333 @@
import queue
import threading
import gc

import torch
import torch.nn.functional as F
from transformers import (
    HubertModel,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Model,
    WavLMModel,
    ASTModel,
    AutoFeatureExtractor,
)

from config import BATCH_SIZE, ENERGY_HOP_MS, ENERGY_WIN_MS, SR
from utils import get_gpu_count


class BalancedDualGPUModel:
    """Replicates one HF model on up to two GPUs and feeds them from
    per-device worker threads so sub-batches are embedded in parallel."""

    def __init__(self, model_name, layer, max_gpus=None):
        self.layer = layer
        self.models = []
        self.extractors = []
        self.devices = []
        ngpu = get_gpu_count(max_gpus)

        for gpu_id in range(min(ngpu, 2)):
            device = f"cuda:{gpu_id}"
            self.devices.append(device)
            ckpt, cls, _ = get_model_config(layer)[model_name]
            if cls is ASTModel:
                extractor = AutoFeatureExtractor.from_pretrained(ckpt)
            else:
                extractor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt)

            attn_impl = "eager" if cls in (WavLMModel, ASTModel) else "sdpa"
            model = cls.from_pretrained(
                ckpt,
                output_hidden_states=True,
                use_safetensors=True,
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                attn_implementation=attn_impl,
            )
            model.eval()
            model = model.to(device)

            for param in model.parameters():
                param.requires_grad = False

            self.extractors.append(extractor)
            self.models.append(model)

        self.gpu_queues = [queue.Queue() for _ in range(len(self.devices))]
        self.result_queue = queue.Queue()
        self.workers = []
        for i in range(len(self.devices)):
            worker = threading.Thread(target=self._gpu_worker, args=(i,))
            worker.daemon = True
            worker.start()
            self.workers.append(worker)

    def _gpu_worker(self, gpu_id):
        device = self.devices[gpu_id]
        model = self.models[gpu_id]
        extractor = self.extractors[gpu_id]
        while True:
            task = self.gpu_queues[gpu_id].get()
            if task is None:  # sentinel from cleanup()
                break
            signals, masks, use_mlm, task_id = task
            try:
                inputs = extractor(
                    signals, sampling_rate=SR, return_tensors="pt", padding=True
                )
                input_values = inputs.input_values.to(device, non_blocking=True)

                torch.cuda.empty_cache()

                orig_mode = model.training
                model.train() if use_mlm else model.eval()
                with torch.no_grad():
                    with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
                        hs = model(
                            input_values, output_hidden_states=True
                        ).hidden_states[self.layer]
                model.train(orig_mode)

                B, T, D = hs.shape
                keep = []
                for b in range(B):
                    # Resample the audio-domain mask to the frame rate T.
                    mask_b = masks[b].float().unsqueeze(0).unsqueeze(0).to(device)
                    mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
                    keep.append(hs[b][mask_t].cpu())

                # Aggressive cleanup
                del hs, input_values, inputs
                torch.cuda.empty_cache()

                if keep:
                    L_max = max(x.shape[0] for x in keep)
                    keep_padded = [
                        F.pad(x, (0, 0, 0, L_max - x.shape[0])) for x in keep
                    ]
                    result = torch.stack(keep_padded, dim=0)
                else:
                    result = torch.empty(0, 0, 0)
                self.result_queue.put((task_id, result))
            except Exception as e:
                self.result_queue.put((task_id, e))
            finally:
                # Always clear cache after processing
                torch.cuda.empty_cache()

    def process_batch(self, signals, masks, use_mlm=False):
        if not signals:
            return torch.empty(0, 0, 0)
        batch_size = len(signals)
        split = (batch_size + len(self.devices) - 1) // len(self.devices)
        results = {}
        task_id = 0
        for i in range(0, batch_size, split):
            end = min(i + split, batch_size)
            gpu_id = (i // split) % len(self.devices)
            self.gpu_queues[gpu_id].put(
                (signals[i:end], masks[i:end], use_mlm, task_id)
            )
            task_id += 1
        for _ in range(task_id):
            tid, result = self.result_queue.get()
            if isinstance(result, Exception):
                raise result
            results[tid] = result
        parts = [results[i] for i in range(task_id) if results[i].numel() > 0]
        if not parts:
            return torch.empty(0, 0, 0)
        # Pad sub-batches to a common frame length so concatenation is valid
        # even when tasks produced different numbers of kept frames.
        T_max = max(p.shape[1] for p in parts)
        parts = [F.pad(p, (0, 0, 0, T_max - p.shape[1])) for p in parts]
        return torch.cat(parts, dim=0)

    def cleanup(self):
        """Explicit cleanup: stop workers, drop model references, free VRAM."""
        for q in self.gpu_queues:
            q.put(None)
        for w in self.workers:
            w.join(timeout=5.0)
        for model in self.models:
            del model
        for extractor in self.extractors:
            del extractor
        self.models.clear()
        self.extractors.clear()
        torch.cuda.empty_cache()
        gc.collect()

    def __del__(self):
        self.cleanup()


# NO CACHE - we need to clean up models properly between runs
def get_model_config(layer):
    return {
        "raw": (None, None, None),
        "wavlm": ("microsoft/wavlm-large", WavLMModel, layer),
        "wav2vec2": ("facebook/wav2vec2-large-lv60", Wav2Vec2Model, layer),
        "hubert": ("facebook/hubert-large-ll60k", HubertModel, layer),
        "wavlm_base": ("microsoft/wavlm-base", WavLMModel, layer),
        "wav2vec2_base": ("facebook/wav2vec2-base", Wav2Vec2Model, layer),
        "hubert_base": ("facebook/hubert-base-ls960", HubertModel, layer),
        "wav2vec2_xlsr": ("facebook/wav2vec2-large-xlsr-53", Wav2Vec2Model, layer),
        "ast": ("MIT/ast-finetuned-audioset-10-10-0.4593", ASTModel, layer),
    }


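# Lookup sketch: the table maps a model key to (checkpoint, model class,
# hidden-state layer); e.g. with layer 24 (the last hidden state of the
# large models):
#   ckpt, cls, lyr = get_model_config(24)["wavlm"]
#   # -> ("microsoft/wavlm-large", WavLMModel, 24)

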
# Store loaded models globally to properly manage them
_loaded_models = {}


def load_model(name, layer, max_gpus=None):
    global _loaded_models

    # Clean up any previously loaded models first
    if _loaded_models:
        for key, model_data in _loaded_models.items():
            if isinstance(model_data, tuple) and len(model_data) == 2:
                if isinstance(model_data[0], BalancedDualGPUModel):
                    model_data[0].cleanup()
                elif isinstance(model_data[0], tuple):
                    # Single-GPU model: drop the local reference; the dict
                    # entry itself is released by clear() below.
                    _, model = model_data[0]
                    del model
        _loaded_models.clear()
        torch.cuda.empty_cache()
        gc.collect()

    if name.lower() in {"raw", "waveform"}:
        return "raw", layer

    ngpu = get_gpu_count(max_gpus)
    if ngpu > 1:
        model = BalancedDualGPUModel(name, layer, max_gpus)
        _loaded_models[name] = (model, layer)
        return model, layer
    else:
        ckpt, cls, layer_eff = get_model_config(layer)[name]
        if cls is ASTModel:
            extractor = AutoFeatureExtractor.from_pretrained(ckpt)
        else:
            extractor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt)

        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        attn_impl = "eager" if cls in (WavLMModel, ASTModel) else "sdpa"
        model = cls.from_pretrained(
            ckpt,
            output_hidden_states=True,
            use_safetensors=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            attn_implementation=attn_impl,
        )
        model.eval()
        model = model.to(device)

        for param in model.parameters():
            param.requires_grad = False

        model_tuple = ((extractor, model), layer_eff)
        _loaded_models[name] = model_tuple
        return (extractor, model), layer_eff


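# Usage sketch (assumes a CUDA host; "hubert" / layer 24 are example
# arguments):
#   wrapper, layer = load_model("hubert", 24, max_gpus=1)
#   emb = embed_batch(signals, masks_audio, wrapper, layer)  # (B, T_max, D)
# load_model("raw", None) short-circuits to the energy-frame path below.

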
def cleanup_all_models():
    """Call this at the end of each experiment to ensure complete cleanup"""
    global _loaded_models
    if _loaded_models:
        for key, model_data in _loaded_models.items():
            if isinstance(model_data, tuple) and len(model_data) == 2:
                if isinstance(model_data[0], BalancedDualGPUModel):
                    model_data[0].cleanup()
                elif isinstance(model_data[0], tuple):
                    # Single GPU model
                    _, model = model_data[0]
                    del model
        _loaded_models.clear()
        torch.cuda.empty_cache()
        gc.collect()


def embed_batch_raw(signals, masks_audio):
    win = int(ENERGY_WIN_MS * SR / 1000)
    hop = int(ENERGY_HOP_MS * SR / 1000)
    reps, L_max = [], 0
    for sig_np, mask_np in zip(signals, masks_audio):
        x = torch.as_tensor(sig_np[:-1], dtype=torch.float32)
        frames = x.unfold(0, win, hop)
        mask = torch.as_tensor(mask_np[: len(frames)], dtype=torch.bool)
        keep = frames[mask] if mask.any() else frames[:1]
        reps.append(keep)
        L_max = max(L_max, keep.size(0))
    reps = [F.pad(r, (0, 0, 0, L_max - r.size(0))) for r in reps]
    return torch.stack(reps, dim=0)


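# Frame-count sketch: with SR = 16000 and, say, ENERGY_WIN_MS = 25 /
# ENERGY_HOP_MS = 10 (the actual values live in config.py), win = 400 and
# hop = 160, so a 1 s signal yields floor((15999 - 400) / 160) + 1 = 98 rows
# of raw samples before masking.

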
def embed_batch_single_gpu(
    signals, masks_audio, extractor, model, layer, use_mlm=False
):
    if not signals:
        return torch.empty(0, 0, 0)
    device = next(model.parameters()).device

    # Small micro-batches keep peak VRAM low on a single device.
    max_batch = 2
    all_keeps = []

    for i in range(0, len(signals), max_batch):
        batch_signals = signals[i:i + max_batch]
        batch_masks = masks_audio[i:i + max_batch]

        inputs = extractor(
            batch_signals, sampling_rate=SR, return_tensors="pt", padding=True
        )
        input_values = inputs.input_values.to(device, non_blocking=True)

        orig_mode = model.training
        model.train() if use_mlm else model.eval()

        with torch.no_grad():
            with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
                hs = model(input_values, output_hidden_states=True).hidden_states[layer]
        model.train(orig_mode)

        B, T, D = hs.shape
        for b in range(B):
            mask_b = batch_masks[b].float().unsqueeze(0).unsqueeze(0).to(device)
            mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
            all_keeps.append(hs[b][mask_t].cpu())

        # Aggressive cleanup
        del hs, input_values, inputs
        torch.cuda.empty_cache()

    if all_keeps:
        L_max = max(x.shape[0] for x in all_keeps)
        keep_padded = [F.pad(x, (0, 0, 0, L_max - x.shape[0])) for x in all_keeps]
        result = torch.stack(keep_padded, dim=0)
        # Clean up intermediate lists
        del all_keeps, keep_padded
        return result
    else:
        return torch.empty(0, 0, 0)


def embed_batch(signals, masks_audio, model_wrapper, layer, use_mlm=False):
    if model_wrapper == "raw":
        return embed_batch_raw(signals, masks_audio)
    if isinstance(model_wrapper, BalancedDualGPUModel):
        all_embeddings = []
        batch_size = min(BATCH_SIZE, 2)
        for i in range(0, len(signals), batch_size):
            batch_emb = model_wrapper.process_batch(
                signals[i: i + batch_size], masks_audio[i: i + batch_size], use_mlm
            )
            if batch_emb.numel() > 0:
                all_embeddings.append(batch_emb)
            # Clear cache after each batch
            torch.cuda.empty_cache()

        if all_embeddings:
            # Pad per-call results to a common frame length before
            # concatenating across batches.
            T_max = max(e.shape[1] for e in all_embeddings)
            all_embeddings = [
                F.pad(e, (0, 0, 0, T_max - e.shape[1])) for e in all_embeddings
            ]
            return torch.cat(all_embeddings, dim=0)
        else:
            return torch.empty(0, 0, 0)
    else:
        extractor, model = model_wrapper
        return embed_batch_single_gpu(
            signals, masks_audio, extractor, model, layer, use_mlm=use_mlm
        )
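

# Dispatch summary: embed_batch routes "raw" to energy frames,
# BalancedDualGPUModel wrappers to the threaded multi-GPU path, and
# (extractor, model) tuples to the single-GPU path; each returns a
# zero-padded (B, T_max, D) tensor on CPU.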
utils.py ADDED
@@ -0,0 +1,231 @@
import gc
import threading
import warnings
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import torch
try:
    from scipy.optimize import linear_sum_assignment as _lsa
except Exception:
    _lsa = None

warnings.filterwarnings("ignore", message="Some weights of Wav2Vec2Model")


def get_gpu_count(max_gpus=None):
    ngpu = torch.cuda.device_count()
    if max_gpus is not None:
        ngpu = min(ngpu, max_gpus)
    return ngpu


def clear_gpu_memory():
    """Enhanced GPU memory clearing"""
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            with torch.cuda.device(i):
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
        gc.collect()
        torch.cuda.empty_cache()


def get_gpu_memory_info(verbose=False):
    if not verbose:
        return
    for i in range(torch.cuda.device_count()):
        try:
            free_b, total_b = torch.cuda.mem_get_info(i)  # type: ignore[attr-defined]
            free_gb = free_b / 1024**3
            total_gb = total_b / 1024**3
        except Exception:
            total_gb = torch.cuda.get_device_properties(i).total_memory / 1024**3
            free_gb = total_gb - (torch.cuda.memory_reserved(i) / 1024**3)
        mem_allocated = torch.cuda.memory_allocated(i) / 1024**3
        print(
            f"GPU {i}: {mem_allocated:.2f}GB allocated, "
            f"{free_gb:.2f}GB free / {total_gb:.2f}GB total"
        )


def write_wav_16bit(path, x, sr=16000):
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    try:
        import soundfile as sf

        sf.write(str(path), x.astype(np.float32), sr)
    except Exception:
        from scipy.io.wavfile import write

        write(str(path), sr, (np.clip(x, -1, 1) * 32767).astype(np.int16))


def safe_corr_np(a, b):
    L = min(len(a), len(b))
    if L <= 1:
        return 0.0
    a = a[:L].astype(np.float64)
    b = b[:L].astype(np.float64)
    a -= a.mean()
    b -= b.mean()
    da = a.std()
    db = b.std()
    if da <= 1e-12 or db <= 1e-12:
        return 0.0
    r = float((a * b).mean() / (da * db))
    return max(-1.0, min(1.0, r))


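# Sketch: safe_corr_np is a clipped Pearson correlation that returns 0.0
# (rather than NaN) for constant or length-<=1 inputs, e.g.
#   safe_corr_np(np.array([0.0, 1.0, 2.0]), np.array([0.0, 2.0, 4.0]))  # 1.0

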
def hungarian(cost):
    try:
        if _lsa is not None:
            return _lsa(cost)
        raise RuntimeError("scipy.optimize.linear_sum_assignment unavailable")
    except Exception:
        # Greedy row-by-row fallback: not globally optimal, but dependency-free.
        used = set()
        rows, cols = [], []
        for i in range(cost.shape[0]):
            j = int(
                np.argmin(
                    [
                        cost[i, k] if k not in used else 1e12
                        for k in range(cost.shape[1])
                    ]
                )
            )
            used.add(j)
            rows.append(i)
            cols.append(j)
        return np.asarray(rows), np.asarray(cols)


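# Sketch: with SciPy present the assignment is exact, e.g.
#   hungarian(np.array([[4.0, 1.0], [2.0, 0.0]]))
#   # -> (array([0, 1]), array([1, 0])), total cost 3
# The greedy fallback happens to agree here but is only an approximation.

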
class GPUWorkDistributor:

    def __init__(self, max_gpus=None):
        ngpu = get_gpu_count(max_gpus)
        self.gpu_locks = [threading.Lock() for _ in range(max(1, min(ngpu, 2)))]
        self.gpu_load = [0 for _ in range(max(1, min(ngpu, 2)))]
        self.ngpu = ngpu

    def get_least_loaded_gpu(self):
        return int(np.argmin(self.gpu_load))

    def execute_on_gpu(self, func, *args, **kwargs):
        if self.ngpu == 0:
            kwargs.pop("device", None)
            return func(*args, **kwargs)
        gid = self.get_least_loaded_gpu()
        with self.gpu_locks[gid]:
            self.gpu_load[gid] += 1
            try:
                with torch.cuda.device(gid):
                    kwargs["device"] = f"cuda:{gid}"
                    result = func(*args, **kwargs)
                    # Clear cache after execution
                    torch.cuda.empty_cache()
                    return result
            finally:
                self.gpu_load[gid] -= 1


@dataclass
class Mixture:

    mixture_id: str
    refs: list[Path]
    systems: dict[str, list[Path]]
    speaker_ids: list[str]


def canonicalize_mixtures(mixtures, systems=None):
    canon = []
    if not mixtures:
        return canon

    def as_paths(seq):
        return [p if isinstance(p, Path) else Path(str(p)) for p in seq]

    def speaker_id_from_ref(ref_path, idx, mixture_id):
        stem = (ref_path.stem or "").strip()
        if not stem:
            stem = f"spk{idx:02d}"
        return f"{mixture_id}__{stem}"

    if isinstance(mixtures[0], dict):
        for m in mixtures:
            mid = str(m.get("mixture_id") or m.get("id") or "").strip()
            if not mid:
                raise ValueError("Each mixture must include 'mixture_id'.")
            refs = as_paths(m.get("references", []))
            if not refs:
                raise ValueError(f"Mixture {mid}: 'references' must be non-empty.")
            sysmap = {}
            if isinstance(m.get("systems"), dict):
                for algo, outs in m["systems"].items():
                    sysmap[str(algo)] = as_paths(outs)
            spk_ids = [speaker_id_from_ref(r, i, mid) for i, r in enumerate(refs)]
            canon.append(Mixture(mid, refs, sysmap, spk_ids))
        return canon

    if isinstance(mixtures[0], list):
        for i, group in enumerate(mixtures):
            mid = f"mix_{i:03d}"
            refs, spk_ids = [], []
            for d in group:
                if not isinstance(d, dict) or "ref" not in d or "id" not in d:
                    raise ValueError(
                        "Legacy mixtures expect dicts with 'id' and 'ref'."
                    )
                rp = Path(d["ref"])
                refs.append(rp)
                spk_ids.append(f"{mid}__{str(d['id']).strip()}")
            sysmap = {}
            if systems:
                for algo, per_mix in systems.items():
                    if mid in per_mix:
                        sysmap[algo] = as_paths(per_mix[mid])
            canon.append(Mixture(mid, refs, sysmap, spk_ids))
        return canon

    raise ValueError("Unsupported 'mixtures' format.")


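# Sketch: dict-form input (paths are hypothetical) ->
#   canonicalize_mixtures([{
#       "mixture_id": "mix_000",
#       "references": ["ref/spk1.wav", "ref/spk2.wav"],
#       "systems": {"sysA": ["out/spk1.wav", "out/spk2.wav"]},
#   }])
# yields one Mixture with speaker_ids ["mix_000__spk1", "mix_000__spk2"].

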
def random_misalign(sig, sr, max_ms, mode="single", rng=None):
    import random

    if rng is None:
        rng = random
    max_samples = int(sr * max_ms / 1000)
    if max_samples == 0:
        return sig
    shift = (
        rng.randint(-max_samples, max_samples) if mode == "range" else int(max_samples)
    )
    if shift == 0:
        return sig
    if isinstance(sig, torch.Tensor):
        z = torch.zeros(abs(shift), dtype=sig.dtype, device=sig.device)
        return (
            torch.cat([z, sig[:-shift]]) if shift > 0 else torch.cat([sig[-shift:], z])
        )
    else:
        z = np.zeros(abs(shift), dtype=sig.dtype)
        return (
            np.concatenate([z, sig[:-shift]])
            if shift > 0
            else np.concatenate([sig[-shift:], z])
        )


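# Sketch: shift a 16 kHz signal right by exactly 10 ms (160 samples),
# zero-padding the head; mode="range" instead draws the shift uniformly
# from [-160, 160]:
#   y = random_misalign(x, sr=16000, max_ms=10, mode="single")

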
def safe_cov_torch(X):
    Xc = X - X.mean(dim=0, keepdim=True)
    cov = Xc.T @ Xc / (Xc.shape[0] - 1)
    if torch.linalg.matrix_rank(cov) < cov.shape[0]:
        cov += torch.eye(cov.shape[0], device=cov.device) * 1e-6
    return cov


def mahalanobis_torch(x, mu, inv):
    diff = x - mu
    diff_T = diff.transpose(-1, -2) if diff.ndim >= 2 else diff
    return torch.sqrt(diff @ inv @ diff_T + 1e-6)
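

if __name__ == "__main__":
    # Minimal smoke test (a sketch, CPU-only): Mahalanobis distance of one
    # point from a small Gaussian cloud using the two helpers above.
    X = torch.randn(32, 4, dtype=torch.float64)
    inv = torch.linalg.inv(safe_cov_torch(X))
    d = mahalanobis_torch(X[0], X.mean(dim=0), inv)
    print(f"mahalanobis distance: {float(d):.3f}")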