Upload folder using huggingface_hub
Browse files- .DS_Store +0 -0
- .gitattributes +1 -0
- Beethoven_WoO80_var27_8bars_3_15.wav +3 -0
- Dockerfile +57 -0
- __pycache__/constants.cpython-312.pyc +0 -0
- __pycache__/handler.cpython-312.pyc +0 -0
- checkpoints/.gitignore +1 -0
- checkpoints/fold0/best.ckpt +3 -0
- checkpoints/fold1/best.ckpt +3 -0
- checkpoints/fold2/best.ckpt +3 -0
- checkpoints/fold3/best.ckpt +3 -0
- checkpoints/fold_0/best.ckpt +3 -0
- checkpoints/fold_1/best.ckpt +3 -0
- checkpoints/fold_2/best.ckpt +3 -0
- checkpoints/fold_3/best.ckpt +3 -0
- constants.py +69 -0
- handler.py +247 -0
- models/__init__.py +27 -0
- models/__pycache__/__init__.cpython-312.pyc +0 -0
- models/__pycache__/inference.cpython-312.pyc +0 -0
- models/__pycache__/loader.cpython-312.pyc +0 -0
- models/calibration.py +119 -0
- models/inference.py +79 -0
- models/loader.py +189 -0
- preprocessing/__init__.py +15 -0
- preprocessing/__pycache__/__init__.cpython-312.pyc +0 -0
- preprocessing/__pycache__/audio.cpython-312.pyc +0 -0
- preprocessing/audio.py +130 -0
- requirements.txt +19 -0
- sync_checkpoints.sh +51 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Beethoven_WoO80_var27_8bars_3_15.wav filter=lfs diff=lfs merge=lfs -text
|
Beethoven_WoO80_var27_8bars_3_15.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46cbac4d34b53eb2cd7bc83e0d2e1f44dbecb02a5471b6e0ca444d0ff29251c2
|
| 3 |
+
size 2531508
|
Dockerfile
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1
# A1-Max MuQ L9-12 Inference Handler
# HuggingFace Inference Endpoints container for piano performance analysis

FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04

# Use bash with pipefail so failures in `curl | sh` pipelines fail the build.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Install system dependencies.
# DEBIAN_FRONTEND is set per-RUN (not via ENV) so it does not leak into the
# runtime environment of the container.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        ffmpeg \
        git \
        libsndfile1 \
        python3.11 \
        python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Set Python 3.11 as default
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1

# Install uv (fast Python package installer)
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"

WORKDIR /app

# Install Python dependencies with uv.
# requirements.txt is copied alone so this layer stays cached until the
# dependency list itself changes.
COPY requirements.txt .
RUN uv pip install --system --no-cache -r requirements.txt

# Cache locations MUST be set before the model pre-download below, otherwise
# the weights land in /root/.cache at build time while the runtime reads
# /app/.cache and re-downloads them on every cold start.
# TRANSFORMERS_CACHE is deprecated in favor of HF_HOME but kept for
# compatibility with older transformers versions.
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/app/.cache/huggingface \
    TRANSFORMERS_CACHE=/app/.cache/huggingface

# Pre-download HuggingFace models (cached in image) - MuQ only
RUN python3 -c "\
print('Downloading MuQ-large-msd-iter...'); \
from muq import MuQ; \
MuQ.from_pretrained('OpenMuQ/MuQ-large-msd-iter'); \
print('Done!'); \
"

# Copy application code after dependencies so source edits do not invalidate
# the (expensive) dependency and model-download layers.
COPY constants.py .
COPY handler.py .
COPY models/ ./models/
COPY preprocessing/ ./preprocessing/

# Create checkpoints directory structure (populated at deploy time)
RUN mkdir -p /app/checkpoints/fold0 /app/checkpoints/fold1 /app/checkpoints/fold2 /app/checkpoints/fold3

# HuggingFace Inference Endpoints expects handler.py
# The EndpointHandler class will be automatically detected
|
__pycache__/constants.cpython-312.pyc
ADDED
|
Binary file (1.63 kB). View file
|
|
|
__pycache__/handler.cpython-312.pyc
ADDED
|
Binary file (9.12 kB). View file
|
|
|
checkpoints/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*
|
checkpoints/fold0/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b0fe082f7928a1d180313643d21ec83c82f392c01a4e8a2595cb55607306084
|
| 3 |
+
size 15869013
|
checkpoints/fold1/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee6e94e1367da7999d4ee9f68de8e31724b7957e0571d421a8087d42cec8a9c8
|
| 3 |
+
size 15869013
|
checkpoints/fold2/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:551924ff5d2692c129b86f750798d9532a288bc3e1d56dca2b897fe73672c717
|
| 3 |
+
size 15869013
|
checkpoints/fold3/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68188c86e5190e9a7663a69a013e7e9cc5cb793d177b8f1fb6252dae46607f24
|
| 3 |
+
size 15869013
|
checkpoints/fold_0/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edeed3bb972341076e06f2bacebfb40fd9ed5cba5e9f8de8f9bfe4e12c586d47
|
| 3 |
+
size 637877835
|
checkpoints/fold_1/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b53550d16677918954191bf2228a3210986f5ddfcd48010f0272e981eab838cb
|
| 3 |
+
size 637877899
|
checkpoints/fold_2/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:027592ad5a7a31ad1ae1628944842e3b5c4da96bb7f16a955f266a4b41a7b20b
|
| 3 |
+
size 637877899
|
checkpoints/fold_3/best.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb0de4d0aa81cae2ee8fe0b6a40a634624e75567443b343602b55d21fd80c49f
|
| 3 |
+
size 637877899
|
constants.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Constants for A1-Max MuQ LoRA inference handler."""
|
| 2 |
+
|
| 3 |
+
PERCEPIANO_DIMENSIONS = [
|
| 4 |
+
"dynamics",
|
| 5 |
+
"timing",
|
| 6 |
+
"pedaling",
|
| 7 |
+
"articulation",
|
| 8 |
+
"phrasing",
|
| 9 |
+
"interpretation",
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
# A1-Max model configuration
|
| 13 |
+
# MuQ embeddings (1024 dim) with attention pooling -> encoder -> regression head
|
| 14 |
+
MODEL_CONFIG = {
|
| 15 |
+
# MuQ configuration (layers to average)
|
| 16 |
+
"muq_layer_start": 9,
|
| 17 |
+
"muq_layer_end": 13, # Exclusive (layers 9, 10, 11, 12)
|
| 18 |
+
"muq_dim": 1024, # Per-layer hidden size (= input_dim)
|
| 19 |
+
# Head configuration
|
| 20 |
+
"input_dim": 1024,
|
| 21 |
+
"hidden_dim": 512,
|
| 22 |
+
"num_labels": 6,
|
| 23 |
+
"dropout": 0.2,
|
| 24 |
+
# Audio processing
|
| 25 |
+
"target_sr": 24000,
|
| 26 |
+
"max_frames": 1000,
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
# Model info for response
|
| 30 |
+
MODEL_INFO = {
|
| 31 |
+
"name": "A1-Max MuQ LoRA",
|
| 32 |
+
"type": "audio-muq-lora",
|
| 33 |
+
"pairwise": 0.7872,
|
| 34 |
+
"description": "A1-Max: MuQ + LoRA with ListMLE, CCC, mixup, hard negative mining",
|
| 35 |
+
"architecture": "MuQLoRAMaxModel (MuQ L9-12 avg -> attn pool -> encoder -> 6-dim regression)",
|
| 36 |
+
"best_config": "A1max_r32_L7-12_ls0.1",
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
# Number of folds for ensemble
|
| 40 |
+
N_FOLDS = 4
|
| 41 |
+
|
| 42 |
+
# MAESTRO calibration stats: per-dimension distribution over 24,321 professional segments.
|
| 43 |
+
# Computed by model/scripts/compute_maestro_calibration.py using A1-Max 4-fold ensemble.
|
| 44 |
+
MAESTRO_CALIBRATION = {
|
| 45 |
+
"dynamics": {
|
| 46 |
+
"mean": 0.560947, "std": 0.021063,
|
| 47 |
+
"p5": 0.526612, "p25": 0.546136, "p50": 0.560859, "p75": 0.575372, "p95": 0.59573,
|
| 48 |
+
},
|
| 49 |
+
"timing": {
|
| 50 |
+
"mean": 0.531883, "std": 0.028791,
|
| 51 |
+
"p5": 0.480467, "p25": 0.512976, "p50": 0.534302, "p75": 0.552652, "p95": 0.575376,
|
| 52 |
+
},
|
| 53 |
+
"pedaling": {
|
| 54 |
+
"mean": 0.590465, "std": 0.030438,
|
| 55 |
+
"p5": 0.534399, "p25": 0.572243, "p50": 0.593731, "p75": 0.611854, "p95": 0.635053,
|
| 56 |
+
},
|
| 57 |
+
"articulation": {
|
| 58 |
+
"mean": 0.553624, "std": 0.014287,
|
| 59 |
+
"p5": 0.53023, "p25": 0.543792, "p50": 0.553554, "p75": 0.563275, "p95": 0.577426,
|
| 60 |
+
},
|
| 61 |
+
"phrasing": {
|
| 62 |
+
"mean": 0.550866, "std": 0.013717,
|
| 63 |
+
"p5": 0.528466, "p25": 0.541541, "p50": 0.550801, "p75": 0.560116, "p95": 0.573567,
|
| 64 |
+
},
|
| 65 |
+
"interpretation": {
|
| 66 |
+
"mean": 0.564377, "std": 0.023457,
|
| 67 |
+
"p5": 0.522434, "p25": 0.549302, "p50": 0.566195, "p75": 0.580981, "p95": 0.599733,
|
| 68 |
+
},
|
| 69 |
+
}
|
handler.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HuggingFace Inference Endpoints handler for piano performance analysis.
|
| 2 |
+
|
| 3 |
+
A1-Max MuQ LoRA model using MuQ layers 9-12 with attention pooling.
|
| 4 |
+
Returns 6-dimension performance evaluation scores:
|
| 5 |
+
dynamics, timing, pedaling, articulation, phrasing, interpretation.
|
| 6 |
+
|
| 7 |
+
Compatible with HuggingFace Inference Endpoints custom handler pattern.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import base64
|
| 11 |
+
import time
|
| 12 |
+
import traceback
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Any, Dict, Union
|
| 15 |
+
|
| 16 |
+
import numpy as np
|
| 17 |
+
|
| 18 |
+
from constants import MODEL_INFO, PERCEPIANO_DIMENSIONS
|
| 19 |
+
from models.loader import get_model_cache
|
| 20 |
+
from models.inference import (
|
| 21 |
+
extract_muq_embeddings,
|
| 22 |
+
predict_with_ensemble,
|
| 23 |
+
)
|
| 24 |
+
from preprocessing.audio import (
|
| 25 |
+
AudioDownloadError,
|
| 26 |
+
AudioProcessingError,
|
| 27 |
+
download_and_preprocess_audio,
|
| 28 |
+
preprocess_audio_from_bytes,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class EndpointHandler:
    """HuggingFace Inference Endpoints handler for piano performance analysis."""

    def __init__(self, path: str = ""):
        """Initialize MuQ model and prediction heads.

        Called once when the endpoint container starts.

        Args:
            path: Path to the model repository (provided by HF Inference Endpoints).
                Contains the checkpoints/ directory with model weights.
        """
        print(f"Initializing A1-Max EndpointHandler with path: {path}")

        # Determine checkpoint directory.
        # HF Inference Endpoints mount the repo at the provided path;
        # fall back to /repository (HF default) or current dir for local testing.
        if path:
            model_path = Path(path)
        else:
            model_path = Path("/repository")
            if not model_path.exists():
                model_path = Path(".")

        checkpoint_dir = model_path / "checkpoints"
        if not checkpoint_dir.exists():
            # Try /app/checkpoints for backward compatibility
            checkpoint_dir = Path("/app/checkpoints")

        print(f"Using checkpoint directory: {checkpoint_dir}")

        # Initialize model cache (loads MuQ and the per-fold prediction heads)
        self._cache = get_model_cache()
        self._cache.initialize(device="cuda", checkpoint_dir=checkpoint_dir)

        print("A1-Max EndpointHandler initialization complete!")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process inference request.

        Args:
            data: Request payload. Supports two formats:

                HuggingFace format:
                {
                    "inputs": "<base64-audio>" or {"audio_url": "..."},
                    "parameters": {
                        "max_duration_seconds": 300
                    }
                }

                Legacy RunPod format (for backward compatibility):
                {
                    "input": {
                        "audio_url": "https://...",
                        "options": {...}
                    }
                }

        Returns:
            Prediction results:
            {
                "predictions": {"timing": 0.85, ...},
                "model_info": {"name": "A1-Max MuQ LoRA", "pairwise": 0.7872, ...},
                "audio_duration_seconds": 180.5,
                "processing_time_ms": 1234
            }

            Or error:
            {
                "error": {"code": "...", "message": "..."}
            }
        """
        start_time = time.time()

        try:
            # Parse input - support both HF and legacy RunPod formats
            inputs, parameters = self._parse_request(data)

            # Extract parameters
            max_duration = parameters.get("max_duration_seconds", 300)

            # Load and preprocess audio
            audio, duration = self._load_audio(inputs, max_duration)
            print(f"Audio loaded: {duration:.1f}s")

            # Verify models are loaded. Use an explicit `is None` check:
            # truthiness of a model object is not a reliable loaded/unloaded signal.
            if self._cache.muq_model is None:
                return {
                    "error": {
                        "code": "MODEL_NOT_LOADED",
                        "message": "MuQ model not initialized",
                    }
                }

            # Extract MuQ embeddings (averaged layers 9-12)
            print("Extracting MuQ embeddings (layers 9-12)...")
            embeddings = extract_muq_embeddings(audio, self._cache)
            print(f"MuQ embeddings shape: {embeddings.shape}")

            # Get ensemble predictions (4-fold A1-Max)
            print("Running A1-Max ensemble inference...")
            predictions = predict_with_ensemble(embeddings, self._cache)

            # Build response
            processing_time_ms = int((time.time() - start_time) * 1000)

            result = {
                "predictions": self._predictions_to_dict(predictions),
                "model_info": {
                    "name": MODEL_INFO["name"],
                    "type": MODEL_INFO["type"],
                    "pairwise": MODEL_INFO["pairwise"],
                    "architecture": MODEL_INFO["architecture"],
                    "ensemble_folds": len(self._cache.muq_heads),
                },
                "audio_duration_seconds": duration,
                "processing_time_ms": processing_time_ms,
            }

            print(f"Inference complete in {processing_time_ms}ms")
            return result

        except AudioDownloadError as e:
            return {
                "error": {
                    "code": "AUDIO_DOWNLOAD_FAILED",
                    "message": str(e),
                }
            }

        except AudioProcessingError as e:
            return {
                "error": {
                    "code": "AUDIO_PROCESSING_FAILED",
                    "message": str(e),
                }
            }

        except Exception as e:
            return {
                "error": {
                    "code": "INFERENCE_ERROR",
                    "message": str(e),
                    "traceback": traceback.format_exc(),
                }
            }

    def _parse_request(self, data: Dict[str, Any]) -> tuple:
        """Parse request data supporting both HF and legacy formats.

        Returns:
            Tuple of (inputs, parameters)
        """
        # HF format: {"inputs": ..., "parameters": ...}
        if "inputs" in data:
            return data["inputs"], data.get("parameters", {})

        # Legacy RunPod format: {"input": {"audio_url": ..., "options": ...}}
        if "input" in data:
            job_input = data["input"]
            inputs = {
                "audio_url": job_input.get("audio_url"),
                "performance_id": job_input.get("performance_id", "unknown"),
            }
            # Copy so we never mutate the caller's options dict in place.
            parameters = dict(job_input.get("options") or {})
            parameters["performance_id"] = inputs["performance_id"]
            return inputs, parameters

        # Fallback: treat entire data as inputs
        return data, {}

    def _load_audio(
        self, inputs: Union[str, bytes, Dict[str, Any]], max_duration: int
    ) -> tuple:
        """Load audio from various input formats.

        Args:
            inputs: One of:
                - str: Base64-encoded audio bytes, or an http(s) URL
                - bytes: Raw audio bytes
                - dict: {"audio_url": "..."} for URL-based loading
            max_duration: Maximum audio duration in seconds to keep.

        Returns:
            Tuple of (audio_array, duration_seconds)

        Raises:
            AudioProcessingError: If the input cannot be interpreted as audio.
            AudioDownloadError: If a URL download fails.
        """
        if isinstance(inputs, str):
            # Route URL strings first so a genuine download/processing failure
            # surfaces its real error message instead of being masked by the
            # base64-decode fallback (the previous except-all did exactly that).
            if inputs.startswith("http"):
                return download_and_preprocess_audio(inputs, max_duration=max_duration)
            try:
                # binascii.Error (raised on malformed base64) is a ValueError subclass.
                audio_bytes = base64.b64decode(inputs)
            except ValueError as e:
                raise AudioProcessingError("Invalid input string: not base64 or URL") from e
            return preprocess_audio_from_bytes(audio_bytes, max_duration=max_duration)

        elif isinstance(inputs, bytes):
            # Raw bytes
            return preprocess_audio_from_bytes(inputs, max_duration=max_duration)

        elif isinstance(inputs, dict):
            # URL-based input
            audio_url = inputs.get("audio_url")
            if not audio_url:
                raise AudioProcessingError("No audio_url provided in inputs")
            return download_and_preprocess_audio(audio_url, max_duration=max_duration)

        else:
            raise AudioProcessingError(f"Unsupported input type: {type(inputs)}")

    def _predictions_to_dict(self, preds: np.ndarray) -> Dict[str, float]:
        """Map the 6-element prediction array onto PERCEPIANO_DIMENSIONS by index."""
        return {dim: float(preds[i]) for i, dim in enumerate(PERCEPIANO_DIMENSIONS)}
|
models/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model loading and inference for A1-Max MuQ LoRA."""
|
| 2 |
+
|
| 3 |
+
from models.loader import (
|
| 4 |
+
A1MaxInferenceHead,
|
| 5 |
+
ModelCache,
|
| 6 |
+
get_model_cache,
|
| 7 |
+
)
|
| 8 |
+
from models.inference import (
|
| 9 |
+
extract_muq_embeddings,
|
| 10 |
+
predict_with_ensemble,
|
| 11 |
+
)
|
| 12 |
+
from models.calibration import (
|
| 13 |
+
calibrate_predictions,
|
| 14 |
+
predictions_to_calibrated_dict,
|
| 15 |
+
get_calibration_context,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
__all__ = [
|
| 19 |
+
"A1MaxInferenceHead",
|
| 20 |
+
"ModelCache",
|
| 21 |
+
"get_model_cache",
|
| 22 |
+
"extract_muq_embeddings",
|
| 23 |
+
"predict_with_ensemble",
|
| 24 |
+
"calibrate_predictions",
|
| 25 |
+
"predictions_to_calibrated_dict",
|
| 26 |
+
"get_calibration_context",
|
| 27 |
+
]
|
models/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (688 Bytes). View file
|
|
|
models/__pycache__/inference.cpython-312.pyc
ADDED
|
Binary file (4.87 kB). View file
|
|
|
models/__pycache__/loader.cpython-312.pyc
ADDED
|
Binary file (8.84 kB). View file
|
|
|
models/calibration.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MAESTRO-based calibration for performance predictions.
|
| 2 |
+
|
| 3 |
+
Normalizes raw model predictions relative to professional MAESTRO recordings,
|
| 4 |
+
making scores more interpretable for end users.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from typing import Dict
|
| 9 |
+
|
| 10 |
+
from constants import MAESTRO_CALIBRATION, PERCEPIANO_DIMENSIONS
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def calibrate_predictions(
    raw_predictions: np.ndarray,
    method: str = "percentile",
) -> np.ndarray:
    """Calibrate raw predictions against MAESTRO professional benchmarks.

    Args:
        raw_predictions: Raw model outputs [6] in range ~[0, 1], ordered
            as PERCEPIANO_DIMENSIONS.
        method: Calibration method:
            - "percentile": scale so the MAESTRO 5th percentile maps to 0 and
              the 95th percentile maps to 1; values may fall outside [0, 1]
              for exceptional or below-average performances.
            - "zscore": express each score as standard deviations from the
              MAESTRO mean.
            Any other value passes raw scores through unchanged.

    Returns:
        Calibrated predictions [6]. With "percentile", ~0.5 corresponds to
        an average MAESTRO professional performance.
    """
    out = np.zeros_like(raw_predictions)

    for idx, dimension in enumerate(PERCEPIANO_DIMENSIONS):
        score = raw_predictions[idx]
        stats = MAESTRO_CALIBRATION.get(dimension)

        # Missing stats (shouldn't happen with well-formed calibration data):
        # pass the raw score through unchanged.
        if stats is None:
            out[idx] = score
            continue

        if method == "percentile":
            # 5th percentile -> 0, 95th percentile -> 1.
            span = stats["p95"] - stats["p5"]
            out[idx] = (score - stats["p5"]) / span if span > 0 else 0.5
        elif method == "zscore":
            # Standard deviations from the professional mean.
            std = stats["std"]
            out[idx] = (score - stats["mean"]) / std if std > 0 else 0.0
        else:
            out[idx] = score

    return out
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def predictions_to_calibrated_dict(
    raw_predictions: np.ndarray,
) -> Dict[str, Dict[str, float]]:
    """Build a per-dimension dict containing raw and calibrated scores.

    Args:
        raw_predictions: Raw model outputs [6], ordered as PERCEPIANO_DIMENSIONS.

    Returns:
        Dict with structure:
        {
            "timing": {"raw": 0.65, "calibrated": 0.42, "percentile_rank": 42},
            ...
        }
    """
    calibrated = calibrate_predictions(raw_predictions, method="percentile")

    def _entry(raw: float, cal: float) -> Dict[str, float]:
        # Percentile rank is clamped to [0, 100] for display; the calibrated
        # score itself is clamped to [0, 1].
        return {
            "raw": round(raw, 4),
            "calibrated": round(min(1.0, max(0.0, cal)), 4),
            "percentile_rank": int(min(100, max(0, cal * 100))),
        }

    return {
        dim: _entry(float(raw_predictions[i]), float(calibrated[i]))
        for i, dim in enumerate(PERCEPIANO_DIMENSIONS)
    }
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def get_calibration_context() -> str:
    """Return a human-readable explanation of the calibrated score scale.

    Intended to be injected as context for an LLM (or human reviewer)
    interpreting the calibrated scores.
    """
    return """Score Interpretation (calibrated relative to 500 professional MAESTRO recordings):
- 0.0 = Performance at the 5th percentile of professionals (lower end)
- 0.5 = Performance at the 50th percentile of professionals (average professional level)
- 1.0 = Performance at the 95th percentile of professionals (exceptional)
- Scores can exceed [0, 1] for truly exceptional or below-average performances

Note: These scores compare against competition-level professional pianists.
A calibrated score of 0.5 represents professional-level competency."""
|
models/inference.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""A1-Max MuQ inference - MuQ embedding extraction and prediction."""
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
from constants import MODEL_CONFIG
|
| 7 |
+
from models.loader import ModelCache
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@torch.no_grad()
def extract_muq_embeddings(
    audio: np.ndarray,
    cache: ModelCache,
    layer_start: int = None,
    layer_end: int = None,
    max_frames: int = None,
) -> torch.Tensor:
    """Extract MuQ embeddings from an audio waveform.

    Averages hidden states from layers 9-12 by default (best performing range).

    Args:
        audio: Audio waveform at 24kHz
        cache: Model cache with loaded MuQ model
        layer_start: Start layer (inclusive), default MODEL_CONFIG["muq_layer_start"]
        layer_end: End layer (exclusive), default MODEL_CONFIG["muq_layer_end"]
        max_frames: Maximum frames to keep, default MODEL_CONFIG["max_frames"]

    Returns:
        Embeddings tensor [T, 1024] where T is number of frames
    """
    # Use explicit `is None` checks: the previous `x or default` idiom would
    # silently replace a caller-supplied 0 (e.g. layer 0) with the default.
    if layer_start is None:
        layer_start = MODEL_CONFIG["muq_layer_start"]
    if layer_end is None:
        layer_end = MODEL_CONFIG["muq_layer_end"]
    if max_frames is None:
        max_frames = MODEL_CONFIG["max_frames"]

    # MuQ expects a [B, samples] tensor on the model's device.
    wavs = torch.tensor(audio).unsqueeze(0).to(cache.device)

    # Get hidden states from all layers
    outputs = cache.muq_model(wavs, output_hidden_states=True)

    # hidden_states is a tuple of [B, T, D] tensors, one per layer.
    # Average the selected layer range, then drop the batch dimension.
    hidden_states = outputs.hidden_states[layer_start:layer_end]
    embeddings = torch.stack(hidden_states, dim=0).mean(dim=0).squeeze(0)

    # Truncate long clips to bound downstream memory/compute.
    if embeddings.shape[0] > max_frames:
        embeddings = embeddings[:max_frames]

    return embeddings
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@torch.no_grad()
def predict_with_ensemble(
    embeddings: torch.Tensor,
    cache: ModelCache,
) -> np.ndarray:
    """Average predictions over the 4-fold ensemble of A1-Max heads.

    Each head applies attention pooling over the frame-level embeddings,
    then an encoder + regression head producing 6-dim scores.

    Args:
        embeddings: Frame embeddings [T, D] from MuQ
        cache: Model cache with loaded heads

    Returns:
        Averaged predictions [6] across all folds

    Raises:
        RuntimeError: If no prediction heads are loaded in the cache.
    """
    if not cache.muq_heads:
        raise RuntimeError("No A1-Max heads loaded in cache")

    # One forward pass per fold head; average across folds.
    fold_outputs = [head(embeddings).cpu().numpy() for head in cache.muq_heads]
    return np.mean(fold_outputs, axis=0)
|
models/loader.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model loading and caching for A1-Max MuQ LoRA inference."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
|
| 9 |
+
from constants import MODEL_CONFIG, N_FOLDS
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class A1MaxInferenceHead(nn.Module):
    """Inference-only score-prediction head mirroring MuQLoRAMaxModel.

    Reproduces exactly the modules exercised by the score-prediction path:
      * attention pooling:  [B, T, D] -> [B, D]
      * shared encoder:     [B, D] -> [B, hidden_dim] (2-layer MLP)
      * regression head:    [B, hidden_dim] -> [B, num_labels], sigmoid output

    Training-only components (ranking / contrastive / comparator) are
    intentionally omitted. Submodule names and Sequential layouts match the
    training model so Lightning checkpoints load with strict=True.
    """

    def __init__(
        self,
        input_dim: int = 1024,
        hidden_dim: int = 512,
        num_labels: int = 6,
        dropout: float = 0.2,
    ):
        super().__init__()
        self.num_labels = num_labels

        # Attention pooling (key-compatible with MuQLoRAModel.attn).
        self.attn = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.Tanh(),
            nn.Linear(256, 1),
        )

        # Shared encoder (key-compatible with MuQLoRAModel.encoder).
        # Two identical Linear->LayerNorm->GELU->Dropout stages; only the
        # first stage's input width differs.
        encoder_layers = []
        for stage_in in (input_dim, hidden_dim):
            encoder_layers += [
                nn.Linear(stage_in, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
            ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Regression head (key-compatible with MuQLoRAModel.regression_head).
        self.regression_head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, num_labels),
            nn.Sigmoid(),
        )

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Predict quality scores from frame embeddings.

        Args:
            embeddings: Frame embeddings [B, T, D], or [T, D] for a single
                unbatched clip.

        Returns:
            Scores in [0, 1]: [B, num_labels], or [num_labels] when the
            input was unbatched.
        """
        was_unbatched = embeddings.dim() == 2
        if was_unbatched:
            embeddings = embeddings.unsqueeze(0)

        # Attention-weighted pooling over the time axis.
        attn_logits = self.attn(embeddings).squeeze(-1)             # [B, T]
        weights = torch.softmax(attn_logits, dim=-1).unsqueeze(-1)  # [B, T, 1]
        pooled = (embeddings * weights).sum(1)                      # [B, D]

        encoded = self.encoder(pooled)                              # [B, hidden_dim]
        preds = self.regression_head(encoded)                       # [B, num_labels]

        return preds.squeeze(0) if was_unbatched else preds
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class ModelCache:
    """Singleton cache for loaded models.

    Keeps the (large) MuQ backbone and the per-fold A1-Max heads resident
    in the process so they are loaded once and reused across requests.
    """

    # Class-level singleton slot; __new__ always hands back this instance.
    _instance: Optional["ModelCache"] = None

    def __new__(cls) -> "ModelCache":
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Flag lets __init__ run its body only on first construction.
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every ModelCache() call even though __new__
        # returns the cached instance, so guard against re-initialization.
        if self._initialized:
            return
        self.muq_model = None  # MuQ backbone; populated by initialize()
        self.muq_heads: List[A1MaxInferenceHead] = []  # one head per loaded fold
        self.device = None  # torch.device chosen in initialize()
        self._initialized = True

    def initialize(self, device: str = "cuda", checkpoint_dir: Optional[Path] = None):
        """Load MuQ model and A1-Max prediction heads. Called once on container start."""
        # Idempotent: a second call is a no-op once the backbone exists.
        if self.muq_model is not None:
            return

        # Fall back to CPU when CUDA is unavailable, regardless of `device`.
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        print(f"Initializing A1-Max models on {self.device}...")

        # Load MuQ from HuggingFace
        print("Loading MuQ-large-msd-iter...")
        try:
            # Imported lazily so this module can be imported without muq installed.
            from muq import MuQ
            self.muq_model = MuQ.from_pretrained("OpenMuQ/MuQ-large-msd-iter")
            self.muq_model = self.muq_model.to(self.device)
            self.muq_model.eval()
            print("MuQ loaded successfully")
        except ImportError as e:
            raise ImportError(
                "MuQ library not found. Install with: pip install muq"
            ) from e

        # Load A1-Max prediction heads (4 folds)
        print("Loading A1-Max prediction heads...")
        # /repository is the HF Inference Endpoints mount; /app is the Docker path.
        checkpoint_dir = checkpoint_dir or Path("/repository/checkpoints")
        if not checkpoint_dir.exists():
            checkpoint_dir = Path("/app/checkpoints")

        for fold in range(N_FOLDS):
            ckpt_path = checkpoint_dir / f"fold_{fold}" / "best.ckpt"
            # Also try the epoch-based naming from sweep
            if not ckpt_path.exists():
                fold_dir = checkpoint_dir / f"fold_{fold}"
                if fold_dir.exists():
                    # Picks the lexicographically first .ckpt; assumes one
                    # (or consistently named) checkpoint per fold — TODO confirm.
                    ckpts = sorted(fold_dir.glob("*.ckpt"))
                    if ckpts:
                        ckpt_path = ckpts[0]
            if ckpt_path.exists():
                head = self._load_a1max_head(ckpt_path)
                self.muq_heads.append(head)
                print(f" Loaded fold {fold} from {ckpt_path}")
            else:
                # Missing folds are skipped; the ensemble degrades gracefully.
                print(f" Warning: No checkpoint found for fold {fold}")

        print(f"Initialization complete. {len(self.muq_heads)} heads loaded.")

    def _load_a1max_head(self, ckpt_path: Path) -> A1MaxInferenceHead:
        """Load an A1MaxInferenceHead from PyTorch Lightning checkpoint."""
        # NOTE(review): weights_only=False unpickles arbitrary Python objects;
        # only load checkpoints from trusted sources.
        checkpoint = torch.load(ckpt_path, map_location=self.device, weights_only=False)

        # Hyperparameters saved by Lightning; missing keys fall back to MODEL_CONFIG.
        hparams = checkpoint.get("hyper_parameters", {})

        head = A1MaxInferenceHead(
            input_dim=hparams.get("input_dim", MODEL_CONFIG["input_dim"]),
            hidden_dim=hparams.get("hidden_dim", MODEL_CONFIG["hidden_dim"]),
            num_labels=hparams.get("num_labels", MODEL_CONFIG["num_labels"]),
            dropout=hparams.get("dropout", MODEL_CONFIG["dropout"]),
        )

        # Load state dict from Lightning checkpoint
        state_dict = checkpoint["state_dict"]

        # Map Lightning keys to inference head keys
        # Lightning saves as: attn.0.weight, encoder.0.weight, regression_head.0.weight, etc.
        # Training-only submodules (ranking/contrastive/...) are dropped here.
        head_state = {}
        for key, value in state_dict.items():
            if key.startswith("attn.") or key.startswith("encoder.") or key.startswith("regression_head."):
                head_state[key] = value

        head.load_state_dict(head_state, strict=True)

        head.to(self.device)
        head.eval()
        return head
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# Module-level handle to the process-wide ModelCache instance.
_cache: Optional[ModelCache] = None


def get_model_cache() -> ModelCache:
    """Return the global ModelCache, creating it on first use."""
    global _cache
    if _cache is not None:
        return _cache
    _cache = ModelCache()
    return _cache
|
preprocessing/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Audio preprocessing modules."""
|
| 2 |
+
|
| 3 |
+
from preprocessing.audio import (
|
| 4 |
+
download_and_preprocess_audio,
|
| 5 |
+
preprocess_audio_from_bytes,
|
| 6 |
+
AudioDownloadError,
|
| 7 |
+
AudioProcessingError,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"download_and_preprocess_audio",
|
| 12 |
+
"preprocess_audio_from_bytes",
|
| 13 |
+
"AudioDownloadError",
|
| 14 |
+
"AudioProcessingError",
|
| 15 |
+
]
|
preprocessing/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (541 Bytes). View file
|
|
|
preprocessing/__pycache__/audio.cpython-312.pyc
ADDED
|
Binary file (5.8 kB). View file
|
|
|
preprocessing/audio.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Audio download and preprocessing for D9c inference."""
|
| 2 |
+
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Tuple
|
| 6 |
+
|
| 7 |
+
import librosa
|
| 8 |
+
import numpy as np
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
# Default sample rate for MERT/MuQ (hardcoded to avoid import issues)
|
| 12 |
+
TARGET_SR = 24000
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class AudioDownloadError(Exception):
|
| 16 |
+
"""Raised when audio download fails."""
|
| 17 |
+
pass
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AudioProcessingError(Exception):
|
| 21 |
+
"""Raised when audio processing fails."""
|
| 22 |
+
pass
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def download_and_preprocess_audio(
    audio_url: str,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
    timeout: int = 60,
) -> Tuple[np.ndarray, float]:
    """Download audio from URL and preprocess for MERT/MuQ.

    Args:
        audio_url: URL to download audio from
        target_sr: Target sample rate (24kHz for MERT/MuQ)
        max_duration: Maximum audio duration in seconds
        timeout: Download timeout in seconds

    Returns:
        Tuple of (audio_array, duration_seconds); audio is mono at target_sr

    Raises:
        AudioDownloadError: If download fails (including mid-stream)
        AudioProcessingError: If audio processing fails
    """
    try:
        response = requests.get(audio_url, timeout=timeout, stream=True)
        response.raise_for_status()
    except requests.RequestException as e:
        raise AudioDownloadError(f"Failed to download audio: {e}") from e

    # Determine file extension from content-type or URL so the decoder
    # backend picks the right format.
    content_type = response.headers.get("content-type", "")
    if "mpeg" in content_type or audio_url.endswith(".mp3"):
        suffix = ".mp3"
    elif "wav" in content_type or audio_url.endswith(".wav"):
        suffix = ".wav"
    elif "flac" in content_type or audio_url.endswith(".flac"):
        suffix = ".flac"
    else:
        suffix = ".mp3"

    # Bind temp_path before any write so the finally-block can always clean
    # up; previously a mid-stream failure left temp_path unbound and leaked
    # the temp file.
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
            temp_path = Path(f.name)
            try:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            except requests.RequestException as e:
                # Connection dropped mid-stream; surface as a download error
                # (matches the contract documented above).
                raise AudioDownloadError(f"Failed to download audio: {e}") from e

        # Resample to target_sr and downmix to mono.
        audio, sr = librosa.load(temp_path, sr=target_sr, mono=True)
        duration = len(audio) / sr

        if duration > max_duration:
            raise AudioProcessingError(
                f"Audio too long: {duration:.1f}s > {max_duration}s limit"
            )

        if duration < 1.0:
            raise AudioProcessingError(
                f"Audio too short: {duration:.1f}s < 1.0s minimum"
            )

        return audio, duration

    except (AudioDownloadError, AudioProcessingError):
        raise
    except Exception as e:
        raise AudioProcessingError(f"Failed to process audio: {e}") from e

    finally:
        # Always remove the temp file, even when download or decode failed.
        if temp_path is not None:
            temp_path.unlink(missing_ok=True)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def load_audio_from_file(
    audio_path: Path,
    target_sr: int = TARGET_SR,
) -> Tuple[np.ndarray, float]:
    """Load audio from local file.

    Resamples to target_sr and downmixes to mono; returns the waveform and
    its duration in seconds. No duration limits are enforced here.
    """
    waveform, rate = librosa.load(audio_path, sr=target_sr, mono=True)
    return waveform, len(waveform) / rate
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def preprocess_audio_from_bytes(
    audio_bytes: bytes,
    target_sr: int = TARGET_SR,
    max_duration: int = 300,
) -> Tuple[np.ndarray, float]:
    """Preprocess audio from raw bytes (e.g., base64 decoded).

    Args:
        audio_bytes: Encoded audio (mp3/wav/flac/...) as raw bytes.
        target_sr: Target sample rate (24kHz for MERT/MuQ).
        max_duration: Maximum audio duration in seconds.

    Returns:
        Tuple of (audio_array, duration_seconds); audio is mono at target_sr.

    Raises:
        AudioProcessingError: If the payload is empty, decoding fails, or
            the duration is outside [1.0, max_duration] seconds.
    """
    import io

    # Fail fast with a clear message instead of a cryptic decoder error.
    if not audio_bytes:
        raise AudioProcessingError("Empty audio payload")

    try:
        # Resample to target_sr and downmix to mono.
        audio, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
        duration = len(audio) / sr

        if duration > max_duration:
            raise AudioProcessingError(
                f"Audio too long: {duration:.1f}s > {max_duration}s limit"
            )

        if duration < 1.0:
            raise AudioProcessingError(
                f"Audio too short: {duration:.1f}s < 1.0s minimum"
            )

        return audio, duration

    except AudioProcessingError:
        raise
    except Exception as e:
        # Chain the original decoder error for debuggability.
        raise AudioProcessingError(f"Failed to process audio bytes: {e}") from e
|
requirements.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# A1-Max MuQ LoRA - HuggingFace Inference Endpoints dependencies
|
| 2 |
+
# This file is read by HF Endpoints to install Python packages
|
| 3 |
+
|
| 4 |
+
# PyTorch and ML
|
| 5 |
+
torch>=2.0.0
|
| 6 |
+
transformers>=4.30.0
|
| 7 |
+
pytorch-lightning>=2.0.0
|
| 8 |
+
|
| 9 |
+
# Audio embedding models
|
| 10 |
+
muq # MuQ - Music Understanding Quantized from ByteDance/OpenMuQ
|
| 11 |
+
|
| 12 |
+
# Audio processing
|
| 13 |
+
librosa>=0.10.0
|
| 14 |
+
soundfile>=0.12.0
|
| 15 |
+
|
| 16 |
+
# Utilities
|
| 17 |
+
numpy>=1.24.0
|
| 18 |
+
scipy>=1.10.0
|
| 19 |
+
requests>=2.28.0
|
sync_checkpoints.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Sync A1-Max MuQ LoRA checkpoints from Google Drive
# Run this before building the Docker image or uploading to HuggingFace
#
# Requires rclone with a configured "gdrive" remote.

# -e: abort on error; -u: unset variables are errors; pipefail: a failing
# command anywhere in a pipeline fails the pipeline.
set -euo pipefail

CHECKPOINT_DIR="./checkpoints"
GDRIVE_PATH="gdrive:crescendai_data/checkpoints/a1_max_sweep/A1max_r32_L7-12_ls0.1"

echo "A1-Max MuQ LoRA Checkpoint Sync"
echo "================================"
echo ""

echo "Creating checkpoint directories..."
for fold in 0 1 2 3; do
    mkdir -p "$CHECKPOINT_DIR/fold_$fold"
done

echo ""
echo "Syncing A1-Max checkpoints (4-fold ensemble, 80.8% pairwise)..."
echo "Source: $GDRIVE_PATH"
echo ""

# Sync each fold's best checkpoint
for fold in 0 1 2 3; do
    echo "Syncing fold_$fold..."
    rclone copyto "$GDRIVE_PATH/fold_${fold}/best.ckpt" "$CHECKPOINT_DIR/fold_$fold/best.ckpt" --progress
done

echo ""
echo "Checkpoint sync complete!"
echo ""
echo "Directory structure:"
ls -la "$CHECKPOINT_DIR"
echo ""

for fold in 0 1 2 3; do
    echo "fold_$fold:"
    ls -la "$CHECKPOINT_DIR/fold_$fold"
done

echo ""
echo "Expected HuggingFace repository structure:"
echo " checkpoints/"
echo " fold_0/best.ckpt"
echo " fold_1/best.ckpt"
echo " fold_2/best.ckpt"
echo " fold_3/best.ckpt"
echo ""
echo "Model: A1-Max MuQ LoRA r32 L7-12 (6-dim, 80.8% pairwise, R2=0.50)"
|