| from __future__ import annotations |
|
|
| import numpy as np |
| import torch |
| import librosa |
| from transformers import ClapModel, ClapProcessor |
|
|
|
|
class AudioEmbedder:
    """CLAP-based audio embedder.

    Optimized for environmental soundscape semantics.

    Wraps a Hugging Face ``ClapModel`` / ``ClapProcessor`` pair and returns
    audio-file and text embeddings as ``float32`` NumPy arrays.
    """

    def __init__(
        self,
        model_name: str = "laion/clap-htsat-unfused",
        device: str = "cpu",
        target_sr: int = 48000,
    ):
        """Load the processor and model, move the model to *device*, set eval mode.

        Args:
            model_name: Checkpoint identifier handed to ``from_pretrained``.
            device: Device the model and every processed input are moved to.
            target_sr: Sampling rate used when loading audio and reported to
                the processor.
        """
        self.device = device
        self.target_sr = target_sr
        self.processor = ClapProcessor.from_pretrained(model_name)
        self.model = ClapModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def _project_if_needed(self, pooled, projection: str, target_dim: int):
        """Apply the model's *projection* head unless *pooled* already has *target_dim* features.

        If the model has no attribute named *projection*, *pooled* is returned
        unchanged.
        """
        if pooled.shape[-1] == target_dim:
            return pooled
        head = getattr(self.model, projection, None)
        return pooled if head is None else head(pooled)

    def _extract_features(self, output, projection: str) -> torch.Tensor:
        """Extract 1-D projected embedding (512-d) from model output.

        Handles both raw tensors and BaseModelOutputWithPooling objects
        across different transformers versions.

        Args:
            output: Either a tensor or an object exposing ``pooler_output``.
            projection: Name of the projection attribute on ``self.model``
                (e.g. ``"audio_projection"``) used when the features are not
                yet of size ``config.projection_dim`` (512 if the config does
                not define it).

        Returns:
            The embedding of the first batch item. 3-D tensors are reduced by
            taking position 0 along the second axis; 2-D tensors by taking
            row 0; tensors of any other rank are returned unchanged.

        Raises:
            AttributeError: If *output* is not a tensor and carries no usable
                ``pooler_output``.
        """
        target_dim = getattr(self.model.config, "projection_dim", 512)

        if not isinstance(output, torch.Tensor):
            pooled = getattr(output, "pooler_output", None)
            if pooled is None:
                # Same exception type the bare attribute access used to raise,
                # but with a message that points at the actual problem.
                raise AttributeError(
                    f"{type(output).__name__} has no usable 'pooler_output'; "
                    "cannot extract an embedding"
                )
            output = self._project_if_needed(pooled, projection, target_dim)

        if output.dim() == 3:
            # Sequence-shaped features: keep the first position, then project.
            output = self._project_if_needed(output[:, 0, :], projection, target_dim)

        if output.dim() == 2:
            # Batched features: callers embed one item at a time, keep row 0.
            output = output[0]
        return output

    def _encode_audio(self, waveform):
        """Run the processor on *waveform*, tolerating both audio keyword spellings.

        ``audio=`` is tried first and ``audios=`` second. Only the processor
        call sits inside the ``try`` so that unrelated failures (e.g. moving
        tensors to a device) cannot trigger the retry.
        """
        common = {"sampling_rate": self.target_sr, "return_tensors": "pt"}
        try:
            return self.processor(audio=waveform, **common)
        except (TypeError, ValueError):
            # NOTE(review): processors that only know ``audios=`` either reject
            # ``audio=`` with TypeError or swallow it into **kwargs and then
            # raise ValueError because no input was seen — confirm against the
            # pinned transformers version. A genuine input error will simply be
            # raised again by the retry, chained to the first one.
            return self.processor(audios=waveform, **common)

    @torch.no_grad()
    def embed(self, audio_path: str) -> np.ndarray:
        """Embed the audio file at *audio_path* with CLAP's audio encoder.

        The file is loaded through ``librosa.load`` with ``sr=self.target_sr``
        and ``mono=True``, processed, and passed to ``get_audio_features``.

        Returns:
            The extracted embedding as a ``float32`` NumPy array.
        """
        waveform, _ = librosa.load(audio_path, sr=self.target_sr, mono=True)
        inputs = self._encode_audio(waveform).to(self.device)
        outputs = self.model.get_audio_features(**inputs)
        emb = self._extract_features(outputs, "audio_projection")
        return emb.cpu().numpy().astype("float32")

    @torch.no_grad()
    def embed_text(self, text: str) -> np.ndarray:
        """Embed text using CLAP's text encoder (for text-audio comparison).

        Returns:
            The extracted embedding as a ``float32`` NumPy array.
        """
        inputs = self.processor(
            text=[text],
            return_tensors="pt",
            padding=True,
        ).to(self.device)
        feats = self.model.get_text_features(**inputs)
        feats = self._extract_features(feats, "text_projection")
        return feats.cpu().numpy().astype("float32")
|
|