gijs committed · verified
Commit f5fcbcd · 1 Parent(s): e41db6c

Initial release: VoiceCLAP-Small (BUD-E-Whisper + MiniLM, dual-tower CLAP, 1 epoch on voiceclap_10)

README.md ADDED
@@ -0,0 +1,86 @@
---
license: cc-by-4.0
language:
- en
library_name: transformers
pipeline_tag: zero-shot-audio-classification
tags:
- audio
- speech
- emotion
- clap
- contrastive
- voice
---

# VoiceCLAP-Small

Voice-text contrastive (CLAP-style) embedding model trained on dense vocal-style
captions for the [VoiceNet](https://huggingface.co/VoiceNet) suite.

VoiceCLAP-Small is the smaller of the two voice-text contrastive anchors
released with VoiceNet. It is a **dual-tower** model: a
[BUD-E-Whisper_V1.1](https://huggingface.co/laion/BUD-E-Whisper_V1.1) audio
encoder paired with
[`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
on the text side, joined by an MLP projection on each side and trained with
the SigLIP sigmoid contrastive loss.

| | |
| --- | --- |
| Architecture | dual-tower CLAP (BUD-E-Whisper-Small + MiniLM-L6-v2) |
| Audio encoder | Whisper-style: 12 layers × 768 dim × 12 heads, 80-mel input @ 16 kHz |
| Text encoder | BERT/MiniLM, 6 layers × 384 dim, mean-pooled |
| Joint embedding | 768-d, L2-normalised |
| Loss | SigLIP (sigmoid contrastive) |
| Total parameters | ~110 M |
| Epochs | 1 |

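For reference, the SigLIP objective named above scores every audio-caption pair in a batch with a per-pair sigmoid rather than a batch-wise softmax. The sketch below is illustrative only, not the released training code; the batch construction and the assumption that `logit_scale` is stored in log-space are ours.

```python
import torch
import torch.nn.functional as F

def siglip_loss(audio_emb: torch.Tensor, text_emb: torch.Tensor,
                logit_scale: torch.Tensor, logit_bias: torch.Tensor) -> torch.Tensor:
    """Sigmoid contrastive loss over all audio-text pairs in a batch.

    audio_emb, text_emb: (B, 768), already L2-normalised.
    logit_scale, logit_bias: the scalar parameters stored by the model.
    """
    logits = logit_scale.exp() * audio_emb @ text_emb.T + logit_bias   # (B, B)
    labels = 2 * torch.eye(logits.size(0), device=logits.device) - 1   # +1 matched, -1 mismatched
    return -F.logsigmoid(labels * logits).mean()
```
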
## Training data

Trained for **1 epoch** on the open `voiceclap_10` mixture used in the
VoiceNet paper:

- `emolia-balanced-5M-subset` (annotated subset of [Emilia](https://huggingface.co/datasets/amphion/Emilia-Dataset))
- `laions_got_talent_clean_with_captions`
- `majestrino-data`
- `synthetic_vocal_bursts`
- `improved_synthetic_vocal_bursts`
- `ears`

All clips are captioned with `MOSS-Audio-8B-Thinking`-derived dense vocal-style
captions covering emotions, talking-style attributes, and demographics.

## Standalone load example

Only `transformers` and `torchaudio` are required (both on PyPI).

```python
import torch, torchaudio
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("VoiceNet/voiceclap-small", trust_remote_code=True).eval()
tok = AutoTokenizer.from_pretrained("VoiceNet/voiceclap-small")

# Audio: any-length 16 kHz waveform, mono
wav, sr = torchaudio.load("clip.wav")
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)
wav = wav.mean(0)  # (T,)
audio_emb = model.encode_waveform(wav)  # (1, 768), L2-normed

# Text: short caption(s)
enc = tok(["a calm and steady voice"], padding=True, return_tensors="pt")
text_emb = model.encode_text(enc.input_ids, enc.attention_mask)

# Cosine similarity (embeddings already L2-normalised)
print((audio_emb @ text_emb.T).item())
```

`encode_waveform` accepts clips up to 30 s; longer clips should be chunked or
truncated before being passed in. Embeddings are 768-d and unit-norm, so
`a @ t.T` is the cosine similarity used in zero-shot retrieval.

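Building on the snippet above, here is a minimal zero-shot classification sketch over a small label set. The prompts, the 30 s chunk-and-average handling of long clips, and the plain softmax over cosine similarities are illustrative choices, not a released evaluation recipe.

```python
labels = ["an angry voice", "a calm and steady voice", "a cheerful, excited voice"]
enc = tok(labels, padding=True, return_tensors="pt")

with torch.no_grad():
    label_emb = model.encode_text(enc.input_ids, enc.attention_mask)    # (3, 768)

    # Split long audio into 30 s chunks, embed each, average, re-normalise.
    chunks = wav.split(30 * 16000)
    chunk_embs = torch.cat([model.encode_waveform(c) for c in chunks])  # (n_chunks, 768)
    audio_emb = torch.nn.functional.normalize(chunk_embs.mean(0, keepdim=True), dim=-1)

probs = (audio_emb @ label_emb.T).softmax(dim=-1)  # (1, 3)
for label, p in zip(labels, probs[0].tolist()):
    print(f"{label}: {p:.3f}")
```
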
## Citation

If you use this model, please cite the VoiceNet paper.
config.json ADDED
@@ -0,0 +1,27 @@
{
  "architectures": [
    "VoiceCLAPSmall"
  ],
  "dtype": "float32",
  "embed_dim": 768,
  "model_type": "voiceclap-small",
  "n_ctx": 1500,
  "n_head": 12,
  "n_layer": 12,
  "n_mels": 80,
  "n_state": 768,
  "text_hidden_dim": 384,
  "text_intermediate_size": 1536,
  "text_layer_norm_eps": 1e-12,
  "text_max_position_embeddings": 512,
  "text_num_heads": 12,
  "text_num_layers": 6,
  "text_pad_token_id": 0,
  "text_proj_hidden": 576,
  "text_vocab_size": 30522,
  "transformers_version": "5.7.0",
  "auto_map": {
    "AutoConfig": "configuration_voiceclap.VoiceCLAPSmallConfig",
    "AutoModel": "modeling_voiceclap.VoiceCLAPSmall"
  }
}
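The `auto_map` block above is what lets the generic `Auto*` classes resolve to the custom code in this repo when `trust_remote_code=True` is passed; a quick check (field names taken from the config above):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("VoiceNet/voiceclap-small", trust_remote_code=True)
print(cfg.model_type)                                   # voiceclap-small
print(cfg.embed_dim, cfg.n_layer, cfg.text_num_layers)  # 768 12 6
```
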
configuration_voiceclap.py ADDED
@@ -0,0 +1,42 @@
"""VoiceCLAP-Small config."""
from transformers import PretrainedConfig


class VoiceCLAPSmallConfig(PretrainedConfig):
    model_type = "voiceclap-small"

    def __init__(
        self,
        embed_dim: int = 768,
        n_mels: int = 80,
        n_ctx: int = 1500,
        n_state: int = 768,
        n_head: int = 12,
        n_layer: int = 12,
        text_hidden_dim: int = 384,
        text_proj_hidden: int = 576,
        text_vocab_size: int = 30522,
        text_intermediate_size: int = 1536,
        text_num_layers: int = 6,
        text_num_heads: int = 12,
        text_max_position_embeddings: int = 512,
        text_layer_norm_eps: float = 1e-12,
        text_pad_token_id: int = 0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.n_mels = n_mels
        self.n_ctx = n_ctx
        self.n_state = n_state
        self.n_head = n_head
        self.n_layer = n_layer
        self.text_hidden_dim = text_hidden_dim
        self.text_proj_hidden = text_proj_hidden
        self.text_vocab_size = text_vocab_size
        self.text_intermediate_size = text_intermediate_size
        self.text_num_layers = text_num_layers
        self.text_num_heads = text_num_heads
        self.text_max_position_embeddings = text_max_position_embeddings
        self.text_layer_norm_eps = text_layer_norm_eps
        self.text_pad_token_id = text_pad_token_id
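Because the constructor defaults above mirror `config.json`, the model can also be built locally with random weights for quick shape checks; a purely illustrative sketch (embeddings from random weights carry no meaning):

```python
import torch
from configuration_voiceclap import VoiceCLAPSmallConfig
from modeling_voiceclap import VoiceCLAPSmall

config = VoiceCLAPSmallConfig()        # defaults match config.json
model = VoiceCLAPSmall(config).eval()  # randomly initialised towers

mel = torch.randn(1, config.n_mels, 3000)  # fake 30 s log-mel input
print(model.encode_audio(mel).shape)       # torch.Size([1, 768])
```
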
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90032e4c237a363b3fa79c576d1673ad61360a269336a7c0242d18797bfe769f
size 452717328
modeling_voiceclap.py ADDED
@@ -0,0 +1,205 @@
"""VoiceCLAP-Small: dual-tower CLAP using BUD-E-Whisper-Small + MiniLM.

Standalone single-file implementation. Only depends on PyTorch and
HuggingFace `transformers` (for `BertModel`, `PreTrainedModel`, and
`PretrainedConfig`).
"""
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertConfig, BertModel, PreTrainedModel

try:
    from .configuration_voiceclap import VoiceCLAPSmallConfig
except ImportError:
    from configuration_voiceclap import VoiceCLAPSmallConfig


class _LayerNorm(nn.LayerNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


def _sinusoids(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    assert channels % 2 == 0
    log_timescale_increment = math.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
    scaled_time = torch.arange(length)[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


class _MultiHeadAttention(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = nn.Linear(n_state, n_state)
        self.key = nn.Linear(n_state, n_state, bias=False)
        self.value = nn.Linear(n_state, n_state)
        self.out = nn.Linear(n_state, n_state)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        n_batch, n_ctx, n_state = q.shape
        head_dim = n_state // self.n_head
        q = q.view(n_batch, n_ctx, self.n_head, head_dim).transpose(1, 2)
        k = k.view(n_batch, n_ctx, self.n_head, head_dim).transpose(1, 2)
        v = v.view(n_batch, n_ctx, self.n_head, head_dim).transpose(1, 2)
        out = F.scaled_dot_product_attention(q, k, v)
        out = out.transpose(1, 2).reshape(n_batch, n_ctx, n_state)
        return self.out(out)


class _ResidualAttentionBlock(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.attn = _MultiHeadAttention(n_state, n_head)
        self.attn_ln = _LayerNorm(n_state)
        n_mlp = n_state * 4
        self.mlp = nn.Sequential(nn.Linear(n_state, n_mlp), nn.GELU(), nn.Linear(n_mlp, n_state))
        self.mlp_ln = _LayerNorm(n_state)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.attn_ln(x))
        x = x + self.mlp(self.mlp_ln(x))
        return x


class _WhisperAudioEncoder(nn.Module):
    """Whisper-style audio encoder. Takes a precomputed log-mel spectrogram."""

    def __init__(
        self,
        n_mels: int = 80,
        n_ctx: int = 1500,
        n_state: int = 768,
        n_head: int = 12,
        n_layer: int = 12,
        output_dim: int = 768,
    ):
        super().__init__()
        self.conv1 = nn.Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        self.register_buffer("positional_embedding", _sinusoids(n_ctx, n_state))
        self.blocks = nn.ModuleList(
            [_ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = _LayerNorm(n_state)
        self.avg_pooler = nn.AvgPool1d(kernel_size=2, stride=2)
        self.proj = nn.Linear(n_state, output_dim)

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        # mel: (B, n_mels, T_mel)
        x = F.gelu(self.conv1(mel))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)  # (B, T', D)
        T = x.size(1)
        x = x + self.positional_embedding[:T].to(dtype=x.dtype, device=x.device)
        for block in self.blocks:
            x = block(x)
        x = x.permute(0, 2, 1)
        x = self.avg_pooler(x)
        x = x.permute(0, 2, 1)
        x = self.ln_post(x)
        x = self.proj(x)
        return x


class VoiceCLAPSmall(PreTrainedModel):
    config_class = VoiceCLAPSmallConfig

    def __init__(self, config: VoiceCLAPSmallConfig):
        super().__init__(config)
        self.audio_encoder = _WhisperAudioEncoder(
            n_mels=config.n_mels,
            n_ctx=config.n_ctx,
            n_state=config.n_state,
            n_head=config.n_head,
            n_layer=config.n_layer,
            output_dim=config.embed_dim,
        )
        self.audio_proj = nn.Sequential(
            nn.Linear(config.embed_dim, config.embed_dim),
            nn.GELU(),
            nn.Linear(config.embed_dim, config.embed_dim),
        )
        bert_config = BertConfig(
            vocab_size=config.text_vocab_size,
            hidden_size=config.text_hidden_dim,
            num_hidden_layers=config.text_num_layers,
            num_attention_heads=config.text_num_heads,
            intermediate_size=config.text_intermediate_size,
            max_position_embeddings=config.text_max_position_embeddings,
            layer_norm_eps=config.text_layer_norm_eps,
            pad_token_id=config.text_pad_token_id,
        )
        self.text_encoder = BertModel(bert_config, add_pooling_layer=False)
        self.text_proj = nn.Sequential(
            nn.Linear(config.text_hidden_dim, config.text_proj_hidden, bias=False),
            nn.GELU(),
            nn.Linear(config.text_proj_hidden, config.embed_dim, bias=False),
        )
        self.logit_scale = nn.Parameter(torch.zeros(()))
        self.logit_bias = nn.Parameter(torch.zeros(()))

        # Mel filterbank used by encode_waveform / compute_log_mel.
        # 80 mel bins x 201 freq bins for n_fft=400, sr=16000 (Whisper-style).
        self.register_buffer(
            "mel_filters",
            torch.zeros(config.n_mels, 201),
            persistent=True,
        )
        self.post_init()

    @torch.no_grad()
    def compute_log_mel(
        self, waveform: torch.Tensor, sample_rate: int = 16000
    ) -> torch.Tensor:
        """Whisper-style log-mel spectrogram. waveform: (B, T) or (T,) at 16 kHz.

        Returns (B, n_mels, T_mel). Matches the training-time preprocessing
        bit-exactly so embeddings reproduce the published results.
        """
        if sample_rate != 16000:
            raise ValueError(f"sample_rate must be 16000, got {sample_rate}")
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        device = self.mel_filters.device
        waveform = waveform.to(device=device, dtype=torch.float32)
        window = torch.hann_window(400, device=device)
        stft = torch.stft(waveform, n_fft=400, hop_length=160, window=window, return_complex=True)
        magnitudes = stft[..., :-1].abs() ** 2
        mel = self.mel_filters.to(magnitudes.dtype) @ magnitudes
        log_spec = torch.clamp(mel, min=1e-10).log10()
        log_spec = torch.maximum(log_spec, log_spec.amax(dim=(-2, -1), keepdim=True) - 8.0)
        log_spec = (log_spec + 4.0) / 4.0
        return log_spec

    def encode_waveform(self, waveform: torch.Tensor, sample_rate: int = 16000) -> torch.Tensor:
        """Encode raw 16 kHz waveform; calls ``compute_log_mel`` then ``encode_audio``."""
        mel = self.compute_log_mel(waveform, sample_rate=sample_rate)
        return self.encode_audio(mel)

    def encode_audio(self, mel: torch.Tensor) -> torch.Tensor:
        feats = self.audio_encoder(mel)  # (B, T', D)
        feats = feats.mean(dim=1)  # clip-level mean
        feats = self.audio_proj(feats)
        return F.normalize(feats, dim=-1)

    def encode_text(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if attention_mask is None:
            attention_mask = (input_ids != self.config.text_pad_token_id).long()
        out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state  # (B, T, H)
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        feats = self.text_proj(pooled)
        return F.normalize(feats, dim=-1)
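One note on the `mel_filters` buffer defined above: it is registered as zeros and populated from the checkpoint, so the published weights already contain the Whisper-style filterbank. Only if you instantiate the model from scratch would you need to fill it yourself; a hedged sketch using torchaudio's Slaney-scale filterbank, which approximates (but is not guaranteed to match bit-exactly) the filters shipped with the checkpoint:

```python
import torchaudio

# 201 = n_fft // 2 + 1 frequency bins for n_fft=400; 80 mel bins; 16 kHz audio.
fbank = torchaudio.functional.melscale_fbanks(
    n_freqs=201, f_min=0.0, f_max=8000.0, n_mels=80,
    sample_rate=16000, norm="slaney", mel_scale="slaney",
)  # (n_freqs, n_mels)
model.mel_filters.copy_(fbank.T)  # the buffer is laid out as (n_mels, n_freqs)
```
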
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
{
  "chunk_length": 30,
  "dither": 0.0,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
{
  "backend": "tokenizers",
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "is_local": false,
  "local_files_only": false,
  "mask_token": "[MASK]",
  "max_length": 128,
  "model_max_length": 512,
  "never_split": null,
  "pad_to_multiple_of": null,
  "pad_token": "[PAD]",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "[SEP]",
  "stride": 0,
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "[UNK]"
}