gijs committed · verified
Commit f5fcbcd · 1 Parent(s): e41db6c

Initial release: VoiceCLAP-Small (BUD-E-Whisper + MiniLM, dual-tower CLAP, 1 epoch on voiceclap_10)

README.md ADDED
@@ -0,0 +1,86 @@
---
license: cc-by-4.0
language:
- en
library_name: transformers
pipeline_tag: zero-shot-audio-classification
tags:
- audio
- speech
- emotion
- clap
- contrastive
- voice
---

# VoiceCLAP-Small

Voice-text contrastive (CLAP-style) embedding model trained on dense vocal-style
captions for the [VoiceNet](https://huggingface.co/VoiceNet) suite.

VoiceCLAP-Small is the smaller of the two voice-text contrastive anchors
released with VoiceNet. It is a **dual-tower** model: a
[BUD-E-Whisper_V1.1](https://huggingface.co/laion/BUD-E-Whisper_V1.1) audio
encoder paired with
[`sentence-transformers/all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
on the text side, joined by an MLP projection on each side and trained with
the SigLIP sigmoid contrastive loss.

| | |
| --- | --- |
| Architecture | dual-tower CLAP (BUD-E-Whisper-Small + MiniLM-L6-v2) |
| Audio encoder | Whisper-style: 12 layers × 768 dim × 12 heads, 80-mel input @ 16 kHz |
| Text encoder | BERT/MiniLM, 6 layers × 384 dim, mean-pooled |
| Joint embedding | 768-d, L2-normalised |
| Loss | SigLIP (sigmoid contrastive) |
| Total parameters | ~110 M |
| Epochs | 1 |

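For reference, the SigLIP objective named above scores every audio-caption pair in a batch with a per-pair sigmoid rather than a batch-wise softmax. The sketch below is illustrative only, not the released training code; the batch construction and the assumption that `logit_scale` is stored in log-space are ours.

```python
import torch
import torch.nn.functional as F

def siglip_loss(audio_emb: torch.Tensor, text_emb: torch.Tensor,
                logit_scale: torch.Tensor, logit_bias: torch.Tensor) -> torch.Tensor:
    """Sigmoid contrastive loss over all audio-text pairs in a batch.

    audio_emb, text_emb: (B, 768), already L2-normalised.
    logit_scale, logit_bias: the scalar parameters stored by the model.
    """
    logits = logit_scale.exp() * audio_emb @ text_emb.T + logit_bias   # (B, B)
    labels = 2 * torch.eye(logits.size(0), device=logits.device) - 1   # +1 matched, -1 mismatched
    return -F.logsigmoid(labels * logits).mean()
```
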
## Training data

Trained for **1 epoch** on the open `voiceclap_10` mixture used in the
VoiceNet paper:

- `emolia-balanced-5M-subset` (annotated subset of [Emilia](https://huggingface.co/datasets/amphion/Emilia-Dataset))
- `laions_got_talent_clean_with_captions`
- `majestrino-data`
- `synthetic_vocal_bursts`
- `improved_synthetic_vocal_bursts`
- `ears`

All clips are captioned with `MOSS-Audio-8B-Thinking`-derived dense vocal-style
captions covering emotions, talking-style attributes, and demographics.

## Standalone load example

Only `transformers` and `torchaudio` are required (both on PyPI).

```python
import torch, torchaudio
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("VoiceNet/voiceclap-small", trust_remote_code=True).eval()
tok = AutoTokenizer.from_pretrained("VoiceNet/voiceclap-small")

# Audio: any-length 16 kHz waveform, mono
wav, sr = torchaudio.load("clip.wav")
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)
wav = wav.mean(0)  # (T,)
audio_emb = model.encode_waveform(wav)  # (1, 768), L2-normed

# Text: short caption(s)
enc = tok(["a calm and steady voice"], padding=True, return_tensors="pt")
text_emb = model.encode_text(enc.input_ids, enc.attention_mask)

# Cosine similarity (embeddings already L2-normalised)
print((audio_emb @ text_emb.T).item())
```

`encode_waveform` accepts clips up to 30 s; longer clips should be chunked or
truncated before being passed in. Embeddings are 768-d and unit-norm, so
`a @ t.T` is the cosine similarity used in zero-shot retrieval.

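Building on the snippet above, here is a minimal zero-shot classification sketch over a small label set. The prompts, the 30 s chunk-and-average handling of long clips, and the plain softmax over cosine similarities are illustrative choices, not a released evaluation recipe.

```python
labels = ["an angry voice", "a calm and steady voice", "a cheerful, excited voice"]
enc = tok(labels, padding=True, return_tensors="pt")

with torch.no_grad():
    label_emb = model.encode_text(enc.input_ids, enc.attention_mask)    # (3, 768)

    # Split long audio into 30 s chunks, embed each, average, re-normalise.
    chunks = wav.split(30 * 16000)
    chunk_embs = torch.cat([model.encode_waveform(c) for c in chunks])  # (n_chunks, 768)
    audio_emb = torch.nn.functional.normalize(chunk_embs.mean(0, keepdim=True), dim=-1)

probs = (audio_emb @ label_emb.T).softmax(dim=-1)  # (1, 3)
for label, p in zip(labels, probs[0].tolist()):
    print(f"{label}: {p:.3f}")
```
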
## Citation

If you use this model, please cite the VoiceNet paper.
config.json ADDED
@@ -0,0 +1,27 @@
{
  "architectures": [
    "VoiceCLAPSmall"
  ],
  "dtype": "float32",
  "embed_dim": 768,
  "model_type": "voiceclap-small",
  "n_ctx": 1500,
  "n_head": 12,
  "n_layer": 12,
  "n_mels": 80,
  "n_state": 768,
  "text_hidden_dim": 384,
  "text_intermediate_size": 1536,
  "text_layer_norm_eps": 1e-12,
  "text_max_position_embeddings": 512,
  "text_num_heads": 12,
  "text_num_layers": 6,
  "text_pad_token_id": 0,
  "text_proj_hidden": 576,
  "text_vocab_size": 30522,
  "transformers_version": "5.7.0",
  "auto_map": {
    "AutoConfig": "configuration_voiceclap.VoiceCLAPSmallConfig",
    "AutoModel": "modeling_voiceclap.VoiceCLAPSmall"
  }
}
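The `auto_map` block above is what lets the generic `Auto*` classes resolve to the custom code in this repo when `trust_remote_code=True` is passed; a quick check (field names taken from the config above):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("VoiceNet/voiceclap-small", trust_remote_code=True)
print(cfg.model_type)                                   # voiceclap-small
print(cfg.embed_dim, cfg.n_layer, cfg.text_num_layers)  # 768 12 6
```
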
configuration_voiceclap.py ADDED
@@ -0,0 +1,42 @@
"""VoiceCLAP-Small config."""
from transformers import PretrainedConfig


class VoiceCLAPSmallConfig(PretrainedConfig):
    model_type = "voiceclap-small"

    def __init__(
        self,
        embed_dim: int = 768,
        n_mels: int = 80,
        n_ctx: int = 1500,
        n_state: int = 768,
        n_head: int = 12,
        n_layer: int = 12,
        text_hidden_dim: int = 384,
        text_proj_hidden: int = 576,
        text_vocab_size: int = 30522,
        text_intermediate_size: int = 1536,
        text_num_layers: int = 6,
        text_num_heads: int = 12,
        text_max_position_embeddings: int = 512,
        text_layer_norm_eps: float = 1e-12,
        text_pad_token_id: int = 0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.n_mels = n_mels
        self.n_ctx = n_ctx
        self.n_state = n_state
        self.n_head = n_head
        self.n_layer = n_layer
        self.text_hidden_dim = text_hidden_dim
        self.text_proj_hidden = text_proj_hidden
        self.text_vocab_size = text_vocab_size
        self.text_intermediate_size = text_intermediate_size
        self.text_num_layers = text_num_layers
        self.text_num_heads = text_num_heads
        self.text_max_position_embeddings = text_max_position_embeddings
        self.text_layer_norm_eps = text_layer_norm_eps
        self.text_pad_token_id = text_pad_token_id
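Because the constructor defaults above mirror `config.json`, the model can also be built locally with random weights for quick shape checks; a purely illustrative sketch (embeddings from random weights carry no meaning):

```python
import torch
from configuration_voiceclap import VoiceCLAPSmallConfig
from modeling_voiceclap import VoiceCLAPSmall

config = VoiceCLAPSmallConfig()        # defaults match config.json
model = VoiceCLAPSmall(config).eval()  # randomly initialised towers

mel = torch.randn(1, config.n_mels, 3000)  # fake 30 s log-mel input
print(model.encode_audio(mel).shape)       # torch.Size([1, 768])
```
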
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90032e4c237a363b3fa79c576d1673ad61360a269336a7c0242d18797bfe769f
size 452717328
modeling_voiceclap.py ADDED
@@ -0,0 +1,205 @@
"""VoiceCLAP-Small: dual-tower CLAP using BUD-E-Whisper-Small + MiniLM.

Standalone single-file implementation. Only depends on PyTorch and
HuggingFace `transformers` (for `BertModel`, `PreTrainedModel`, and
`PretrainedConfig`).
"""
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertConfig, BertModel, PreTrainedModel

try:
    from .configuration_voiceclap import VoiceCLAPSmallConfig
except ImportError:
    from configuration_voiceclap import VoiceCLAPSmallConfig


class _LayerNorm(nn.LayerNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


def _sinusoids(length: int, channels: int, max_timescale: float = 10000.0) -> torch.Tensor:
    assert channels % 2 == 0
    log_timescale_increment = math.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
    scaled_time = torch.arange(length)[:, None] * inv_timescales[None, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


class _MultiHeadAttention(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.query = nn.Linear(n_state, n_state)
        self.key = nn.Linear(n_state, n_state, bias=False)
        self.value = nn.Linear(n_state, n_state)
        self.out = nn.Linear(n_state, n_state)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        n_batch, n_ctx, n_state = q.shape
        head_dim = n_state // self.n_head
        q = q.view(n_batch, n_ctx, self.n_head, head_dim).transpose(1, 2)
        k = k.view(n_batch, n_ctx, self.n_head, head_dim).transpose(1, 2)
        v = v.view(n_batch, n_ctx, self.n_head, head_dim).transpose(1, 2)
        out = F.scaled_dot_product_attention(q, k, v)
        out = out.transpose(1, 2).reshape(n_batch, n_ctx, n_state)
        return self.out(out)


class _ResidualAttentionBlock(nn.Module):
    def __init__(self, n_state: int, n_head: int):
        super().__init__()
        self.attn = _MultiHeadAttention(n_state, n_head)
        self.attn_ln = _LayerNorm(n_state)
        n_mlp = n_state * 4
        self.mlp = nn.Sequential(nn.Linear(n_state, n_mlp), nn.GELU(), nn.Linear(n_mlp, n_state))
        self.mlp_ln = _LayerNorm(n_state)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.attn_ln(x))
        x = x + self.mlp(self.mlp_ln(x))
        return x


class _WhisperAudioEncoder(nn.Module):
    """Whisper-style audio encoder. Takes a precomputed log-mel spectrogram."""

    def __init__(
        self,
        n_mels: int = 80,
        n_ctx: int = 1500,
        n_state: int = 768,
        n_head: int = 12,
        n_layer: int = 12,
        output_dim: int = 768,
    ):
        super().__init__()
        self.conv1 = nn.Conv1d(n_mels, n_state, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
        self.register_buffer("positional_embedding", _sinusoids(n_ctx, n_state))
        self.blocks = nn.ModuleList(
            [_ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
        )
        self.ln_post = _LayerNorm(n_state)
        self.avg_pooler = nn.AvgPool1d(kernel_size=2, stride=2)
        self.proj = nn.Linear(n_state, output_dim)

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        # mel: (B, n_mels, T_mel)
        x = F.gelu(self.conv1(mel))
        x = F.gelu(self.conv2(x))
        x = x.permute(0, 2, 1)  # (B, T', D)
        T = x.size(1)
        x = x + self.positional_embedding[:T].to(dtype=x.dtype, device=x.device)
        for block in self.blocks:
            x = block(x)
        x = x.permute(0, 2, 1)
        x = self.avg_pooler(x)
        x = x.permute(0, 2, 1)
        x = self.ln_post(x)
        x = self.proj(x)
        return x


class VoiceCLAPSmall(PreTrainedModel):
    config_class = VoiceCLAPSmallConfig

    def __init__(self, config: VoiceCLAPSmallConfig):
        super().__init__(config)
        self.audio_encoder = _WhisperAudioEncoder(
            n_mels=config.n_mels,
            n_ctx=config.n_ctx,
            n_state=config.n_state,
            n_head=config.n_head,
            n_layer=config.n_layer,
            output_dim=config.embed_dim,
        )
        self.audio_proj = nn.Sequential(
            nn.Linear(config.embed_dim, config.embed_dim),
            nn.GELU(),
            nn.Linear(config.embed_dim, config.embed_dim),
        )
        bert_config = BertConfig(
            vocab_size=config.text_vocab_size,
            hidden_size=config.text_hidden_dim,
            num_hidden_layers=config.text_num_layers,
            num_attention_heads=config.text_num_heads,
            intermediate_size=config.text_intermediate_size,
            max_position_embeddings=config.text_max_position_embeddings,
            layer_norm_eps=config.text_layer_norm_eps,
            pad_token_id=config.text_pad_token_id,
        )
        self.text_encoder = BertModel(bert_config, add_pooling_layer=False)
        self.text_proj = nn.Sequential(
            nn.Linear(config.text_hidden_dim, config.text_proj_hidden, bias=False),
            nn.GELU(),
            nn.Linear(config.text_proj_hidden, config.embed_dim, bias=False),
        )
        self.logit_scale = nn.Parameter(torch.zeros(()))
        self.logit_bias = nn.Parameter(torch.zeros(()))

        # Mel filterbank used by encode_waveform / compute_log_mel.
        # 80 mel bins x 201 freq bins for n_fft=400, sr=16000 (Whisper-style).
        self.register_buffer(
            "mel_filters",
            torch.zeros(config.n_mels, 201),
            persistent=True,
        )
        self.post_init()

    @torch.no_grad()
    def compute_log_mel(
        self, waveform: torch.Tensor, sample_rate: int = 16000
    ) -> torch.Tensor:
        """Whisper-style log-mel spectrogram. waveform: (B, T) or (T,) at 16 kHz.

        Returns (B, n_mels, T_mel). Matches the training-time preprocessing
        bit-exactly so embeddings reproduce the published results.
        """
        if sample_rate != 16000:
            raise ValueError(f"sample_rate must be 16000, got {sample_rate}")
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        device = self.mel_filters.device
        waveform = waveform.to(device=device, dtype=torch.float32)
        window = torch.hann_window(400, device=device)
        stft = torch.stft(waveform, n_fft=400, hop_length=160, window=window, return_complex=True)
        magnitudes = stft[..., :-1].abs() ** 2
        mel = self.mel_filters.to(magnitudes.dtype) @ magnitudes
        log_spec = torch.clamp(mel, min=1e-10).log10()
        log_spec = torch.maximum(log_spec, log_spec.amax(dim=(-2, -1), keepdim=True) - 8.0)
        log_spec = (log_spec + 4.0) / 4.0
        return log_spec

    def encode_waveform(self, waveform: torch.Tensor, sample_rate: int = 16000) -> torch.Tensor:
        """Encode raw 16 kHz waveform; calls ``compute_log_mel`` then ``encode_audio``."""
        mel = self.compute_log_mel(waveform, sample_rate=sample_rate)
        return self.encode_audio(mel)

    def encode_audio(self, mel: torch.Tensor) -> torch.Tensor:
        feats = self.audio_encoder(mel)  # (B, T', D)
        feats = feats.mean(dim=1)  # clip-level mean
        feats = self.audio_proj(feats)
        return F.normalize(feats, dim=-1)

    def encode_text(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if attention_mask is None:
            attention_mask = (input_ids != self.config.text_pad_token_id).long()
        out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state  # (B, T, H)
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        feats = self.text_proj(pooled)
        return F.normalize(feats, dim=-1)
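One note on the `mel_filters` buffer defined above: it is registered as zeros and populated from the checkpoint, so the published weights already contain the Whisper-style filterbank. Only if you instantiate the model from scratch would you need to fill it yourself; a hedged sketch using torchaudio's Slaney-scale filterbank, which approximates (but is not guaranteed to match bit-exactly) the filters shipped with the checkpoint:

```python
import torchaudio

# 201 = n_fft // 2 + 1 frequency bins for n_fft=400; 80 mel bins; 16 kHz audio.
fbank = torchaudio.functional.melscale_fbanks(
    n_freqs=201, f_min=0.0, f_max=8000.0, n_mels=80,
    sample_rate=16000, norm="slaney", mel_scale="slaney",
)  # (n_freqs, n_mels)
model.mel_filters.copy_(fbank.T)  # the buffer is laid out as (n_mels, n_freqs)
```
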
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
{
  "chunk_length": 30,
  "dither": 0.0,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 80,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
{
  "backend": "tokenizers",
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "is_local": false,
  "local_files_only": false,
  "mask_token": "[MASK]",
  "max_length": 128,
  "model_max_length": 512,
  "never_split": null,
  "pad_to_multiple_of": null,
  "pad_token": "[PAD]",
  "pad_token_type_id": 0,
  "padding_side": "right",
  "sep_token": "[SEP]",
  "stride": 0,
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "truncation_side": "right",
  "truncation_strategy": "longest_first",
  "unk_token": "[UNK]"
}