root committed on
Commit
9778d56
1 Parent(s): 40e68f7
Files changed (31) hide show
  1. CLAP/msclap/CLAPWrapper.py +274 -0
  2. CLAP/msclap/__init__.py +0 -0
  3. CLAP/msclap/clap.ipynb +0 -0
  4. CLAP/msclap/classification.ipynb +361 -0
  5. CLAP/msclap/configs/.ipynb_checkpoints/config-checkpoint.yml +26 -0
  6. CLAP/msclap/configs/config.yml +26 -0
  7. CLAP/msclap/esc50_dataset.py +82 -0
  8. CLAP/msclap/models/.ipynb_checkpoints/audio-checkpoint.py +201 -0
  9. CLAP/msclap/models/.ipynb_checkpoints/clap-checkpoint.py +90 -0
  10. CLAP/msclap/models/.ipynb_checkpoints/utils-checkpoint.py +26 -0
  11. CLAP/msclap/models/__init__.py +3 -0
  12. CLAP/msclap/models/__pycache__/__init__.cpython-310.pyc +0 -0
  13. CLAP/msclap/models/__pycache__/__init__.cpython-311.pyc +0 -0
  14. CLAP/msclap/models/__pycache__/__init__.cpython-38.pyc +0 -0
  15. CLAP/msclap/models/__pycache__/audio.cpython-310.pyc +0 -0
  16. CLAP/msclap/models/__pycache__/audio.cpython-311.pyc +0 -0
  17. CLAP/msclap/models/__pycache__/audio.cpython-38.pyc +0 -0
  18. CLAP/msclap/models/__pycache__/clap.cpython-310.pyc +0 -0
  19. CLAP/msclap/models/__pycache__/clap.cpython-311.pyc +0 -0
  20. CLAP/msclap/models/__pycache__/clap.cpython-38.pyc +0 -0
  21. CLAP/msclap/models/__pycache__/utils.cpython-310.pyc +0 -0
  22. CLAP/msclap/models/__pycache__/utils.cpython-311.pyc +0 -0
  23. CLAP/msclap/models/__pycache__/utils.cpython-38.pyc +0 -0
  24. CLAP/msclap/models/audio.py +200 -0
  25. CLAP/msclap/models/clap.py +92 -0
  26. CLAP/msclap/models/utils.py +26 -0
  27. CLAP/msclap/zero_shot_classification.py +46 -0
  28. CLAP/msclap/zero_shot_predictions.py +52 -0
  29. README.md +1 -0
  30. ldm/modules/encoders/audio_projector_res.py +94 -0
  31. requirements.txt +18 -0
CLAP/msclap/CLAPWrapper.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import torchaudio
3
+ # from torch._six import string_classes
4
+ import collections
5
+ import re
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ from transformers import AutoTokenizer
9
+ from models.utils import read_config_as_args
10
+ from models.clap import CLAP
11
+ import math
12
+ import torchaudio.transforms as T
13
+ import os
14
+ import torch
15
+ from importlib_resources import files
16
+
17
+
18
+ class CLAPWrapper():
19
+ """
20
+ A class for interfacing CLAP model.
21
+ """
22
+
23
def __init__(self, model_fp, use_cuda=False):
    """Store the settings, read the bundled config and load the CLAP model.

    Args:
        model_fp: Path of the pretrained checkpoint; ``load_clap`` passes it
            to ``torch.load``.
        use_cuda: When true, the model and its inputs are moved to the GPU,
            provided ``torch.cuda.is_available()`` also reports true.
    """
    # Constructor arguments, read later by load_clap and the preprocess_* methods.
    self.model_fp = model_fp
    self.use_cuda = use_cuda
    self.file_path = os.path.realpath(__file__)

    # Used by default_collate: the pattern is searched in numpy dtype strings
    # to reject string/object arrays, the template formats its TypeError.
    self.np_str_obj_array_pattern = re.compile(r'[SaUO]')
    self.default_collate_err_msg_format = (
        "default_collate: batch must contain tensors, numpy arrays, "
        "numbers, dicts or lists; found {}")

    # The YAML config ships inside the 'configs' package.
    config_resource = files('configs').joinpath('config.yml')
    self.config_as_str = config_resource.read_text()

    # load_clap also sets self.token_keys as a side effect.
    model, tokenizer, config_args = self.load_clap()
    self.clap = model
    self.tokenizer = tokenizer
    self.args = config_args
33
+
34
+
35
def load_clap(self):
    r"""Build the CLAP model, its tokenizer and the parsed configuration.

    Side effect: sets ``self.token_keys`` to the tokenizer output keys that
    ``preprocess_text`` reshapes (with ``token_type_ids`` only when the
    configured text model name contains ``'bert'``).

    Returns:
        tuple: ``(clap, tokenizer, args)`` - the model in eval mode (on the
        GPU when requested and available), the tokenizer for
        ``args.text_model``, and the config namespace.
    """
    args = read_config_as_args(self.config_as_str, is_config_str=True)

    if 'bert' in args.text_model:
        token_keys = ['input_ids', 'token_type_ids', 'attention_mask']
    else:
        token_keys = ['input_ids', 'attention_mask']
    self.token_keys = token_keys

    model_kwargs = dict(
        audioenc_name=args.audioenc_name,
        sample_rate=args.sampling_rate,
        window_size=args.window_size,
        hop_size=args.hop_size,
        mel_bins=args.mel_bins,
        fmin=args.fmin,
        fmax=args.fmax,
        classes_num=args.num_classes,
        out_emb=args.out_emb,
        text_model=args.text_model,
        transformer_embed_dim=args.transformer_embed_dim,
        d_proj=args.d_proj,
    )
    clap = CLAP(**model_kwargs)

    # Weights are always deserialized onto the CPU first.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(self.model_fp, map_location=torch.device('cpu'))
    # strict=False: keys that do not match the model are tolerated.
    clap.load_state_dict(checkpoint['model'], strict=False)
    clap.eval()

    tokenizer = AutoTokenizer.from_pretrained(args.text_model)

    move_to_gpu = self.use_cuda and torch.cuda.is_available()
    if move_to_gpu:
        clap = clap.cuda()

    return clap, tokenizer, args
76
+
77
def default_collate(self, batch):
    r"""Puts each data field into a tensor with outer dimension batch size.

    Dispatches on the type of the first element of ``batch``:

    * tensors are stacked along a new leading dimension;
    * numpy arrays are converted to tensors and collated recursively,
      numpy scalars become one tensor;
    * Python floats become a float64 tensor, ints an integer tensor;
    * ``str`` / ``bytes`` batches are returned unchanged;
    * mappings, namedtuples and sequences are collated field by field.

    Args:
        batch: Non-empty sequence of samples sharing one type.

    Returns:
        The collated batch; its type depends on the element type as above.

    Raises:
        TypeError: for numpy string/object arrays and unsupported types.
        RuntimeError: if sequence elements differ in length.
    """
    elem = batch[0]
    elem_type = type(elem)

    if isinstance(elem, torch.Tensor):
        out = None
        if torch.utils.data.get_worker_info() is not None:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum(x.numel() for x in batch)
            storage = elem.storage()._new_shared(numel)
            out = elem.new(storage)
        return torch.stack(batch, 0, out=out)
    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
            and elem_type.__name__ != 'string_':
        if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
            # arrays of string classes and objects cannot become tensors
            if self.np_str_obj_array_pattern.search(elem.dtype.str) is not None:
                raise TypeError(
                    self.default_collate_err_msg_format.format(elem.dtype))

            return self.default_collate([torch.as_tensor(b) for b in batch])
        elif elem.shape == ():  # scalars
            return torch.as_tensor(batch)
    elif isinstance(elem, float):
        return torch.tensor(batch, dtype=torch.float64)
    elif isinstance(elem, int):
        return torch.tensor(batch)
    elif isinstance(elem, (str, bytes)):
        # Must come before the Sequence branch: a str is itself a Sequence
        # of one-character strs, so falling through would recurse forever.
        return batch
    elif isinstance(elem, collections.abc.Mapping):
        return {key: self.default_collate([d[key] for d in batch]) for key in elem}
    elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
        return elem_type(*(self.default_collate(samples) for samples in zip(*batch)))
    elif isinstance(elem, collections.abc.Sequence):
        # check to make sure that the elements in batch have consistent size
        it = iter(batch)
        elem_size = len(next(it))
        if not all(len(other) == elem_size for other in it):
            raise RuntimeError(
                'each element in list of batch should be of equal size')
        transposed = zip(*batch)
        return [self.default_collate(samples) for samples in transposed]

    raise TypeError(self.default_collate_err_msg_format.format(elem_type))
126
+
127
def load_audio_into_tensor(self, audio_path, audio_duration, resample=False):
    r"""Load an audio file and return a flat waveform of fixed length.

    The loaded signal is averaged over its first dimension (presumably the
    channel axis of ``torchaudio.load`` - confirm), optionally resampled to
    ``self.args.sampling_rate``, flattened, and then either tiled and cut
    or randomly cropped to ``audio_duration * self.args.sampling_rate``
    samples.

    Args:
        audio_path: File handed to ``torchaudio.load``.
        audio_duration: Target duration; multiplied by the configured
            sampling rate to get the target sample count.
        resample: When true, resample from the file's rate to the
            configured rate before length adjustment.

    Returns:
        ``torch.FloatTensor`` built from the length-adjusted 1-D waveform.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    resample_rate = self.args.sampling_rate

    waveform = waveform.mean(dim=0, keepdim=True)
    if resample:
        waveform = T.Resample(sample_rate, resample_rate)(waveform)
    waveform = waveform.reshape(-1)

    # NOTE: the target length uses the configured rate even when
    # resample is False and the file has a different native rate.
    target_len = audio_duration*resample_rate
    available = waveform.shape[0]

    if target_len >= available:
        # Clip is too short (or exact): tile it, then cut off the excess.
        repeat_factor = int(np.ceil(target_len / available))
        waveform = waveform.repeat(repeat_factor)
        waveform = waveform[0:target_len]
    else:
        # Clip is too long: keep a random window of the target length.
        start_index = random.randrange(available - target_len)
        waveform = waveform[start_index:start_index + target_len]

    return torch.FloatTensor(waveform)
167
+
168
def preprocess_audio(self, audio_files, resample):
    r"""Load a list of audio files and return them as one collated batch.

    Each file is loaded with ``load_audio_into_tensor`` using
    ``self.args.duration`` and reshaped to ``(1, -1)``, so the collated
    result has the three dimensions that ``_get_audio_embeddings`` indexes
    (``shape[0]`` and ``shape[2]``).

    Args:
        audio_files: Iterable of audio file paths.
        resample: Forwarded to ``load_audio_into_tensor``.

    Returns:
        The result of ``default_collate`` over the per-file tensors.
    """
    on_gpu = self.use_cuda and torch.cuda.is_available()
    audio_tensors = []

    for audio_file in audio_files:
        audio_tensor = self.load_audio_into_tensor(
            audio_file, self.args.duration, resample)

        # reshape() is not in-place: the result must be kept on the CPU path
        # too, otherwise the batch ends up 2-D and downstream indexing fails.
        audio_tensor = audio_tensor.reshape(1, -1)
        if on_gpu:
            audio_tensor = audio_tensor.cuda()

        audio_tensors.append(audio_tensor)

    return self.default_collate(audio_tensors)
189
+
190
def preprocess_text(self, text_queries):
    r"""Tokenize a list of text queries and return them as one collated batch.

    Every query is padded and truncated to ``self.args.text_len`` tokens.
    The entries named in ``self.token_keys`` are flattened to 1-D (and moved
    to the GPU when enabled and available) before collation.

    Args:
        text_queries: Iterable of strings (e.g. class-label prompts).

    Returns:
        The result of ``default_collate`` over the tokenizer outputs.
    """
    on_gpu = self.use_cuda and torch.cuda.is_available()
    tokenized_texts = []
    for ttext in text_queries:
        # padding='max_length' replaces the deprecated pad_to_max_length=True;
        # truncation=True makes explicit what the tokenizer otherwise does
        # implicitly (with a warning) whenever max_length is given.
        tok = self.tokenizer.encode_plus(
            text=ttext, add_special_tokens=True,
            max_length=self.args.text_len, padding='max_length',
            truncation=True, return_tensors="pt")
        for key in self.token_keys:
            flat = tok[key].reshape(-1)
            tok[key] = flat.cuda() if on_gpu else flat
        tokenized_texts.append(tok)
    return self.default_collate(tokenized_texts)
200
+
201
def get_text_embeddings(self, class_labels):
    r"""Tokenize the given labels and return their text embeddings.

    The embeddings from ``_get_text_embeddings`` are divided once more by
    their norm along the last dimension before being returned.
    """
    tokens = self.preprocess_text(class_labels)
    embeddings = self._get_text_embeddings(tokens)
    norms = torch.norm(embeddings, dim=-1, keepdim=True)
    return embeddings/norms
207
+
208
def get_audio_embeddings(self, audio_files, resample, use_aug=False):
    r"""Load the given audio files and return their audio embeddings.

    Args:
        audio_files: Iterable of audio file paths.
        resample: Forwarded to ``preprocess_audio``.
        use_aug: Forwarded to the audio encoder.

    Returns:
        tuple: ``(audio_embeddings, audio_inner_layer)`` - the embeddings
        divided by their norm along the last dimension, and the encoder's
        third output passed through unchanged.
    """
    audio_batch = self.preprocess_audio(audio_files, resample)
    embeddings, inner_layer = self._get_audio_embeddings(audio_batch, use_aug=use_aug)
    norms = torch.norm(embeddings, dim=-1, keepdim=True)
    return embeddings/norms, inner_layer
214
+
215
def _get_text_embeddings(self, preprocessed_text):
    r"""Run the caption encoder on tokenized text, without gradients.

    Returns the encoder output divided by its norm along the last dimension.
    """
    with torch.no_grad():
        raw = self.clap.caption_encoder(preprocessed_text)
        norms = torch.norm(raw, dim=-1, keepdim=True)
        return raw/norms
221
+
222
def _get_audio_embeddings(self, preprocessed_audio, use_aug=False):
    r"""Run the audio encoder on a collated audio batch, without gradients.

    The batch is reshaped to ``(shape[0], shape[2])`` before encoding, so it
    must have at least three dimensions.

    Returns:
        tuple: ``(audio_embeddings, audio_inner_layer)`` - the encoder's
        first output divided by its norm along the last dimension, and its
        third output. The second output is discarded (the original comment
        describes it as output class probabilities).
    """
    with torch.no_grad():
        dims = preprocessed_audio.shape
        flat_batch = preprocessed_audio.reshape(dims[0], dims[2])

        encoded = self.clap.audio_encoder(flat_batch, use_aug=use_aug)
        raw_embeddings, _, audio_inner_layer = encoded
        norms = torch.norm(raw_embeddings, dim=-1, keepdim=True)
        return raw_embeddings/norms, audio_inner_layer
234
+
235
def compute_similarity(self, audio_embeddings, text_embeddings):
    r"""Return scaled text-audio similarity, one row per audio embedding.

    The text embeddings are multiplied by ``exp(self.clap.logit_scale)`` and
    matrix-multiplied with the transposed audio embeddings; the product is
    transposed so that rows correspond to audio items.
    """
    scale = self.clap.logit_scale.exp()
    scaled_text = scale*text_embeddings
    text_by_audio = scaled_text @ audio_embeddings.T
    return text_by_audio.T
240
+
241
def _generic_batch_inference(self, func, *args):
    r"""Yield ``func`` results for consecutive batches of the first input.

    Call shapes (the last positional argument is always the batch size):

    * ``(items, batch_size)`` - ``func(batch)`` is called per batch;
    * ``(audio_files, class_labels, batch_size)`` - text embeddings for
      ``class_labels`` are computed once and
      ``func(batch, class_labels, text_embeddings)`` is called per batch.

    Every batch is yielded, including the final (possibly shorter) one.

    Raises:
        ValueError: if the batch size is not positive (raised when the
            generator is first advanced).
    """
    input_tmp = args[0]
    batch_size = args[-1]
    if batch_size <= 0:
        raise ValueError(
            'batch_size must be a positive integer, got {}'.format(batch_size))

    inputs = [input_tmp]
    if len(args) == 3:
        # compute text_embeddings once for all the audio_files batches
        class_labels = args[1]
        inputs = [input_tmp, class_labels, self.get_text_embeddings(class_labels)]

    for start in range(0, len(input_tmp), batch_size):
        inputs[0] = input_tmp[start:start + batch_size]
        # Always yield: a `return value` inside a generator only ends the
        # iteration, so the last batch would never reach the caller.
        yield func(*inputs)
263
+
264
def get_audio_embeddings_per_batch(self, audio_files, batch_size, resample=False):
    r"""Compute audio embeddings batch by batch.

    Args:
        audio_files: Sequence of audio file paths.
        batch_size: Number of files per batch.
        resample: Forwarded to ``get_audio_embeddings`` for every batch.
            The batch driver calls its function with the batch only, so the
            flag has to be bound here; without it the call fails with a
            missing-argument ``TypeError``.

    Returns:
        Whatever ``_generic_batch_inference`` returns for this function: an
        iterable of ``get_audio_embeddings`` results, one per batch.
    """
    def embed_batch(batch):
        return self.get_audio_embeddings(batch, resample)

    return self._generic_batch_inference(embed_batch, audio_files, batch_size)
267
+
268
def get_text_embeddings_per_batch(self, class_labels, batch_size):
    r"""Compute text embeddings batch by batch.

    Delegates to ``_generic_batch_inference`` with ``get_text_embeddings``
    as the per-batch function and returns its result.
    """
    embed_batch = self.get_text_embeddings
    batches = self._generic_batch_inference(embed_batch, class_labels, batch_size)
    return batches
271
+
272
def classify_audio_files_per_batch(self, audio_files, class_labels, batch_size, resample=False):
    r"""Compute class probabilities for each audio file, batch by batch.

    The class has no ``classify_audio_files`` method, so the per-batch
    classifier is defined here. It embeds the audio batch, scores it against
    the text embeddings that the batch driver computes once for
    ``class_labels``, and applies a softmax over the class dimension.

    Args:
        audio_files: Sequence of audio file paths.
        class_labels: Sequence of text labels/prompts to score against.
        batch_size: Number of audio files per batch.
        resample: Forwarded to ``get_audio_embeddings`` for every batch.

    Returns:
        Whatever ``_generic_batch_inference`` returns for this function: an
        iterable with one ``(batch, num_labels)`` probability tensor per
        batch.
    """
    def classify_batch(audio_batch, _labels, text_embeddings):
        audio_embeddings, _ = self.get_audio_embeddings(audio_batch, resample)
        similarity = self.compute_similarity(audio_embeddings, text_embeddings)
        return F.softmax(similarity, dim=1)

    return self._generic_batch_inference(
        classify_batch, audio_files, class_labels, batch_size)
CLAP/msclap/__init__.py ADDED
File without changes
CLAP/msclap/clap.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
CLAP/msclap/classification.ipynb ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 15,
6
+ "id": "6bf499e8-54b0-498b-84b6-aba956cc573b",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "\n",
12
+ "\n",
13
+ "from CLAPWrapper import CLAPWrapper\n",
14
+ "from esc50_dataset import ESC50\n",
15
+ "import torch.nn.functional as F\n",
16
+ "import numpy as np\n",
17
+ "from tqdm import tqdm\n",
18
+ "from sklearn.metrics import accuracy_score\n",
19
+ "\n",
20
+ "import torch\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 16,
26
+ "id": "082e82b9-56b4-41ce-a8f8-390bb5bc0193",
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "df = pd.read_csv(\"../landscape/landscape_final.csv\")\n",
31
+ "\n",
32
+ "classes = list(set(df[\"label\"]))\n",
33
+ "\n",
34
+ "prompt = 'this is a sound of '\n",
35
+ "y = [prompt + x for x in classes]\n",
36
+ "\n",
37
+ "class_count = len(classes)"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 17,
43
+ "id": "68e72bf4-6c94-438d-b3f3-c46aaa0b88cc",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "class_dict = {k: v for v, k in enumerate(classes)}"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 37,
53
+ "id": "80c437e3-b7e3-41bc-bb9c-fab936648caf",
54
+ "metadata": {},
55
+ "outputs": [
56
+ {
57
+ "name": "stderr",
58
+ "output_type": "stream",
59
+ "text": [
60
+ "/kuacc/users/bbiner21/.conda/envs/clap/lib/python3.8/site-packages/torchlibrosa/stft.py:193: FutureWarning: Pass size=1024 as keyword args. From version 0.10 passing these as positional arguments will result in an error\n",
61
+ " fft_window = librosa.util.pad_center(fft_window, n_fft)\n",
62
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']\n",
63
+ "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
64
+ "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
65
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
66
+ "/kuacc/users/bbiner21/.conda/envs/clap/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2339: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
67
+ " warnings.warn(\n"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "# Load and initialize CLAP\n",
73
+ "weights_path = \"../clap_weight/CLAP_weights_2022.pth\"\n",
74
+ "\n",
75
+ "# Setting use_cuda = True will load the model on a GPU using CUDA\n",
76
+ "clap_model = CLAPWrapper(weights_path, use_cuda=False)\n",
77
+ "\n",
78
+ "# Computing text embeddings\n",
79
+ "text_embeddings = clap_model.get_text_embeddings(y)\n",
80
+ "\n",
81
+ "# Computing audio embeddings\n",
82
+ "y_preds, y_labels = [], []\n"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 38,
88
+ "id": "3093fa76-5c25-4cae-a43c-8368fdfd96fc",
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "name": "stderr",
93
+ "output_type": "stream",
94
+ "text": [
95
+ "100%|██████████| 1061/1061 [02:33<00:00, 6.92it/s]\n"
96
+ ]
97
+ }
98
+ ],
99
+ "source": [
100
+ "\n",
101
+ "gt = []\n",
102
+ "pred = []\n",
103
+ "\n",
104
+ "for i in tqdm(range(len(df.index))):\n",
105
+ " x = \"/datasets/audio-image/audios/audio_10s/\" + df.iloc[i,1] + \".wav\"\n",
106
+ " \n",
107
+ " cur_class = class_dict[df.iloc[i,0]]\n",
108
+ " one_hot = torch.zeros((1,class_count))\n",
109
+ " one_hot[0,cur_class] = 1.0 \n",
110
+ " \n",
111
+ " gt.append(cur_class)\n",
112
+ " \n",
113
+ " \n",
114
+ "# x, _, one_hot_target = dataset.__getitem__(i)\n",
115
+ " audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)\n",
116
+ " \n",
117
+ " similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)\n",
118
+ " y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()\n",
119
+ " \n",
120
+ " pred.append(np.argmax(y_pred, axis=1)[0])\n",
121
+ " y_preds.append(y_pred)\n",
122
+ " y_labels.append(one_hot.detach().cpu().numpy())\n"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": 23,
128
+ "id": "e2247ab8-844d-4eba-b691-4d38051a51a3",
129
+ "metadata": {},
130
+ "outputs": [
131
+ {
132
+ "name": "stdout",
133
+ "output_type": "stream",
134
+ "text": [
135
+ "ESC50 Accuracy 0.4458058435438266\n"
136
+ ]
137
+ },
138
+ {
139
+ "data": {
140
+ "text/plain": [
141
+ "'\\nThe output:\\n\\nESC50 Accuracy: 82.6%\\n\\n'"
142
+ ]
143
+ },
144
+ "execution_count": 23,
145
+ "metadata": {},
146
+ "output_type": "execute_result"
147
+ }
148
+ ],
149
+ "source": [
150
+ "\n",
151
+ "\n",
152
+ "\n",
153
+ "# for i in tqdm(range(len(dataset))):\n",
154
+ "# x, _, one_hot_target = dataset.__getitem__(i)\n",
155
+ "# audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)\n",
156
+ "# similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)\n",
157
+ "# y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()\n",
158
+ "# y_preds.append(y_pred)\n",
159
+ "# y_labels.append(one_hot_target.detach().cpu().numpy())\n",
160
+ "\n",
161
+ "y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)\n",
162
+ "acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))\n",
163
+ "print('ESC50 Accuracy {}'.format(acc))\n",
164
+ "\n",
165
+ "\"\"\"\n",
166
+ "The output:\n",
167
+ "\n",
168
+ "ESC50 Accuracy: 82.6%\n",
169
+ "\n",
170
+ "\"\"\"\n"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 25,
176
+ "id": "41254964-43ec-4fcb-b1d0-2c9ae76d56f6",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "gt = []\n",
181
+ "x = \"/datasets/audio-image/audios/audio_10s/\" + df.iloc[0,1] + \".wav\"\n",
182
+ "\n",
183
+ "cur_class = class_dict[df.iloc[0,0]]\n",
184
+ "one_hot = torch.zeros((1,class_count))\n",
185
+ "one_hot[0,cur_class] = 1.0 \n",
186
+ "\n",
187
+ "gt.append(cur_class)\n",
188
+ "\n",
189
+ "\n",
190
+ "# x, _, one_hot_target = dataset.__getitem__(i)\n",
191
+ "audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)\n",
192
+ "\n",
193
+ "similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 31,
199
+ "id": "7e73d889-05b6-46ab-820a-9728b1623d5a",
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()\n"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": 35,
209
+ "id": "99574178-aba0-467b-a370-679ae927b13b",
210
+ "metadata": {},
211
+ "outputs": [
212
+ {
213
+ "data": {
214
+ "text/plain": [
215
+ "3"
216
+ ]
217
+ },
218
+ "execution_count": 35,
219
+ "metadata": {},
220
+ "output_type": "execute_result"
221
+ }
222
+ ],
223
+ "source": [
224
+ "np.argmax(y_pred, axis=1)[0]"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 41,
230
+ "id": "21b42bef-9500-46be-8f3e-2c53b91462d0",
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "data": {
235
+ "text/plain": [
236
+ "array([0.28571429, 0.35164835, 0.7877095 , 0.59615385, 0.01639344,\n",
237
+ " 0.93243243, 0.93292683, 0.03092784, 0.4 ])"
238
+ ]
239
+ },
240
+ "execution_count": 41,
241
+ "metadata": {},
242
+ "output_type": "execute_result"
243
+ }
244
+ ],
245
+ "source": [
246
+ "from sklearn.metrics import confusion_matrix\n",
247
+ "\n",
248
+ "matrix = confusion_matrix(gt, pred)\n",
249
+ "matrix.diagonal()/matrix.sum(axis=1)"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": 42,
255
+ "id": "2e96c02a-d789-417e-aaec-a420976bef17",
256
+ "metadata": {},
257
+ "outputs": [
258
+ {
259
+ "data": {
260
+ "text/plain": [
261
+ "array([[ 8, 2, 0, 0, 0, 0, 18, 0, 0],\n",
262
+ " [ 5, 64, 1, 11, 0, 0, 100, 1, 0],\n",
263
+ " [ 1, 1, 141, 5, 2, 3, 23, 1, 2],\n",
264
+ " [ 2, 1, 0, 31, 0, 1, 15, 0, 2],\n",
265
+ " [ 70, 51, 0, 0, 3, 2, 40, 17, 0],\n",
266
+ " [ 1, 1, 0, 3, 0, 69, 0, 0, 0],\n",
267
+ " [ 2, 1, 7, 0, 0, 0, 153, 0, 1],\n",
268
+ " [ 30, 85, 0, 1, 0, 0, 72, 6, 0],\n",
269
+ " [ 1, 0, 0, 1, 0, 1, 0, 0, 2]])"
270
+ ]
271
+ },
272
+ "execution_count": 42,
273
+ "metadata": {},
274
+ "output_type": "execute_result"
275
+ }
276
+ ],
277
+ "source": [
278
+ "matrix"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 43,
284
+ "id": "24911c5c-06ed-492f-927d-1555df15b1c5",
285
+ "metadata": {},
286
+ "outputs": [
287
+ {
288
+ "data": {
289
+ "text/plain": [
290
+ "['this is a sound of waterfall burbling',\n",
291
+ " 'this is a sound of wind noise',\n",
292
+ " 'this is a sound of fire crackling',\n",
293
+ " 'this is a sound of thunder',\n",
294
+ " 'this is a sound of squishing water',\n",
295
+ " 'this is a sound of underwater bubbling',\n",
296
+ " 'this is a sound of raining',\n",
297
+ " 'this is a sound of splashing water',\n",
298
+ " 'this is a sound of explosion']"
299
+ ]
300
+ },
301
+ "execution_count": 43,
302
+ "metadata": {},
303
+ "output_type": "execute_result"
304
+ }
305
+ ],
306
+ "source": [
307
+ "y"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 44,
313
+ "id": "e90b1d22-ddcd-421b-a011-ef1054cdf412",
314
+ "metadata": {},
315
+ "outputs": [
316
+ {
317
+ "data": {
318
+ "text/plain": [
319
+ "['waterfall burbling',\n",
320
+ " 'wind noise',\n",
321
+ " 'fire crackling',\n",
322
+ " 'thunder',\n",
323
+ " 'squishing water',\n",
324
+ " 'underwater bubbling',\n",
325
+ " 'raining',\n",
326
+ " 'splashing water',\n",
327
+ " 'explosion']"
328
+ ]
329
+ },
330
+ "execution_count": 44,
331
+ "metadata": {},
332
+ "output_type": "execute_result"
333
+ }
334
+ ],
335
+ "source": [
336
+ "classes"
337
+ ]
338
+ }
339
+ ],
340
+ "metadata": {
341
+ "kernelspec": {
342
+ "display_name": "clap",
343
+ "language": "python",
344
+ "name": "clap"
345
+ },
346
+ "language_info": {
347
+ "codemirror_mode": {
348
+ "name": "ipython",
349
+ "version": 3
350
+ },
351
+ "file_extension": ".py",
352
+ "mimetype": "text/x-python",
353
+ "name": "python",
354
+ "nbconvert_exporter": "python",
355
+ "pygments_lexer": "ipython3",
356
+ "version": "3.8.16"
357
+ }
358
+ },
359
+ "nbformat": 4,
360
+ "nbformat_minor": 5
361
+ }
CLAP/msclap/configs/.ipynb_checkpoints/config-checkpoint.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TEXT ENCODER CONFIG
2
+ text_model: 'bert-base-uncased'
3
+ text_len: 100
4
+ transformer_embed_dim: 768
5
+ freeze_text_encoder_weights: True
6
+
7
+ # AUDIO ENCODER CONFIG
8
+ audioenc_name: 'Cnn14'
9
+ out_emb: 2048
10
+ sampling_rate: 44100
11
+ duration: 10
12
+ fmin: 50
13
+ fmax: 14000
14
+ n_fft: 1028
15
+ hop_size: 320
16
+ mel_bins: 64
17
+ window_size: 1024
18
+
19
+ # PROJECTION SPACE CONFIG
20
+ d_proj: 1024
21
+ temperature: 0.003
22
+
23
+ # TRAINING AND EVALUATION CONFIG
24
+ num_classes: 527
25
+ batch_size: 1024
26
+ demo: False
CLAP/msclap/configs/config.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TEXT ENCODER CONFIG
2
+ text_model: 'bert-base-uncased'
3
+ text_len: 100
4
+ transformer_embed_dim: 768
5
+ freeze_text_encoder_weights: True
6
+
7
+ # AUDIO ENCODER CONFIG
8
+ audioenc_name: 'Cnn14'
9
+ out_emb: 2048
10
+ sampling_rate: 44100
11
+ duration: 10
12
+ fmin: 50
13
+ fmax: 14000
14
+ n_fft: 1028
15
+ hop_size: 320
16
+ mel_bins: 64
17
+ window_size: 1024
18
+
19
+ # PROJECTION SPACE CONFIG
20
+ d_proj: 1024
21
+ temperature: 0.003
22
+
23
+ # TRAINING AND EVALUATION CONFIG
24
+ num_classes: 527
25
+ batch_size: 1024
26
+ demo: False
CLAP/msclap/esc50_dataset.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ from torchvision.datasets.utils import download_url
3
+ from tqdm import tqdm
4
+ import pandas as pd
5
+ import os
6
+ import torch.nn as nn
7
+ import torch
8
+
9
class AudioDataset(Dataset):
    """Abstract base for audio datasets stored under a root directory.

    Subclasses must implement ``__getitem__``, ``__len__`` and ``download``.
    """

    def __init__(self, root: str, download: bool = True):
        """Expand ``root`` and, when ``download`` is true, call ``download()``."""
        self.root = os.path.expanduser(root)
        if download:
            self.download()

    def __getitem__(self, index):
        raise NotImplementedError

    def download(self):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError


class ESC50(AudioDataset):
    """ESC-50 dataset exposing file paths, text labels and one-hot targets."""

    base_folder = 'ESC-50-master'
    url = "https://github.com/karolpiczak/ESC-50/archive/refs/heads/master.zip"
    filename = "ESC-50-master.zip"
    num_files_in_dir = 2000
    audio_dir = 'audio'
    label_col = 'category'
    file_col = 'filename'
    meta = {
        'filename': os.path.join('meta', 'esc50.csv'),
    }

    def __init__(self, root, reading_transformations: nn.Module = None, download: bool = True):
        """Optionally download the archive, then index every audio file.

        Args:
            root: Directory that holds (or will hold) the extracted dataset.
            reading_transformations: Stored as ``pre_transformations``; it is
                not applied anywhere in this class.
            download: When false, skip the download and use the files
                already present under ``root``.
        """
        # Forward the flag: without it the base class default (True)
        # triggers a download even when the caller passed download=False.
        super().__init__(root, download)
        self._load_meta()

        self.targets, self.audio_paths = [], []
        self.pre_transformations = reading_transformations
        print("Loading audio files")
        # Labels use spaces instead of underscores, matching self.classes.
        self.df[self.label_col] = self.df[self.label_col].str.replace('_', ' ')

        audio_root = os.path.join(self.root, self.base_folder, self.audio_dir)
        for _, row in tqdm(self.df.iterrows()):
            self.targets.append(row[self.label_col])
            self.audio_paths.append(os.path.join(audio_root, row[self.file_col]))

    def _load_meta(self):
        """Read the metadata CSV and build ``classes`` / ``class_to_idx``."""
        path = os.path.join(self.root, self.base_folder, self.meta['filename'])

        self.df = pd.read_csv(path)
        self.classes = [x.replace('_', ' ') for x in sorted(self.df[self.label_col].unique())]
        self.class_to_idx = {category: i for i, category in enumerate(self.classes)}

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (file_path, target, one_hot_target) where target is the
            class label string and one_hot_target is a (1, num_classes)
            float tensor with a 1 at the target class index.
        """
        file_path, target = self.audio_paths[index], self.targets[index]
        one_hot_target = torch.zeros(len(self.classes))
        one_hot_target[self.class_to_idx[target]] = 1
        return file_path, target, one_hot_target.reshape(1, -1)

    def __len__(self):
        return len(self.audio_paths)

    def download(self):
        """Download the dataset archive into ``root`` and extract it there."""
        download_url(self.url, self.root, self.filename)

        # extract file
        from zipfile import ZipFile
        with ZipFile(os.path.join(self.root, self.filename), 'r') as archive:
            archive.extractall(path=self.root)
CLAP/msclap/models/.ipynb_checkpoints/audio-checkpoint.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
5
+
6
+ #
7
+ import torchaudio
8
+ import random
9
+
10
def get_audio_encoder(name: str):
    """Return the audio encoder class registered under ``name``.

    Only ``"Cnn14"`` is supported; any other name raises ``Exception``.
    """
    if name != "Cnn14":
        raise Exception('The audio encoder name {} is incorrect or not supported'.format(name))
    return Cnn14
15
+
16
+
17
class ConvBlock(nn.Module):
    """Two 3x3 convolution + batch-norm + ReLU stages followed by pooling."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()

        # Both convolutions share the same geometry; padding 1 keeps the
        # spatial size unchanged for a 3x3 kernel with stride 1.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Run both conv stages, then pool.

        Args:
            input: Tensor accepted by ``nn.Conv2d`` with ``in_channels`` channels.
            pool_size: Kernel size handed to the pooling function.
            pool_type: ``'max'``, ``'avg'`` or ``'avg+max'`` (sum of both).

        Raises:
            Exception: If ``pool_type`` is none of the supported values.
        """
        features = F.relu_(self.bn1(self.conv1(input)))
        features = F.relu_(self.bn2(self.conv2(features)))

        if pool_type == 'avg':
            return F.avg_pool2d(features, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(features, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(features, kernel_size=pool_size)
            peaked = F.max_pool2d(features, kernel_size=pool_size)
            return averaged + peaked
        raise Exception('Incorrect argument!')
53
+
54
+
55
class ConvBlock5x5(nn.Module):
    """A single 5x5 convolution + batch-norm + ReLU stage followed by pooling."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock5x5, self).__init__()

        # Padding 2 keeps the spatial size unchanged for a 5x5 kernel with stride 1.
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(5, 5),
            stride=(1, 1),
            padding=(2, 2),
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Run the conv stage, then pool.

        Args:
            input: Tensor accepted by ``nn.Conv2d`` with ``in_channels`` channels.
            pool_size: Kernel size handed to the pooling function.
            pool_type: ``'max'``, ``'avg'`` or ``'avg+max'`` (sum of both).

        Raises:
            Exception: If ``pool_type`` is none of the supported values.
        """
        features = F.relu_(self.bn1(self.conv1(input)))

        if pool_type == 'avg':
            return F.avg_pool2d(features, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(features, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(features, kernel_size=pool_size)
            peaked = F.max_pool2d(features, kernel_size=pool_size)
            return averaged + peaked
        raise Exception('Incorrect argument!')
84
+
85
+
86
class AttBlock(nn.Module):
    """Attention pooling over the last axis using two 1x1 convolutions.

    ``att`` produces per-step attention logits and ``cla`` produces per-step
    class scores; the output is the attention-weighted sum of the scores.

    Args:
        n_in: Number of input channels.
        n_out: Number of output channels.
        activation: ``'linear'`` (identity) or ``'sigmoid'``, applied to the
            ``cla`` scores.
        temperature: Stored on the module; not used by ``forward``.
    """

    def __init__(self, n_in, n_out, activation='linear', temperature=1.):
        super(AttBlock, self).__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
        self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)

        # Not used in forward(); kept so the module's state_dict keys stay unchanged.
        self.bn_att = nn.BatchNorm1d(n_out)

    def forward(self, x):
        """Return ``(pooled, norm_att, cla)`` for ``x``.

        Raises:
            ValueError: If ``self.activation`` is not supported.
        """
        # x: (n_samples, n_in, n_time)
        # Logits are clamped to [-10, 10] before the softmax over the last axis.
        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
            ValueError: If ``self.activation`` is neither ``'linear'`` nor
                ``'sigmoid'``. Previously this case silently returned None,
                which only failed later inside ``forward``.
        """
        if self.activation == 'linear':
            return x
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError('Unsupported activation: {!r}'.format(self.activation))
109
+
110
+
111
class Cnn14(nn.Module):
    """Audio encoder: log-mel front end, six ConvBlocks and two linear heads.

    Args:
        sample_rate: Sampling rate passed to the log-mel filter bank.
        window_size: Used as both FFT size and window length.
        hop_size: Hop length of the spectrogram.
        mel_bins: Number of mel bins. ``bn0`` is hard-coded to 64 features
            and is applied with the last axis moved into the channel
            position, so this presumably has to be 64 -- TODO confirm.
        fmin: Lower frequency bound of the mel filter bank.
        fmax: Upper frequency bound of the mel filter bank.
        classes_num: Output size of the classification head.
        out_emb: Size of the embedding produced by ``fc1``.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num, out_emb):

        super(Cnn14, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Masking transforms used for training-time augmentation in forward().
        self.freq_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
        self.time_masking = torchaudio.transforms.TimeMasking(80)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        # out_emb is 2048 for best Cnn14
        self.fc1 = nn.Linear(2048, out_emb, bias=True)
        self.fc_audioset = nn.Linear(out_emb, classes_num, bias=True)

    def forward(self, input, mixup_lambda=None):
        """Encode a batch of waveforms.

        Args:
            input: (batch_size, data_length)
            mixup_lambda: Accepted but not used.

        Returns:
            dict with ``'clipwise_output'`` (sigmoid of the classification
            head) and ``'embedding'`` (``fc1`` output after ReLU and dropout).
        """
        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # Random masking is an augmentation, so it is applied only in training
        # mode. Previously it also fired (with probability 0.2 each) during
        # evaluation, which made inference embeddings non-deterministic.
        # NOTE(review): with the layout noted above, freq_masking acts on the
        # second-to-last axis (time_steps) and time_masking on the last axis
        # (mel_bins) -- confirm that this axis order is intended.
        if self.training:
            random_aug_freq = random.uniform(0,1)
            random_aug_time = random.uniform(0,1)

            if random_aug_freq < 0.2:
                x = self.freq_masking(x)
            if random_aug_time < 0.2:
                x = self.time_masking(x)

        # Batch-norm over the last axis: move it into the channel position and back.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = torch.mean(x, dim=3)

        # Collapse dim 2 by summing its max and its mean.
        (x1, _) = torch.max(x, dim=2)
        x2 = torch.mean(x, dim=2)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        # The classification head reads the pre-dropout activations.
        clipwise_output = torch.sigmoid(self.fc_audioset(x))

        output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}

        return output_dict
CLAP/msclap/models/.ipynb_checkpoints/clap-checkpoint.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+ from transformers import AutoModel
6
+ from .audio import get_audio_encoder
7
+
8
class Projection(nn.Module):
    """Linear projection with a GELU residual branch, dropout and layer norm."""

    def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None:
        super().__init__()
        self.linear1 = nn.Linear(d_in, d_out, bias=False)
        self.linear2 = nn.Linear(d_out, d_out, bias=False)
        self.layer_norm = nn.LayerNorm(d_out)
        self.drop = nn.Dropout(p)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` from ``d_in`` to ``d_out`` features."""
        projected = self.linear1(x)
        # Residual branch: GELU -> linear -> dropout, added back to the projection.
        branch = self.linear2(F.gelu(projected))
        return self.layer_norm(projected + self.drop(branch))
21
+
22
class AudioEncoder(nn.Module):
    """Base audio encoder followed by a projection to ``d_out`` features."""

    def __init__(self, audioenc_name:str, d_in: int, d_out: int, sample_rate: int, window_size: int,
                 hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int) -> None:
        super().__init__()

        encoder_cls = get_audio_encoder(audioenc_name)

        # d_in is handed to the encoder as its last positional argument and is
        # also the input width of the projection.
        self.base = encoder_cls(
            sample_rate, window_size, hop_size,
            mel_bins, fmin, fmax,
            classes_num, d_in)

        self.projection = Projection(d_in, d_out)

    def forward(self, x):
        """Return ``(projected_embedding, classification_output)`` for ``x``."""
        encoder_out = self.base(x)
        embedding = encoder_out['embedding']
        classification_output = encoder_out['clipwise_output']
        return self.projection(embedding), classification_output
41
+
42
class TextEncoder(nn.Module):
    """Pretrained text model followed by a projection to ``d_out`` features."""

    def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None:
        super().__init__()
        self.base = AutoModel.from_pretrained(text_model)

        self.projection = Projection(transformer_embed_dim, d_out)

    def forward(self, x):
        """Encode tokenized text; ``x`` is unpacked as keyword arguments to the base model."""
        # First element of the model output -- presumably the per-token hidden states.
        token_states = self.base(**x)[0]
        first_token = token_states[:, 0, :]  # get CLS token output
        return self.projection(first_token)
54
+
55
class CLAP(nn.Module):
    """Audio encoder and caption encoder projected to a shared ``d_proj`` width."""

    def __init__(self,
                # audio
                audioenc_name: str,
                sample_rate: int,
                window_size: int,
                hop_size: int,
                mel_bins: int,
                fmin: int,
                fmax: int,
                classes_num: int,
                out_emb: int,
                # text
                text_model: str,
                transformer_embed_dim: int,
                # common
                d_proj: int,
                ):
        super().__init__()

        # out_emb is the audio embedding width before projection; d_proj after.
        self.audio_encoder = AudioEncoder(
            audioenc_name, out_emb, d_proj,
            sample_rate, window_size, hop_size,
            mel_bins, fmin, fmax, classes_num)

        self.caption_encoder = TextEncoder(d_proj, text_model, transformer_embed_dim)

        # Learnable scalar stored in log space, initialised to log(1 / 0.07).
        initial_log_scale = np.log(1 / 0.07)
        self.logit_scale = nn.Parameter(torch.ones([]) * initial_log_scale)

    def forward(self, audio, text):
        """Return ``(caption_embed, audio_embed, exp(logit_scale))``."""
        audio_embed, _ = self.audio_encoder(audio)
        caption_embed = self.caption_encoder(text)
        scale = self.logit_scale.exp()

        return caption_embed, audio_embed, scale
CLAP/msclap/models/.ipynb_checkpoints/utils-checkpoint.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import yaml
3
+ import sys
4
+
5
def read_config_as_args(config_path,args=None,is_config_str=False):
    """Load a YAML config and return it as an ``argparse.Namespace``.

    Args:
        config_path: Path to a YAML file, the YAML text itself when
            ``is_config_str`` is true, or None for "no config".
        args: Optional existing namespace (or dict). Keys from the YAML that
            already exist in ``args`` overwrite its values in place; unknown
            keys are reported on stderr and ignored.
        is_config_str: Treat ``config_path`` as YAML text instead of a path.

    Returns:
        argparse.Namespace holding the YAML values (``args`` is None) or the
        updated ``args`` values.
    """
    yml_config = {}

    if config_path is not None:
        # NOTE(review): yaml.FullLoader should not be fed untrusted input;
        # consider yaml.safe_load if configs can come from outside the project.
        if is_config_str:
            loaded = yaml.load(config_path, Loader=yaml.FullLoader)
        else:
            with open(config_path, "r") as f:
                loaded = yaml.load(f, Loader=yaml.FullLoader)
        # An empty YAML document loads as None; treat it as "no settings".
        yml_config = loaded if loaded is not None else {}

    if args is None:
        return argparse.Namespace(**dict(yml_config))

    # Namespace objects are not mappings, so work on their attribute dict
    # (unpacking the namespace itself with ** raised TypeError before).
    known = args if isinstance(args, dict) else vars(args)
    for k, v in yml_config.items():
        if k in known:
            known[k] = v
        else:
            sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k))

    return argparse.Namespace(**known)
CLAP/msclap/models/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from . import clap
2
+ from . import audio
3
+ from . import utils
CLAP/msclap/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (234 Bytes). View file
 
CLAP/msclap/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (302 Bytes). View file
 
CLAP/msclap/models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (234 Bytes). View file
 
CLAP/msclap/models/__pycache__/audio.cpython-310.pyc ADDED
Binary file (5.39 kB). View file
 
CLAP/msclap/models/__pycache__/audio.cpython-311.pyc ADDED
Binary file (10.9 kB). View file
 
CLAP/msclap/models/__pycache__/audio.cpython-38.pyc ADDED
Binary file (5.24 kB). View file
 
CLAP/msclap/models/__pycache__/clap.cpython-310.pyc ADDED
Binary file (3.67 kB). View file
 
CLAP/msclap/models/__pycache__/clap.cpython-311.pyc ADDED
Binary file (6.42 kB). View file
 
CLAP/msclap/models/__pycache__/clap.cpython-38.pyc ADDED
Binary file (3.53 kB). View file
 
CLAP/msclap/models/__pycache__/utils.cpython-310.pyc ADDED
Binary file (774 Bytes). View file
 
CLAP/msclap/models/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.54 kB). View file
 
CLAP/msclap/models/__pycache__/utils.cpython-38.pyc ADDED
Binary file (741 Bytes). View file
 
CLAP/msclap/models/audio.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
5
+
6
+ #
7
+ import torchaudio
8
+ import random
9
+
10
def get_audio_encoder(name: str):
    """Look up an audio encoder class by name.

    ``"Cnn14"`` is the only supported name; anything else raises ``Exception``.
    """
    if name != "Cnn14":
        raise Exception('The audio encoder name {} is incorrect or not supported'.format(name))
    return Cnn14
15
+
16
+
17
class ConvBlock(nn.Module):
    """Two 3x3 convolution + batch-norm + ReLU stages followed by pooling."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()

        # Shared geometry: 3x3 kernel, stride 1, padding 1 (spatial size preserved).
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Run both conv stages, then pool.

        Args:
            input: Tensor accepted by ``nn.Conv2d`` with ``in_channels`` channels.
            pool_size: Kernel size handed to the pooling function.
            pool_type: ``'max'``, ``'avg'`` or ``'avg+max'`` (sum of both).

        Raises:
            Exception: If ``pool_type`` is none of the supported values.
        """
        features = F.relu_(self.bn1(self.conv1(input)))
        features = F.relu_(self.bn2(self.conv2(features)))

        if pool_type == 'avg':
            return F.avg_pool2d(features, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(features, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(features, kernel_size=pool_size)
            peaked = F.max_pool2d(features, kernel_size=pool_size)
            return averaged + peaked
        raise Exception('Incorrect argument!')
53
+
54
+
55
class ConvBlock5x5(nn.Module):
    """A single 5x5 convolution + batch-norm + ReLU stage followed by pooling."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock5x5, self).__init__()

        # 5x5 kernel, stride 1, padding 2 (spatial size preserved).
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(5, 5),
            stride=(1, 1),
            padding=(2, 2),
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Run the conv stage, then pool.

        Args:
            input: Tensor accepted by ``nn.Conv2d`` with ``in_channels`` channels.
            pool_size: Kernel size handed to the pooling function.
            pool_type: ``'max'``, ``'avg'`` or ``'avg+max'`` (sum of both).

        Raises:
            Exception: If ``pool_type`` is none of the supported values.
        """
        features = F.relu_(self.bn1(self.conv1(input)))

        if pool_type == 'avg':
            return F.avg_pool2d(features, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(features, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(features, kernel_size=pool_size)
            peaked = F.max_pool2d(features, kernel_size=pool_size)
            return averaged + peaked
        raise Exception('Incorrect argument!')
84
+
85
+
86
class AttBlock(nn.Module):
    """Attention pooling over the last axis using two 1x1 convolutions.

    ``att`` produces per-step attention logits and ``cla`` produces per-step
    class scores; the output is the attention-weighted sum of the scores.

    Args:
        n_in: Number of input channels.
        n_out: Number of output channels.
        activation: ``'linear'`` (identity) or ``'sigmoid'``, applied to the
            ``cla`` scores.
        temperature: Stored on the module; not used by ``forward``.
    """

    def __init__(self, n_in, n_out, activation='linear', temperature=1.):
        super(AttBlock, self).__init__()

        self.activation = activation
        self.temperature = temperature
        self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
        self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)

        # Not used in forward(); kept so the module's state_dict keys stay unchanged.
        self.bn_att = nn.BatchNorm1d(n_out)

    def forward(self, x):
        """Return ``(pooled, norm_att, cla)`` for ``x``.

        Raises:
            ValueError: If ``self.activation`` is not supported.
        """
        # x: (n_samples, n_in, n_time)
        # Logits are clamped to [-10, 10] before the softmax over the last axis.
        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
            ValueError: If ``self.activation`` is neither ``'linear'`` nor
                ``'sigmoid'``. Previously this case silently returned None,
                which only failed later inside ``forward``.
        """
        if self.activation == 'linear':
            return x
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        raise ValueError('Unsupported activation: {!r}'.format(self.activation))
109
+
110
+
111
class Cnn14(nn.Module):
    """Audio encoder: log-mel front end, six ConvBlocks and two linear heads.

    Besides the clip-level outputs, ``forward`` also exposes the feature map
    taken right after the last conv block (averaged over its last axis).
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num, out_emb):

        super(Cnn14, self).__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size, hop_length=hop_size, win_length=window_size,
            window='hann', center=True, pad_mode='reflect',
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate, n_fft=window_size, n_mels=mel_bins,
            fmin=fmin, fmax=fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        # Masking transforms; forward() does not currently call them.
        self.freq_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
        self.time_masking = torchaudio.transforms.TimeMasking(80)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        # out_emb is 2048 for best Cnn14
        self.fc1 = nn.Linear(2048, out_emb, bias=True)
        self.fc_audioset = nn.Linear(out_emb, classes_num, bias=True)

    def forward(self, input, mixup_lambda=None, use_aug=False):
        """Encode a batch of waveforms.

        Args:
            input: (batch_size, data_length)
            mixup_lambda: Accepted but not used.
            use_aug: Accepted but currently ignored; the random
                frequency/time masking it was meant to switch on is disabled.

        Returns:
            dict with ``'clipwise_output'``, ``'embedding'`` and
            ``'inner_layer'`` (copy of the conv features averaged over dim 3).
        """
        spec = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(spec)    # (batch_size, 1, time_steps, mel_bins)

        # Batch-norm over the last axis: move it into the channel position and back.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        # Blocks 1-5 halve both spatial axes; each block is followed by dropout.
        halving_blocks = (
            self.conv_block1,
            self.conv_block2,
            self.conv_block3,
            self.conv_block4,
            self.conv_block5,
        )
        for conv_block in halving_blocks:
            x = conv_block(x, pool_size=(2, 2), pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)
        # The last block keeps the resolution (1x1 pooling).
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = torch.mean(x, dim=3)

        x_inner_layer = x.clone()

        # Collapse dim 2 by summing its max and its mean.
        peak, _ = torch.max(x, dim=2)
        average = torch.mean(x, dim=2)
        x = peak + average
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        # The classification head reads the pre-dropout activations.
        clipwise_output = torch.sigmoid(self.fc_audioset(x))

        return {
            'clipwise_output': clipwise_output,
            'embedding': embedding,
            'inner_layer': x_inner_layer,
        }
200
+
CLAP/msclap/models/clap.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+ from transformers import AutoModel
6
+ from .audio import get_audio_encoder
7
+
8
class Projection(nn.Module):
    """Linear projection with a GELU residual branch, dropout and layer norm."""

    def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None:
        super().__init__()
        self.linear1 = nn.Linear(d_in, d_out, bias=False)
        self.linear2 = nn.Linear(d_out, d_out, bias=False)
        self.layer_norm = nn.LayerNorm(d_out)
        self.drop = nn.Dropout(p)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` from ``d_in`` to ``d_out`` features."""
        projected = self.linear1(x)
        # Residual branch: GELU -> linear -> dropout, added back to the projection.
        branch = self.linear2(F.gelu(projected))
        return self.layer_norm(projected + self.drop(branch))
21
+
22
class AudioEncoder(nn.Module):
    """Base audio encoder followed by a projection to ``d_out`` features."""

    def __init__(self, audioenc_name:str, d_in: int, d_out: int, sample_rate: int, window_size: int,
                 hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int) -> None:
        super().__init__()

        encoder_cls = get_audio_encoder(audioenc_name)

        # d_in is handed to the encoder as its last positional argument and is
        # also the input width of the projection.
        self.base = encoder_cls(
            sample_rate, window_size, hop_size,
            mel_bins, fmin, fmax,
            classes_num, d_in,
        )

        self.projection = Projection(d_in, d_out)

    def forward(self, x, use_aug=False):
        """Return ``(projected_embedding, classification_output, inner_layer)``.

        ``use_aug`` is forwarded unchanged to the base encoder.
        """
        encoder_out = self.base(x, use_aug=use_aug)
        embedding = encoder_out['embedding']
        classification_output = encoder_out['clipwise_output']
        inner_layer = encoder_out['inner_layer']
        return self.projection(embedding), classification_output, inner_layer
43
+
44
class TextEncoder(nn.Module):
    """Pretrained text model followed by a projection to ``d_out`` features."""

    def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None:
        super().__init__()
        self.base = AutoModel.from_pretrained(text_model)

        self.projection = Projection(transformer_embed_dim, d_out)

    def forward(self, x):
        """Encode tokenized text; ``x`` is unpacked as keyword arguments to the base model."""
        # First element of the model output -- presumably the per-token hidden states.
        token_states = self.base(**x)[0]
        first_token = token_states[:, 0, :]  # get CLS token output
        return self.projection(first_token)
56
+
57
class CLAP(nn.Module):
    """Audio encoder and caption encoder projected to a shared ``d_proj`` width."""

    def __init__(self,
                # audio
                audioenc_name: str,
                sample_rate: int,
                window_size: int,
                hop_size: int,
                mel_bins: int,
                fmin: int,
                fmax: int,
                classes_num: int,
                out_emb: int,
                # text
                text_model: str,
                transformer_embed_dim: int,
                # common
                d_proj: int,
                ):
        super().__init__()

        # out_emb is the audio embedding width before projection; d_proj after.
        self.audio_encoder = AudioEncoder(
            audioenc_name, out_emb, d_proj,
            sample_rate, window_size, hop_size,
            mel_bins, fmin, fmax, classes_num)

        self.caption_encoder = TextEncoder(d_proj, text_model, transformer_embed_dim)

        # Learnable scalar stored in log space, initialised to log(1 / 0.07).
        initial_log_scale = np.log(1 / 0.07)
        self.logit_scale = nn.Parameter(torch.ones([]) * initial_log_scale)

    def forward(self, audio, text):
        """Return ``(caption_embed, audio_embed, exp(logit_scale))``."""
        # The audio encoder also returns classification and inner-layer outputs,
        # which are not needed here.
        audio_embed, _, _ = self.audio_encoder(audio)
        caption_embed = self.caption_encoder(text)
        scale = self.logit_scale.exp()

        return caption_embed, audio_embed, scale
CLAP/msclap/models/utils.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import yaml
3
+ import sys
4
+
5
def read_config_as_args(config_path,args=None,is_config_str=False):
    """Load a YAML config and return it as an ``argparse.Namespace``.

    Args:
        config_path: Path to a YAML file, the YAML text itself when
            ``is_config_str`` is true, or None for "no config".
        args: Optional existing namespace (or dict). Keys from the YAML that
            already exist in ``args`` overwrite its values in place; unknown
            keys are reported on stderr and ignored.
        is_config_str: Treat ``config_path`` as YAML text instead of a path.

    Returns:
        argparse.Namespace holding the YAML values (``args`` is None) or the
        updated ``args`` values.
    """
    yml_config = {}

    if config_path is not None:
        # NOTE(review): yaml.FullLoader should not be fed untrusted input;
        # consider yaml.safe_load if configs can come from outside the project.
        if is_config_str:
            loaded = yaml.load(config_path, Loader=yaml.FullLoader)
        else:
            with open(config_path, "r") as f:
                loaded = yaml.load(f, Loader=yaml.FullLoader)
        # An empty YAML document loads as None; treat it as "no settings".
        yml_config = loaded if loaded is not None else {}

    if args is None:
        return argparse.Namespace(**dict(yml_config))

    # Namespace objects are not mappings, so work on their attribute dict
    # (unpacking the namespace itself with ** raised TypeError before).
    known = args if isinstance(args, dict) else vars(args)
    for k, v in yml_config.items():
        if k in known:
            known[k] = v
        else:
            sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k))

    return argparse.Namespace(**known)
CLAP/msclap/zero_shot_classification.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This is an example using CLAP to perform zeroshot
3
+ classification on ESC50 (https://github.com/karolpiczak/ESC-50).
4
+ """
5
+
6
+ from CLAPWrapper import CLAPWrapper
7
+ from esc50_dataset import ESC50
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+ from sklearn.metrics import accuracy_score
12
+
13
# Load dataset
dataset = ESC50(root="data_path", download=False)
prompt = 'this is a sound of '
# One text prompt per class name.
y = [prompt + class_name for class_name in dataset.classes]

# Load and initialize CLAP
weights_path = "weights_path"
clap_model = CLAPWrapper(weights_path, use_cuda=False)

# Computing text embeddings
text_embeddings = clap_model.get_text_embeddings(y)

# Computing audio embeddings, one clip at a time
y_preds, y_labels = [], []
for i in tqdm(range(len(dataset))):
    x, _, one_hot_target = dataset[i]
    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
    # Softmax over the class axis turns similarities into per-class scores.
    class_scores = F.softmax(similarity.detach().cpu(), dim=1)
    y_preds.append(class_scores.numpy())
    y_labels.append(one_hot_target.detach().cpu().numpy())

# Stack the per-clip rows and compare predicted vs. true class indices.
y_labels = np.concatenate(y_labels, axis=0)
y_preds = np.concatenate(y_preds, axis=0)
true_classes = np.argmax(y_labels, axis=1)
predicted_classes = np.argmax(y_preds, axis=1)
acc = accuracy_score(true_classes, predicted_classes)
print('ESC50 Accuracy {}'.format(acc))
40
+
41
+ """
42
+ The output:
43
+
44
+ ESC50 Accuracy: 82.6%
45
+
46
+ """
CLAP/msclap/zero_shot_predictions.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This is an example using CLAP for zero-shot
3
+ inference using ESC50 (https://github.com/karolpiczak/ESC-50).
4
+ """
5
+
6
+ from CLAPWrapper import CLAPWrapper
7
+ from esc50_dataset import ESC50
8
+ import torch.nn.functional as F
9
+
10
# Load ESC50 dataset
dataset = ESC50(root="data_path", download=True) # set download=True when dataset is not downloaded
# Pick a single clip; the embedding call below takes a list of files.
audio_file, target, one_hot_target = dataset[1000]
audio_file = [audio_file]
prompt = 'this is a sound of '
y = [prompt + class_name for class_name in dataset.classes]

# Load and initialize CLAP
weights_path = "weights_path"

# Setting use_cuda = True will load the model on a GPU using CUDA
clap_model = CLAPWrapper(weights_path, use_cuda=False)

# compute text embeddings from natural text
text_embeddings = clap_model.get_text_embeddings(y)

# compute the audio embeddings from an audio file
audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True)

# compute the similarity between audio_embeddings and text_embeddings,
# then normalise it over the class axis
similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
similarity = F.softmax(similarity, dim=1)

# keep the five highest-scoring classes for the single clip
values, indices = similarity[0].topk(5)

# view the results
print("Ground Truth: {}".format(target))
print("Top predictions:\n")
for value, index in zip(values, indices):
    print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
40
+
41
+ """
42
+ The output (the exact numbers may vary):
43
+
44
+ Ground Truth: coughing
45
+ Top predictions:
46
+
47
+ coughing: 86.34%
48
+ sneezing: 9.30%
49
+ drinking sipping: 1.31%
50
+ laughing: 1.20%
51
+ glass breaking: 0.81%
52
+ """
README.md CHANGED
@@ -8,6 +8,7 @@ sdk_version: 4.29.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ python_version: 3.10.13
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
ldm/modules/encoders/audio_projector_res.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ # from ldm.modules.attention import SpatialTransformer, BasicTransformerBlock
5
+
6
+ from torch import nn, einsum
7
+ from einops import rearrange, repeat
8
+
9
+ #k,q will be from audio
10
+
11
class MyCrossAttention(nn.Module):
    """Multi-head attention in which queries, keys and values all come from one input.

    Args:
        device: Accepted but not used by this module.
        audio_dim: Accepted but not used by this module.
        context_dim: Feature width of the input tokens.
        dropout: Accepted but not used by this module.
        h: Number of attention heads.
        dim_head: Width of each head; the output width is ``h * dim_head``.
    """

    def __init__(self, device="cuda", audio_dim = 1024, context_dim = 768, dropout=0.0, h = 8, dim_head=40):
        super().__init__()
        self.h = h
        inner_dim = dim_head * h
        self.scale = dim_head ** -0.5

        self.to_q_adapter = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_k_adapter = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v_adapter = nn.Linear(context_dim, inner_dim, bias=False)

    def forward(self, audio):
        """Attend over ``audio`` (batch, tokens, context_dim) -> (batch, tokens, h * dim_head)."""
        heads = self.h

        # Queries, keys and values are all projections of the same input.
        queries = self.to_q_adapter(audio)
        keys = self.to_k_adapter(audio)
        values = self.to_v_adapter(audio)

        # Fold the head axis into the batch axis so each head attends independently.
        queries = rearrange(queries, 'b n (h d) -> (b h) n d', h=heads)
        keys = rearrange(keys, 'b n (h d) -> (b h) n d', h=heads)
        values = rearrange(values, 'b n (h d) -> (b h) n d', h=heads)

        # Scaled dot-product attention.
        scores = einsum('b i d, b j d -> b i j', queries, keys) * self.scale
        weights = scores.softmax(dim=-1)
        attended = einsum('b i j, b j d -> b i d', weights, values)

        # Unfold the heads back into the feature axis.
        return rearrange(attended, '(b h) n d -> b n (h d)', h=heads)
38
+
39
+
40
class Adapter(nn.Module):
    """Turn an audio embedding into a sequence of ``context_dim``-wide tokens.

    A 1-D convolutional stack expands the ``initial_channel_dim`` input
    channels to ``audio_token_count`` channels, then attention and
    feed-forward layers refine the result.

    The LayerNorm shapes below require an input of
    (batch, initial_channel_dim, audio_dim). With the default sizes the
    stride-3 transposed convolution yields length 3 * audio_dim and the
    stride-4 convolution reduces it to 768, i.e. ``context_dim``.
    """

    def __init__(self, device="cuda", audio_dim = 1024, context_dim = 768, dropout=0.0, h = 8, dim_head=40, audio_token_count = 10, initial_channel_dim=1, transformer_layer_count=4):
        super(Adapter, self).__init__()
        self.h = h
        inner_dim = dim_head * h

        token_channels = audio_token_count

        def same_length_conv(channels_in):
            # kernel 17 with padding 8 and stride 1 keeps the sequence length.
            return nn.Conv1d(channels_in, token_channels, kernel_size = 17, stride = 1, padding = 8)

        self.audio_emb_projection = nn.Sequential(
            same_length_conv(initial_channel_dim),
            nn.GELU(),
            same_length_conv(token_channels),
            nn.GELU(),
            nn.LayerNorm([token_channels, audio_dim]),
            same_length_conv(token_channels),
            nn.GELU(),
            nn.LayerNorm([token_channels, audio_dim]),
            # Upsample the length by 3 ...
            nn.ConvTranspose1d(token_channels, token_channels, kernel_size = 17, stride=3, padding=7),
            nn.GELU(),
            nn.LayerNorm([token_channels, 3*audio_dim]),
            nn.GELU(),
            # ... then downsample it by 4.
            nn.Conv1d(token_channels, token_channels, kernel_size = 17, stride=4, padding=7),
            nn.Dropout(dropout)
        )

        # stack of attention layers
        attention_layers = [
            MyCrossAttention(device, audio_dim, context_dim, dropout, h, dim_head)
            for _ in range(transformer_layer_count)
        ]
        self.cross_attention = nn.ModuleList(attention_layers)

        # matching stack of feed-forward layers mapping inner_dim back to context_dim
        feed_forward_layers = [
            nn.Sequential(
                nn.Linear(inner_dim, inner_dim),
                nn.GELU(),
                nn.Linear(inner_dim, context_dim),
                nn.Dropout(dropout)
            )
            for _ in range(transformer_layer_count)
        ]
        self.between_attention = nn.ModuleList(feed_forward_layers)

        self.to_out_adapter = nn.Sequential(
            nn.Linear(context_dim, context_dim),
            nn.Dropout(dropout)
        )

    def forward(self, audio_context):
        """Project ``audio_context`` and refine it into output tokens.

        Returns a tensor of shape (batch, audio_token_count, context_dim)
        when the default sizes are used.
        """
        audio_proj = self.audio_emb_projection(audio_context)

        # NOTE(review): every iteration reads audio_proj rather than the
        # previous iteration's result, so only the last attention /
        # feed-forward pair contributes to the returned tensor. Left as is,
        # because changing it would alter the behaviour of trained weights --
        # confirm whether the layers were meant to be chained.
        for attention, feed_forward in zip(self.cross_attention, self.between_attention):
            attended = attention(audio_proj)
            out = feed_forward(attended) + audio_proj

        return self.to_out_adapter(out)
93
+
94
+
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.25.0
2
+ diffusers==0.27.2
3
+ einops==0.7.0
4
+ gradio==4.26.0
5
+ gradio_client==0.15.1
6
+ librosa==0.10.1
7
+ numpy==1.26.4
8
+ omegaconf==2.3.0
9
+ pillow==10.3.0
10
+ scikit-learn==1.4.2
11
+ scipy==1.13.0
12
+ soundfile==0.12.1
13
+ torch==2.0.1
14
+ torchaudio==2.0.2
15
+ torchlibrosa==0.1.0
16
+ torchvision==0.15.2
17
+ tqdm==4.66.2
18
+ transformers==4.35.2