Spaces:

burakcanbiner
/

SonicDiffusion

Sleeping

App Files Files Community

root committed on May 10

Commit

9778d56

•

1 Parent(s): 40e68f7

init

Browse files

Files changed (31) hide show

CLAP/msclap/CLAPWrapper.py +274 -0
CLAP/msclap/__init__.py +0 -0
CLAP/msclap/clap.ipynb +0 -0
CLAP/msclap/classification.ipynb +361 -0
CLAP/msclap/configs/.ipynb_checkpoints/config-checkpoint.yml +26 -0
CLAP/msclap/configs/config.yml +26 -0
CLAP/msclap/esc50_dataset.py +82 -0
CLAP/msclap/models/.ipynb_checkpoints/audio-checkpoint.py +201 -0
CLAP/msclap/models/.ipynb_checkpoints/clap-checkpoint.py +90 -0
CLAP/msclap/models/.ipynb_checkpoints/utils-checkpoint.py +26 -0
CLAP/msclap/models/__init__.py +3 -0
CLAP/msclap/models/__pycache__/__init__.cpython-310.pyc +0 -0
CLAP/msclap/models/__pycache__/__init__.cpython-311.pyc +0 -0
CLAP/msclap/models/__pycache__/__init__.cpython-38.pyc +0 -0
CLAP/msclap/models/__pycache__/audio.cpython-310.pyc +0 -0
CLAP/msclap/models/__pycache__/audio.cpython-311.pyc +0 -0
CLAP/msclap/models/__pycache__/audio.cpython-38.pyc +0 -0
CLAP/msclap/models/__pycache__/clap.cpython-310.pyc +0 -0
CLAP/msclap/models/__pycache__/clap.cpython-311.pyc +0 -0
CLAP/msclap/models/__pycache__/clap.cpython-38.pyc +0 -0
CLAP/msclap/models/__pycache__/utils.cpython-310.pyc +0 -0
CLAP/msclap/models/__pycache__/utils.cpython-311.pyc +0 -0
CLAP/msclap/models/__pycache__/utils.cpython-38.pyc +0 -0
CLAP/msclap/models/audio.py +200 -0
CLAP/msclap/models/clap.py +92 -0
CLAP/msclap/models/utils.py +26 -0
CLAP/msclap/zero_shot_classification.py +46 -0
CLAP/msclap/zero_shot_predictions.py +52 -0
README.md +1 -0
ldm/modules/encoders/audio_projector_res.py +94 -0
requirements.txt +18 -0

CLAP/msclap/CLAPWrapper.py ADDED Viewed

	@@ -0,0 +1,274 @@

+import random
+import torchaudio
+# from torch._six import string_classes
+import collections
+import re
+import torch.nn.functional as F
+import numpy as np
+from transformers import AutoTokenizer
+from models.utils import read_config_as_args
+from models.clap import CLAP
+import math
+import torchaudio.transforms as T
+import os
+import torch
+from importlib_resources import files
+class CLAPWrapper():
+    """
+    A class for interfacing CLAP model.
+    """
+    def __init__(self, model_fp, use_cuda=False):
+        self.np_str_obj_array_pattern = re.compile(r'[SaUO]')
+        self.file_path = os.path.realpath(__file__)
+        self.default_collate_err_msg_format = (
+            "default_collate: batch must contain tensors, numpy arrays, numbers, "
+            "dicts or lists; found {}")
+        self.config_as_str = files('configs').joinpath('config.yml').read_text()
+        self.model_fp = model_fp
+        self.use_cuda = use_cuda
+        self.clap, self.tokenizer, self.args = self.load_clap()
+    def load_clap(self):
+        r"""Load CLAP model with args from config file"""
+        args = read_config_as_args(self.config_as_str, is_config_str=True)
+        if 'bert' in args.text_model:
+            self.token_keys = ['input_ids', 'token_type_ids', 'attention_mask']
+        else:
+            self.token_keys = ['input_ids', 'attention_mask']
+        clap = CLAP(
+            audioenc_name=args.audioenc_name,
+            sample_rate=args.sampling_rate,
+            window_size=args.window_size,
+            hop_size=args.hop_size,
+            mel_bins=args.mel_bins,
+            fmin=args.fmin,
+            fmax=args.fmax,
+            classes_num=args.num_classes,
+            out_emb=args.out_emb,
+            text_model=args.text_model,
+            transformer_embed_dim=args.transformer_embed_dim,
+            d_proj=args.d_proj
+        )
+        # print("---")
+        # print(f"duration is {args.duration}")
+        # args.duration = 10
+        # Load pretrained weights for model
+        model_state_dict = torch.load(self.model_fp, map_location=torch.device('cpu'))['model']
+        clap.load_state_dict(model_state_dict, strict=False)
+        clap.eval()  # set clap in eval mode
+        tokenizer = AutoTokenizer.from_pretrained(args.text_model)
+        if self.use_cuda and torch.cuda.is_available():
+            clap = clap.cuda()
+        return clap, tokenizer, args
+    def default_collate(self, batch):
+        r"""Puts each data field into a tensor with outer dimension batch size"""
+        elem = batch[0]
+        elem_type = type(elem)
+        if isinstance(elem, torch.Tensor):
+            out = None
+            if torch.utils.data.get_worker_info() is not None:
+                # If we're in a background process, concatenate directly into a
+                # shared memory tensor to avoid an extra copy
+                numel = sum([x.numel() for x in batch])
+                storage = elem.storage()._new_shared(numel)
+                out = elem.new(storage)
+            return torch.stack(batch, 0, out=out)
+        elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
+                and elem_type.__name__ != 'string_':
+            if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap':
+                # array of string classes and object
+                if self.np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                    raise TypeError(
+                        self.default_collate_err_msg_format.format(elem.dtype))
+                return self.default_collate([torch.as_tensor(b) for b in batch])
+            elif elem.shape == ():  # scalars
+                return torch.as_tensor(batch)
+        elif isinstance(elem, float):
+            return torch.tensor(batch, dtype=torch.float64)
+        elif isinstance(elem, int):
+            return torch.tensor(batch)
+#         elif isinstance(elem, string_classes):
+#             return batch
+        elif isinstance(elem, collections.abc.Mapping):
+            return {key: self.default_collate([d[key] for d in batch]) for key in elem}
+        elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+            return elem_type(*(self.default_collate(samples) for samples in zip(*batch)))
+        elif isinstance(elem, collections.abc.Sequence):
+            # check to make sure that the elements in batch have consistent size
+            it = iter(batch)
+            elem_size = len(next(it))
+            if not all(len(elem) == elem_size for elem in it):
+                raise RuntimeError(
+                    'each element in list of batch should be of equal size')
+            transposed = zip(*batch)
+            return [self.default_collate(samples) for samples in transposed]
+        raise TypeError(self.default_collate_err_msg_format.format(elem_type))
+    def load_audio_into_tensor(self, audio_path, audio_duration, resample=False):
+        r"""Loads audio file and returns raw audio."""
+        # Randomly sample a segment of audio_duration from the clip or pad to match duration
+        audio_time_series, sample_rate = torchaudio.load(audio_path)
+        resample_rate = self.args.sampling_rate
+        audio_time_series = torch.mean(audio_time_series, dim=0, keepdim=True)
+        if resample:
+            resampler = T.Resample(sample_rate, resample_rate)
+            audio_time_series = resampler(audio_time_series)
+        audio_time_series = audio_time_series.reshape(-1)
+        # audio_duration = 10
+        # window_len = 5
+        # window_count = 10
+        # audio_time_series is shorter than predefined audio duration,
+        # so audio_time_series is extended
+        if audio_duration*resample_rate >= audio_time_series.shape[0]: # it was sample rate here but why it should be wrong ????
+            repeat_factor = int(np.ceil((audio_duration*resample_rate) /
+                                        audio_time_series.shape[0]))
+            # Repeat audio_time_series by repeat_factor to match audio_duration
+            audio_time_series = audio_time_series.repeat(repeat_factor)
+            # remove excess part of audio_time_series
+            audio_time_series = audio_time_series[0:audio_duration*resample_rate]
+        else:
+            # audio_time_series is longer than predefined audio duration,
+            # so audio_time_series is trimmed
+            start_index = random.randrange(
+                audio_time_series.shape[0] - audio_duration*resample_rate)
+            audio_time_series = audio_time_series[start_index:start_index +
+                                                  audio_duration*resample_rate]
+        return torch.FloatTensor(audio_time_series)
+    def preprocess_audio(self, audio_files, resample):
+        r"""Load list of audio files and return raw audio"""
+        audio_tensors = []
+        for audio_file in audio_files:
+            # print(self.args.duration)
+            audio_tensor = self.load_audio_into_tensor(
+                audio_file, self.args.duration, resample)
+            if self.use_cuda and torch.cuda.is_available():
+                audio_tensor = audio_tensor.reshape(1, -1).cuda()
+            else:
+                 audio_tensor.reshape(1, -1)
+#             audio_tensor = audio_tensor.reshape(
+#                 1, -1).cuda if self.use_cuda and torch.cuda.is_available() else audio_tensor.reshape(1, -1)
+            audio_tensors.append(audio_tensor)
+        return self.default_collate(audio_tensors)
+    def preprocess_text(self, text_queries):
+        r"""Load list of class labels and return tokenized text"""
+        tokenized_texts = []
+        for ttext in text_queries:
+            tok = self.tokenizer.encode_plus(
+                text=ttext, add_special_tokens=True, max_length=self.args.text_len, pad_to_max_length=True, return_tensors="pt")
+            for key in self.token_keys:
+                tok[key] = tok[key].reshape(-1).cuda() if self.use_cuda and torch.cuda.is_available() else tok[key].reshape(-1)
+            tokenized_texts.append(tok)
+        return self.default_collate(tokenized_texts)
+    def get_text_embeddings(self, class_labels):
+        r"""Load list of class labels and return text embeddings"""
+        preprocessed_text = self.preprocess_text(class_labels)
+        text_embeddings = self._get_text_embeddings(preprocessed_text)
+        text_embeddings = text_embeddings/torch.norm(text_embeddings, dim=-1, keepdim=True)
+        return text_embeddings
+    def get_audio_embeddings(self, audio_files, resample, use_aug=False):
+        r"""Load list of audio files and return a audio embeddings"""
+        preprocessed_audio = self.preprocess_audio(audio_files, resample)
+        audio_embeddings, audio_inner_layer = self._get_audio_embeddings(preprocessed_audio, use_aug=use_aug)
+        audio_embeddings = audio_embeddings/torch.norm(audio_embeddings, dim=-1, keepdim=True)
+        return audio_embeddings, audio_inner_layer
+    def _get_text_embeddings(self, preprocessed_text):
+        r"""Load preprocessed text and return text embeddings"""
+        with torch.no_grad():
+            text_embeddings = self.clap.caption_encoder(preprocessed_text)
+            text_embeddings = text_embeddings/torch.norm(text_embeddings, dim=-1, keepdim=True)
+            return text_embeddings
+    def _get_audio_embeddings(self, preprocessed_audio, use_aug=False):
+        r"""Load preprocessed audio and return a audio embeddings"""
+        with torch.no_grad():
+            preprocessed_audio = preprocessed_audio.reshape(
+                preprocessed_audio.shape[0], preprocessed_audio.shape[2])
+            #Append [0] the audio emebdding, [1] has output class probabilities
+            audio_embeddings, _, audio_inner_layer = self.clap.audio_encoder(preprocessed_audio, use_aug=use_aug)
+            audio_embeddings = audio_embeddings/torch.norm(audio_embeddings, dim=-1, keepdim=True)
+            return audio_embeddings, audio_inner_layer
+    def compute_similarity(self, audio_embeddings, text_embeddings):
+        r"""Compute similarity between text and audio embeddings"""
+        logit_scale = self.clap.logit_scale.exp()
+        similarity = logit_scale*text_embeddings @ audio_embeddings.T
+        return similarity.T
+    def _generic_batch_inference(self, func, *args):
+        r"""Process audio and/or text per batch"""
+        input_tmp = args[0]
+        batch_size = args[-1]
+        # args[0] has audio_files, args[1] has class_labels
+        inputs = [args[0], args[1]] if len(args) == 3 else [args[0]]
+        args0_len = len(args[0])
+        # compute text_embeddings once for all the audio_files batches
+        if len(inputs) == 2:
+            text_embeddings = self.get_text_embeddings(args[1])
+            inputs = [args[0], args[1], text_embeddings]
+        dataset_idx = 0
+        for _ in range(math.ceil(args0_len/batch_size)):
+            next_batch_idx = dataset_idx + batch_size
+            # batch size is bigger than available audio/text items
+            if next_batch_idx >= args0_len:
+                inputs[0] = input_tmp[dataset_idx:]
+                return func(*tuple(inputs))
+            else:
+                inputs[0] = input_tmp[dataset_idx:next_batch_idx]
+                yield func(*tuple(inputs))
+            dataset_idx = next_batch_idx
+    def get_audio_embeddings_per_batch(self, audio_files, batch_size):
+        r"""Load preprocessed audio and return a audio embeddings per batch"""
+        return self._generic_batch_inference(self.get_audio_embeddings, audio_files, batch_size)
+    def get_text_embeddings_per_batch(self, class_labels, batch_size):
+        r"""Load preprocessed text and return text embeddings per batch"""
+        return self._generic_batch_inference(self.get_text_embeddings, class_labels, batch_size)
+    def classify_audio_files_per_batch(self, audio_files, class_labels, batch_size):
+        r"""Compute classification probabilities for each audio recording in a batch and each class label"""
+        return self._generic_batch_inference(self.classify_audio_files, audio_files, class_labels, batch_size)

CLAP/msclap/__init__.py ADDED Viewed

File without changes

CLAP/msclap/clap.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

CLAP/msclap/classification.ipynb ADDED Viewed

	@@ -0,0 +1,361 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "6bf499e8-54b0-498b-84b6-aba956cc573b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "\n",
+    "from CLAPWrapper import CLAPWrapper\n",
+    "from esc50_dataset import ESC50\n",
+    "import torch.nn.functional as F\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "import torch\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "082e82b9-56b4-41ce-a8f8-390bb5bc0193",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"../landscape/landscape_final.csv\")\n",
+    "\n",
+    "classes = list(set(df[\"label\"]))\n",
+    "\n",
+    "prompt = 'this is a sound of '\n",
+    "y = [prompt + x for x in classes]\n",
+    "\n",
+    "class_count = len(classes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "68e72bf4-6c94-438d-b3f3-c46aaa0b88cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class_dict = {k: v for v, k in enumerate(classes)}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "80c437e3-b7e3-41bc-bb9c-fab936648caf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/kuacc/users/bbiner21/.conda/envs/clap/lib/python3.8/site-packages/torchlibrosa/stft.py:193: FutureWarning: Pass size=1024 as keyword args. From version 0.10 passing these as positional arguments will result in an error\n",
+      "  fft_window = librosa.util.pad_center(fft_window, n_fft)\n",
+      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
+      "/kuacc/users/bbiner21/.conda/envs/clap/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2339: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load and initialize CLAP\n",
+    "weights_path = \"../clap_weight/CLAP_weights_2022.pth\"\n",
+    "\n",
+    "# Setting use_cuda = True will load the model on a GPU using CUDA\n",
+    "clap_model = CLAPWrapper(weights_path, use_cuda=False)\n",
+    "\n",
+    "# Computing text embeddings\n",
+    "text_embeddings = clap_model.get_text_embeddings(y)\n",
+    "\n",
+    "# Computing audio embeddings\n",
+    "y_preds, y_labels = [], []\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "3093fa76-5c25-4cae-a43c-8368fdfd96fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1061/1061 [02:33<00:00,  6.92it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "gt = []\n",
+    "pred = []\n",
+    "\n",
+    "for i in tqdm(range(len(df.index))):\n",
+    "    x = \"/datasets/audio-image/audios/audio_10s/\" + df.iloc[i,1] + \".wav\"\n",
+    "    \n",
+    "    cur_class = class_dict[df.iloc[i,0]]\n",
+    "    one_hot = torch.zeros((1,class_count))\n",
+    "    one_hot[0,cur_class] = 1.0 \n",
+    "    \n",
+    "    gt.append(cur_class)\n",
+    "    \n",
+    "    \n",
+    "#     x, _, one_hot_target = dataset.__getitem__(i)\n",
+    "    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)\n",
+    "    \n",
+    "    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)\n",
+    "    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()\n",
+    "    \n",
+    "    pred.append(np.argmax(y_pred, axis=1)[0])\n",
+    "    y_preds.append(y_pred)\n",
+    "    y_labels.append(one_hot.detach().cpu().numpy())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "e2247ab8-844d-4eba-b691-4d38051a51a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ESC50 Accuracy 0.4458058435438266\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'\\nThe output:\\n\\nESC50 Accuracy: 82.6%\\n\\n'"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "\n",
+    "# for i in tqdm(range(len(dataset))):\n",
+    "#     x, _, one_hot_target = dataset.__getitem__(i)\n",
+    "#     audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)\n",
+    "#     similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)\n",
+    "#     y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()\n",
+    "#     y_preds.append(y_pred)\n",
+    "#     y_labels.append(one_hot_target.detach().cpu().numpy())\n",
+    "\n",
+    "y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)\n",
+    "acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))\n",
+    "print('ESC50 Accuracy {}'.format(acc))\n",
+    "\n",
+    "\"\"\"\n",
+    "The output:\n",
+    "\n",
+    "ESC50 Accuracy: 82.6%\n",
+    "\n",
+    "\"\"\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "41254964-43ec-4fcb-b1d0-2c9ae76d56f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gt = []\n",
+    "x = \"/datasets/audio-image/audios/audio_10s/\" + df.iloc[0,1] + \".wav\"\n",
+    "\n",
+    "cur_class = class_dict[df.iloc[0,0]]\n",
+    "one_hot = torch.zeros((1,class_count))\n",
+    "one_hot[0,cur_class] = 1.0 \n",
+    "\n",
+    "gt.append(cur_class)\n",
+    "\n",
+    "\n",
+    "#     x, _, one_hot_target = dataset.__getitem__(i)\n",
+    "audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)\n",
+    "\n",
+    "similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "7e73d889-05b6-46ab-820a-9728b1623d5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "99574178-aba0-467b-a370-679ae927b13b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.argmax(y_pred, axis=1)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "21b42bef-9500-46be-8f3e-2c53b91462d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0.28571429, 0.35164835, 0.7877095 , 0.59615385, 0.01639344,\n",
+       "       0.93243243, 0.93292683, 0.03092784, 0.4       ])"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import confusion_matrix\n",
+    "\n",
+    "matrix = confusion_matrix(gt, pred)\n",
+    "matrix.diagonal()/matrix.sum(axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "2e96c02a-d789-417e-aaec-a420976bef17",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[  8,   2,   0,   0,   0,   0,  18,   0,   0],\n",
+       "       [  5,  64,   1,  11,   0,   0, 100,   1,   0],\n",
+       "       [  1,   1, 141,   5,   2,   3,  23,   1,   2],\n",
+       "       [  2,   1,   0,  31,   0,   1,  15,   0,   2],\n",
+       "       [ 70,  51,   0,   0,   3,   2,  40,  17,   0],\n",
+       "       [  1,   1,   0,   3,   0,  69,   0,   0,   0],\n",
+       "       [  2,   1,   7,   0,   0,   0, 153,   0,   1],\n",
+       "       [ 30,  85,   0,   1,   0,   0,  72,   6,   0],\n",
+       "       [  1,   0,   0,   1,   0,   1,   0,   0,   2]])"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "24911c5c-06ed-492f-927d-1555df15b1c5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['this is a sound of waterfall burbling',\n",
+       " 'this is a sound of wind noise',\n",
+       " 'this is a sound of fire crackling',\n",
+       " 'this is a sound of thunder',\n",
+       " 'this is a sound of squishing water',\n",
+       " 'this is a sound of underwater bubbling',\n",
+       " 'this is a sound of raining',\n",
+       " 'this is a sound of splashing water',\n",
+       " 'this is a sound of explosion']"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "e90b1d22-ddcd-421b-a011-ef1054cdf412",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['waterfall burbling',\n",
+       " 'wind noise',\n",
+       " 'fire crackling',\n",
+       " 'thunder',\n",
+       " 'squishing water',\n",
+       " 'underwater bubbling',\n",
+       " 'raining',\n",
+       " 'splashing water',\n",
+       " 'explosion']"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "classes"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "clap",
+   "language": "python",
+   "name": "clap"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

CLAP/msclap/configs/.ipynb_checkpoints/config-checkpoint.yml ADDED Viewed

	@@ -0,0 +1,26 @@

+# TEXT ENCODER CONFIG
+text_model: 'bert-base-uncased'
+text_len: 100
+transformer_embed_dim: 768
+freeze_text_encoder_weights: True
+# AUDIO ENCODER CONFIG
+audioenc_name: 'Cnn14'
+out_emb: 2048
+sampling_rate: 44100
+duration: 10
+fmin: 50
+fmax: 14000
+n_fft: 1028
+hop_size: 320
+mel_bins: 64
+window_size: 1024
+# PROJECTION SPACE CONFIG
+d_proj: 1024
+temperature: 0.003
+# TRAINING AND EVALUATION CONFIG
+num_classes: 527
+batch_size: 1024
+demo: False

CLAP/msclap/configs/config.yml ADDED Viewed

	@@ -0,0 +1,26 @@

+# TEXT ENCODER CONFIG
+text_model: 'bert-base-uncased'
+text_len: 100
+transformer_embed_dim: 768
+freeze_text_encoder_weights: True
+# AUDIO ENCODER CONFIG
+audioenc_name: 'Cnn14'
+out_emb: 2048
+sampling_rate: 44100
+duration: 10
+fmin: 50
+fmax: 14000
+n_fft: 1028
+hop_size: 320
+mel_bins: 64
+window_size: 1024
+# PROJECTION SPACE CONFIG
+d_proj: 1024
+temperature: 0.003
+# TRAINING AND EVALUATION CONFIG
+num_classes: 527
+batch_size: 1024
+demo: False

CLAP/msclap/esc50_dataset.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from torch.utils.data import Dataset
+from torchvision.datasets.utils import download_url
+from tqdm import tqdm
+import pandas as pd
+import os
+import torch.nn as nn
+import torch
+class AudioDataset(Dataset):
+    def __init__(self, root: str, download: bool = True):
+        self.root = os.path.expanduser(root)
+        if download:
+            self.download()
+    def __getitem__(self, index):
+        raise NotImplementedError
+    def download(self):
+        raise NotImplementedError
+    def __len__(self):
+        raise NotImplementedError
+class ESC50(AudioDataset):
+    base_folder = 'ESC-50-master'
+    url = "https://github.com/karolpiczak/ESC-50/archive/refs/heads/master.zip"
+    filename = "ESC-50-master.zip"
+    num_files_in_dir = 2000
+    audio_dir = 'audio'
+    label_col = 'category'
+    file_col = 'filename'
+    meta = {
+        'filename': os.path.join('meta','esc50.csv'),
+    }
+    def __init__(self, root, reading_transformations: nn.Module = None, download: bool = True):
+        super().__init__(root)
+        self._load_meta()
+        self.targets, self.audio_paths = [], []
+        self.pre_transformations = reading_transformations
+        print("Loading audio files")
+        # self.df['filename'] = os.path.join(self.root, self.base_folder, self.audio_dir) + os.sep + self.df['filename']
+        self.df['category'] = self.df['category'].str.replace('_',' ')
+        for _, row in tqdm(self.df.iterrows()):
+            file_path = os.path.join(self.root, self.base_folder, self.audio_dir, row[self.file_col])
+            self.targets.append(row[self.label_col])
+            self.audio_paths.append(file_path)
+    def _load_meta(self):
+        path = os.path.join(self.root, self.base_folder, self.meta['filename'])
+        self.df = pd.read_csv(path)
+        self.class_to_idx = {}
+        self.classes = [x.replace('_',' ') for x in sorted(self.df[self.label_col].unique())]
+        for i, category in enumerate(self.classes):
+            self.class_to_idx[category] = i
+    def __getitem__(self, index):
+        """
+        Args:
+            index (int): Index
+        Returns:
+            tuple: (image, target) where target is index of the target class.
+        """
+        file_path, target = self.audio_paths[index], self.targets[index]
+        idx = torch.tensor(self.class_to_idx[target])
+        one_hot_target = torch.zeros(len(self.classes)).scatter_(0, idx, 1).reshape(1,-1)
+        return file_path, target, one_hot_target
+    def __len__(self):
+        return len(self.audio_paths)
+    def download(self):
+        download_url(self.url, self.root, self.filename)
+        # extract file
+        from zipfile import ZipFile
+        with ZipFile(os.path.join(self.root, self.filename), 'r') as zip:
+            zip.extractall(path=self.root)

CLAP/msclap/models/.ipynb_checkpoints/audio-checkpoint.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+#
+import torchaudio
+import random
+def get_audio_encoder(name: str):
+    if name == "Cnn14":
+        return Cnn14
+    else:
+        raise Exception('The audio encoder name {} is incorrect or not supported'.format(name))
+class ConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels=in_channels,
+                              out_channels=out_channels,
+                              kernel_size=(3, 3), stride=(1, 1),
+                              padding=(1, 1), bias=False)
+        self.conv2 = nn.Conv2d(in_channels=out_channels,
+                              out_channels=out_channels,
+                              kernel_size=(3, 3), stride=(1, 1),
+                              padding=(1, 1), bias=False)
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        x = F.relu_(self.bn2(self.conv2(x)))
+        if pool_type == 'max':
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg':
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg+max':
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception('Incorrect argument!')
+        return x
+class ConvBlock5x5(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock5x5, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels=in_channels,
+                              out_channels=out_channels,
+                              kernel_size=(5, 5), stride=(1, 1),
+                              padding=(2, 2), bias=False)
+        self.bn1 = nn.BatchNorm2d(out_channels)
+    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        if pool_type == 'max':
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg':
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg+max':
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception('Incorrect argument!')
+        return x
+class AttBlock(nn.Module):
+    def __init__(self, n_in, n_out, activation='linear', temperature=1.):
+        super(AttBlock, self).__init__()
+        self.activation = activation
+        self.temperature = temperature
+        self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
+        self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
+        self.bn_att = nn.BatchNorm1d(n_out)
+    def forward(self, x):
+        # x: (n_samples, n_in, n_time)
+        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
+        cla = self.nonlinear_transform(self.cla(x))
+        x = torch.sum(norm_att * cla, dim=2)
+        return x, norm_att, cla
+    def nonlinear_transform(self, x):
+        if self.activation == 'linear':
+            return x
+        elif self.activation == 'sigmoid':
+            return torch.sigmoid(x)
+class Cnn14(nn.Module):
+    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+        fmax, classes_num, out_emb):
+        super(Cnn14, self).__init__()
+        window = 'hann'
+        center = True
+        pad_mode = 'reflect'
+        ref = 1.0
+        amin = 1e-10
+        top_db = None
+        # Spectrogram extractor
+        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+            freeze_parameters=True)
+        # Logmel feature extractor
+        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+            freeze_parameters=True)
+        self.freq_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
+        self.time_masking = torchaudio.transforms.TimeMasking(80)
+        self.bn0 = nn.BatchNorm2d(64)
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+        # out_emb is 2048 for best Cnn14
+        self.fc1 = nn.Linear(2048, out_emb, bias=True)
+        self.fc_audioset = nn.Linear(out_emb, classes_num, bias=True)
+    def forward(self, input, mixup_lambda=None):
+        """
+        Input: (batch_size, data_length)
+        """
+        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
+        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
+        random_aug_freq = random.uniform(0,1)
+        random_aug_time = random.uniform(0,1)
+        if random_aug_freq < 0.2:
+            x = self.freq_masking(x)
+        if random_aug_time < 0.2:
+            x = self.time_masking(x)
+        x = x.transpose(1, 3)
+        x = self.bn0(x)
+        x = x.transpose(1, 3)
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = torch.mean(x, dim=3)
+        (x1, _) = torch.max(x, dim=2)
+        x2 = torch.mean(x, dim=2)
+        x = x1 + x2
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu_(self.fc1(x))
+        embedding = F.dropout(x, p=0.5, training=self.training)
+        clipwise_output = torch.sigmoid(self.fc_audioset(x))
+        output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding}
+        return output_dict

CLAP/msclap/models/.ipynb_checkpoints/clap-checkpoint.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import AutoModel
+from .audio import get_audio_encoder
+class Projection(nn.Module):
+    def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None:
+        super().__init__()
+        self.linear1 = nn.Linear(d_in, d_out, bias=False)
+        self.linear2 = nn.Linear(d_out, d_out, bias=False)
+        self.layer_norm = nn.LayerNorm(d_out)
+        self.drop = nn.Dropout(p)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        embed1 = self.linear1(x)
+        embed2 = self.drop(self.linear2(F.gelu(embed1)))
+        embeds = self.layer_norm(embed1 + embed2)
+        return embeds
+class AudioEncoder(nn.Module):
+    def __init__(self, audioenc_name:str, d_in: int, d_out: int, sample_rate: int, window_size: int,
+            hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int) -> None:
+        super().__init__()
+        audio_encoder = get_audio_encoder(audioenc_name)
+        self.base = audio_encoder(
+            sample_rate, window_size,
+            hop_size, mel_bins, fmin, fmax,
+            classes_num, d_in)
+        self.projection = Projection(d_in, d_out)
+    def forward(self, x):
+        out_dict = self.base(x)
+        audio_features, audio_classification_output = out_dict['embedding'], out_dict['clipwise_output']
+        projected_vec = self.projection(audio_features)
+        return projected_vec, audio_classification_output
+class TextEncoder(nn.Module):
+    def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None:
+        super().__init__()
+        self.base = AutoModel.from_pretrained(text_model)
+        self.projection = Projection(transformer_embed_dim, d_out)
+    def forward(self, x):
+        out = self.base(**x)[0]
+        out = out[:, 0, :]  # get CLS token output
+        projected_vec = self.projection(out)
+        return projected_vec
+class CLAP(nn.Module):
+    def __init__(self,
+                # audio
+                audioenc_name: str,
+                sample_rate: int,
+                window_size: int,
+                hop_size: int,
+                mel_bins: int,
+                fmin: int,
+                fmax: int,
+                classes_num: int,
+                out_emb: int,
+                # text
+                text_model: str,
+                transformer_embed_dim: int,
+                # common
+                d_proj: int,
+                ):
+        super().__init__()
+        self.audio_encoder = AudioEncoder(
+            audioenc_name, out_emb, d_proj,
+            sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num)
+        self.caption_encoder = TextEncoder(
+            d_proj, text_model, transformer_embed_dim
+        )
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+    def forward(self, audio, text):
+        audio_embed, _ = self.audio_encoder(audio)
+        caption_embed = self.caption_encoder(text)
+        return caption_embed, audio_embed, self.logit_scale.exp()

CLAP/msclap/models/.ipynb_checkpoints/utils-checkpoint.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import argparse
+import yaml
+import sys
+def read_config_as_args(config_path,args=None,is_config_str=False):
+    return_dict = {}
+    if config_path is not None:
+        if is_config_str:
+            yml_config = yaml.load(config_path, Loader=yaml.FullLoader)
+        else:
+            with open(config_path, "r") as f:
+                yml_config = yaml.load(f, Loader=yaml.FullLoader)
+        if args != None:
+            for k, v in yml_config.items():
+                if k in args.__dict__:
+                    args.__dict__[k] = v
+                else:
+                    sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k))
+        else:
+            for k, v in yml_config.items():
+                return_dict[k] = v
+    args = args if args != None else return_dict
+    return argparse.Namespace(**args)

CLAP/msclap/models/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from . import clap
+from . import audio
+from . import utils

CLAP/msclap/models/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (234 Bytes). View file

CLAP/msclap/models/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (302 Bytes). View file

CLAP/msclap/models/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (234 Bytes). View file

CLAP/msclap/models/__pycache__/audio.cpython-310.pyc ADDED Viewed

Binary file (5.39 kB). View file

CLAP/msclap/models/__pycache__/audio.cpython-311.pyc ADDED Viewed

Binary file (10.9 kB). View file

CLAP/msclap/models/__pycache__/audio.cpython-38.pyc ADDED Viewed

Binary file (5.24 kB). View file

CLAP/msclap/models/__pycache__/clap.cpython-310.pyc ADDED Viewed

Binary file (3.67 kB). View file

CLAP/msclap/models/__pycache__/clap.cpython-311.pyc ADDED Viewed

Binary file (6.42 kB). View file

CLAP/msclap/models/__pycache__/clap.cpython-38.pyc ADDED Viewed

Binary file (3.53 kB). View file

CLAP/msclap/models/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (774 Bytes). View file

CLAP/msclap/models/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (1.54 kB). View file

CLAP/msclap/models/__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (741 Bytes). View file

CLAP/msclap/models/audio.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+#
+import torchaudio
+import random
+def get_audio_encoder(name: str):
+    if name == "Cnn14":
+        return Cnn14
+    else:
+        raise Exception('The audio encoder name {} is incorrect or not supported'.format(name))
+class ConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels=in_channels,
+                              out_channels=out_channels,
+                              kernel_size=(3, 3), stride=(1, 1),
+                              padding=(1, 1), bias=False)
+        self.conv2 = nn.Conv2d(in_channels=out_channels,
+                              out_channels=out_channels,
+                              kernel_size=(3, 3), stride=(1, 1),
+                              padding=(1, 1), bias=False)
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        x = F.relu_(self.bn2(self.conv2(x)))
+        if pool_type == 'max':
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg':
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg+max':
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception('Incorrect argument!')
+        return x
+class ConvBlock5x5(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock5x5, self).__init__()
+        self.conv1 = nn.Conv2d(in_channels=in_channels,
+                              out_channels=out_channels,
+                              kernel_size=(5, 5), stride=(1, 1),
+                              padding=(2, 2), bias=False)
+        self.bn1 = nn.BatchNorm2d(out_channels)
+    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        if pool_type == 'max':
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg':
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg+max':
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception('Incorrect argument!')
+        return x
+class AttBlock(nn.Module):
+    def __init__(self, n_in, n_out, activation='linear', temperature=1.):
+        super(AttBlock, self).__init__()
+        self.activation = activation
+        self.temperature = temperature
+        self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
+        self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0, bias=True)
+        self.bn_att = nn.BatchNorm1d(n_out)
+    def forward(self, x):
+        # x: (n_samples, n_in, n_time)
+        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
+        cla = self.nonlinear_transform(self.cla(x))
+        x = torch.sum(norm_att * cla, dim=2)
+        return x, norm_att, cla
+    def nonlinear_transform(self, x):
+        if self.activation == 'linear':
+            return x
+        elif self.activation == 'sigmoid':
+            return torch.sigmoid(x)
+class Cnn14(nn.Module):
+    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+        fmax, classes_num, out_emb):
+        super(Cnn14, self).__init__()
+        window = 'hann'
+        center = True
+        pad_mode = 'reflect'
+        ref = 1.0
+        amin = 1e-10
+        top_db = None
+        # Spectrogram extractor
+        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+            freeze_parameters=True)
+        # Logmel feature extractor
+        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+            freeze_parameters=True)
+        self.freq_masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
+        self.time_masking = torchaudio.transforms.TimeMasking(80)
+        self.bn0 = nn.BatchNorm2d(64)
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+        # out_emb is 2048 for best Cnn14
+        self.fc1 = nn.Linear(2048, out_emb, bias=True)
+        self.fc_audioset = nn.Linear(out_emb, classes_num, bias=True)
+    def forward(self, input, mixup_lambda=None, use_aug=False):
+        """
+        Input: (batch_size, data_length)
+        """
+        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
+        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)
+        # if use_aug:
+        #     random_aug_freq = random.uniform(0,1)
+        #     random_aug_time = random.uniform(0,1)
+        #     if random_aug_freq < 0.2:
+        #         x = self.freq_masking(x)
+        #     if random_aug_time < 0.2:
+        #         x = self.time_masking(x)
+        x = x.transpose(1, 3)
+        x = self.bn0(x)
+        x = x.transpose(1, 3)
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = torch.mean(x, dim=3)
+        x_inner_layer = x.clone()
+        (x1, _) = torch.max(x, dim=2)
+        x2 = torch.mean(x, dim=2)
+        x = x1 + x2
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu_(self.fc1(x))
+        embedding = F.dropout(x, p=0.5, training=self.training)
+        clipwise_output = torch.sigmoid(self.fc_audioset(x))
+        output_dict = {'clipwise_output': clipwise_output, 'embedding': embedding, 'inner_layer': x_inner_layer}
+        return output_dict

CLAP/msclap/models/clap.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import AutoModel
+from .audio import get_audio_encoder
+class Projection(nn.Module):
+    def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None:
+        super().__init__()
+        self.linear1 = nn.Linear(d_in, d_out, bias=False)
+        self.linear2 = nn.Linear(d_out, d_out, bias=False)
+        self.layer_norm = nn.LayerNorm(d_out)
+        self.drop = nn.Dropout(p)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        embed1 = self.linear1(x)
+        embed2 = self.drop(self.linear2(F.gelu(embed1)))
+        embeds = self.layer_norm(embed1 + embed2)
+        return embeds
+class AudioEncoder(nn.Module):
+    def __init__(self, audioenc_name:str, d_in: int, d_out: int, sample_rate: int, window_size: int,
+            hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int) -> None:
+        super().__init__()
+        audio_encoder = get_audio_encoder(audioenc_name)
+        self.base = audio_encoder(
+            sample_rate, window_size,
+            hop_size, mel_bins, fmin, fmax,
+            classes_num, d_in,
+            )
+        self.projection = Projection(d_in, d_out)
+    def forward(self, x, use_aug=False):
+        out_dict = self.base(x, use_aug=use_aug)
+        audio_features, audio_classification_output = out_dict['embedding'], out_dict['clipwise_output']
+        audio_inner_layer = out_dict['inner_layer']
+        projected_vec = self.projection(audio_features)
+        return projected_vec, audio_classification_output, audio_inner_layer
+class TextEncoder(nn.Module):
+    def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None:
+        super().__init__()
+        self.base = AutoModel.from_pretrained(text_model)
+        self.projection = Projection(transformer_embed_dim, d_out)
+    def forward(self, x):
+        out = self.base(**x)[0]
+        out = out[:, 0, :]  # get CLS token output
+        projected_vec = self.projection(out)
+        return projected_vec
+class CLAP(nn.Module):
+    def __init__(self,
+                # audio
+                audioenc_name: str,
+                sample_rate: int,
+                window_size: int,
+                hop_size: int,
+                mel_bins: int,
+                fmin: int,
+                fmax: int,
+                classes_num: int,
+                out_emb: int,
+                # text
+                text_model: str,
+                transformer_embed_dim: int,
+                # common
+                d_proj: int,
+                ):
+        super().__init__()
+        self.audio_encoder = AudioEncoder(
+            audioenc_name, out_emb, d_proj,
+            sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num)
+        self.caption_encoder = TextEncoder(
+            d_proj, text_model, transformer_embed_dim
+        )
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+    def forward(self, audio, text):
+        audio_embed, _, _ = self.audio_encoder(audio)
+        caption_embed = self.caption_encoder(text)
+        return caption_embed, audio_embed, self.logit_scale.exp()

CLAP/msclap/models/utils.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import argparse
+import yaml
+import sys
+def read_config_as_args(config_path,args=None,is_config_str=False):
+    return_dict = {}
+    if config_path is not None:
+        if is_config_str:
+            yml_config = yaml.load(config_path, Loader=yaml.FullLoader)
+        else:
+            with open(config_path, "r") as f:
+                yml_config = yaml.load(f, Loader=yaml.FullLoader)
+        if args != None:
+            for k, v in yml_config.items():
+                if k in args.__dict__:
+                    args.__dict__[k] = v
+                else:
+                    sys.stderr.write("Ignored unknown parameter {} in yaml.\n".format(k))
+        else:
+            for k, v in yml_config.items():
+                return_dict[k] = v
+    args = args if args != None else return_dict
+    return argparse.Namespace(**args)

CLAP/msclap/zero_shot_classification.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""
+This is an example using CLAP to perform zeroshot
+    classification on ESC50 (https://github.com/karolpiczak/ESC-50).
+"""
+from CLAPWrapper import CLAPWrapper
+from esc50_dataset import ESC50
+import torch.nn.functional as F
+import numpy as np
+from tqdm import tqdm
+from sklearn.metrics import accuracy_score
+# Load dataset
+dataset = ESC50(root="data_path", download=False)
+prompt = 'this is a sound of '
+y = [prompt + x for x in dataset.classes]
+# Load and initialize CLAP
+weights_path = "weights_path"
+clap_model = CLAPWrapper(weights_path, use_cuda=False)
+# Computing text embeddings
+text_embeddings = clap_model.get_text_embeddings(y)
+# Computing audio embeddings
+y_preds, y_labels = [], []
+for i in tqdm(range(len(dataset))):
+    x, _, one_hot_target = dataset.__getitem__(i)
+    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
+    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
+    y_preds.append(y_pred)
+    y_labels.append(one_hot_target.detach().cpu().numpy())
+y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
+acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
+print('ESC50 Accuracy {}'.format(acc))
+"""
+The output:
+ESC50 Accuracy: 82.6%
+"""

CLAP/msclap/zero_shot_predictions.py ADDED Viewed

	@@ -0,0 +1,52 @@

+"""
+This is an example using CLAP for zero-shot
+        inference using ESC50 (https://github.com/karolpiczak/ESC-50).
+"""
+from CLAPWrapper import CLAPWrapper
+from esc50_dataset import ESC50
+import torch.nn.functional as F
+# Load ESC50 dataset
+dataset = ESC50(root="data_path", download=True) # set download=True when dataset is not downloaded
+audio_file, target, one_hot_target = dataset[1000]
+audio_file = [audio_file]
+prompt = 'this is a sound of '
+y = [prompt + x for x in dataset.classes]
+# Load and initialize CLAP
+weights_path = "weights_path"
+# Setting use_cuda = True will load the model on a GPU using CUDA
+clap_model = CLAPWrapper(weights_path, use_cuda=False)
+# compute text embeddings from natural text
+text_embeddings = clap_model.get_text_embeddings(y)
+# compute the audio embeddings from an audio file
+audio_embeddings = clap_model.get_audio_embeddings(audio_file, resample=True)
+# compute the similarity between audio_embeddings and text_embeddings
+similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
+similarity = F.softmax(similarity, dim=1)
+values, indices = similarity[0].topk(5)
+# view the results
+print("Ground Truth: {}".format(target))
+print("Top predictions:\n")
+for value, index in zip(values, indices):
+    print(f"{dataset.classes[index]:>16s}: {100 * value.item():.2f}%")
+"""
+The output (the exact numbers may vary):
+Ground Truth: coughing
+Top predictions:
+        coughing: 86.34%
+        sneezing: 9.30%
+drinking sipping: 1.31%
+        laughing: 1.20%
+  glass breaking: 0.81%
+"""

README.md CHANGED Viewed

@@ -8,6 +8,7 @@ sdk_version: 4.29.0
 app_file: app.py
 pinned: false
 license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 app_file: app.py
 pinned: false
 license: mit
+python_version: 3.10.13
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

ldm/modules/encoders/audio_projector_res.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# from ldm.modules.attention import SpatialTransformer, BasicTransformerBlock
+from torch import nn, einsum
+from einops import rearrange, repeat
+# q, k and v are all projected from the same audio tokens (self-attention over audio)
+class MyCrossAttention(nn.Module):
+    def __init__(self,  device="cuda", audio_dim = 1024, context_dim = 768, dropout=0.0, h = 8, dim_head=40):
+        super().__init__()
+        self.h = h
+        inner_dim = dim_head * h
+        self.scale = dim_head ** -0.5
+        self.to_q_adapter = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_k_adapter = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v_adapter = nn.Linear(context_dim, inner_dim, bias=False)
+    def forward(self, audio):
+        q_adapter = self.to_q_adapter(audio) #from text
+        k_adapter = self.to_k_adapter(audio)
+        v_adapter = self.to_v_adapter(audio)
+        q_adapter, k_adapter, v_adapter = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=self.h), (q_adapter, k_adapter, v_adapter))
+        sim_adapter = einsum('b i d, b j d -> b i j', q_adapter, k_adapter) * self.scale
+        attn_adapter = sim_adapter.softmax(dim=-1)
+        out = einsum('b i j, b j d -> b i d', attn_adapter, v_adapter)
+        out = rearrange(out, '(b h) n d -> b n (h d)', h=self.h)
+        # print(f'ca out shape is: {out.shape}')
+        return out
+class Adapter(nn.Module):
+    def __init__(self,  device="cuda", audio_dim = 1024, context_dim = 768, dropout=0.0, h = 8, dim_head=40, audio_token_count = 10, initial_channel_dim=1, transformer_layer_count=4):
+        super(Adapter, self).__init__()
+        self.h = h
+        inner_dim = dim_head * h
+        audio_att_inner_dim = audio_token_count
+        self.audio_emb_projection = nn.Sequential(
+            nn.Conv1d(initial_channel_dim, audio_att_inner_dim, kernel_size = 17, stride = 1, padding = 8),
+            nn.GELU(),
+            nn.Conv1d(audio_att_inner_dim, audio_att_inner_dim, kernel_size = 17, stride = 1, padding = 8),
+            nn.GELU(),
+            nn.LayerNorm([audio_att_inner_dim, audio_dim]),
+            nn.Conv1d(audio_att_inner_dim, audio_att_inner_dim, kernel_size = 17, stride = 1, padding = 8),
+            nn.GELU(),
+            nn.LayerNorm([audio_att_inner_dim, audio_dim]),
+            nn.ConvTranspose1d(audio_att_inner_dim, audio_att_inner_dim, kernel_size = 17, stride=3, padding=7),
+            nn.GELU(),
+            nn.LayerNorm([audio_att_inner_dim, 3*audio_dim]),
+            nn.GELU(),
+            nn.Conv1d(audio_att_inner_dim, audio_att_inner_dim, kernel_size = 17, stride=4, padding=7),
+            nn.Dropout(dropout)
+        )
+        #create a stack of MyCrossAttention layers
+        self.cross_attention = nn.ModuleList([MyCrossAttention(device, audio_dim, context_dim, dropout, h, dim_head) for _ in range(transformer_layer_count)])
+        #create a stack of linear, gelu, linear dropout layers to be used after the cross attention
+        self.between_attention = nn.ModuleList([nn.Sequential(
+            nn.Linear(inner_dim, inner_dim),
+            nn.GELU(),
+            nn.Linear(inner_dim, context_dim),
+            nn.Dropout(dropout)
+            ) for _ in range(transformer_layer_count)])
+        self.to_out_adapter = nn.Sequential(
+        nn.Linear(context_dim, context_dim),
+        nn.Dropout(dropout)
+        )
+    def forward(self, audio_context):
+        audio_proj = self.audio_emb_projection(audio_context) #[bs, 64, 1024]
+        for cross_attention, between_attention in zip(self.cross_attention, self.between_attention):
+            out = cross_attention(audio_proj)
+            out = between_attention(out) + audio_proj
+            # print(f'out shape is: {out.shape}')
+        out = self.to_out_adapter(out) #[bs, 77, 768]
+        # print(f'context dim is: {out.shape}')
+        return out

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+accelerate==0.25.0
+diffusers==0.27.2
+einops==0.7.0
+gradio==4.26.0
+gradio_client==0.15.1
+librosa==0.10.1
+numpy==1.26.4
+omegaconf==2.3.0
+pillow==10.3.0
+scikit-learn==1.4.2
+scipy==1.13.0
+soundfile==0.12.1
+torch==2.0.1
+torchaudio==2.0.2
+torchlibrosa==0.1.0
+torchvision==0.15.2
+tqdm==4.66.2
+transformers==4.35.2