Spaces:

fpaissan
/

tinyCLAP

Running

App Files Files Community

fpaissan committed on Jun 18, 2024

Commit

5e02fce

•

1 Parent(s): 70d8c7d

tinyCLAP Space

Browse files

Files changed (9) hide show

.gitignore +174 -0
1-20133-A-39.wav +0 -0
app.py +164 -0
hparams/inference.yaml +153 -0
modules.py +423 -0
requirements.txt +7 -0
siren.wav +0 -0
tunztunz_music.wav +0 -0
whistling_and_chirping.wav +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,174 @@

+results/
+*.pth
+drop*
+confs.py
+# CLAP/
+build.sh
+run.sh
+Dockerfile
+debug
+*.swp
+ckp/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

1-20133-A-39.wav ADDED Viewed

Binary file (441 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""Gradio demo that scores how well a text prompt matches an audio clip.
+The score is computed with tinyCLAP, a distilled CLAP model
+(https://arxiv.org/abs/2311.14517).
+Authors
+    * Francesco Paissan 2024
+"""
+import sys
+import gradio as gr
+import speechbrain as sb
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+import torchaudio.transforms as T
+from hyperpyyaml import load_hyperpyyaml
+from speechbrain.utils.distributed import run_on_main
+from speechbrain.utils.metric_stats import MetricStats
+# Turn off the cuDNN backend for the whole process.
+torch.backends.cudnn.enabled = False
+# Small constant; NOTE(review): not referenced anywhere in the visible part of this file.
+eps = 1e-10
+class CLAPBrain(sb.Brain):
+    def preprocess(self, wavs):
+        """Pre-process wavs."""
+        x = self.hparams.spectrogram_extractor(wavs)
+        x = self.hparams.logmel_extractor(x)
+        return x
+    def prepare_txt_features(self, text):
+        """Prepares text features to input in CLAP text encoder."""
+        txt_inp = self.hparams.txt_tokenizer(
+            text,
+            max_length=self.hparams.text_max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        ).to(self.device)
+        return txt_inp
+    def compute_sim(self, audio_embed, caption_embed):
+        """Computes CLAP similarity metric."""
+        similarity = audio_embed @ caption_embed.t()
+        return similarity
+    def compute_forward(self, batch, stage):
+        if len(batch) == 2:
+            wavs, caption = batch
+        else:
+            wavs, caption, _, _ = batch
+        wavs = wavs.to(self.device).squeeze(1)
+        x_sb = self.preprocess(wavs)
+        text_inp = self.prepare_txt_features(caption)
+        txt_shared, aud_shared = self.hparams.clap(
+            x_sb,
+            text_inp.input_ids.data,
+            text_inp.token_type_ids.data,
+            text_inp.attention_mask.data,
+        )
+        if not hasattr(self.modules, "clap"):
+            aud_shared_student, _, _ = self.modules.clap_student(x_sb)
+            aud_shared_student = aud_shared_student / aud_shared_student.norm(
+                dim=1, keepdim=True
+            )
+        return txt_shared, aud_shared, aud_shared_student
+def audio_preprocess(x, sample_rate):
+    tmp, sr = torchaudio.load(x)
+    resample = T.Resample(sr, sample_rate)
+    tmp = resample(tmp)
+    tmp = tmp.sum(0, keepdims=True)
+    return tmp
+@torch.no_grad()
+def inference_wrapper(clap_brain):
+    def f(wav_path, prompt):
+        clap_brain.modules.eval()
+        tmp = audio_preprocess(wav_path, clap_brain.hparams.sample_rate)
+        ret = clap_brain.compute_forward([tmp, prompt], stage=sb.Stage.TEST)
+        sim = clap_brain.compute_sim(ret[2], ret[0])
+        return f"tinyCLAP similarity is: {round(sim.item(), 2)}"
+    return f
+if __name__ == "__main__":
+    # CLI:
+    # hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+    hparams_file = "hparams/inference.yaml"
+    # Load hyperparameters file with command-line overrides
+    with open(hparams_file) as fin:
+        hparams = load_hyperpyyaml(fin, {})
+    # Tensorboard logging
+    if hparams["use_tensorboard"]:
+        from speechbrain.utils.train_logger import TensorboardLogger
+        hparams["tensorboard_train_logger"] = TensorboardLogger(
+            hparams["tensorboard_logs_folder"]
+        )
+    hparams["clap"].to(hparams["device"])
+    hparams["clap"].requires_grad_(False)
+    hparams["clap"].eval()
+    if hparams["zs_eval"]:
+        hparams["class_list"] = datasets["train"].dataset.classes
+    if hparams["audioenc_name_student"] is not None:
+        if hparams["projection_only"]:
+            print("Freezing Base AudioEncoder. Updating only the projection layers.")
+            hparams["student_model"].base.requires_grad_(False)
+    hparams["spectrogram_extractor"].to(hparams["device"])
+    hparams["logmel_extractor"].to(hparams["device"])
+    clap_brain = CLAPBrain(
+        modules=hparams["modules"],
+        hparams=hparams,
+    )
+    if hparams["pretrained_CLAP"] is not None:
+        print("Loading CLAP model...")
+        run_on_main(hparams["load_CLAP"].collect_files)
+        hparams["load_CLAP"].load_collected()
+    inference_api = inference_wrapper(clap_brain)
+    examples_list = [
+        ["./tunztunz_music.wav", "this is the sound of house music"],
+        ["./siren.wav", "this is the sound of sirens wailing"],
+        [
+            "./whistling_and_chirping.wav",
+            "someone is whistling while birds are chirping",
+        ],
+    ]
+    demo = gr.Interface(
+        fn=inference_api,
+        inputs=[gr.Audio(type="filepath"), gr.Textbox()],
+        outputs=["text"],
+        examples=examples_list,
+    )
+    demo.launch()

hparams/inference.yaml ADDED Viewed

	@@ -0,0 +1,153 @@

+# #################################
+# Hyperparameters for the tinyCLAP inference demo
+# (adapted from the recipe for distilling the CLAP baseline).
+#
+# Author:
+#  * Francesco Paissan 2024
+# #################################
+# Seed needs to be set at top of yaml, before objects with parameters are made
+seed: 1234
+__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
+# Set up folders for reading from and writing to -- if a folder is null, that dataset is ignored
+esc_folder: null
+us8k_folder: null
+tut17_folder: null
+audiocaps_folder: null
+macs_folder: null
+clotho_folder: null
+fsd50k_folder: null
+device: "cpu"
+projection_only: False
+# Audio Enc Student type
+audioenc_name_student: phinet_alpha_1.50_beta_0.75_t0_6_N_7
+aud_emb_dim_student: 2048
+zs_eval: False
+clap_ckpt: "https://zenodo.org/records/7312125/files/CLAP_weights_2022.pth"
+experiment_name: tinyCLAP
+output_folder: !ref ./results/<experiment_name>/<seed>
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+# Tensorboard logs
+use_tensorboard: False
+tensorboard_logs_folder: !ref <output_folder>/tb_logs/
+ckpt_interval_minutes: 15 # save checkpoint every N min
+# Training parameters
+number_of_epochs: 100
+batch_size: 64
+lr: 0.012
+sample_rate: 44100
+signal_length_s: 5
+# Feature parameters
+n_mels: 64
+spec_mag_power: 1
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+    limit: !ref <number_of_epochs>
+opt_class: !name:torch.optim.Adam
+    lr: !ref <lr>
+lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
+    factor: 0.1
+    patience: 10
+# Logging + checkpoints
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+    save_file: !ref <train_log>
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+    checkpoints_dir: !ref <save_folder>
+    recoverables:
+        student_model: !ref <student_model>
+        counter: !ref <epoch_counter>
+pretrained_CLAP: !ref fpaissan/tinyCLAP/<audioenc_name_student>.ckpt
+load_CLAP: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    collect_in: !ref <save_folder>
+    loadables:
+        student_model: !ref <student_model>
+    paths:
+        student_model: !ref <pretrained_CLAP>
+fmin: 50
+fmax: 14000
+aud_emb_classes_num: 527
+emb_norm_type: bn
+aud_emb_dim: 2048
+txt_emb_dim: 768
+shared_emb_dim: 1024
+text_max_length: 100
+use_pretrained: True
+clap: !new:modules.CLAP
+    audioenc_name: Cnn14
+    classes_num: !ref <aud_emb_classes_num>
+    out_emb: !ref <aud_emb_dim>
+    text_model: bert-base-uncased
+    transformer_embed_dim: !ref <txt_emb_dim>
+    d_proj: !ref <shared_emb_dim>
+    pretrained_weights: !ref <use_pretrained>
+    CLAP_weights: !ref <clap_ckpt>
+    audioenc_name_student: !ref <audioenc_name_student>
+    out_emb_student: !ref <aud_emb_dim_student>
+txt_tokenizer: !apply:transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: bert-base-uncased
+# Interpretation hyperparams
+K: 1024
+# pre-processing
+n_fft: 1024
+hop_length: 320
+win_length: 1024
+use_melspectra_log1p: False
+use_melspectra: True
+use_stft2mel: True
+# Spectrogram extractor
+spectrogram_extractor: !new:torchlibrosa.stft.Spectrogram
+    n_fft: !ref <n_fft>
+    hop_length: !ref <hop_length>
+    win_length: !ref <win_length>
+    window: "hann"
+    center: True
+    pad_mode: "reflect"
+    freeze_parameters: True
+# Logmel feature extractor
+logmel_extractor: !new:torchlibrosa.stft.LogmelFilterBank
+    sr: !ref <sample_rate>
+    n_fft: !ref <win_length>
+    n_mels: !ref <n_mels>
+    fmin: !ref <fmin>
+    fmax: !ref <fmax>
+    ref: 1.0
+    amin: 0.0000000001
+    top_db: null
+    freeze_parameters: True
+student_model: !new:modules.AudioEncoder
+    audioenc_name: !ref <audioenc_name_student>
+    d_in: !ref <aud_emb_dim_student>
+    d_out: !ref <shared_emb_dim>
+    classes_num: !ref <aud_emb_classes_num>
+modules:
+    clap_student: !ref <student_model>

modules.py ADDED Viewed

	@@ -0,0 +1,423 @@

+"""
+Code to define CLAP-related networks.
+Some code inspired from here https://github.com/zhepeiw/clap_curation
+Credits:
+    * Francesco Paissan 2024
+"""
+import numpy as np
+import torch
+import torch.nn.functional as F
+from micromind.networks import PhiNet
+from speechbrain.utils.fetching import fetch
+from torch import nn
+from torchinfo import summary
+from transformers import AutoModel, BatchEncoding
+def get_model_from_str(s, vs=("alpha", "beta", "t0", "N")):
+    def get_var(s, key):
+        tmp = s.split("_")
+        return tmp[tmp.index(key) + 1]
+    verb = "PhiNet initialized with "
+    ret = {}
+    for k in vs:
+        verb += f"{k}={get_var(s, k)} "
+        ret[k] = float(get_var(s, k))
+    ret["t_zero"] = ret["t0"]
+    ret["num_layers"] = ret["N"]
+    del ret["t0"]
+    del ret["N"]
+    return ret
+def get_audio_encoder(name: str):
+    if name == "Cnn14":
+        return Cnn14
+    elif "phinet" in name:
+        phinet_conf = get_model_from_str(name)
+        return PhiNet(input_shape=(1, 640, 64), compatibility=True, **phinet_conf)
+    else:
+        raise Exception(
+            "The audio encoder name {} is incorrect or not supported".format(name)
+        )
+class Projection(nn.Module):
+    def __init__(self, d_in: int, d_out: int, p: float = 0.5) -> None:
+        super().__init__()
+        self.linear1 = nn.Linear(d_in, d_out, bias=False)
+        self.linear2 = nn.Linear(d_out, d_out, bias=False)
+        self.layer_norm = nn.LayerNorm(d_out)
+        self.drop = nn.Dropout(p)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        embed1 = self.linear1(x)
+        embed2 = self.drop(self.linear2(F.gelu(embed1)))
+        embeds = self.layer_norm(embed1 + embed2)
+        return embeds
+class PhiNet(PhiNet):
+    def __init__(self, embedding_dim=2048, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.bn0 = nn.BatchNorm2d(64)
+        if embedding_dim is not None:
+            in_channels_next = self._layers[-1]._layers[-2].weight.shape[0]
+            self.pn_block = nn.Conv2d(
+                in_channels_next,
+                embedding_dim,
+                kernel_size=1,
+                stride=2,
+            )
+    def forward(self, x):
+        if x.dim() == 3:
+            x = x[:, None]
+        x = x.transpose(1, 3)
+        x = self.bn0(x)
+        x = x.transpose(1, 3)
+        x = super().forward(x)
+        embedding = x
+        x = self.pn_block(x)
+        x = x.mean((-1, -2))
+        return {"embedding": (x, embedding), "clipwise_output": x}
+class ConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+        self.conv2 = nn.Conv2d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        x = F.relu_(self.bn2(self.conv2(x)))
+        if pool_type == "max":
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg":
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg+max":
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception("Incorrect argument!")
+        return x
+class ConvBlock5x5(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock5x5, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(5, 5),
+            stride=(1, 1),
+            padding=(2, 2),
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(out_channels)
+    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        if pool_type == "max":
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg":
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg+max":
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception("Incorrect argument!")
+        return x
+class AttBlock(nn.Module):
+    def __init__(self, n_in, n_out, activation="linear", temperature=1.0):
+        super(AttBlock, self).__init__()
+        self.activation = activation
+        self.temperature = temperature
+        self.att = nn.Conv1d(
+            in_channels=n_in,
+            out_channels=n_out,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.cla = nn.Conv1d(
+            in_channels=n_in,
+            out_channels=n_out,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.bn_att = nn.BatchNorm1d(n_out)
+    def forward(self, x):
+        # x: (n_samples, n_in, n_time)
+        norm_att = torch.softmax(torch.clamp(self.att(x), -10, 10), dim=-1)
+        cla = self.nonlinear_transform(self.cla(x))
+        x = torch.sum(norm_att * cla, dim=2)
+        return x, norm_att, cla
+    def nonlinear_transform(self, x):
+        if self.activation == "linear":
+            return x
+        elif self.activation == "sigmoid":
+            return torch.sigmoid(x)
+class Cnn14(nn.Module):
+    def __init__(
+        self,
+        classes_num,
+        out_emb,
+    ):
+        super(Cnn14, self).__init__()
+        self.bn0 = nn.BatchNorm2d(64)
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+        # out_emb is 2048 for best Cnn14
+        self.fc1 = nn.Linear(2048, out_emb, bias=True)
+        self.fc_audioset = nn.Linear(out_emb, classes_num, bias=True)
+    def forward(self, x, mixup_lambda=None):
+        """
+        Input: (batch_size, data_length)
+        """
+        # (batch_size, 1, time_steps, mel_bins)
+        if x.dim() == 3:
+            x = x.unsqueeze(1)
+        x = x.transpose(1, 3)
+        x = self.bn0(x)
+        x = x.transpose(1, 3)
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x4_out = self.conv_block3(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x4_out, p=0.2, training=self.training)
+        x3_out = self.conv_block4(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x3_out, p=0.2, training=self.training)
+        x2_out = self.conv_block5(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x2_out, p=0.2, training=self.training)
+        x1_out = self.conv_block6(x, pool_size=(1, 1), pool_type="avg")
+        x = F.dropout(x1_out, p=0.2, training=self.training)
+        x = torch.mean(x, dim=3)
+        (x1, _) = torch.max(x, dim=2)
+        x2 = torch.mean(x, dim=2)
+        x = x1 + x2
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu_(self.fc1(x))
+        embedding = F.dropout(x, p=0.5, training=self.training)
+        clipwise_output = torch.sigmoid(self.fc_audioset(x))
+        output_dict = {
+            "clipwise_output": clipwise_output,
+            "embedding": (embedding, x1_out, x2_out, x3_out, x4_out),
+        }
+        return output_dict
+class AudioEncoder(nn.Module):
+    def __init__(
+        self,
+        audioenc_name: str,
+        d_in: int,
+        d_out: int,
+        classes_num: int,
+    ) -> None:
+        super().__init__()
+        audio_encoder = get_audio_encoder(audioenc_name)
+        if not "phinet" in audioenc_name:
+            self.base = audio_encoder(
+                classes_num,
+                d_in,
+            )
+        else:
+            self.base = audio_encoder
+        self.projection = Projection(d_in, d_out)
+    def forward(self, x):
+        out_dict = self.base(x)
+        audio_features, audio_classification_output = (
+            out_dict["embedding"][0],
+            out_dict["clipwise_output"],
+        )
+        projected_vec = self.projection(audio_features)
+        return (
+            projected_vec,
+            out_dict["embedding"][1:],
+            audio_classification_output,
+        )
+class TextEncoder(nn.Module):
+    def __init__(self, d_out: int, text_model: str, transformer_embed_dim: int) -> None:
+        super().__init__()
+        self.base = AutoModel.from_pretrained(text_model)
+        self.projection = Projection(transformer_embed_dim, d_out)
+    def forward(self, x):
+        out = self.base(**x)[0]
+        hidden_state = out
+        out = out[:, 0, :]  # get CLS token output
+        projected_vec = self.projection(out)
+        self.hidden_state = hidden_state.detach()
+        return projected_vec
+class CLAP(nn.Module):
+    def __init__(
+        self,
+        # audio
+        audioenc_name: str,
+        classes_num: int,
+        out_emb: int,
+        # text
+        text_model: str,
+        transformer_embed_dim: int,
+        # common
+        d_proj: int,
+        pretrained_weights: bool = True,
+        CLAP_weights: str = None,
+        # audio student
+        audioenc_name_student=None,
+        out_emb_student=None,
+    ):
+        super().__init__()
+        ckpt_path = None
+        if pretrained_weights and CLAP_weights is not None:
+            weights_path = "CLAP_weights.pth"
+            tmp = CLAP_weights.split("/")
+            print(
+                " ".join(
+                    """Fetching CLAP weights.
+                The checkpoint is a ~2GB, so be patient.
+                The process will start right after.
+                """.split()
+                )
+            )
+            fetch(
+                tmp[-1],
+                "/".join(tmp[:-1]),
+                savedir=".",
+                save_filename=weights_path,
+            )
+            ckpt_path = weights_path
+        self.audio_encoder = AudioEncoder(
+            audioenc_name,
+            out_emb,
+            d_proj,
+            classes_num,
+        )
+        self.caption_encoder = TextEncoder(d_proj, text_model, transformer_embed_dim)
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        state_dict = torch.load(ckpt_path)["model"]
+        self.load_state_dict(self.clean_state_dict(state_dict))
+        print("Loaded pretrained CLAP checkpoint.")
+    @staticmethod
+    def clean_state_dict(state_dict):
+        """Removes pre-processing keys from the state-dict."""
+        keys_to_remove = []
+        for k in state_dict:
+            if "spectrogram" in k or "mel" in k:
+                keys_to_remove.append(k)
+        for k in keys_to_remove:
+            state_dict.pop(
+                k,
+                None,
+            )
+        return state_dict
+    def forward(self, audio, input_ids, token_type_ids, attention_mask, single=None):
+        audio_embed = None
+        caption_embed = None
+        if not single == "txt":
+            audio_embed, _, _ = self.audio_encoder(audio)
+            audio_embed = audio_embed / audio_embed.norm(dim=1, keepdim=True)
+        if not single == "aud":
+            text = BatchEncoding(
+                {
+                    "input_ids": input_ids,
+                    "token_type_ids": token_type_ids,
+                    "attention_mask": attention_mask,
+                }
+            )
+            caption_embed = self.caption_encoder(text)
+            caption_embed = caption_embed / caption_embed.norm(dim=1, keepdim=True)
+        return caption_embed, audio_embed

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+speechbrain
+pandas
+transformers==4.28.1
+torchlibrosa
+micromind
+torchinfo
+gradio

siren.wav ADDED Viewed

Binary file (640 kB). View file

tunztunz_music.wav ADDED Viewed

Binary file (963 kB). View file

whistling_and_chirping.wav ADDED Viewed

Binary file (328 kB). View file