Feature Extraction
PyTorch
Bioacoustics
ilyassmoummad committed aab5975 (1 parent: 2cc9f37)
README.md ADDED
# ProtoCLR

This repository contains a CvT-13 ([Convolutional Vision Transformer](https://arxiv.org/abs/2103.15808)) model trained from scratch on the [Xeno-Canto dataset](https://huggingface.co/datasets/ilyassmoummad/Xeno-Canto-6s-16khz), specifically on 6-second audio segments sampled at 16 kHz. The model is trained on Mel spectrograms of bird sounds using ProtoCLR ([Prototypical Contrastive Loss](https://arxiv.org/abs/2409.08589)) for 300 epochs and can be used as a feature extractor for bird audio classification and related tasks.

## Files

- `cvt.py`: Defines the CvT-13 model architecture.
- `protoclr.pth`: Pre-trained ProtoCLR model weights (300 epochs).
- `config/`: Configuration files for the CvT-13 setup.
- `melspectrogram.py`: Contains the `MelSpectrogramProcessor` class, which converts audio waveforms into Mel spectrograms, the format expected by the model.

## Setup

1. **Install dependencies**:
   Ensure you have the required Python packages, including `torch` and the other dependencies listed in `requirements.txt`:

   ```bash
   pip install -r requirements.txt
   ```

2. **Prepare the audio**:
   - **Sample rate**: Ensure your audio is sampled at 16 kHz.
   - **Padding**: For audio shorter than 6 seconds, pad with zeros or repeat the audio to reach 6 seconds.
   - **Chunking**: For audio longer than 6 seconds, split it into 6-second chunks (see the sketch below).

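A minimal sketch of this preparation step, assuming `torchaudio` (listed in `requirements.txt`) is used for loading and resampling; the helper name and file path are illustrative and not part of this repository:

```python
import torch
import torch.nn.functional as F
import torchaudio

TARGET_SR = 16000
SEGMENT_SAMPLES = 6 * TARGET_SR  # 6-second segments at 16 kHz


def prepare_segments(file_path):
    # Load the recording and resample to 16 kHz if necessary
    waveform, sr = torchaudio.load(file_path)      # (channels, samples)
    waveform = waveform.mean(dim=0)                 # mix down to mono: (samples,)
    if sr != TARGET_SR:
        waveform = torchaudio.functional.resample(waveform, sr, TARGET_SR)

    # Pad short audio with zeros up to 6 seconds
    if waveform.numel() < SEGMENT_SAMPLES:
        waveform = F.pad(waveform, (0, SEGMENT_SAMPLES - waveform.numel()))

    # Split long audio into non-overlapping 6-second chunks,
    # zero-padding the last chunk if it is incomplete
    chunks = list(torch.split(waveform, SEGMENT_SAMPLES))
    chunks[-1] = F.pad(chunks[-1], (0, SEGMENT_SAMPLES - chunks[-1].numel()))
    return torch.stack(chunks)                      # (num_chunks, 96000)
```

Each row of the returned tensor is a 6-second, 16 kHz mono segment that can be fed to the `MelSpectrogramProcessor` as in the example below.
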
## Usage

To use the model, convert your audio into Mel spectrograms with the `MelSpectrogramProcessor`, then pass the resulting spectrograms to the CvT-13 feature extractor.

## Example Code

The following example demonstrates loading, preprocessing, and running inference on an audio file:

```python
import torch
import torchaudio
from cvt import cvt13  # Model architecture
from melspectrogram import MelSpectrogramProcessor  # Mel spectrogram preprocessing

# Initialize the preprocessor and the pre-trained model
preprocessor = MelSpectrogramProcessor()
model = cvt13()
model.load_state_dict(torch.load("protoclr.pth", map_location="cpu"))
model.eval()

# Load and preprocess a sample audio waveform
def load_waveform(file_path):
    # Replace this with your specific audio loading function;
    # here, torchaudio is used to load and resample to 16 kHz
    waveform, sr = torchaudio.load(file_path)
    waveform = waveform.mean(dim=0, keepdim=True)  # mix down to mono: (1, samples)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    return waveform

waveform = load_waveform("path/to/audio.wav")  # Load your audio file here

# Ensure the waveform is sampled at 16 kHz, then pad/chunk as needed to reach a 6 s length
input_tensor = preprocessor.process(waveform).unsqueeze(0)  # Add batch dimension: (1, 1, 128, T)

# Run the model on the preprocessed audio
with torch.no_grad():
    output = model(input_tensor)
print("Model output shape:", output.shape)
```

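Each 6-second segment yields a single embedding vector (384-dimensional, matching the last `DIM_EMBED` entry in `config/cvt-13-224x224.yaml`). For longer recordings, here is a hedged sketch, reusing the illustrative `prepare_segments` helper from the Setup section, of batching segments and pooling their embeddings into one clip-level feature:

```python
import torch

# Assumes `prepare_segments`, `preprocessor`, and `model` from the snippets above
chunks = prepare_segments("path/to/long_recording.wav")            # (num_chunks, 96000)
specs = torch.stack([preprocessor.process(c) for c in chunks])     # (num_chunks, 128, 301)
specs = specs.unsqueeze(1)                                         # add channel dim: (num_chunks, 1, 128, 301)

with torch.no_grad():
    embeddings = model(specs)                                      # (num_chunks, 384) for num_chunks > 1

clip_embedding = embeddings.mean(dim=0)                            # simple average pooling over segments
```
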
## Citation

If you use our model in your research, please cite the following paper:

```bibtex
@misc{moummad2024dirlbs,
      title={Domain-Invariant Representation Learning of Bird Sounds},
      author={Ilyass Moummad and Romain Serizel and Emmanouil Benetos and Nicolas Farrugia},
      year={2024},
      eprint={2409.08589},
      archivePrefix={arXiv},
      primaryClass={cs.SD},
      url={https://arxiv.org/abs/2409.08589},
}
```
config/__init__.py ADDED
from .default import _C as config
from .default import update_config
from .default import _update_config_from_file
from .default import save_config
config/comm.py ADDED
1
+ import pickle
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+
6
+
7
+ class Comm(object):
8
+ def __init__(self, local_rank=0):
9
+ self.local_rank = 0
10
+
11
+ @property
12
+ def world_size(self):
13
+ if not dist.is_available():
14
+ return 1
15
+ if not dist.is_initialized():
16
+ return 1
17
+ return dist.get_world_size()
18
+
19
+ @property
20
+ def rank(self):
21
+ if not dist.is_available():
22
+ return 0
23
+ if not dist.is_initialized():
24
+ return 0
25
+ return dist.get_rank()
26
+
27
+ @property
28
+ def local_rank(self):
29
+ if not dist.is_available():
30
+ return 0
31
+ if not dist.is_initialized():
32
+ return 0
33
+ return self._local_rank
34
+
35
+ @local_rank.setter
36
+ def local_rank(self, value):
37
+ if not dist.is_available():
38
+ self._local_rank = 0
39
+ if not dist.is_initialized():
40
+ self._local_rank = 0
41
+ self._local_rank = value
42
+
43
+ @property
44
+ def head(self):
45
+ return 'Rank[{}/{}]'.format(self.rank, self.world_size)
46
+
47
+ def is_main_process(self):
48
+ return self.rank == 0
49
+
50
+ def synchronize(self):
51
+ """
52
+ Helper function to synchronize (barrier) among all processes when
53
+ using distributed training
54
+ """
55
+ if self.world_size == 1:
56
+ return
57
+ dist.barrier()
58
+
59
+
60
+ comm = Comm()
61
+
62
+
63
+ def all_gather(data):
64
+ """
65
+ Run all_gather on arbitrary picklable data (not necessarily tensors)
66
+ Args:
67
+ data: any picklable object
68
+ Returns:
69
+ list[data]: list of data gathered from each rank
70
+ """
71
+ world_size = comm.world_size
72
+ if world_size == 1:
73
+ return [data]
74
+
75
+ # serialized to a Tensor
76
+ buffer = pickle.dumps(data)
77
+ storage = torch.ByteStorage.from_buffer(buffer)
78
+ tensor = torch.ByteTensor(storage).to("cuda")
79
+
80
+ # obtain Tensor size of each rank
81
+ local_size = torch.LongTensor([tensor.numel()]).to("cuda")
82
+ size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)]
83
+ dist.all_gather(size_list, local_size)
84
+ size_list = [int(size.item()) for size in size_list]
85
+ max_size = max(size_list)
86
+
87
+ # receiving Tensor from all ranks
88
+ # we pad the tensor because torch all_gather does not support
89
+ # gathering tensors of different shapes
90
+ tensor_list = []
91
+ for _ in size_list:
92
+ tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
93
+ if local_size != max_size:
94
+ padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
95
+ tensor = torch.cat((tensor, padding), dim=0)
96
+ dist.all_gather(tensor_list, tensor)
97
+
98
+ data_list = []
99
+ for size, tensor in zip(size_list, tensor_list):
100
+ buffer = tensor.cpu().numpy().tobytes()[:size]
101
+ data_list.append(pickle.loads(buffer))
102
+
103
+ return data_list
104
+
105
+
106
+ def reduce_dict(input_dict, average=True):
107
+ """
108
+ Args:
109
+ input_dict (dict): all the values will be reduced
110
+ average (bool): whether to do average or sum
111
+ Reduce the values in the dictionary from all processes so that process with rank
112
+ 0 has the averaged results. Returns a dict with the same fields as
113
+ input_dict, after reduction.
114
+ """
115
+ world_size = comm.world_size
116
+ if world_size < 2:
117
+ return input_dict
118
+ with torch.no_grad():
119
+ names = []
120
+ values = []
121
+ # sort the keys so that they are consistent across processes
122
+ for k in sorted(input_dict.keys()):
123
+ names.append(k)
124
+ values.append(input_dict[k])
125
+ values = torch.stack(values, dim=0)
126
+ dist.reduce(values, dst=0)
127
+ if dist.get_rank() == 0 and average:
128
+ # only main process gets accumulated, so only divide by
129
+ # world_size in this case
130
+ values /= world_size
131
+ reduced_dict = {k: v for k, v in zip(names, values)}
132
+ return reduced_dict
config/cvt-13-224x224.yaml ADDED
OUTPUT_DIR: 'OUTPUT/'
WORKERS: 6
PRINT_FREQ: 500
AMP:
  ENABLED: true

MODEL:
  NAME: cls_cvt
  SPEC:
    INIT: 'trunc_norm'
    NUM_STAGES: 3
    PATCH_SIZE: [7, 3, 3]
    PATCH_STRIDE: [4, 2, 2]
    PATCH_PADDING: [2, 1, 1]
    DIM_EMBED: [64, 192, 384]
    NUM_HEADS: [1, 3, 6]
    DEPTH: [1, 2, 10]
    MLP_RATIO: [4.0, 4.0, 4.0]
    ATTN_DROP_RATE: [0.0, 0.0, 0.0]
    DROP_RATE: [0.0, 0.0, 0.0]
    DROP_PATH_RATE: [0.0, 0.0, 0.1]
    QKV_BIAS: [True, True, True]
    CLS_TOKEN: [False, False, True]
    POS_EMBED: [False, False, False]
    QKV_PROJ_METHOD: ['dw_bn', 'dw_bn', 'dw_bn']
    KERNEL_QKV: [3, 3, 3]
    PADDING_KV: [1, 1, 1]
    STRIDE_KV: [2, 2, 2]
    PADDING_Q: [1, 1, 1]
    STRIDE_Q: [1, 1, 1]
AUG:
  MIXUP_PROB: 1.0
  MIXUP: 0.8
  MIXCUT: 1.0
  TIMM_AUG:
    USE_LOADER: true
    RE_COUNT: 1
    RE_MODE: pixel
    RE_SPLIT: false
    RE_PROB: 0.25
    AUTO_AUGMENT: rand-m9-mstd0.5-inc1
    HFLIP: 0.5
    VFLIP: 0.0
    COLOR_JITTER: 0.4
    INTERPOLATION: bicubic
LOSS:
  LABEL_SMOOTHING: 0.1
CUDNN:
  BENCHMARK: true
  DETERMINISTIC: false
  ENABLED: true
DATASET:
  DATASET: 'imagenet'
  DATA_FORMAT: 'jpg'
  ROOT: 'DATASET/imagenet/'
  TEST_SET: 'val'
  TRAIN_SET: 'train'
TEST:
  BATCH_SIZE_PER_GPU: 32
  IMAGE_SIZE: [224, 224]
  MODEL_FILE: ''
  INTERPOLATION: 3
TRAIN:
  BATCH_SIZE_PER_GPU: 256
  LR: 0.00025
  IMAGE_SIZE: [224, 224]
  BEGIN_EPOCH: 0
  END_EPOCH: 300
  LR_SCHEDULER:
    METHOD: 'timm'
    ARGS:
      sched: 'cosine'
      warmup_epochs: 5
      warmup_lr: 0.000001
      min_lr: 0.00001
      cooldown_epochs: 10
      decay_rate: 0.1
  OPTIMIZER: adamW
  WD: 0.05
  WITHOUT_WD_LIST: ['bn', 'bias', 'ln']
  SHUFFLE: true
DEBUG:
  DEBUG: false
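Only the `MODEL.SPEC` section above is consumed by `cvt13()` in `cvt.py`; a minimal sketch of reading it directly (assuming the repository root as the working directory):

```python
import yaml

with open("config/cvt-13-224x224.yaml", "r") as f:
    cfg = yaml.safe_load(f)

spec = cfg["MODEL"]["SPEC"]
print(spec["NUM_STAGES"])       # 3 stages
print(spec["DIM_EMBED"][-1])    # 384: dimensionality of the extracted features
```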
config/default.py ADDED
1
+ from __future__ import absolute_import
2
+ from __future__ import division
3
+ from __future__ import print_function
4
+
5
+ import os.path as op
6
+ import yaml
7
+ from yacs.config import CfgNode as CN
8
+
9
+ from config import comm
10
+
11
+
12
+ _C = CN()
13
+
14
+ _C.BASE = ['']
15
+ _C.NAME = ''
16
+ _C.DATA_DIR = ''
17
+ _C.DIST_BACKEND = 'nccl'
18
+ _C.GPUS = (0,)
19
+ # _C.LOG_DIR = ''
20
+ _C.MULTIPROCESSING_DISTRIBUTED = True
21
+ _C.OUTPUT_DIR = ''
22
+ _C.PIN_MEMORY = True
23
+ _C.PRINT_FREQ = 20
24
+ _C.RANK = 0
25
+ _C.VERBOSE = True
26
+ _C.WORKERS = 4
27
+ _C.MODEL_SUMMARY = False
28
+
29
+ _C.AMP = CN()
30
+ _C.AMP.ENABLED = False
31
+ _C.AMP.MEMORY_FORMAT = 'nchw'
32
+
33
+ # Cudnn related params
34
+ _C.CUDNN = CN()
35
+ _C.CUDNN.BENCHMARK = True
36
+ _C.CUDNN.DETERMINISTIC = False
37
+ _C.CUDNN.ENABLED = True
38
+
39
+ # common params for NETWORK
40
+ _C.MODEL = CN()
41
+ _C.MODEL.NAME = 'cls_hrnet'
42
+ _C.MODEL.INIT_WEIGHTS = True
43
+ _C.MODEL.PRETRAINED = ''
44
+ _C.MODEL.PRETRAINED_LAYERS = ['*']
45
+ _C.MODEL.NUM_CLASSES = 1000
46
+ _C.MODEL.SPEC = CN(new_allowed=True)
47
+
48
+ _C.LOSS = CN(new_allowed=True)
49
+ _C.LOSS.LABEL_SMOOTHING = 0.0
50
+ _C.LOSS.LOSS = 'softmax'
51
+
52
+ # DATASET related params
53
+ _C.DATASET = CN()
54
+ _C.DATASET.ROOT = ''
55
+ _C.DATASET.DATASET = 'imagenet'
56
+ _C.DATASET.TRAIN_SET = 'train'
57
+ _C.DATASET.TEST_SET = 'val'
58
+ _C.DATASET.DATA_FORMAT = 'jpg'
59
+ _C.DATASET.LABELMAP = ''
60
+ _C.DATASET.TRAIN_TSV_LIST = []
61
+ _C.DATASET.TEST_TSV_LIST = []
62
+ _C.DATASET.SAMPLER = 'default'
63
+
64
+ _C.DATASET.TARGET_SIZE = -1
65
+
66
+ # training data augmentation
67
+ _C.INPUT = CN()
68
+ _C.INPUT.MEAN = [0.485, 0.456, 0.406]
69
+ _C.INPUT.STD = [0.229, 0.224, 0.225]
70
+
71
+ # data augmentation
72
+ _C.AUG = CN()
73
+ _C.AUG.SCALE = (0.08, 1.0)
74
+ _C.AUG.RATIO = (3.0/4.0, 4.0/3.0)
75
+ _C.AUG.COLOR_JITTER = [0.4, 0.4, 0.4, 0.1, 0.0]
76
+ _C.AUG.GRAY_SCALE = 0.0
77
+ _C.AUG.GAUSSIAN_BLUR = 0.0
78
+ _C.AUG.DROPBLOCK_LAYERS = [3, 4]
79
+ _C.AUG.DROPBLOCK_KEEP_PROB = 1.0
80
+ _C.AUG.DROPBLOCK_BLOCK_SIZE = 7
81
+ _C.AUG.MIXUP_PROB = 0.0
82
+ _C.AUG.MIXUP = 0.0
83
+ _C.AUG.MIXCUT = 0.0
84
+ _C.AUG.MIXCUT_MINMAX = []
85
+ _C.AUG.MIXUP_SWITCH_PROB = 0.5
86
+ _C.AUG.MIXUP_MODE = 'batch'
87
+ _C.AUG.MIXCUT_AND_MIXUP = False
88
+ _C.AUG.INTERPOLATION = 2
89
+ _C.AUG.TIMM_AUG = CN(new_allowed=True)
90
+ _C.AUG.TIMM_AUG.USE_LOADER = False
91
+ _C.AUG.TIMM_AUG.USE_TRANSFORM = False
92
+
93
+ # train
94
+ _C.TRAIN = CN()
95
+
96
+ _C.TRAIN.AUTO_RESUME = True
97
+ _C.TRAIN.CHECKPOINT = ''
98
+ _C.TRAIN.LR_SCHEDULER = CN(new_allowed=True)
99
+ _C.TRAIN.SCALE_LR = True
100
+ _C.TRAIN.LR = 0.001
101
+
102
+ _C.TRAIN.OPTIMIZER = 'sgd'
103
+ _C.TRAIN.OPTIMIZER_ARGS = CN(new_allowed=True)
104
+ _C.TRAIN.MOMENTUM = 0.9
105
+ _C.TRAIN.WD = 0.0001
106
+ _C.TRAIN.WITHOUT_WD_LIST = []
107
+ _C.TRAIN.NESTEROV = True
108
+ # for adam
109
+ _C.TRAIN.GAMMA1 = 0.99
110
+ _C.TRAIN.GAMMA2 = 0.0
111
+
112
+ _C.TRAIN.BEGIN_EPOCH = 0
113
+ _C.TRAIN.END_EPOCH = 100
114
+
115
+ _C.TRAIN.IMAGE_SIZE = [224, 224] # width * height, ex: 192 * 256
116
+ _C.TRAIN.BATCH_SIZE_PER_GPU = 32
117
+ _C.TRAIN.SHUFFLE = True
118
+
119
+ _C.TRAIN.EVAL_BEGIN_EPOCH = 0
120
+
121
+ _C.TRAIN.DETECT_ANOMALY = False
122
+
123
+ _C.TRAIN.CLIP_GRAD_NORM = 0.0
124
+ _C.TRAIN.SAVE_ALL_MODELS = False
125
+
126
+ # testing
127
+ _C.TEST = CN()
128
+
129
+ # size of images for each device
130
+ _C.TEST.BATCH_SIZE_PER_GPU = 32
131
+ _C.TEST.CENTER_CROP = True
132
+ _C.TEST.IMAGE_SIZE = [224, 224] # width * height, ex: 192 * 256
133
+ _C.TEST.INTERPOLATION = 2
134
+ _C.TEST.MODEL_FILE = ''
135
+ _C.TEST.REAL_LABELS = False
136
+ _C.TEST.VALID_LABELS = ''
137
+
138
+ _C.FINETUNE = CN()
139
+ _C.FINETUNE.FINETUNE = False
140
+ _C.FINETUNE.USE_TRAIN_AUG = False
141
+ _C.FINETUNE.BASE_LR = 0.003
142
+ _C.FINETUNE.BATCH_SIZE = 512
143
+ _C.FINETUNE.EVAL_EVERY = 3000
144
+ _C.FINETUNE.TRAIN_MODE = True
145
+ # _C.FINETUNE.MODEL_FILE = ''
146
+ _C.FINETUNE.FROZEN_LAYERS = []
147
+ _C.FINETUNE.LR_SCHEDULER = CN(new_allowed=True)
148
+ _C.FINETUNE.LR_SCHEDULER.DECAY_TYPE = 'step'
149
+
150
+ # debug
151
+ _C.DEBUG = CN()
152
+ _C.DEBUG.DEBUG = False
153
+
154
+
155
+ def _update_config_from_file(config, cfg_file):
156
+ config.defrost()
157
+ with open(cfg_file, 'r') as f:
158
+ yaml_cfg = yaml.load(f, Loader=yaml.FullLoader)
159
+
160
+ for cfg in yaml_cfg.setdefault('BASE', ['']):
161
+ if cfg:
162
+ _update_config_from_file(
163
+ config, op.join(op.dirname(cfg_file), cfg)
164
+ )
165
+ print('=> merge config from {}'.format(cfg_file))
166
+ config.merge_from_file(cfg_file)
167
+ config.freeze()
168
+
169
+
170
+ def update_config(config, args):
171
+ _update_config_from_file(config, args.cfg)
172
+
173
+ config.defrost()
174
+ config.merge_from_list(args.opts)
175
+ if config.TRAIN.SCALE_LR:
176
+ config.TRAIN.LR *= comm.world_size
177
+ file_name, _ = op.splitext(op.basename(args.cfg))
178
+ config.NAME = file_name + config.NAME
179
+ config.RANK = comm.rank
180
+
181
+ if 'timm' == config.TRAIN.LR_SCHEDULER.METHOD:
182
+ config.TRAIN.LR_SCHEDULER.ARGS.epochs = config.TRAIN.END_EPOCH
183
+
184
+ if 'timm' == config.TRAIN.OPTIMIZER:
185
+ config.TRAIN.OPTIMIZER_ARGS.lr = config.TRAIN.LR
186
+
187
+ aug = config.AUG
188
+ if aug.MIXUP > 0.0 or aug.MIXCUT > 0.0 or aug.MIXCUT_MINMAX:
189
+ aug.MIXUP_PROB = 1.0
190
+ config.freeze()
191
+
192
+
193
+ def save_config(cfg, path):
194
+ if comm.is_main_process():
195
+ with open(path, 'w') as f:
196
+ f.write(cfg.dump())
197
+
198
+
199
+ if __name__ == '__main__':
200
+ import sys
201
+ with open(sys.argv[1], 'w') as f:
202
+ print(_C, file=f)
cvt.py ADDED
1
+ from functools import partial
2
+ from itertools import repeat
3
+ #from torch._six import container_abcs
4
+ import collections.abc as container_abcs
5
+
6
+ import logging
7
+ import os
8
+ from collections import OrderedDict
9
+
10
+ import numpy as np
11
+ import scipy
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from einops import rearrange
16
+ from einops.layers.torch import Rearrange
17
+
18
+ from timm.models.layers import DropPath, trunc_normal_
19
+
20
+ #from .registry import register_model
21
+ from config import config
22
+
23
+ from torchinfo import summary
24
+ import yaml
25
+
26
+ _model_entrypoints = {}
27
+
28
+
29
+ def register_model(fn):
30
+ module_name_split = fn.__module__.split('.')
31
+ model_name = module_name_split[-1]
32
+
33
+ _model_entrypoints[model_name] = fn
34
+
35
+ return fn
36
+
37
+
38
+ def model_entrypoints(model_name):
39
+ return _model_entrypoints[model_name]
40
+
41
+
42
+ def is_model(model_name):
43
+ return model_name in _model_entrypoints
44
+
45
+
46
+ # From PyTorch internals
47
+ def _ntuple(n):
48
+ def parse(x):
49
+ if isinstance(x, container_abcs.Iterable):
50
+ return x
51
+ return tuple(repeat(x, n))
52
+
53
+ return parse
54
+
55
+
56
+ to_1tuple = _ntuple(1)
57
+ to_2tuple = _ntuple(2)
58
+ to_3tuple = _ntuple(3)
59
+ to_4tuple = _ntuple(4)
60
+ to_ntuple = _ntuple
61
+
62
+
63
+ class LayerNorm(nn.LayerNorm):
64
+ """Subclass torch's LayerNorm to handle fp16."""
65
+
66
+ def forward(self, x: torch.Tensor):
67
+ orig_type = x.dtype
68
+ ret = super().forward(x.type(torch.float32))
69
+ return ret.type(orig_type)
70
+
71
+
72
+ class QuickGELU(nn.Module):
73
+ def forward(self, x: torch.Tensor):
74
+ return x * torch.sigmoid(1.702 * x)
75
+
76
+
77
+ class Mlp(nn.Module):
78
+ def __init__(self,
79
+ in_features,
80
+ hidden_features=None,
81
+ out_features=None,
82
+ act_layer=nn.GELU,
83
+ drop=0.):
84
+ super().__init__()
85
+ out_features = out_features or in_features
86
+ hidden_features = hidden_features or in_features
87
+ self.fc1 = nn.Linear(in_features, hidden_features)
88
+ self.act = act_layer()
89
+ self.fc2 = nn.Linear(hidden_features, out_features)
90
+ self.drop = nn.Dropout(drop)
91
+
92
+ def forward(self, x):
93
+ x = self.fc1(x)
94
+ x = self.act(x)
95
+ x = self.drop(x)
96
+ x = self.fc2(x)
97
+ x = self.drop(x)
98
+ return x
99
+
100
+
101
+ class Attention(nn.Module):
102
+ def __init__(self,
103
+ dim_in,
104
+ dim_out,
105
+ num_heads,
106
+ qkv_bias=False,
107
+ attn_drop=0.,
108
+ proj_drop=0.,
109
+ method='dw_bn',
110
+ kernel_size=3,
111
+ stride_kv=1,
112
+ stride_q=1,
113
+ padding_kv=1,
114
+ padding_q=1,
115
+ with_cls_token=True,
116
+ **kwargs
117
+ ):
118
+ super().__init__()
119
+ self.stride_kv = stride_kv
120
+ self.stride_q = stride_q
121
+ self.dim = dim_out
122
+ self.num_heads = num_heads
123
+ # head_dim = self.qkv_dim // num_heads
124
+ self.scale = dim_out ** -0.5
125
+ self.with_cls_token = with_cls_token
126
+
127
+ self.conv_proj_q = self._build_projection(
128
+ dim_in, dim_out, kernel_size, padding_q,
129
+ stride_q, 'linear' if method == 'avg' else method
130
+ )
131
+ self.conv_proj_k = self._build_projection(
132
+ dim_in, dim_out, kernel_size, padding_kv,
133
+ stride_kv, method
134
+ )
135
+ self.conv_proj_v = self._build_projection(
136
+ dim_in, dim_out, kernel_size, padding_kv,
137
+ stride_kv, method
138
+ )
139
+
140
+ self.proj_q = nn.Linear(dim_in, dim_out, bias=qkv_bias)
141
+ self.proj_k = nn.Linear(dim_in, dim_out, bias=qkv_bias)
142
+ self.proj_v = nn.Linear(dim_in, dim_out, bias=qkv_bias)
143
+
144
+ self.attn_drop = nn.Dropout(attn_drop)
145
+ self.proj = nn.Linear(dim_out, dim_out)
146
+ self.proj_drop = nn.Dropout(proj_drop)
147
+
148
+ def _build_projection(self,
149
+ dim_in,
150
+ dim_out,
151
+ kernel_size,
152
+ padding,
153
+ stride,
154
+ method):
155
+ if method == 'dw_bn':
156
+ proj = nn.Sequential(OrderedDict([
157
+ ('conv', nn.Conv2d(
158
+ dim_in,
159
+ dim_in,
160
+ kernel_size=kernel_size,
161
+ padding=padding,
162
+ stride=stride,
163
+ bias=False,
164
+ groups=dim_in
165
+ )),
166
+ ('bn', nn.BatchNorm2d(dim_in)),
167
+ ('rearrage', Rearrange('b c h w -> b (h w) c')),
168
+ ]))
169
+ elif method == 'avg':
170
+ proj = nn.Sequential(OrderedDict([
171
+ ('avg', nn.AvgPool2d(
172
+ kernel_size=kernel_size,
173
+ padding=padding,
174
+ stride=stride,
175
+ ceil_mode=True
176
+ )),
177
+ ('rearrage', Rearrange('b c h w -> b (h w) c')),
178
+ ]))
179
+ elif method == 'linear':
180
+ proj = None
181
+ else:
182
+ raise ValueError('Unknown method ({})'.format(method))
183
+
184
+ return proj
185
+
186
+ def forward_conv(self, x, h, w):
187
+ if self.with_cls_token:
188
+ cls_token, x = torch.split(x, [1, h*w], 1)
189
+
190
+ x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
191
+
192
+ if self.conv_proj_q is not None:
193
+ q = self.conv_proj_q(x)
194
+ else:
195
+ q = rearrange(x, 'b c h w -> b (h w) c')
196
+
197
+ if self.conv_proj_k is not None:
198
+ k = self.conv_proj_k(x)
199
+ else:
200
+ k = rearrange(x, 'b c h w -> b (h w) c')
201
+
202
+ if self.conv_proj_v is not None:
203
+ v = self.conv_proj_v(x)
204
+ else:
205
+ v = rearrange(x, 'b c h w -> b (h w) c')
206
+
207
+ if self.with_cls_token:
208
+ q = torch.cat((cls_token, q), dim=1)
209
+ k = torch.cat((cls_token, k), dim=1)
210
+ v = torch.cat((cls_token, v), dim=1)
211
+
212
+ return q, k, v
213
+
214
+ def forward(self, x, h, w):
215
+ if (
216
+ self.conv_proj_q is not None
217
+ or self.conv_proj_k is not None
218
+ or self.conv_proj_v is not None
219
+ ):
220
+ q, k, v = self.forward_conv(x, h, w)
221
+
222
+ q = rearrange(self.proj_q(q), 'b t (h d) -> b h t d', h=self.num_heads)
223
+ k = rearrange(self.proj_k(k), 'b t (h d) -> b h t d', h=self.num_heads)
224
+ v = rearrange(self.proj_v(v), 'b t (h d) -> b h t d', h=self.num_heads)
225
+
226
+ attn_score = torch.einsum('bhlk,bhtk->bhlt', [q, k]) * self.scale
227
+ attn = F.softmax(attn_score, dim=-1)
228
+ attn = self.attn_drop(attn)
229
+
230
+ x = torch.einsum('bhlt,bhtv->bhlv', [attn, v])
231
+ x = rearrange(x, 'b h t d -> b t (h d)')
232
+
233
+ x = self.proj(x)
234
+ x = self.proj_drop(x)
235
+
236
+ return x
237
+
238
+ @staticmethod
239
+ def compute_macs(module, input, output):
240
+ # T: num_token
241
+ # S: num_token
242
+ input = input[0]
243
+ flops = 0
244
+
245
+ _, T, C = input.shape
246
+ H = W = int(np.sqrt(T-1)) if module.with_cls_token else int(np.sqrt(T))
247
+
248
+ H_Q = H / module.stride_q
249
+ W_Q = H / module.stride_q
250
+ T_Q = H_Q * W_Q + 1 if module.with_cls_token else H_Q * W_Q
251
+
252
+ H_KV = H / module.stride_kv
253
+ W_KV = W / module.stride_kv
254
+ T_KV = H_KV * W_KV + 1 if module.with_cls_token else H_KV * W_KV
255
+
256
+ # C = module.dim
257
+ # S = T
258
+ # Scaled-dot-product macs
259
+ # [B x T x C] x [B x C x T] --> [B x T x S]
260
+ # multiplication-addition is counted as 1 because operations can be fused
261
+ flops += T_Q * T_KV * module.dim
262
+ # [B x T x S] x [B x S x C] --> [B x T x C]
263
+ flops += T_Q * module.dim * T_KV
264
+
265
+ if (
266
+ hasattr(module, 'conv_proj_q')
267
+ and hasattr(module.conv_proj_q, 'conv')
268
+ ):
269
+ params = sum(
270
+ [
271
+ p.numel()
272
+ for p in module.conv_proj_q.conv.parameters()
273
+ ]
274
+ )
275
+ flops += params * H_Q * W_Q
276
+
277
+ if (
278
+ hasattr(module, 'conv_proj_k')
279
+ and hasattr(module.conv_proj_k, 'conv')
280
+ ):
281
+ params = sum(
282
+ [
283
+ p.numel()
284
+ for p in module.conv_proj_k.conv.parameters()
285
+ ]
286
+ )
287
+ flops += params * H_KV * W_KV
288
+
289
+ if (
290
+ hasattr(module, 'conv_proj_v')
291
+ and hasattr(module.conv_proj_v, 'conv')
292
+ ):
293
+ params = sum(
294
+ [
295
+ p.numel()
296
+ for p in module.conv_proj_v.conv.parameters()
297
+ ]
298
+ )
299
+ flops += params * H_KV * W_KV
300
+
301
+ params = sum([p.numel() for p in module.proj_q.parameters()])
302
+ flops += params * T_Q
303
+ params = sum([p.numel() for p in module.proj_k.parameters()])
304
+ flops += params * T_KV
305
+ params = sum([p.numel() for p in module.proj_v.parameters()])
306
+ flops += params * T_KV
307
+ params = sum([p.numel() for p in module.proj.parameters()])
308
+ flops += params * T
309
+
310
+ module.__flops__ += flops
311
+
312
+
313
+ class Block(nn.Module):
314
+
315
+ def __init__(self,
316
+ dim_in,
317
+ dim_out,
318
+ num_heads,
319
+ mlp_ratio=4.,
320
+ qkv_bias=False,
321
+ drop=0.,
322
+ attn_drop=0.,
323
+ drop_path=0.,
324
+ act_layer=nn.GELU,
325
+ norm_layer=nn.LayerNorm,
326
+ **kwargs):
327
+ super().__init__()
328
+
329
+ self.with_cls_token = kwargs['with_cls_token']
330
+
331
+ self.norm1 = norm_layer(dim_in)
332
+ self.attn = Attention(
333
+ dim_in, dim_out, num_heads, qkv_bias, attn_drop, drop,
334
+ **kwargs
335
+ )
336
+
337
+ self.drop_path = DropPath(drop_path) \
338
+ if drop_path > 0. else nn.Identity()
339
+ self.norm2 = norm_layer(dim_out)
340
+
341
+ dim_mlp_hidden = int(dim_out * mlp_ratio)
342
+ self.mlp = Mlp(
343
+ in_features=dim_out,
344
+ hidden_features=dim_mlp_hidden,
345
+ act_layer=act_layer,
346
+ drop=drop
347
+ )
348
+
349
+ def forward(self, x, h, w):
350
+ res = x
351
+
352
+ x = self.norm1(x)
353
+ attn = self.attn(x, h, w)
354
+ x = res + self.drop_path(attn)
355
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
356
+
357
+ return x
358
+
359
+
360
+ class ConvEmbed(nn.Module):
361
+ """ Image to Conv Embedding
362
+
363
+ """
364
+
365
+ def __init__(self,
366
+ patch_size=7,
367
+ in_chans=1, #1 for spectrogram, 3 for rgb image
368
+ embed_dim=64,
369
+ stride=4,
370
+ padding=2,
371
+ norm_layer=None):
372
+ super().__init__()
373
+ patch_size = to_2tuple(patch_size)
374
+ self.patch_size = patch_size
375
+
376
+ self.proj = nn.Conv2d(
377
+ in_chans, embed_dim,
378
+ kernel_size=patch_size,
379
+ stride=stride,
380
+ padding=padding
381
+ )
382
+ self.norm = norm_layer(embed_dim) if norm_layer else None
383
+
384
+ def forward(self, x):
385
+ x = self.proj(x)
386
+
387
+ B, C, H, W = x.shape
388
+ x = rearrange(x, 'b c h w -> b (h w) c')
389
+ if self.norm:
390
+ x = self.norm(x)
391
+ x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)
392
+
393
+ return x
394
+
395
+
396
+ class VisionTransformer(nn.Module):
397
+ """ Vision Transformer with support for patch or hybrid CNN input stage
398
+ """
399
+ def __init__(self,
400
+ patch_size=16,
401
+ patch_stride=16,
402
+ patch_padding=0,
403
+ in_chans=1, #1for spectrogram, 3 for RGB
404
+ embed_dim=768,
405
+ depth=12,
406
+ num_heads=12,
407
+ mlp_ratio=4.,
408
+ qkv_bias=False,
409
+ drop_rate=0.,
410
+ attn_drop_rate=0.,
411
+ drop_path_rate=0.,
412
+ act_layer=nn.GELU,
413
+ norm_layer=nn.LayerNorm,
414
+ init='trunc_norm',
415
+ **kwargs):
416
+ super().__init__()
417
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
418
+
419
+ self.rearrage = None
420
+
421
+ self.patch_embed = ConvEmbed(
422
+ # img_size=img_size,
423
+ patch_size=patch_size,
424
+ in_chans=in_chans,
425
+ stride=patch_stride,
426
+ padding=patch_padding,
427
+ embed_dim=embed_dim,
428
+ norm_layer=norm_layer
429
+ )
430
+
431
+ with_cls_token = kwargs['with_cls_token']
432
+ if with_cls_token:
433
+ self.cls_token = nn.Parameter(
434
+ torch.zeros(1, 1, embed_dim)
435
+ )
436
+ else:
437
+ self.cls_token = None
438
+
439
+ self.pos_drop = nn.Dropout(p=drop_rate)
440
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
441
+
442
+ blocks = []
443
+ for j in range(depth):
444
+ blocks.append(
445
+ Block(
446
+ dim_in=embed_dim,
447
+ dim_out=embed_dim,
448
+ num_heads=num_heads,
449
+ mlp_ratio=mlp_ratio,
450
+ qkv_bias=qkv_bias,
451
+ drop=drop_rate,
452
+ attn_drop=attn_drop_rate,
453
+ drop_path=dpr[j],
454
+ act_layer=act_layer,
455
+ norm_layer=norm_layer,
456
+ **kwargs
457
+ )
458
+ )
459
+ self.blocks = nn.ModuleList(blocks)
460
+
461
+ if self.cls_token is not None:
462
+ trunc_normal_(self.cls_token, std=.02)
463
+
464
+ if init == 'xavier':
465
+ self.apply(self._init_weights_xavier)
466
+ else:
467
+ self.apply(self._init_weights_trunc_normal)
468
+
469
+ def _init_weights_trunc_normal(self, m):
470
+ if isinstance(m, nn.Linear):
471
+ logging.info('=> init weight of Linear from trunc norm')
472
+ trunc_normal_(m.weight, std=0.02)
473
+ if m.bias is not None:
474
+ logging.info('=> init bias of Linear to zeros')
475
+ nn.init.constant_(m.bias, 0)
476
+ elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
477
+ nn.init.constant_(m.bias, 0)
478
+ nn.init.constant_(m.weight, 1.0)
479
+
480
+ def _init_weights_xavier(self, m):
481
+ if isinstance(m, nn.Linear):
482
+ logging.info('=> init weight of Linear from xavier uniform')
483
+ nn.init.xavier_uniform_(m.weight)
484
+ if m.bias is not None:
485
+ logging.info('=> init bias of Linear to zeros')
486
+ nn.init.constant_(m.bias, 0)
487
+ elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
488
+ nn.init.constant_(m.bias, 0)
489
+ nn.init.constant_(m.weight, 1.0)
490
+
491
+ def forward(self, x):
492
+ x = self.patch_embed(x)
493
+ B, C, H, W = x.size()
494
+
495
+ x = rearrange(x, 'b c h w -> b (h w) c')
496
+
497
+ cls_tokens = None
498
+ if self.cls_token is not None:
499
+ # stole cls_tokens impl from Phil Wang, thanks
500
+ cls_tokens = self.cls_token.expand(B, -1, -1)
501
+ x = torch.cat((cls_tokens, x), dim=1)
502
+
503
+ x = self.pos_drop(x)
504
+
505
+ for i, blk in enumerate(self.blocks):
506
+ x = blk(x, H, W)
507
+
508
+ if self.cls_token is not None:
509
+ cls_tokens, x = torch.split(x, [1, H*W], 1)
510
+ x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)
511
+
512
+ return x, cls_tokens
513
+
514
+
515
+ class ConvolutionalVisionTransformer(nn.Module):
516
+ def __init__(self,
517
+ in_chans=1, #3 for RGB, 1 for Spectrogram
518
+ num_classes=1000,
519
+ act_layer=nn.GELU,
520
+ norm_layer=nn.LayerNorm,
521
+ init='trunc_norm',
522
+ spec=None):
523
+ super().__init__()
524
+ self.num_classes = num_classes
525
+
526
+ self.num_stages = spec['NUM_STAGES']
527
+ for i in range(self.num_stages):
528
+ kwargs = {
529
+ 'patch_size': spec['PATCH_SIZE'][i],
530
+ 'patch_stride': spec['PATCH_STRIDE'][i],
531
+ 'patch_padding': spec['PATCH_PADDING'][i],
532
+ 'embed_dim': spec['DIM_EMBED'][i],
533
+ 'depth': spec['DEPTH'][i],
534
+ 'num_heads': spec['NUM_HEADS'][i],
535
+ 'mlp_ratio': spec['MLP_RATIO'][i],
536
+ 'qkv_bias': spec['QKV_BIAS'][i],
537
+ 'drop_rate': spec['DROP_RATE'][i],
538
+ 'attn_drop_rate': spec['ATTN_DROP_RATE'][i],
539
+ 'drop_path_rate': spec['DROP_PATH_RATE'][i],
540
+ 'with_cls_token': spec['CLS_TOKEN'][i],
541
+ 'method': spec['QKV_PROJ_METHOD'][i],
542
+ 'kernel_size': spec['KERNEL_QKV'][i],
543
+ 'padding_q': spec['PADDING_Q'][i],
544
+ 'padding_kv': spec['PADDING_KV'][i],
545
+ 'stride_kv': spec['STRIDE_KV'][i],
546
+ 'stride_q': spec['STRIDE_Q'][i],
547
+ }
548
+
549
+ stage = VisionTransformer(
550
+ in_chans=in_chans,
551
+ init=init,
552
+ act_layer=act_layer,
553
+ norm_layer=norm_layer,
554
+ **kwargs
555
+ )
556
+ setattr(self, f'stage{i}', stage)
557
+
558
+ in_chans = spec['DIM_EMBED'][i]
559
+
560
+ dim_embed = spec['DIM_EMBED'][-1]
561
+ self.norm = norm_layer(dim_embed)
562
+ self.cls_token = spec['CLS_TOKEN'][-1]
563
+
564
+ # Classifier head
565
+ #self.head = nn.Linear(dim_embed, num_classes) if num_classes > 0 else nn.Identity()
566
+ #trunc_normal_(self.head.weight, std=0.02)
567
+ self.head = nn.Identity()
568
+
569
+
570
+
571
+ def init_weights(self, pretrained='', pretrained_layers=[], verbose=True):
572
+ if os.path.isfile(pretrained):
573
+ pretrained_dict = torch.load(pretrained, map_location='cpu')
574
+ logging.info(f'=> loading pretrained model {pretrained}')
575
+ model_dict = self.state_dict()
576
+ pretrained_dict = {
577
+ k: v for k, v in pretrained_dict.items()
578
+ if k in model_dict.keys()
579
+ }
580
+ need_init_state_dict = {}
581
+ for k, v in pretrained_dict.items():
582
+ need_init = (
583
+ k.split('.')[0] in pretrained_layers
584
+ #or pretrained_layers[0] is '*'
585
+ or pretrained_layers[0] == '*'
586
+ )
587
+ if need_init:
588
+ if verbose:
589
+ logging.info(f'=> init {k} from {pretrained}')
590
+ if 'pos_embed' in k and v.size() != model_dict[k].size():
591
+ size_pretrained = v.size()
592
+ size_new = model_dict[k].size()
593
+ logging.info(
594
+ '=> load_pretrained: resized variant: {} to {}'
595
+ .format(size_pretrained, size_new)
596
+ )
597
+
598
+ ntok_new = size_new[1]
599
+ ntok_new -= 1
600
+
601
+ posemb_tok, posemb_grid = v[:, :1], v[0, 1:]
602
+
603
+ gs_old = int(np.sqrt(len(posemb_grid)))
604
+ gs_new = int(np.sqrt(ntok_new))
605
+
606
+ logging.info(
607
+ '=> load_pretrained: grid-size from {} to {}'
608
+ .format(gs_old, gs_new)
609
+ )
610
+
611
+ posemb_grid = posemb_grid.reshape(gs_old, gs_old, -1)
612
+ zoom = (gs_new / gs_old, gs_new / gs_old, 1)
613
+ posemb_grid = scipy.ndimage.zoom(
614
+ posemb_grid, zoom, order=1
615
+ )
616
+ posemb_grid = posemb_grid.reshape(1, gs_new ** 2, -1)
617
+ v = torch.tensor(
618
+ np.concatenate([posemb_tok, posemb_grid], axis=1)
619
+ )
620
+
621
+ need_init_state_dict[k] = v
622
+ self.load_state_dict(need_init_state_dict, strict=False)
623
+
624
+ @torch.jit.ignore
625
+ def no_weight_decay(self):
626
+ layers = set()
627
+ for i in range(self.num_stages):
628
+ layers.add(f'stage{i}.pos_embed')
629
+ layers.add(f'stage{i}.cls_token')
630
+
631
+ return layers
632
+
633
+ def forward_features(self, x):
634
+ for i in range(self.num_stages):
635
+ x, cls_tokens = getattr(self, f'stage{i}')(x)
636
+
637
+ if self.cls_token:
638
+ x = self.norm(cls_tokens)
639
+ #x = cls_tokens
640
+ x = torch.squeeze(x)
641
+ else:
642
+ x = rearrange(x, 'b c h w -> b (h w) c')
643
+ x = self.norm(x)
644
+ x = torch.mean(x, dim=1)
645
+
646
+ return x
647
+
648
+ def forward(self, x):
649
+ x = self.forward_features(x)
650
+ x = self.head(x)
651
+
652
+ return x
653
+
654
+
655
+ @register_model
656
+ def get_cls_model(**kwargs):
657
+ msvit_spec = config.MODEL.SPEC
658
+ msvit = ConvolutionalVisionTransformer(
659
+ in_chans=1, #1 for spectrogram 3 for RGB
660
+ num_classes=config.MODEL.NUM_CLASSES,
661
+ act_layer=QuickGELU,
662
+ norm_layer=partial(LayerNorm, eps=1e-5),
663
+ init=getattr(msvit_spec, 'INIT', 'trunc_norm'),
664
+ spec=msvit_spec
665
+ )
666
+
667
+ # if config.MODEL.INIT_WEIGHTS:
668
+ # msvit.init_weights(
669
+ # config.MODEL.PRETRAINED,
670
+ # config.MODEL.PRETRAINED_LAYERS,
671
+ # config.VERBOSE
672
+ # )
673
+
674
+ return msvit
675
+
676
+ def build_model(config, **kwargs):
677
+ model_name = config.MODEL.NAME
678
+ if not is_model(model_name):
679
+ raise ValueError(f'Unkown model: {model_name}')
680
+
681
+ return model_entrypoints(model_name)(config, **kwargs)
682
+
683
+ def cvt13(**kwargs):
684
+ f = open('config/cvt-13-224x224.yaml', 'r')
685
+ config = yaml.safe_load(f)
686
+ return ConvolutionalVisionTransformer(spec=config['MODEL']['SPEC']) # only loades the config, no pretraining
687
+
688
+ if __name__ == '__main__':
689
+ f = open('config/cvt-13-224x224.yaml', 'r')
690
+ config = yaml.safe_load(f)
691
+ model = ConvolutionalVisionTransformer(spec=config['MODEL']['SPEC'])
692
+ print(summary(model))
693
+ quit()
694
+ print(summary(model, input_size=(4, 1, 128, 301)))
melspectrogram.py ADDED
from torchaudio import transforms as T
import torch
import torch.nn as nn

MEAN, STD = 0.5347, 0.0772  # Xeno-Canto training-set statistics
SR = 16000
NFFT = 1024
HOPLEN = 320
NMELS = 128
FMIN = 50
FMAX = 8000


class Normalization(torch.nn.Module):
    """Min-max normalize a spectrogram to [0, 1]."""
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return (x - x.min()) / (x.max() - x.min())


class Standardization(torch.nn.Module):
    """Standardize with the dataset mean and standard deviation."""
    def __init__(self, mean, std):
        super().__init__()

        self.mean = mean
        self.std = std

    def forward(self, x):
        return (x - self.mean) / self.std


class MelSpectrogramProcessor:
    """Convert a 16 kHz waveform into a normalized, standardized log-Mel spectrogram."""
    def __init__(self, sample_rate=SR, n_mels=NMELS, n_fft=NFFT, hop_length=HOPLEN, f_min=FMIN, f_max=FMAX):
        self.transform = nn.Sequential(
            T.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length, f_min=f_min, f_max=f_max),
            T.AmplitudeToDB(),
            Normalization(),
            Standardization(mean=MEAN, std=STD),
        )

    def process(self, waveform):
        return self.transform(waveform)
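With these settings (hop length 320 at 16 kHz), a 6-second clip of 96,000 samples maps to 128 Mel bands by 301 frames; a quick sanity check of the shape the model expects (the dummy input is illustrative):

```python
import torch
from melspectrogram import MelSpectrogramProcessor

processor = MelSpectrogramProcessor()
dummy = torch.randn(1, 96000)   # one mono channel, 6 s at 16 kHz
spec = processor.process(dummy)
print(spec.shape)               # expected: torch.Size([1, 128, 301])
```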
protoclr.pth ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:b7eaf62e2f66084f50cbb9677420a3a93b81334fadddeb4eb9790e734f4a514f
size 78717724
requirements.txt ADDED
einops==0.8.0
numpy==2.1.3
PyYAML==6.0.2
scipy==1.14.1
timm==1.0.11
torch==2.2.2
torchaudio==2.2.2
torchinfo==1.8.0
yacs==0.1.8