add mae model (#15)
* add mae model
* push to hugging face
* minor cleanup
* more changes
* set mask ratio to 0
* successful validation
* rearrange
* remove testing code
* remove unused function
* add a predict method
* update to correct version of phenom-beta
* add Kian's PR suggestion
* add test
* add comment to download model
* add multiple channel test
* allow channelwise embs
* clean some dead code
* add reconstruction notebook with example. can run on CPU no prob
* fix up
* update notebook
* remove the need for hydra
---------
Co-authored-by: Laksh <laksh.arumugam@recursionpharma.com>
Co-authored-by: kian-kd <kian.kd@recursionpharma.com>
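A minimal usage sketch of the embedding API added in this PR (not part of the diff). It assumes the phenom-beta checkpoint and config have been downloaded to models/phenom_beta_huggingface, as described in test_huggingface_mae.py, and uses the new predict method and return_channelwise_embeddings option; the 384-dim pooled size matches the test's expected_output_dim.

import torch

from huggingface_mae import MAEModel

# Download first (see test_huggingface_mae.py):
#   huggingface-cli download recursionpharma/test-pb-model --local-dir=models/phenom_beta_huggingface
model = MAEModel.from_pretrained("models/phenom_beta_huggingface")
model.eval()

# A 6-channel 256x256 uint8 image batch, as in the tests
imgs = torch.randint(0, 255, size=(2, 6, 256, 256), dtype=torch.uint8)

with torch.no_grad():
    model.return_channelwise_embeddings = False
    pooled = model.predict(imgs)        # shape (2, 384): mean over all patch tokens
    model.return_channelwise_embeddings = True
    per_channel = model.predict(imgs)   # shape (2, 6 * 384): one pooled vector per channel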
- .gitignore +32 -0
- generate_reconstructions.ipynb +0 -0
- huggingface_mae.py +293 -0
- mae_modules.py +2 -2
- models/phenom_beta_huggingface/config.json +85 -0
- normalizer.py +7 -0
- requirements.in +14 -0
- requirements.txt +213 -9
- sample/AA41_s1_1.jp2 +0 -0
- sample/AA41_s1_2.jp2 +0 -0
- sample/AA41_s1_3.jp2 +0 -0
- sample/AA41_s1_4.jp2 +0 -0
- sample/AA41_s1_5.jp2 +0 -0
- sample/AA41_s1_6.jp2 +0 -0
- test_huggingface_mae.py +32 -0
.gitignore
ADDED
@@ -0,0 +1,32 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# model artifacts
*.pickle
*.ckpt
*.safetensors
generate_reconstructions.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
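Since the notebook's diff is not rendered here, the following is only a rough sketch of the kind of reconstruction pass it demonstrates (assuming the checkpoint has been downloaded locally; this is not the notebook's actual contents):

import torch

from huggingface_mae import MAEModel

# Assumes the weights were downloaded to this directory (see test_huggingface_mae.py)
model = MAEModel.from_pretrained("models/phenom_beta_huggingface")
model.eval()

imgs = torch.randint(0, 255, size=(1, 6, 256, 256), dtype=torch.uint8)

with torch.no_grad():
    # forward() normalizes the input, masks mask_ratio of the patch tokens
    # (0.0 in the shipped config), and decodes the tokens back to pixel patches
    latent, reconstruction, mask = model(imgs)

# reconstruction is patch-flattened, (N, num_tokens, patch_size**2); the
# unflattening utilities in mae_utils.py recover image-shaped tensors
print(latent.shape, reconstruction.shape, mask.shape)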
huggingface_mae.py
ADDED
@@ -0,0 +1,293 @@
from typing import Dict, Tuple, Union

import torch
import torch.nn as nn

from transformers import PretrainedConfig, PreTrainedModel

from loss import FourierLoss
from normalizer import Normalizer
from mae_modules import CAMAEDecoder, MAEDecoder, MAEEncoder
from mae_utils import flatten_images
from vit import (
    generate_2d_sincos_pos_embeddings,
    sincos_positional_encoding_vit,
    vit_small_patch16_256,
)

TensorDict = Dict[str, torch.Tensor]


class MAEConfig(PretrainedConfig):
    model_type = "MAE"

    def __init__(
        self,
        mask_ratio=0.75,
        encoder=None,
        decoder=None,
        loss=None,
        optimizer=None,
        input_norm=None,
        fourier_loss=None,
        fourier_loss_weight=0.0,
        lr_scheduler=None,
        use_MAE_weight_init=False,
        crop_size=-1,
        mask_fourier_loss=True,
        return_channelwise_embeddings=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.mask_ratio = mask_ratio
        self.encoder = encoder
        self.decoder = decoder
        self.loss = loss
        self.optimizer = optimizer
        self.input_norm = input_norm
        self.fourier_loss = fourier_loss
        self.fourier_loss_weight = fourier_loss_weight
        self.lr_scheduler = lr_scheduler
        self.use_MAE_weight_init = use_MAE_weight_init
        self.crop_size = crop_size
        self.mask_fourier_loss = mask_fourier_loss
        self.return_channelwise_embeddings = return_channelwise_embeddings


class MAEModel(PreTrainedModel):
    config_class = MAEConfig

    # Loss metrics
    TOTAL_LOSS = "loss"
    RECON_LOSS = "reconstruction_loss"
    FOURIER_LOSS = "fourier_loss"

    def __init__(self, config: MAEConfig):
        super().__init__(config)

        self.mask_ratio = config.mask_ratio

        # Could use Hydra to instantiate instead
        self.encoder = MAEEncoder(
            vit_backbone=sincos_positional_encoding_vit(
                vit_backbone=vit_small_patch16_256(global_pool="avg")
            ),
            max_in_chans=11,  # upper limit on number of input channels
            channel_agnostic=True,
        )
        self.decoder = CAMAEDecoder(
            depth=8,
            embed_dim=512,
            mlp_ratio=4,
            norm_layer=nn.LayerNorm,
            num_heads=16,
            num_modalities=6,
            qkv_bias=True,
            tokens_per_modality=256,
        )
        self.input_norm = torch.nn.Sequential(
            Normalizer(),
            nn.InstanceNorm2d(None, affine=False, track_running_stats=False),
        )

        self.fourier_loss_weight = config.fourier_loss_weight
        self.mask_fourier_loss = config.mask_fourier_loss
        self.return_channelwise_embeddings = config.return_channelwise_embeddings
        self.tokens_per_channel = 256  # hardcode the number of tokens per channel since we are patch16 crop 256

        # loss stuff
        self.loss = torch.nn.MSELoss(reduction="none")

        self.fourier_loss = FourierLoss(num_multimodal_modalities=6)
        if self.fourier_loss_weight > 0 and self.fourier_loss is None:
            raise ValueError(
                "FourierLoss weight is activated but no fourier_loss was defined in constructor"
            )
        elif self.fourier_loss_weight >= 1:
            raise ValueError(
                "FourierLoss weight is too large to do mixing factor, weight should be < 1"
            )

        self.patch_size = int(self.encoder.vit_backbone.patch_embed.patch_size[0])

        # projection layer between the encoder and decoder
        self.encoder_decoder_proj = nn.Linear(
            self.encoder.embed_dim, self.decoder.embed_dim, bias=True
        )

        self.decoder_pred = nn.Linear(
            self.decoder.embed_dim,
            self.patch_size**2
            * (1 if self.encoder.channel_agnostic else self.in_chans),
            bias=True,
        )  # linear layer from decoder embedding to input dims

        # overwrite decoder pos embeddings based on encoder params
        self.decoder.pos_embeddings = generate_2d_sincos_pos_embeddings(  # type: ignore[assignment]
            self.decoder.embed_dim,
            length=self.encoder.vit_backbone.patch_embed.grid_size[0],
            use_class_token=self.encoder.vit_backbone.cls_token is not None,
            num_modality=(
                self.decoder.num_modalities if self.encoder.channel_agnostic else 1
            ),
        )

        if config.use_MAE_weight_init:
            w = self.encoder.vit_backbone.patch_embed.proj.weight.data
            torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

            torch.nn.init.normal_(self.encoder.vit_backbone.cls_token, std=0.02)
            torch.nn.init.normal_(self.decoder.mask_token, std=0.02)

            self.apply(self._MAE_init_weights)

    def setup(self, stage: str) -> None:
        super().setup(stage)

    def _MAE_init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @staticmethod
    def decode_to_reconstruction(
        encoder_latent: torch.Tensor,
        ind_restore: torch.Tensor,
        proj: torch.nn.Module,
        decoder: MAEDecoder | CAMAEDecoder,
        pred: torch.nn.Module,
    ) -> torch.Tensor:
        """Feed forward the encoder latent through the decoders necessary projections and transformations."""
        decoder_latent_projection = proj(
            encoder_latent
        )  # projection from encoder.embed_dim to decoder.embed_dim
        decoder_tokens = decoder.forward_masked(
            decoder_latent_projection, ind_restore
        )  # decoder.embed_dim output
        predicted_reconstruction = pred(
            decoder_tokens
        )  # linear projection to input dim
        return predicted_reconstruction[:, 1:, :]  # drop class token

    def forward(
        self, imgs: torch.Tensor, constant_noise: Union[torch.Tensor, None] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        imgs = self.input_norm(imgs)
        latent, mask, ind_restore = self.encoder.forward_masked(
            imgs, self.mask_ratio, constant_noise
        )  # encoder blocks
        reconstruction = self.decode_to_reconstruction(
            latent,
            ind_restore,
            self.encoder_decoder_proj,
            self.decoder,
            self.decoder_pred,
        )
        return latent, reconstruction, mask

    def compute_MAE_loss(
        self,
        reconstruction: torch.Tensor,
        img: torch.Tensor,
        mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, Dict[str, float]]:
        """Computes final loss and returns specific values of component losses for metric reporting."""
        loss_dict = {}
        img = self.input_norm(img)
        target_flattened = flatten_images(
            img,
            patch_size=self.patch_size,
            channel_agnostic=self.encoder.channel_agnostic,
        )

        loss: torch.Tensor = self.loss(
            reconstruction, target_flattened
        )  # should be with MSE or MAE (L1) with reduction='none'
        loss = loss.mean(
            dim=-1
        )  # average over embedding dim -> mean loss per patch (N,L)
        loss = (loss * mask).sum() / mask.sum()  # mean loss on masked patches only
        loss_dict[self.RECON_LOSS] = loss.item()

        # compute fourier loss
        if self.fourier_loss_weight > 0:
            floss: torch.Tensor = self.fourier_loss(reconstruction, target_flattened)
            if not self.mask_fourier_loss:
                floss = floss.mean()
            else:
                floss = floss.mean(dim=-1)
                floss = (floss * mask).sum() / mask.sum()

            loss_dict[self.FOURIER_LOSS] = floss.item()

        # here we use a mixing factor to keep the loss magnitude appropriate with fourier
        if self.fourier_loss_weight > 0:
            loss = (1 - self.fourier_loss_weight) * loss + (
                self.fourier_loss_weight * floss
            )
        return loss, loss_dict

    def training_step(self, batch: TensorDict, batch_idx: int) -> TensorDict:
        img = batch["pixels"]
        latent, reconstruction, mask = self(img.clone())
        full_loss, loss_dict = self.compute_MAE_loss(reconstruction, img.float(), mask)
        return {
            "loss": full_loss,
            **loss_dict,  # type: ignore[dict-item]
        }

    def validation_step(self, batch: TensorDict, batch_idx: int) -> TensorDict:
        return self.training_step(batch, batch_idx)

    def update_metrics(self, outputs: TensorDict, batch: TensorDict) -> None:
        self.metrics["lr"].update(value=self.lr_scheduler.get_last_lr())
        for key, value in outputs.items():
            if key.endswith("loss"):
                self.metrics[key].update(value)

    def on_validation_batch_end(  # type: ignore[override]
        self,
        outputs: TensorDict,
        batch: TensorDict,
        batch_idx: int,
        dataloader_idx: int = 0,
    ) -> None:
        super().on_validation_batch_end(outputs, batch, batch_idx, dataloader_idx)

    def predict(self, imgs: torch.Tensor) -> torch.Tensor:
        imgs = self.input_norm(imgs)
        X = self.encoder.vit_backbone.forward_features(
            imgs
        )  # 3d tensor N x num_tokens x dim
        if self.return_channelwise_embeddings:
            N, _, d = X.shape
            num_channels = imgs.shape[1]
            X_reshaped = X[:, 1:, :].view(N, num_channels, self.tokens_per_channel, d)
            pooled_segments = X_reshaped.mean(
                dim=2
            )  # Resulting shape: (N, num_channels, d)
            latent = pooled_segments.view(N, num_channels * d).contiguous()
        else:
            latent = X[:, 1:, :].mean(dim=1)  # 1 + 256 * C tokens
        return latent

    def save_pretrained(self, save_directory: str, **kwargs):
        filename = kwargs.pop("filename", "model.safetensors")
        modelpath = f"{save_directory}/{filename}"
        self.config.save_pretrained(save_directory)
        torch.save({"state_dict": self.state_dict()}, modelpath)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        filename = kwargs.pop("filename", "model.safetensors")

        modelpath = f"{pretrained_model_name_or_path}/{filename}"
        config = MAEConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        state_dict = torch.load(modelpath, map_location="cpu")
        model = cls(config, *model_args, **kwargs)
        model.load_state_dict(state_dict["state_dict"])
        return model
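For reference, a sketch of how the overridden save/load pair above round-trips (directory names are hypothetical). Note that despite the "model.safetensors" default filename, save_pretrained writes a torch.save checkpoint ({"state_dict": ...}) next to config.json, and from_pretrained reads it back with torch.load.

from huggingface_mae import MAEModel

# Assumes the original checkpoint was already downloaded locally
model = MAEModel.from_pretrained("models/phenom_beta_huggingface")
model.save_pretrained("models/phenom_beta_copy")      # writes config.json + checkpoint
reloaded = MAEModel.from_pretrained("models/phenom_beta_copy")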
mae_modules.py
CHANGED
@@ -7,8 +7,8 @@ import torch.nn as nn
 from timm.models.helpers import checkpoint_seq
 from timm.models.vision_transformer import Block, Mlp, VisionTransformer
 
-from
-from
+from masking import transformer_random_masking
+from vit import channel_agnostic_vit
 
 # If interested in training new MAEs, combine an encoder and decoder into a new module, and you should
 # leverage the flattening and unflattening utilities as needed from mae_utils.py.
models/phenom_beta_huggingface/config.json
ADDED
@@ -0,0 +1,85 @@
{
  "_attn_implementation_autoset": true,
  "apply_loss_unmasked": false,
  "architectures": [
    "MAEModel"
  ],
  "crop_size": -1,
  "decoder": {
    "_target_": "mae_modules.CAMAEDecoder",
    "depth": 8,
    "embed_dim": 512,
    "mlp_ratio": 4,
    "norm_layer": {
      "_partial_": true,
      "_target_": "torch.nn.LayerNorm",
      "eps": 1e-06
    },
    "num_heads": 16,
    "num_modalities": 6,
    "qkv_bias": true,
    "tokens_per_modality": 256
  },
  "encoder": {
    "_target_": "mae_modules.MAEEncoder",
    "channel_agnostic": true,
    "max_in_chans": 11,
    "vit_backbone": {
      "_target_": "vit.sincos_positional_encoding_vit",
      "vit_backbone": {
        "_target_": "vit.vit_small_patch16_256",
        "global_pool": "avg"
      }
    }
  },
  "fourier_loss": {
    "_target_": "loss.FourierLoss",
    "num_multimodal_modalities": 6
  },
  "fourier_loss_weight": 0.0,
  "input_norm": {
    "_args_": [
      {
        "_target_": "normalizer.Normalizer"
      },
      {
        "_target_": "torch.nn.InstanceNorm2d",
        "affine": false,
        "num_features": null,
        "track_running_stats": false
      }
    ],
    "_target_": "torch.nn.Sequential"
  },
  "layernorm_unfreeze": true,
  "loss": {
    "_target_": "torch.nn.MSELoss",
    "reduction": "none"
  },
  "lr_scheduler": {
    "_partial_": true,
    "_target_": "torch.optim.lr_scheduler.OneCycleLR",
    "anneal_strategy": "cos",
    "max_lr": 0.0001,
    "pct_start": 0.1
  },
  "mask_fourier_loss": true,
  "mask_ratio": 0.0,
  "model_type": "MAE",
  "norm_pix_loss": false,
  "num_blocks_to_freeze": 0,
  "optimizer": {
    "_partial_": true,
    "_target_": "timm.optim.lion.Lion",
    "betas": [
      0.9,
      0.95
    ],
    "lr": 0.0001,
    "weight_decay": 0.05
  },
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "trim_encoder_blocks": null,
  "use_MAE_weight_init": false
}
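A short sketch of how this config is consumed, assuming it has been downloaded locally to the path used in the tests. The Hydra-style "_target_" blocks are carried over from training for reference; per the "remove the need for hydra" change, MAEModel.__init__ hardcodes the same encoder/decoder settings instead of instantiating them from the config.

from huggingface_mae import MAEConfig, MAEModel

config = MAEConfig.from_pretrained("models/phenom_beta_huggingface")
print(config.mask_ratio)   # 0.0 -> no patches are masked at inference time
model = MAEModel(config)   # builds the architecture with fresh (random) weights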
normalizer.py
ADDED
@@ -0,0 +1,7 @@
import torch


class Normalizer(torch.nn.Module):
    def forward(self, pixels: torch.Tensor) -> torch.Tensor:
        pixels = pixels.float()
        return pixels / 255.0
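A quick sketch of the two-stage input normalization this module is part of, mirroring MAEModel.input_norm above (uint8 pixels are cast to float and scaled to [0, 1], then instance-normalized per image and channel):

import torch
import torch.nn as nn

from normalizer import Normalizer

input_norm = nn.Sequential(
    Normalizer(),
    nn.InstanceNorm2d(None, affine=False, track_running_stats=False),
)
x = torch.randint(0, 255, size=(1, 6, 256, 256), dtype=torch.uint8)
y = input_norm(x)
print(y.mean().item(), y.std().item())  # roughly 0 and 1 after normalization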
requirements.in
ADDED
@@ -0,0 +1,14 @@
huggingface-hub
timm
torch>=2.3
torchmetrics
torchvision
tqdm
transformers
xformers
zarr
hydra-core
pytorch-lightning>=2.1
isort
ruff
pytest
requirements.txt
CHANGED
@@ -1,9 +1,213 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
#    pip-compile --no-emit-index-url --output-file=requirements.txt requirements.in
#
--trusted-host pypi.ngc.nvidia.com

aiohappyeyeballs==2.4.3
    # via aiohttp
aiohttp==3.10.10
    # via fsspec
aiosignal==1.3.1
    # via aiohttp
antlr4-python3-runtime==4.9.3
    # via
    #   hydra-core
    #   omegaconf
asciitree==0.3.3
    # via zarr
async-timeout==4.0.3
    # via aiohttp
attrs==24.2.0
    # via aiohttp
certifi==2024.8.30
    # via requests
charset-normalizer==3.4.0
    # via requests
exceptiongroup==1.2.2
    # via pytest
fasteners==0.19
    # via zarr
filelock==3.16.1
    # via
    #   huggingface-hub
    #   torch
    #   transformers
    #   triton
frozenlist==1.4.1
    # via
    #   aiohttp
    #   aiosignal
fsspec[http]==2024.10.0
    # via
    #   huggingface-hub
    #   pytorch-lightning
    #   torch
huggingface-hub==0.26.1
    # via
    #   -r requirements.in
    #   timm
    #   tokenizers
    #   transformers
hydra-core==1.3.2
    # via -r requirements.in
idna==3.10
    # via
    #   requests
    #   yarl
iniconfig==2.0.0
    # via pytest
isort==5.13.2
    # via -r requirements.in
jinja2==3.1.4
    # via torch
lightning-utilities==0.11.8
    # via
    #   pytorch-lightning
    #   torchmetrics
markupsafe==3.0.2
    # via jinja2
mpmath==1.3.0
    # via sympy
multidict==6.1.0
    # via
    #   aiohttp
    #   yarl
networkx==3.4.2
    # via torch
numcodecs==0.13.1
    # via zarr
numpy==1.26.4
    # via
    #   numcodecs
    #   torchmetrics
    #   torchvision
    #   transformers
    #   xformers
    #   zarr
nvidia-cublas-cu12==12.4.5.8
    # via
    #   nvidia-cudnn-cu12
    #   nvidia-cusolver-cu12
    #   torch
nvidia-cuda-cupti-cu12==12.4.127
    # via torch
nvidia-cuda-nvrtc-cu12==12.4.127
    # via torch
nvidia-cuda-runtime-cu12==12.4.127
    # via torch
nvidia-cudnn-cu12==9.1.0.70
    # via torch
nvidia-cufft-cu12==11.2.1.3
    # via torch
nvidia-curand-cu12==10.3.5.147
    # via torch
nvidia-cusolver-cu12==11.6.1.9
    # via torch
nvidia-cusparse-cu12==12.3.1.170
    # via
    #   nvidia-cusolver-cu12
    #   torch
nvidia-nccl-cu12==2.21.5
    # via torch
nvidia-nvjitlink-cu12==12.4.127
    # via
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
nvidia-nvtx-cu12==12.4.127
    # via torch
omegaconf==2.3.0
    # via hydra-core
packaging==24.1
    # via
    #   huggingface-hub
    #   hydra-core
    #   lightning-utilities
    #   pytest
    #   pytorch-lightning
    #   torchmetrics
    #   transformers
pillow==11.0.0
    # via torchvision
pluggy==1.5.0
    # via pytest
propcache==0.2.0
    # via yarl
pytest==8.3.3
    # via -r requirements.in
pytorch-lightning==2.4.0
    # via -r requirements.in
pyyaml==6.0.2
    # via
    #   huggingface-hub
    #   omegaconf
    #   pytorch-lightning
    #   timm
    #   transformers
regex==2024.9.11
    # via transformers
requests==2.32.3
    # via
    #   huggingface-hub
    #   transformers
ruff==0.7.0
    # via -r requirements.in
safetensors==0.4.5
    # via
    #   timm
    #   transformers
sympy==1.13.1
    # via torch
timm==1.0.11
    # via -r requirements.in
tokenizers==0.20.1
    # via transformers
tomli==2.0.2
    # via pytest
torch==2.5.0
    # via
    #   -r requirements.in
    #   pytorch-lightning
    #   timm
    #   torchmetrics
    #   torchvision
    #   xformers
torchmetrics==1.5.0
    # via
    #   -r requirements.in
    #   pytorch-lightning
torchvision==0.20.0
    # via
    #   -r requirements.in
    #   timm
tqdm==4.66.5
    # via
    #   -r requirements.in
    #   huggingface-hub
    #   pytorch-lightning
    #   transformers
transformers==4.45.2
    # via -r requirements.in
triton==3.1.0
    # via torch
typing-extensions==4.12.2
    # via
    #   huggingface-hub
    #   lightning-utilities
    #   multidict
    #   pytorch-lightning
    #   torch
urllib3==2.2.3
    # via requests
xformers==0.0.28.post2
    # via -r requirements.in
yarl==1.16.0
    # via aiohttp
zarr==2.18.3
    # via -r requirements.in

# The following packages are considered to be unsafe in a requirements file:
# setuptools
sample/AA41_s1_1.jp2
ADDED
sample/AA41_s1_2.jp2
ADDED
sample/AA41_s1_3.jp2
ADDED
sample/AA41_s1_4.jp2
ADDED
sample/AA41_s1_5.jp2
ADDED
sample/AA41_s1_6.jp2
ADDED
test_huggingface_mae.py
ADDED
@@ -0,0 +1,32 @@
import pytest
import torch

from huggingface_mae import MAEModel

huggingface_phenombeta_model_dir = "models/phenom_beta_huggingface"
# huggingface_modelpath = "recursionpharma/test-pb-model"


@pytest.fixture
def huggingface_model():
    # Make sure you have the model/config downloaded from https://huggingface.co/recursionpharma/test-pb-model to this directory
    # huggingface-cli download recursionpharma/test-pb-model --local-dir=models/phenom_beta_huggingface
    huggingface_model = MAEModel.from_pretrained(huggingface_phenombeta_model_dir)
    huggingface_model.eval()
    return huggingface_model


@pytest.mark.parametrize("C", [1, 4, 6, 11])
@pytest.mark.parametrize("return_channelwise_embeddings", [True, False])
def test_model_predict(huggingface_model, C, return_channelwise_embeddings):
    example_input_array = torch.randint(
        low=0,
        high=255,
        size=(2, C, 256, 256),
        dtype=torch.uint8,
        device=huggingface_model.device,
    )
    huggingface_model.return_channelwise_embeddings = return_channelwise_embeddings
    embeddings = huggingface_model.predict(example_input_array)
    expected_output_dim = 384 * C if return_channelwise_embeddings else 384
    assert embeddings.shape == (2, expected_output_dim)
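With the two parametrize decorators, pytest expands test_model_predict into eight cases (C in {1, 4, 6, 11}, with and without channelwise pooling). The fixture expects the checkpoint to already be present in models/phenom_beta_huggingface, which the huggingface-cli command in the comment downloads.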