din0s committed on
Commit
d4ab5ac
1 Parent(s): d395a3a
.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ checkpoints/diffmask.ckpt filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,63 @@
+ import sys
+ sys.path.insert(0, './code')
+
+ from datamodules.transformations import UnNest
+ from models.interpretation import ImageInterpretationNet
+ from transformers import ViTFeatureExtractor, ViTForImageClassification
+ from utils.plot import smoothen, draw_mask_on_image, draw_heatmap_on_image
+
+ import gradio as gr
+ import numpy as np
+ import torch
+
+ # Load Vision Transformer
+ hf_model = "tanlq/vit-base-patch16-224-in21k-finetuned-cifar10"
+ vit = ViTForImageClassification.from_pretrained(hf_model)
+ vit.eval()
+
+ # Load Feature Extractor
+ feature_extractor = ViTFeatureExtractor.from_pretrained(hf_model, return_tensors="pt")
+ feature_extractor = UnNest(feature_extractor)
+
+ # Load Vision DiffMask
+ diffmask = ImageInterpretationNet.load_from_checkpoint('checkpoints/diffmask.ckpt')
+ diffmask.set_vision_transformer(vit)
+
+
+ # Define mask plotting functions
+ def draw_mask(image, mask):
+     return draw_mask_on_image(image, smoothen(mask))\
+         .permute(1, 2, 0)\
+         .clip(0, 1)\
+         .numpy()
+
+
+ def draw_heatmap(image, mask):
+     return draw_heatmap_on_image(image, smoothen(mask))\
+         .permute(1, 2, 0)\
+         .clip(0, 1)\
+         .numpy()
+
+
+ # Define callable method for the demo
+ def get_mask(image):
+     if image is None:
+         return None
+
+     image = torch.from_numpy(image).permute(2, 0, 1).float() / 255
+     dm_image = feature_extractor(image).unsqueeze(0)
+     mask = diffmask.get_mask(dm_image)["mask"][0].detach()
+
+     masked_img = draw_mask(image, mask)
+     heatmap = draw_heatmap(image, mask)
+     return np.hstack((masked_img, heatmap))
+
+
+ # Launch demo interface
+ gr.Interface(
+     get_mask,
+     inputs=gr.inputs.Image(label="Input", shape=(224, 224), source="upload", type="numpy"),
+     outputs=[gr.outputs.Image(label="Output")],
+     title="Vision DiffMask Demo",
+     live=True,
+ ).launch()
checkpoints/.gitkeep ADDED
@@ -0,0 +1 @@
+
checkpoints/diffmask.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33ceff3adc10ffb86bdaa3c90380e7925e76e7b170ed42d1cc00ff33328fc77b
+ size 16610391
code/attributions/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .attention_rollout import attention_rollout
+ from .grad_cam import grad_cam
code/attributions/attention_rollout.py ADDED
@@ -0,0 +1,59 @@
+ import torch
+ import torch.nn.functional as F
+
+ from math import sqrt
+ from torch import Tensor
+ from transformers import ViTForImageClassification
+
+
+ @torch.no_grad()
+ def attention_rollout(
+     images: Tensor,
+     vit: ViTForImageClassification,
+     discard_ratio: float = 0.9,
+     head_fusion: str = "mean",
+     device: str = "cpu",
+ ) -> Tensor:
+     """Performs the Attention Rollout method on a batch of images (https://arxiv.org/pdf/2005.00928.pdf)."""
+     # Forward pass and save attention maps
+     attentions = vit(images, output_attentions=True).attentions
+
+     B, _, H, W = images.shape  # Batch size, channels, height, width
+     P = attentions[0].size(-1)  # Number of patches
+
+     mask = torch.eye(P).to(device)
+     # Iterate over layers
+     for j, attention in enumerate(attentions):
+         if head_fusion == "mean":
+             attention_heads_fused = attention.mean(axis=1)
+         elif head_fusion == "max":
+             attention_heads_fused = attention.max(axis=1)[0]
+         elif head_fusion == "min":
+             attention_heads_fused = attention.min(axis=1)[0]
+         else:
+             raise ValueError("Attention head fusion type not supported")
+
+         # Drop the lowest attentions, but don't drop the class token
+         flat = attention_heads_fused.view(B, -1)
+         _, indices = flat.topk(int(flat.size(-1) * discard_ratio), -1, False)
+         indices = indices[indices != 0]
+         flat[0, indices] = 0
+
+         a = (attention_heads_fused + torch.eye(P).to(device)) / 2
+         a = a / a.sum(dim=-1).view(-1, P, 1)
+
+         mask = a @ mask
+
+     # Look at the total attention between the class token and the image patches
+     mask = mask[:, 0, 1:]
+     mask = mask / torch.max(mask)
+
+     N = int(sqrt(P))
+     S = int(H / N)
+
+     mask = mask.reshape(B, 1, N, N)
+     mask = F.interpolate(mask, scale_factor=S)
+     mask = mask.reshape(B, H, W)
+
+     return mask
code/attributions/grad_cam.py ADDED
@@ -0,0 +1,55 @@
+ import torch
+
+ from pytorch_grad_cam import GradCAM
+ from torch import Tensor
+ from transformers import ViTForImageClassification
+
+
+ def grad_cam(images: Tensor, vit: ViTForImageClassification, use_cuda: bool = False) -> Tensor:
+     """Performs the Grad-CAM method on a batch of images (https://arxiv.org/pdf/1610.02391.pdf)."""
+
+     # Wrap the ViT model to be compatible with GradCAM
+     vit = ViTWrapper(vit)
+     vit.eval()
+
+     # Create GradCAM object
+     cam = GradCAM(
+         model=vit,
+         target_layers=[vit.target_layer],
+         reshape_transform=_reshape_transform,
+         use_cuda=use_cuda,
+     )
+
+     # Compute GradCAM masks
+     grayscale_cam = cam(
+         input_tensor=images,
+         targets=None,
+         eigen_smooth=True,
+         aug_smooth=True,
+     )
+
+     return torch.from_numpy(grayscale_cam)
+
+
+ def _reshape_transform(tensor, height=14, width=14):
+     result = tensor[:, 1:, :].reshape(tensor.size(0), height, width, tensor.size(2))
+
+     # Bring the channels to the first dimension
+     result = result.transpose(2, 3).transpose(1, 2)
+
+     return result
+
+
+ class ViTWrapper(torch.nn.Module):
+     """ViT Wrapper to use with Grad-CAM."""
+
+     def __init__(self, vit: ViTForImageClassification):
+         super().__init__()
+         self.vit = vit
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.vit(x).logits
+
+     @property
+     def target_layer(self):
+         return self.vit.vit.encoder.layer[-2].layernorm_after
code/datamodules/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .base import ImageDataModule
+ from .image_classification import CIFAR10DataModule, MNISTDataModule
+ from .visual_qa import CIFAR10QADataModule, ToyQADataModule
code/datamodules/base.py ADDED
@@ -0,0 +1,156 @@
1
+ from .transformations import AddGaussianNoise
2
+ from abc import abstractmethod, ABCMeta
3
+ from argparse import ArgumentParser
4
+ from pytorch_lightning import LightningDataModule
5
+ from torch.utils.data import (
6
+ DataLoader,
7
+ Dataset,
8
+ default_collate,
9
+ RandomSampler,
10
+ SequentialSampler,
11
+ )
12
+ from torchvision import transforms
13
+ from typing import Optional
14
+
15
+
16
+ class ImageDataModule(LightningDataModule, metaclass=ABCMeta):
17
+ @staticmethod
18
+ def add_model_specific_args(parent_parser: ArgumentParser) -> ArgumentParser:
19
+ parser = parent_parser.add_argument_group("Data Modules")
20
+ parser.add_argument(
21
+ "--data_dir",
22
+ type=str,
23
+ default="data/",
24
+ help="The directory where the data is stored.",
25
+ )
26
+ parser.add_argument(
27
+ "--batch_size",
28
+ type=int,
29
+ default=32,
30
+ help="The batch size to use.",
31
+ )
32
+ parser.add_argument(
33
+ "--add_noise",
34
+ action="store_true",
35
+ help="Use gaussian noise augmentation.",
36
+ )
37
+ parser.add_argument(
38
+ "--add_rotation",
39
+ action="store_true",
40
+ help="Use rotation augmentation.",
41
+ )
42
+ parser.add_argument(
43
+ "--add_blur",
44
+ action="store_true",
45
+ help="Use blur augmentation.",
46
+ )
47
+ parser.add_argument(
48
+ "--num_workers",
49
+ type=int,
50
+ default=4,
51
+ help="Number of workers to use for data loading.",
52
+ )
53
+ return parent_parser
54
+
55
+ # Declare variables that will be initialized later
56
+ train_data: Dataset
57
+ val_data: Dataset
58
+ test_data: Dataset
59
+
60
+ def __init__(
61
+ self,
62
+ feature_extractor: Optional[callable] = None,
63
+ data_dir: str = "data/",
64
+ batch_size: int = 32,
65
+ add_noise: bool = False,
66
+ add_rotation: bool = False,
67
+ add_blur: bool = False,
68
+ num_workers: int = 4,
69
+ ):
70
+ """Abstract Pytorch Lightning DataModule for image datasets.
71
+
72
+ Args:
73
+ feature_extractor (callable): feature extractor instance
74
+ data_dir (str): directory to store the dataset
75
+ batch_size (int): batch size for the train/val/test dataloaders
76
+ add_noise (bool): whether to add noise to the images
77
+ add_rotation (bool): whether to add random rotation to the images
78
+ add_blur (bool): whether to add blur to the images
79
+ num_workers (int): number of workers for train/val/test dataloaders
80
+ """
81
+ super().__init__()
82
+
83
+ # Store hyperparameters
84
+ self.data_dir = data_dir
85
+ self.batch_size = batch_size
86
+ self.feature_extractor = feature_extractor
87
+ self.num_workers = num_workers
88
+
89
+ # Set the transforms
90
+ # If the feature_extractor is None, then we do not split the images into features
91
+ init_transforms = [feature_extractor] if feature_extractor else []
92
+ self.transform = transforms.Compose(init_transforms)
93
+ self._add_transforms(add_noise, add_rotation, add_blur)
94
+
95
+ # Set the collate function and the samplers
96
+ # These can be adapted in a child datamodule class to have a different behavior
97
+ self.collate_fn = default_collate
98
+ self.shuffled_sampler = RandomSampler
99
+ self.sequential_sampler = SequentialSampler
100
+
101
+ def _add_transforms(self, noise: bool, rotation: bool, blur: bool):
102
+ """Add transforms to the module's transformations list.
103
+
104
+ Args:
105
+ noise (bool): whether to add noise to the images
106
+ rotation (bool): whether to add random rotation to the images
107
+ blur (bool): whether to add blur to the images
108
+ """
109
+ # TODO:
110
+ # - Which order to add the transforms in?
111
+ # - Applied in both train and test or just test?
112
+ # - Check what transforms are applied by the model
113
+ if noise:
114
+ self.transform.transforms.append(AddGaussianNoise(0.0, 1.0))
115
+ if rotation:
116
+ self.transform.transforms.append(transforms.RandomRotation(20))
117
+ if blur:
118
+ self.transform.transforms.append(transforms.GaussianBlur(3))
119
+
120
+ @abstractmethod
121
+ def prepare_data(self):
122
+ raise NotImplementedError()
123
+
124
+ @abstractmethod
125
+ def setup(self, stage: Optional[str] = None):
126
+ raise NotImplementedError()
127
+
128
+ # noinspection PyTypeChecker
129
+ def train_dataloader(self) -> DataLoader:
130
+ return DataLoader(
131
+ self.train_data,
132
+ batch_size=self.batch_size,
133
+ num_workers=self.num_workers,
134
+ collate_fn=self.collate_fn,
135
+ sampler=self.shuffled_sampler(self.train_data),
136
+ )
137
+
138
+ # noinspection PyTypeChecker
139
+ def val_dataloader(self) -> DataLoader:
140
+ return DataLoader(
141
+ self.val_data,
142
+ batch_size=self.batch_size,
143
+ num_workers=self.num_workers,
144
+ collate_fn=self.collate_fn,
145
+ sampler=self.sequential_sampler(self.val_data),
146
+ )
147
+
148
+ # noinspection PyTypeChecker
149
+ def test_dataloader(self) -> DataLoader:
150
+ return DataLoader(
151
+ self.test_data,
152
+ batch_size=self.batch_size,
153
+ num_workers=self.num_workers,
154
+ collate_fn=self.collate_fn,
155
+ sampler=self.sequential_sampler(self.test_data),
156
+ )
code/datamodules/image_classification.py ADDED
@@ -0,0 +1,44 @@
+ from .base import ImageDataModule
+ from torch.utils.data import random_split
+ from torchvision.datasets import MNIST, CIFAR10
+ from typing import Optional
+
+
+ class MNISTDataModule(ImageDataModule):
+     """Datamodule for the MNIST dataset."""
+
+     def prepare_data(self):
+         # Download MNIST
+         MNIST(self.data_dir, train=True, download=True)
+         MNIST(self.data_dir, train=False, download=True)
+
+     def setup(self, stage: Optional[str] = None):
+         # Set the training and validation data
+         if stage == "fit" or stage is None:
+             mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
+             self.train_data, self.val_data = random_split(mnist_full, [55000, 5000])
+
+         # Set the test data
+         if stage == "test" or stage is None:
+             self.test_data = MNIST(self.data_dir, train=False, transform=self.transform)
+
+
+ class CIFAR10DataModule(ImageDataModule):
+     """Datamodule for the CIFAR10 dataset."""
+
+     def prepare_data(self):
+         # Download CIFAR10
+         CIFAR10(self.data_dir, train=True, download=True)
+         CIFAR10(self.data_dir, train=False, download=True)
+
+     def setup(self, stage: Optional[str] = None):
+         # Set the training and validation data
+         if stage == "fit" or stage is None:
+             cifar10_full = CIFAR10(self.data_dir, train=True, transform=self.transform)
+             self.train_data, self.val_data = random_split(cifar10_full, [45000, 5000])
+
+         # Set the test data
+         if stage == "test" or stage is None:
+             self.test_data = CIFAR10(
+                 self.data_dir, train=False, transform=self.transform
+             )
code/datamodules/transformations.py ADDED
@@ -0,0 +1,41 @@
+ from torch import Tensor
+ from transformers.image_utils import ImageInput
+
+ import torch
+
+
+ class AddGaussianNoise:
+     """Add Gaussian noise to an image.
+
+     Args:
+         mean (float): mean of the Gaussian noise
+         std (float): standard deviation of the Gaussian noise
+     """
+
+     def __init__(self, mean: float = 0.0, std: float = 1.0):
+         self.std = std
+         self.mean = mean
+
+     def __call__(self, tensor: Tensor) -> Tensor:
+         return tensor + torch.randn(tensor.size()) * self.std + self.mean
+
+     def __repr__(self) -> str:
+         return self.__class__.__name__ + "(mean={0}, std={1})".format(
+             self.mean, self.std
+         )
+
+
+ class UnNest:
+     """Un-nest the output of a feature extractor"""
+
+     def __init__(self, feature_extractor: callable):
+         self.feature_extractor = feature_extractor
+
+     def __call__(self, x: ImageInput) -> Tensor:
+         # Pass the input through the feature extractor
+         x = self.feature_extractor(x)
+         # Un-nest the pixel_values tensor
+         x = torch.tensor(x["pixel_values"][0])
+
+         # HuggingFace models expect 3D tensors [C, H, W]
+         return x if len(x) == 3 else x.unsqueeze(0)
code/datamodules/utils.py ADDED
@@ -0,0 +1,133 @@
1
+ from .image_classification import CIFAR10DataModule, ImageDataModule, MNISTDataModule
2
+ from .transformations import UnNest
3
+ from .visual_qa import CIFAR10QADataModule, ToyQADataModule
4
+ from argparse import Namespace
5
+ from transformers import ConvNextFeatureExtractor, ViTFeatureExtractor
6
+
7
+
8
+ def get_configs(args: Namespace) -> tuple[dict, dict]:
9
+ """Get the model and feature extractor configs from the command line args.
10
+
11
+ Args:
12
+ args (Namespace): the argparse Namespace object
13
+
14
+ Returns:
15
+ a tuple containing the model and feature extractor configs
16
+ """
17
+ if args.dataset == "MNIST":
18
+ # We upsample the MNIST images to 112x112, with 1 channel (grayscale)
19
+ # and 10 classes (0-9). We normalize the image to have a mean of 0.5
20
+ # and a standard deviation of ±0.5.
21
+ model_cfg_args = {
22
+ "image_size": 112,
23
+ "num_channels": 1,
24
+ "num_labels": 10,
25
+ }
26
+ fe_cfg_args = {
27
+ "image_mean": [0.5],
28
+ "image_std": [0.5],
29
+ }
30
+ elif args.dataset.startswith("CIFAR10"):
31
+ if args.dataset not in ("CIFAR10", "CIFAR10_QA"):
32
+ raise Exception(f"Unknown CIFAR10 variant: {args.dataset}")
33
+
34
+ # We upsample the CIFAR10 images to 224x224, with 3 channels (RGB) and
35
+ # 10 classes (0-9) for the normal dataset, or (grid_size)^2 + 1 for the
36
+ # toy task. We normalize the image to have a mean of 0.5 and a standard
37
+ # deviation of ±0.5.
38
+ model_cfg_args = {
39
+ "image_size": 224, # fixed to 224 because pretrained models have that size
40
+ "num_channels": 3,
41
+ "num_labels": (args.grid_size**2) + 1
42
+ if args.dataset == "CIFAR10_QA"
43
+ else 10,
44
+ }
45
+ fe_cfg_args = {
46
+ "image_mean": [0.5, 0.5, 0.5],
47
+ "image_std": [0.5, 0.5, 0.5],
48
+ }
49
+ elif args.dataset == "toy":
50
+ # We use an image size so that each patch contains a single color, with
51
+ # 3 channels (RGB) and (grid_size)^2 + 1 classes. We normalize the image
52
+ # to have a mean of 0.5 and a standard deviation of ±0.5.
53
+ model_cfg_args = {
54
+ "image_size": args.grid_size * 16,
55
+ "num_channels": 3,
56
+ "num_labels": (args.grid_size**2) + 1,
57
+ }
58
+ fe_cfg_args = {
59
+ "image_mean": [0.5, 0.5, 0.5],
60
+ "image_std": [0.5, 0.5, 0.5],
61
+ }
62
+ else:
63
+ raise Exception(f"Unknown dataset: {args.dataset}")
64
+
65
+ # Set the feature extractor's size attribute to be the same as the model's image size
66
+ fe_cfg_args["size"] = model_cfg_args["image_size"]
67
+ # Set the tensors' return type to PyTorch tensors
68
+ fe_cfg_args["return_tensors"] = "pt"
69
+
70
+ return model_cfg_args, fe_cfg_args
71
+
72
+
73
+ def datamodule_factory(args: Namespace) -> ImageDataModule:
74
+ """A factory method for creating a datamodule based on the command line args.
75
+
76
+ Args:
77
+ args (Namespace): the argparse Namespace object
78
+
79
+ Returns:
80
+ an ImageDataModule instance
81
+ """
82
+ # Get the model and feature extractor configs
83
+ model_cfg_args, fe_cfg_args = get_configs(args)
84
+
85
+ # Set the feature extractor class based on the provided base model name
86
+ if args.base_model == "ViT":
87
+ fe_class = ViTFeatureExtractor
88
+ elif args.base_model == "ConvNeXt":
89
+ fe_class = ConvNextFeatureExtractor
90
+ else:
91
+ raise Exception(f"Unknown base model: {args.base_model}")
92
+
93
+ # Create the feature extractor instance
94
+ if args.from_pretrained:
95
+ feature_extractor = fe_class.from_pretrained(
96
+ args.from_pretrained, **fe_cfg_args
97
+ )
98
+ else:
99
+ feature_extractor = fe_class(**fe_cfg_args)
100
+
101
+ # Un-nest the feature extractor's output
102
+ feature_extractor = UnNest(feature_extractor)
103
+
104
+ # Define the datamodule's configuration
105
+ dm_cfg = {
106
+ "feature_extractor": feature_extractor,
107
+ "batch_size": args.batch_size,
108
+ "add_noise": args.add_noise,
109
+ "add_rotation": args.add_rotation,
110
+ "add_blur": args.add_blur,
111
+ "num_workers": args.num_workers,
112
+ }
113
+
114
+ # Determine the dataset class based on the provided dataset name
115
+ if args.dataset.startswith("CIFAR10"):
116
+ if args.dataset == "CIFAR10":
117
+ dm_class = CIFAR10DataModule
118
+ elif args.dataset == "CIFAR10_QA":
119
+ dm_cfg["class_idx"] = args.class_idx
120
+ dm_cfg["grid_size"] = args.grid_size
121
+ dm_class = CIFAR10QADataModule
122
+ else:
123
+ raise Exception(f"Unknown CIFAR10 variant: {args.dataset}")
124
+ elif args.dataset == "MNIST":
125
+ dm_class = MNISTDataModule
126
+ elif args.dataset == "toy":
127
+ dm_cfg["class_idx"] = args.class_idx
128
+ dm_cfg["grid_size"] = args.grid_size
129
+ dm_class = ToyQADataModule
130
+ else:
131
+ raise Exception(f"Unknown dataset: {args.dataset}")
132
+
133
+ return dm_class(**dm_cfg)
code/datamodules/visual_qa.py ADDED
@@ -0,0 +1,241 @@
1
+ from .image_classification import CIFAR10DataModule
2
+ from argparse import ArgumentParser
3
+ from functools import partial
4
+ from torch import LongTensor
5
+ from torch.utils.data import default_collate, random_split, Sampler
6
+ from torchvision import transforms
7
+ from torchvision.datasets import VisionDataset
8
+ from typing import Iterator, Optional
9
+
10
+ import itertools
11
+ import random
12
+ import torch
13
+
14
+
15
+ class CIFAR10QADataModule(CIFAR10DataModule):
16
+ @staticmethod
17
+ def add_model_specific_args(parent_parser: ArgumentParser) -> ArgumentParser:
18
+ parser = parent_parser.add_argument_group("Visual QA")
19
+ parser.add_argument(
20
+ "--class_idx",
21
+ type=int,
22
+ default=3,
23
+ help="The class (index) to count.",
24
+ )
25
+ parser.add_argument(
26
+ "--grid_size",
27
+ type=int,
28
+ default=3,
29
+ help="The number of images per row in the grid.",
30
+ )
31
+ return parent_parser
32
+
33
+ def __init__(
34
+ self,
35
+ class_idx: int,
36
+ grid_size: int = 3,
37
+ feature_extractor: callable = None,
38
+ data_dir: str = "data/",
39
+ batch_size: int = 32,
40
+ add_noise: bool = False,
41
+ add_rotation: bool = False,
42
+ add_blur: bool = False,
43
+ num_workers: int = 4,
44
+ ):
45
+ """A datamodule for a modified CIFAR10 dataset that is used for Question Answering.
46
+ More specifically, the task is to count the number of images of a certain class in a grid.
47
+
48
+ Args:
49
+ class_idx (int): the class (index) to count
50
+ grid_size (int): the number of images per row in the grid
51
+ feature_extractor (callable): a callable feature extractor instance
52
+ data_dir (str): the directory to store the dataset
53
+ batch_size (int): the batch size for the train/val/test dataloaders
54
+ add_noise (bool): whether to add noise to the images
55
+ add_rotation (bool): whether to add rotation augmentation
56
+ add_blur (bool): whether to add blur augmentation
57
+ num_workers (int): the number of workers to use for data loading
58
+ """
59
+ super().__init__(
60
+ feature_extractor,
61
+ data_dir,
62
+ (grid_size**2) * batch_size,
63
+ add_noise,
64
+ add_rotation,
65
+ add_blur,
66
+ num_workers,
67
+ )
68
+
69
+ # Store hyperparameters
70
+ self.class_idx = class_idx
71
+ self.grid_size = grid_size
72
+
73
+ # Save the existing transformations to be applied after creating the grid
74
+ self.post_transform = self.transform
75
+ # Set the pre-batch transformation to be the conversion from PIL to tensor
76
+ self.transform = transforms.PILToTensor()
77
+
78
+ # Specify the custom collate function and samplers
79
+ self.collate_fn = self.custom_collate_fn
80
+ self.shuffled_sampler = partial(
81
+ FairGridSampler,
82
+ class_idx=class_idx,
83
+ grid_size=grid_size,
84
+ shuffle=True,
85
+ )
86
+ self.sequential_sampler = partial(
87
+ FairGridSampler,
88
+ class_idx=class_idx,
89
+ grid_size=grid_size,
90
+ shuffle=False,
91
+ )
92
+
93
+ def custom_collate_fn(self, batch):
94
+ # Split the batch into groups of grid_size**2
95
+ idx = range(len(batch))
96
+ grids = zip(*(iter(idx),) * (self.grid_size**2))
97
+
98
+ new_batch = []
99
+ for grid in grids:
100
+ # Create a grid of images from the indices in the batch
101
+ img = torch.hstack(
102
+ [
103
+ torch.dstack(
104
+ [batch[i][0] for i in grid[idx : idx + self.grid_size]]
105
+ )
106
+ for idx in range(
107
+ 0, self.grid_size**2 - self.grid_size + 1, self.grid_size
108
+ )
109
+ ]
110
+ )
111
+ # Apply the post transformations to the grid
112
+ img = self.post_transform(img)
113
+ # Define the target as the number of images that have the class_idx
114
+ targets = [batch[i][1] for i in grid]
115
+ target = targets.count(self.class_idx)
116
+ # Append grid and target to the batch
117
+ new_batch += [(img, target)]
118
+
119
+ return default_collate(new_batch)
120
+
121
+
122
+ class ToyQADataModule(CIFAR10QADataModule):
123
+ """A datamodule for the toy dataset as described in the paper."""
124
+
125
+ def prepare_data(self):
126
+ # No need to download anything for the toy task
127
+ pass
128
+
129
+ def setup(self, stage: Optional[str] = None):
130
+ img_size = 16
131
+
132
+ samples = []
133
+ # Generate 6000 samples based on 6 different colors
134
+ for r, g, b in itertools.product((0, 1), (0, 1), (0, 1)):
135
+ if r == g == b:
136
+ # We do not want black/white patches
137
+ continue
138
+
139
+ for _ in range(1000):
140
+ patch = torch.vstack(
141
+ [
142
+ r * torch.ones(1, img_size, img_size),
143
+ g * torch.ones(1, img_size, img_size),
144
+ b * torch.ones(1, img_size, img_size),
145
+ ]
146
+ )
147
+
148
+ # Assign a unique id to each color
149
+ target = int(f"{r}{g}{b}", 2) - 1
150
+ # Append the patch and target to the samples
151
+ samples += [(patch, target)]
152
+
153
+ # Split the data to 90% train, 5% validation and 5% test
154
+ train_size = int(len(samples) * 0.9)
155
+ val_size = (len(samples) - train_size) // 2
156
+ test_size = len(samples) - train_size - val_size
157
+ self.train_data, self.val_data, self.test_data = random_split(
158
+ samples,
159
+ [
160
+ train_size,
161
+ val_size,
162
+ test_size,
163
+ ],
164
+ )
165
+
166
+
167
+ class FairGridSampler(Sampler[int]):
168
+ def __init__(
169
+ self,
170
+ dataset: VisionDataset,
171
+ class_idx: int,
172
+ grid_size: int,
173
+ shuffle: bool = False,
174
+ ):
175
+ """A sampler that returns a grid of images from the dataset, with a uniformly random
176
+ amount of appearances for a specific class of interest.
177
+
178
+ Args:
179
+ dataset (VisionDataset): the dataset to sample from
180
+ class_idx(int): the class (index) to treat as the class of interest
181
+ grid_size (int): the number of images per row in the grid
182
+ shuffle (bool): whether to shuffle the dataset before sampling
183
+ """
184
+ super().__init__(dataset)
185
+
186
+ # Save the hyperparameters
187
+ self.dataset = dataset
188
+ self.grid_size = grid_size
189
+ self.n_images = grid_size**2
190
+
191
+ # Get the indices of the class of interest
192
+ self.class_indices = LongTensor(
193
+ [i for i, x in enumerate(dataset) if x[1] == class_idx]
194
+ )
195
+ # Get the indices of all other classes
196
+ self.other_indices = LongTensor(
197
+ [i for i, x in enumerate(dataset) if x[1] != class_idx]
198
+ )
199
+
200
+ # Fix the seed if shuffle is False
201
+ self.seed = None if shuffle else self._get_seed()
202
+
203
+ @staticmethod
204
+ def _get_seed() -> int:
205
+ """Utility function for generating a random seed."""
206
+ return int(torch.empty((), dtype=torch.int64).random_().item())
207
+
208
+ def __iter__(self) -> Iterator[int]:
209
+ # Create a torch Generator object
210
+ seed = self.seed if self.seed is not None else self._get_seed()
211
+ gen = torch.Generator()
212
+ gen.manual_seed(seed)
213
+
214
+ # Sample the batches
215
+ for _ in range(len(self.dataset) // self.n_images):
216
+ # Pick the number of instances for the class of interest
217
+ n_samples = torch.randint(self.n_images + 1, (), generator=gen).item()
218
+
219
+ # Sample the indices from the class of interest
220
+ idx_from_class = torch.randperm(
221
+ len(self.class_indices),
222
+ generator=gen,
223
+ )[:n_samples]
224
+ # Sample the indices from the other classes
225
+ idx_from_other = torch.randperm(
226
+ len(self.other_indices),
227
+ generator=gen,
228
+ )[: self.n_images - n_samples]
229
+
230
+ # Concatenate the corresponding lists of patches to form a grid
231
+ grid = (
232
+ self.class_indices[idx_from_class].tolist()
233
+ + self.other_indices[idx_from_other].tolist()
234
+ )
235
+
236
+ # Shuffle the order of the patches within the grid
237
+ random.shuffle(grid)
238
+ yield from grid
239
+
240
+ def __len__(self) -> int:
241
+ return len(self.dataset)
code/eval_base.py ADDED
@@ -0,0 +1,102 @@
1
+ from datamodules import CIFAR10QADataModule, ImageDataModule
2
+ from datamodules.utils import datamodule_factory
3
+ from models import ImageClassificationNet
4
+ from models.utils import model_factory
5
+ from pytorch_lightning.loggers import WandbLogger
6
+
7
+ import argparse
8
+ import pytorch_lightning as pl
9
+
10
+
11
+ def main(args: argparse.Namespace):
12
+ # Seed
13
+ pl.seed_everything(args.seed)
14
+
15
+ # Create base model
16
+ base = model_factory(args, own_config=True)
17
+
18
+ # Load datamodule
19
+ dm = datamodule_factory(args)
20
+
21
+ # Load the model from the specified checkpoint
22
+ model = ImageClassificationNet.load_from_checkpoint(
23
+ args.checkpoint,
24
+ model=base,
25
+ num_train_steps=0,
26
+ )
27
+
28
+ # Create wandb logger
29
+ wandb_logger = WandbLogger(
30
+ name=f"{args.dataset}_eval_{args.base_model} ({args.from_pretrained})",
31
+ project="Patch-DiffMask",
32
+ )
33
+
34
+ # Create trainer
35
+ trainer = pl.Trainer(
36
+ accelerator="auto",
37
+ logger=wandb_logger,
38
+ max_epochs=1,
39
+ enable_progress_bar=args.enable_progress_bar,
40
+ )
41
+
42
+ # Evaluate the model
43
+ trainer.test(model, dm)
44
+
45
+ # Save the HuggingFace model to be used with --from_pretrained
46
+ save_dir = f"checkpoints/{args.base_model}_{args.dataset}"
47
+ model.model.save_pretrained(save_dir)
48
+ dm.feature_extractor.save_pretrained(save_dir)
49
+
50
+
51
+ if __name__ == "__main__":
52
+ parser = argparse.ArgumentParser()
53
+
54
+ parser.add_argument(
55
+ "--checkpoint",
56
+ type=str,
57
+ required=True,
58
+ help="Checkpoint to resume the training from.",
59
+ )
60
+
61
+ # Trainer
62
+ parser.add_argument(
63
+ "--enable_progress_bar",
64
+ action="store_true",
65
+ help="Whether to show progress bar during training. NOT recommended when logging to files.",
66
+ )
67
+ parser.add_argument(
68
+ "--seed",
69
+ type=int,
70
+ default=123,
71
+ help="Random seed for reproducibility.",
72
+ )
73
+
74
+ # Base (classification) model
75
+ parser.add_argument(
76
+ "--base_model",
77
+ type=str,
78
+ default="ViT",
79
+ choices=["ViT", "ConvNeXt"],
80
+ help="Base model architecture to train.",
81
+ )
82
+ parser.add_argument(
83
+ "--from_pretrained",
84
+ type=str,
85
+ # default="tanlq/vit-base-patch16-224-in21k-finetuned-cifar10",
86
+ help="The name of the pretrained HF model to fine-tune from.",
87
+ )
88
+
89
+ # Datamodule
90
+ ImageDataModule.add_model_specific_args(parser)
91
+ CIFAR10QADataModule.add_model_specific_args(parser)
92
+ parser.add_argument(
93
+ "--dataset",
94
+ type=str,
95
+ default="toy",
96
+ choices=["MNIST", "CIFAR10", "CIFAR10_QA", "toy"],
97
+ help="The dataset to use.",
98
+ )
99
+
100
+ args = parser.parse_args()
101
+
102
+ main(args)
code/main.py ADDED
@@ -0,0 +1,215 @@
1
+ from argparse import ArgumentParser, Namespace
2
+ from attributions import attention_rollout, grad_cam
3
+ from datamodules import CIFAR10QADataModule, ImageDataModule
4
+ from datamodules.utils import datamodule_factory
5
+ from functools import partial
6
+ from models import ImageInterpretationNet
7
+ from pytorch_lightning.callbacks import ModelCheckpoint
8
+ from pytorch_lightning.loggers import WandbLogger
9
+ from transformers import ViTForImageClassification
10
+ from utils.plot import DrawMaskCallback, log_masks
11
+
12
+ import pytorch_lightning as pl
13
+
14
+
15
+ def get_experiment_name(args: Namespace):
16
+ """Create a name for the experiment based on the command line arguments."""
17
+ # Convert to dictionary
18
+ args = vars(args)
19
+
20
+ # Create a list with non-experiment arguments
21
+ non_experiment_args = [
22
+ "add_blur",
23
+ "add_noise",
24
+ "add_rotation",
25
+ "base_model",
26
+ "batch_size",
27
+ "class_idx",
28
+ "data_dir",
29
+ "enable_progress_bar",
30
+ "from_pretrained",
31
+ "log_every_n_steps",
32
+ "num_epochs",
33
+ "num_workers",
34
+ "sample_images",
35
+ "seed",
36
+ ]
37
+
38
+ # Create experiment name from experiment arguments
39
+ return "-".join(
40
+ [
41
+ f"{name}={value}"
42
+ for name, value in sorted(args.items())
43
+ if name not in non_experiment_args
44
+ ]
45
+ )
46
+
47
+
48
+ def setup_sample_image_logs(
49
+ dm: ImageDataModule,
50
+ args: Namespace,
51
+ logger: WandbLogger,
52
+ n_panels: int = 2, # TODO: change?
53
+ ):
54
+ """Setup the log callbacks for sampling and plotting images."""
55
+ images_per_panel = args.sample_images
56
+
57
+ # Sample images
58
+ sample_images = []
59
+ iter_loader = iter(dm.val_dataloader())
60
+ for panel in range(n_panels):
61
+ X, Y = next(iter_loader)
62
+ sample_images += [(X[:images_per_panel], Y[:images_per_panel])]
63
+
64
+ # Define mask callback
65
+ mask_cb = partial(DrawMaskCallback, log_every_n_steps=args.log_every_n_steps)
66
+
67
+ callbacks = []
68
+ for panel in range(n_panels):
69
+ # Initialize ViT model
70
+ vit = ViTForImageClassification.from_pretrained(args.from_pretrained)
71
+
72
+ # Extract samples for current panel
73
+ samples = sample_images[panel]
74
+ X, _ = samples
75
+
76
+ # Log GradCAM
77
+ gradcam_masks = grad_cam(X, vit)
78
+ log_masks(X, gradcam_masks, f"GradCAM {panel}", logger)
79
+
80
+ # Log Attention Rollout
81
+ rollout_masks = attention_rollout(X, vit)
82
+ log_masks(X, rollout_masks, f"Attention Rollout {panel}", logger)
83
+
84
+ # Create mask callback
85
+ callbacks += [mask_cb(samples, key=f"{panel}")]
86
+
87
+ return callbacks
88
+
89
+
90
+ def main(args: Namespace):
91
+ # Seed
92
+ pl.seed_everything(args.seed)
93
+
94
+ # Load pre-trained Transformer
95
+ model = ViTForImageClassification.from_pretrained(args.from_pretrained)
96
+
97
+ # Load datamodule
98
+ dm = datamodule_factory(args)
99
+
100
+ # Setup datamodule to sample images for the mask callback
101
+ dm.prepare_data()
102
+ dm.setup("fit")
103
+
104
+ # Create Vision DiffMask for the model
105
+ diffmask = ImageInterpretationNet(
106
+ model_cfg=model.config,
107
+ alpha=args.alpha,
108
+ lr=args.lr,
109
+ eps=args.eps,
110
+ lr_placeholder=args.lr_placeholder,
111
+ lr_alpha=args.lr_alpha,
112
+ mul_activation=args.mul_activation,
113
+ add_activation=args.add_activation,
114
+ placeholder=not args.no_placeholder,
115
+ weighted_layer_pred=args.weighted_layer_distribution,
116
+ )
117
+ diffmask.set_vision_transformer(model)
118
+
119
+ # Create wandb logger instance
120
+ wandb_logger = WandbLogger(
121
+ name=get_experiment_name(args),
122
+ project="Patch-DiffMask",
123
+ )
124
+
125
+ # Create checkpoint callback
126
+ ckpt_cb = ModelCheckpoint(
127
+ save_top_k=-1,
128
+ dirpath=f"checkpoints/{wandb_logger.version}",
129
+ every_n_train_steps=args.log_every_n_steps,
130
+ )
131
+
132
+ # Create mask callbacks
133
+ mask_cbs = setup_sample_image_logs(dm, args, wandb_logger)
134
+
135
+ # Create trainer
136
+ trainer = pl.Trainer(
137
+ accelerator="auto",
138
+ callbacks=[ckpt_cb, *mask_cbs],
139
+ enable_progress_bar=args.enable_progress_bar,
140
+ logger=wandb_logger,
141
+ max_epochs=args.num_epochs,
142
+ )
143
+
144
+ # Train the model
145
+ trainer.fit(diffmask, dm)
146
+
147
+
148
+ if __name__ == "__main__":
149
+ parser = ArgumentParser()
150
+
151
+ # Trainer
152
+ parser.add_argument(
153
+ "--enable_progress_bar",
154
+ action="store_true",
155
+ help="Whether to enable the progress bar (NOT recommended when logging to file).",
156
+ )
157
+ parser.add_argument(
158
+ "--num_epochs",
159
+ type=int,
160
+ default=5,
161
+ help="Number of epochs to train.",
162
+ )
163
+ parser.add_argument(
164
+ "--seed",
165
+ type=int,
166
+ default=123,
167
+ help="Random seed for reproducibility.",
168
+ )
169
+
170
+ # Logging
171
+ parser.add_argument(
172
+ "--sample_images",
173
+ type=int,
174
+ default=8,
175
+ help="Number of images to sample for the mask callback.",
176
+ )
177
+ parser.add_argument(
178
+ "--log_every_n_steps",
179
+ type=int,
180
+ default=200,
181
+ help="Number of steps between logging media & checkpoints.",
182
+ )
183
+
184
+ # Base (classification) model
185
+ parser.add_argument(
186
+ "--base_model",
187
+ type=str,
188
+ default="ViT",
189
+ choices=["ViT"],
190
+ help="Base model architecture to train.",
191
+ )
192
+ parser.add_argument(
193
+ "--from_pretrained",
194
+ type=str,
195
+ default="tanlq/vit-base-patch16-224-in21k-finetuned-cifar10",
196
+ help="The name of the pretrained HF model to load.",
197
+ )
198
+
199
+ # Interpretation model
200
+ ImageInterpretationNet.add_model_specific_args(parser)
201
+
202
+ # Datamodule
203
+ ImageDataModule.add_model_specific_args(parser)
204
+ CIFAR10QADataModule.add_model_specific_args(parser)
205
+ parser.add_argument(
206
+ "--dataset",
207
+ type=str,
208
+ default="CIFAR10",
209
+ choices=["MNIST", "CIFAR10", "CIFAR10_QA", "toy"],
210
+ help="The dataset to use.",
211
+ )
212
+
213
+ args = parser.parse_args()
214
+
215
+ main(args)
code/models/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .classification import ImageClassificationNet
+ from .interpretation import ImageInterpretationNet
code/models/classification.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ Parts of this file have been adapted from
3
+ https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial15/Vision_Transformer.html
4
+ """
5
+
6
+ import pytorch_lightning as pl
7
+ import torch.nn.functional as F
8
+
9
+ from argparse import ArgumentParser
10
+ from torch import Tensor
11
+ from torch.optim import AdamW, Optimizer, RAdam
12
+ from torch.optim.lr_scheduler import _LRScheduler
13
+ from transformers import get_scheduler, PreTrainedModel
14
+
15
+
16
+ class ImageClassificationNet(pl.LightningModule):
17
+ @staticmethod
18
+ def add_model_specific_args(parent_parser: ArgumentParser) -> ArgumentParser:
19
+ parser = parent_parser.add_argument_group("Classification Model")
20
+ parser.add_argument(
21
+ "--optimizer",
22
+ type=str,
23
+ default="AdamW",
24
+ choices=["AdamW", "RAdam"],
25
+ help="The optimizer to use to train the model.",
26
+ )
27
+ parser.add_argument(
28
+ "--weight_decay",
29
+ type=float,
30
+ default=1e-2,
31
+ help="The optimizer's weight decay.",
32
+ )
33
+ parser.add_argument(
34
+ "--lr",
35
+ type=float,
36
+ default=5e-5,
37
+ help="The initial learning rate for the model.",
38
+ )
39
+ return parent_parser
40
+
41
+ def __init__(
42
+ self,
43
+ model: PreTrainedModel,
44
+ num_train_steps: int,
45
+ optimizer: str = "AdamW",
46
+ weight_decay: float = 1e-2,
47
+ lr: float = 5e-5,
48
+ ):
49
+ """A PyTorch Lightning Module for a HuggingFace model used for image classification.
50
+
51
+ Args:
52
+ model (PreTrainedModel): a pretrained model for image classification
53
+ num_train_steps (int): number of training steps
54
+ optimizer (str): optimizer to use
55
+ weight_decay (float): weight decay for optimizer
56
+ lr (float): the learning rate used for training
57
+ """
58
+ super().__init__()
59
+
60
+ # Save the hyperparameters and the model
61
+ self.save_hyperparameters(ignore=["model"])
62
+ self.model = model
63
+
64
+ def forward(self, x: Tensor) -> Tensor:
65
+ return self.model(x).logits
66
+
67
+ def configure_optimizers(self) -> tuple[list[Optimizer], list[_LRScheduler]]:
68
+ # Set the optimizer class based on the hyperparameter
69
+ if self.hparams.optimizer == "AdamW":
70
+ optim_class = AdamW
71
+ elif self.hparams.optimizer == "RAdam":
72
+ optim_class = RAdam
73
+ else:
74
+ raise Exception(f"Unknown optimizer {self.hparams.optimizer}")
75
+
76
+ # Create the optimizer and the learning rate scheduler
77
+ optimizer = optim_class(
78
+ self.parameters(),
79
+ weight_decay=self.hparams.weight_decay,
80
+ lr=self.hparams.lr,
81
+ )
82
+ lr_scheduler = get_scheduler(
83
+ name="linear",
84
+ optimizer=optimizer,
85
+ num_warmup_steps=0,
86
+ num_training_steps=self.hparams.num_train_steps,
87
+ )
88
+
89
+ return [optimizer], [lr_scheduler]
90
+
91
+ def _calculate_loss(self, batch: tuple[Tensor, Tensor], mode: str) -> Tensor:
92
+ imgs, labels = batch
93
+
94
+ preds = self.model(imgs).logits
95
+ loss = F.cross_entropy(preds, labels)
96
+ acc = (preds.argmax(dim=-1) == labels).float().mean()
97
+
98
+ self.log(f"{mode}_loss", loss)
99
+ self.log(f"{mode}_acc", acc)
100
+
101
+ return loss
102
+
103
+ def training_step(self, batch: tuple[Tensor, Tensor], _: Tensor) -> Tensor:
104
+ loss = self._calculate_loss(batch, mode="train")
105
+
106
+ return loss
107
+
108
+ def validation_step(self, batch: tuple[Tensor, Tensor], _: Tensor):
109
+ self._calculate_loss(batch, mode="val")
110
+
111
+ def test_step(self, batch: tuple[Tensor, Tensor], _: Tensor):
112
+ self._calculate_loss(batch, mode="test")
code/models/gates.py ADDED
@@ -0,0 +1,261 @@
1
+ """
2
+ Parts of this file have been adapted from
3
+ https://github.com/nicola-decao/diffmask/blob/master/diffmask/models/gates.py
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from torch import Tensor
10
+ from typing import Optional
11
+ from utils.distributions import RectifiedStreched, BinaryConcrete
12
+
13
+
14
+ class MLPGate(nn.Module):
15
+ def __init__(self, input_size: int, hidden_size: int, bias: bool = True):
16
+ """
17
+ This is an MLP with the following structure;
18
+ Linear(input_size, hidden_size), Tanh(), Linear(hidden_size, 1)
19
+ The bias of the last layer is set to 5.0 to start with high probability
20
+ of keeping states (fundamental for good convergence as the initialized
21
+ DiffMask has not learned what to mask yet).
22
+
23
+ Args:
24
+ input_size (int): the number of input features
25
+ hidden_size (int): the number of hidden units
26
+ bias (bool): whether to use a bias term
27
+ """
28
+ super().__init__()
29
+
30
+ self.f = nn.Sequential(
31
+ nn.utils.weight_norm(nn.Linear(input_size, hidden_size)),
32
+ nn.Tanh(),
33
+ nn.utils.weight_norm(nn.Linear(hidden_size, 1, bias=bias)),
34
+ )
35
+
36
+ if bias:
37
+ self.f[-1].bias.data[:] = 5.0
38
+
39
+ def forward(self, *args: Tensor) -> Tensor:
40
+ return self.f(torch.cat(args, -1))
41
+
42
+
43
+ class MLPMaxGate(nn.Module):
44
+ def __init__(
45
+ self,
46
+ input_size: int,
47
+ hidden_size: int,
48
+ mul_activation: float = 10.0,
49
+ add_activation: float = 5.0,
50
+ bias: bool = True,
51
+ ):
52
+ """
53
+ This is an MLP with the following structure;
54
+ Linear(input_size, hidden_size), Tanh(), Linear(hidden_size, 1)
55
+ The bias of the last layer is set to 5.0 to start with high probability
56
+ of keeping states (fundamental for good convergence as the initialized
57
+ DiffMask has not learned what to mask yet).
58
+ It also uses a scaler for the output of the activation function.
59
+
60
+ Args:
61
+ input_size (int): the number of input features
62
+ hidden_size (int): the number of hidden units
63
+ mul_activation (float): the scaler for the output of the activation function
64
+ add_activation (float): the offset for the output of the activation function
65
+ bias (bool): whether to use a bias term
66
+ """
67
+ super().__init__()
68
+
69
+ self.f = nn.Sequential(
70
+ nn.utils.weight_norm(nn.Linear(input_size, hidden_size)),
71
+ nn.Tanh(),
72
+ nn.utils.weight_norm(nn.Linear(hidden_size, 1, bias=bias)),
73
+ nn.Tanh(),
74
+ )
75
+ self.add_activation = nn.Parameter(torch.tensor(add_activation))
76
+ self.mul_activation = mul_activation
77
+
78
+ def forward(self, *args: Tensor) -> Tensor:
79
+ return self.f(torch.cat(args, -1)) * self.mul_activation + self.add_activation
80
+
81
+
82
+ class DiffMaskGateInput(nn.Module):
83
+ def __init__(
84
+ self,
85
+ hidden_size: int,
86
+ hidden_attention: int,
87
+ num_hidden_layers: int,
88
+ max_position_embeddings: int,
89
+ gate_fn: nn.Module = MLPMaxGate,
90
+ mul_activation: float = 10.0,
91
+ add_activation: float = 5.0,
92
+ gate_bias: bool = True,
93
+ placeholder: bool = False,
94
+ init_vector: Tensor = None,
95
+ ):
96
+ """This is a DiffMask module that masks the input of the first layer.
97
+
98
+ Args:
99
+ hidden_size (int): the size of the hidden representations
100
+ hidden_attention (int) the amount of units in the gate's hidden (bottleneck) layer
101
+ num_hidden_layers (int): the number of hidden layers (and thus gates to use)
102
+ max_position_embeddings (int): the amount of placeholder embeddings to learn for the masked positions
103
+ gate_fn (nn.Module): the PyTorch module to use as a gate
104
+ mul_activation (float): the scaler for the output of the activation function
105
+ add_activation (float): the offset for the output of the activation function
106
+ gate_bias (bool): whether to use a bias term
107
+ placeholder (bool): whether to use placeholder embeddings or a zero vector
108
+ init_vector (Tensor): the initial vector to use for the placeholder embeddings
109
+ """
110
+ super().__init__()
111
+
112
+ # Create a ModuleList with the gates
113
+ self.g_hat = nn.ModuleList(
114
+ [
115
+ gate_fn(
116
+ hidden_size * 2,
117
+ hidden_attention,
118
+ mul_activation,
119
+ add_activation,
120
+ gate_bias,
121
+ )
122
+ for _ in range(num_hidden_layers)
123
+ ]
124
+ )
125
+
126
+ if placeholder:
127
+ # Use a placeholder embedding for the masked positions
128
+ self.placeholder = nn.Parameter(
129
+ nn.init.xavier_normal_(
130
+ torch.empty((1, max_position_embeddings, hidden_size))
131
+ )
132
+ if init_vector is None
133
+ else init_vector.view(1, 1, hidden_size).repeat(
134
+ 1, max_position_embeddings, 1
135
+ )
136
+ )
137
+ else:
138
+ # Use a zero vector for the masked positions
139
+ self.register_buffer(
140
+ "placeholder",
141
+ torch.zeros((1, 1, hidden_size)),
142
+ )
143
+
144
+ def forward(
145
+ self, hidden_states: tuple[Tensor], layer_pred: Optional[int]
146
+ ) -> tuple[tuple[Tensor], Tensor, Tensor, Tensor, Tensor]:
147
+ # Concatenate the output of all the gates
148
+ logits = torch.cat(
149
+ [
150
+ self.g_hat[i](hidden_states[0], hidden_states[i])
151
+ for i in range(
152
+ (layer_pred + 1) if layer_pred is not None else len(hidden_states)
153
+ )
154
+ ],
155
+ -1,
156
+ )
157
+
158
+ # Define a Hard Concrete distribution
159
+ dist = RectifiedStreched(
160
+ BinaryConcrete(torch.full_like(logits, 0.2), logits),
161
+ l=-0.2,
162
+ r=1.0,
163
+ )
164
+
165
+ # Calculate the expectation for the full gate probabilities
166
+ # These act as votes for the masked positions
167
+ gates_full = dist.rsample().cumprod(-1)
168
+ expected_L0_full = dist.log_expected_L0().cumsum(-1)
169
+
170
+ # Extract the probabilities from the last layer, which acts
171
+ # as an aggregation of the votes per position
172
+ gates = gates_full[..., -1]
173
+ expected_L0 = expected_L0_full[..., -1]
174
+
175
+ return (
176
+ hidden_states[0] * gates.unsqueeze(-1)
177
+ + self.placeholder[:, : hidden_states[0].shape[-2]]
178
+ * (1 - gates).unsqueeze(-1),
179
+ gates,
180
+ expected_L0,
181
+ gates_full,
182
+ expected_L0_full,
183
+ )
184
+
185
+
186
+ # class DiffMaskGateHidden(nn.Module):
187
+ # def __init__(
188
+ # self,
189
+ # hidden_size: int,
190
+ # hidden_attention: int,
191
+ # num_hidden_layers: int,
192
+ # max_position_embeddings: int,
193
+ # gate_fn: nn.Module = MLPMaxGate,
194
+ # gate_bias: bool = True,
195
+ # placeholder: bool = False,
196
+ # init_vector: Tensor = None,
197
+ # ):
198
+ # super().__init__()
199
+ #
200
+ # self.g_hat = nn.ModuleList(
201
+ # [
202
+ # gate_fn(hidden_size, hidden_attention, bias=gate_bias)
203
+ # for _ in range(num_hidden_layers)
204
+ # ]
205
+ # )
206
+ #
207
+ # if placeholder:
208
+ # self.placeholder = nn.ParameterList(
209
+ # [
210
+ # nn.Parameter(
211
+ # nn.init.xavier_normal_(
212
+ # torch.empty((1, max_position_embeddings, hidden_size))
213
+ # )
214
+ # if init_vector is None
215
+ # else init_vector.view(1, 1, hidden_size).repeat(
216
+ # 1, max_position_embeddings, 1
217
+ # )
218
+ # )
219
+ # for _ in range(num_hidden_layers)
220
+ # ]
221
+ # )
222
+ # else:
223
+ # self.register_buffer(
224
+ # "placeholder",
225
+ # torch.zeros((num_hidden_layers, 1, 1, hidden_size)),
226
+ # )
227
+ #
228
+ # def forward(
229
+ # self, hidden_states: tuple[Tensor], layer_pred: Optional[int]
230
+ # ) -> tuple[tuple[Tensor], Tensor, Tensor, Tensor, Tensor]:
231
+ # if layer_pred is not None:
232
+ # logits = self.g_hat[layer_pred](hidden_states[layer_pred])
233
+ # else:
234
+ # logits = torch.cat(
235
+ # [self.g_hat[i](hidden_states[i]) for i in range(len(hidden_states))], -1
236
+ # )
237
+ #
238
+ # dist = RectifiedStreched(
239
+ # BinaryConcrete(torch.full_like(logits, 0.2), logits),
240
+ # l=-0.2,
241
+ # r=1.0,
242
+ # )
243
+ #
244
+ # gates_full = dist.rsample()
245
+ # expected_L0_full = dist.log_expected_L0()
246
+ #
247
+ # gates = gates_full if layer_pred is not None else gates_full[..., :1]
248
+ # expected_L0 = (
249
+ # expected_L0_full if layer_pred is not None else expected_L0_full[..., :1]
250
+ # )
251
+ #
252
+ # layer_pred = layer_pred or 0 # equiv to "layer_pred if layer_pred else 0"
253
+ # return (
254
+ # hidden_states[layer_pred] * gates
255
+ # + self.placeholder[layer_pred][:, : hidden_states[layer_pred].shape[-2]]
256
+ # * (1 - gates),
257
+ # gates.squeeze(-1),
258
+ # expected_L0.squeeze(-1),
259
+ # gates_full,
260
+ # expected_L0_full,
261
+ # )
code/models/interpretation.py ADDED
@@ -0,0 +1,482 @@
1
+ import pytorch_lightning as pl
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
+ from .gates import DiffMaskGateInput
6
+ from argparse import ArgumentParser
7
+ from math import sqrt
8
+ from pytorch_lightning.core.optimizer import LightningOptimizer
9
+ from torch import Tensor
10
+ from torch.optim import Optimizer
11
+ from torch.optim.lr_scheduler import _LRScheduler
12
+ from transformers import (
13
+ get_constant_schedule_with_warmup,
14
+ get_constant_schedule,
15
+ ViTForImageClassification,
16
+ )
17
+ from transformers.models.vit.configuration_vit import ViTConfig
18
+ from typing import Optional, Union
19
+ from utils.getters_setters import vit_getter, vit_setter
20
+ from utils.metrics import accuracy_precision_recall_f1
21
+ from utils.optimizer import LookaheadAdam
22
+
23
+
24
+ class ImageInterpretationNet(pl.LightningModule):
25
+ @staticmethod
26
+ def add_model_specific_args(parent_parser: ArgumentParser) -> ArgumentParser:
27
+ parser = parent_parser.add_argument_group("Vision DiffMask")
28
+ parser.add_argument(
29
+ "--alpha",
30
+ type=float,
31
+ default=20.0,
32
+ help="Initial value for the Lagrangian",
33
+ )
34
+ parser.add_argument(
35
+ "--lr",
36
+ type=float,
37
+ default=2e-5,
38
+ help="Learning rate for DiffMask.",
39
+ )
40
+ parser.add_argument(
41
+ "--eps",
42
+ type=float,
43
+ default=0.1,
44
+ help="KL divergence tolerance.",
45
+ )
46
+ parser.add_argument(
47
+ "--no_placeholder",
48
+ action="store_true",
49
+ help="Whether to not use placeholder",
50
+ )
51
+ parser.add_argument(
52
+ "--lr_placeholder",
53
+ type=float,
54
+ default=1e-3,
55
+ help="Learning for mask vectors.",
56
+ )
57
+ parser.add_argument(
58
+ "--lr_alpha",
59
+ type=float,
60
+ default=0.3,
61
+ help="Learning rate for lagrangian optimizer.",
62
+ )
63
+ parser.add_argument(
64
+ "--mul_activation",
65
+ type=float,
66
+ default=15.0,
67
+ help="Value to multiply gate activations.",
68
+ )
69
+ parser.add_argument(
70
+ "--add_activation",
71
+ type=float,
72
+ default=8.0,
73
+ help="Value to add to gate activations.",
74
+ )
75
+ parser.add_argument(
76
+ "--weighted_layer_distribution",
77
+ action="store_true",
78
+ help="Whether to use a weighted distribution when picking a layer in DiffMask forward.",
79
+ )
80
+ return parent_parser
81
+
82
+ # Declare variables that will be initialized later
83
+ model: ViTForImageClassification
84
+
85
+ def __init__(
86
+ self,
87
+ model_cfg: ViTConfig,
88
+ alpha: float = 1,
89
+ lr: float = 3e-4,
90
+ eps: float = 0.1,
91
+ eps_valid: float = 0.8,
92
+ acc_valid: float = 0.75,
93
+ lr_placeholder: float = 1e-3,
94
+ lr_alpha: float = 0.3,
95
+ mul_activation: float = 10.0,
96
+ add_activation: float = 5.0,
97
+ placeholder: bool = True,
98
+ weighted_layer_pred: bool = False,
99
+ ):
100
+ """A PyTorch Lightning Module for the VisionDiffMask model on the Vision Transformer.
101
+
102
+ Args:
103
+ model_cfg (ViTConfig): the configuration of the Vision Transformer model
104
+ alpha (float): the initial value for the Lagrangian
105
+ lr (float): the learning rate for the DiffMask gates
106
+ eps (float): the tolerance for the KL divergence
107
+ eps_valid (float): the tolerance for the KL divergence in the validation step
108
+ acc_valid (float): the accuracy threshold for the validation step
109
+ lr_placeholder (float): the learning rate for the learnable masking embeddings
110
+ lr_alpha (float): the learning rate for the Lagrangian
111
+ mul_activation (float): the value to multiply the gate activations by
112
+ add_activation (float): the value to add to the gate activations
113
+ placeholder (bool): whether to use placeholder embeddings or a zero vector
114
+ weighted_layer_pred (bool): whether to use a weighted distribution when picking a layer
115
+ """
116
+ super().__init__()
117
+
118
+ # Save the hyperparameters
119
+ self.save_hyperparameters()
120
+
121
+ # Create DiffMask instance
122
+ self.gate = DiffMaskGateInput(
123
+ hidden_size=model_cfg.hidden_size,
124
+ hidden_attention=model_cfg.hidden_size // 4,
125
+ num_hidden_layers=model_cfg.num_hidden_layers + 2,
126
+ max_position_embeddings=1,
127
+ mul_activation=mul_activation,
128
+ add_activation=add_activation,
129
+ placeholder=placeholder,
130
+ )
131
+
132
+ # Create the Lagrangian values for the dual optimization
133
+ self.alpha = torch.nn.ParameterList(
134
+ [
135
+ torch.nn.Parameter(torch.ones(()) * alpha)
136
+ for _ in range(model_cfg.num_hidden_layers + 2)
137
+ ]
138
+ )
139
+
140
+ # Register buffers for running metrics
141
+ self.register_buffer(
142
+ "running_acc", torch.ones((model_cfg.num_hidden_layers + 2,))
143
+ )
144
+ self.register_buffer(
145
+ "running_l0", torch.ones((model_cfg.num_hidden_layers + 2,))
146
+ )
147
+ self.register_buffer(
148
+ "running_steps", torch.zeros((model_cfg.num_hidden_layers + 2,))
149
+ )
150
+
151
+ def set_vision_transformer(self, model: ViTForImageClassification):
152
+ """Set the Vision Transformer model to be used with this module."""
153
+ # Save the model instance as a class attribute
154
+ self.model = model
155
+ # Freeze the model's parameters
156
+ for param in self.model.parameters():
157
+ param.requires_grad = False
158
+
159
+ def forward_explainer(
160
+ self, x: Tensor, attribution: bool = False
161
+ ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, int, int]:
162
+ """Performs a forward pass through the explainer (VisionDiffMask) model."""
163
+ # Get the original logits and hidden states from the model
164
+ logits_orig, hidden_states = vit_getter(self.model, x)
165
+
166
+ # Add [CLS] token to deal with shape mismatch in self.gate() call
167
+ patch_embeddings = hidden_states[0]
168
+ batch_size = len(patch_embeddings)
169
+ cls_tokens = self.model.vit.embeddings.cls_token.expand(batch_size, -1, -1)
170
+ hidden_states[0] = torch.cat((cls_tokens, patch_embeddings), dim=1)
171
+
172
+ # Select the layer to generate the mask from in this pass
173
+ n_hidden = len(hidden_states)
174
+ if self.hparams.weighted_layer_pred:
175
+ # If weighted layer prediction is enabled, use a weighted distribution
176
+ # instead of uniformly picking a layer after a number of steps
177
+ low_weight = (
178
+ lambda i: self.running_acc[i] > 0.75
179
+ and self.running_l0[i] < 0.1
180
+ and self.running_steps[i] > 100
181
+ )
182
+ layers = torch.tensor(list(range(n_hidden)))
183
+ p = torch.tensor([0.1 if low_weight(i) else 1 for i in range(n_hidden)])
184
+ p = p / p.sum()
185
+ idx = p.multinomial(num_samples=1)
186
+ layer_pred = layers[idx].item()
187
+ else:
188
+ layer_pred = torch.randint(n_hidden, ()).item()
189
+
190
+ # Set the layer to drop to 0, since we are only interested in masking the input
191
+ layer_drop = 0
192
+
193
+ (
194
+ new_hidden_state,
195
+ gates,
196
+ expected_L0,
197
+ gates_full,
198
+ expected_L0_full,
199
+ ) = self.gate(
200
+ hidden_states=hidden_states,
201
+ layer_pred=None
202
+ if attribution
203
+ else layer_pred, # if attribution, we get all the hidden states
204
+ )
205
+
206
+ # Create the list of the new hidden states for the new forward pass
207
+ new_hidden_states = (
208
+ [None] * layer_drop
209
+ + [new_hidden_state]
210
+ + [None] * (n_hidden - layer_drop - 1)
211
+ )
212
+
213
+ # Get the new logits from the masked input
214
+ logits, _ = vit_setter(self.model, x, new_hidden_states)
215
+
216
+ return (
217
+ logits,
218
+ logits_orig,
219
+ gates,
220
+ expected_L0,
221
+ gates_full,
222
+ expected_L0_full,
223
+ layer_drop,
224
+ layer_pred,
225
+ )
226
+
227
+ def get_mask(self, x: Tensor,
228
+ idx: int = -1,
229
+ aggregated_mask: bool = True,
230
+ ) -> dict[str, Tensor]:
231
+ """
232
+ Generates a mask for the given input.
233
+ Args:
234
+ x: the input to generate the mask for
235
+ idx: the index of the layer to generate the mask from
236
+ aggregated_mask: whether to use an aggregative mask from each layer
237
+ Returns:
238
+ a dictionary containing the mask, kl divergence and the predicted class
239
+ """
240
+
241
+ # Pass from forward explainer with attribution=True
242
+ (
243
+ logits,
244
+ logits_orig,
245
+ gates,
246
+ expected_L0,
247
+ gates_full,
248
+ expected_L0_full,
249
+ layer_drop,
250
+ layer_pred,
251
+ ) = self.forward_explainer(x, attribution=True)
252
+
253
+ # Calculate KL-divergence
254
+ kl_div = torch.distributions.kl_divergence(
255
+ torch.distributions.Categorical(logits=logits_orig),
256
+ torch.distributions.Categorical(logits=logits),
257
+ )
258
+
259
+ # Get predicted class
260
+ pred_class = logits.argmax(-1)
261
+
262
+ # Calculate mask
263
+ if aggregated_mask:
264
+ mask = expected_L0_full[:, :, idx].exp()
265
+ else:
266
+ mask = gates_full[:, :, idx]
267
+
268
+ mask = mask[:, 1:]
269
+
270
+ C, H, W = x.shape[1:] # channels, height, width
271
+ B, P = mask.shape # batch, patches
272
+ N = int(sqrt(P)) # patches per side
273
+ S = int(H / N) # patch size
274
+
275
+ # Reshape mask to match input shape
276
+ mask = mask.reshape(B, 1, N, N)
277
+ mask = F.interpolate(mask, scale_factor=S)
278
+ mask = mask.reshape(B, H, W)
279
+
280
+ return {"mask": mask, "kl_div": kl_div, "pred_class": pred_class}
281
+
282
+ def forward(self, x: Tensor) -> Tensor:
283
+ return self.model(x).logits
284
+
285
+ def training_step(self, batch: tuple[Tensor, Tensor], *args, **kwargs) -> dict:
286
+ # Unpack the batch
287
+ x, y = batch
288
+
289
+ # Pass the batch through the explainer (VisionDiffMask) model
290
+ (
291
+ logits,
292
+ logits_orig,
293
+ gates,
294
+ expected_L0,
295
+ gates_full,
296
+ expected_L0_full,
297
+ layer_drop,
298
+ layer_pred,
299
+ ) = self.forward_explainer(x)
300
+
301
+ # Calculate the KL-divergence loss term
302
+ loss_c = (
303
+ torch.distributions.kl_divergence(
304
+ torch.distributions.Categorical(logits=logits_orig),
305
+ torch.distributions.Categorical(logits=logits),
306
+ )
307
+ - self.hparams.eps
308
+ )
309
+
310
+ # Calculate the L0 loss term
311
+ loss_g = expected_L0.mean(-1)
312
+
313
+ # Calculate the full loss term
314
+ loss = self.alpha[layer_pred] * loss_c + loss_g
315
+
316
+ # Calculate the accuracy
317
+ acc, _, _, _ = accuracy_precision_recall_f1(
318
+ logits.argmax(-1), logits_orig.argmax(-1), average=True
319
+ )
320
+
321
+ # Calculate the average L0 loss
322
+ l0 = expected_L0.exp().mean(-1)
323
+
324
+ outputs_dict = {
325
+ "loss_c": loss_c.mean(-1),
326
+ "loss_g": loss_g.mean(-1),
327
+ "alpha": self.alpha[layer_pred].mean(-1),
328
+ "acc": acc,
329
+ "l0": l0.mean(-1),
330
+ "layer_pred": layer_pred,
331
+ "r_acc": self.running_acc[layer_pred],
332
+ "r_l0": self.running_l0[layer_pred],
333
+ "r_steps": self.running_steps[layer_pred],
334
+ "debug_loss": loss.mean(-1),
335
+ }
336
+
337
+ outputs_dict = {
338
+ "loss": loss.mean(-1),
339
+ **outputs_dict,
340
+ "log": outputs_dict,
341
+ "progress_bar": outputs_dict,
342
+ }
343
+
344
+ self.log(
345
+ "loss", outputs_dict["loss"], on_step=True, on_epoch=True, prog_bar=True
346
+ )
347
+ self.log(
348
+ "loss_c", outputs_dict["loss_c"], on_step=True, on_epoch=True, prog_bar=True
349
+ )
350
+ self.log(
351
+ "loss_g", outputs_dict["loss_g"], on_step=True, on_epoch=True, prog_bar=True
352
+ )
353
+ self.log("acc", outputs_dict["acc"], on_step=True, on_epoch=True, prog_bar=True)
354
+ self.log("l0", outputs_dict["l0"], on_step=True, on_epoch=True, prog_bar=True)
355
+ self.log(
356
+ "alpha", outputs_dict["alpha"], on_step=True, on_epoch=True, prog_bar=True
357
+ )
358
+
359
+ outputs_dict = {
360
+ "{}{}".format("" if self.training else "val_", k): v
361
+ for k, v in outputs_dict.items()
362
+ }
363
+
364
+ if self.training:
365
+ self.running_acc[layer_pred] = (
366
+ self.running_acc[layer_pred] * 0.9 + acc * 0.1
367
+ )
368
+ self.running_l0[layer_pred] = (
369
+ self.running_l0[layer_pred] * 0.9 + l0.mean(-1) * 0.1
370
+ )
371
+ self.running_steps[layer_pred] += 1
372
+
373
+ return outputs_dict
374
+
375
+ def validation_epoch_end(self, outputs: list[dict]):
376
+ outputs_dict = {
377
+ k: [e[k] for e in outputs if k in e]
378
+ for k in ("val_loss_c", "val_loss_g", "val_acc", "val_l0")
379
+ }
380
+
381
+ outputs_dict = {k: sum(v) / len(v) for k, v in outputs_dict.items()}
382
+
383
+ outputs_dict["val_loss_c"] += self.hparams.eps
384
+
385
+ outputs_dict = {
386
+ "val_loss": outputs_dict["val_l0"]
387
+ if outputs_dict["val_loss_c"] <= self.hparams.eps_valid
388
+ and outputs_dict["val_acc"] >= self.hparams.acc_valid
389
+ else torch.full_like(outputs_dict["val_l0"], float("inf")),
390
+ **outputs_dict,
391
+ "log": outputs_dict,
392
+ }
393
+
394
+ return outputs_dict
395
+
396
+ def configure_optimizers(self) -> tuple[list[Optimizer], list[_LRScheduler]]:
397
+ optimizers = [
398
+ LookaheadAdam(
399
+ params=[
400
+ {
401
+ "params": self.gate.g_hat.parameters(),
402
+ "lr": self.hparams.lr,
403
+ },
404
+ {
405
+ "params": self.gate.placeholder.parameters()
406
+ if isinstance(self.gate.placeholder, torch.nn.ParameterList)
407
+ else [self.gate.placeholder],
408
+ "lr": self.hparams.lr_placeholder,
409
+ },
410
+ ],
411
+ # centered=True, # this is for LookaheadRMSprop
412
+ ),
413
+ LookaheadAdam(
414
+ params=[self.alpha]
415
+ if isinstance(self.alpha, torch.Tensor)
416
+ else self.alpha.parameters(),
417
+ lr=self.hparams.lr_alpha,
418
+ ),
419
+ ]
420
+
421
+ schedulers = [
422
+ {
423
+ "scheduler": get_constant_schedule_with_warmup(optimizers[0], 12 * 100),
424
+ "interval": "step",
425
+ },
426
+ get_constant_schedule(optimizers[1]),
427
+ ]
428
+ return optimizers, schedulers
429
+
430
+ def optimizer_step(
431
+ self,
432
+ epoch: int,
433
+ batch_idx: int,
434
+ optimizer: Union[Optimizer, LightningOptimizer],
435
+ optimizer_idx: int = 0,
436
+ optimizer_closure: Optional[callable] = None,
437
+ on_tpu: bool = False,
438
+ using_native_amp: bool = False,
439
+ using_lbfgs: bool = False,
440
+ ):
441
+ # Optimizer 0: Minimize loss w.r.t. DiffMask's parameters
442
+ if optimizer_idx == 0:
443
+ # Gradient descent on DiffMask's parameters
444
+ optimizer.step(closure=optimizer_closure)
445
+ optimizer.zero_grad()
446
+ for g in optimizer.param_groups:
447
+ for p in g["params"]:
448
+ p.grad = None
449
+
450
+ # Optimizer 1: Maximize loss w.r.t. the Lagrangian
451
+ elif optimizer_idx == 1:
452
+ # Reverse the sign of the Lagrangian's gradients
453
+ for i in range(len(self.alpha)):
454
+ if self.alpha[i].grad:
455
+ self.alpha[i].grad *= -1
456
+
457
+ # Gradient ascent on the Lagrangian
458
+ optimizer.step(closure=optimizer_closure)
459
+ optimizer.zero_grad()
460
+ for g in optimizer.param_groups:
461
+ for p in g["params"]:
462
+ p.grad = None
463
+
464
+ # Clip the Lagrangian's values
465
+ for i in range(len(self.alpha)):
466
+ self.alpha[i].data = torch.where(
467
+ self.alpha[i].data < 0,
468
+ torch.full_like(self.alpha[i].data, 0),
469
+ self.alpha[i].data,
470
+ )
471
+ self.alpha[i].data = torch.where(
472
+ self.alpha[i].data > 200,
473
+ torch.full_like(self.alpha[i].data, 200),
474
+ self.alpha[i].data,
475
+ )
476
+
477
+ def on_save_checkpoint(self, ckpt: dict):
478
+ # Remove the ViT from the checkpoint, since it can be re-attached dynamically at load time
479
+ keys = list(ckpt["state_dict"].keys())
480
+ for key in keys:
481
+ if key.startswith("model."):
482
+ del ckpt["state_dict"][key]
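Note on the checkpoint hook above: because on_save_checkpoint strips the frozen ViT weights, a reloaded DiffMask checkpoint needs the backbone re-attached via set_vision_transformer before get_mask can be used. A minimal sketch of that round trip follows; the import path, checkpoint path and ViT weights name are placeholder assumptions, not values from this diff.

# illustrative sketch -- paths and model names are assumptions
from models.interpretation import ImageInterpretationNet  # assumes ./code is on sys.path
from transformers import ViTForImageClassification
import torch

vit = ViTForImageClassification.from_pretrained("<vit-checkpoint-name>")
diffmask = ImageInterpretationNet.load_from_checkpoint("<path/to/diffmask.ckpt>")
diffmask.set_vision_transformer(vit)        # re-attach the frozen backbone
out = diffmask.get_mask(x)                  # x: a preprocessed (B, 3, 224, 224) batch
mask, kl_div = out["mask"], out["kl_div"]   # per-pixel mask in [0, 1] and the KL to the original prediction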
code/models/utils.py ADDED
@@ -0,0 +1,64 @@
1
+ from datamodules.utils import get_configs
2
+ from transformers import (
3
+ ConvNextConfig,
4
+ ConvNextForImageClassification,
5
+ PreTrainedModel,
6
+ ViTConfig,
7
+ ViTForImageClassification,
8
+ )
9
+
10
+ import argparse
11
+ import torch
12
+
13
+
14
+ def set_clf_head(base: PreTrainedModel, num_classes: int):
15
+ """Set the classification head of the model in case of an output mismatch.
16
+
17
+ Args:
18
+ base (PreTrainedModel): the model to modify
19
+ num_classes (int): the number of classes to use for the output layer
20
+ """
21
+ if base.classifier.out_features != num_classes:
22
+ in_features = base.classifier.in_features
23
+ base.classifier = torch.nn.Linear(in_features, num_classes)
24
+
25
+
26
+ def model_factory(
27
+ args: argparse.Namespace,
28
+ own_config: bool = False,
29
+ ) -> PreTrainedModel:
30
+ """A factory method for creating a HuggingFace model based on the command line args.
31
+
32
+ Args:
33
+ args (Namespace): the argparse Namespace object
34
+ own_config (bool): whether to create our own model config instead of a pretrained one;
35
+ this is recommended when the model was pre-trained on another task with a different
36
+ amount of classes for its classifier head
37
+
38
+ Returns:
39
+ a PreTrainedModel instance
40
+ """
41
+ if args.base_model == "ViT":
42
+ # Create a new Vision Transformer
43
+ config_class = ViTConfig
44
+ base_class = ViTForImageClassification
45
+ elif args.base_model == "ConvNeXt":
46
+ # Create a new ConvNext model
47
+ config_class = ConvNextConfig
48
+ base_class = ConvNextForImageClassification
49
+ else:
50
+ raise Exception(f"Unknown base model: {args.base_model}")
51
+
52
+ # Get the model config
53
+ model_cfg_args, _ = get_configs(args)
54
+ if not own_config and args.from_pretrained:
55
+ # Create a model from a pretrained model
56
+ base = base_class.from_pretrained(args.from_pretrained)
57
+ # Set the classifier head if needed
58
+ set_clf_head(base, model_cfg_args["num_labels"])
59
+ else:
60
+ # Create a model based on the config
61
+ config = config_class(**model_cfg_args)
62
+ base = base_class(config)
63
+
64
+ return base
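As a concrete illustration of set_clf_head, loading a backbone that was pre-trained with a different number of labels and swapping in a fresh head might look like the sketch below; the checkpoint name and class count are assumptions for illustration only.

# illustrative sketch -- checkpoint name and label count are assumptions
from transformers import ViTForImageClassification

base = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")  # ships with a 1000-way head
set_clf_head(base, num_classes=10)          # replace the classifier with a freshly initialised 10-way linear layer
assert base.classifier.out_features == 10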
code/train_base.py ADDED
@@ -0,0 +1,123 @@
1
+ import argparse
2
+ import pytorch_lightning as pl
3
+
4
+ from datamodules import CIFAR10QADataModule, ImageDataModule
5
+ from datamodules.utils import datamodule_factory
6
+ from models import ImageClassificationNet
7
+ from models.utils import model_factory
8
+ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
9
+ from pytorch_lightning.loggers import WandbLogger
10
+
11
+
12
+ def main(args: argparse.Namespace):
13
+ # Seed
14
+ pl.seed_everything(args.seed)
15
+
16
+ # Create base model
17
+ base = model_factory(args)
18
+
19
+ # Load datamodule
20
+ dm = datamodule_factory(args)
21
+ dm.prepare_data()
22
+ dm.setup("fit")
23
+
24
+ if args.checkpoint:
25
+ # Load the model from the specified checkpoint
26
+ model = ImageClassificationNet.load_from_checkpoint(args.checkpoint, model=base)
27
+ else:
28
+ # Create a new instance of the classification model
29
+ model = ImageClassificationNet(
30
+ model=base,
31
+ num_train_steps=args.num_epochs * len(dm.train_dataloader()),
32
+ optimizer=args.optimizer,
33
+ weight_decay=args.weight_decay,
34
+ lr=args.lr,
35
+ )
36
+
37
+ # Create wandb logger
38
+ wandb_logger = WandbLogger(
39
+ name=f"{args.dataset}_training_{args.base_model} ({args.from_pretrained})",
40
+ project="Patch-DiffMask",
41
+ )
42
+
43
+ # Create checkpoint callback
44
+ ckpt_cb = ModelCheckpoint(dirpath=f"checkpoints/{wandb_logger.version}")
45
+ # Create early stopping callback
46
+ es_cb = EarlyStopping(monitor="val_acc", mode="max", patience=5)
47
+
48
+ # Create trainer
49
+ trainer = pl.Trainer(
50
+ accelerator="auto",
51
+ callbacks=[ckpt_cb, es_cb],
52
+ logger=wandb_logger,
53
+ max_epochs=args.num_epochs,
54
+ enable_progress_bar=args.enable_progress_bar,
55
+ )
56
+
57
+ trainer_args = {}
58
+ if args.checkpoint:
59
+ # Resume trainer from checkpoint
60
+ trainer_args["ckpt_path"] = args.checkpoint
61
+
62
+ # Train the model
63
+ trainer.fit(model, dm, **trainer_args)
64
+
65
+
66
+ if __name__ == "__main__":
67
+ parser = argparse.ArgumentParser()
68
+
69
+ parser.add_argument(
70
+ "--checkpoint",
71
+ type=str,
72
+ help="Checkpoint to resume the training from.",
73
+ )
74
+
75
+ # Trainer
76
+ parser.add_argument(
77
+ "--enable_progress_bar",
78
+ action="store_true",
79
+ help="Whether to show progress bar during training. NOT recommended when logging to files.",
80
+ )
81
+ parser.add_argument(
82
+ "--num_epochs",
83
+ type=int,
84
+ default=5,
85
+ help="Number of epochs to train.",
86
+ )
87
+ parser.add_argument(
88
+ "--seed",
89
+ type=int,
90
+ default=123,
91
+ help="Random seed for reproducibility.",
92
+ )
93
+
94
+ # Base (classification) model
95
+ ImageClassificationNet.add_model_specific_args(parser)
96
+ parser.add_argument(
97
+ "--base_model",
98
+ type=str,
99
+ default="ViT",
100
+ choices=["ViT", "ConvNeXt"],
101
+ help="Base model architecture to train.",
102
+ )
103
+ parser.add_argument(
104
+ "--from_pretrained",
105
+ type=str,
106
+ # default="tanlq/vit-base-patch16-224-in21k-finetuned-cifar10",
107
+ help="The name of the pretrained HF model to fine-tune from.",
108
+ )
109
+
110
+ # Datamodule
111
+ ImageDataModule.add_model_specific_args(parser)
112
+ CIFAR10QADataModule.add_model_specific_args(parser)
113
+ parser.add_argument(
114
+ "--dataset",
115
+ type=str,
116
+ default="toy",
117
+ choices=["MNIST", "CIFAR10", "CIFAR10_QA", "toy"],
118
+ help="The dataset to use.",
119
+ )
120
+
121
+ args = parser.parse_args()
122
+
123
+ main(args)
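Put together, the script above can be launched directly; for example, an invocation along the lines of python code/train_base.py --dataset CIFAR10 --base_model ViT --num_epochs 5 --enable_progress_bar would train a ViT classifier on CIFAR-10 and log it to the Patch-DiffMask W&B project (assuming the remaining optimizer and datamodule flags registered via add_model_specific_args all have defaults). Passing --from_pretrained with a HuggingFace checkpoint name fine-tunes that checkpoint instead of training from scratch.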
code/utils/__init__.py ADDED
File without changes
code/utils/distributions.py ADDED
@@ -0,0 +1,64 @@
1
+ """
2
+ File copied from
3
+ https://github.com/nicola-decao/diffmask/blob/master/diffmask/models/distributions.py
4
+ """
5
+
6
+ import torch
7
+ import torch.distributions as distr
8
+ import torch.nn.functional as F
9
+
10
+ from torch import Tensor
11
+
12
+
13
+ class BinaryConcrete(distr.relaxed_bernoulli.RelaxedBernoulli):
14
+ def __init__(self, temperature: Tensor, logits: Tensor):
15
+ super().__init__(temperature=temperature, logits=logits)
16
+ self.device = self.temperature.device
17
+
18
+ def cdf(self, value: Tensor) -> Tensor:
19
+ return torch.sigmoid(
20
+ (torch.log(value) - torch.log(1.0 - value)) * self.temperature - self.logits
21
+ )
22
+
23
+ def log_prob(self, value: Tensor) -> Tensor:
24
+ return torch.where(
25
+ (value > 0) & (value < 1),
26
+ super().log_prob(value),
27
+ torch.full_like(value, -float("inf")),
28
+ )
29
+
30
+ def log_expected_L0(self, value: Tensor) -> Tensor:
31
+ return -F.softplus(
32
+ (torch.log(value) - torch.log(1 - value)) * self.temperature - self.logits
33
+ )
34
+
35
+
36
+ class Streched(distr.TransformedDistribution):
37
+ def __init__(self, base_dist, l: float = -0.1, r: float = 1.1):
38
+ super().__init__(base_dist, distr.AffineTransform(loc=l, scale=r - l))
39
+
40
+ def log_expected_L0(self) -> Tensor:
41
+ value = torch.tensor(0.0, device=self.base_dist.device)
42
+ for transform in self.transforms[::-1]:
43
+ value = transform.inv(value)
44
+ if self._validate_args:
45
+ self.base_dist._validate_sample(value)
46
+ value = self.base_dist.log_expected_L0(value)
47
+ value = self._monotonize_cdf(value)
48
+ return value
49
+
50
+ def expected_L0(self) -> Tensor:
51
+ return self.log_expected_L0().exp()
52
+
53
+
54
+ class RectifiedStreched(Streched):
55
+ def __init__(self, *args, **kwargs):
56
+ super().__init__(*args, **kwargs)
57
+
58
+ @torch.no_grad()
59
+ def sample(self, sample_shape: torch.Size = torch.Size([])) -> Tensor:
60
+ return self.rsample(sample_shape)
61
+
62
+ def rsample(self, sample_shape: torch.Size = torch.Size([])) -> Tensor:
63
+ x = super().rsample(sample_shape)
64
+ return x.clamp(0, 1)
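These two classes implement the rectified, stretched Binary Concrete ("hard concrete") distribution that DiffMask samples its gates from. The sketch below shows roughly how they compose; the temperature, stretch bounds and tensor shapes are illustrative assumptions rather than values taken from this repository.

# illustrative sketch -- temperature, bounds and shapes are assumptions
import torch
from utils.distributions import BinaryConcrete, RectifiedStreched  # assumes ./code is on sys.path

logits = torch.randn(4, 197)    # one gate logit per token
dist = RectifiedStreched(BinaryConcrete(torch.tensor([0.2]), logits), l=-0.2, r=1.0)
gates = dist.rsample()          # differentiable samples in [0, 1], with point masses at exactly 0 and 1
exp_l0 = dist.expected_L0()     # probability that each gate is non-zero, used as the sparsity penalty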
code/utils/getters_setters.py ADDED
@@ -0,0 +1,122 @@
1
+ from torch import Tensor
2
+ from torch.nn import Module
3
+ from torch.utils.hooks import RemovableHandle
4
+ from transformers import ViTForImageClassification
5
+ from typing import Optional, Union
6
+
7
+
8
+ def _add_hooks(
9
+ model: ViTForImageClassification, get_hook: callable
10
+ ) -> list[RemovableHandle]:
11
+ """Adds a list of hooks to the model according to the get_hook function provided.
12
+
13
+ Args:
14
+ model (ViTForImageClassification): the ViT instance to add hooks to
15
+ get_hook (callable): a function that takes an index and returns a hook
16
+
17
+ Returns:
18
+ a list of RemovableHandle instances
19
+ """
20
+ return (
21
+ [model.vit.embeddings.patch_embeddings.register_forward_hook(get_hook(0))]
22
+ + [
23
+ layer.register_forward_pre_hook(get_hook(i + 1))
24
+ for i, layer in enumerate(model.vit.encoder.layer)
25
+ ]
26
+ + [
27
+ model.vit.encoder.layer[-1].register_forward_hook(
28
+ get_hook(len(model.vit.encoder.layer) + 1)
29
+ )
30
+ ]
31
+ )
32
+
33
+
34
+ def vit_getter(
35
+ model: ViTForImageClassification, x: Tensor
36
+ ) -> tuple[Tensor, list[Tensor]]:
37
+ """A function that returns the logits and hidden states of the model.
38
+
39
+ Args:
40
+ model (ViTForImageClassification): the ViT instance to use for the forward pass
41
+ x (Tensor): the input to the model
42
+
43
+ Returns:
44
+ a tuple of the model's logits and hidden states
45
+ """
46
+ hidden_states_ = []
47
+
48
+ def get_hook(i: int) -> callable:
49
+ def hook(_: Module, inputs: tuple, outputs: Optional[tuple] = None):
50
+ if i == 0:
51
+ hidden_states_.append(outputs)
52
+ elif 1 <= i <= len(model.vit.encoder.layer):
53
+ hidden_states_.append(inputs[0])
54
+ elif i == len(model.vit.encoder.layer) + 1:
55
+ hidden_states_.append(outputs[0])
56
+
57
+ return hook
58
+
59
+ handles = _add_hooks(model, get_hook)
60
+ try:
61
+ logits = model(x).logits
62
+ finally:
63
+ for handle in handles:
64
+ handle.remove()
65
+
66
+ return logits, hidden_states_
67
+
68
+
69
+ def vit_setter(
70
+ model: ViTForImageClassification, x: Tensor, hidden_states: list[Optional[Tensor]]
71
+ ) -> tuple[Tensor, list[Tensor]]:
72
+ """A function that sets some of the model's hidden states and returns its (new) logits
73
+ and hidden states after another forward pass.
74
+
75
+ Args:
76
+ model (ViTForImageClassification): the ViT instance to use for the forward pass
77
+ x (Tensor): the input to the model
78
+ hidden_states (list[Optional[Tensor]]): a list, with each element corresponding to
79
+ a hidden state to set or None to calculate anew for that index
80
+
81
+ Returns:
82
+ a tuple of the model's logits and (new) hidden states
83
+ """
84
+ hidden_states_ = []
85
+
86
+ def get_hook(i: int) -> callable:
87
+ def hook(
88
+ _: Module, inputs: tuple, outputs: Optional[tuple] = None
89
+ ) -> Optional[Union[tuple, Tensor]]:
90
+ if i == 0:
91
+ if hidden_states[i] is not None:
92
+ # print(hidden_states[i].shape)
93
+ hidden_states_.append(hidden_states[i][:, 1:])
94
+ return hidden_states_[-1]
95
+ else:
96
+ hidden_states_.append(outputs)
97
+
98
+ elif 1 <= i <= len(model.vit.encoder.layer):
99
+ if hidden_states[i] is not None:
100
+ hidden_states_.append(hidden_states[i])
101
+ return (hidden_states[i],) + inputs[1:]
102
+ else:
103
+ hidden_states_.append(inputs[0])
104
+
105
+ elif i == len(model.vit.encoder.layer) + 1:
106
+ if hidden_states[i] is not None:
107
+ hidden_states_.append(hidden_states[i])
108
+ return (hidden_states[i],) + outputs[1:]
109
+ else:
110
+ hidden_states_.append(outputs[0])
111
+
112
+ return hook
113
+
114
+ handles = _add_hooks(model, get_hook)
115
+
116
+ try:
117
+ logits = model(x).logits
118
+ finally:
119
+ for handle in handles:
120
+ handle.remove()
121
+
122
+ return logits, hidden_states_
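In short, vit_getter records every intermediate hidden state during an ordinary forward pass, and vit_setter re-runs the model with some of those states substituted. A small sketch of the round trip follows; the checkpoint name and input shape are assumptions for illustration.

# illustrative sketch -- checkpoint name and shapes are assumptions
import torch
from transformers import ViTForImageClassification

vit = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
x = torch.rand(1, 3, 224, 224)
logits, hidden = vit_getter(vit, x)         # hidden[0]: patch embeddings, hidden[i]: input to encoder layer i
states = [None] * len(hidden)               # None means "recompute this hidden state as usual"
states[1] = torch.zeros_like(hidden[1])     # e.g. feed a blanked-out input to the first encoder layer
new_logits, _ = vit_setter(vit, x, states)  # logits obtained after the substitution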
code/utils/metrics.py ADDED
@@ -0,0 +1,67 @@
1
+ """
2
+ File copied from
3
+ https://github.com/nicola-decao/diffmask/blob/master/diffmask/utils/util.py
4
+ """
5
+
6
+ import torch
7
+
8
+ from torch import Tensor
9
+
10
+
11
+ def accuracy_precision_recall_f1(
12
+ y_pred: Tensor, y_true: Tensor, average: bool = True
13
+ ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
14
+ """Calculates the accuracy, precision, recall and f1 score given the predicted and true labels.
15
+
16
+ Args:
17
+ y_pred (Tensor): predicted labels
18
+ y_true (Tensor): true labels
19
+ average (bool): whether to average the scores or not
20
+
21
+ Returns:
22
+ a tuple of the accuracy, precision, recall and f1 score
23
+ """
24
+ M = confusion_matrix(y_pred, y_true)
25
+
26
+ tp = M.diagonal(dim1=-2, dim2=-1).float()
27
+
28
+ precision_den = M.sum(-2)
29
+ precision = torch.where(
30
+ precision_den == 0, torch.zeros_like(tp), tp / precision_den
31
+ )
32
+
33
+ recall_den = M.sum(-1)
34
+ recall = torch.where(recall_den == 0, torch.ones_like(tp), tp / recall_den)
35
+
36
+ f1_den = precision + recall
37
+ f1 = torch.where(
38
+ f1_den == 0, torch.zeros_like(tp), 2 * (precision * recall) / f1_den
39
+ )
40
+
41
+ # noinspection PyTypeChecker
42
+ return ((y_pred == y_true).float().mean(-1),) + (
43
+ tuple(e.mean(-1) for e in (precision, recall, f1))
44
+ if average
45
+ else (precision, recall, f1)
46
+ )
47
+
48
+
49
+ def confusion_matrix(y_pred: Tensor, y_true: Tensor) -> Tensor:
50
+ """Creates a confusion matrix given the predicted and true labels."""
51
+ device = y_pred.device
52
+ labels = max(y_pred.max().item() + 1, y_true.max().item() + 1)
53
+
54
+ return (
55
+ (
56
+ torch.stack((y_true, y_pred), -1).unsqueeze(-2).unsqueeze(-2)
57
+ == torch.stack(
58
+ (
59
+ torch.arange(labels, device=device).unsqueeze(-1).repeat(1, labels),
60
+ torch.arange(labels, device=device).unsqueeze(-2).repeat(labels, 1),
61
+ ),
62
+ -1,
63
+ )
64
+ )
65
+ .all(-1)
66
+ .sum(-3)
67
+ )
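A tiny hand-checkable example of the averaged variant:

# illustrative example -- values can be verified by hand
import torch

y_pred = torch.tensor([0, 1, 1, 2])
y_true = torch.tensor([0, 1, 2, 2])
acc, p, r, f1 = accuracy_precision_recall_f1(y_pred, y_true, average=True)
# acc = 0.75; precision and recall are macro-averaged over the 3 classes (both ~0.83), f1 ~0.78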
code/utils/optimizer.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ File copied from
3
+ https://github.com/nicola-decao/diffmask/blob/master/diffmask/optim/lookahead.py
4
+ """
5
+
6
+ import torch
7
+ import torch.optim as optim
8
+
9
+ from collections import defaultdict
10
+ from torch import Tensor
11
+ from torch.optim.optimizer import Optimizer
12
+ from typing import Iterable, Optional, Union
13
+
14
+
15
+ _params_type = Union[Iterable[Tensor], Iterable[dict]]
16
+
17
+
18
+ class Lookahead(Optimizer):
19
+ """Lookahead optimizer: https://arxiv.org/abs/1907.08610"""
20
+
21
+ # noinspection PyMissingConstructor
22
+ def __init__(self, base_optimizer: Optimizer, alpha: float = 0.5, k: int = 6):
23
+ if not 0.0 <= alpha <= 1.0:
24
+ raise ValueError(f"Invalid slow update rate: {alpha}")
25
+ if not 1 <= k:
26
+ raise ValueError(f"Invalid lookahead steps: {k}")
27
+ defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0)
28
+ self.base_optimizer = base_optimizer
29
+ self.param_groups = self.base_optimizer.param_groups
30
+ self.defaults = base_optimizer.defaults
31
+ self.defaults.update(defaults)
32
+ self.state = defaultdict(dict)
33
+ # manually add our defaults to the param groups
34
+ for name, default in defaults.items():
35
+ for group in self.param_groups:
36
+ group.setdefault(name, default)
37
+
38
+ def update_slow(self, group: dict):
39
+ for fast_p in group["params"]:
40
+ if fast_p.grad is None:
41
+ continue
42
+ param_state = self.state[fast_p]
43
+ if "slow_buffer" not in param_state:
44
+ param_state["slow_buffer"] = torch.empty_like(fast_p.data)
45
+ param_state["slow_buffer"].copy_(fast_p.data)
46
+ slow = param_state["slow_buffer"]
47
+ slow.add_(fast_p.data - slow, alpha=group["lookahead_alpha"])
48
+ fast_p.data.copy_(slow)
49
+
50
+ def sync_lookahead(self):
51
+ for group in self.param_groups:
52
+ self.update_slow(group)
53
+
54
+ def step(self, closure: Optional[callable] = None) -> Optional[float]:
55
+ # print(self.k)
56
+ # assert id(self.param_groups) == id(self.base_optimizer.param_groups)
57
+ loss = self.base_optimizer.step(closure)
58
+ for group in self.param_groups:
59
+ group["lookahead_step"] += 1
60
+ if group["lookahead_step"] % group["lookahead_k"] == 0:
61
+ self.update_slow(group)
62
+ return loss
63
+
64
+ def state_dict(self) -> dict:
65
+ fast_state_dict = self.base_optimizer.state_dict()
66
+ slow_state = {
67
+ (id(k) if isinstance(k, torch.Tensor) else k): v
68
+ for k, v in self.state.items()
69
+ }
70
+ fast_state = fast_state_dict["state"]
71
+ param_groups = fast_state_dict["param_groups"]
72
+ return {
73
+ "state": fast_state,
74
+ "slow_state": slow_state,
75
+ "param_groups": param_groups,
76
+ }
77
+
78
+ def load_state_dict(self, state_dict: dict):
79
+ fast_state_dict = {
80
+ "state": state_dict["state"],
81
+ "param_groups": state_dict["param_groups"],
82
+ }
83
+ self.base_optimizer.load_state_dict(fast_state_dict)
84
+
85
+ # We want to restore the slow state, but share param_groups reference
86
+ # with base_optimizer. This is a bit redundant but least code
87
+ slow_state_new = False
88
+ if "slow_state" not in state_dict:
89
+ print("Loading state_dict from optimizer without Lookahead applied.")
90
+ state_dict["slow_state"] = defaultdict(dict)
91
+ slow_state_new = True
92
+ slow_state_dict = {
93
+ "state": state_dict["slow_state"],
94
+ "param_groups": state_dict[
95
+ "param_groups"
96
+ ], # this is pointless but saves code
97
+ }
98
+ super(Lookahead, self).load_state_dict(slow_state_dict)
99
+ self.param_groups = (
100
+ self.base_optimizer.param_groups
101
+ ) # make both ref same container
102
+ if slow_state_new:
103
+ # reapply defaults to catch missing lookahead specific ones
104
+ for name, default in self.defaults.items():
105
+ for group in self.param_groups:
106
+ group.setdefault(name, default)
107
+
108
+
109
+ def LookaheadAdam(
110
+ params: _params_type,
111
+ lr: float = 1e-3,
112
+ betas: tuple[float, float] = (0.9, 0.999),
113
+ eps: float = 1e-08,
114
+ weight_decay: float = 0,
115
+ amsgrad: bool = False,
116
+ lalpha: float = 0.5,
117
+ k: int = 6,
118
+ ):
119
+ return Lookahead(
120
+ torch.optim.Adam(params, lr, betas, eps, weight_decay, amsgrad), lalpha, k
121
+ )
122
+
123
+
124
+ def LookaheadRAdam(
125
+ params: _params_type,
126
+ lr: float = 1e-3,
127
+ betas: tuple[float, float] = (0.9, 0.999),
128
+ eps: float = 1e-8,
129
+ weight_decay: float = 0,
130
+ lalpha: float = 0.5,
131
+ k: int = 6,
132
+ ):
133
+ return Lookahead(optim.RAdam(params, lr, betas, eps, weight_decay), lalpha, k)
134
+
135
+
136
+ def LookaheadRMSprop(
137
+ params: _params_type,
138
+ lr: float = 1e-2,
139
+ alpha: float = 0.99,
140
+ eps: float = 1e-08,
141
+ weight_decay: float = 0,
142
+ momentum: float = 0,
143
+ centered: bool = False,
144
+ lalpha: float = 0.5,
145
+ k: int = 6,
146
+ ):
147
+ return Lookahead(
148
+ torch.optim.RMSprop(params, lr, alpha, eps, weight_decay, momentum, centered),
149
+ lalpha,
150
+ k,
151
+ )
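LookaheadAdam behaves as a drop-in replacement for Adam: the wrapped optimizer takes the fast steps, and every k steps the slow weights are pulled towards them by a factor of alpha. A minimal usage sketch on a toy model (not part of this repository):

# illustrative sketch -- toy model, not from this repository
import torch

model = torch.nn.Linear(10, 2)
opt = LookaheadAdam(model.parameters(), lr=1e-3)  # Adam inner steps; slow weights synced every k=6 steps
loss = model(torch.randn(4, 10)).sum()
loss.backward()
opt.step()                                        # fast (Adam) update; the slow update fires on every 6th call
opt.zero_grad()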
code/utils/plot.py ADDED
@@ -0,0 +1,252 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+
5
+ from pytorch_lightning import LightningModule
6
+ from pytorch_lightning.callbacks import Callback
7
+ from pytorch_lightning.loggers import WandbLogger
8
+ from pytorch_lightning.trainer import Trainer
9
+ from torch import Tensor
10
+
11
+
12
+ @torch.no_grad()
13
+ def unnormalize(
14
+ images: Tensor,
15
+ mean: tuple[float] = (0.5, 0.5, 0.5),
16
+ std: tuple[float] = (0.5, 0.5, 0.5),
17
+ ) -> Tensor:
18
+ """Reverts the normalization transformation applied before ViT.
19
+
20
+ Args:
21
+ images (Tensor): a batch of images
22
+ mean (tuple[float]): the means used for normalization - defaults to (0.5, 0.5, 0.5)
23
+ std (tuple[float]): the stds used for normalization - defaults to (0.5, 0.5, 0.5)
24
+
25
+ Returns:
26
+ the un-normalized batch of images
27
+ """
28
+ unnormalized_images = images.clone()
29
+ for i, (m, s) in enumerate(zip(mean, std)):
30
+ unnormalized_images[:, i, :, :].mul_(s).add_(m)
31
+
32
+ return unnormalized_images
33
+
34
+
35
+ @torch.no_grad()
36
+ def smoothen(mask: Tensor, patch_size: int = 16) -> Tensor:
37
+ """Smoothens a mask by downsampling it and re-upsampling it
38
+ with bi-linear interpolation.
39
+
40
+ Args:
41
+ mask (Tensor): a 2D float torch tensor with values in [0, 1]
42
+ patch_size (int): the patch size in pixels
43
+
44
+ Returns:
45
+ a smoothened mask at the pixel level
46
+ """
47
+ device = mask.device
48
+ (h, w) = mask.shape
49
+ mask = cv2.resize(
50
+ mask.cpu().numpy(),
51
+ (h // patch_size, w // patch_size),
52
+ interpolation=cv2.INTER_NEAREST,
53
+ )
54
+ mask = cv2.resize(mask, (h, w), interpolation=cv2.INTER_LINEAR)
55
+ return torch.tensor(mask).to(device)
56
+
57
+
58
+ @torch.no_grad()
59
+ def draw_mask_on_image(image: Tensor, mask: Tensor) -> Tensor:
60
+ """Overlays a dimming mask on the image.
61
+
62
+ Args:
63
+ image (Tensor): a float torch tensor with values in [0, 1]
64
+ mask (Tensor): a float torch tensor with values in [0, 1]
65
+
66
+ Returns:
67
+ the image with parts of it dimmed according to the mask
68
+ """
69
+ masked_image = image * mask
70
+
71
+ return masked_image
72
+
73
+
74
+ @torch.no_grad()
75
+ def draw_heatmap_on_image(
76
+ image: Tensor,
77
+ mask: Tensor,
78
+ colormap: int = cv2.COLORMAP_JET,
79
+ ) -> Tensor:
80
+ """Overlays a heatmap on the image.
81
+
82
+ Args:
83
+ image (Tensor): a float torch tensor with values in [0, 1]
84
+ mask (Tensor): a float torch tensor with values in [0, 1]
85
+ colormap (int): the OpenCV colormap to be used
86
+
87
+ Returns:
88
+ the image with the heatmap overlaid
89
+ """
90
+ # Save the device of the image
91
+ original_device = image.device
92
+
93
+ # Convert image & mask to numpy
94
+ image = image.permute(1, 2, 0).cpu().numpy()
95
+ mask = mask.cpu().numpy()
96
+
97
+ # Create heatmap
98
+ heatmap = cv2.applyColorMap(np.uint8(255 * mask), colormap)
99
+ heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
100
+ heatmap = np.float32(heatmap) / 255
101
+
102
+ # Overlay heatmap on image
103
+ masked_image = image + heatmap
104
+ masked_image = masked_image / np.max(masked_image)
105
+
106
+ return torch.tensor(masked_image).permute(2, 0, 1).to(original_device)
107
+
108
+
109
+ def _prepare_samples(images: Tensor, masks: Tensor) -> tuple[Tensor, list[float]]:
110
+ """Prepares the samples for the masking/heatmap visualization.
111
+
112
+ Args:
113
+ images (Tensor): a float torch tensor with values in [0, 1]
114
+ masks (Tensor): a float torch tensor with values in [0, 1]
115
+
116
+ Returns:
117
+ a tuple of image triplets (img, masked, heatmap) and their
118
+ corresponding masking percentages
119
+ """
120
+ num_channels = images[0].shape[0]
121
+
122
+ # Smoothen masks
123
+ masks = [smoothen(m) for m in masks]
124
+
125
+ # Un-normalize images
126
+ if num_channels == 1:
127
+ images = [
128
+ torch.repeat_interleave(img, 3, 0)
129
+ for img in unnormalize(images, mean=(0.5,), std=(0.5,))
130
+ ]
131
+ else:
132
+ images = [img for img in unnormalize(images)]
133
+
134
+ # Draw mask on sample images
135
+ images_with_mask = [
136
+ draw_mask_on_image(image, mask) for image, mask in zip(images, masks)
137
+ ]
138
+
139
+ # Draw heatmap on sample images
140
+ images_with_heatmap = [
141
+ draw_heatmap_on_image(image, mask) for image, mask in zip(images, masks)
142
+ ]
143
+
144
+ # Chunk to triplets (image, masked image, heatmap)
145
+ samples = torch.cat(
146
+ [
147
+ torch.cat(images, dim=2),
148
+ torch.cat(images_with_mask, dim=2),
149
+ torch.cat(images_with_heatmap, dim=2),
150
+ ],
151
+ dim=1,
152
+ ).chunk(len(images), dim=-1)
153
+
154
+ # Compute masking percentages
155
+ masked_pixels_percentages = [
156
+ 100 * (1 - torch.stack(masks)[i].mean(-1).mean(-1).item())
157
+ for i in range(len(masks))
158
+ ]
159
+
160
+ return samples, masked_pixels_percentages
161
+
162
+
163
+ def log_masks(images: Tensor, masks: Tensor, key: str, logger: WandbLogger):
164
+ """Logs a set of images with their masks to WandB.
165
+
166
+ Args:
167
+ images (Tensor): a float torch tensor with values in [0, 1]
168
+ masks (Tensor): a float torch tensor with values in [0, 1]
169
+ key (str): the key to log the images with
170
+ logger (WandbLogger): the logger to log the images to
171
+ """
172
+ samples, masked_pixels_percentages = _prepare_samples(images, masks)
173
+
174
+ # Log with wandb
175
+ logger.log_image(
176
+ key=key,
177
+ images=list(samples),
178
+ caption=[
179
+ f"Masking: {masked_pixels_percentage:.2f}% "
180
+ for masked_pixels_percentage in masked_pixels_percentages
181
+ ],
182
+ )
183
+
184
+
185
+ class DrawMaskCallback(Callback):
186
+ def __init__(
187
+ self,
188
+ samples: list[tuple[Tensor, Tensor]],
189
+ log_every_n_steps: int = 200,
190
+ key: str = "",
191
+ ):
192
+ """A callback that logs VisionDiffMask masks for the sample images to WandB.
193
+
194
+ Args:
195
+ samples (list[tuple[Tensor, Tensor]]): a list of (image, label) pairs
196
+ log_every_n_steps (int): the interval in steps to log the masks to WandB
197
+ key (str): the key to log the images with (allows for multiple batches)
198
+ """
199
+ self.images = torch.stack([img for img in samples[0]])
200
+ self.labels = [label.item() for label in samples[1]]
201
+ self.log_every_n_steps = log_every_n_steps
202
+ self.key = key
203
+
204
+ def _log_masks(self, trainer: Trainer, pl_module: LightningModule):
205
+ # Predict mask
206
+ with torch.no_grad():
207
+ pl_module.eval()
208
+ outputs = pl_module.get_mask(self.images)
209
+ pl_module.train()
210
+
211
+ # Unnest outputs
212
+ masks = outputs["mask"]
213
+ kl_divs = outputs["kl_div"]
214
+ pred_classes = outputs["pred_class"].cpu()
215
+
216
+ # Prepare masked samples for logging
217
+ samples, masked_pixels_percentages = _prepare_samples(self.images, masks)
218
+
219
+ # Log with wandb
220
+ trainer.logger.log_image(
221
+ key="DiffMask " + self.key,
222
+ images=list(samples),
223
+ caption=[
224
+ f"Masking: {masked_pixels_percentage:.2f}% "
225
+ f"\n KL-divergence: {kl_div:.4f} "
226
+ f"\n Class: {pl_module.model.config.id2label[label]} "
227
+ f"\n Predicted Class: {pl_module.model.config.id2label[pred_class.item()]}"
228
+ for masked_pixels_percentage, kl_div, label, pred_class in zip(
229
+ masked_pixels_percentages, kl_divs, self.labels, pred_classes
230
+ )
231
+ ],
232
+ )
233
+
234
+ def on_fit_start(self, trainer: Trainer, pl_module: LightningModule):
235
+ # Transfer sample images to correct device
236
+ self.images = self.images.to(pl_module.device)
237
+
238
+ # Log sample images
239
+ self._log_masks(trainer, pl_module)
240
+
241
+ def on_train_batch_end(
242
+ self,
243
+ trainer: Trainer,
244
+ pl_module: LightningModule,
245
+ outputs: dict,
246
+ batch: tuple[Tensor, Tensor],
247
+ batch_idx: int,
248
+ unused: int = 0,
249
+ ):
250
+ # Log sample images every n steps
251
+ if batch_idx % self.log_every_n_steps == 0:
252
+ self._log_masks(trainer, pl_module)
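Outside of training, the same drawing helpers used by the callback above can be applied directly to visualise a mask on an image. A small sketch with random data, purely for illustration:

# illustrative sketch -- random image and mask
import torch

image = torch.rand(3, 224, 224)                          # float image with values in [0, 1]
mask = torch.rand(224, 224)                              # per-pixel relevance in [0, 1]
dimmed = draw_mask_on_image(image, smoothen(mask))       # irrelevant pixels dimmed towards black
heatmap = draw_heatmap_on_image(image, smoothen(mask))   # JET heatmap blended onto the image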
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ numpy
2
+ opencv-python
3
+ pytorch_lightning
4
+ torch
5
+ torchvision
6
+ transformers
7
+