lucabarsellotti committed
Commit • 081a09c
Parent(s): 821c060

First commit

- freeda/__init__.py +1 -0
- freeda/configs/dinov2_vitb_clip_vitb.yaml +34 -0
- freeda/configs/dinov2_vitl_clip_vitl.yaml +34 -0
- freeda/configs/dinov2_vitl_clip_vitl_approx.yaml +34 -0
- freeda/models/freeda_model.py +519 -0
- freeda/models/mask_proposer/superpixel.py +66 -0
- freeda/models/vision_backbone.py +54 -0
- freeda/utils/factory.py +111 -0
- main.py +15 -0
- requirements.txt +9 -0

freeda/__init__.py ADDED
@@ -0,0 +1 @@
from .utils.factory import load

freeda/configs/dinov2_vitb_clip_vitb.yaml ADDED
@@ -0,0 +1,34 @@
data:
  collection_url: "https://drive.google.com/uc?id=10v1ZbbVjZQhA43F9JWju5chYnPjR2Ael"
  compression: "zip"
  index_url: "https://drive.google.com/uc?id=1AHE6YpY7sGQGp_wPetcr5-z8mk91kWtF"
  collection_length: 2166945
clip:
  model: ViT-B-16
  weights: openai
  templates:
    - "itap of a {}."
    - "a bad photo of a {}."
    - "a origami {}."
    - "a photo of the large {}."
    - "a {} in a video game."
    - "art of the {}."
    - "a photo of the small {}."

backbone:
  model: "vit_base_patch14_dinov2.lvd142m"
  img_size: 518
mask_proposer:
  use_mask_proposals: true
  method: "superpixel"
  args:
    algorithm: "felzenszwalb"
    scale: 100
    sigma: 1.0
    min_size: 100
freeda:
  global_local_ensemble: 0.8
  k_search: 350
  sliding_window: true
  with_background: false
  background_threshold: 0.48
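
For reference, a minimal sketch of how a config like the one above is consumed (assuming PyYAML and a local checkout with this repo layout; the printed values come straight from the file):

import yaml

# Load the YAML the same way freeda/utils/factory.py does (yaml.FullLoader).
with open("freeda/configs/dinov2_vitb_clip_vitb.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# These are the keys the FreeDA constructor reads.
print(config["clip"]["model"])                    # ViT-B-16
print(config["backbone"]["model"])                # vit_base_patch14_dinov2.lvd142m
print(config["freeda"]["global_local_ensemble"])  # 0.8 -> weight of local vs. global similarities
print(config["mask_proposer"]["args"])            # kwargs forwarded to skimage's felzenszwalb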

freeda/configs/dinov2_vitl_clip_vitl.yaml ADDED
@@ -0,0 +1,34 @@
data:
  collection_url: "https://drive.google.com/uc?id=1U4d0exJuq29b0rLR6iOT20ErW3DAmgw0"
  compression: "tar"
  index_url: "https://drive.google.com/uc?id=1FHjpM0aqPf9OjiuG_341EMlEuq6hsh6L"
  collection_length: 2166946
clip:
  model: ViT-L-14
  weights: openai
  templates:
    - "itap of a {}."
    - "a bad photo of a {}."
    - "a origami {}."
    - "a photo of the large {}."
    - "a {} in a video game."
    - "art of the {}."
    - "a photo of the small {}."

backbone:
  model: "vit_large_patch14_dinov2.lvd142m"
  img_size: 518
mask_proposer:
  use_mask_proposals: true
  method: "superpixel"
  args:
    algorithm: "felzenszwalb"
    scale: 100
    sigma: 1.0
    min_size: 100
freeda:
  global_local_ensemble: 0.8
  k_search: 350
  sliding_window: true
  with_background: false
  background_threshold: 0.48

freeda/configs/dinov2_vitl_clip_vitl_approx.yaml ADDED
@@ -0,0 +1,34 @@
data:
  collection_url: "https://drive.google.com/uc?id=1U4d0exJuq29b0rLR6iOT20ErW3DAmgw0"
  compression: "tar"
  index_url: "https://drive.google.com/uc?id=1LGBnwu8g2PDzIlgyd7gLqkD02wS9r8Hp"
  collection_length: 2166946
clip:
  model: ViT-L-14
  weights: openai
  templates:
    - "itap of a {}."
    - "a bad photo of a {}."
    - "a origami {}."
    - "a photo of the large {}."
    - "a {} in a video game."
    - "art of the {}."
    - "a photo of the small {}."
backbone:
  model: "vit_large_patch14_dinov2.lvd142m"
  img_size: 518
mask_proposer:
  use_mask_proposals: true
  method: "superpixel"
  args:
    algorithm: "felzenszwalb"
    scale: 100
    sigma: 1.0
    min_size: 100
freeda:
  global_local_ensemble: 0.8
  k_search: 350
  ef_search: 4096
  sliding_window: true
  with_background: false
  background_threshold: 0.5
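
The only differences from the exact ViT-L config are the index URL, the extra ef_search key, and a slightly higher background threshold. ef_search only matters when the downloaded index is a faiss HNSW index; a minimal sketch of what the model does with it (assuming a local knn.index file, and using an isinstance check in place of the string comparison in freeda_model.py):

import faiss

index = faiss.read_index("knn.index")  # hypothetical local path to the downloaded index
if isinstance(index, faiss.IndexHNSWFlat):
    # Larger efSearch -> wider graph traversal at query time: better recall, slower search.
    faiss.ParameterSpace().set_index_parameter(index, "efSearch", 4096)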

freeda/models/freeda_model.py ADDED
@@ -0,0 +1,519 @@
import torch
import open_clip
import faiss
import os
import numpy as np
from math import sqrt, ceil
from PIL import Image
from torchvision.transforms import Compose, Resize
from torchvision.transforms.functional import pil_to_tensor
from torch.nn.functional import interpolate
from freeda.models.vision_backbone import VisionBackbone

class FreeDA(torch.nn.Module):
    def __init__(self, config: dict,
                 lazy_init: bool = True,
                 collection_in_gpu: bool = False,
                 collection_path: str = None,
                 index_path: str = None,
                 use_cached_embeddings: bool = True,
                 cache_embeddings: bool = True,
                 embeddings_cache_path: str = None,
                 device: str = 'cuda',
                 verbose: bool = False,
                 max_masks_batch: int = 128):
        """
        Initialize the model.
        Args:
            config (dict): The configuration of the model.
            lazy_init (bool): Whether to lazily load the collection.
            collection_in_gpu (bool): Whether to load the faiss retrieval collection in GPU.
            collection_path (str): The path to the collection.
            index_path (str): The path to the index.
            use_cached_embeddings (bool): Whether to use the cached embeddings for the required model.
            cache_embeddings (bool): Whether to cache the embeddings.
            embeddings_cache_path (str): The path to the embeddings cache.
            device (str): The device to use.
            verbose (bool): Whether to print the progress.
            max_masks_batch (int): The maximum number of masks to process at once.
        """
        super(FreeDA, self).__init__()

        self.clip_model_name = config['clip']['model']
        self.clip_weights = config['clip']['weights']
        self.backbone = VisionBackbone(config['backbone'], device)
        self.collection_path = collection_path
        self.index_path = index_path
        self.collection_in_gpu = collection_in_gpu
        self.lazy_init = lazy_init
        self.use_cached_embeddings = use_cached_embeddings
        self.cache_embeddings = cache_embeddings
        self.embeddings_cache_path = embeddings_cache_path
        self.device = device
        self.verbose = verbose
        self.config = config
        self.max_masks_batch = max_masks_batch

        self.collection_initialized = False

        self.ens = self.config['freeda']['global_local_ensemble']
        if "ef_search" in self.config['freeda']:
            self.ef_search = self.config['freeda']['ef_search']

        self.use_mask_proposer = config['mask_proposer']['use_mask_proposals']
        if self.use_mask_proposer:
            if config['mask_proposer']['method'] == 'superpixel':
                from freeda.models.mask_proposer.superpixel import SuperpixelMaskProposer
                self.mask_proposer = SuperpixelMaskProposer(config['mask_proposer']['args'])

        self.templates = config['clip']['templates']
        self.sliding_window = config['freeda']['sliding_window']
        self.with_background = config['freeda']['with_background']
        if self.with_background:
            self.background_threshold = config['freeda']['background_threshold']

        self.initialized = False
        if not lazy_init:
            self.init_collection()
            self.init_clip()
            self.initialized = True

    def init_collection(self):
        """
        Initialize the collection by reading the faiss index and the list of embeddings. Moves the index to GPU if collection_in_gpu is True.
        """
        self.collection_index = faiss.read_index(self.index_path + "faiss_index/knn.index")
        if str(type(self.collection_index)) == "<class 'faiss.swigfaiss.IndexHNSWFlat'>" \
                or str(type(self.collection_index)) == "<class 'faiss.swigfaiss_avx2.IndexHNSWFlat'>":
            print(f"Setting faiss efSearch to {self.ef_search}")
            faiss.ParameterSpace().set_index_parameter(self.collection_index, 'efSearch', self.ef_search)
        self.collection_embeddings = sorted(os.listdir(self.collection_path))
        if self.collection_in_gpu:
            resources = [faiss.StandardGpuResources()]
            self.collection_index = faiss.index_cpu_to_gpu_multiple_py(resources, self.collection_index)

    @torch.no_grad()
    def init_clip(self):
        """
        Initialize the CLIP model and tokenizer.
        """
        self.clip_model, _, self.clip_image_preprocess = open_clip.create_model_and_transforms(self.clip_model_name,
                                                                                                pretrained=self.clip_weights,
                                                                                                device=self.device)
        self.clip_resize_dim = self.clip_image_preprocess.transforms[0].size
        self.clip_image_preprocess = Compose([
            Resize(
                (self.clip_image_preprocess.transforms[0].size, self.clip_image_preprocess.transforms[0].size),
                interpolation=self.clip_image_preprocess.transforms[0].interpolation, antialias=None),
            lambda x: x / 255,
            self.clip_image_preprocess.transforms[4]
        ])
        self.clip_model.eval()
        self.clip_tokenizer = open_clip.get_tokenizer(self.clip_model_name)

    @torch.no_grad()
    def set_categories(self, categories):
        """
        Set the categories to be used by the model. If use_cached_embeddings is True, it will try to load the embeddings from the cache.
        Otherwise, it will compute the embeddings and cache them if cache_embeddings is True.
        Args:
            categories (list): The list of textual arbitrary categories.
        Return:
            output_prototypes (torch.Tensor): The prototype embeddings. [num_categories, vis_emb_dim]
            output_text_embeddings (torch.Tensor): The text embeddings. [num_categories, txt_emb_dim]
        """
        self.categories = categories
        self.num_categories = len(self.categories) + 1 if self.with_background else len(self.categories)
        num_categories = len(categories)
        already_cached_categories = {}
        not_cached_categories = {}
        if self.use_cached_embeddings:
            for i, category in enumerate(self.categories):
                if os.path.exists(f"{self.embeddings_cache_path}/visual/{category}.npy"):
                    already_cached_categories[i] = category
                else:
                    not_cached_categories[i] = category
        else:
            not_cached_categories = {i: category for i, category in enumerate(self.categories)}
        if len(not_cached_categories.keys()) != 0:
            if self.lazy_init and not self.initialized:
                self.init_clip()
                self.init_collection()
                self.initialized = True
            num_templates = len(self.templates)
            text = [template.format(category) for category in [v for k, v in not_cached_categories.items()] for template in self.templates]
            tokens = self.clip_tokenizer(text).to(self.device)
            context_length = tokens.shape[-1]
            tokens = tokens.reshape(-1, context_length)
            text_embeddings = self.clip_model.encode_text(tokens)
            text_emb_dim = text_embeddings.shape[-1]
            text_embeddings = text_embeddings.reshape(-1, num_templates, text_emb_dim)
            text_embeddings = text_embeddings.mean(dim=1)
            text_embeddings = text_embeddings / (text_embeddings.norm(dim=-1, keepdim=True) + 1e-6)
            _, indices = self.collection_index.search(text_embeddings.cpu().numpy(), self.config['freeda']['k_search'])
            prototypes = []
            for c in indices:
                category_retrieved_embeddings = []
                for k in c:
                    retrieved_embedding = torch.from_numpy(np.load(f"{self.collection_path}/{self.collection_embeddings[k]}"))
                    if len(retrieved_embedding.shape) == 1:
                        retrieved_embedding = retrieved_embedding.unsqueeze(0)
                    category_retrieved_embeddings.append(retrieved_embedding)
                category_retrieved_embeddings = torch.cat(category_retrieved_embeddings, dim=0).to(self.device)
                prototypes.append(category_retrieved_embeddings)
            prototypes = torch.stack(prototypes, dim=0).mean(dim=1)
            prototypes = prototypes / (prototypes.norm(dim=-1, keepdim=True) + 1e-6)
            prototypes_emb_dim = prototypes.shape[-1]

            if self.cache_embeddings:
                os.makedirs(f"{self.embeddings_cache_path}/visual", exist_ok=True)
                os.makedirs(f"{self.embeddings_cache_path}/textual", exist_ok=True)
                for j, (i, category) in enumerate(not_cached_categories.items()):
                    print(f"Caching embeddings for {category}")
                    np.save(f"{self.embeddings_cache_path}/visual/{category}.npy", prototypes[j].cpu().numpy())
                    np.save(f"{self.embeddings_cache_path}/textual/{category}.npy", text_embeddings[j].cpu().numpy())
            output_prototypes = torch.zeros(num_categories, prototypes_emb_dim, device=self.device)
            output_text_embeddings = torch.zeros(num_categories, text_emb_dim, device=self.device)
            for i, category in already_cached_categories.items():
                if self.verbose:
                    print(f"Loading cached embeddings for {category}")
                output_prototypes[i] = torch.tensor(np.load(f"{self.embeddings_cache_path}/visual/{category}.npy"), device=self.device)
                output_text_embeddings[i] = torch.tensor(np.load(f"{self.embeddings_cache_path}/textual/{category}.npy"), device=self.device)
            for j, (i, category) in enumerate(not_cached_categories.items()):  # align fresh prototypes (index j) with their category slot (index i)
                output_prototypes[i] = prototypes[j]
                output_text_embeddings[i] = text_embeddings[j]
        else:
            loaded_prototypes_embeddings = [None for _ in range(num_categories)]
            loaded_text_embeddings = [None for _ in range(num_categories)]
            for i, category in already_cached_categories.items():
                if self.verbose:
                    print(f"Loading cached embeddings for {category}")
                loaded_prototypes_embeddings[i] = torch.tensor(np.load(f"{self.embeddings_cache_path}/visual/{category}.npy"), device=self.device)
                loaded_text_embeddings[i] = torch.tensor(np.load(f"{self.embeddings_cache_path}/textual/{category}.npy"), device=self.device)
            output_prototypes = torch.stack(loaded_prototypes_embeddings, dim=0).to(self.device)
            output_text_embeddings = torch.stack(loaded_text_embeddings, dim=0).to(self.device)

        self.prototype_embeddings = output_prototypes.clone()
        self.text_embeddings = output_text_embeddings.clone()
        return output_prototypes, output_text_embeddings

    @torch.no_grad()
    def set_images(self, images):
        """
        Set the images to be used by the model and extract their backbone features, CLIP features and mask proposals.
        If sliding_window is True, it will split the images into windows.
        It also returns the features and the masks if the caller wants to use them for other purposes.
        Args:
            images (list): The list of PIL images.
        Return:
            backbone_features (list): The list of backbone features. [num_images, num_patches_h, num_patches_w, vis_emb_dim]
            clip_features (torch.Tensor): The visual features from the CLIP model. [num_images, txt_emb_dim]
            masks (list): The list of mask proposals. (num_images) x (pred_masks, n_pred_masks, covered_pixels, assigned_masks)
        """
        self.images = images
        self.original_sizes = [image.size for image in images]
        if self.sliding_window:
            new_images = []
            self.window_boxes = []
            for image in images:
                image_windows = self.get_window_boxes(image)
                self.window_boxes.append(image_windows)
                new_images.extend([image.crop(window_box) for window_box in image_windows])
            self.images_pre_sliding = self.images
            self.images = new_images
        backbone_features, clip_features = self.set_backbone_features(self.images)
        masks = self.set_mask_proposals(self.images)
        return backbone_features, clip_features, masks

    def get_window_boxes(self, image):
        """
        Get the window boxes for the sliding window approach.
        Args:
            image (PIL.Image): The image to split into windows.
        Return:
            window_boxes (list): The list of window boxes coordinates. (x1, y1, x2, y2)
        """
        short_side = min(image.size)
        long_side = max(image.size)
        aspect_ratio = long_side / short_side
        num_windows = ceil(aspect_ratio)
        window_shift = (long_side - short_side) / (num_windows - 1) if num_windows > 1 else 0
        window_boxes = []
        current_shift = 0
        for j in range(num_windows):
            if short_side == image.size[0]:
                window_boxes.append((0, round(current_shift), short_side, round(current_shift + short_side)))
            else:
                window_boxes.append((round(current_shift), 0, round(current_shift + short_side), short_side))
            current_shift += window_shift
        return window_boxes

    @torch.no_grad()
    def get_clip_visual_features(self, images):
        """
        Get the normalized visual features from the CLIP model.
        Args:
            images (list or PIL.Image): The list of PIL images or a single PIL image.
        Return:
            clip_features (torch.Tensor): The visual features from the CLIP model. [num_images, vis_emb_dim]
        """
        if type(images) == list:
            if len(images) == 0:
                raise ValueError("Images list is empty")
            images = [pil_to_tensor(image) for image in images]
            images = [self.clip_image_preprocess(image.unsqueeze(0)).squeeze(0) for image in images]
            images = torch.stack(images, dim=0).to(self.device)
        else:
            images = self.clip_image_preprocess(images)
        clip_features = self.clip_model.encode_image(images)
        clip_features = clip_features / (clip_features.norm(dim=-1, keepdim=True) + 1e-6)
        return clip_features

    @torch.no_grad()
    def set_backbone_features(self, images):
        """
        Set the backbone features and the CLIP features of the images.
        If the model is lazy or global similarities are required, it will initialize the CLIP model.
        Args:
            images (list): The list of PIL images.
        Return:
            backbone_features (list): The list of backbone features. [num_images, num_patches_h, num_patches_w, vis_emb_dim]
            clip_features (torch.Tensor): The visual features from the CLIP model. [num_images, vis_emb_dim]
        """
        self.backbone_features = self.backbone(images)
        if self.ens != 1.0:
            if self.lazy_init and not self.initialized:
                self.init_clip()

            self.clip_features = self.get_clip_visual_features(images)
        return self.backbone_features, self.clip_features

    @torch.no_grad()
    def set_mask_proposals(self, images):
        """
        Set the mask proposals of the images.
        Args:
            images (list): The list of PIL images.
        Return:
            masks (list): The list of mask proposals. (num_images) x (pred_masks, n_pred_masks, covered_pixels, assigned_masks)
            Each mask proposal is composed of the predicted binary masks, the number of predicted masks, the pixels covered by masks and the assigned masks for each pixel.
            pred_masks: [n_pred_masks, h, w] (bool)
            n_pred_masks: int
            covered_pixels: [h, w] (bool)
            assigned_masks: [h, w] (int)
        """
        self.masks = [self.mask_proposer(image, self.device) for image in images]  # List of tuples (pred_masks, n_pred_masks, covered_pixels, assigned_masks)
        return self.masks

    @torch.no_grad()
    def forward(self):
        """
        Forward pass of the model.
        Return:
            masks (list): The list of output segmentation masks. [num_images, h, w] (int)
        """
        patch_similarities = self.compute_patch_similarities(self.backbone_features, self.prototype_embeddings, self.masks)
        region_embeddings_batch = self.region_pooling(self.backbone_features, self.masks)
        region_similarities = self.compute_region_similarities(region_embeddings_batch, self.prototype_embeddings, self.masks)
        global_similarities = self.compute_global_similarities(self.clip_features, self.text_embeddings)
        similarities = self.compute_final_similarities(region_similarities, patch_similarities, global_similarities, self.masks)
        if self.sliding_window:
            similarities = self.merge_sliding_windows(similarities)
            self.images = self.images_pre_sliding
        masks = [similarity.argmax(dim=0) for similarity in similarities]
        return masks

    @torch.no_grad()
    def merge_sliding_windows(self, similarities):
        """
        Merge the similarities of the sliding windows.
        Args:
            similarities (list): The list of similarities of the sliding windows. [num_windows, num_categories, h, w]
        Return:
            new_similarities (list): The list of merged similarities. [num_images, num_categories, h, w]
        """
        counter = 0
        new_similarities = []
        for original_size, window_boxes in zip(self.original_sizes, self.window_boxes):
            new_similarity = torch.zeros(self.num_categories, original_size[1], original_size[0], device=self.device)
            window_overlaps = torch.zeros(original_size[1], original_size[0], device=self.device)
            for window_box in window_boxes:
                new_similarity[:, window_box[1]:window_box[3], window_box[0]:window_box[2]] += similarities[counter]
                window_overlaps[window_box[1]:window_box[3], window_box[0]:window_box[2]] += 1
                counter += 1
            new_similarity = new_similarity / window_overlaps
            new_similarities.append(new_similarity)
        return new_similarities

    @torch.no_grad()
    def region_pooling(self, features, masks):
        """
        Perform region pooling to get the region embeddings from the patch-level embeddings and the mask proposals.
        Args:
            features (torch.Tensor): The backbone features. [num_images, num_patches_h, num_patches_w, vis_emb_dim]
            masks (list): The list of mask proposals. (num_images) x (pred_masks, n_pred_masks, covered_pixels, assigned_masks)
        Return:
            region_embeddings_batch (list): The list of normalized region embeddings. [num_images, num_regions, vis_emb_dim]
        """
        region_embeddings_batch = []
        for i in range(len(masks)):
            pred_masks = interpolate(masks[i][0].unsqueeze(1).float(), size=(features.shape[1], features.shape[2]), mode='bilinear', align_corners=True).type(torch.bool).squeeze(1)
            region_embeddings = torch.zeros(pred_masks.shape[0], features.shape[-1], device=self.device)
            for j in range(0, pred_masks.shape[0], self.max_masks_batch):
                r = min(j + self.max_masks_batch, masks[i][1])
                current_region_embeddings = pred_masks[j:r].unsqueeze(-1) * features[i].unsqueeze(0)
                region_embeddings[j:r] = current_region_embeddings.sum(dim=(1, 2)) / pred_masks[j:r].sum(dim=(1, 2)).unsqueeze(-1)
            region_embeddings = region_embeddings / (region_embeddings.norm(dim=-1, keepdim=True) + 1e-6)
            self.masks[i][2][self.masks[i][0][region_embeddings.isnan().sum(-1).bool()].sum(0).bool()] = False  # Replace pixels covered by too small regions
            region_embeddings_batch.append(region_embeddings)
        return region_embeddings_batch

    @torch.no_grad()
    def compute_region_similarities(self, region_embeddings_batch, prototype_embeddings, masks):
        """
        Compute the region local similarities between the region embeddings and the visual prototypes.
        Args:
            region_embeddings_batch (list): The list of region embeddings. [num_images, num_regions, vis_emb_dim]
            prototype_embeddings (torch.Tensor): The prototype embeddings. [num_categories, vis_emb_dim]
            masks (list): The list of mask proposals. (num_images) x (pred_masks, n_pred_masks, covered_pixels, assigned_masks)
        Return:
            similarities (list): The list of region similarities. [num_images, num_categories, h, w]
        """
        similarities = []
        for i in range(len(region_embeddings_batch)):
            output_similarities = torch.zeros(len(self.categories), masks[i][0].shape[1], masks[i][0].shape[2], device=self.device)
            region_embeddings = region_embeddings_batch[i]
            region_similarities = torch.matmul(region_embeddings, prototype_embeddings.T)
            output_similarities[:, masks[i][2]] = region_similarities[masks[i][3]][masks[i][2]].permute(1, 0)
            similarities.append(torch.sigmoid(output_similarities))
        return similarities

    @torch.no_grad()
    def compute_patch_similarities(self, patch_embeddings, prototype_embeddings, masks):
        """
        Compute the per-patch local similarities between the patch-level embeddings and the visual prototypes.
        Args:
            patch_embeddings (torch.Tensor): The patch-level embeddings. [num_images, num_patches_h, num_patches_w, vis_emb_dim]
            prototype_embeddings (torch.Tensor): The prototype embeddings. [num_categories, vis_emb_dim]
            masks (list): The list of mask proposals.
        Return:
            output_similarities (list): The list of patch similarities. [num_images, num_categories, h, w]
        """
        patch_embeddings = patch_embeddings / (patch_embeddings.norm(dim=-1, keepdim=True) + 1e-6)
        similarities = torch.matmul(patch_embeddings, prototype_embeddings.T)
        similarities = torch.sigmoid(similarities)
        image_sizes = [mask[0].shape[1:] for mask in masks]
        output_similarities = []
        for i, image_size in enumerate(image_sizes):
            output_similarities.append(interpolate(similarities[i].permute(2, 0, 1).unsqueeze(1), size=image_size, mode='bilinear', align_corners=True).squeeze(1))
        return output_similarities

    @torch.no_grad()
    def compute_global_similarities(self, clip_features, text_embeddings):
        """
        Compute the global similarities between the CLIP visual features and the text embeddings.
        Args:
            clip_features (torch.Tensor): The visual features from the CLIP model. [num_images, vis_emb_dim]
            text_embeddings (torch.Tensor): The text embeddings. [num_categories, txt_emb_dim]
        Return:
            similarities (list): The list of global similarities. [num_images, num_categories]
        """
        similarities = torch.matmul(clip_features, text_embeddings.T)
        return similarities

    @torch.no_grad()
    def compute_final_similarities(self, region_similarities, patch_similarities, global_similarities, masks):
        """
        Compute the final similarities by combining the region and patch local similarities with the textual global similarities.
        Args:
            region_similarities (list): The list of region local similarities. [num_images, num_categories, h, w]
            patch_similarities (list): The list of per-patch interpolated local similarities. [num_images, num_categories, h, w]
            global_similarities (list): The list of textual global similarities. [num_images, num_categories]
            masks (list): The list of mask proposals. (num_images) x (pred_masks, n_pred_masks, covered_pixels, assigned_masks)
        Return:
            new_similarities (list): The list of final similarities. [num_images, num_categories, h, w]
        """
        new_similarities = self.replace_covered_pixel_similarities(masks, patch_similarities, region_similarities)
        new_similarities = [self.ens * new_similarities[i] + (1 - self.ens) * global_similarities[i].reshape(-1, 1, 1) for i in range(len(new_similarities))]
        if self.with_background:
            new_similarities = self.add_backgrounds(new_similarities)
        return new_similarities

    @torch.no_grad()
    def add_backgrounds(self, similarities):
        """
        Add the background class to the similarities by thresholding the maximum similarity of each pixel.
        Args:
            similarities (list): The list of final similarities. [num_images, num_categories, h, w]
        Return:
            similarities (list): The list of final similarities with the background class. [num_images, num_categories + 1, h, w]
        """
        for i in range(len(similarities)):
            background = (similarities[i].max(dim=0).values < self.background_threshold).float().unsqueeze(0)
            similarities[i] = torch.cat([background, similarities[i]], dim=0)
        return similarities

    @torch.no_grad()
    def replace_covered_pixel_similarities(self, masks, patch_similarities, region_similarities):
        """
        Replace the similarities of the covered pixels by the region similarities.
        Args:
            masks (list): The list of mask proposals. (num_images) x (pred_masks, n_pred_masks, covered_pixels, assigned_masks)
            patch_similarities (list): The list of per-patch interpolated local similarities. [num_images, num_categories, h, w]
            region_similarities (list): The list of region local similarities. [num_images, num_categories, h, w]
        Return:
            output_similarities (list): The list of final similarities. [num_images, num_categories, h, w]
        """
        output_similarities = []
        for i in range(len(masks)):
            tmp_patch_similarities = patch_similarities[i].permute(1, 2, 0)
            tmp_new_similarities = region_similarities[i].permute(1, 2, 0)
            tmp_patch_similarities[masks[i][2]] = tmp_new_similarities[masks[i][2]]
            output_similarities.append(tmp_patch_similarities.permute(2, 0, 1))
        return output_similarities

    @torch.no_grad()
    def visualize(self, segmentation_masks, output_paths, legend=True):
        """
        Visualize the segmentation masks saving the plots to the provided output paths.
        Args:
            segmentation_masks (list): The list of segmentation masks. [num_images, h, w] (int)
            output_paths (list): The list of output paths. [num_images]
            legend (bool): Whether to add a legend to the plot.
        """
        if len(segmentation_masks) != len(self.images):
            raise ValueError("Number of segmentation masks and images must be the same")
        if len(segmentation_masks) != len(output_paths):
            raise ValueError("Number of segmentation masks and output paths must be the same")
        from skimage.segmentation import find_boundaries
        import random
        import matplotlib.pyplot as plt
        from matplotlib.lines import Line2D
        for i in range(len(segmentation_masks)):
            mask = segmentation_masks[i]
            image = self.images[i]
            h, w = mask.shape
            colored_image = torch.zeros((h, w, 3), dtype=torch.int)
            random_colors = []
            new_categories = ["background"] + self.categories if self.with_background else self.categories
            for index in range(len(new_categories)):
                rand_mask = torch.ones(h, w, 3, dtype=torch.int)
                rand_mask[:, :, -1] = (mask == index) * 255
                random_rgb = [0, 0, 0] if self.with_background and index == 0 else [random.randint(64, 255), random.randint(64, 255), random.randint(64, 255)]
                for j in range(3):
                    rand_mask[:, :, j] = torch.ones(h, w, dtype=torch.int) * random_rgb[j] * (mask == index).cpu().int()
                colored_image[(mask == index).cpu()] = rand_mask[(mask == index).cpu()]
                random_colors.append([channel / 255 for channel in random_rgb])
            plt.imshow(image)
            boundaries = find_boundaries(mask.cpu().numpy().astype(np.uint8))
            boundaries_image = np.zeros((h, w, 4), dtype=np.uint8)
            boundaries_image[:, :, 3] = boundaries * 255
            plt.imshow(colored_image.numpy(), alpha=0.5)
            plt.imshow(boundaries_image)
            plt.axis('off')
            plt.tight_layout()
            if legend:
                legend_elements = [Line2D([0], [0], color=random_colors[i], lw=4, label=new_categories[i]) for i in range(len(random_colors))]
                plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(1, 1))
            plt.savefig(output_paths[i], bbox_inches='tight')
            plt.close()
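
The core of the forward pass is the fusion in compute_final_similarities: local (region/patch) similarities and the global CLIP image-text similarity are blended as ens * local + (1 - ens) * global, with global_local_ensemble = 0.8 in the configs above. A self-contained sketch with dummy tensors (shapes only, not real features):

import torch

num_categories, h, w = 3, 4, 4
ens = 0.8                                   # config['freeda']['global_local_ensemble']
local = torch.rand(num_categories, h, w)    # per-pixel similarities after sigmoid, in [0, 1]
global_sim = torch.rand(num_categories)     # CLIP image-text cosine similarities

# Broadcast the per-image global score over all pixels, as in compute_final_similarities.
fused = ens * local + (1 - ens) * global_sim.reshape(-1, 1, 1)

# Optional background class, as in add_backgrounds: pixels whose best score falls
# below the threshold are assigned to a prepended background channel.
background_threshold = 0.48
background = (fused.max(dim=0).values < background_threshold).float().unsqueeze(0)
fused_with_bg = torch.cat([background, fused], dim=0)

segmentation = fused_with_bg.argmax(dim=0)  # [h, w] integer mask, 0 = background
print(segmentation.shape)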

freeda/models/mask_proposer/superpixel.py ADDED
@@ -0,0 +1,66 @@
import cv2
import torch
import numpy as np
from torchvision.transforms.functional import pil_to_tensor
from skimage.color import rgb2gray
from skimage.filters import sobel
from skimage.measure import regionprops
from skimage.segmentation import felzenszwalb, slic, quickshift, watershed

class SuperpixelMaskProposer:
    def __init__(self, config):
        self.config = config
        self.superpixel_method = config['algorithm']
        self.superpixel_conf = config
        self.superpixel_conf.pop('algorithm')
        if self.superpixel_method == 'seeds':
            self.num_iterations = self.superpixel_conf.pop("num_iterations")

    def __call__(self, image, device):
        if type(image) != torch.Tensor:
            image = pil_to_tensor(image)
        pred_masks = []
        n_pred_masks = []
        assigned_masks = []
        covered_pixels = torch.ones(image.shape[1], image.shape[2]).type(torch.bool).to(device)
        if self.superpixel_method == "seeds":
            image = image.permute(1, 2, 0).cpu().numpy()
            image = np.ascontiguousarray(image.astype(np.uint8))
        else:
            image = image.permute(1, 2, 0).cpu().numpy() / 255

        if self.superpixel_method == "felzenszwalb":
            superpixel_mask = felzenszwalb(image, **self.superpixel_conf)

        elif self.superpixel_method == "slic":
            superpixel_mask = slic(image, **self.superpixel_conf)

        elif self.superpixel_method == "quickshift":
            superpixel_mask = quickshift(image, **self.superpixel_conf)

        elif self.superpixel_method == "watershed":
            gradient = sobel(rgb2gray(image))
            superpixel_mask = watershed(gradient, **self.superpixel_conf)

        elif self.superpixel_method == "seeds":
            superpix_seeds = cv2.ximgproc.createSuperpixelSEEDS(**self.superpixel_conf)
            superpix_seeds.iterate(image, self.num_iterations)
            superpixel_mask = superpix_seeds.getLabels()
            num_superpixels = superpix_seeds.getNumberOfSuperpixels()
        else:
            raise NotImplementedError(f"Superpixel algorithm {self.superpixel_method} not implemented.")

        if self.superpixel_method == "seeds":
            superpixel_mask_binary = np.array([superpixel_mask == i for i in np.arange(num_superpixels)])
        else:
            superpixel_mask_binary = np.array([superpixel_mask == i for i in np.unique(superpixel_mask)])
        num_superpixel = superpixel_mask_binary.shape[0]

        pred_masks = torch.from_numpy(superpixel_mask_binary).type(torch.bool).to(device)
        n_pred_masks = num_superpixel
        assigned_masks = torch.from_numpy(superpixel_mask).type(torch.long).to(device)

        if self.superpixel_method == "watershed":
            assigned_masks = assigned_masks - 1

        return pred_masks, n_pred_masks, covered_pixels, assigned_masks
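
A minimal standalone sketch of what the proposer returns for the felzenszwalb settings used in the configs (random image, CPU device, assuming the package above is importable; the shapes are the point here):

import numpy as np
from PIL import Image
from freeda.models.mask_proposer.superpixel import SuperpixelMaskProposer

proposer = SuperpixelMaskProposer({"algorithm": "felzenszwalb", "scale": 100, "sigma": 1.0, "min_size": 100})
image = Image.fromarray(np.random.randint(0, 255, (240, 320, 3), dtype=np.uint8))

pred_masks, n_pred_masks, covered_pixels, assigned_masks = proposer(image, device="cpu")
print(pred_masks.shape)      # [n_pred_masks, 240, 320], boolean superpixel masks
print(n_pred_masks)          # number of superpixels found
print(covered_pixels.shape)  # [240, 320], True where some mask covers the pixel
print(assigned_masks.shape)  # [240, 320], index of the superpixel owning each pixel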

freeda/models/vision_backbone.py ADDED
@@ -0,0 +1,54 @@
import torch
import timm
from math import sqrt
from torchvision.transforms import Compose, Resize
from torchvision.transforms.functional import pil_to_tensor

class VisionBackbone(torch.nn.Module):
    def __init__(self,
                 config: dict,
                 device: str = 'cuda',
                 max_batch_size: int = 16):
        super(VisionBackbone, self).__init__()
        self.backbone_name = config['model']
        self.device = device
        self.model = timm.create_model(
            config['model'],
            pretrained=True,
            img_size=config['img_size'],
        ).to(self.device).eval()

        data_config = timm.data.resolve_model_data_config(config['model'])
        self.transform = timm.data.create_transform(**data_config, is_training=False)

        self.transform = Compose([
            Resize((config['img_size'], config['img_size']), antialias=None),
            lambda x: x / 255,
            self.transform.transforms[-1]
        ])

        self.max_batch_size = max_batch_size

    @torch.no_grad()
    def forward(self, images):
        if type(images) == list:
            if len(images) == 0:
                raise ValueError("Images list is empty")
            images = [pil_to_tensor(image) for image in images]
            images = [self.transform(image.unsqueeze(0)).squeeze(0) for image in images]
            images = torch.stack(images, dim=0)
        else:
            if images.shape[1] != 3:
                images = images.permute(0, 3, 1, 2)
            images = self.transform(images)
        batch_size = images.shape[0]
        if batch_size < self.max_batch_size:
            features = self.model.forward_features(images.to(self.device))
        else:
            features = []
            for i in range(0, batch_size, self.max_batch_size):
                r = min(i + self.max_batch_size, batch_size)
                features.append(self.model.forward_features(images[i:r].to(self.device)))
            features = torch.cat(features, dim=0)
        num_tokens_side = int(sqrt(features.shape[1] - 1))
        return features[:, 1::, :].reshape(batch_size, num_tokens_side, num_tokens_side, features.shape[-1])
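
The forward pass drops the class token and folds the remaining tokens back into a spatial grid; with img_size 518 and patch size 14 that grid is 37x37. A self-contained shape check with a dummy tensor in place of the timm output (768 is the ViT-B width; ViT-L would be 1024):

import torch
from math import sqrt

# forward_features output for a 518x518 input with patch size 14: 1 class token + 37*37 patches.
features = torch.randn(2, 1 + 37 * 37, 768)        # dummy stand-in for the timm output
batch_size = features.shape[0]
num_tokens_side = int(sqrt(features.shape[1] - 1))  # 37
grid = features[:, 1:, :].reshape(batch_size, num_tokens_side, num_tokens_side, features.shape[-1])
print(grid.shape)                                   # torch.Size([2, 37, 37, 768])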

freeda/utils/factory.py ADDED
@@ -0,0 +1,111 @@
import os
import yaml
import gdown
import subprocess
import tarfile
import zipfile
from freeda.models.freeda_model import FreeDA

def load(model_name: str,
         lazy_init: bool = False,
         collection_in_gpu: bool = False,
         use_cache: bool = True,
         force_cache_download: bool = False,
         collection_path: str = None,
         index_path: str = None,
         use_cached_embeddings: bool = True,
         cache_embeddings: bool = True,
         embeddings_cache_path: str = None,
         verbose: bool = True,
         device: str = 'cuda',
         custom_configs_path: str = None):
    """
    Load the model and its configuration.
    Args:
        model_name (str): The name of the model to load.
        lazy_init (bool): Whether to lazily load the collection.
        collection_in_gpu (bool): Whether to load the collection in GPU.
        use_cache (bool): Whether to use the cache. If False, required data will be downloaded.
        force_cache_download (bool): Whether to force downloading and storing in cache.
        collection_path (str): The path to the collection.
        index_path (str): The path to the index.
        use_cached_embeddings (bool): Whether to use the cached embeddings for the required model.
        cache_embeddings (bool): Whether to cache the embeddings.
        embeddings_cache_path (str): The path to the embeddings cache.
        verbose (bool): Whether to print the progress.
        device (str): The device to use.
        custom_configs_path (str): The path to a directory with custom configurations. If None, uses the default configs.
    Return:
        FreeDA: The FreeDA model.
    """
    if custom_configs_path is None:
        current_path = os.path.dirname(os.path.abspath(__file__))
        configs_path = os.path.join(current_path, '../configs')
    else:
        configs_path = custom_configs_path
    if f"{model_name}.yaml" not in os.listdir(configs_path):
        raise ValueError(f"Model {model_name} not available")
    with open(f"{configs_path}/{model_name}.yaml", 'r') as file:
        config = yaml.load(file, Loader=yaml.FullLoader)

    cache_root = os.path.expanduser(f"~/.cache/freeda/{model_name}/")

    new_collection_path = f"{cache_root}prototype_embeddings/" if collection_path is None else collection_path
    os.makedirs(new_collection_path, exist_ok=True)

    new_index_path = f"{cache_root}index/" if index_path is None else index_path
    os.makedirs(new_index_path, exist_ok=True)

    if use_cache:
        use_cached_collection = True if len(os.listdir(new_collection_path)) == config['data']['collection_length'] and not force_cache_download else False
        use_cached_index = True if len(os.listdir(new_index_path)) != 0 and not force_cache_download else False
    else:
        use_cached_collection = False
        use_cached_index = False

    if not use_cached_collection:
        if verbose:
            print("Downloading collection...")
        output_collection_tar = f"{cache_root}prototype_embeddings.tar"
        gdown.download(config['data']['collection_url'], output_collection_tar, quiet=not verbose)
        if verbose:
            print("Extracting compressed collection... (it may take a while)")
        if config['data']['compression'] == 'zip':
            with tarfile.open(output_collection_tar, 'r:gz') as tar:
                tar.extractall(new_collection_path)
        elif config['data']['compression'] == 'tar':
            with tarfile.open(output_collection_tar, 'r') as tar:
                tar.extractall(new_collection_path)
        else:
            raise ValueError("Invalid compression format")
    else:
        if verbose:
            print("Using cached collection...")

    if not use_cached_index:
        if verbose:
            print("Downloading index...")
        output_index_zip = f"{cache_root}faiss_index.zip"
        gdown.download(config['data']['index_url'], output_index_zip, quiet=not verbose)
        with zipfile.ZipFile(output_index_zip, 'r') as zip_ref:
            zip_ref.extractall(new_index_path)
    else:
        if verbose:
            print("Using cached index...")

    if embeddings_cache_path is None and use_cached_embeddings:
        embeddings_cache_path = f"{cache_root}embeddings/"
    elif embeddings_cache_path is not None and use_cached_embeddings:
        embeddings_cache_path = os.path.expanduser(embeddings_cache_path)

    return FreeDA(config,
                  lazy_init=lazy_init,
                  collection_in_gpu=collection_in_gpu,
                  collection_path=new_collection_path,
                  index_path=new_index_path,
                  device=device,
                  use_cached_embeddings=use_cached_embeddings,
                  cache_embeddings=cache_embeddings,
                  embeddings_cache_path=embeddings_cache_path,
                  verbose=verbose)
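
Everything load downloads lands under ~/.cache/freeda/<model_name>/ unless collection_path / index_path are overridden. A usage sketch built only from the signature above (the paths are illustrative placeholders):

import freeda

# Lazy init: the faiss index and CLIP weights are only read once categories are first set.
model = freeda.load(
    "dinov2_vitl_clip_vitl_approx",
    lazy_init=True,
    collection_path="/data/freeda/prototype_embeddings/",  # illustrative custom locations
    index_path="/data/freeda/index/",
    embeddings_cache_path="~/.cache/freeda/my_embeddings/",
    verbose=True,
)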

main.py ADDED
@@ -0,0 +1,15 @@
import freeda
from PIL import Image
import requests
from io import BytesIO

if __name__ == "__main__":
    fr = freeda.load("dinov2_vitb_clip_vitb")
    response1 = requests.get("https://farm9.staticflickr.com/8306/7926031760_b313dca06a_z.jpg")
    img1 = Image.open(BytesIO(response1.content))
    response2 = requests.get("https://farm3.staticflickr.com/2207/2157810040_4883738d2d_z.jpg")
    img2 = Image.open(BytesIO(response2.content))
    fr.set_categories(["cat", "table", "pen", "keyboard", "toilet", "wall"])
    fr.set_images([img1, img2])
    segmentation = fr()
    fr.visualize(segmentation, ["plot.png", "plot1.png"])

requirements.txt ADDED
@@ -0,0 +1,9 @@
faiss==1.8.0
gdown==5.2.0
matplotlib==3.9.2
opencv_python==4.10.0.84
Pillow==10.4.0
PyYAML==6.0.2
Requests==2.32.3
scikit-image
timm==1.0.9