osimeoni committed
Commit 25cae60
1 Parent(s): 907a760

FOUND - second
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
__init__.py ADDED
@@ -0,0 +1,3 @@
+ import sys
+ from os.path import dirname, join
+ sys.path.insert(0, join(dirname(__file__), '.'))
app.py CHANGED
@@ -1,16 +1,94 @@
1
  import gradio as gr
2
 
3
  title = 'FOUND'
4
  description = 'Gradio Demo accompanying paper "Unsupervised Object Localization: Observing the Background to Discover Objects"\n \
5
The app is running CPU-only, inference times are therefore longer.\n'
6
- article = """<h1 align="center">[FOUND] Unsupervised Object Localization: Observing the Background to Discover Objects</h1>
 
7
  """
 
 
8
 
9
- def greet(name):
10
- return "Hello " + name + "!!"
11
 
12
- iface = gr.Interface(fn=greet, title=title, description=description,
13
- article=article, inputs="text", outputs="text")
14
- iface.launch()
15
 
16
 
 
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import matplotlib.pyplot as plt
7
+
8
+ from PIL import Image
9
+ from model import FoundModel
10
+ from misc import load_config
11
+ from torchvision import transforms as T
12
+
13
+
14
  import gradio as gr
15
+
16
+ NORMALIZE = T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
17
+ CACHE = True
18
+
19
+ def blend_images(bg, fg, alpha=0.5):
20
+ fg = fg.convert('RGBA')
21
+ bg = bg.convert('RGBA')
22
+ blended = Image.blend(bg, fg, alpha=alpha)
23
+
24
+ return blended
25
+
26
+
27
+ def predict(img_input):
28
+
29
+ config = "configs/found_DUTS-TR.yaml"
30
+ model_weights = "data/weights/decoder_weights.pt"
31
+
32
+ # Configuration
33
+ config = load_config(config)
34
+
35
+ # ------------------------------------
36
+ # Load the model
37
+ model = FoundModel(vit_model=config.model["pre_training"],
38
+ vit_arch=config.model["arch"],
39
+ vit_patch_size=config.model["patch_size"],
40
+ enc_type_feats=config.found["feats"],
41
+ bkg_type_feats=config.found["feats"],
42
+ bkg_th=config.found["bkg_th"])
43
+ # Load weights
44
+ model.decoder_load_weights(model_weights)
45
+ model.eval()
46
+ print(f"Model {model_weights} loaded correctly.")
47
+
48
+ # Load the image
49
+ img_pil = Image.open(img_input)
50
+ img = img_pil.convert("RGB")
51
+
52
+ t = T.Compose([T.ToTensor(), NORMALIZE])
53
+ img_t = t(img)[None,:,:,:]
54
+ inputs = img_t.to("cuda")
55
+
56
+ # Forward step
57
+ with torch.no_grad():
58
+ preds, _, _, _ = model.forward_step(inputs, for_eval=True)
59
+
60
+ # Apply FOUND
61
+ sigmoid = nn.Sigmoid()
62
+ h, w = img_t.shape[-2:]
63
+ preds_up = F.interpolate(
64
+ preds, scale_factor=model.vit_patch_size, mode="bilinear", align_corners=False
65
+ )[..., :h, :w]
66
+ preds_up = (
67
+ (sigmoid(preds_up.detach()) > 0.5).squeeze(0).float()
68
+ )
69
+
70
+ return blend_images(img_pil, preds_up)
71
+
72
 
73
  title = 'FOUND'
74
  description = 'Gradio Demo accompanying paper "Unsupervised Object Localization: Observing the Background to Discover Objects"\n \
75
The app is running CPU-only, inference times are therefore longer.\n'
76
+ article = """<h2 align="center">Unsupervised Object Localization: Observing the Background to Discover Objects </h2>
77
+ <h1 align="center"> FOUND </h1>
78
  """
79
+ examples = ["data/examples/VOC_000030.jpg"]
80
+
81
 
82
+ iface = gr.Interface(fn=predict,
83
+ title=title,
84
+ description=description,
85
+ article=article,
86
+ inputs=gr.Image(type='filepath'),
87
+ outputs=gr.Image(label="Object localization", type="pil"),
88
+ examples=examples,
89
+ cache_examples=CACHE
90
+ )
91
 
92
+ iface.launch(show_error=True, enable_queue=True, inline=True)
 
 
93
 
94
 
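For reference, a minimal standalone sketch of the same inference path as the new predict() function, kept on the CPU to match the "CPU-only" note in the description (app.py as committed still moves inputs to "cuda"). File paths are the ones added in this commit; model.FoundModel and misc.load_config are taken on trust from app.py since they are not part of this diff.

# Hypothetical smoke test, not part of the commit: mirrors predict() but keeps tensors on CPU.
import torch
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms as T
from model import FoundModel
from misc import load_config

config = load_config("configs/found_DUTS-TR.yaml")
model = FoundModel(vit_model=config.model["pre_training"],
                   vit_arch=config.model["arch"],
                   vit_patch_size=config.model["patch_size"],
                   enc_type_feats=config.found["feats"],
                   bkg_type_feats=config.found["feats"],
                   bkg_th=config.found["bkg_th"])
model.decoder_load_weights("data/weights/decoder_weights.pt")
model.eval()

img = Image.open("data/examples/VOC_000030.jpg").convert("RGB")
normalize = T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
inputs = T.Compose([T.ToTensor(), normalize])(img)[None]          # no .to("cuda") here

with torch.no_grad():
    preds, _, _, _ = model.forward_step(inputs, for_eval=True)

h, w = inputs.shape[-2:]
preds_up = F.interpolate(preds, scale_factor=model.vit_patch_size,
                         mode="bilinear", align_corners=False)[..., :h, :w]
mask = (torch.sigmoid(preds_up) > 0.5).squeeze(0).float()         # binary object mask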
bilateral_solver.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ Code adapted from TokenCut: https://github.com/YangtaoWANG95/TokenCut
3
+ """
4
+
5
+ import PIL.Image as Image
6
+ import numpy as np
7
+ from scipy import ndimage
8
+ from scipy.sparse import diags, csr_matrix
9
+ from scipy.sparse.linalg import cg
10
+
11
+ RGB_TO_YUV = np.array(
12
+ [[0.299, 0.587, 0.114], [-0.168736, -0.331264, 0.5], [0.5, -0.418688, -0.081312]]
13
+ )
14
+ YUV_TO_RGB = np.array([[1.0, 0.0, 1.402], [1.0, -0.34414, -0.71414], [1.0, 1.772, 0.0]])
15
+ YUV_OFFSET = np.array([0, 128.0, 128.0]).reshape(1, 1, -1)
16
+ MAX_VAL = 255.0
17
+
18
+
19
+ def rgb2yuv(im):
20
+ return np.tensordot(im, RGB_TO_YUV, ([2], [1])) + YUV_OFFSET
21
+
22
+
23
+ def yuv2rgb(im):
24
+ return np.tensordot(im.astype(float) - YUV_OFFSET, YUV_TO_RGB, ([2], [1]))
25
+
26
+
27
+ def get_valid_idx(valid, candidates):
28
+ """Find which values are present in a list and where they are located"""
29
+ locs = np.searchsorted(valid, candidates)
30
+ # Handle edge case where the candidate is larger than all valid values
31
+ locs = np.clip(locs, 0, len(valid) - 1)
32
+ # Identify which values are actually present
33
+ valid_idx = np.flatnonzero(valid[locs] == candidates)
34
+ locs = locs[valid_idx]
35
+ return valid_idx, locs
36
+
37
+
38
+ class BilateralGrid(object):
39
+ def __init__(self, im, sigma_spatial=32, sigma_luma=8, sigma_chroma=8):
40
+ im_yuv = rgb2yuv(im)
41
+ # Compute 5-dimensional XYLUV bilateral-space coordinates
42
+ Iy, Ix = np.mgrid[: im.shape[0], : im.shape[1]]
43
+ x_coords = (Ix / sigma_spatial).astype(int)
44
+ y_coords = (Iy / sigma_spatial).astype(int)
45
+ luma_coords = (im_yuv[..., 0] / sigma_luma).astype(int)
46
+ chroma_coords = (im_yuv[..., 1:] / sigma_chroma).astype(int)
47
+ coords = np.dstack((x_coords, y_coords, luma_coords, chroma_coords))
48
+ coords_flat = coords.reshape(-1, coords.shape[-1])
49
+ self.npixels, self.dim = coords_flat.shape
50
+ # Hacky "hash vector" for coordinates,
51
+ # Requires all scaled coordinates to be < MAX_VAL
52
+ self.hash_vec = MAX_VAL ** np.arange(self.dim)
53
+ # Construct S and B matrix
54
+ self._compute_factorization(coords_flat)
55
+
56
+ def _compute_factorization(self, coords_flat):
57
+ # Hash each coordinate in grid to a unique value
58
+ hashed_coords = self._hash_coords(coords_flat)
59
+ unique_hashes, unique_idx, idx = np.unique(
60
+ hashed_coords, return_index=True, return_inverse=True
61
+ )
62
+ # Identify unique set of vertices
63
+ unique_coords = coords_flat[unique_idx]
64
+ self.nvertices = len(unique_coords)
65
+ # Construct sparse splat matrix that maps from pixels to vertices
66
+ self.S = csr_matrix((np.ones(self.npixels), (idx, np.arange(self.npixels))))
67
+ # Construct sparse blur matrices.
68
+ # Note that these represent [1 0 1] blurs, excluding the central element
69
+ self.blurs = []
70
+ for d in range(self.dim):
71
+ blur = 0.0
72
+ for offset in (-1, 1):
73
+ offset_vec = np.zeros((1, self.dim))
74
+ offset_vec[:, d] = offset
75
+ neighbor_hash = self._hash_coords(unique_coords + offset_vec)
76
+ valid_coord, idx = get_valid_idx(unique_hashes, neighbor_hash)
77
+ blur = blur + csr_matrix(
78
+ (np.ones((len(valid_coord),)), (valid_coord, idx)),
79
+ shape=(self.nvertices, self.nvertices),
80
+ )
81
+ self.blurs.append(blur)
82
+
83
+ def _hash_coords(self, coord):
84
+ """Hacky function to turn a coordinate into a unique value"""
85
+ return np.dot(coord.reshape(-1, self.dim), self.hash_vec)
86
+
87
+ def splat(self, x):
88
+ return self.S.dot(x)
89
+
90
+ def slice(self, y):
91
+ return self.S.T.dot(y)
92
+
93
+ def blur(self, x):
94
+ """Blur a bilateral-space vector with a 1 2 1 kernel in each dimension"""
95
+ assert x.shape[0] == self.nvertices
96
+ out = 2 * self.dim * x
97
+ for blur in self.blurs:
98
+ out = out + blur.dot(x)
99
+ return out
100
+
101
+ def filter(self, x):
102
+ """Apply bilateral filter to an input x"""
103
+ return self.slice(self.blur(self.splat(x))) / self.slice(
104
+ self.blur(self.splat(np.ones_like(x)))
105
+ )
106
+
107
+
108
+ def bistochastize(grid, maxiter=10):
109
+ """Compute diagonal matrices to bistochastize a bilateral grid"""
110
+ m = grid.splat(np.ones(grid.npixels))
111
+ n = np.ones(grid.nvertices)
112
+ for i in range(maxiter):
113
+ n = np.sqrt(n * m / grid.blur(n))
114
+ # Correct m to satisfy the assumption of bistochastization regardless
115
+ # of how many iterations have been run.
116
+ m = n * grid.blur(n)
117
+ Dm = diags(m, 0)
118
+ Dn = diags(n, 0)
119
+ return Dn, Dm
120
+
121
+
122
+ class BilateralSolver(object):
123
+ def __init__(self, grid, params):
124
+ self.grid = grid
125
+ self.params = params
126
+ self.Dn, self.Dm = bistochastize(grid)
127
+
128
+ def solve(self, x, w):
129
+ # Check that w is a vector or a nx1 matrix
130
+ if w.ndim == 2:
131
+ assert w.shape[1] == 1
132
+ elif w.dim == 1:
133
+ w = w.reshape(w.shape[0], 1)
134
+ A_smooth = self.Dm - self.Dn.dot(self.grid.blur(self.Dn))
135
+ w_splat = self.grid.splat(w)
136
+ A_data = diags(w_splat[:, 0], 0)
137
+ A = self.params["lam"] * A_smooth + A_data
138
+ xw = x * w
139
+ b = self.grid.splat(xw)
140
+ # Use simple Jacobi preconditioner
141
+ A_diag = np.maximum(A.diagonal(), self.params["A_diag_min"])
142
+ M = diags(1 / A_diag, 0)
143
+ # Flat initialization
144
+ y0 = self.grid.splat(xw) / w_splat
145
+ yhat = np.empty_like(y0)
146
+ for d in range(x.shape[-1]):
147
+ yhat[..., d], info = cg(
148
+ A,
149
+ b[..., d],
150
+ x0=y0[..., d],
151
+ M=M,
152
+ maxiter=self.params["cg_maxiter"],
153
+ tol=self.params["cg_tol"],
154
+ )
155
+ xhat = self.grid.slice(yhat)
156
+ return xhat
157
+
158
+
159
+ def bilateral_solver_output(
160
+ img_pth,
161
+ target,
162
+ img=None,
163
+ sigma_spatial=24,
164
+ sigma_luma=4,
165
+ sigma_chroma=4,
166
+ get_all_cc=False
167
+ ):
168
+ if img is None:
169
+ reference = np.array(Image.open(img_pth).convert("RGB"))
170
+ else:
171
+ reference = np.array(img)
172
+
173
+ h, w = target.shape
174
+ confidence = np.ones((h, w)) * 0.999
175
+
176
+ grid_params = {
177
+ "sigma_luma": sigma_luma, # Brightness bandwidth
178
+ "sigma_chroma": sigma_chroma, # Color bandwidth
179
+ "sigma_spatial": sigma_spatial, # Spatial bandwidth
180
+ }
181
+
182
+ bs_params = {
183
+ "lam": 256, # The strength of the smoothness parameter
184
+ "A_diag_min": 1e-5, # Clamp the diagonal of the A diagonal in the Jacobi preconditioner.
185
+ "cg_tol": 1e-5, # The tolerance on the convergence in PCG
186
+ "cg_maxiter": 25, # The number of PCG iterations
187
+ }
188
+
189
+ grid = BilateralGrid(reference, **grid_params)
190
+
191
+ t = target.reshape(-1, 1).astype(np.double)
192
+ c = confidence.reshape(-1, 1).astype(np.double)
193
+
194
+ ## output solver, which is a soft value
195
+ output_solver = BilateralSolver(grid, bs_params).solve(t, c).reshape((h, w))
196
+
197
+ binary_solver = ndimage.binary_fill_holes(output_solver > 0.5)
198
+ labeled, nr_objects = ndimage.label(binary_solver)
199
+
200
+ nb_pixel = [np.sum(labeled == i) for i in range(nr_objects + 1)]
201
+ pixel_order = np.argsort(nb_pixel)
202
+
203
+ if get_all_cc:
204
+ # Remove known background
205
+ pixel_descending_order = pixel_order[::-1]
206
+ # Get all CCs except the biggest one, which may be considered background; adjust here if needed
207
+ binary_solver = (labeled[None,:,:] == pixel_descending_order[1:,None,None]).astype(int).sum(0)
208
+ else:
209
+ try:
210
+ binary_solver = labeled == pixel_order[-2]
211
+ except:
212
+ binary_solver = np.ones((h, w), dtype=bool)
213
+
214
+ return output_solver, binary_solver
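A short usage sketch of bilateral_solver_output (hypothetical, not part of the commit): it refines a coarse (H, W) mask against its source image and returns both a soft and a binarized result.

# Hypothetical sketch: refine a coarse mask with the bilateral solver above.
# The rectangle below is only a placeholder for a real coarse prediction.
import numpy as np
from PIL import Image
from bilateral_solver import bilateral_solver_output

img_path = "data/examples/VOC_000030.jpg"
w, h = Image.open(img_path).size
coarse = np.zeros((h, w), dtype=float)
coarse[h // 4: 3 * h // 4, w // 4: 3 * w // 4] = 1.0

soft, binary = bilateral_solver_output(img_path, coarse)
# soft:   edge-aware float refinement of `coarse`
# binary: the connected component kept as foreground (the largest region is treated as background)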
bkg_seg.py ADDED
@@ -0,0 +1,84 @@
1
+ # Copyright 2022 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+
18
+ from typing import Tuple
19
+
20
+ def compute_img_bkg_seg(
21
+ attentions,
22
+ feats,
23
+ featmap_dims,
24
+ th_bkg,
25
+ dim=64,
26
+ epsilon: float = 1e-10,
27
+ apply_weights: bool = True,
28
+ ) -> Tuple[torch.Tensor, float]:
29
+ """
30
+ inputs
31
+ - attentions [B, ]
32
+ """
33
+
34
+ w_featmap, h_featmap = featmap_dims
35
+
36
+ nb, nh, _ = attentions.shape[:3]
37
+ # we keep only the output patch attention
38
+ att = attentions[:, :, 0, 1:].reshape(nb, nh, -1)
39
+ att = att.reshape(nb, nh, w_featmap, h_featmap)
40
+
41
+ # -----------------------------------------------
42
+ # Inspired by the CroW sparsity-based channel weighting of each head (CroW, Kalantidis et al.)
43
+ threshold = torch.mean(att.reshape(nb, -1), dim=1) # Find threshold per image
44
+ Q = torch.sum(
45
+ att.reshape(nb, nh, w_featmap * h_featmap) > threshold[:, None, None], axis=2
46
+ ) / (w_featmap * h_featmap)
47
+ beta = torch.log(torch.sum(Q + epsilon, dim=1)[:, None] / (Q + epsilon))
48
+
49
+ # Weight features based on attention sparsity
50
+ descs = feats[:,1:,]
51
+ if apply_weights:
52
+ descs = (descs.reshape(nb, -1, nh, dim) * beta[:, None, :, None]).reshape(
53
+ nb, -1, nh * dim
54
+ )
55
+ else:
56
+ descs = (descs.reshape(nb, -1, nh, dim)).reshape(
57
+ nb, -1, nh * dim
58
+ )
59
+
60
+ # -----------------------------------------------
61
+ # Compute cosine-similarities
62
+ descs = F.normalize(descs, dim=-1, p=2)
63
+ cos_sim = torch.bmm(descs, descs.permute(0, 2, 1))
64
+
65
+ # -----------------------------------------------
66
+ # Find pixel with least amount of attention
67
+ if apply_weights:
68
+ att = att.reshape(nb, nh, w_featmap, h_featmap) * beta[:, :, None, None]
69
+ else:
70
+ att = att.reshape(nb, nh, w_featmap, h_featmap)
71
+ id_pixel_ref = torch.argmin(torch.sum(att, axis=1).reshape(nb, -1), dim=-1)
72
+
73
+ # -----------------------------------------------
74
+ # Mask of definitely background pixels: 1 on the background
75
+ cos_sim = cos_sim.reshape(nb, -1, w_featmap * h_featmap)
76
+
77
+ bkg_mask = (
78
+ cos_sim[torch.arange(cos_sim.size(0)), id_pixel_ref, :].reshape(
79
+ nb, w_featmap, h_featmap
80
+ )
81
+ > th_bkg
82
+ ) # mask to be used to remove background
83
+
84
+ return bkg_mask.float()
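As a shape reference, a hypothetical sketch of calling compute_img_bkg_seg on DINO ViT-S/8 style attentions and features for a 224x224 input; the shapes are inferred from the indexing above, not stated in this commit.

# Hypothetical shape sketch, not part of the commit.
import torch
from bkg_seg import compute_img_bkg_seg

B, heads, dim = 2, 6, 64          # ViT-S: 6 heads x 64 dims per head
wf = hf = 224 // 8                # feature-map side for patch size 8
N = wf * hf
attentions = torch.rand(B, heads, N + 1, N + 1)   # last-block self-attention (CLS + patch tokens)
feats = torch.rand(B, N + 1, heads * dim)         # patch features, CLS token first

bkg_mask = compute_img_bkg_seg(attentions, feats, (wf, hf), th_bkg=0.3, dim=dim)
print(bkg_mask.shape)             # torch.Size([2, 28, 28]); 1.0 marks background patches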
configs/found_DUTS-TR.yaml ADDED
@@ -0,0 +1,34 @@
+ model:
+   arch: vit_small
+   patch_size: 8
+   pre_training: dino
+
+ found:
+   bkg_th: 0.3
+   feats: k
+
+ training:
+   dataset: DUTS-TR
+   dataset_set: null
+
+   # Hyper params
+   seed: 0
+   max_iter: 500
+   nb_epochs: 3
+   batch_size: 50
+   lr0: 5e-2
+   step_lr_size: 50
+   step_lr_gamma: 0.95
+   w_bs_loss: 1.5
+   stop_bkg_loss: 100
+
+   # Augmentations
+   crop_size: 224
+   scale_range: [0.1, 3.0]
+   photometric_aug: gaussian_blur
+   proba_photometric_aug: 0.5
+   cropping_strategy: random_scale
+
+ evaluation:
+   type: saliency # uod, retrieval
+   datasets: [DUT-OMRON, ECSSD]
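In app.py the file above is read through misc.load_config, which is not part of this diff; as a sketch, the same keys can be inspected with plain PyYAML.

# Hypothetical sketch, not part of the commit.
import yaml

with open("configs/found_DUTS-TR.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["arch"], cfg["model"]["patch_size"])   # vit_small 8
print(cfg["found"]["bkg_th"], cfg["found"]["feats"])      # 0.3 k
print(cfg["training"]["crop_size"])                       # 224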
data/examples/VOC_000030.jpg ADDED
data/weights/decoder_weights.pt ADDED
Binary file (2.69 kB).
 
datasets/VOC.py ADDED
@@ -0,0 +1,80 @@
1
+ import os
2
+ from typing import Optional, Tuple, Union, Dict, List
3
+
4
+ import cv2
5
+ from pycocotools.coco import COCO
6
+ import numpy as np
7
+ import torch
8
+ import torchvision
9
+ from PIL import Image, PngImagePlugin
10
+ from torch.utils.data import Dataset
11
+ from torchvision import transforms as T
12
+ from torchvision.transforms import ColorJitter, RandomApply, RandomGrayscale
13
+ from tqdm import tqdm
14
+
15
+ VOCDetectionMetadataType = Dict[str, Dict[str, Union[str, Dict[str, str], List[str]]]]
16
+
17
+ def get_voc_detection_gt(
18
+ metadata: VOCDetectionMetadataType, remove_hards: bool = False
19
+ ) -> Tuple[np.array, List[str]]:
20
+ objects = metadata["annotation"]["object"]
21
+ nb_obj = len(objects)
22
+
23
+ gt_bbxs = []
24
+ gt_clss = []
25
+ for object in range(nb_obj):
26
+ if remove_hards and (
27
+ objects[object]["truncated"] == "1"
28
+ or objects[object]["difficult"] == "1"
29
+ ):
30
+ continue
31
+
32
+ gt_cls = objects[object]["name"]
33
+ gt_clss.append(gt_cls)
34
+ obj = objects[object]["bndbox"]
35
+ x1y1x2y2 = [
36
+ int(obj["xmin"]),
37
+ int(obj["ymin"]),
38
+ int(obj["xmax"]),
39
+ int(obj["ymax"]),
40
+ ]
41
+
42
+ # Original annotations are integers in the range [1, W or H]
43
+ # Assuming they mean 1-based pixel indices (inclusive),
44
+ # a box with annotation (xmin=1, xmax=W) covers the whole image.
45
+ # In coordinate space this is represented by (xmin=0, xmax=W)
46
+ x1y1x2y2[0] -= 1
47
+ x1y1x2y2[1] -= 1
48
+ gt_bbxs.append(x1y1x2y2)
49
+
50
+ return np.asarray(gt_bbxs), gt_clss
51
+
52
+ def create_gt_masks_if_voc(labels: PngImagePlugin.PngImageFile) -> Image.Image:
53
+ mask = np.array(labels)
54
+ mask_gt = (mask > 0).astype(float)
55
+ mask_gt = np.where(mask_gt != 0.0, 255, mask_gt)
56
+ mask_gt = Image.fromarray(np.uint8(mask_gt))
57
+ return mask_gt
58
+
59
+ def create_VOC_loader(img_dir, dataset_set, evaluation_type):
60
+ year = img_dir[-4:]
61
+ download = not os.path.exists(img_dir)
62
+ if evaluation_type == "uod":
63
+ loader = torchvision.datasets.VOCDetection(
64
+ img_dir,
65
+ year=year,
66
+ image_set=dataset_set,
67
+ transform=None,
68
+ download=download,
69
+ )
70
+ elif evaluation_type == "saliency":
71
+ loader = torchvision.datasets.VOCSegmentation(
72
+ img_dir,
73
+ year=year,
74
+ image_set=dataset_set,
75
+ transform=None,
76
+ download=download,
77
+ )
78
+ else:
79
+ raise ValueError(f"Not implemented for {evaluation_type}.")
80
+ return loader
datasets/__init__.py ADDED
File without changes
datasets/augmentations.py ADDED
@@ -0,0 +1,68 @@
1
+ """
2
+ Code borrowed from SelfMask: https://github.com/NoelShin/selfmask
3
+ """
4
+
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+ from typing import Optional, Tuple, Union
9
+ from torchvision.transforms import ColorJitter, RandomApply, RandomGrayscale
10
+
11
+ from datasets.utils import GaussianBlur
12
+ from datasets.geometric_transforms import (
13
+ random_scale,
14
+ random_crop,
15
+ random_hflip,
16
+ )
17
+
18
+ def geometric_augmentations(
19
+ image: Image.Image,
20
+ random_scale_range: Optional[Tuple[float, float]] = None,
21
+ random_crop_size: Optional[int] = None,
22
+ random_hflip_p: Optional[float] = None,
23
+ mask: Optional[Union[Image.Image, np.ndarray, torch.Tensor]] = None,
24
+ ignore_index: Optional[int] = None,
25
+ ) -> Tuple[Image.Image, torch.Tensor]:
26
+ """Note. image and mask are assumed to be of base size, thus share a spatial shape."""
27
+ if random_scale_range is not None:
28
+ image, mask = random_scale(
29
+ image=image, random_scale_range=random_scale_range, mask=mask
30
+ )
31
+
32
+ if random_crop_size is not None:
33
+ crop_size = (random_crop_size, random_crop_size)
34
+ fill = tuple(np.array(image).mean(axis=(0, 1)).astype(np.uint8).tolist())
35
+ image, offset = random_crop(image=image, crop_size=crop_size, fill=fill)
36
+
37
+ if mask is not None:
38
+ assert ignore_index is not None
39
+ mask = random_crop(
40
+ image=mask, crop_size=crop_size, fill=ignore_index, offset=offset
41
+ )[0]
42
+
43
+ if random_hflip_p is not None:
44
+ image, mask = random_hflip(image=image, p=random_hflip_p, mask=mask)
45
+ return image, mask
46
+
47
+ def photometric_augmentations(
48
+ image: Image.Image,
49
+ random_color_jitter: bool,
50
+ random_grayscale: bool,
51
+ random_gaussian_blur: bool,
52
+ proba_photometric_aug: float,
53
+ ) -> torch.Tensor:
54
+ if random_color_jitter:
55
+ color_jitter = ColorJitter(
56
+ brightness=0.8, contrast=0.8, saturation=0.8, hue=0.2
57
+ )
58
+ image = RandomApply([color_jitter], p=proba_photometric_aug)(image)
59
+
60
+ if random_grayscale:
61
+ image = RandomGrayscale(proba_photometric_aug)(image)
62
+
63
+ if random_gaussian_blur:
64
+ w, h = image.size
65
+ image = GaussianBlur(kernel_size=int((0.1 * min(w, h) // 2 * 2) + 1))(
66
+ image, proba_photometric_aug
67
+ )
68
+ return image
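A hypothetical sketch of chaining these helpers the way the DUTS-TR config uses them (random_scale in [0.1, 3.0], Gaussian blur with probability 0.5, crop size 224). datasets.utils.GaussianBlur is assumed to exist elsewhere in the repository, since it is imported above but not part of this diff.

# Hypothetical sketch, not part of the commit.
import numpy as np
from PIL import Image
from datasets.augmentations import geometric_augmentations, photometric_augmentations
from datasets.geometric_transforms import resize

image = Image.open("data/examples/VOC_000030.jpg").convert("RGB")
mask = Image.fromarray(np.zeros(image.size[::-1], dtype=np.uint8))   # placeholder mask

image, mask = geometric_augmentations(image, random_scale_range=(0.1, 3.0), mask=mask)
image = resize(image, size=224, interpolation="bilinear")            # back to a fixed size
image = photometric_augmentations(image,
                                  random_color_jitter=False,
                                  random_grayscale=False,
                                  random_gaussian_blur=True,
                                  proba_photometric_aug=0.5)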
datasets/datasets.py ADDED
@@ -0,0 +1,409 @@
1
+ """
2
+ Dataset functions for applying Normalized Cut.
3
+ Code adapted from SelfMask: https://github.com/NoelShin/selfmask
4
+ """
5
+
6
+ import os
7
+ from typing import Optional, Tuple, Union
8
+
9
+ from pycocotools.coco import COCO
10
+ import numpy as np
11
+ import torch
12
+ import torchvision
13
+ from PIL import Image
14
+ from torch.utils.data import Dataset
15
+ from torchvision import transforms as T
16
+
17
+ from datasets.utils import unnormalize
18
+ from datasets.geometric_transforms import resize
19
+ from datasets.VOC import get_voc_detection_gt, create_gt_masks_if_voc, create_VOC_loader
20
+ from datasets.augmentations import geometric_augmentations, photometric_augmentations
21
+
22
+ from datasets.uod_datasets import UODDataset
23
+
24
+ NORMALIZE = T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
25
+
26
+ def set_dataset_dir(dataset_name, root_dir):
27
+ if dataset_name == "ECSSD":
28
+ dataset_dir = os.path.join(root_dir, "ECSSD")
29
+ img_dir = os.path.join(dataset_dir, "images")
30
+ gt_dir = os.path.join(dataset_dir, "ground_truth_mask")
31
+
32
+ elif dataset_name == "DUTS-TEST":
33
+ dataset_dir = os.path.join(root_dir, "DUTS")
34
+ img_dir = os.path.join(dataset_dir, "DUTS-TE-Image")
35
+ gt_dir = os.path.join(dataset_dir, "DUTS-TE-Mask")
36
+
37
+ elif dataset_name == "DUTS-TR":
38
+ dataset_dir = os.path.join(root_dir, "DUTS")
39
+ img_dir = os.path.join(dataset_dir, "DUTS-TR-Image")
40
+ gt_dir = os.path.join(dataset_dir, "DUTS-TR-Mask")
41
+
42
+ elif dataset_name == "DUT-OMRON":
43
+ dataset_dir = os.path.join(root_dir, "DUT-OMRON")
44
+ img_dir = os.path.join(dataset_dir, "DUT-OMRON-image")
45
+ gt_dir = os.path.join(dataset_dir, "pixelwiseGT-new-PNG")
46
+
47
+ elif dataset_name == "VOC07":
48
+ dataset_dir = os.path.join(root_dir, "VOC2007")
49
+ img_dir = dataset_dir
50
+ gt_dir = dataset_dir
51
+
52
+ elif dataset_name == "VOC12":
53
+ dataset_dir = os.path.join('/datasets_local/osimeoni', "VOC2012")
54
+ img_dir = dataset_dir
55
+ gt_dir = dataset_dir
56
+
57
+ elif dataset_name == "COCO17":
58
+ dataset_dir = os.path.join(root_dir, "COCO")
59
+ img_dir = dataset_dir
60
+ gt_dir = dataset_dir
61
+
62
+ elif dataset_name == "ImageNet":
63
+ dataset_dir = os.path.join(root_dir, "ImageNet")
64
+ img_dir = dataset_dir
65
+ gt_dir = dataset_dir
66
+
67
+ else:
68
+ raise ValueError(f"Unknown dataset {dataset_name}")
69
+
70
+ return img_dir, gt_dir
71
+
72
+
73
+ def build_dataset(
74
+ root_dir: str,
75
+ dataset_name: str,
76
+ dataset_set: Optional[str] = None,
77
+ for_eval: bool = False,
78
+ config=None,
79
+ evaluation_type="saliency", # uod,
80
+ ):
81
+ """
82
+ Build dataset
83
+ """
84
+
85
+ if evaluation_type == "saliency":
86
+ img_dir, gt_dir = set_dataset_dir(dataset_name, root_dir)
87
+
88
+ dataset = FoundDataset(
89
+ name=dataset_name,
90
+ img_dir=img_dir,
91
+ gt_dir=gt_dir,
92
+ dataset_set=dataset_set,
93
+ config=config,
94
+ for_eval=for_eval,
95
+ evaluation_type=evaluation_type,
96
+ )
97
+
98
+ elif evaluation_type == "uod":
99
+ assert dataset_name in ["VOC07", "VOC12", "COCO20k"]
100
+ dataset_set = "trainval" if dataset_name in ["VOC07", "VOC12"] else "train"
101
+ no_hards = False
102
+ dataset = UODDataset(
103
+ dataset_name,
104
+ dataset_set,
105
+ root_dir=root_dir,
106
+ remove_hards=no_hards,
107
+ )
108
+
109
+ return dataset
110
+
111
+
112
+ class FoundDataset(Dataset):
113
+ def __init__(
114
+ self,
115
+ name: str,
116
+ img_dir: str,
117
+ gt_dir: str,
118
+ dataset_set: Optional[str] = None,
119
+ config=None,
120
+ for_eval:bool = False,
121
+ evaluation_type:str = "saliency",
122
+ ) -> None:
123
+ """
124
+ Args:
125
+ img_dir (string): Directory with all the images.
+ gt_dir (string): Directory with the ground-truth masks.
128
+ """
129
+ self.for_eval = for_eval
130
+ self.use_aug = not for_eval
131
+ self.evaluation_type = evaluation_type
132
+
133
+ assert evaluation_type in ["saliency"]
134
+
135
+ self.name = name
136
+ self.dataset_set = dataset_set
137
+ self.img_dir = img_dir
138
+ self.gt_dir = gt_dir
139
+
140
+ # if VOC dataset
141
+ self.loader = None
142
+ self.cocoGt = None
143
+
144
+ self.config = config
145
+
146
+ if "VOC" in self.name:
147
+ self.loader = create_VOC_loader(self.img_dir, dataset_set, evaluation_type)
148
+
149
+ # if ImageNet dataset
150
+ elif "ImageNet" in self.name:
151
+ self.loader = torchvision.datasets.ImageNet(
152
+ self.img_dir,
153
+ split=dataset_set,
154
+ transform=None,
155
+ target_transform=None,
156
+ )
157
+
158
+ elif "COCO" in self.name:
159
+ year = int("20"+self.name[-2:])
160
+ annFile=f'/datasets_local/COCO/annotations/instances_{dataset_set}{str(year)}.json'
161
+ self.cocoGt=COCO(annFile)
162
+ self.img_ids = list(sorted(self.cocoGt.getImgIds()))
163
+ self.img_dir = f'/datasets_local/COCO/images/{dataset_set}{str(year)}/'
164
+
165
+ # Transformations
166
+ if self.for_eval:
167
+ full_img_transform, no_norm_full_img_transform = self.get_init_transformation(
168
+ isVOC="VOC" in name
169
+ )
170
+ self.full_img_transform = full_img_transform
171
+ self.no_norm_full_img_transform = no_norm_full_img_transform
172
+
173
+ # Images
174
+ self.list_images = None
175
+ if not "VOC" in self.name and not "COCO" in self.name:
176
+ self.list_images = [
177
+ os.path.join(img_dir, i) for i in sorted(os.listdir(img_dir))
178
+ ]
179
+
180
+ self.ignore_index = -1
181
+ self.mean = NORMALIZE.mean
182
+ self.std = NORMALIZE.std
183
+ self.to_tensor_and_normalize = T.Compose([T.ToTensor(), NORMALIZE])
184
+ self.normalize = NORMALIZE
185
+
186
+ if config is not None and self.use_aug:
187
+ self._set_aug(config)
188
+
189
+
190
+ def get_init_transformation(self, isVOC: bool = False):
191
+ if isVOC:
192
+ t = T.Compose([T.PILToTensor(), T.ConvertImageDtype(torch.float), NORMALIZE])
193
+ t_nonorm = T.Compose([T.PILToTensor(), T.ConvertImageDtype(torch.float)])
194
+ return t, t_nonorm
195
+
196
+ else:
197
+ t = T.Compose([T.ToTensor(), NORMALIZE])
198
+ t_nonorm = T.Compose([T.ToTensor()])
199
+ return t, t_nonorm
200
+
201
+ def _set_aug(self, config):
202
+ """
203
+ Set augmentation based on config.
204
+ """
205
+
206
+ photometric_aug = config.training["photometric_aug"]
207
+
208
+ self.cropping_strategy = config.training["cropping_strategy"]
209
+ if self.cropping_strategy == "center_crop":
210
+ self.use_aug = False # default strategy, not considered to be a data aug
211
+ self.scale_range = config.training["scale_range"]
212
+ self.crop_size = config.training["crop_size"]
213
+ self.center_crop_transforms = T.Compose(
214
+ [
215
+ T.CenterCrop((self.crop_size, self.crop_size)),
216
+ T.ToTensor(),
217
+ ]
218
+ )
219
+ self.center_crop_only_transforms = T.Compose(
220
+ [T.CenterCrop((self.crop_size, self.crop_size)), T.PILToTensor()]
221
+ )
222
+
223
+ self.proba_photometric_aug = config.training["proba_photometric_aug"]
224
+
225
+ self.random_color_jitter = False
226
+ self.random_grayscale = False
227
+ self.random_gaussian_blur = False
228
+ if photometric_aug == "color_jitter":
229
+ self.random_color_jitter = True
230
+ elif photometric_aug == "grayscale":
231
+ self.random_grayscale = True
232
+ elif photometric_aug == "gaussian_blur":
233
+ self.random_gaussian_blur = True
234
+
235
+ def _preprocess_data_aug(
236
+ self,
237
+ image: Image.Image,
238
+ mask: Image.Image,
239
+ ignore_index: Optional[int] = None,
240
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
241
+ """Prepare data in a proper form for either training (data augmentation) or validation."""
242
+
243
+ # resize to base size
244
+ image = resize(
245
+ image,
246
+ size=self.crop_size,
247
+ edge="shorter",
248
+ interpolation="bilinear",
249
+ )
250
+ mask = resize(
251
+ mask,
252
+ size=self.crop_size,
253
+ edge="shorter",
254
+ interpolation="bilinear",
255
+ )
256
+
257
+ if not isinstance(mask, torch.Tensor):
258
+ mask: torch.Tensor = torch.tensor(np.array(mask))
259
+
260
+ random_scale_range = None
261
+ random_crop_size = None
262
+ random_hflip_p = None
263
+ if self.cropping_strategy == "random_scale":
264
+ random_scale_range = self.scale_range
265
+ elif self.cropping_strategy == "random_crop":
266
+ random_crop_size = self.crop_size
267
+ elif self.cropping_strategy == "random_hflip":
268
+ random_hflip_p = 0.5
269
+ elif self.cropping_strategy == "random_crop_and_hflip":
270
+ random_hflip_p = 0.5
271
+ random_crop_size = self.crop_size
272
+
273
+ if random_crop_size or random_hflip_p or random_scale_range:
274
+ image, mask = geometric_augmentations(
275
+ image=image,
276
+ mask=mask,
277
+ random_scale_range=random_scale_range,
278
+ random_crop_size=random_crop_size,
279
+ ignore_index=ignore_index,
280
+ random_hflip_p=random_hflip_p,
281
+ )
282
+
283
+ if random_scale_range:
284
+ # resize to (self.crop_size, self.crop_size)
285
+ image = resize(
286
+ image,
287
+ size=self.crop_size,
288
+ interpolation="bilinear",
289
+ )
290
+ mask = resize(
291
+ mask,
292
+ size=(self.crop_size, self.crop_size),
293
+ interpolation="bilinear",
294
+ )
295
+
296
+ image = photometric_augmentations(
297
+ image,
298
+ random_color_jitter=self.random_color_jitter,
299
+ random_grayscale=self.random_grayscale,
300
+ random_gaussian_blur=self.random_gaussian_blur,
301
+ proba_photometric_aug=self.proba_photometric_aug,
302
+ )
303
+
304
+ # to tensor + normalize image
305
+ image = self.to_tensor_and_normalize(image)
306
+
307
+ return image, mask
308
+
309
+ def __len__(self) -> int:
310
+ if "VOC" in self.name:
311
+ return len(self.loader)
312
+ elif "ImageNet" in self.name:
313
+ return len(self.loader)
314
+ elif "COCO" in self.name:
315
+ return len(self.img_ids)
316
+ return len(self.list_images)
317
+
318
+ def _apply_center_crop(
319
+ self, image: Image.Image, mask: Union[Image.Image, np.ndarray, torch.Tensor]
320
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
321
+ img_t = self.center_crop_transforms(image)
322
+ # need to normalize image
323
+ img_t = self.normalize(img_t)
324
+ mask_gt = self.center_crop_transforms(mask).squeeze()
325
+ return img_t, mask_gt
326
+
327
+
328
+ def __getitem__(self, idx, get_mask_gt=True):
329
+ if "VOC" in self.name:
330
+ img, gt_labels = self.loader[idx]
331
+ if self.evaluation_type == "uod":
332
+ gt_labels, _ = get_voc_detection_gt(
333
+ gt_labels, remove_hards=False
334
+ )
335
+ elif self.evaluation_type == "saliency":
336
+ mask_gt = create_gt_masks_if_voc(gt_labels)
337
+ img_path = self.loader.images[idx]
338
+
339
+ elif "ImageNet" in self.name:
340
+ img, _ = self.loader[idx]
341
+ img_path = self.loader.imgs[idx][0]
342
+ # empty mask since no gt mask, only class label
343
+ zeros = np.zeros(np.array(img).shape[:2])
344
+ mask_gt = Image.fromarray(zeros)
345
+
346
+ elif "COCO" in self.name:
347
+ img_id = self.img_ids[idx]
348
+
349
+ path = self.cocoGt.loadImgs(img_id)[0]["file_name"]
350
+ img = Image.open(os.path.join(self.img_dir, path)).convert("RGB")
351
+ _ = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(img_id))
352
+ img_path = self.img_ids[idx] # What matters most is the id for eval
353
+
354
+ # empty mask since no gt mask, only class label
355
+ zeros = np.zeros(np.array(img).shape[:2])
356
+ mask_gt = Image.fromarray(zeros)
357
+
358
+ # For all others
359
+ else:
360
+ img_path = self.list_images[idx]
361
+ with open(img_path, "rb") as f:
362
+ img = Image.open(f)
363
+ img = img.convert("RGB")
364
+ im_name = img_path.split("/")[-1]
365
+ mask_gt = Image.open(
366
+ os.path.join(self.gt_dir, im_name.replace(".jpg", ".png"))
367
+ ).convert("L")
368
+
369
+ if self.for_eval:
370
+ img_t = self.full_img_transform(img)
371
+ img_init = self.no_norm_full_img_transform(img)
372
+
373
+ if self.evaluation_type == "saliency":
374
+ mask_gt = torch.tensor(np.array(mask_gt)).squeeze()
375
+ mask_gt = np.array(mask_gt)
376
+ mask_gt = mask_gt == 255
377
+ mask_gt = torch.tensor(mask_gt)
378
+ else:
379
+ if self.use_aug:
380
+ img_t, mask_gt = self._preprocess_data_aug(
381
+ image=img, mask=mask_gt, ignore_index=self.ignore_index
382
+ )
383
+ mask_gt = np.array(mask_gt)
384
+ mask_gt = mask_gt == 255
385
+ mask_gt = torch.tensor(mask_gt)
386
+ else:
387
+ # no data aug
388
+ img_t, mask_gt = self._apply_center_crop(image=img, mask=mask_gt)
389
+ gt_labels = self.center_crop_only_transforms(gt_labels).squeeze()
390
+ mask_gt = np.asarray(mask_gt, np.int64)
391
+ mask_gt = mask_gt == 1
392
+ mask_gt = torch.tensor(mask_gt)
393
+
394
+ img_init = unnormalize(img_t)
395
+
396
+ if not get_mask_gt:
397
+ mask_gt = None
398
+
399
+ if self.evaluation_type == "uod":
400
+ gt_labels = torch.tensor(gt_labels)
401
+ mask_gt = gt_labels
402
+
403
+ return img_t, img_init, mask_gt, img_path
404
+
405
+ def fullimg_mode(self):
406
+ self.val_full_image = True
407
+
408
+ def training_mode(self):
409
+ self.val_full_image = False
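A hypothetical sketch of building the DUTS-TR training split this module is written for; the root_dir value and the on-disk layout (<root_dir>/DUTS/DUTS-TR-Image, ...) are assumptions matching set_dataset_dir above, and misc.load_config is taken from app.py's imports since it is not part of this diff.

# Hypothetical sketch, not part of the commit.
from misc import load_config
from datasets.datasets import build_dataset

config = load_config("configs/found_DUTS-TR.yaml")
train_set = build_dataset(root_dir="/path/to/datasets",   # assumption: see set_dataset_dir
                          dataset_name="DUTS-TR",
                          dataset_set=None,
                          for_eval=False,
                          config=config,
                          evaluation_type="saliency")

img_t, img_init, mask_gt, img_path = train_set[0]
# img_t: normalized, augmented crop; img_init: the same crop unnormalized;
# mask_gt: boolean GT mask; img_path: path of the source image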
datasets/geometric_transforms.py ADDED
@@ -0,0 +1,160 @@
1
+ """
2
+ Code adapted from SelfMask: https://github.com/NoelShin/selfmask
3
+ """
4
+
5
+ from random import randint, random, uniform
6
+ from typing import Optional, Tuple, Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torchvision.transforms.functional as TF
11
+ from PIL import Image
12
+ from torchvision.transforms.functional import InterpolationMode as IM
13
+
14
+
15
+ def random_crop(
16
+ image: Union[Image.Image, np.ndarray, torch.Tensor],
17
+ crop_size: Tuple[int, int], # (h, w)
18
+ fill: Union[int, Tuple[int, int, int]], # an unsigned integer or RGB,
19
+ offset: Optional[Tuple[int, int]] = None, # (top, left) coordinate of a crop
20
+ ):
21
+ assert type(crop_size) in (tuple, list) and len(crop_size) == 2
22
+
23
+ if isinstance(image, np.ndarray):
24
+ image = torch.tensor(image)
25
+ h, w = image.shape[-2:]
26
+ elif isinstance(image, Image.Image):
27
+ w, h = image.size
28
+ elif isinstance(image, torch.Tensor):
29
+ h, w = image.shape[-2:]
30
+ else:
31
+ raise TypeError(type(image))
32
+
33
+ pad_h, pad_w = max(crop_size[0] - h, 0), max(crop_size[1] - w, 0)
34
+
35
+ image = TF.pad(image, [0, 0, pad_w, pad_h], fill=fill, padding_mode="constant")
36
+
37
+ if isinstance(image, Image.Image):
38
+ w, h = image.size
39
+ else:
40
+ h, w = image.shape[-2:]
41
+
42
+ if offset is None:
43
+ offset = (randint(0, h - crop_size[0]), randint(0, w - crop_size[1]))
44
+
45
+ image = TF.crop(
46
+ image, top=offset[0], left=offset[1], height=crop_size[0], width=crop_size[1]
47
+ )
48
+ return image, offset
49
+
50
+
51
+ def compute_size(
52
+ input_size: Tuple[int, int], output_size: int, edge: str # h, w
53
+ ) -> Tuple[int, int]:
54
+ assert edge in ["shorter", "longer"]
55
+ h, w = input_size
56
+
57
+ if edge == "longer":
58
+ if w > h:
59
+ h = int(float(h) / w * output_size)
60
+ w = output_size
61
+ else:
62
+ w = int(float(w) / h * output_size)
63
+ h = output_size
64
+ assert w <= output_size and h <= output_size
65
+
66
+ else:
67
+ if w > h:
68
+ w = int(float(w) / h * output_size)
69
+ h = output_size
70
+ else:
71
+ h = int(float(h) / w * output_size)
72
+ w = output_size
73
+ assert w >= output_size and h >= output_size
74
+ return h, w
75
+
76
+
77
+ def resize(
78
+ image: Union[Image.Image, np.ndarray, torch.Tensor],
79
+ size: Union[int, Tuple[int, int]],
80
+ interpolation: str,
81
+ edge: str = "both",
82
+ ) -> Union[Image.Image, torch.Tensor]:
83
+ """
84
+ :param image: an image to be resized
85
+ :param size: a resulting image size
86
+ :param interpolation: sampling mode. ["nearest", "bilinear", "bicubic"]
87
+ :param edge: Default: "both"
88
+ No-op if a size is given as a tuple (h, w).
89
+ If set to "both", resize both height and width to the specified size.
90
+ If set to "shorter", resize the shorter edge to the specified size keeping the aspect ratio.
91
+ If set to "longer", resize the longer edge to the specified size keeping the aspect ratio.
92
+ :return: a resized image
93
+ """
94
+ assert interpolation in ["nearest", "bilinear", "bicubic"], ValueError(
95
+ interpolation
96
+ )
97
+ assert edge in ["both", "shorter", "longer"], ValueError(edge)
98
+ interpolation = {
99
+ "nearest": IM.NEAREST,
100
+ "bilinear": IM.BILINEAR,
101
+ "bicubic": IM.BICUBIC,
102
+ }[interpolation]
103
+
104
+ if type(image) == torch.Tensor:
105
+ image = image.clone().detach()
106
+ elif type(image) == np.ndarray:
107
+ image = torch.from_numpy(image)
108
+
109
+ if type(size) is tuple:
110
+ if type(image) == torch.Tensor and len(image.shape) == 2:
111
+ image = TF.resize(
112
+ image.unsqueeze(dim=0), size=size, interpolation=interpolation
113
+ ).squeeze(dim=0)
114
+ else:
115
+ image = TF.resize(image, size=size, interpolation=interpolation)
116
+
117
+ else:
118
+ if edge == "both":
119
+ image = TF.resize(image, size=[size, size], interpolation=interpolation)
120
+
121
+ else:
122
+ if isinstance(image, Image.Image):
123
+ w, h = image.size
124
+ else:
125
+ h, w = image.shape[-2:]
126
+ rh, rw = compute_size(input_size=(h, w), output_size=size, edge=edge)
127
+ image = TF.resize(image, size=[rh, rw], interpolation=interpolation)
128
+ return image
129
+
130
+
131
+ def random_scale(
132
+ image: Union[Image.Image, np.ndarray, torch.Tensor],
133
+ random_scale_range: Tuple[float, float],
134
+ mask: Optional[Union[Image.Image, np.ndarray, torch.Tensor]] = None,
135
+ ):
136
+ scale = uniform(*random_scale_range)
137
+ if isinstance(image, Image.Image):
138
+ w, h = image.size
139
+ else:
140
+ h, w = image.shape[-2:]
141
+ w_rs, h_rs = int(w * scale), int(h * scale)
142
+ image: Image.Image = resize(image, size=(h_rs, w_rs), interpolation="bilinear")
143
+ if mask is not None:
144
+ mask = resize(mask, size=(h_rs, w_rs), interpolation="nearest")
145
+ return image, mask
146
+
147
+
148
+ def random_hflip(
149
+ image: Union[Image.Image, np.ndarray, torch.Tensor],
150
+ p: float,
151
+ mask: Optional[Union[np.ndarray, torch.Tensor]] = None,
152
+ ):
153
+ assert 0.0 <= p <= 1.0, ValueError(random_hflip)
154
+
155
+ # Return a random floating point number in the range [0.0, 1.0).
156
+ if random() > p:
157
+ image = TF.hflip(image)
158
+ if mask is not None:
159
+ mask = TF.hflip(mask)
160
+ return image, mask
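A hypothetical sketch of the shorter-edge resize plus mean-padded random crop these helpers implement; the fill computation mirrors datasets/augmentations.py above.

# Hypothetical sketch, not part of the commit.
import numpy as np
from PIL import Image
from datasets.geometric_transforms import resize, random_crop

image = Image.open("data/examples/VOC_000030.jpg").convert("RGB")
image = resize(image, size=224, interpolation="bilinear", edge="shorter")

fill = tuple(np.array(image).mean(axis=(0, 1)).astype(np.uint8).tolist())
crop, offset = random_crop(image, crop_size=(224, 224), fill=fill)
print(crop.size, offset)   # (224, 224) and the (top, left) corner used for the crop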
datasets/uod_datasets.py ADDED
@@ -0,0 +1,384 @@
1
+ # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Code adapted from previous method LOST: https://github.com/valeoai/LOST
17
+ """
18
+
19
+ import os
20
+ import math
21
+ import torch
22
+ import json
23
+ import torchvision
24
+ import numpy as np
25
+ import skimage.io
26
+
27
+ from PIL import Image
28
+ from tqdm import tqdm
29
+ from torchvision import transforms as pth_transforms
30
+
31
+ # Image transformation applied to all images
32
+ transform = pth_transforms.Compose(
33
+ [
34
+ pth_transforms.ToTensor(),
35
+ pth_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
36
+ ]
37
+ )
38
+
39
+ class ImageDataset:
40
+ def __init__(
41
+ self,
42
+ image_path
43
+ ):
44
+
45
+ self.image_path = image_path
46
+ self.name = image_path.split("/")[-1]
47
+
48
+ # Read the image
49
+ with open(image_path, "rb") as f:
50
+ img = Image.open(f)
51
+ img = img.convert("RGB")
52
+
53
+ # Build a dataloader
54
+ img = transform(img)
55
+ self.dataloader = [[img, image_path]]
56
+
57
+ def get_image_name(self, *args, **kwargs):
58
+ return self.image_path.split("/")[-1].split(".")[0]
59
+
60
+ def load_image(self, *args, **kwargs):
61
+ return skimage.io.imread(self.image_path)
62
+
63
+ class UODDataset:
64
+ def __init__(
65
+ self,
66
+ dataset_name,
67
+ dataset_set,
68
+ root_dir,
69
+ remove_hards:bool = False,
70
+ ):
71
+ """
72
+ Build the dataloader
73
+ """
74
+
75
+ self.dataset_name = dataset_name
76
+ self.set = dataset_set
77
+ self.root_dir = root_dir
78
+
79
+ if dataset_name == "VOC07":
80
+ self.root_path = f"{root_dir}/VOC2007"
81
+ self.year = "2007"
82
+ elif dataset_name == "VOC12":
83
+ self.root_path = f"{root_dir}/VOC2012"
84
+ self.year = "2012"
85
+ elif dataset_name == "COCO20k":
86
+ self.year = "2014"
87
+ self.root_path = f"{root_dir}/COCO/images/{dataset_set}{self.year}"
88
+ self.sel20k = 'data/coco_20k_filenames.txt'
89
+ # JSON file constructed based on COCO train2014 gt
90
+ self.all_annfile = f"{root_dir}/COCO/annotations/instances_train2014.json"
91
+ self.annfile = f"{root_dir}/instances_train2014_sel20k.json"
92
+ if not os.path.exists(self.annfile):
93
+ select_coco_20k(self.sel20k, self.all_annfile)
94
+ else:
95
+ raise ValueError("Unknown dataset.")
96
+
97
+ if not os.path.exists(self.root_path):
98
+ raise ValueError("Please follow the README to setup the datasets.")
99
+
100
+ self.name = f"{self.dataset_name}_{self.set}"
101
+
102
+ # Build the dataloader
103
+ if "VOC" in dataset_name:
104
+ self.dataloader = torchvision.datasets.VOCDetection(
105
+ self.root_path,
106
+ year=self.year,
107
+ image_set=self.set,
108
+ transform=transform,
109
+ download=False,
110
+ )
111
+ elif "COCO20k" == dataset_name:
112
+ self.dataloader = torchvision.datasets.CocoDetection(
113
+ self.root_path, annFile=self.annfile, transform=transform
114
+ )
115
+ else:
116
+ raise ValueError("Unknown dataset.")
117
+
118
+ # Set hards images that are not included
119
+ self.remove_hards = remove_hards
120
+ self.hards = []
121
+ if remove_hards:
122
+ self.name += f"-nohards"
123
+ self.hards = self.get_hards()
124
+ print(f"Nb images discarded {len(self.hards)}")
125
+
126
+ def __len__(self) -> int:
127
+ return len(self.dataloader)
128
+
129
+ def load_image(self, im_name):
130
+ """
131
+ Load the image corresponding to the im_name
132
+ """
133
+ if "VOC" in self.dataset_name:
134
+ image = skimage.io.imread(f"{self.root_dir}/VOC{self.year}/JPEGImages/{im_name}")
135
+ elif "COCO" in self.dataset_name:
136
+ im_path = self.path_20k[self.sel_20k.index(im_name)]
137
+ image = skimage.io.imread(f"{self.root_dir}/COCO/images/{im_path}")
138
+ else:
139
+ raise ValueError("Unkown dataset.")
140
+ return image
141
+
142
+ def get_image_name(self, inp):
143
+ """
144
+ Return the image name
145
+ """
146
+ if "VOC" in self.dataset_name:
147
+ im_name = inp["annotation"]["filename"]
148
+ elif "COCO" in self.dataset_name:
149
+ im_name = str(inp[0]["image_id"])
150
+
151
+ return im_name
152
+
153
+ def extract_gt(self, targets, im_name):
154
+ if "VOC" in self.dataset_name:
155
+ return extract_gt_VOC(targets, remove_hards=self.remove_hards)
156
+ elif "COCO" in self.dataset_name:
157
+ return extract_gt_COCO(targets, remove_iscrowd=True)
158
+ else:
159
+ raise ValueError("Unknown dataset")
160
+
161
+ def extract_classes(self):
162
+ if "VOC" in self.dataset_name:
163
+ cls_path = f"classes_{self.set}_{self.year}.txt"
164
+ elif "COCO" in self.dataset_name:
165
+ cls_path = f"classes_{self.dataset}_{self.set}_{self.year}.txt"
166
+
167
+ # Load if exists
168
+ if os.path.exists(cls_path):
169
+ all_classes = []
170
+ with open(cls_path, "r") as f:
171
+ for line in f:
172
+ all_classes.append(line.strip())
173
+ else:
174
+ print("Extract all classes from the dataset")
175
+ if "VOC" in self.dataset_name:
176
+ all_classes = self.extract_classes_VOC()
177
+ elif "COCO" in self.dataset_name:
178
+ all_classes = self.extract_classes_COCO()
179
+
180
+ with open(cls_path, "w") as f:
181
+ for s in all_classes:
182
+ f.write(str(s) + "\n")
183
+
184
+ return all_classes
185
+
186
+ def extract_classes_VOC(self):
187
+ all_classes = []
188
+ for im_id, inp in enumerate(tqdm(self.dataloader)):
189
+ objects = inp[1]["annotation"]["object"]
190
+
191
+ for o in range(len(objects)):
192
+ if objects[o]["name"] not in all_classes:
193
+ all_classes.append(objects[o]["name"])
194
+
195
+ return all_classes
196
+
197
+ def extract_classes_COCO(self):
198
+ all_classes = []
199
+ for im_id, inp in enumerate(tqdm(self.dataloader)):
200
+ objects = inp[1]
201
+
202
+ for o in range(len(objects)):
203
+ if objects[o]["category_id"] not in all_classes:
204
+ all_classes.append(objects[o]["category_id"])
205
+
206
+ return all_classes
207
+
208
+ def get_hards(self):
209
+ hard_path = "datasets/hard_%s_%s_%s.txt" % (self.dataset_name, self.set, self.year)
210
+ if os.path.exists(hard_path):
211
+ hards = []
212
+ with open(hard_path, "r") as f:
213
+ for line in f:
214
+ hards.append(int(line.strip()))
215
+ else:
216
+ print("Discover hard images that should be discarded")
217
+
218
+ if "VOC" in self.dataset_name:
219
+ # set the hards
220
+ hards = discard_hard_voc(self.dataloader)
221
+
222
+ with open(hard_path, "w") as f:
223
+ for s in hards:
224
+ f.write(str(s) + "\n")
225
+
226
+ return hards
227
+
228
+
229
+ def discard_hard_voc(dataloader):
230
+ hards = []
231
+ for im_id, inp in enumerate(tqdm(dataloader)):
232
+ objects = inp[1]["annotation"]["object"]
233
+ nb_obj = len(objects)
234
+
235
+ hard = np.zeros(nb_obj)
236
+ for i, o in enumerate(range(nb_obj)):
237
+ hard[i] = (
238
+ 1
239
+ if (objects[o]["truncated"] == "1" or objects[o]["difficult"] == "1")
240
+ else 0
241
+ )
242
+
243
+ # all images with only truncated or difficult objects
244
+ if np.sum(hard) == nb_obj:
245
+ hards.append(im_id)
246
+ return hards
247
+
248
+
249
+ def extract_gt_COCO(targets, remove_iscrowd=True):
250
+ objects = targets
251
+ nb_obj = len(objects)
252
+
253
+ gt_bbxs = []
254
+ gt_clss = []
255
+ for o in range(nb_obj):
256
+ # Remove iscrowd boxes
257
+ if remove_iscrowd and objects[o]["iscrowd"] == 1:
258
+ continue
259
+ gt_cls = objects[o]["category_id"]
260
+ gt_clss.append(gt_cls)
261
+ bbx = objects[o]["bbox"]
262
+ x1y1x2y2 = [bbx[0], bbx[1], bbx[0] + bbx[2], bbx[1] + bbx[3]]
263
+ x1y1x2y2 = [int(round(x)) for x in x1y1x2y2]
264
+ gt_bbxs.append(x1y1x2y2)
265
+
266
+ return np.asarray(gt_bbxs), gt_clss
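+
+ # Illustrative note (not in the original code): COCO stores boxes as
+ # [x, y, width, height], so e.g. a box [10, 20, 30, 40] becomes the
+ # [x1, y1, x2, y2] box [10, 20, 40, 60] after the conversion above.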
267
+
268
+
269
+ def extract_gt_VOC(targets, remove_hards=False):
270
+ objects = targets["annotation"]["object"]
271
+ nb_obj = len(objects)
272
+
273
+ gt_bbxs = []
274
+ gt_clss = []
275
+ for o in range(nb_obj):
276
+ if remove_hards and (
277
+ objects[o]["truncated"] == "1" or objects[o]["difficult"] == "1"
278
+ ):
279
+ continue
280
+ gt_cls = objects[o]["name"]
281
+ gt_clss.append(gt_cls)
282
+ obj = objects[o]["bndbox"]
283
+ x1y1x2y2 = [
284
+ int(obj["xmin"]),
285
+ int(obj["ymin"]),
286
+ int(obj["xmax"]),
287
+ int(obj["ymax"]),
288
+ ]
289
+ # Original annotations are integers in the range [1, W or H]
290
+ # Assuming they mean 1-based pixel indices (inclusive),
291
+ # a box with annotation (xmin=1, xmax=W) covers the whole image.
292
+ # In coordinate space this is represented by (xmin=0, xmax=W)
293
+ x1y1x2y2[0] -= 1
294
+ x1y1x2y2[1] -= 1
295
+ gt_bbxs.append(x1y1x2y2)
296
+
297
+ return np.asarray(gt_bbxs), gt_clss
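+
+ # Illustrative note (assuming the 1-based, inclusive VOC convention described
+ # above): a full-image annotation (xmin=1, ymin=1, xmax=640, ymax=480) maps to
+ # [0, 0, 640, 480] in 0-based coordinate space after the -1 shift on xmin/ymin.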
298
+
299
+
300
+ def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
301
+ # https://github.com/ultralytics/yolov5/blob/develop/utils/general.py
302
+ # Returns the IoU of box1 to box2. box1 is 4, box2 is nx4
303
+ box2 = box2.T
304
+
305
+ # Get the coordinates of bounding boxes
306
+ if x1y1x2y2: # x1, y1, x2, y2 = box1
307
+ b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
308
+ b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
309
+ else: # transform from xywh to xyxy
310
+ b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
311
+ b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
312
+ b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
313
+ b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2
314
+
315
+ # Intersection area
316
+ inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * (
317
+ torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)
318
+ ).clamp(0)
319
+
320
+ # Union Area
321
+ w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
322
+ w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
323
+ union = w1 * h1 + w2 * h2 - inter + eps
324
+
325
+ iou = inter / union
326
+ if GIoU or DIoU or CIoU:
327
+ cw = torch.max(b1_x2, b2_x2) - torch.min(
328
+ b1_x1, b2_x1
329
+ ) # convex (smallest enclosing box) width
330
+ ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1) # convex height
331
+ if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
332
+ c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared
333
+ rho2 = (
334
+ (b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2
335
+ + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2
336
+ ) / 4 # center distance squared
337
+ if DIoU:
338
+ return iou - rho2 / c2 # DIoU
339
+ elif (
340
+ CIoU
341
+ ): # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
342
+ v = (4 / math.pi ** 2) * torch.pow(
343
+ torch.atan(w2 / h2) - torch.atan(w1 / h1), 2
344
+ )
345
+ with torch.no_grad():
346
+ alpha = v / (v - iou + (1 + eps))
347
+ return iou - (rho2 / c2 + v * alpha) # CIoU
348
+ else: # GIoU https://arxiv.org/pdf/1902.09630.pdf
349
+ c_area = cw * ch + eps # convex area
350
+ return iou - (c_area - union) / c_area # GIoU
351
+ else:
352
+ return iou # IoU
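+
+ # Minimal usage sketch (illustrative, hypothetical boxes): plain IoU of one
+ # predicted box against several ground-truth boxes in x1y1x2y2 format.
+ # pred = torch.tensor([0., 0., 10., 10.])
+ # gts = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.]])
+ # bbox_iou(pred, gts) # -> approximately tensor([1.00, 0.14])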
353
+
354
+ def select_coco_20k(sel_file, all_annotations_file):
355
+ print('Building COCO 20k dataset.')
356
+
357
+ # load all annotations
358
+ with open(all_annotations_file, "r") as f:
359
+ train2014 = json.load(f)
360
+
361
+ # load selected images
362
+ with open(sel_file, "r") as f:
363
+ sel_20k = f.readlines()
364
+ sel_20k = [s.replace("\n", "") for s in sel_20k]
365
+ im20k = [str(int(s.split("_")[-1].split(".")[0])) for s in sel_20k]
366
+
367
+ new_anno = []
368
+ new_images = []
369
+
370
+ for i in tqdm(im20k):
371
+ new_anno.extend(
372
+ [a for a in train2014["annotations"] if a["image_id"] == int(i)]
373
+ )
374
+ new_images.extend([a for a in train2014["images"] if a["id"] == int(i)])
375
+
376
+ train2014_20k = {}
377
+ train2014_20k["images"] = new_images
378
+ train2014_20k["annotations"] = new_anno
379
+ train2014_20k["categories"] = train2014["categories"]
380
+
381
+ with open("datasets/instances_train2014_sel20k.json", "w") as outfile:
382
+ json.dump(train2014_20k, outfile)
383
+
384
+ print('Done.')
datasets/utils.py ADDED
@@ -0,0 +1,44 @@
+ import numpy as np
2
+ import torch
3
+ from PIL import Image
4
+ from torchvision import transforms as T
5
+
6
+ NORMALIZE = T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
7
+
8
+ class GaussianBlur:
9
+ """
10
+ Code borrowed from SelfMask: https://github.com/NoelShin/selfmask
11
+ """
12
+
13
+ # Implements Gaussian blur as described in the SimCLR paper
14
+ def __init__(self, kernel_size: float, min: float = 0.1, max: float = 2.0) -> None:
15
+ self.min = min
16
+ self.max = max
17
+ # kernel size is set to be 10% of the image height/width
18
+ self.kernel_size = kernel_size
19
+
20
+ def __call__(self, sample: Image.Image, random_gaussian_blur_p: float):
21
+ sample = np.array(sample)
22
+
23
+ # blur the image with a 50% chance
24
+ prob = np.random.random_sample()
25
+
26
+ if prob < 0.5:
27
+ import cv2
28
+
29
+ sigma = (self.max - self.min) * np.random.random_sample() + self.min
30
+ sample = cv2.GaussianBlur(
31
+ sample, (self.kernel_size, self.kernel_size), sigma
32
+ )
33
+ return sample
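+
+ # Usage sketch (illustrative): cv2.GaussianBlur requires an odd kernel size, so
+ # a kernel of roughly 10% of the image side should be rounded to an odd integer,
+ # e.g. blur = GaussianBlur(kernel_size=23); out = blur(pil_img, 0.5)  # ndarray
+ # where pil_img is a hypothetical PIL.Image input.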
34
+
35
+
36
+ def unnormalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
37
+ """
38
+ Code borrowed from STEGO: https://github.com/mhamilton723/STEGO
39
+ """
40
+ image2 = torch.clone(image)
41
+ for t, m, s in zip(image2, mean, std):
42
+ t.mul_(s).add_(m)
43
+
44
+ return image2
evaluation/__init__.py ADDED
File without changes
evaluation/metrics/__init__.py ADDED
File without changes
evaluation/metrics/average_meter.py ADDED
@@ -0,0 +1,21 @@
+ """
2
+ Code borrowed from SelfMask: https://github.com/NoelShin/selfmask
3
+ """
4
+
5
+ class AverageMeter(object):
6
+ """Computes and stores the average and current value"""
7
+
8
+ def __init__(self):
9
+ self.reset()
10
+
11
+ def reset(self):
12
+ self.val = 0
13
+ self.avg = 0
14
+ self.sum = 0
15
+ self.count = 0
16
+
17
+ def update(self, val, n: int):
18
+ self.val = val
19
+ self.sum += val * n
20
+ self.count += n
21
+ self.avg = self.sum / self.count
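+
+ # Usage sketch (illustrative): the meter keeps a count-weighted running average.
+ # m = AverageMeter()
+ # m.update(0.5, n=2) # sum = 1.0, count = 2
+ # m.update(1.0, n=1) # sum = 2.0, count = 3 -> m.avg == 2.0 / 3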
evaluation/metrics/f_measure.py ADDED
@@ -0,0 +1,111 @@
+ """
2
+ Code borrowed from SelfMask: https://github.com/NoelShin/selfmask
3
+ """
4
+
5
+ import torch
6
+
7
+ class FMeasure:
8
+ def __init__(
9
+ self,
10
+ default_thres: float = 0.5,
11
+ beta_square: float = 0.3,
12
+ n_bins: int = 255,
13
+ eps: float = 1e-7,
14
+ ):
15
+ """
16
+ :param default_thres: a hyperparameter for F-measure that is used to binarize a predicted mask. Default: 0.5
17
+ :param beta_square: a hyperparameter for F-measure. Default: 0.3
18
+ :param n_bins: the number of thresholds that will be tested for F-max. Default: 255
19
+ :param eps: a small value for numerical stability
20
+ """
21
+
22
+ self.beta_square = beta_square
23
+ self.default_thres = default_thres
24
+ self.eps = eps
25
+ self.n_bins = n_bins
26
+
27
+ def _compute_precision_recall(
28
+ self, binary_pred_mask: torch.Tensor, gt_mask: torch.Tensor
29
+ ) -> torch.Tensor:
30
+ """
31
+ :param binary_pred_mask: (B x H x W) or (H x W)
32
+ :param gt_mask: (B x H x W) or (H x W), should be the same with binary_pred_mask
33
+ """
34
+ tp = torch.logical_and(binary_pred_mask, gt_mask).sum(dim=(-1, -2))
35
+ tp_fp = binary_pred_mask.sum(dim=(-1, -2))
36
+ tp_fn = gt_mask.sum(dim=(-1, -2))
37
+
38
+ prec = tp / (tp_fp + self.eps)
39
+ recall = tp / (tp_fn + self.eps)
40
+ return prec, recall
41
+
42
+ def _compute_f_measure(
43
+ self,
44
+ pred_mask: torch.Tensor,
45
+ gt_mask: torch.Tensor,
46
+ thresholds: torch.Tensor = None,
47
+ ) -> torch.Tensor:
48
+ if thresholds is None:
49
+ binary_pred_mask = pred_mask > self.default_thres
50
+ else:
51
+ binary_pred_mask = pred_mask > thresholds
52
+
53
+ prec, recall = self._compute_precision_recall(binary_pred_mask, gt_mask)
54
+ f_measure = ((1 + (self.beta_square**2)) * prec * recall) / (
55
+ (self.beta_square**2) * prec + recall + self.eps
56
+ )
57
+ return f_measure.cpu()
58
+
59
+ def _compute_f_max(
60
+ self, pred_mask: torch.Tensor, gt_mask: torch.Tensor
61
+ ) -> torch.Tensor:
62
+ """Compute self.n_bins + 1 F-measures, each of which has a different threshold, then return the maximum
63
+ F-measure among them.
64
+
65
+ :param pred_mask: (H x W)
66
+ :param gt_mask: (H x W)
67
+ """
68
+
69
+ # pred_masks, gt_masks: H x W -> self.n_bins x H x W
70
+ pred_masks = pred_mask.unsqueeze(dim=0).repeat(self.n_bins, 1, 1)
71
+ gt_masks = gt_mask.unsqueeze(dim=0).repeat(self.n_bins, 1, 1)
72
+
73
+ # thresholds: self.n_bins x 1 x 1
74
+ thresholds = (
75
+ torch.arange(0, 1, 1 / self.n_bins)
76
+ .view(self.n_bins, 1, 1)
77
+ .to(pred_masks.device)
78
+ )
79
+
80
+ # f_measures: self.n_bins
81
+ f_measures = self._compute_f_measure(pred_masks, gt_masks, thresholds)
82
+ return torch.max(f_measures).cpu(), f_measures
83
+
84
+ def _compute_f_mean(
85
+ self,
86
+ pred_mask: torch.Tensor,
87
+ gt_mask: torch.Tensor,
88
+ ) -> torch.Tensor:
89
+ adaptive_thres = 2 * pred_mask.mean(dim=(-1, -2), keepdim=True)
90
+ binary_pred_mask = pred_mask > adaptive_thres
91
+
92
+ prec, recall = self._compute_precision_recall(binary_pred_mask, gt_mask)
93
+ f_mean = ((1 + (self.beta_square**2)) * prec * recall) / (
94
+ (self.beta_square**2) * prec + recall + self.eps
95
+ )
96
+ return f_mean.cpu()
97
+
98
+ def __call__(self, pred_mask: torch.Tensor, gt_mask: torch.Tensor) -> dict:
99
+ """
100
+ :param pred_mask: (H x W) a normalized prediction mask with values in [0, 1]
101
+ :param gt_mask: (H x W) a binary ground truth mask with values in {0, 1}
102
+ :return: a dictionary with keys being "f_measure" and "f_max" and values being the respective values.
103
+ """
104
+ outputs: dict = dict()
105
+ for k in ("f_measure", "f_mean"):
106
+ outputs.update({k: getattr(self, f"_compute_{k}")(pred_mask, gt_mask)})
107
+
108
+ f_max_, all_f = self._compute_f_max(pred_mask, gt_mask)
109
+ outputs["f_max"] = f_max_
110
+ outputs["all_f"] = all_f # List of all f values for all thresholds
111
+ return outputs
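+
+ # Usage sketch (illustrative, hypothetical tensors):
+ # fm = FMeasure()
+ # out = fm(pred_mask=torch.rand(64, 64), gt_mask=(torch.rand(64, 64) > 0.5).long())
+ # out contains "f_measure", "f_mean", "f_max" and "all_f" (one value per threshold).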
evaluation/metrics/iou.py ADDED
@@ -0,0 +1,37 @@
+ """
2
+ Code adapted from SelfMask: https://github.com/NoelShin/selfmask
3
+ """
4
+
5
+ from typing import Optional, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ def compute_iou(
12
+ pred_mask: Union[np.ndarray, torch.Tensor],
13
+ gt_mask: Union[np.ndarray, torch.Tensor],
14
+ threshold: Optional[float] = 0.5,
15
+ eps: float = 1e-7,
16
+ ) -> Union[np.ndarray, torch.Tensor]:
17
+ """
18
+ :param pred_mask: (B x H x W) or (H x W)
19
+ :param gt_mask: (B x H x W) or (H x W), same shape with pred_mask
20
+ :param threshold: a binarization threshold
21
+ :param eps: a small value for computational stability
22
+ :return: (B) or (1)
23
+ """
24
+ assert pred_mask.shape == gt_mask.shape, f"{pred_mask.shape} != {gt_mask.shape}"
25
+ # assert 0. <= pred_mask.to(torch.float32).min() and pred_mask.max().to(torch.float32) <= 1., f"{pred_mask.min(), pred_mask.max()}"
26
+
27
+ if threshold is not None:
28
+ pred_mask = pred_mask > threshold
29
+ if isinstance(pred_mask, np.ndarray):
30
+ intersection = np.logical_and(pred_mask, gt_mask).sum(axis=(-1, -2))
31
+ union = np.logical_or(pred_mask, gt_mask).sum(axis=(-1, -2))
32
+ ious = intersection / (union + eps)
33
+ else:
34
+ intersection = torch.logical_and(pred_mask, gt_mask).sum(dim=(-1, -2))
35
+ union = torch.logical_or(pred_mask, gt_mask).sum(dim=(-1, -2))
36
+ ious = (intersection / (union + eps)).cpu()
37
+ return ious
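+
+ # Illustrative example: a 2x2 prediction covering the left column against a
+ # ground truth covering the top row gives intersection 1, union 3, IoU ~ 1/3.
+ # compute_iou(torch.tensor([[1., 0.], [1., 0.]]), torch.tensor([[1, 1], [0, 0]]))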
evaluation/metrics/mae.py ADDED
@@ -0,0 +1,14 @@
+ """
2
+ Code borrowed from SelfMask: https://github.com/NoelShin/selfmask
3
+ """
4
+
5
+ import torch
6
+
7
+ def compute_mae(pred_mask: torch.Tensor, gt_mask: torch.Tensor) -> torch.Tensor:
8
+ """
9
+ :param pred_mask: (H x W) or (B x H x W) a normalized prediction mask with values in [0, 1]
10
+ :param gt_mask: (H x W) or (B x H x W) a binary ground truth mask with values in {0, 1}
11
+ """
12
+ return torch.mean(
13
+ torch.abs(pred_mask - gt_mask.to(torch.float32)), dim=(-1, -2)
14
+ ).cpu()
evaluation/metrics/pixel_acc.py ADDED
@@ -0,0 +1,21 @@
+ """
2
+ Code borrowed from SelfMask: https://github.com/NoelShin/selfmask
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+
9
+
10
+ def compute_pixel_accuracy(
11
+ pred_mask: torch.Tensor, gt_mask: torch.Tensor, threshold: Optional[float] = 0.5
12
+ ) -> torch.Tensor:
13
+ """
14
+ :param pred_mask: (H x W) or (B x H x W) a normalized prediction mask with values in [0, 1]
15
+ :param gt_mask: (H x W) or (B x H x W) a binary ground truth mask with values in {0, 1}
16
+ """
17
+ if threshold is not None:
18
+ binary_pred_mask = pred_mask > threshold
19
+ else:
20
+ binary_pred_mask = pred_mask
21
+ return (binary_pred_mask == gt_mask).to(torch.float32).mean(dim=(-1, -2)).cpu()
evaluation/metrics/s_measure.py ADDED
@@ -0,0 +1,126 @@
+ # code borrowed from https://github.com/Hanqer/Evaluate-SOD/blob/master/evaluator.py
2
+ import numpy as np
3
+ import torch
4
+
5
+
6
+ class SMeasure:
7
+ def __init__(self, alpha: float = 0.5):
8
+ self.alpha: float = alpha
9
+ self.cuda: bool = True
10
+
11
+ def _centroid(self, gt):
12
+ rows, cols = gt.size()[-2:]
13
+ gt = gt.view(rows, cols)
14
+ if gt.sum() == 0:
15
+ if self.cuda:
16
+ X = torch.eye(1).cuda() * round(cols / 2)
17
+ Y = torch.eye(1).cuda() * round(rows / 2)
18
+ else:
19
+ X = torch.eye(1) * round(cols / 2)
20
+ Y = torch.eye(1) * round(rows / 2)
21
+ else:
22
+ total = gt.sum()
23
+ if self.cuda:
24
+ i = torch.from_numpy(np.arange(0, cols)).cuda().float()
25
+ j = torch.from_numpy(np.arange(0, rows)).cuda().float()
26
+ else:
27
+ i = torch.from_numpy(np.arange(0, cols)).float()
28
+ j = torch.from_numpy(np.arange(0, rows)).float()
29
+ X = torch.round((gt.sum(dim=0) * i).sum() / total)
30
+ Y = torch.round((gt.sum(dim=1) * j).sum() / total)
31
+ return X.long(), Y.long()
32
+
33
+ def _ssim(self, pred, gt):
34
+ gt = gt.float()
35
+ h, w = pred.size()[-2:]
36
+ N = h * w
37
+ x = pred.mean()
38
+ y = gt.mean()
39
+ sigma_x2 = ((pred - x) * (pred - x)).sum() / (N - 1 + 1e-20)
40
+ sigma_y2 = ((gt - y) * (gt - y)).sum() / (N - 1 + 1e-20)
41
+ sigma_xy = ((pred - x) * (gt - y)).sum() / (N - 1 + 1e-20)
42
+
43
+ aplha = 4 * x * y * sigma_xy
44
+ beta = (x * x + y * y) * (sigma_x2 + sigma_y2)
45
+
46
+ if aplha != 0:
47
+ Q = aplha / (beta + 1e-20)
48
+ elif aplha == 0 and beta == 0:
49
+ Q = 1.0
50
+ else:
51
+ Q = 0
52
+ return Q
53
+
54
+ def _object(self, pred, gt):
55
+ temp = pred[gt == 1]
56
+ x = temp.mean()
57
+ sigma_x = temp.std()
58
+ score = 2.0 * x / (x * x + 1.0 + sigma_x + 1e-20)
59
+
60
+ return score
61
+
62
+ def _s_object(self, pred, gt):
63
+ fg = torch.where(gt == 0, torch.zeros_like(pred), pred)
64
+ bg = torch.where(gt == 1, torch.zeros_like(pred), 1 - pred)
65
+ o_fg = self._object(fg, gt)
66
+ o_bg = self._object(bg, 1 - gt)
67
+ u = gt.mean()
68
+ Q = u * o_fg + (1 - u) * o_bg
69
+ return Q
70
+
71
+ def _divide_gt(self, gt, X, Y):
72
+ h, w = gt.size()[-2:]
73
+ area = h * w
74
+ gt = gt.view(h, w)
75
+ LT = gt[:Y, :X]
76
+ RT = gt[:Y, X:w]
77
+ LB = gt[Y:h, :X]
78
+ RB = gt[Y:h, X:w]
79
+ X = X.float()
80
+ Y = Y.float()
81
+ w1 = X * Y / area
82
+ w2 = (w - X) * Y / area
83
+ w3 = X * (h - Y) / area
84
+ w4 = 1 - w1 - w2 - w3
85
+ return LT, RT, LB, RB, w1, w2, w3, w4
86
+
87
+ def _divide_prediction(self, pred, X, Y):
88
+ h, w = pred.size()[-2:]
89
+ pred = pred.view(h, w)
90
+ LT = pred[:Y, :X]
91
+ RT = pred[:Y, X:w]
92
+ LB = pred[Y:h, :X]
93
+ RB = pred[Y:h, X:w]
94
+ return LT, RT, LB, RB
95
+
96
+ def _s_region(self, pred, gt):
97
+ X, Y = self._centroid(gt)
98
+ gt1, gt2, gt3, gt4, w1, w2, w3, w4 = self._divide_gt(gt, X, Y)
99
+ p1, p2, p3, p4 = self._divide_prediction(pred, X, Y)
100
+ Q1 = self._ssim(p1, gt1)
101
+ Q2 = self._ssim(p2, gt2)
102
+ Q3 = self._ssim(p3, gt3)
103
+ Q4 = self._ssim(p4, gt4)
104
+ Q = w1 * Q1 + w2 * Q2 + w3 * Q3 + w4 * Q4
105
+ # print(Q)
106
+ return Q
107
+
108
+ def __call__(self, pred_mask: torch.Tensor, gt_mask: torch.Tensor):
109
+ assert pred_mask.shape == gt_mask.shape
110
+ y = gt_mask.mean()
111
+ if y == 0:
112
+ x = pred_mask.mean()
113
+ Q = 1.0 - x
114
+ elif y == 1:
115
+ x = pred_mask.mean()
116
+ Q = x
117
+ else:
118
+ gt_mask[gt_mask >= 0.5] = 1
119
+ gt_mask[gt_mask < 0.5] = 0
120
+ # print(self._S_object(pred, gt), self._S_region(pred, gt))
121
+ Q = self.alpha * self._s_object(pred_mask, gt_mask) + (
122
+ 1 - self.alpha
123
+ ) * self._s_region(pred_mask, gt_mask)
124
+ if Q.item() < 0:
125
+ Q = torch.FloatTensor([0.0])
126
+ return Q.item()
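+
+ # Usage sketch (illustrative): S-measure mixes object-aware and region-aware
+ # structural similarity with weight alpha (0.5 by default). Note that this
+ # implementation assumes CUDA tensors, since self.cuda is hard-coded to True.
+ # score = SMeasure()(pred_mask.cuda(), gt_mask.cuda().float()) # a float score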
evaluation/saliency.py ADDED
@@ -0,0 +1,290 @@
+ # Copyright 2022 - Valeo Comfort and Driving Assistance - valeo.ai
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import numpy as np
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+
20
+ from tqdm import tqdm
21
+ from scipy import ndimage
22
+
23
+ from evaluation.metrics.average_meter import AverageMeter
24
+ from evaluation.metrics.f_measure import FMeasure
25
+ from evaluation.metrics.iou import compute_iou
26
+ from evaluation.metrics.mae import compute_mae
27
+ from evaluation.metrics.pixel_acc import compute_pixel_accuracy
28
+ from evaluation.metrics.s_measure import SMeasure
29
+
30
+ from misc import batch_apply_bilateral_solver
31
+
32
+
33
+ @torch.no_grad()
34
+ def write_metric_tf(
35
+ writer,
36
+ metrics,
37
+ n_iter = -1,
38
+ name = ""
39
+ ):
40
+ writer.add_scalar(
41
+ f"Validation/{name}iou_pred",
42
+ metrics["ious"].avg,
43
+ n_iter,
44
+ )
45
+ writer.add_scalar(
46
+ f"Validation/{name}acc_pred",
47
+ metrics["pixel_accs"].avg,
48
+ n_iter,
49
+ )
50
+ writer.add_scalar(
51
+ f"Validation/{name}f_max",
52
+ metrics["f_maxs"].avg,
53
+ n_iter,
54
+ )
55
+
56
+ @torch.no_grad()
57
+ def eval_batch(
58
+ batch_gt_masks,
59
+ batch_pred_masks,
60
+ metrics_res={},
61
+ reset=False
62
+ ):
63
+ """
64
+ Evaluation code adapted from SelfMask: https://github.com/NoelShin/selfmask
65
+ """
66
+
67
+ f_values = {}
68
+ # Keep track of f_values for each threshold
69
+ for i in range(255): # should equal n_bins in metrics/f_measure.py
70
+ f_values[i] = AverageMeter()
71
+
72
+ if metrics_res == {}:
73
+ metrics_res["f_scores"] = AverageMeter()
74
+ metrics_res["f_maxs"] = AverageMeter()
75
+ metrics_res["f_maxs_fixed"] = AverageMeter()
76
+ metrics_res["f_means"] = AverageMeter()
77
+ metrics_res["maes"] = AverageMeter()
78
+ metrics_res["ious"] = AverageMeter()
79
+ metrics_res["pixel_accs"] = AverageMeter()
80
+ metrics_res["s_measures"] = AverageMeter()
81
+
82
+ if reset:
83
+ metrics_res["f_scores"].reset()
84
+ metrics_res["f_maxs"].reset()
85
+ metrics_res["f_maxs_fixed"].reset()
86
+ metrics_res["f_means"].reset()
87
+ metrics_res["maes"].reset()
88
+ metrics_res["ious"].reset()
89
+ metrics_res["pixel_accs"].reset()
90
+ metrics_res["s_measures"].reset()
91
+
92
+ # iterate over batch dimension
93
+ for _, (pred_mask, gt_mask) in enumerate(
94
+ zip(batch_pred_masks, batch_gt_masks)
95
+ ):
96
+ assert pred_mask.shape == gt_mask.shape, f"{pred_mask.shape} != {gt_mask.shape}"
97
+ assert len(pred_mask.shape) == len(gt_mask.shape) == 2
98
+ # Compute
99
+ # Binarize at 0.5 for IoU and pixel accuracy
100
+ binary_pred = (pred_mask > 0.5).float().squeeze()
101
+ iou = compute_iou(binary_pred, gt_mask)
102
+ f_measures = FMeasure()(pred_mask, gt_mask) # soft mask for F measure
103
+ mae = compute_mae(binary_pred, gt_mask)
104
+ pixel_acc = compute_pixel_accuracy(binary_pred, gt_mask)
105
+
106
+ # Update
107
+ metrics_res["ious"].update(val=iou.numpy(), n=1)
108
+ metrics_res["f_scores"].update(val=f_measures["f_measure"].numpy(), n=1)
109
+ metrics_res["f_maxs"].update(val=f_measures["f_max"].numpy(), n=1)
110
+ metrics_res["f_means"].update(val=f_measures["f_mean"].numpy(), n=1)
111
+ metrics_res["s_measures"].update(
112
+ val=SMeasure()(pred_mask=pred_mask, gt_mask=gt_mask.to(torch.float32)), n=1
113
+ )
114
+ metrics_res["maes"].update(val=mae.numpy(), n=1)
115
+ metrics_res["pixel_accs"].update(val=pixel_acc.numpy(), n=1)
116
+
117
+ # Keep track of f_values for each threshold
118
+ all_f = f_measures["all_f"].numpy()
119
+ for k, v in f_values.items():
120
+ v.update(val=all_f[k], n=1)
121
+ # Then compute the max for the f_max_fixed
122
+ metrics_res["f_maxs_fixed"].update(
123
+ val=np.max([v.avg for v in f_values.values()]), n=1
124
+ )
125
+
126
+ results = {}
127
+ # F-measure, F-max, F-mean, MAE, S-measure, IoU, pixel acc.
128
+ results["f_measure"] = metrics_res["f_scores"].avg
129
+ results["f_max"] = metrics_res["f_maxs"].avg
130
+ results["f_maxs_fixed"] = metrics_res["f_maxs_fixed"].avg
131
+ results["f_mean"] = metrics_res["f_means"].avg
132
+ results["s_measure"] = metrics_res["s_measures"].avg
133
+ results["mae"] = metrics_res["maes"].avg
134
+ results["iou"] = float(iou.numpy())
135
+ results["pixel_acc"] = metrics_res["pixel_accs"].avg
136
+
137
+ return results, metrics_res
138
+
139
+ def evaluate_saliency(
140
+ dataset,
141
+ model,
142
+ writer=None,
143
+ batch_size=1,
144
+ n_iter=-1,
145
+ apply_bilateral=False,
146
+ im_fullsize=True,
147
+ method="pred", # can also be "bkg",
148
+ apply_weights: bool = True,
149
+ evaluation_mode: str = 'single', # choices are ["single", "multi"]
150
+ ):
151
+
152
+ if im_fullsize:
153
+ # Change transformation
154
+ dataset.fullimg_mode()
155
+ batch_size = 1
156
+
157
+ valloader = torch.utils.data.DataLoader(
158
+ dataset,
159
+ batch_size=batch_size,
160
+ shuffle=False,
161
+ num_workers=2
162
+ )
163
+
164
+ sigmoid = nn.Sigmoid()
165
+
166
+ metrics_res = {}
167
+ metrics_res_bs = {}
168
+ valbar = tqdm(enumerate(valloader, 0), leave=None)
169
+ for i, data in valbar:
170
+ inputs, _, gt_labels, _ = data
171
+ inputs = inputs.to("cuda")
172
+ gt_labels = gt_labels.to("cuda").float()
173
+
174
+ # Forward step
175
+ with torch.no_grad():
176
+ preds, _, shape_f, att = model.forward_step(inputs, for_eval=True)
177
+
178
+ if method == "pred":
179
+ h, w = gt_labels.shape[-2:]
180
+ preds_up = F.interpolate(
181
+ preds, scale_factor=model.vit_patch_size, mode="bilinear", align_corners=False
182
+ )[..., :h, :w]
183
+ soft_preds = sigmoid(preds_up.detach()).squeeze(0)
184
+ preds_up = (
185
+ (sigmoid(preds_up.detach()) > 0.5).squeeze(0).float()
186
+ )
187
+
188
+ elif method == "bkg":
189
+ bkg_mask_pred = model.compute_background_batch(
190
+ att, shape_f,
191
+ apply_weights=apply_weights,
192
+ )
193
+ # Transform bkg detection to foreground detection
194
+ obj_mask = (
195
+ ~bkg_mask_pred.bool()
196
+ ).float() # Obj labels is inverse of bkg
197
+
198
+ # Fit predictions to image size
199
+ preds_up = F.interpolate(
200
+ obj_mask.unsqueeze(1),
201
+ gt_labels.shape[-2:],
202
+ mode="bilinear",
203
+ align_corners=False,
204
+ )
205
+ preds_up = (preds_up > 0.5).float()
206
+ soft_preds = preds_up # not soft actually
207
+
208
+ reset = True if i == 0 else False
209
+ if evaluation_mode == 'single':
210
+ labeled, nr_objects = ndimage.label(preds_up.squeeze().cpu().numpy())
211
+ if nr_objects == 0:
212
+ preds_up_one_cc = preds_up.squeeze()
213
+ print("nr_objects == 0")
214
+ else:
215
+ nb_pixel = [np.sum(labeled == i) for i in range(nr_objects + 1)]
216
+ pixel_order = np.argsort(nb_pixel)
217
+
218
+ cc = [torch.Tensor(labeled == i) for i in pixel_order]
219
+ cc = torch.stack(cc).cuda()
220
+
221
+ # Find CC set as background, here not necessarily the biggest
222
+ cc_background = (
223
+ (
224
+ (
225
+ (~(preds_up[None, :, :, :].bool())).float()
226
+ + cc[:, None, :, :].cuda()
227
+ )
228
+ > 1
229
+ ).sum(-1).sum(-1).argmax()
230
+ )
231
+ pixel_order = np.delete(
232
+ pixel_order, int(cc_background.cpu().numpy())
233
+ )
234
+
235
+ preds_up_one_cc = torch.Tensor(labeled == pixel_order[-1]).cuda()
236
+
237
+ _, metrics_res = eval_batch(
238
+ gt_labels,
239
+ preds_up_one_cc.unsqueeze(0),
240
+ metrics_res=metrics_res,
241
+ reset=reset,
242
+ )
243
+
244
+ if writer is not None:
245
+ write_metric_tf(writer, metrics_res, n_iter=n_iter, name=f"_{evaluation_mode}_")
246
+
247
+ elif evaluation_mode == 'multi':
248
+ # Eval without bilateral solver
249
+ _, metrics_res = eval_batch(
250
+ gt_labels,
251
+ soft_preds.unsqueeze(0) if len(soft_preds.shape) == 2 else soft_preds,
252
+ metrics_res=metrics_res,
253
+ reset=reset,
254
+ ) # soft preds needed for F beta measure
255
+
256
+ # Apply bilateral solver
257
+ preds_bs = None
258
+ if apply_bilateral:
259
+ get_all_cc = True if evaluation_mode == 'multi' else False
260
+ preds_bs, _ = batch_apply_bilateral_solver(data,
261
+ preds_up.detach(),
262
+ get_all_cc = get_all_cc
263
+ )
264
+
265
+ _, metrics_res_bs = eval_batch(
266
+ gt_labels,
267
+ preds_bs[None,:,:].float(),
268
+ metrics_res=metrics_res_bs,
269
+ reset=reset
270
+ )
271
+
272
+ if writer is not None:
273
+ write_metric_tf(writer, metrics_res_bs, n_iter=n_iter, name=f"_{evaluation_mode}-BS_")
274
+
275
+ bar_str = f"{dataset.name} | {evaluation_mode} mode | " \
276
+ f"F-max {metrics_res['f_maxs'].avg:.3f} " \
277
+ f"IoU {metrics_res['ious'].avg:.3f}, " \
278
+ f"PA {metrics_res['pixel_accs'].avg:.3f}"
279
+
280
+ if apply_bilateral:
281
+ bar_str += f" | with bilateral solver: " \
282
+ f"F-max {metrics_res_bs['f_maxs'].avg:.3f}, " \
283
+ f"IoU {metrics_res_bs['ious'].avg:.3f}, " \
284
+ f"PA. {metrics_res_bs['pixel_accs'].avg:.3f}"
285
+
286
+ valbar.set_description(bar_str)
287
+
288
+ # Go back to original transformation
289
+ if im_fullsize:
290
+ dataset.training_mode()
evaluation/uod.py ADDED
@@ -0,0 +1,118 @@
+ # Copyright 2021 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Code adapted from previous method LOST: https://github.com/valeoai/LOST
17
+ """
18
+
19
+ import os
20
+ import time
21
+ import torch
22
+ import torch.nn as nn
23
+ import numpy as np
24
+
25
+ from tqdm import tqdm
26
+ from misc import bbox_iou, get_bbox_from_segmentation_labels
27
+
28
+
29
+ def evaluation_unsupervised_object_discovery(
30
+ dataset,
31
+ model,
32
+ evaluation_mode: str = 'single', # choices are ["single", "multi"]
33
+ output_dir:str = "outputs",
34
+ no_hards:bool = False,
35
+ ):
36
+
37
+ assert evaluation_mode == "single"
38
+
39
+ sigmoid = nn.Sigmoid()
40
+
41
+ # ----------------------------------------------------
42
+ # Loop over images
43
+ preds_dict = {}
44
+ cnt = 0
45
+ corloc = np.zeros(len(dataset.dataloader))
46
+
47
+ start_time = time.time()
48
+ pbar = tqdm(dataset.dataloader)
49
+ for im_id, inp in enumerate(pbar):
50
+
51
+ # ------------ IMAGE PROCESSING -------------------------------------------
52
+ img = inp[0]
53
+
54
+ init_image_size = img.shape
55
+
56
+ # Get the name of the image
57
+ im_name = dataset.get_image_name(inp[1])
58
+ # Pass in case of no gt boxes in the image
59
+ if im_name is None:
60
+ continue
61
+
62
+ # Padding the image with zeros to fit multiple of patch-size
63
+ size_im = (
64
+ img.shape[0],
65
+ int(np.ceil(img.shape[1] / model.vit_patch_size) * model.vit_patch_size),
66
+ int(np.ceil(img.shape[2] / model.vit_patch_size) * model.vit_patch_size),
67
+ )
68
+ paded = torch.zeros(size_im)
69
+ paded[:, : img.shape[1], : img.shape[2]] = img
70
+ img = paded
71
+
72
+ # # Move to gpu
73
+ img = img.cuda(non_blocking=True)
74
+
75
+ # Size for transformers
76
+ w_featmap = img.shape[-2] // model.vit_patch_size
77
+ h_featmap = img.shape[-1] // model.vit_patch_size
78
+
79
+ # ------------ GROUND-TRUTH -------------------------------------------
80
+ gt_bbxs, gt_cls = dataset.extract_gt(inp[1], im_name)
81
+
82
+ if gt_bbxs is not None:
83
+ # Discard images with no gt annotations
84
+ # Happens only in the case of VOC07 and VOC12
85
+ if gt_bbxs.shape[0] == 0 and no_hards:
86
+ continue
87
+
88
+ outputs = model.forward_step(img[None, :, :, :])
89
+ preds = (sigmoid(outputs[0].detach()) > 0.5).float().squeeze().cpu().numpy()
90
+
91
+ # get bbox
92
+ pred = get_bbox_from_segmentation_labels(
93
+ segmenter_predictions=preds,
94
+ scales=[model.vit_patch_size, model.vit_patch_size],
95
+ initial_image_size=init_image_size[1:],
96
+ )
97
+
98
+ # ------------ Visualizations -------------------------------------------
99
+ # Save the prediction
100
+ preds_dict[im_name] = pred
101
+
102
+
103
+ # Compare prediction to GT boxes
104
+ ious = bbox_iou(torch.from_numpy(pred), torch.from_numpy(gt_bbxs))
105
+
106
+ if torch.any(ious >= 0.5):
107
+ corloc[im_id] = 1
108
+
109
+ cnt += 1
110
+ if cnt % 50 == 0:
111
+ pbar.set_description(f"Found {int(np.sum(corloc))}/{cnt}")
112
+
113
+ # Evaluate
114
+ print(f"corloc: {100*np.sum(corloc)/cnt:.2f} ({int(np.sum(corloc))}/{cnt})")
115
+ result_file = os.path.join(output_dir, 'uod_results.txt')
116
+ with open(result_file, 'w') as f:
117
+ f.write('corloc,%.1f,,\n'%(100*np.sum(corloc)/cnt))
118
+ print('File saved at %s'%result_file)
main_found_evaluate.py ADDED
@@ -0,0 +1,122 @@
+ # Copyright 2022 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ from model import FoundModel
17
+ from misc import load_config
18
+ from datasets.datasets import build_dataset
19
+ from evaluation.saliency import evaluate_saliency
20
+ from evaluation.uod import evaluation_unsupervised_object_discovery
21
+
22
+ if __name__ == "__main__":
23
+ parser = argparse.ArgumentParser(
24
+ description = 'Evaluation of FOUND',
25
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
26
+ )
27
+ parser.add_argument(
28
+ "--eval-type",
29
+ type=str,
30
+ choices=["saliency", "uod"],
31
+ help="Evaluation type."
32
+ )
33
+ parser.add_argument(
34
+ "--dataset-eval",
35
+ type=str,
36
+ choices=["ECSSD", "DUT-OMRON", "DUTS-TEST", "VOC07", "VOC12", "COCO20k"],
37
+ help="Name of evaluation dataset."
38
+ )
39
+ parser.add_argument(
40
+ "--dataset-set-eval",
41
+ type=str,
42
+ default=None,
43
+ help="Set of the dataset."
44
+ )
45
+ parser.add_argument(
46
+ "--apply-bilateral",
47
+ action="store_true",
48
+ help="use bilateral solver."
49
+ )
50
+ parser.add_argument(
51
+ "--evaluation-mode",
52
+ type=str,
53
+ default="multi",
54
+ choices=["single", "multi"],
55
+ help="Type of evaluation."
56
+ )
57
+ parser.add_argument(
58
+ "--model-weights",
59
+ type=str,
60
+ default="data/weights/decoder_weights.pt",
61
+ )
62
+ parser.add_argument(
63
+ "--dataset-dir",
64
+ type=str,
65
+ default="/datasets_local",
66
+ )
67
+ parser.add_argument(
68
+ "--config",
69
+ type=str,
70
+ default="configs/found_DUTS-TR.yaml",
71
+ )
72
+ args = parser.parse_args()
73
+ print(args.__dict__)
74
+
75
+ # Configuration
76
+ config = load_config(args.config)
77
+
78
+ # ------------------------------------
79
+ # Load the model
80
+ model = FoundModel(vit_model=config.model["pre_training"],
81
+ vit_arch=config.model["arch"],
82
+ vit_patch_size=config.model["patch_size"],
83
+ enc_type_feats=config.found["feats"],
84
+ bkg_type_feats=config.found["feats"],
85
+ bkg_th=config.found["bkg_th"])
86
+ # Load weights
87
+ model.decoder_load_weights(args.model_weights)
88
+ model.eval()
89
+ print(f"Model {args.model_weights} loaded correctly.")
90
+
91
+ # ------------------------------------
92
+ # Build the validation set
93
+ val_dataset = build_dataset(
94
+ root_dir=args.dataset_dir,
95
+ dataset_name=args.dataset_eval,
96
+ dataset_set=args.dataset_set_eval,
97
+ for_eval=True,
98
+ evaluation_type=args.eval_type,
99
+ )
100
+ print(f"\nBuilding dataset {val_dataset.name} (#{len(val_dataset)} images)")
101
+
102
+ # ------------------------------------
103
+ # Evaluation
104
+ print(f"\nStarted evaluation on {val_dataset.name}")
105
+ if args.eval_type == "saliency":
106
+ evaluate_saliency(
107
+ val_dataset,
108
+ model=model,
109
+ evaluation_mode=args.evaluation_mode,
110
+ apply_bilateral=args.apply_bilateral,
111
+ )
112
+ elif args.eval_type == "uod":
113
+ if args.apply_bilateral:
114
+ raise ValueError("Not implemented.")
115
+
116
+ evaluation_unsupervised_object_discovery(
117
+ val_dataset,
118
+ model=model,
119
+ evaluation_mode=args.evaluation_mode,
120
+ )
121
+ else:
122
+ raise ValueError("Other evaluation method to come.")
main_visualize.py ADDED
@@ -0,0 +1,99 @@
+ # Copyright 2022 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import torch
17
+ import argparse
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ import matplotlib.pyplot as plt
21
+
22
+ from PIL import Image
23
+ from model import FoundModel
24
+ from misc import load_config
25
+ from torchvision import transforms as T
26
+
27
+ NORMALIZE = T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
28
+
29
+ if __name__ == "__main__":
30
+ parser = argparse.ArgumentParser(
31
+ description = 'Visualization of FOUND',
32
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
33
+ )
34
+
35
+ parser.add_argument(
36
+ "--img-path", type=str, default="data/examples/VOC07_000007.jpg", help="Image path."
37
+ )
38
+ parser.add_argument(
39
+ "--model-weights", type=str, default="data/weights/decoder_weights.pt",
40
+ )
41
+ parser.add_argument(
42
+ "--config", type=str, default="configs/found_DUTS-TR.yaml",
43
+ )
44
+ parser.add_argument(
45
+ "--output-dir", type=str, default="outputs",
46
+ )
47
+ args = parser.parse_args()
48
+
49
+ # Saving dir
50
+ if not os.path.exists(args.output_dir):
51
+ os.makedirs(args.output_dir)
52
+
53
+ # Configuration
54
+ config = load_config(args.config)
55
+
56
+ # ------------------------------------
57
+ # Load the model
58
+ model = FoundModel(vit_model=config.model["pre_training"],
59
+ vit_arch=config.model["arch"],
60
+ vit_patch_size=config.model["patch_size"],
61
+ enc_type_feats=config.found["feats"],
62
+ bkg_type_feats=config.found["feats"],
63
+ bkg_th=config.found["bkg_th"])
64
+ # Load weights
65
+ model.decoder_load_weights(args.model_weights)
66
+ model.eval()
67
+ print(f"Model {args.model_weights} loaded correctly.")
68
+
69
+ # Load the image
70
+ with open(args.img_path, "rb") as f:
71
+ img = Image.open(f)
72
+ img = img.convert("RGB")
73
+
74
+ t = T.Compose([T.ToTensor(), NORMALIZE])
75
+ img_t = t(img)[None,:,:,:]
76
+ inputs = img_t.to("cuda")
77
+
78
+ # Forward step
79
+ with torch.no_grad():
80
+ preds, _, shape_f, att = model.forward_step(inputs, for_eval=True)
81
+
82
+ # Apply FOUND
83
+ sigmoid = nn.Sigmoid()
84
+ h, w = img_t.shape[-2:]
85
+ preds_up = F.interpolate(
86
+ preds, scale_factor=model.vit_patch_size, mode="bilinear", align_corners=False
87
+ )[..., :h, :w]
88
+ preds_up = (
89
+ (sigmoid(preds_up.detach()) > 0.5).squeeze(0).float()
90
+ )
91
+
92
+ plt.figure()
93
+ plt.imshow(img)
94
+ plt.imshow(preds_up.cpu().squeeze().numpy(), 'gray', interpolation='none', alpha=0.5)
95
+ plt.axis('off')
96
+ img_name = args.img_path
97
+ img_name = img_name.split('/')[-1].split('.')[0]
98
+ plt.savefig(os.path.join(args.output_dir, f'{img_name}-found.png'), bbox_inches='tight', pad_inches=0)
99
+ plt.close()
misc.py ADDED
@@ -0,0 +1,254 @@
+ import re
2
+ import os
3
+ import cv2
4
+ import yaml
5
+ import math
6
+ import random
7
+ import scipy.ndimage
8
+ import numpy as np
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+ from typing import List
14
+ from torchvision import transforms as T
15
+
16
+ from bilateral_solver import bilateral_solver_output
17
+
18
+
19
+ loader = yaml.SafeLoader
20
+ loader.add_implicit_resolver(
21
+ u'tag:yaml.org,2002:float',
22
+ re.compile(u'''^(?:
23
+ [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
24
+ |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
25
+ |\\.[0-9_]+(?:[eE][-+][0-9]+)?
26
+ |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
27
+ |[-+]?\\.(?:inf|Inf|INF)
28
+ |\\.(?:nan|NaN|NAN))$''', re.X),
29
+ list(u'-+0123456789.'))
30
+
31
+ class Struct:
32
+ def __init__(self, **entries):
33
+ self.__dict__.update(entries)
34
+
35
+ def load_config(config_file):
36
+ with open(config_file, errors='ignore') as f:
37
+ # conf = yaml.safe_load(f) # load config
38
+ conf = yaml.load(f, Loader=loader)
39
+ print('hyperparameters: ' + ', '.join(f'{k}={v}' for k, v in conf.items()))
40
+
41
+ #TODO yaml_save(save_dir / 'config.yaml', conf)
42
+ return Struct(**conf)
43
+
44
+ def set_seed(seed: int) -> None:
45
+ """
46
+ Set all seeds to make results reproducible
47
+ """
48
+ # env
49
+ os.environ["PYTHONHASHSEED"] = str(seed)
50
+
51
+ # python
52
+ random.seed(seed)
53
+
54
+ # numpy
55
+ np.random.seed(seed)
56
+
57
+ # torch
58
+ torch.manual_seed(seed)
59
+ torch.cuda.manual_seed_all(seed)
60
+ torch.backends.cudnn.deterministic = True
61
+
62
+ def IoU(mask1, mask2):
63
+ """
64
+ Code adapted from TokenCut: https://github.com/YangtaoWANG95/TokenCut
65
+ """
66
+ mask1, mask2 = (mask1 > 0.5).to(torch.bool), (mask2 > 0.5).to(torch.bool)
67
+ intersection = torch.sum(mask1 * (mask1 == mask2), dim=[-1, -2]).squeeze()
68
+ union = torch.sum(mask1 + mask2, dim=[-1, -2]).squeeze()
69
+ return (intersection.to(torch.float) / union).mean().item()
70
+
71
+ def batch_apply_bilateral_solver(data,
72
+ masks,
73
+ get_all_cc=True,
74
+ shape=None):
75
+
76
+ cnt_bs = 0
77
+ masks_bs = []
78
+ inputs, init_imgs, gt_labels, img_path = data
79
+
80
+ for id in range(inputs.shape[0]):
81
+ _, bs_mask, use_bs = apply_bilateral_solver(
82
+ mask=masks[id].squeeze().cpu().numpy(),
83
+ img=init_imgs[id],
84
+ img_path=img_path[id],
85
+ im_fullsize=False,
86
+ # Careful shape should be opposed
87
+ shape=(gt_labels.shape[-1], gt_labels.shape[-2]),
88
+ get_all_cc=get_all_cc,
89
+ )
90
+ cnt_bs += use_bs
91
+
92
+ # use the bilateral solver output if IoU > 0.5
93
+ if use_bs:
94
+ if shape is None:
95
+ shape = masks.shape[-2:]
96
+ # Interpolate to downsample the mask back
97
+ bs_ds = F.interpolate(
98
+ torch.Tensor(bs_mask).unsqueeze(0).unsqueeze(0),
99
+ shape, # TODO check here
100
+ mode="bilinear",
101
+ align_corners=False,
102
+ )
103
+ masks_bs.append(bs_ds.bool().cuda().squeeze()[None, :, :])
104
+ else:
105
+ # Use initial mask
106
+ masks_bs.append(masks[id].cuda().squeeze()[None, :, :])
107
+
108
+ return torch.cat(masks_bs).squeeze(), cnt_bs
109
+
110
+
111
+ def apply_bilateral_solver(
112
+ mask,
113
+ img,
114
+ img_path,
115
+ shape,
116
+ im_fullsize=False,
117
+ get_all_cc=False,
118
+ bs_iou_threshold: float = 0.5,
119
+ reshape: bool = True,
120
+ ):
121
+ # Get initial image in the case of using full image
122
+ img_init = None
123
+ if not im_fullsize:
124
+ # Use the image given by dataloader
125
+ shape = (img.shape[-1], img.shape[-2])
126
+ t = T.ToPILImage()
127
+ img_init = t(img)
128
+
129
+ if reshape:
130
+ # Resize predictions to image size
131
+ resized_mask = cv2.resize(mask, shape)
132
+ sel_obj_mask = resized_mask
133
+ else:
134
+ resized_mask = mask
135
+ sel_obj_mask = mask
136
+
137
+ # Apply bilinear solver
138
+ _, binary_solver = bilateral_solver_output(
139
+ img_path,
140
+ resized_mask,
141
+ img=img_init,
142
+ sigma_spatial=16,
143
+ sigma_luma=16,
144
+ sigma_chroma=8,
145
+ get_all_cc=get_all_cc,
146
+ )
147
+
148
+ mask1 = torch.from_numpy(resized_mask).cuda()
149
+ mask2 = torch.from_numpy(binary_solver).cuda().float()
150
+
151
+ use_bs = 0
152
+ # If enough overlap, use BS output
153
+ if IoU(mask1, mask2) > bs_iou_threshold:
154
+ sel_obj_mask = binary_solver.astype(float)
155
+ use_bs = 1
156
+
157
+ return resized_mask, sel_obj_mask, use_bs
158
+
159
+ def get_bbox_from_segmentation_labels(
160
+ segmenter_predictions: torch.Tensor,
161
+ initial_image_size: torch.Size,
162
+ scales: List[int],
163
+ ) -> np.array:
164
+ """
165
+ Find the largest connected component in foreground, extract its bounding box
166
+ """
167
+ objects, num_objects = scipy.ndimage.label(segmenter_predictions)
168
+
169
+ # find biggest connected component
170
+ all_foreground_labels = objects.flatten()[objects.flatten() != 0]
171
+ most_frequent_label = np.bincount(all_foreground_labels).argmax()
172
+ mask = np.where(objects == most_frequent_label)
173
+ # Add +1 because excluded max
174
+ ymin, ymax = min(mask[0]), max(mask[0]) + 1
175
+ xmin, xmax = min(mask[1]), max(mask[1]) + 1
176
+
177
+ if initial_image_size == segmenter_predictions.shape:
178
+ # Masks are already upsampled
179
+ pred = [xmin, ymin, xmax, ymax]
180
+ else:
181
+ # Rescale to image size
182
+ r_xmin, r_xmax = scales[1] * xmin, scales[1] * xmax
183
+ r_ymin, r_ymax = scales[0] * ymin, scales[0] * ymax
184
+ pred = [r_xmin, r_ymin, r_xmax, r_ymax]
185
+
186
+ # Check not out of image size (used when padding)
187
+ if initial_image_size:
188
+ pred[2] = min(pred[2], initial_image_size[1])
189
+ pred[3] = min(pred[3], initial_image_size[0])
190
+
191
+ return np.asarray(pred)
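+
+ # Worked example (illustrative): if the largest foreground component of the
+ # patch-level mask spans rows 2-4 and columns 3-6 and scales = [8, 8] (ViT patch
+ # size 8), the box is [3*8, 2*8, 7*8, 5*8] = [24, 16, 56, 40] in pixels (the +1
+ # above makes the max index exclusive), then clamped to the initial image size.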
192
+
193
+
194
+ def bbox_iou(
195
+ box1: np.array,
196
+ box2: np.array,
197
+ x1y1x2y2: bool = True,
198
+ GIoU: bool = False,
199
+ DIoU: bool = False,
200
+ CIoU: bool = False,
201
+ eps: float = 1e-7,
202
+ ):
203
+ # https://github.com/ultralytics/yolov5/blob/develop/utils/general.py
204
+ # Returns the IoU of box1 to box2. box1 is 4, box2 is nx4
205
+ box2 = box2.T
206
+
207
+ # Get the coordinates of bounding boxes
208
+ if x1y1x2y2: # x1, y1, x2, y2 = box1
209
+ b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
210
+ b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
211
+ else: # transform from xywh to xyxy
212
+ b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
213
+ b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
214
+ b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
215
+ b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2
216
+
217
+ # Intersection area
218
+ inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * (
219
+ torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)
220
+ ).clamp(0)
221
+
222
+ # Union Area
223
+ w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
224
+ w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
225
+ union = w1 * h1 + w2 * h2 - inter + eps
226
+
227
+ iou = inter / union
228
+ if GIoU or DIoU or CIoU:
229
+ cw = torch.max(b1_x2, b2_x2) - torch.min(
230
+ b1_x1, b2_x1
231
+ ) # convex (smallest enclosing box) width
232
+ ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1) # convex height
233
+ if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
234
+ c2 = cw**2 + ch**2 + eps # convex diagonal squared
235
+ rho2 = (
236
+ (b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2
237
+ + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2
238
+ ) / 4 # center distance squared
239
+ if DIoU:
240
+ return iou - rho2 / c2 # DIoU
241
+ elif (
242
+ CIoU
243
+ ): # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
244
+ v = (4 / math.pi**2) * torch.pow(
245
+ torch.atan(w2 / h2) - torch.atan(w1 / h1), 2
246
+ )
247
+ with torch.no_grad():
248
+ alpha = v / (v - iou + (1 + eps))
249
+ return iou - (rho2 / c2 + v * alpha) # CIoU
250
+ else: # GIoU https://arxiv.org/pdf/1902.09630.pdf
251
+ c_area = cw * ch + eps # convex area
252
+ return iou - (c_area - union) / c_area # GIoU
253
+ else:
254
+ return iou # IoU
model.py ADDED
@@ -0,0 +1,243 @@
+ # Copyright 2022 - Valeo Comfort and Driving Assistance - Oriane Siméoni @ valeo.ai
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import os
17
+ import torch
18
+ import torch.nn as nn
19
+ import dino.vision_transformer as vits
20
+
21
+ from bkg_seg import compute_img_bkg_seg
22
+ from misc import batch_apply_bilateral_solver
23
+
24
+ class FoundModel(nn.Module):
25
+ def __init__(
26
+ self,
27
+ vit_model="dino",
28
+ vit_arch="vit_small",
29
+ vit_patch_size=8,
30
+ enc_type_feats="k",
31
+ bkg_type_feats="k",
32
+ bkg_th=0.3
33
+ ):
34
+
35
+ super(FoundModel, self).__init__()
36
+
37
+ # ----------------------
38
+ # Encoder
39
+ self.vit_encoder, self.initial_dim, self.hook_features = get_vit_encoder(
40
+ vit_arch, vit_model, vit_patch_size, enc_type_feats
41
+ )
42
+ self.vit_patch_size = vit_patch_size
43
+ self.enc_type_feats = enc_type_feats
44
+
45
+ # ----------------------
46
+ # Background Segmentation
47
+ self.bkg_type_feats = bkg_type_feats
48
+ self.bkg_th = bkg_th
49
+
50
+ # ----------------------
51
+ # Define the simple decoder
52
+ self.previous_dim = self.initial_dim
53
+ self.decoder = nn.Conv2d(self.previous_dim, 1, (1, 1))
54
+
55
+ def forward_step(self, batch, decoder=None, for_eval=False):
56
+
57
+ # Make the image divisible by the patch size
58
+ if for_eval:
59
+ batch = self.make_input_divisible(batch)
60
+ _w, _h = batch.shape[-2:]
61
+ _h, _w = _h // self.vit_patch_size, _w // self.vit_patch_size
62
+ else:
63
+ # Cropping used during training, could be changed to improve
64
+ w, h = (
65
+ batch.shape[-2] - batch.shape[-2] % self.vit_patch_size,
66
+ batch.shape[-1] - batch.shape[-1] % self.vit_patch_size,
67
+ )
68
+ batch = batch[:, :, :w, :h]
69
+
70
+ w_featmap = batch.shape[-2] // self.vit_patch_size
71
+ h_featmap = batch.shape[-1] // self.vit_patch_size
72
+
73
+ # Forward pass
74
+ with torch.no_grad():
75
+ # Encoder forward pass
76
+ att = self.vit_encoder.get_last_selfattention(batch)
77
+
78
+ # Get decoder features
79
+ feats = self.extract_feats(dims=att.shape, type_feats=self.enc_type_feats)
80
+ feats = feats[:, 1:, :, :].reshape(att.shape[0], w_featmap, h_featmap, -1)
81
+ feats = feats.permute(0, 3, 1, 2)
82
+
83
+ # Apply decoder
84
+ if decoder is None:
85
+ decoder = self.decoder
86
+ preds = decoder(feats)
87
+
88
+ # return preds_masked
89
+ return preds, feats, (w_featmap, h_featmap), att
90
+
91
+ def make_input_divisible(self, x: torch.Tensor) -> torch.Tensor:
92
+ # From selfmask
93
+ """Pad some pixels to make the input size divisible by the patch size."""
94
+ B, _, H_0, W_0 = x.shape
95
+ pad_w = (self.vit_patch_size - W_0 % self.vit_patch_size) % self.vit_patch_size
96
+ pad_h = (self.vit_patch_size - H_0 % self.vit_patch_size) % self.vit_patch_size
97
+
98
+ x = nn.functional.pad(x, (0, pad_w, 0, pad_h), value=0)
99
+ return x
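+
+ # Illustrative example: with vit_patch_size = 8 and a 253 x 500 input,
+ # pad_h = (8 - 253 % 8) % 8 = 3 and pad_w = (8 - 500 % 8) % 8 = 4, giving a
+ # 256 x 504 tensor that splits evenly into 8 x 8 patches.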
100
+
101
+     def compute_background_batch(
+         self,
+         att,
+         shape_f,
+         # mlp_feats=None,
+     ):
+
+         w_f, h_f = shape_f
+
+         # Dimensions
+         nb_im = att.shape[0]      # Batch size
+         nh = att.shape[1]         # Number of heads
+         nb_tokens = att.shape[2]  # Number of tokens
+
+         # Get the features used for the background detection
+         feats = self.extract_feats(
+             dims=att.shape,
+             # mlp_feats=mlp_feats,
+             type_feats=self.bkg_type_feats,
+         )
+         feats = feats.reshape(nb_im, nb_tokens, -1)
+
+         bkg_mask = compute_img_bkg_seg(
+             att,
+             feats,
+             (w_f, h_f),
+             th_bkg=self.bkg_th,
+             dim=int(self.initial_dim / nh),
+         )
+
+         return bkg_mask
+
+
+     def get_bkg_pseudo_labels_batch(
+         self,
+         att,
+         shape_f,
+         data,
+         use_bilateral_solver=True,
+         shape=None,
+     ):
+
+         bkg_mask_pred = self.compute_background_batch(att, shape_f)
+
+         # Transform bkg detection to foreground detection
+         # Object mask is the inverse of the bkg mask
+         obj_mask = (~bkg_mask_pred.bool()).float()
+
+         if use_bilateral_solver:
+             pseudo_labels, cnt_bs = batch_apply_bilateral_solver(data, obj_mask, shape)
+             return pseudo_labels, cnt_bs
+         else:
+             return obj_mask, 0
+
+     @torch.no_grad()
+     def decoder_load_weights(self, weights_path):
+         print(f"Loading model from weights {weights_path}.")
+         # Load states
+         state_dict = torch.load(weights_path)
+
+         # Decoder
+         self.decoder.load_state_dict(state_dict["decoder"])
+         self.decoder.eval()
+         self.decoder.to("cuda")
+
+     @torch.no_grad()
+     def decoder_save_weights(self, save_dir, n_iter):
+         state_dict = {}
+         state_dict["decoder"] = self.decoder.state_dict()
+         fname = os.path.join(save_dir, f"decoder_weights_niter{n_iter}.pt")
+         torch.save(state_dict, fname)
+         print(f"\n----\nModel saved at {fname}")
+
+     @torch.no_grad()
+     def extract_feats(self, dims, type_feats="k"):
+
+         nb_im, nh, nb_tokens, _ = dims
+         qkv = (
+             self.hook_features["qkv"]
+             .reshape(
+                 nb_im, nb_tokens, 3, nh, -1  # 3 corresponding to |qkv|; the head dimension is inferred
+             )
+             .permute(2, 0, 3, 1, 4)
+         )
+
+         q, k, v = qkv[0], qkv[1], qkv[2]
+
+         if type_feats == "q":
+             return q.transpose(1, 2).float()
+         elif type_feats == "k":
+             return k.transpose(1, 2).float()
+         elif type_feats == "v":
+             return v.transpose(1, 2).float()
+         else:
+             raise ValueError("Unknown features")
+
+
+ def get_vit_encoder(vit_arch, vit_model, vit_patch_size, enc_type_feats):
+     if vit_arch == "vit_small" and vit_patch_size == 16:
+         url = "dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth"
+         initial_dim = 384
+     elif vit_arch == "vit_small" and vit_patch_size == 8:
+         url = "dino_deitsmall8_300ep_pretrain/dino_deitsmall8_300ep_pretrain.pth"
+         initial_dim = 384
+     elif vit_arch == "vit_base" and vit_patch_size == 16:
+         if vit_model == "clip":
+             url = "5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"
+         elif vit_model == "dino":
+             url = "dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth"
+         initial_dim = 768
+     elif vit_arch == "vit_base" and vit_patch_size == 8:
+         url = "dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth"
+         initial_dim = 768
+     else:
+         raise ValueError(f"Unknown ViT configuration: {vit_arch} with patch size {vit_patch_size}")
+
+     if vit_model == "dino":
+         vit_encoder = vits.__dict__[vit_arch](patch_size=vit_patch_size, num_classes=0)
+         # TODO: change here to leave the last layer unfrozen
+         for p in vit_encoder.parameters():
+             p.requires_grad = False
+         vit_encoder.eval().cuda()  # eval mode
+         state_dict = torch.hub.load_state_dict_from_url(
+             url="https://dl.fbaipublicfiles.com/dino/" + url
+         )
+         vit_encoder.load_state_dict(state_dict, strict=True)
+     else:
+         # Only the DINO backbone loading is implemented in this file
+         raise ValueError(f"Unsupported vit_model: {vit_model}")
+
+     hook_features = {}
+     if enc_type_feats in ["k", "q", "v", "qkv", "mlp"]:
+         # Define the hook storing the output of the qkv projection of the last block
+         def hook_fn_forward_qkv(module, input, output):
+             hook_features["qkv"] = output
+
+         vit_encoder._modules["blocks"][-1]._modules["attn"]._modules[
+             "qkv"
+         ].register_forward_hook(hook_fn_forward_qkv)
+     else:
+         raise ValueError("Not implemented.")
+
+     return vit_encoder, initial_dim, hook_features
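
For readers unfamiliar with the hook mechanism used in get_vit_encoder, the short, self-contained sketch below (not part of the commit) shows the same pattern on a plain nn.Linear: a forward hook stores the layer's raw output in a dictionary so it can be reused after the forward pass, which is exactly how hook_features["qkv"] gets filled above. The layer sizes and token count are illustrative assumptions.

import torch
import torch.nn as nn

captured = {}

def hook_fn_forward_qkv(module, inputs, output):
    # Same idea as hook_features["qkv"] above: stash the raw layer output
    captured["qkv"] = output

layer = nn.Linear(384, 3 * 384)    # stands in for the ViT qkv projection
handle = layer.register_forward_hook(hook_fn_forward_qkv)

tokens = torch.rand(2, 197, 384)   # (batch, 1 CLS + 196 patch tokens, embed dim)
_ = layer(tokens)                  # the forward pass triggers the hook
print(captured["qkv"].shape)       # torch.Size([2, 197, 1152])
handle.remove()
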
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ pyyaml
+ matplotlib==3.5.2
+ numpy==1.21.4
+ opencv-python==4.5.5.64
+ opencv-python-headless==4.5.5.64
+ scipy==1.7.3
+ tensorboard
+ tqdm==4.64.0
+ pycocotools==2.0.4
+ Pillow==9.1.1
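
Putting the two added files together, a minimal inference pass might look like the sketch below. This is an illustration, not code from the commit: it assumes the model file above is saved as model.py, that a CUDA device is available, that the DINO weights can be downloaded, and that a trained decoder checkpoint exists at the (hypothetical) path shown.

import torch
from model import FoundModel   # assumes the file above is saved as model.py

model = FoundModel(vit_arch="vit_small", vit_patch_size=8)
model.decoder_load_weights("outputs/decoder_weights_niter500.pt")  # hypothetical checkpoint path

img = torch.rand(1, 3, 224, 224).cuda()  # stand-in for a normalized RGB image batch
with torch.no_grad():
    preds, feats, (w_f, h_f), att = model.forward_step(img, for_eval=True)

# preds is a (1, 1, w_f, h_f) map of foreground logits over ViT patches;
# upsample to the image resolution and threshold to obtain a binary mask.
mask = torch.nn.functional.interpolate(
    preds, size=img.shape[-2:], mode="bilinear", align_corners=False
).sigmoid() > 0.5

The sigmoid-and-threshold step here is only one possible post-processing; the repository may instead refine predictions with the bilateral solver imported from misc.
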