altndrr committed
Commit 07a2d78
1 Parent(s): ef912f3
Files changed (5)
  1. README.md +6 -6
  2. app.py +1 -14
  3. artifacts/models/retrieval/indices.json +1 -1
  4. src/nn.py +3 -3
  5. src/retrieval.py +7 -19
README.md CHANGED
@@ -20,9 +20,9 @@ Recent advances in large vision-language models have revolutionized the image cl
 
 <div align="center">
 
-| <img src="https://altndrr.github.io/vic/assets/images/task_left.png"> | <img src="https://altndrr.github.io/vic/assets/images/task_right.png"> |
-| :----------------------------------------------: | :----------------------------------------------: |
-| Vision Language Model (VLM)-based classification | Vocabulary-free Image Classification |
+| <img src="https://altndrr.github.io/vic/assets/images/task_left.png"> | <img src="https://altndrr.github.io/vic/assets/images/task_right.png"> |
+| :-------------------------------------------------------------------: | :--------------------------------------------------------------------: |
+| Vision Language Model (VLM)-based classification | Vocabulary-free Image Classification |
 
 </div>
 
@@ -30,7 +30,7 @@ In this work, we first empirically verify that representing this semantic space
 
 <div align="center">
 
-| <img src="https://altndrr.github.io/vic/assets/images/method.png"> |
+| <img src="https://altndrr.github.io/vic/assets/images/method.png"> |
 | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
 | Overview of CaSED. Given an input image, CaSED retrieves the most relevant captions from an external database filtering them to extract candidate categories. We classify image-to-text and text-to-text, using the retrieved captions centroid as the textual counterpart of the input image. |
 
@@ -42,11 +42,11 @@ If you find this work useful, please consider citing:
 
 ```latex
 @misc{conti2023vocabularyfree,
-    title={Vocabulary-free Image Classification},
+    title={Vocabulary-free Image Classification},
     author={Alessandro Conti and Enrico Fini and Massimiliano Mancini and Paolo Rota and Yiming Wang and Elisa Ricci},
     year={2023},
     eprint={2306.00917},
     archivePrefix={arXiv},
     primaryClass={cs.CV}
 }
-```
+```
app.py CHANGED
@@ -49,19 +49,6 @@ def vic(filename: str, alpha: Optional[float] = None):
 
     return confidences
 
-def resize_image(image, max_size: int = 256):
-    """Resize image to max_size keeping the aspect ratio."""
-    width, height = image.size
-    if width > height:
-        ratio = width / height
-        new_width = max_size * ratio
-        new_height = max_size
-    else:
-        ratio = height / width
-        new_width = max_size
-        new_height = max_size * ratio
-    return image.resize((int(new_width), int(new_height)))
-
 
 demo = gr.Interface(
     fn=vic,
@@ -80,7 +67,7 @@ demo = gr.Interface(
     description=PAPER_DESCRIPTION,
     article=f"Check out <a href={PAPER_URL}>the original paper</a> for more information.",
     examples="./artifacts/examples/",
-    allow_flagging='never',
+    allow_flagging="never",
    theme=gr.themes.Soft(),
    thumbnail="https://altndrr.github.io/vic/assets/images/method.png",
 )
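
For context, a Gradio `Interface` like the `demo` object configured above is served by calling `launch()` on it. A minimal sketch, not part of this commit, assuming `app.py` defines `vic` and the constants as shown:

```python
# Minimal sketch: serve the interface defined above.
# On a Hugging Face Space, running app.py with this call is enough to expose the demo.
if __name__ == "__main__":
    demo.launch()
```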
artifacts/models/retrieval/indices.json CHANGED
@@ -1,3 +1,3 @@
 {
     "ViT-L-14_CC12M": "./artifacts/models/databases/cc12m/vit-l-14/"
-}
+}
src/nn.py CHANGED
@@ -11,7 +11,7 @@ import torch
 from open_clip.transformer import Transformer
 from PIL import Image
 
-from src.retrieval import ArrowMetadataProvider, meta_to_dict
+from src.retrieval import ArrowMetadataProvider
 from src.transforms import TextCompose, default_vocabulary_transforms
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -92,7 +92,7 @@ class CaSED(torch.nn.Module):
         # load faiss indices
         indices_list_dir = Path(self.hparams["artifact_dir"]) / "models" / "retrieval"
         indices_fp = indices_list_dir / "indices.json"
-        self.indices = json.load(open(indices_fp, "r"))
+        self.indices = json.load(open(indices_fp))
 
         # load faiss indices and metadata providers
         self.resources = {}
@@ -165,7 +165,7 @@ class CaSED(torch.nn.Module):
             output = {}
             meta = None if key + 1 > len(metadata) else metadata[key]
             if meta is not None:
-                output.update(meta_to_dict(meta))
+                output.update(meta)
             output["id"] = i.item()
             output["similarity"] = d.item()
             results.append(output)
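
For reference, `metadata[key]` here presumably holds a record returned by `ArrowMetadataProvider.get` (see `src/retrieval.py` below), which yields plain per-row dicts via `to_pandas().to_dict("records")`, so it can be merged directly with `dict.update`. A toy sketch with hypothetical field names:

```python
# Hypothetical record and scores, for illustration only; the field names are assumptions.
meta = {"caption": "a photo of a dog", "image_path": "00012.jpg"}

output = {}
output.update(meta)          # merge the retrieved metadata columns into the result
output["id"] = 42            # presumably the faiss neighbour index (i.item() above)
output["similarity"] = 0.87  # presumably the retrieval score (d.item() above)
```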
src/retrieval.py CHANGED
@@ -1,17 +1,17 @@
 from pathlib import Path
+from typing import Optional
 
-import pyarrow as pa
 import numpy as np
+import pyarrow as pa
 
 
 class ArrowMetadataProvider:
     """The arrow metadata provider provides metadata from contiguous ids using arrow.
 
-    Code taken from:
-    https://github.dev/rom1504/clip-retrieval
+    Code taken from: https://github.dev/rom1504/clip-retrieval
     """
 
-    def __init__(self, arrow_folder):
+    def __init__(self, arrow_folder: str):
         arrow_files = [str(a) for a in sorted(Path(arrow_folder).glob("**/*")) if a.is_file()]
         self.table = pa.concat_tables(
             [
@@ -20,23 +20,11 @@ class ArrowMetadataProvider:
             ]
         )
 
-    def get(self, ids, cols=None):
-        """implement the get method from the arrow metadata provide, get metadata from ids"""
+    def get(self, ids: np.ndarray, cols: Optional[list] = None):
+        """Implement the get method from the arrow metadata provide, get metadata from ids."""
         if cols is None:
             cols = self.table.schema.names
         else:
             cols = list(set(self.table.schema.names) & set(cols))
-        t = pa.concat_tables([self.table[i:(i + 1)] for i in ids])
+        t = pa.concat_tables([self.table[i:j] for i, j in zip(ids, ids + 1)])
         return t.select(cols).to_pandas().to_dict("records")
-
-
-def meta_to_dict(meta):
-    """Convert a metadata list to a dictionary."""
-    output = {}
-    for k, v in meta.items():
-        if isinstance(v, bytes):
-            v = v.decode()
-        elif type(v).__module__ == np.__name__:
-            v = v.item()
-        output[k] = v
-    return output
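
The updated `get` replaces per-id slicing with paired bounds from `zip(ids, ids + 1)`. A small self-contained sketch, using a toy table and an assumed column name rather than the real CC12M metadata, showing that the two formulations select the same rows:

```python
import numpy as np
import pyarrow as pa

# Toy stand-in for the metadata table; the "caption" column is an assumption.
table = pa.table({"caption": ["a dog", "a cat", "a tree", "a car"]})
ids = np.array([0, 2])  # e.g. neighbour ids returned by a faiss search

# Previous formulation: one single-row slice per id.
old_rows = pa.concat_tables([table[i:(i + 1)] for i in ids])
# New formulation: explicit (start, stop) pairs.
new_rows = pa.concat_tables([table[i:j] for i, j in zip(ids, ids + 1)])

assert old_rows.equals(new_rows)
print(new_rows.to_pandas().to_dict("records"))
# [{'caption': 'a dog'}, {'caption': 'a tree'}]
```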