altndrr committed
Commit: 563a829
Parent: 54f2384

Rewrite interface (#1)


- Rewrite interface (2690b62e772a0b91fc15d70d26955a37557376ef)

Files changed (5):
  1. .gitignore +1 -0
  2. app.py +63 -43
  3. examples/log.csv +0 -11
  4. pyproject.toml +1 -1
  5. requirements.txt +5 -4
.gitignore CHANGED
@@ -149,3 +149,4 @@ dmypy.json
 
 # Template
 /artifacts/models/databases/*/
+/gradio_cached_examples/*
app.py CHANGED
@@ -1,12 +1,19 @@
+import os
+from glob import glob
 from typing import Optional
 
 import gradio as gr
 import torch
-from PIL import Image
+from torchvision.transforms.functional import to_pil_image
 from transformers import AutoModel, CLIPProcessor
 
 PAPER_TITLE = "Vocabulary-free Image Classification"
-PAPER_DESCRIPTION = """
+PAPER_URL = "https://arxiv.org/abs/2306.00917"
+MARKDOWN_DESCRIPTION = """
+<div style="display: flex; align-items: center; justify-content: center; margin-bottom: 1rem;">
+    <h1>Vocabulary-free Image Classification</h1>
+</div>
+
 <div style="display: flex; align-items: center; justify-content: center; margin-bottom: 1rem;">
     <a href="https://github.com/altndrr/vic" style="margin-right: 0.5rem;">
         <img src="https://img.shields.io/badge/code-github.altndrr%2Fvic-blue.svg"/>
@@ -21,31 +28,35 @@ PAPER_DESCRIPTION = """
         <img src="https://img.shields.io/badge/website-gh--pages.altndrr%2Fvic-success.svg"/>
     </a>
 </div>
+"""
 
 
-Vocabulary-free Image Classification aims to assign a class to an image *without* prior knowledge
-on the list of class names, thus operating on the semantic class space that contains all the
-possible concepts. Our proposed method CaSED finds the best matching category within the
-unconstrained semantic space by multimodal data from large vision-language databases.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+MODEL = AutoModel.from_pretrained("altndrr/cased", trust_remote_code=True).to(DEVICE)
+PROCESSOR = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 
-To assign a label to an image, we:
-1. extract the image features using a pre-trained Vision-Language Model (VLM);
-2. retrieve the semantically most similar captions from a textual database;
-3. extract from the captions a set of candidate categories by applying text parsing and filtering;
-4. score the candidates using the multimodal aligned representation of the pre-trained VLM to
-obtain the best-matching category.
-"""
-PAPER_URL = "https://arxiv.org/abs/2306.00917"
 
+def prepare_image(image: gr.Image):
+    if image is None:
+        return None, None
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model = AutoModel.from_pretrained("altndrr/cased", trust_remote_code=True).to(device)
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+    PROCESSOR.image_processor.do_normalize = False
+    image_tensor = PROCESSOR(images=[image], return_tensors="pt", padding=True)
+    PROCESSOR.image_processor.do_normalize = True
+    image_tensor = image_tensor.pixel_values[0]
+    curr_image = to_pil_image(image_tensor)
 
+    return curr_image, image.copy()
 
-def vic(filename: str, alpha: Optional[float] = None):
-    images = processor(images=[Image.open(filename)], return_tensors="pt", padding=True)
-    outputs = model(images, alpha=alpha)
+
+def image_inference(image: gr.Image, alpha: Optional[float] = None):
+    if image is None:
+        return None
+
+    images = PROCESSOR(images=[image], return_tensors="pt", padding=True)
+
+    with torch.no_grad():
+        outputs = MODEL(images, alpha=alpha)
     vocabulary = outputs["vocabularies"][0]
     scores = outputs["scores"][0].tolist()
     confidences = dict(zip(vocabulary, scores))
@@ -53,26 +64,35 @@ def vic(filename: str, alpha: Optional[float] = None):
     return confidences
 
 
-demo = gr.Interface(
-    fn=vic,
-    inputs=[
-        gr.Image(type="filepath", label="input"),
-        gr.Slider(
-            0.0,
-            1.0,
-            value=0.5,
-            label="alpha",
-            info="trade-off between the text (left) and image (right) modality",
-        ),
-    ],
-    outputs=[gr.Label(num_top_classes=5, label="output")],
-    title=PAPER_TITLE,
-    description=PAPER_DESCRIPTION,
-    article=f"Check out <a href={PAPER_URL}>the original paper</a> for more information.",
-    examples="./examples/",
-    allow_flagging="never",
-    theme=gr.themes.Soft(),
-    thumbnail="https://altndrr.github.io/vic/assets/images/method.png",
-)
-
-demo.launch(share=False)
+with gr.Blocks(analytics_enabled=True, title=PAPER_TITLE, theme="soft") as demo:
+    gr.Markdown(MARKDOWN_DESCRIPTION)
+    with gr.Row():
+        with gr.Column():
+            curr_image = gr.Image(label="input", type="pil", height=300)
+            orig_image = gr.Image(
+                label="orig. image", type="pil", visible=False, interactive=False
+            )
+            alpha_slider = gr.Slider(0.0, 1.0, value=0.7, step=0.1, label="alpha")
+            with gr.Row():
+                clear_button = gr.ClearButton([curr_image, orig_image])
+                run_button = gr.Button(value="Submit", variant="primary")
+        with gr.Column():
+            output_label = gr.Label(label="output", num_top_classes=5)
+            examples = gr.Examples(
+                examples=glob(os.path.join(os.path.dirname(__file__), "examples", "*.jpg")),
+                inputs=[orig_image],
+                outputs=[output_label],
+                fn=image_inference,
+                cache_examples=True,
+            )
+    gr.Markdown(f"Check out the <a href={PAPER_URL}>original paper</a> for more information.")
+
+    curr_image.upload(prepare_image, [curr_image], [curr_image, orig_image])
+    curr_image.clear(lambda: None, [], [orig_image])
+    orig_image.change(prepare_image, [orig_image], [curr_image, orig_image])
+    run_button.click(image_inference, [curr_image, alpha_slider], [output_label])
+
+
+if __name__ == "__main__":
+    demo.queue()
+    demo.launch(server_name="0.0.0.0", server_port=7860)
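For reference, the rewritten inference path above can also be exercised outside Gradio. The snippet below is a minimal sketch based only on calls visible in this diff (AutoModel "altndrr/cased", CLIPProcessor "openai/clip-vit-large-patch14", and the outputs["vocabularies"]/outputs["scores"] fields); the example image path and the top-5 printout are illustrative and not part of the commit.

import torch
from PIL import Image
from transformers import AutoModel, CLIPProcessor

# Load the CaSED model and the CLIP processor, mirroring the module-level setup in app.py.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained("altndrr/cased", trust_remote_code=True).to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Placeholder input; basketball.jpg is one of the bundled example images.
image = Image.open("examples/basketball.jpg")
inputs = processor(images=[image], return_tensors="pt", padding=True)

# alpha trades off the text and image modalities; 0.7 matches the new slider default.
with torch.no_grad():
    outputs = model(inputs, alpha=0.7)

vocabulary = outputs["vocabularies"][0]  # candidate class names generated by CaSED
scores = outputs["scores"][0].tolist()   # matching scores, aligned with the vocabulary
confidences = dict(zip(vocabulary, scores))
print(sorted(confidences.items(), key=lambda kv: kv[1], reverse=True)[:5])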
examples/log.csv DELETED
@@ -1,11 +0,0 @@
-image_fp
-basketball.jpg
-cassowary.jpg
-colosseum.jpg
-desk.jpg
-kitchen.jpg
-monkey.jpg
-park.jpg
-ramen.jpg
-sagrada.jpg
-venice.jpg
pyproject.toml CHANGED
@@ -15,7 +15,7 @@ line_length = 99
 count = true
 ignore = ["E402"]
 per-file-ignores = ["__init__.py:F401"]
-exclude = ["data/*","logs/*"]
+exclude = []
 max-line-length = 99
 
 [tool.isort]
requirements.txt CHANGED
@@ -1,7 +1,8 @@
 torch==2.0.1
+torchvision==0.15.2
 faiss-cpu==1.7.4
-flair==0.12.2
-gradio==3.33.1
-inflect==6.0.4
+flair==0.13.0
+gradio==4.7.1
+inflect==7.0.0
 nltk==3.8.1
-transformers==4.29.2
+transformers==4.35.1