from typing import Optional

import gradio as gr
import torch

from src.nn import CaSED

PAPER_TITLE = "Vocabulary-free Image Classification"
PAPER_DESCRIPTION = """
<div style="display: flex; align-items: center; justify-content: center; margin-bottom: 1rem;">
    <a href="https://github.com/altndrr/vic" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/code-github.altndrr%2Fvic-blue.svg"/>
    </a>
    <a href="https://altndrr-vic.hf.space" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/demo-hf.altndrr%2Fvic-yellow.svg"/>
    </a>
    <a href="https://arxiv.org/abs/2306.00917" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/paper-arXiv%3A2306.00917-B31B1B.svg"/>
    </a>
    <a href="https://altndrr.github.io/vic/" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/website-gh--pages.altndrr%2Fvic-success.svg"/>
    </a>
</div>


Vocabulary-free Image Classification aims to assign a class to an image *without* prior knowledge
of the list of class names, thus operating on the semantic class space that contains all
possible concepts. Our proposed method CaSED finds the best-matching category within the
unconstrained semantic space using multimodal data from large vision-language databases.

To assign a label to an image, we:
1. extract the image features using a pre-trained Vision-Language Model (VLM);
2. retrieve the semantically most similar captions from a textual database;
3. extract from the captions a set of candidate categories by applying text parsing and filtering;
4. score the candidates using the multimodal aligned representation of the pre-trained VLM to
    obtain the best-matching category.
"""
PAPER_URL = "https://arxiv.org/abs/2306.00917"
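

# The demo treats `CaSED` as a black box, but the scoring step (step 4 in the
# description above) can be sketched as follows. This is a minimal illustration under
# assumptions, NOT the actual CaSED implementation: the helper name, the tensor layout,
# and the alpha-blended cosine-similarity scoring are inferred from the paper summary
# and from the meaning of the `alpha` slider below (0.0 = text only, 1.0 = image only).
def _score_candidates_sketch(
    image_features: torch.Tensor,  # (d,) L2-normalized image embedding
    candidate_features: torch.Tensor,  # (n, d) L2-normalized candidate name embeddings
    caption_centroid: torch.Tensor,  # (d,) mean embedding of the retrieved captions
    alpha: float = 0.5,
) -> torch.Tensor:
    """Blend image-candidate and caption-candidate similarities into confidence scores."""
    image_scores = candidate_features @ image_features  # visual alignment, shape (n,)
    text_scores = candidate_features @ caption_centroid  # textual alignment, shape (n,)
    return (alpha * image_scores + (1.0 - alpha) * text_scores).softmax(dim=-1)
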

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CaSED().to(DEVICE).eval()


def vic(filename: str, alpha: Optional[float] = None) -> dict:
    """Classify the image at `filename` and return a {category: confidence} mapping."""
    # run the model to get the candidate categories and their scores
    vocabulary, scores = model(filename, alpha=alpha)
    confidences = dict(zip(vocabulary, scores))

    return confidences
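
# Example (hypothetical path): `vic("artifacts/examples/cat.jpg", alpha=0.5)` returns a
# {category: score} dict, which `gr.Label` renders as the top-5 output below.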


def resize_image(image, max_size: int = 256):
    """Resize a PIL image so its shorter side equals `max_size`, keeping the aspect ratio."""
    width, height = image.size
    if width > height:
        # landscape: fix the height, scale the width proportionally
        new_width = max_size * (width / height)
        new_height = max_size
    else:
        # portrait or square: fix the width, scale the height proportionally
        new_width = max_size
        new_height = max_size * (height / width)
    return image.resize((int(new_width), int(new_height)))
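
# Note: `resize_image` is not referenced by the interface below; a caller could use it
# to downscale a PIL image before display or preprocessing, e.g. (hypothetical path):
#   from PIL import Image
#   thumb = resize_image(Image.open("artifacts/examples/cat.jpg"), max_size=256)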


demo = gr.Interface(
    fn=vic,
    inputs=[
        gr.Image(type="filepath", label="input"),
        gr.Slider(
            0.0,
            1.0,
            value=0.5,
            label="alpha",
            info="trade-off between the text (left) and image (right) modality",
        ),
    ],
    outputs=[gr.Label(num_top_classes=5, label="output")],
    title=PAPER_TITLE,
    description=PAPER_DESCRIPTION,
    article=f"Check out <a href='{PAPER_URL}'>the original paper</a> for more information.",
    examples="./artifacts/examples/",
    allow_flagging="never",
    theme=gr.themes.Soft(),
    thumbnail="https://altndrr.github.io/vic/assets/images/method.png",
)

demo.launch(share=False)