# NOTE: page header captured along with this file ("Spaces: Runtime error"); not part of the program.
from typing import Optional

import gradio as gr
import torch

from src.nn import CaSED

# Title displayed at the top of the demo page.
PAPER_TITLE = "Vocabulary-free Image Classification"

# Badge row (raw HTML) followed by a Markdown summary of the method. The text is
# kept flush-left on purpose: indenting it inside the literal could make a
# Markdown renderer treat the lines as a code block.
PAPER_DESCRIPTION = """
<div style="display: flex; align-items: center; justify-content: center; margin-bottom: 1rem;">
<a href="https://github.com/altndrr/vic" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/code-github.altndrr%2Fvic-blue.svg"/>
</a>
<a href="https://altndrr-vic.hf.space" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/demo-hf.altndrr%2Fvic-yellow.svg"/>
</a>
<a href="https://arxiv.org/abs/2306.00917" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/paper-arXiv%3A2306.00917-B31B1B.svg"/>
</a>
<a href="https://altndrr.github.io/vic/" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/website-gh--pages.altndrr%2Fvic-success.svg"/>
</a>
</div>
Vocabulary-free Image Classification aims to assign a class to an image *without* prior knowledge
on the list of class names, thus operating on the semantic class space that contains all the
possible concepts. Our proposed method CaSED finds the best matching category within the
unconstrained semantic space by multimodal data from large vision-language databases.
To assign a label to an image, we:
1. extract the image features using a pre-trained Vision-Language Model (VLM);
2. retrieve the semantically most similar captions from a textual database;
3. extract from the captions a set of candidate categories by applying text parsing and filtering;
4. score the candidates using the multimodal aligned representation of the pre-trained VLM to
obtain the best-matching category.
"""

# Link to the paper; interpolated into the interface's article text.
PAPER_URL = "https://arxiv.org/abs/2306.00917"

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single shared model instance: built once at import time, moved to DEVICE and
# switched to evaluation mode before any request is served.
model = CaSED().to(DEVICE).eval()

def vic(filename: str, alpha: Optional[float] = None):
    """Classify an image with the module-level CaSED model.

    Args:
        filename: Path of the image file, passed to the model as-is.
        alpha: Optional value forwarded unchanged to the model's ``alpha``
            keyword. NOTE(review): ``None`` presumably selects the model's own
            default -- confirm against ``CaSED``.

    Returns:
        A dict pairing each vocabulary entry returned by the model with the
        score at the same position.
    """
    # the model yields the candidate names and their scores as a pair
    candidates, candidate_scores = model(filename, alpha=alpha)

    # pair them up positionally, exactly as the model ordered them
    return dict(zip(candidates, candidate_scores))

# Input widgets, in the order `vic` receives them: the image (as a file path)
# first, then the alpha slider.
_image_input = gr.Image(type="filepath", label="input")
_alpha_slider = gr.Slider(
    0.0,
    1.0,
    value=0.5,
    label="alpha",
    info="trade-off between the text (left) and image (right) modality",
)

# Output widget: shows the five highest-scoring classes of the returned dict.
_label_output = gr.Label(num_top_classes=5, label="output")

# Assemble the demo page around `vic`.
demo = gr.Interface(
    fn=vic,
    inputs=[_image_input, _alpha_slider],
    outputs=[_label_output],
    title=PAPER_TITLE,
    description=PAPER_DESCRIPTION,
    article=f"Check out <a href={PAPER_URL}>the original paper</a> for more information.",
    examples="./artifacts/examples/",
    allow_flagging="never",
    theme=gr.themes.Soft(),
    thumbnail="https://altndrr.github.io/vic/assets/images/method.png",
)

# Start the server as soon as the module runs; no public share link is requested.
demo.launch(share=False)