from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, CLIPProcessor
PAPER_TITLE = "Vocabulary-free Image Classification"
PAPER_DESCRIPTION = """
<div style="display: flex; align-items: center; justify-content: center; margin-bottom: 1rem;">
    <a href="https://github.com/altndrr/vic" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/code-github.altndrr%2Fvic-blue.svg"/>
    </a>
    <a href="https://altndrr-vic.hf.space" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/demo-hf.altndrr%2Fvic-yellow.svg"/>
    </a>
    <a href="https://arxiv.org/abs/2306.00917" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/paper-arXiv.2306.00917-B31B1B.svg"/>
    </a>
    <a href="https://altndrr.github.io/vic/" style="margin-right: 0.5rem;">
        <img src="https://img.shields.io/badge/website-gh--pages.altndrr%2Fvic-success.svg"/>
    </a>
</div>
Vocabulary-free Image Classification aims to assign a class to an image *without* prior knowledge
of the list of class names, thus operating on the semantic class space that contains all possible
concepts. Our proposed method CaSED finds the best-matching category within this unconstrained
semantic space using multimodal data from large vision-language databases.

To assign a label to an image, we:

1. extract the image features using a pre-trained Vision-Language Model (VLM);
2. retrieve the semantically most similar captions from a textual database;
3. extract a set of candidate categories from the captions by applying text parsing and filtering;
4. score the candidates using the multimodal aligned representation of the pre-trained VLM to
   obtain the best-matching category.
"""
PAPER_URL = "https://arxiv.org/abs/2306.00917"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CaSED ships as custom code on the Hugging Face Hub, hence `trust_remote_code=True`; the
# processor prepares inputs for the model's CLIP backbone.
model = AutoModel.from_pretrained("altndrr/cased", trust_remote_code=True).to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
def vic(filename: str, alpha: Optional[float] = None):
    # Preprocess the input image with the CLIP processor.
    images = processor(images=[Image.open(filename)], return_tensors="pt", padding=True)

    # Forward pass: the model retrieves captions, builds the candidate vocabulary, and scores it.
    outputs = model(images, alpha=alpha)
    vocabulary = outputs["vocabularies"][0]
    scores = outputs["scores"][0]

    # Map each candidate category to its confidence for the gr.Label output.
    confidences = dict(zip(vocabulary, scores))

    return confidences
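
# Example usage outside the UI (hypothetical image path):
#   confidences = vic("./examples/dog.jpg", alpha=0.5)
#   best_label = max(confidences, key=confidences.get)
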
demo = gr.Interface(
    fn=vic,
    inputs=[
        gr.Image(type="filepath", label="input"),
        gr.Slider(
            0.0,
            1.0,
            value=0.5,
            label="alpha",
            info="trade-off between the text (left) and image (right) modality",
        ),
    ],
    outputs=[gr.Label(num_top_classes=5, label="output")],
    title=PAPER_TITLE,
    description=PAPER_DESCRIPTION,
    article=f'Check out <a href="{PAPER_URL}">the original paper</a> for more information.',
    examples="./examples/",
    allow_flagging="never",
    theme=gr.themes.Soft(),
    thumbnail="https://altndrr.github.io/vic/assets/images/method.png",
)
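
# share=False keeps the demo local to the Space; setting share=True instead would expose a
# temporary public URL when running on your own machine.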
demo.launch(share=False)