# vic/app.py

from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, CLIPProcessor

PAPER_TITLE = "Vocabulary-free Image Classification"
PAPER_DESCRIPTION = """
<div style="display: flex; align-items: center; justify-content: center; margin-bottom: 1rem;">
<a href="https://github.com/altndrr/vic" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/code-github.altndrr%2Fvic-blue.svg"/>
</a>
<a href="https://huggingface.co/spaces/altndrr/vic" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/demo-hf.altndrr%2Fvic-yellow.svg"/>
</a>
<a href="https://arxiv.org/abs/2306.00917" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/paper-arXiv.2306.00917-B31B1B.svg"/>
</a>
<a href="https://alessandroconti.me/papers/2306.00917" style="margin-right: 0.5rem;">
<img src="https://img.shields.io/badge/website-gh--pages.altndrr%2Fvic-success.svg"/>
</a>
</div>
Vocabulary-free Image Classification aims to assign a class to an image *without* prior knowledge
of the list of class names, thus operating on the semantic class space that contains all
possible concepts. Our proposed method, CaSED, finds the best-matching category within this
unconstrained semantic space by leveraging multimodal data from large vision-language databases.
To assign a label to an image, we:
1. extract the image features using a pre-trained Vision-Language Model (VLM);
2. retrieve the semantically most similar captions from a textual database;
3. extract from the captions a set of candidate categories by applying text parsing and filtering;
4. score the candidates using the multimodal aligned representation of the pre-trained VLM to
obtain the best-matching category.
"""
PAPER_URL = "https://arxiv.org/abs/2306.00917"
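
# Illustrative sketch of the candidate scoring described in step 4 of the description above.
# This is NOT the actual CaSED implementation (that lives in the `altndrr/cased` remote code)
# and is not used by the demo; it only shows, under assumed conventions, how an alpha-weighted
# mix of image-to-text and text-to-text similarities could rank candidate category names.
def _example_alpha_scoring(
    image_feat: torch.Tensor,  # (d,) normalized image embedding
    caption_feat: torch.Tensor,  # (d,) normalized centroid of retrieved caption embeddings
    candidate_feats: torch.Tensor,  # (num_candidates, d) normalized candidate name embeddings
    alpha: float = 0.5,
) -> torch.Tensor:
    image_scores = candidate_feats @ image_feat  # visual alignment of each candidate
    text_scores = candidate_feats @ caption_feat  # textual alignment of each candidate
    # Assumed convention (matching the slider labels below): higher alpha weights the image
    # modality more heavily.
    return alpha * image_scores + (1.0 - alpha) * text_scores
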
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
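
# Load the CaSED model (its custom remote code handles caption retrieval and candidate scoring)
# together with the CLIP processor used to preprocess the input images.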
model = AutoModel.from_pretrained("altndrr/cased", trust_remote_code=True).to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

def vic(filename: str, alpha: Optional[float] = None):
    # Preprocess the input image with the CLIP processor.
    images = processor(images=[Image.open(filename)], return_tensors="pt", padding=True)
    # Run CaSED: generate a vocabulary of candidate categories for the image and score them;
    # `alpha` balances the text and image modalities during scoring.
    outputs = model(images, alpha=alpha)
    vocabulary = outputs["vocabularies"][0]
    scores = outputs["scores"][0].tolist()
    # Map each candidate category to its confidence score for the gr.Label output.
    confidences = dict(zip(vocabulary, scores))
    return confidences
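
# Example (hypothetical file path and values): vic("examples/dog.jpg", alpha=0.5) would return
# a mapping from the generated vocabulary to confidences, e.g. {"labrador retriever": 0.4, ...}.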

demo = gr.Interface(
    fn=vic,
    inputs=[
        gr.Image(type="filepath", label="input"),
        gr.Slider(
            0.0,
            1.0,
            value=0.5,
            label="alpha",
            info="trade-off between the text (left) and image (right) modalities",
        ),
    ],
    outputs=[gr.Label(num_top_classes=5, label="output")],
    title=PAPER_TITLE,
    description=PAPER_DESCRIPTION,
    article=f'Check out <a href="{PAPER_URL}">the original paper</a> for more information.',
    examples="./examples/",
    allow_flagging="never",
    theme=gr.themes.Soft(),
    thumbnail="https://altndrr.github.io/vic/assets/images/method.png",
)

demo.launch(share=False)