import torch
import gradio as gr
from transformers import AlignProcessor, AlignModel

# Use the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and model once at import time so every request reuses
# the same weights instead of reloading them per call.
processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
model = AlignModel.from_pretrained("kakaobrain/align-base").to(device)
model.eval()


def _parse_labels(labels):
    """Split a comma-separated string into clean, unique candidate labels.

    Whitespace around each label is stripped, empty entries are dropped, and
    duplicates are removed while keeping first-seen order. De-duplication
    matters because the result of ``predict`` is keyed by label: a repeated
    label would otherwise take a share of the softmax and then be overwritten,
    so the reported probabilities would no longer sum to 1.
    """
    stripped = (part.strip() for part in (labels or "").split(","))
    return list(dict.fromkeys(part for part in stripped if part))


def predict(image, labels: str) -> dict:
    """Score an image against free-form candidate labels.

    Args:
        image: The image to classify. The interface below is configured with
            ``type="pil"``, so Gradio passes a PIL image (or ``None`` when
            nothing was uploaded).
        labels: Candidate labels separated by commas, e.g.
            ``"dinosaur, drawing, forest"``. Spaces around commas are
            optional.

    Returns:
        A dict mapping each unique label to its probability as a Python
        float, which is the format the ``"label"`` output component consumes.

    Raises:
        ValueError: If no image was provided or no non-empty label was given.
    """
    if image is None:
        raise ValueError("Please provide an image to classify.")

    candidates = _parse_labels(labels)
    if not candidates:
        raise ValueError("Please enter at least one candidate label.")

    inputs = processor(images=image, text=candidates, return_tensors="pt").to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax across the label axis; row 0 is the single input image.
    probs = outputs.logits_per_image.softmax(dim=1)[0].cpu().tolist()
    return dict(zip(candidates, probs))


description = """
ALIGN performance

Gradio demo for ALIGN, as introduced in ["Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision"](https://arxiv.org/abs/2102.05918). ALIGN features a dual-encoder architecture with EfficientNet and BERT as its vision and text encoders, and learns to align visual and text representations with contrastive learning. Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe. \n\nALIGN is not open-sourced and the `kakaobrain/align-base` model used for this demo is based on the Kakao Brain implementation that follows the original paper. The model is trained on the open source [COYO](https://github.com/kakaobrain/coyo-dataset) dataset by the Kakao Brain team. To perform zero-shot image classification with ALIGN, upload an image and enter your candidate labels as free-form text separated by a comma followed by a space.

"""

# `gr.Image` / `gr.Textbox` replace the deprecated `gr.inputs.*` namespace.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(label="Image to classify", type="pil"),
        gr.Textbox(
            lines=1,
            label="Comma separated candidate labels",
            placeholder="Enter labels separated by ', '",
        ),
    ],
    theme="grass",
    outputs="label",
    examples=[
        ["assets/cartoon.jpeg", "dinosaur, drawing, forest"],
        ["assets/painting.jpeg", "watercolor painting, oil painting, boats"],
    ],
    title="Zero-Shot Image Classification with ALIGN",
    description=description,
)

# Launch on execution, as the original script did.
demo.launch()