# Spaces: Sleeping (page-status text captured along with the source; not part of the program)
import gradio as gr
import torch
from PIL import Image
from torch.nn import functional as F
from transformers import (
    ViTFeatureExtractor,
    ViTForImageClassification,
    ViTImageProcessor,
)
# gr.load("models/ioanasong/vit-MINC-2500").launch()

# Hub identifier used for both the classifier weights and the preprocessing
# configuration.
model_name = "ioanasong/vit-MINC-2500"

# Load the pre-trained ViT classifier and switch it to evaluation mode; this
# script only runs inference.
model = ViTForImageClassification.from_pretrained(model_name)
model.eval()

# ViTImageProcessor is the replacement for the deprecated ViTFeatureExtractor.
# The variable keeps its original name because predict() refers to it.
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
# Define the prediction function
# (Earlier variant kept for reference: returns only the top-1 label.)
# def predict(image):
#     print(image)
#     # Preprocess the image
#     inputs = feature_extractor(images=image, return_tensors="pt")
#     # Make prediction
#     with torch.no_grad():
#         outputs = model(**inputs)
#         logits = outputs.logits
#     # Get predicted label
#     predicted_class_idx = logits.argmax(-1).item()
#     predicted_label = model.config.id2label[predicted_class_idx]
#     return predicted_label
def predict(image):
    """Classify one image and return a probability for every class label.

    Args:
        image: The image handed over by the Gradio image input, or ``None``
            when no frame is available.

    Returns:
        A dict mapping each label in ``model.config.id2label`` to its softmax
        probability (a Python float), or ``None`` when ``image`` is ``None``.
    """
    # A webcam input may invoke the callback before any frame exists. The
    # feature extractor cannot process None, so report "no prediction" rather
    # than raising inside the UI callback.
    if image is None:
        return None

    # Preprocess the image into PyTorch tensors for the model.
    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits
        # Softmax over the class axis; [0] picks the single image in the batch.
        probs = F.softmax(logits, dim=-1)[0]

    # One bulk tensor-to-list conversion instead of calling .item() per class.
    return {
        model.config.id2label[idx]: prob
        for idx, prob in enumerate(probs.tolist())
    }
# Create the Gradio interface.
# Input: webcam-only image component with streaming enabled.
# NOTE(review): Gradio documents `streaming` as taking effect in a `live`
# interface; `live` is not set here -- confirm this is intended.
webcam_input = gr.Image(sources=["webcam"], streaming=True)

# Output: label component limited to the five top classes.
# (To list every class instead, use
#  gr.Label(num_top_classes=len(model.config.id2label)).)
top_classes_output = gr.Label(num_top_classes=5)

iface = gr.Interface(
    fn=predict,
    inputs=webcam_input,
    outputs=top_classes_output,
    title="ViT Image Classification",
    description="Capture an image from the camera and classify it using a pre-trained Vision Transformer (ViT) model.",
)

# Launch the app
iface.launch()