from transformers import ViTFeatureExtractor, ViTForImageClassification | |
from PIL import Image | |
import torch | |
import torch.nn.functional as F | |
import time | |
# Checkpoint identifier shared by the preprocessor and the classifier below.
MODEL_NAME = 'google/vit-base-patch16-224'

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the preprocessor and the classifier from the same pretrained
# checkpoint, then move the classifier's weights onto the selected device.
feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL_NAME)
model = ViTForImageClassification.from_pretrained(MODEL_NAME)
model = model.to(device)
def predict(image):
    """Classify one image with the module-level ViT model.

    Args:
        image: The image to classify. It is handed unchanged to
            ``feature_extractor``; the Gradio interface in this file is
            configured to supply a PIL image.

    Returns:
        A single-entry dict mapping the top predicted label to its softmax
        probability as a Python ``float``. Only the text before the first
        comma of the ``model.config.id2label`` entry is used as the label.
    """
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    # Inference only: without no_grad() every request would build an autograd
    # graph that is immediately thrown away, wasting memory and time.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = F.softmax(logits, dim=-1)
    predicted_class_prob = probabilities.max().item()
    # .item() requires exactly one prediction, i.e. a single input image.
    predicted_class_idx = logits.argmax(-1).item()
    label = model.config.id2label[predicted_class_idx].split(",")[0]

    # NOTE(review): fixed 2-second pause kept from the original; presumably it
    # simulates slow inference to demonstrate request queueing — confirm
    # before removing.
    time.sleep(2)
    return {label: float(predicted_class_prob)}
import gradio as gr | |
# Build the web UI: one PIL image input fed to `predict`, with the returned
# {label: probability} dict rendered by Gradio's "label" output component.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="pil"),
    outputs="label",
)

# Enable the request queue with concurrency_count=1, then start the server.
demo.queue(concurrency_count=1).launch()