import gradio as gr import torch from PIL import Image from torchvision.transforms import functional as F from typing import List from transformers import CLIPModel, CLIPProcessor # Load the pre-trained model model_path = "1024_MLP_best-MSE4.1636_ep75.pth" model = torch.load(model_path) model.eval() # Load the CLIP model and processor clip_model = CLIPModel.from_pretrained("ViT-L/14") clip_processor = CLIPProcessor.from_pretrained("ViT-L/14") # Define the prediction function def predict(images: List[Image.Image]) -> float: image_tensors = [F.to_tensor(img) for img in images] inputs = clip_processor(images=image_tensors, return_tensors="pt", padding=True) with torch.no_grad(): outputs = model(inputs.pixel_values) scores = outputs.clamp(0, 10).cpu().numpy().reshape(-1).tolist() return scores # Define the Gradio interface iface = gr.Interface( fn=predict, inputs="image", outputs="number", title="Kemono Aesthetic Scorer", description="Predict the score of a kemono based on aesthetic features.", ) # Run the Gradio interface iface.launch()