import base64
from io import BytesIO
from typing import Dict, List, Any

import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Checkpoint used for both the model weights and the preprocessing config.
_MODEL_ID = "openai/clip-vit-base-patch32"


class EndpointHandler():
    """Inference handler that turns images into CLIP image embeddings."""

    def __init__(self, path=""):
        """Load the CLIP model and its processor once, at start-up.

        Args:
            path: Accepted for compatibility with the handler interface; it is
                not used, the checkpoint is always loaded from ``_MODEL_ID``.
        """
        # Preload everything needed at inference time.
        self.model = CLIPModel.from_pretrained(_MODEL_ID)
        self.processor = CLIPProcessor.from_pretrained(_MODEL_ID)

    def __call__(self, data: Any) -> Dict[str, List[List[float]]]:
        """Embed one or more images.

        Args:
            data: Either the image itself (raw encoded bytes, a base64 string
                or a ``PIL.Image.Image``), a list/tuple of such images, or a
                request dict carrying any of those under the ``"inputs"`` key.

        Returns:
            ``{"embeddings": [...]}`` with one list of floats per input image,
            in input order. Plain lists are returned (not tensors) so the
            response can be serialised as JSON.

        Raises:
            ValueError: If an empty list of images is supplied.
        """
        # Request payloads may arrive wrapped as {"inputs": ...}; fall back to
        # treating the dict/value itself as the payload when the key is absent.
        payload = data.get("inputs", data) if isinstance(data, dict) else data
        items = payload if isinstance(payload, (list, tuple)) else [payload]
        if not items:
            raise ValueError("no images provided")

        images = [file_to_image(item) for item in items]

        # CLIPModel is the PyTorch implementation, so the processor must emit
        # PyTorch tensors ("pt"); JAX arrays cannot be fed to it.
        # `padding=True` is kept from the original call; it presumably only
        # matters for text inputs - TODO confirm it is ignored for images.
        inputs = self.processor(images=images, return_tensors="pt", padding=True)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            emb = self.model.get_image_features(**inputs)

        return {"embeddings": emb.tolist()}


def file_to_image(file):
    """Decode *file* into an RGB ``PIL.Image.Image``.

    Args:
        file: Raw encoded image bytes (any bytes-like object), a base64 string
            (optionally prefixed with a ``data:...;base64,`` header), or an
            already-decoded ``PIL.Image.Image``.

    Returns:
        The image converted to RGB mode.

    Raises:
        ValueError: If a string input is not valid base64 (raised by
            ``base64.b64decode`` as ``binascii.Error``).
        TypeError: If the input is not bytes-like, ``str`` or a PIL image
            (raised by ``BytesIO``).
    """
    if isinstance(file, Image.Image):
        return file.convert("RGB")
    if isinstance(file, str):
        # Drop a data-URI header if present, keeping only the base64 payload.
        encoded = file.partition(",")[2] if file.startswith("data:") else file
        file = base64.b64decode(encoded)
    return Image.open(BytesIO(file)).convert("RGB")