import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

import time

import numpy as np
import torch
from PIL import Image
from transformers import CLIPModel as CLIPTransformersModel
from transformers import CLIPProcessor

from models.base_model import BaseModelMainModel


class CLIPModel(BaseModelMainModel):
    def __init__(
        self,
        name_model: str,
        freeze_model: bool,
        pretrained_model: bool,
        support_set_method: str,
    ):
        super().__init__(name_model, freeze_model, pretrained_model, support_set_method)
        self.init_model()

    def init_model(self):
        self.model = CLIPTransformersModel.from_pretrained(self.name_model)

        # Re-initialize weights when a non-pretrained model is requested.
        if not self.pretrained_model:
            for layer in self.model.children():
                if hasattr(layer, "reset_parameters"):
                    layer.reset_parameters()

        # Freeze parameters when requested. Note: the attribute is
        # "requires_grad" (not "required_grad"), and freeze_model=True
        # should disable gradients.
        for param in self.model.parameters():
            param.requires_grad = not self.freeze_model

        self.model.to(self.device)
        self.model.eval()
        self.processor = CLIPProcessor.from_pretrained(self.name_model)

    def predict(self, image: np.ndarray, list_class: tuple) -> dict:
        image = Image.fromarray(image)
        with torch.no_grad():
            inputs = self.processor(
                text=list_class, images=image, return_tensors="pt", padding=True
            )
            # Move input tensors to the same device as the model.
            inputs = {key: value.to(self.device) for key, value in inputs.items()}

            start_time = time.perf_counter()
            outputs = self.model(**inputs)
            inference_time = time.perf_counter() - start_time

            logits_per_image = outputs.logits_per_image
            probs = logits_per_image.softmax(dim=1)
            argmax_probs = probs.argmax(dim=1)
            result = list_class[argmax_probs[0]]

        return {
            "class": result,
            "confidence": float(probs[0, argmax_probs[0]]),
            "inference_time": inference_time,
        }
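

# Minimal usage sketch, not part of the original module: the checkpoint name,
# the image path, and the support_set_method value below are placeholders, and
# BaseModelMainModel is assumed to accept these constructor arguments and to
# set `self.device`.
if __name__ == "__main__":
    clip = CLIPModel(
        name_model="openai/clip-vit-base-patch32",  # example Hugging Face checkpoint
        freeze_model=True,
        pretrained_model=True,
        support_set_method="",  # placeholder; depends on BaseModelMainModel
    )
    # Any RGB image loaded as a NumPy array works; the path is illustrative only.
    image = np.array(Image.open("example.jpg").convert("RGB"))
    prediction = clip.predict(image, ("a photo of a cat", "a photo of a dog"))
    print(prediction)  # {"class": ..., "confidence": ..., "inference_time": ...}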