File size: 2,432 Bytes
ecc6de5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
faf09ce
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from warnings import filterwarnings

import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from transformers import CLIPProcessor, CLIPModel, ViTImageProcessor, ViTModel

filterwarnings("ignore")

# Registry of supported vision backbones: Hugging Face checkpoint id,
# approximate download size, and a rough efficiency rating.
models_info = {
    "CLIP-ViT Base": {
        "model_size": "386MB",
        "model_url": "openai/clip-vit-base-patch32",
        "efficiency": "High",
    },
    "ViT Base": {
        "model_size": "304MB",
        "model_url": "google/vit-base-patch16-224",
        "efficiency": "High",
    },
    "DINO ViT-S16": {
        "model_size": "1.34GB",
        "model_url": "facebook/dino-vits16",
        "efficiency": "Moderate",
    },
}

# Display order mirrors the registry's insertion order.
models = list(models_info)

class Image_Validator:
    """Embedding-based similarity between two images via a pretrained backbone.

    The chosen backbone decides which preprocessing attribute exists on the
    instance: CLIP checkpoints set ``self.processor``; plain ViT checkpoints
    (including DINO) set ``self.feature_extractor``. ``get_image_embedding``
    dispatches on whichever attribute is present.
    """

    def __init__(self, model_name=None):
        """Load the requested backbone and its preprocessor.

        Parameters
        ----------
        model_name : str or None
            A key of ``models_info``; defaults to ``"ViT Base"``.
            An unknown name raises ``KeyError`` (from the registry lookup).
        """
        if model_name is None:
            model_name = "ViT Base"

        # KeyError here doubles as validation of model_name.
        self.model_info = models_info[model_name]
        model_url = self.model_info["model_url"]

        if model_name == "CLIP-ViT Base":
            self.model = CLIPModel.from_pretrained(model_url)
            self.processor = CLIPProcessor.from_pretrained(model_url)
        else:
            # "ViT Base" and "DINO ViT-S16" are both ViT checkpoints and load
            # through the same classes, so one branch covers both.
            self.model = ViTModel.from_pretrained(model_url)
            self.feature_extractor = ViTImageProcessor.from_pretrained(model_url)

    def get_image_embedding(self, image):
        """Return the backbone's embedding tensor for ``image``.

        CLIP models return the pooled image-feature vector; plain ViT models
        return the full ``last_hidden_state`` token sequence (flattened later
        by ``similarity_score``).

        Raises
        ------
        RuntimeError
            If the instance has neither ``processor`` nor
            ``feature_extractor`` (previously this path crashed with an
            opaque ``UnboundLocalError``).
        """
        # Inference only: skip autograd graph construction to save memory.
        # Callers already treat the result as detached data.
        with torch.no_grad():
            if hasattr(self, 'processor'):  # CLIP models
                inputs = self.processor(images=image, return_tensors="pt")
                return self.model.get_image_features(**inputs)
            if hasattr(self, 'feature_extractor'):  # ViT models
                inputs = self.feature_extractor(images=image, return_tensors="pt")
                return self.model(**inputs).last_hidden_state

        raise RuntimeError(
            "No processor/feature_extractor configured for this model"
        )

    def similarity_score(self, image1, image2):
        """Cosine similarity between the two images' embeddings.

        Returns a float in [-1, 1]. NOTE(review): both embeddings are
        flattened to one row, so the two images must yield equally sized
        embeddings — true here because the processors resize inputs to a
        fixed resolution; verify if variable-size inputs are ever used.
        """
        embedding1 = self.get_image_embedding(image1).reshape(1, -1)
        embedding2 = self.get_image_embedding(image2).reshape(1, -1)
        similarity = cosine_similarity(
            embedding1.detach().numpy(), embedding2.detach().numpy()
        )
        return similarity[0][0]