File size: 1,472 Bytes
2daf3c7
 
8e2b754
 
 
 
 
 
 
 
 
c309418
8e2b754
 
 
 
 
 
 
c309418
8e2b754
 
c309418
 
 
8e2b754
 
c309418
8e2b754
 
 
c309418
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os

import jax
import torch
from torchvision.io import ImageReadMode, read_image
from transformers import AutoTokenizer

from modeling_hybrid_clip import FlaxHybridCLIP
from run_hybrid_clip import Transform


def prepare_image(image_path, model):
    """Load an image file and convert it into a one-item pixel batch.

    The image is read as RGB, run through a TorchScript-compiled
    ``Transform`` built for the model's configured vision input size, and
    wrapped in a batch of one. Axis 1 of that batch is then moved to the
    last position (channels-first to channels-last, assuming ``Transform``
    keeps the channels-first layout produced by ``read_image``).

    Args:
        image_path: Path of the image file to read.
        model: Model whose ``config.vision_config.image_size`` is passed to
            ``Transform``.

    Returns:
        NumPy array containing the single preprocessed image.
    """
    raw_image = read_image(image_path, mode=ImageReadMode.RGB)
    target_size = model.config.vision_config.image_size
    scripted_transform = torch.jit.script(Transform(target_size))
    transformed = scripted_transform(raw_image)
    # Stack a one-element list to add the leading batch dimension.
    batch = torch.stack([transformed])
    return batch.permute(0, 2, 3, 1).numpy()

def prepare_text(text, tokenizer):
    """Tokenize ``text`` and request NumPy tensors from the tokenizer.

    Args:
        text: Input passed straight through to ``tokenizer``.
        tokenizer: Callable accepting the text and a ``return_tensors``
            keyword.

    Returns:
        Whatever ``tokenizer`` returns for ``return_tensors="np"``.
    """
    encoded = tokenizer(text, return_tensors="np")
    return encoded

def run_inference(image_path, text, model, tokenizer):
    """Score one image against one text with the given model.

    The image and text are preprocessed with ``prepare_image`` and
    ``prepare_text``, the model is called in inference mode, and the
    sigmoid of its ``logits_per_image`` output is taken.

    Args:
        image_path: Path of the image file to score.
        text: Text to compare against the image.
        model: Callable model accepting input ids, pixel values and the
            keyword arguments used below.
        tokenizer: Tokenizer whose output provides ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.

    Returns:
        The ``[0][0]`` entry of the sigmoid-activated image logits.
    """
    image_batch = prepare_image(image_path, model)
    encoded_text = prepare_text(text, tokenizer)
    outputs = model(
        encoded_text["input_ids"],
        image_batch,
        attention_mask=encoded_text["attention_mask"],
        token_type_ids=encoded_text["token_type_ids"],
        train=False,
        return_dict=True,
    )
    image_logits = outputs["logits_per_image"]
    # Only one image and one text are supplied, so the first entry is the score.
    return jax.nn.sigmoid(image_logits)[0][0]


if __name__ == "__main__":
    # Demo entry point: load the hybrid CLIP checkpoint and the Spanish BERT
    # tokenizer, then print the score for one sample image/caption pair.
    model = FlaxHybridCLIP.from_pretrained("clip_spanish_141230_samples")
    tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

    # Resolve the home directory with expanduser instead of assuming
    # /home/$USER: that lookup raised KeyError when USER was unset and
    # pointed at the wrong place when the home directory lives elsewhere.
    image_path = os.path.join(
        os.path.expanduser("~"), "data", "wit_scale_converted", "Santuar.jpg"
    )
    text = "Fachada del Santuario"

    print(run_inference(image_path, text, model, tokenizer))