import os

import natsort
import torch
from jax import numpy as jnp
from PIL import Image as PilImage
from tqdm import tqdm


class CustomDataSet(torch.utils.data.Dataset):
    """Image-folder dataset that yields transformed images in natural sort order."""

    def __init__(self, main_dir, transform):
        self.main_dir = main_dir
        self.transform = transform
        all_imgs = os.listdir(main_dir)
        self.total_imgs = natsort.natsorted(all_imgs)

    def __len__(self):
        return len(self.total_imgs)

    def get_image_name(self, idx):
        return self.total_imgs[idx]

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        image = PilImage.open(img_loc).convert("RGB")
        tensor_image = self.transform(image)
        return tensor_image
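
# Illustration (a sketch, not part of the original pipeline): the class above
# expects a flat directory of image files and a torchvision-style transform
# mapping a PIL image to a CHW float tensor. The "photos" directory name
# matches the Unsplash branch of find_image below; the file name shown in the
# comment is hypothetical.
#
#     dataset = CustomDataSet("photos", transform=clip_transform)
#     first_image = dataset[0]                 # transformed CHW tensor
#     first_name = dataset.get_image_name(0)   # e.g. "0001.jpg" (hypothetical)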


def text_encoder(text, model, tokenizer):
    """Embed a single text query and L2-normalize it; returns a (1, dim) array and its norm."""
    inputs = tokenizer(
        [text],
        max_length=96,
        truncation=True,
        padding="max_length",
        return_tensors="np",
    )
    embedding = model.get_text_features(
        inputs["input_ids"],
        inputs["attention_mask"],
    )[0]
    norms = jnp.linalg.norm(embedding, axis=-1, keepdims=True)
    embedding = embedding / norms
    return jnp.expand_dims(embedding, axis=0), norms


def image_encoder(image, model):
    """Embed a single CHW torch image tensor and L2-normalize the features."""
    image = image.permute(1, 2, 0).numpy()  # CHW -> HWC
    image = jnp.expand_dims(image, axis=0)  # add batch dimension
    features = model.get_image_features(image)
    norms = jnp.linalg.norm(features, axis=-1, keepdims=True)
    features = features / norms
    return features, norms


def precompute_image_features(model, loader):
    """Encode every batch from the loader and stack the L2-normalized features."""
    image_features = []
    for images in tqdm(loader):
        images = images.permute(0, 2, 3, 1).numpy()  # NCHW -> NHWC
        features = model.get_image_features(images)
        features /= jnp.linalg.norm(features, axis=-1, keepdims=True)
        image_features.extend(features)
    return jnp.array(image_features)


def find_image(text_query, model, dataset, tokenizer, image_features, n, dataset_name):
    """Return the top-n dataset entries (Unsplash file paths or CC items) for a text query."""
    zeroshot_weights, _ = text_encoder(text_query, model, tokenizer)
    # Both sides are L2-normalized, so the dot product is the cosine similarity.
    similarities = jnp.dot(image_features, zeroshot_weights.reshape(-1, 1))
    ranking = jnp.argsort(similarities, axis=0)  # ascending; best matches last
    file_paths = []
    for i in range(1, n + 1):
        idx = int(ranking[-i, 0])
        if dataset_name == "Unsplash":
            file_paths.append("photos/" + dataset.get_image_name(idx))
        elif dataset_name == "CC":
            file_paths.append(dataset[idx])
        else:
            raise ValueError(f"{dataset_name} not supported here")
    return file_paths
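

if __name__ == "__main__":
    # End-to-end sketch under stated assumptions, not a verbatim part of the
    # original pipeline: it presumes (a) the FlaxHybridCLIP class from the
    # transformers hybrid_clip research example is importable, (b) the
    # "clip-italian/clip-italian" checkpoint (whose get_text_features /
    # get_image_features match the call signatures used above), and (c) a
    # local "photos" directory, matching the Unsplash branch of find_image.
    # The 224x224 resolution and normalization statistics are the standard
    # CLIP preprocessing values and are likewise an assumption.
    from torchvision import transforms
    from transformers import AutoTokenizer
    from modeling_hybrid_clip import FlaxHybridCLIP  # assumed to be on the path

    model = FlaxHybridCLIP.from_pretrained("clip-italian/clip-italian")
    tokenizer = AutoTokenizer.from_pretrained("clip-italian/clip-italian")

    clip_transform = transforms.Compose([
        transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        ),
    ])
    dataset = CustomDataSet("photos", transform=clip_transform)
    loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)

    image_features = precompute_image_features(model, loader)
    top_paths = find_image(
        "una spiaggia al tramonto",  # "a beach at sunset"
        model,
        dataset,
        tokenizer,
        image_features,
        n=3,
        dataset_name="Unsplash",
    )
    print(top_paths)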