File size: 18,949 Bytes
2df1399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5624190
2df1399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
#!/usr/bin/env python3

import torch
from PIL import Image
import numpy as np
from typing import cast
import pprint
from pathlib import Path
import base64
from io import BytesIO
from typing import Union, Tuple
import matplotlib
import re

from colpali_engine.models import ColPali, ColPaliProcessor
from colpali_engine.utils.torch_utils import get_torch_device
from einops import rearrange
from vidore_benchmark.interpretability.plot_utils import plot_similarity_heatmap
from vidore_benchmark.interpretability.torch_utils import (
    normalize_similarity_map_per_query_token,
)
from vidore_benchmark.interpretability.vit_configs import VIT_CONFIG
from vidore_benchmark.utils.image_utils import scale_image
from vespa.application import Vespa
from vespa.io import VespaQueryResponse

# Select the non-interactive Agg backend so figures can be rendered without a
# display (this module only reads pixel data back from the canvas).
matplotlib.use("Agg")

# Upper bound on the number of query-token vectors kept by
# float_to_binary_embedding.
MAX_QUERY_TERMS = 64
# OUTPUT_DIR = Path(__file__).parent.parent / "output" / "sim_maps"
# OUTPUT_DIR.mkdir(exist_ok=True)

# Hugging Face hub cache locations of the two checkpoints. The IDs use the
# cache-directory spelling ("org--name") and are pinned to a snapshot hash.
# NOTE(review): the *_PATH constants are not referenced in the visible part of
# this module — confirm whether other modules import them.
COLPALI_GEMMA_MODEL_ID = "vidore--colpaligemma-3b-pt-448-base"
COLPALI_GEMMA_MODEL_SNAPSHOT = "12c59eb7e23bc4c26876f7be7c17760d5d3a1ffa"
COLPALI_GEMMA_MODEL_PATH = (
    Path().home()
    / f".cache/huggingface/hub/models--{COLPALI_GEMMA_MODEL_ID}/snapshots/{COLPALI_GEMMA_MODEL_SNAPSHOT}"
)
COLPALI_MODEL_ID = "vidore--colpali-v1.2"
COLPALI_MODEL_SNAPSHOT = "9912ce6f8a462d8cf2269f5606eabbd2784e764f"
COLPALI_MODEL_PATH = (
    Path().home()
    / f".cache/huggingface/hub/models--{COLPALI_MODEL_ID}/snapshots/{COLPALI_MODEL_SNAPSHOT}"
)
# Same base-model ID in "org/name" form; used as the key into VIT_CONFIG by
# load_vit_config.
COLPALI_GEMMA_MODEL_NAME = COLPALI_GEMMA_MODEL_ID.replace("--", "/")


def load_model() -> Tuple[ColPali, ColPaliProcessor]:
    """Load the ColPali v1.2 checkpoint and its matching processor.

    The device is chosen by ``get_torch_device("auto")``. Weights are loaded in
    bfloat16 when CUDA is available and in float32 otherwise, and the model is
    switched to eval mode before being returned.

    Returns:
        A ``(model, processor)`` tuple.
    """
    checkpoint = "vidore/colpali-v1.2"

    target_device = get_torch_device("auto")
    print(f"Using device: {target_device}")

    # Half-precision only where CUDA is present; fall back to full precision.
    if torch.cuda.is_available():
        weights_dtype = torch.bfloat16
    else:
        weights_dtype = torch.float32

    pretrained = ColPali.from_pretrained(
        checkpoint,
        torch_dtype=weights_dtype,
        device_map=target_device,
    )
    colpali = cast(ColPali, pretrained).eval()

    colpali_processor = cast(
        ColPaliProcessor, ColPaliProcessor.from_pretrained(checkpoint)
    )
    return colpali, colpali_processor


def load_vit_config(model):
    """Return the ViT configuration registered for the ColPali base model.

    The full ``VIT_CONFIG`` table is printed first, then the entry keyed by
    ``COLPALI_GEMMA_MODEL_NAME`` is returned.

    Args:
        model: Accepted for call-site symmetry; not used by the lookup.

    Raises:
        KeyError: If ``VIT_CONFIG`` has no entry for the base model name.
    """
    print(f"VIT config: {VIT_CONFIG}")
    return VIT_CONFIG[COLPALI_GEMMA_MODEL_NAME]


# Blank white 448x448 RGB placeholder image. get_result_from_query passes it as
# the `image` argument of get_query_embeddings_and_token_map, which requires
# the parameter but does not read it.
dummy_image = Image.new("RGB", (448, 448), (255, 255, 255))


def gen_similarity_map(
    model,
    processor,
    device,
    vit_config,
    query,
    image: Union[Path, str],
    token_idx: int = 10,
):
    """Plot the similarity heatmap between one query token and an image.

    Both the query and the image are embedded with ``model``; the per-patch
    similarity of every query token is computed, normalized per token, and the
    map of the token at ``token_idx`` is drawn on top of the image.

    Args:
        model: ColPali model used for both forward passes.
        processor: Processor providing ``process_queries``, ``process_images``,
            ``decode``, ``tokenizer`` and ``image_seq_length``.
        device: Device the processed inputs are moved to.
        vit_config: Object exposing ``n_patch_per_dim``, ``patch_size`` and
            ``resolution``.
        query: Query text.
        image: Either a ``Path`` to an image file or a base64-encoded image
            string. Any other value is used as-is (presumably an already
            opened PIL image — TODO confirm with callers).
        token_idx: Index of the query token whose map is plotted. Defaults to
            10, the value that used to be hard-coded here.

    Returns:
        The ``(fig, ax)`` pair produced by ``plot_similarity_heatmap``, with
        the axes annotated by ``annotate_plot``.

    Raises:
        ValueError: If the image cannot be opened or decoded.
        IndexError: If ``token_idx`` is outside the processed query tokens.
    """
    if isinstance(image, Path):
        # image is a file path
        try:
            image = Image.open(image)
        except Exception as e:
            raise ValueError(f"Failed to open image from path: {e}") from e
    elif isinstance(image, str):
        # image is b64 string
        try:
            image = Image.open(BytesIO(base64.b64decode(image)))
        except Exception as e:
            raise ValueError(f"Failed to open image from b64: {e}") from e

    # NOTE(review): the scaled copy is discarded; this looks like a leftover
    # notebook preview. Kept in case scale_image has side effects — confirm.
    scale_image(image, 512)

    # Preprocess inputs
    input_text_processed = processor.process_queries([query]).to(device)
    input_image_processed = processor.process_images([image]).to(device)

    # Fail early, before the expensive forward passes, on an unusable index.
    n_query_tokens = input_text_processed.input_ids.shape[1]
    if not -n_query_tokens <= token_idx < n_query_tokens:
        raise IndexError(
            f"token_idx {token_idx} is out of range for {n_query_tokens} query tokens"
        )

    # Forward passes
    with torch.no_grad():
        output_text = model.forward(**input_text_processed)
        output_image = model.forward(**input_image_processed)

    # Keep only the image-patch positions, dropping the trailing special tokens.
    output_image = output_image[
        :, : processor.image_seq_length, :
    ]  # (1, n_patches_x * n_patches_y, dim)

    # Rearrange the output image tensor to explicitly represent the 2D grid of patches
    output_image = rearrange(
        output_image,
        "b (h w) c -> b h w c",
        h=vit_config.n_patch_per_dim,
        w=vit_config.n_patch_per_dim,
    )  # (1, n_patches_x, n_patches_y, dim)

    # Dot product of every query-token embedding with every patch embedding.
    similarity_map = torch.einsum(
        "bnk,bijk->bnij", output_text, output_image
    )  # (1, query_tokens, n_patches_x, n_patches_y)

    similarity_map_normalized = normalize_similarity_map_per_query_token(
        similarity_map
    )  # (1, query_tokens, n_patches_x, n_patches_y)

    query_tokens = processor.tokenizer.tokenize(
        processor.decode(input_text_processed.input_ids[0])
    )
    selected_token = processor.decode(
        input_text_processed.input_ids[0, token_idx]
    ).strip()
    print(f"Selected token: `{selected_token}`")
    # Index -> token listing, useful for picking a different token_idx.
    pprint.pprint({idx: val for idx, val in enumerate(query_tokens)})

    # Resize the image to the square resolution the heatmap is plotted at.
    input_image_square = image.resize((vit_config.resolution, vit_config.resolution))

    # Plot the similarity map
    fig, ax = plot_similarity_heatmap(
        input_image_square,
        patch_size=vit_config.patch_size,
        image_resolution=vit_config.resolution,
        similarity_map=similarity_map_normalized[0, token_idx, :, :],
    )
    # annotate_plot takes (ax, query, selected_token); the query must be passed.
    ax = annotate_plot(ax, query, selected_token)
    return fig, ax


# def save_figure(fig, filename: str = "similarity_map.png"):
#     fig.savefig(
#         OUTPUT_DIR / filename,
#         bbox_inches="tight",
#         pad_inches=0,
#     )


def annotate_plot(ax, query, selected_token):
    """Title the axes with the query and label the selected token.

    Args:
        ax: Axes-like object providing ``set_title`` and ``annotate``.
        query: Text used as the plot title.
        selected_token: Token shown in a boxed label near the top of the axes.

    Returns:
        The same ``ax`` object, after annotation.
    """
    # White rounded box with a thin black outline behind the label text.
    label_box = {"boxstyle": "round,pad=0.3", "fc": "white", "ec": "black", "lw": 1}
    # Centered horizontally, 95% of the way up, in axes-relative coordinates.
    label_style = {
        "xy": (0.5, 0.95),
        "xycoords": "axes fraction",
        "ha": "center",
        "va": "center",
        "fontsize": 18,
        "color": "black",
        "bbox": label_box,
    }

    ax.set_title(query, fontsize=18)
    ax.annotate(f"Selected token:`{selected_token}`", **label_style)
    return ax


def gen_similarity_map_new(
    processor: ColPaliProcessor,
    model: ColPali,
    device,
    vit_config,
    query: str,
    query_embs: torch.Tensor,
    token_idx_map: dict,
    token_to_show: str,
    image: Union[Path, str],
):
    """Plot the similarity heatmap for one query token from precomputed query embeddings.

    Only the image is run through the model; the query side comes from
    ``query_embs``, which gets a leading batch axis added before use.

    Args:
        processor: Processor providing ``process_images`` and ``image_seq_length``.
        model: ColPali model used for the image forward pass.
        device: Device the image inputs and query embeddings are moved to.
        vit_config: Object exposing ``n_patch_per_dim``, ``patch_size`` and
            ``resolution``.
        query: Query text, used as the plot title.
        query_embs: Query-token embeddings without a batch axis.
        token_idx_map: Mapping from token string to its index in ``query_embs``.
        token_to_show: Token whose similarity map is plotted.
        image: A ``Path`` to an image file or a base64-encoded image string;
            any other value is used unchanged.

    Returns:
        The ``(fig, ax)`` pair from ``plot_similarity_heatmap``, annotated.

    Raises:
        ValueError: If the image cannot be opened or decoded.
        KeyError: If ``token_to_show`` is not a key of ``token_idx_map``.
    """
    page = image
    if isinstance(image, Path):
        # A filesystem path: let PIL open it.
        try:
            page = Image.open(image)
        except Exception as e:
            raise ValueError(f"Failed to open image from path: {e}")
    elif isinstance(image, str):
        # A base64 payload: decode to bytes and open from memory.
        try:
            page = Image.open(BytesIO(base64.b64decode(image)))
        except Exception as e:
            raise ValueError(f"Failed to open image from b64: {e}")

    token_idx = token_idx_map[token_to_show]
    print(f"Selected token: `{token_to_show}`")

    # Embed the image only.
    image_batch = processor.process_images([page]).to(device)
    with torch.no_grad():
        image_embs = model.forward(**image_batch)
    print(f"Output image shape before dim: {image_embs.shape}")

    # Drop everything after the image-patch positions (special tokens).
    patch_embs = image_embs[:, : processor.image_seq_length, :]
    print(f"Output image shape after dim: {patch_embs.shape}")

    # Unflatten the patch sequence into a square 2D grid.
    grid_side = vit_config.n_patch_per_dim
    patch_grid = rearrange(
        patch_embs,
        "b (h w) c -> b h w c",
        h=grid_side,
        w=grid_side,
    )

    print(f"Query embs shape: {query_embs.shape}")
    # Give the query embeddings a batch axis to line up with the image batch.
    batched_query = query_embs.unsqueeze(0).to(device)
    print(f"Output image shape: {patch_grid.shape}")

    # (batch, query_tokens, grid_rows, grid_cols) dot-product scores.
    raw_scores = torch.einsum("bnk,bijk->bnij", batched_query, patch_grid)
    print(f"Similarity map shape: {raw_scores.shape}")

    scores = normalize_similarity_map_per_query_token(raw_scores)
    print(f"Similarity map normalized shape: {scores.shape}")

    side_px = vit_config.resolution
    square_page = page.resize((side_px, vit_config.resolution))

    fig, ax = plot_similarity_heatmap(
        square_page,
        patch_size=vit_config.patch_size,
        image_resolution=vit_config.resolution,
        similarity_map=scores[0, token_idx, :, :],
    )
    ax = annotate_plot(ax, query, token_to_show)
    return fig, ax


def get_query_embeddings_and_token_map(
    processor, model, query, image
) -> Tuple[torch.Tensor, dict]:
    """Embed a query and map each of its tokens to a position.

    Args:
        processor: Processor providing ``process_queries``, ``decode`` and
            ``tokenizer.tokenize``.
        model: Callable model with a ``device`` attribute.
        query: Query text.
        image: Not used by this function.

    Returns:
        ``(q_emb, token_to_idx)`` where ``q_emb`` is the first (only) item of
        the model output moved to CPU, and ``token_to_idx`` maps each token
        string to its index in the tokenized query. When a token occurs more
        than once, the last occurrence wins.
    """
    batch = processor.process_queries([query]).to(model.device)
    with torch.no_grad():
        # Single-query batch: take element 0 after moving to CPU.
        q_emb = model(**batch).to("cpu")[0]

    # Round-trip the ids through decode -> tokenize to get token strings.
    decoded_query = processor.decode(batch.input_ids[0])
    query_tokens = processor.tokenizer.tokenize(decoded_query)
    print(query_tokens)

    token_to_idx = {}
    for position, token in enumerate(query_tokens):
        token_to_idx[token] = position
    return q_emb, token_to_idx


def format_query_results(query, response, hits=5) -> dict:
    """Print a one-line summary of a query response and return its JSON body.

    Args:
        query: Query text, echoed in the summary.
        response: Object whose ``json`` attribute is the response dict.
        hits: Not used by this function.

    Returns:
        ``response.json``, unchanged.
    """
    timing_info = response.json.get("timing", {})
    root_fields = response.json.get("root", {}).get("fields", {})

    # -1 marks a response that carries no search time.
    query_time = round(timing_info.get("searchtime", -1), 2)
    count = root_fields.get("totalCount", 0)

    result_text = f"Query text: '{query}', query time {query_time}s, count={count}, top results:\n"
    print(result_text)
    return response.json


async def query_vespa_default(
    app: Vespa,
    query: str,
    q_emb: torch.Tensor,
    hits: int = 3,
    timeout: str = "10s",
    **kwargs,
) -> dict:
    """Run a text query against Vespa with the "default" ranking.

    Args:
        app: Vespa application to query.
        query: Query text, matched through ``userQuery()``.
        q_emb: Query-token embeddings, sent as the ``qt`` query tensor.
        hits: Number of hits to request.
        timeout: Query timeout string passed to Vespa.
        **kwargs: Extra request-body entries; these override the defaults.

    Returns:
        The response JSON, as returned by ``format_query_results``.

    Raises:
        AssertionError: If the response is not successful.
    """
    async with app.asyncio(connections=1, total_timeout=120) as session:
        request_body = {
            "yql": "select id,title,url,image,page_number,text from pdf_page where userQuery();",
            "ranking": "default",
            "query": query,
            "timeout": timeout,
            "hits": hits,
            "input.query(qt)": format_q_embs(q_emb),
            "presentation.timing": True,
        }
        # Caller-supplied entries take precedence over the defaults above.
        request_body.update(kwargs)
        response: VespaQueryResponse = await session.query(body=request_body)
        assert response.is_successful(), response.json
    return format_query_results(query, response)


def float_to_binary_embedding(
    float_query_embedding: dict, max_terms: Union[int, None] = None
) -> dict:
    """Binarize float query-token vectors into packed int8 lists.

    Each vector is thresholded at zero (positive -> 1, otherwise 0), the bits
    are packed eight per byte, and the bytes are reinterpreted as signed int8.
    Keys and their order are preserved.

    Args:
        float_query_embedding: Mapping from term key to a sequence of floats.
        max_terms: Maximum number of terms to keep. ``None`` (the default)
            uses the module-level ``MAX_QUERY_TERMS``.

    Returns:
        Mapping from the same keys to lists of ints in ``[-128, 127]``,
        containing at most ``max_terms`` entries.
    """
    limit = MAX_QUERY_TERMS if max_terms is None else max_terms
    binary_query_embeddings = {}
    for k, v in float_query_embedding.items():
        # Check before inserting so the warning is only emitted when a term is
        # really being dropped, not when the input has exactly `limit` terms.
        if len(binary_query_embeddings) >= limit:
            print(f"Warning: Query has more than {limit} terms. Truncating.")
            break
        binary_query_embeddings[k] = (
            np.packbits(np.asarray(v) > 0).astype(np.int8).tolist()
        )
    return binary_query_embeddings


def create_nn_query_strings(
    binary_query_embeddings: dict, target_hits_per_query_tensor: int = 20
) -> Tuple[str, dict]:
    """Build the nearestNeighbor YQL clause and its query tensors.

    Args:
        binary_query_embeddings: Mapping keyed by the integers ``0..n-1``;
            entry ``i`` becomes query tensor ``rq{i}``.
        target_hits_per_query_tensor: ``targetHits`` value used for every
            nearestNeighbor operator.

    Returns:
        ``(nn, nn_query_dict)``: the operators joined with " OR ", and a
        mapping from ``input.query(rq{i})`` to the corresponding vector.

    Raises:
        KeyError: If an integer key in ``0..n-1`` is missing.
    """
    term_count = len(binary_query_embeddings)

    nn_query_dict = {
        f"input.query(rq{i})": binary_query_embeddings[i] for i in range(term_count)
    }

    operators = []
    for i in range(term_count):
        operators.append(
            f"({{targetHits:{target_hits_per_query_tensor}}}nearestNeighbor(embedding,rq{i}))"
        )
    return " OR ".join(operators), nn_query_dict


def format_q_embs(q_embs: torch.Tensor) -> dict:
    """Convert query embeddings into a ``{row_index: list_of_values}`` dict.

    Iterates over the first axis of ``q_embs`` and calls ``tolist()`` on each
    row, keying the result by the row's position.
    """
    float_query_embedding = {}
    for row_index, row in enumerate(q_embs):
        float_query_embedding[row_index] = row.tolist()
    return float_query_embedding


async def query_vespa_nearest_neighbor(
    app: Vespa,
    query: str,
    q_emb: torch.Tensor,
    target_hits_per_query_tensor: int = 20,
    hits: int = 3,
    timeout: str = "10s",
    **kwargs,
) -> dict:
    """Query Vespa using one nearestNeighbor operator per query token.

    The float embeddings are sent as ``qt`` and their binarized form as
    ``qtb``; each binarized token vector is additionally sent as ``rq{i}`` for
    its nearestNeighbor operator. Ranking uses the "retrieval-and-rerank"
    profile.

    Args:
        app: Vespa application to query.
        query: Query text (only used in the printed summary).
        q_emb: Query-token embeddings.
        target_hits_per_query_tensor: ``targetHits`` for every operator
            (speed vs. accuracy trade-off).
        hits: Number of hits to request.
        timeout: Query timeout string passed to Vespa.
        **kwargs: Extra request-body entries; these override the defaults.

    Returns:
        The response JSON, as returned by ``format_query_results``.

    Raises:
        AssertionError: If the response is not successful.
    """
    async with app.asyncio(connections=1, total_timeout=180) as session:
        float_vectors = format_q_embs(q_emb)
        binary_vectors = float_to_binary_embedding(float_vectors)
        nn_string, per_token_tensors = create_nn_query_strings(
            binary_vectors, target_hits_per_query_tensor
        )

        # Mixed tensors for the MaxSim calculations, plus one tensor per operator.
        request_body = {
            "input.query(qtb)": binary_vectors,
            "input.query(qt)": float_vectors,
        }
        request_body.update(per_token_tensors)
        request_body.update(
            {
                "presentation.timing": True,
                "yql": f"select id,title,text,url,image,page_number from pdf_page where {nn_string}",
                "ranking.profile": "retrieval-and-rerank",
                "timeout": timeout,
                "hits": hits,
            }
        )
        # Caller-supplied entries take precedence over everything above.
        request_body.update(kwargs)

        response: VespaQueryResponse = await session.query(body=request_body)
        assert response.is_successful(), response.json
    return format_query_results(query, response)


def is_special_token(token: str) -> bool:
    """Tell whether a token should be skipped when drawing similarity maps.

    A token is treated as special when it starts with '<', consists only of
    digits, consists only of whitespace, or is a single character. The empty
    string is not special.
    """
    hit = re.match(r"^<.*$|^\d+$|^\s+$|^.$", token)
    return hit is not None


async def get_result_from_query(
    app: Vespa,
    processor: ColPaliProcessor,
    model: ColPali,
    query: str,
    nn=False,
    gen_sim_map=False,
):
    """Embed a query, run it against Vespa and optionally attach similarity maps.

    Args:
        app: Vespa application to query.
        processor: ColPali processor.
        model: ColPali model.
        query: Query text.
        nn: When true, use ``query_vespa_nearest_neighbor``; otherwise
            ``query_vespa_default``.
        gen_sim_map: When true, add a ``sim_map_<token>`` field to every hit
            for each query token that ``is_special_token`` does not reject.

    Returns:
        The Vespa response JSON. With ``gen_sim_map`` the hits' ``fields``
        dicts are extended in place with base64-encoded canvas bytes.
    """
    print(query)
    q_embs, token_to_idx = get_query_embeddings_and_token_map(
        processor, model, query, dummy_image
    )
    print(token_to_idx)

    if nn:
        result = await query_vespa_nearest_neighbor(app, query, q_embs)
    else:
        result = await query_vespa_default(app, query, q_embs)

    # A result with zero hits may carry no "children" entry; treat it as empty
    # instead of raising KeyError.
    children = result.get("root", {}).get("children", [])

    # Print score, title and id of the results
    for idx, child in enumerate(children, start=1):
        print(
            f"Result {idx}: {child['relevance']}, {child['fields']['title']}, {child['fields']['id']}"
        )

    if not gen_sim_map or not children:
        return result

    # The token filter does not depend on the hit, so evaluate it once.
    tokens_to_plot = []
    for token in token_to_idx:
        if is_special_token(token):
            print(f"Skipping special token: {token}")
            continue
        tokens_to_plot.append(token)
    if not tokens_to_plot:
        return result

    # Local import: only needed on this path, to release figures after use.
    import matplotlib.pyplot as plt

    # Loop-invariant: look the ViT config up once rather than per hit/token.
    vit_config = load_vit_config(model)

    for single_result in children:
        img = single_result["fields"]["image"]
        for token in tokens_to_plot:
            fig, ax = gen_similarity_map_new(
                processor,
                model,
                model.device,
                vit_config,
                query,
                q_embs,
                token_to_idx,
                token,
                img,
            )
            # NOTE(review): tostring_rgb yields the raw canvas buffer (not an
            # encoded image file) and is deprecated in recent Matplotlib
            # releases — confirm what the consumer of sim_map_* expects before
            # switching to buffer_rgba()/savefig.
            sim_map = base64.b64encode(fig.canvas.tostring_rgb()).decode("utf-8")
            single_result["fields"][f"sim_map_{token}"] = sim_map
            # Presumably the figure is pyplot-managed; closing it prevents one
            # open figure accumulating per hit and token. Harmless otherwise.
            plt.close(fig)
    return result


def get_result_dummy(query: str, nn: bool = False):
    """Return a canned Vespa-style response with three hits.

    Both arguments are ignored; a freshly built dict is returned on every
    call. The ``image`` field of each hit holds a placeholder string rather
    than real base64 image data.
    """
    return {
        "timing": {
            "querytime": 0.23700000000000002,
            "summaryfetchtime": 0.001,
            "searchtime": 0.23900000000000002,
        },
        "root": {
            "id": "toplevel",
            "relevance": 1,
            "fields": {"totalCount": 59},
            "coverage": {
                "coverage": 100,
                "documents": 155,
                "full": True,
                "nodes": 1,
                "results": 1,
                "resultsFull": 1,
            },
            "children": [
                {
                    "id": "index:colpalidemo_content/0/424c85e7dece761d226f060f",
                    "relevance": 2354.050122871995,
                    "source": "colpalidemo_content",
                    "fields": {
                        "id": "a767cb1868be9a776cd56b768347b089",
                        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-sustainability-report.pdf",
                        "title": "ConocoPhillips 2023 Sustainability Report",
                        "page_number": 50,
                        "image": "empty for now - is base64 encoded image",
                    },
                },
                {
                    "id": "index:colpalidemo_content/0/b927c4979f0beaf0d7fab8e9",
                    "relevance": 2313.7529950886965,
                    "source": "colpalidemo_content",
                    "fields": {
                        "id": "9f2fc0aa02c9561adfaa1451c875658f",
                        "url": "https://static.conocophillips.com/files/resources/conocophillips-2023-managing-climate-related-risks.pdf",
                        "title": "ConocoPhillips Managing Climate Related Risks",
                        "page_number": 44,
                        "image": "empty for now - is base64 encoded image",
                    },
                },
                {
                    "id": "index:colpalidemo_content/0/9632d72238829d6afefba6c9",
                    "relevance": 2312.230182081461,
                    "source": "colpalidemo_content",
                    "fields": {
                        "id": "d638ded1ddcb446268b289b3f65430fd",
                        "url": "https://static.conocophillips.com/files/resources/24-0976-sustainability-highlights_nature.pdf",
                        "title": "ConocoPhillips Sustainability Highlights - Nature (24-0976)",
                        "page_number": 0,
                        "image": "empty for now - is base64 encoded image",
                    },
                },
            ],
        },
    }


if __name__ == "__main__":
    # Manual smoke run: load the model, render one similarity map for a sample
    # page image shipped under static/assets, then print the canned result.
    model, processor = load_model()
    vit_config = load_vit_config(model)
    query = "How many percent of source water is fresh water?"
    # The asset lives two directory levels above this file.
    image_filepath = Path(__file__).parent.parent.joinpath(
        "static",
        "assets",
        "ConocoPhillips Sustainability Highlights - Nature (24-0976).png",
    )
    gen_similarity_map(
        model,
        processor,
        model.device,
        vit_config,
        query=query,
        image=image_filepath,
    )
    result = get_result_dummy("dummy query")
    print(result)
    print("Done")