Spaces: Running on T4

thomasht86 committed · 2df1399
1 Parent(s): ecc0caa

Upload colpali.py with huggingface_hub

colpali.py ADDED (+521 -0)
#!/usr/bin/env python3

import torch
from PIL import Image
import numpy as np
from typing import cast
import pprint
from pathlib import Path
import base64
from io import BytesIO
from typing import Union, Tuple
import matplotlib
import re

from colpali_engine.models import ColPali, ColPaliProcessor
from colpali_engine.utils.torch_utils import get_torch_device
from einops import rearrange
from vidore_benchmark.interpretability.plot_utils import plot_similarity_heatmap
from vidore_benchmark.interpretability.torch_utils import (
    normalize_similarity_map_per_query_token,
)
from vidore_benchmark.interpretability.vit_configs import VIT_CONFIG
from vidore_benchmark.utils.image_utils import scale_image
from vespa.application import Vespa
from vespa.io import VespaQueryResponse

matplotlib.use("Agg")

MAX_QUERY_TERMS = 64
# OUTPUT_DIR = Path(__file__).parent.parent / "output" / "sim_maps"
# OUTPUT_DIR.mkdir(exist_ok=True)

COLPALI_GEMMA_MODEL_ID = "vidore--colpaligemma-3b-pt-448-base"
COLPALI_GEMMA_MODEL_SNAPSHOT = "12c59eb7e23bc4c26876f7be7c17760d5d3a1ffa"
COLPALI_GEMMA_MODEL_PATH = (
    Path().home()
    / f".cache/huggingface/hub/models--{COLPALI_GEMMA_MODEL_ID}/snapshots/{COLPALI_GEMMA_MODEL_SNAPSHOT}"
)
COLPALI_MODEL_ID = "vidore--colpali-v1.2"
COLPALI_MODEL_SNAPSHOT = "9912ce6f8a462d8cf2269f5606eabbd2784e764f"
COLPALI_MODEL_PATH = (
    Path().home()
    / f".cache/huggingface/hub/models--{COLPALI_MODEL_ID}/snapshots/{COLPALI_MODEL_SNAPSHOT}"
)
COLPALI_GEMMA_MODEL_NAME = COLPALI_GEMMA_MODEL_ID.replace("--", "/")


def load_model() -> Tuple[ColPali, ColPaliProcessor]:
    model_name = "vidore/colpali-v1.2"

    device = get_torch_device("auto")
    print(f"Using device: {device}")

    # Load the model
    model = cast(
        ColPali,
        ColPali.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            device_map=device,
        ),
    ).eval()

    # Load the processor
    processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))
    return model, processor


def load_vit_config(model):
    # Load the ViT config
    print(f"VIT config: {VIT_CONFIG}")
    vit_config = VIT_CONFIG[COLPALI_GEMMA_MODEL_NAME]
    return vit_config


# Create dummy image
dummy_image = Image.new("RGB", (448, 448), (255, 255, 255))

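# Usage sketch for load_model / load_vit_config above (see the __main__ block at
# the bottom of this file):
#     model, processor = load_model()
#     vit_config = load_vit_config(model)
# Loading takes a while, so callers typically create the model and processor once
# and reuse them for all queries.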
def gen_similarity_map(
    model, processor, device, vit_config, query, image: Union[Path, str]
):
    # Should take in the b64 image from Vespa query result
    # And possibly the tensor representing the output_image
    if isinstance(image, Path):
        # image is a file path
        try:
            image = Image.open(image)
        except Exception as e:
            raise ValueError(f"Failed to open image from path: {e}")
    elif isinstance(image, str):
        # image is b64 string
        try:
            image = Image.open(BytesIO(base64.b64decode(image)))
        except Exception as e:
            raise ValueError(f"Failed to open image from b64: {e}")

    # Preview the image
    scale_image(image, 512)
    # Preprocess inputs
    input_text_processed = processor.process_queries([query]).to(device)
    input_image_processed = processor.process_images([image]).to(device)
    # Forward passes
    with torch.no_grad():
        output_text = model.forward(**input_text_processed)
        output_image = model.forward(**input_image_processed)
    # output_image is the tensor that we could get from the Vespa query
    # Output image shape: torch.Size([1, 1030, 128])
    # Remove the special tokens from the output
    output_image = output_image[
        :, : processor.image_seq_length, :
    ]  # (1, n_patches_x * n_patches_y, dim)

    # Rearrange the output image tensor to explicitly represent the 2D grid of patches
    output_image = rearrange(
        output_image,
        "b (h w) c -> b h w c",
        h=vit_config.n_patch_per_dim,
        w=vit_config.n_patch_per_dim,
    )  # (1, n_patches_x, n_patches_y, dim)
    # Get the similarity map
    similarity_map = torch.einsum(
        "bnk,bijk->bnij", output_text, output_image
    )  # (1, query_tokens, n_patches_x, n_patches_y)

    # Normalize the similarity map
    similarity_map_normalized = normalize_similarity_map_per_query_token(
        similarity_map
    )  # (1, query_tokens, n_patches_x, n_patches_y)
    # Use the printed mapping below to choose a token using its index
    query_tokens = processor.tokenizer.tokenize(
        processor.decode(input_text_processed.input_ids[0])
    )
    # Choose a token
    token_idx = (
        10  # e.g. if "12: '▁Kazakhstan'," is printed below, set 12 to choose the token 'Kazakhstan'
    )
    selected_token = processor.decode(input_text_processed.input_ids[0, token_idx])
    # strip whitespace
    selected_token = selected_token.strip()
    print(f"Selected token: `{selected_token}`")
    # Print the token-to-index mapping so the index above can be chosen
    pprint.pprint({idx: val for idx, val in enumerate(query_tokens)})
    # Resize the image to square
    input_image_square = image.resize((vit_config.resolution, vit_config.resolution))

    # Plot the similarity map
    fig, ax = plot_similarity_heatmap(
        input_image_square,
        patch_size=vit_config.patch_size,
        image_resolution=vit_config.resolution,
        similarity_map=similarity_map_normalized[0, token_idx, :, :],
    )
    # annotate_plot expects (ax, query, selected_token)
    ax = annotate_plot(ax, query, selected_token)
    return fig, ax

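# Shape note for gen_similarity_map above (illustrative; exact sizes depend on
# the VIT_CONFIG entry): with a 448x448 input and 14-pixel patches there are
# 32x32 = 1024 image patches, so after cropping to image_seq_length the einsum
# combines (1, n_query_tokens, 128) with (1, 32, 32, 128) into a
# (1, n_query_tokens, 32, 32) similarity map, i.e. one heatmap per query token.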
# def save_figure(fig, filename: str = "similarity_map.png"):
#     fig.savefig(
#         OUTPUT_DIR / filename,
#         bbox_inches="tight",
#         pad_inches=0,
#     )


def annotate_plot(ax, query, selected_token):
    # Add the query text
    ax.set_title(query, fontsize=18)
    # Add annotation with selected token
    ax.annotate(
        f"Selected token:`{selected_token}`",
        xy=(0.5, 0.95),
        xycoords="axes fraction",
        ha="center",
        va="center",
        fontsize=18,
        color="black",
        bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="black", lw=1),
    )
    return ax

def gen_similarity_map_new(
    processor: ColPaliProcessor,
    model: ColPali,
    device,
    vit_config,
    query: str,
    query_embs: torch.Tensor,
    token_idx_map: dict,
    token_to_show: str,
    image: Union[Path, str],
):
    if isinstance(image, Path):
        # image is a file path
        try:
            image = Image.open(image)
        except Exception as e:
            raise ValueError(f"Failed to open image from path: {e}")
    elif isinstance(image, str):
        # image is b64 string
        try:
            image = Image.open(BytesIO(base64.b64decode(image)))
        except Exception as e:
            raise ValueError(f"Failed to open image from b64: {e}")
    token_idx = token_idx_map[token_to_show]
    print(f"Selected token: `{token_to_show}`")
    # Preview the image
    # scale_image(image, 512)
    # Preprocess inputs
    input_image_processed = processor.process_images([image]).to(device)
    # Forward pass
    with torch.no_grad():
        output_image = model.forward(**input_image_processed)
    # output_image is the tensor that we could get from the Vespa query
    # Output image shape: torch.Size([1, 1030, 128])
    # Remove the special tokens from the output
    print(f"Output image shape before dim: {output_image.shape}")
    output_image = output_image[
        :, : processor.image_seq_length, :
    ]  # (1, n_patches_x * n_patches_y, dim)
    print(f"Output image shape after dim: {output_image.shape}")
    # Rearrange the output image tensor to explicitly represent the 2D grid of patches
    output_image = rearrange(
        output_image,
        "b (h w) c -> b h w c",
        h=vit_config.n_patch_per_dim,
        w=vit_config.n_patch_per_dim,
    )  # (1, n_patches_x, n_patches_y, dim)
    # Get the similarity map
    print(f"Query embs shape: {query_embs.shape}")
    # Add a batch dimension to the precomputed query embeddings
    query_embs = query_embs.unsqueeze(0).to(device)
    print(f"Output image shape: {output_image.shape}")
    similarity_map = torch.einsum(
        "bnk,bijk->bnij", query_embs, output_image
    )  # (1, query_tokens, n_patches_x, n_patches_y)
    print(f"Similarity map shape: {similarity_map.shape}")
    # Normalize the similarity map
    similarity_map_normalized = normalize_similarity_map_per_query_token(
        similarity_map
    )  # (1, query_tokens, n_patches_x, n_patches_y)
    print(f"Similarity map normalized shape: {similarity_map_normalized.shape}")
    # Resize the image to square
    input_image_square = image.resize((vit_config.resolution, vit_config.resolution))

    # Plot the similarity map
    fig, ax = plot_similarity_heatmap(
        input_image_square,
        patch_size=vit_config.patch_size,
        image_resolution=vit_config.resolution,
        similarity_map=similarity_map_normalized[0, token_idx, :, :],
    )
    ax = annotate_plot(ax, query, token_to_show)
    # Saving to disk is disabled; re-enable together with save_figure above if needed.
    # save_figure(fig, f"similarity_map_{token_to_show}.png")
    return fig, ax

def get_query_embeddings_and_token_map(
    processor, model, query, image
) -> Tuple[torch.Tensor, dict]:
    inputs = processor.process_queries([query]).to(model.device)
    with torch.no_grad():
        embeddings_query = model(**inputs)
        q_emb = embeddings_query.to("cpu")[0]  # Extract the single embedding
    # Use the printed token list to inspect the available tokens
    query_tokens = processor.tokenizer.tokenize(processor.decode(inputs.input_ids[0]))
    # Map each token string to its index in the query embedding
    print(query_tokens)
    token_to_idx = {val: idx for idx, val in enumerate(query_tokens)}
    return q_emb, token_to_idx

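# Illustration for get_query_embeddings_and_token_map above (not executed):
# q_emb holds one 128-dim vector per query token, and token_to_idx maps each
# token string to its row, e.g. roughly {"<bos>": 0, ..., "▁water": ...}; the
# exact token strings depend on the tokenizer, and duplicate tokens keep only
# their last index because later entries overwrite earlier ones.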
def format_query_results(query, response, hits=5) -> dict:
    query_time = response.json.get("timing", {}).get("searchtime", -1)
    query_time = round(query_time, 2)
    count = response.json.get("root", {}).get("fields", {}).get("totalCount", 0)
    result_text = f"Query text: '{query}', query time {query_time}s, count={count}, top results:\n"
    print(result_text)
    return response.json


async def query_vespa_default(
    app: Vespa,
    query: str,
    q_emb: torch.Tensor,
    hits: int = 3,
    timeout: str = "10s",
    **kwargs,
) -> dict:
    async with app.asyncio(connections=1, total_timeout=120) as session:
        query_embedding = format_q_embs(q_emb)
        response: VespaQueryResponse = await session.query(
            body={
                "yql": "select id,title,url,image,page_number,text from pdf_page where userQuery();",
                "ranking": "default",
                "query": query,
                "timeout": timeout,
                "hits": hits,
                "input.query(qt)": query_embedding,
                "presentation.timing": True,
                **kwargs,
            },
        )
        assert response.is_successful(), response.json
        return format_query_results(query, response)

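# Usage sketch for query_vespa_default above (illustrative; the Vespa application
# and its "default" rank profile live in the accompanying application package,
# and the endpoint below is an assumption):
#     app = Vespa(url="http://localhost", port=8080)
#     result = asyncio.run(query_vespa_default(app, query, q_embs, hits=3))
# get_result_from_query further down awaits it directly instead of using asyncio.run.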
def float_to_binary_embedding(float_query_embedding: dict) -> dict:
    binary_query_embeddings = {}
    for k, v in float_query_embedding.items():
        binary_vector = (
            np.packbits(np.where(np.array(v) > 0, 1, 0)).astype(np.int8).tolist()
        )
        binary_query_embeddings[k] = binary_vector
        if len(binary_query_embeddings) >= MAX_QUERY_TERMS:
            print(f"Warning: Query has more than {MAX_QUERY_TERMS} terms. Truncating.")
            break
    return binary_query_embeddings

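# Illustration for float_to_binary_embedding above (not executed): each 128-dim
# float vector is thresholded at 0 and bit-packed, so it becomes 128 / 8 = 16
# int8 values, e.g.
#     {0: [0.12, -0.3, ...]}  ->  {0: [-73, 12, ...]}   # values are made up
# These packed vectors feed the int8 query tensors (qtb, rq{i}) sent to Vespa below.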
def create_nn_query_strings(
    binary_query_embeddings: dict, target_hits_per_query_tensor: int = 20
) -> Tuple[str, dict]:
    # Query tensors for nearest neighbor calculations
    nn_query_dict = {}
    for i in range(len(binary_query_embeddings)):
        nn_query_dict[f"input.query(rq{i})"] = binary_query_embeddings[i]
    nn = " OR ".join(
        [
            f"({{targetHits:{target_hits_per_query_tensor}}}nearestNeighbor(embedding,rq{i}))"
            for i in range(len(binary_query_embeddings))
        ]
    )
    return nn, nn_query_dict

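# Illustration for create_nn_query_strings above (not executed): with two query
# token embeddings it returns a YQL fragment like
#     ({targetHits:20}nearestNeighbor(embedding,rq0)) OR ({targetHits:20}nearestNeighbor(embedding,rq1))
# plus the matching {"input.query(rq0)": ..., "input.query(rq1)": ...} tensors,
# i.e. one nearestNeighbor operator per query token, OR-ed together.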
def format_q_embs(q_embs: torch.Tensor) -> dict:
    float_query_embedding = {k: v.tolist() for k, v in enumerate(q_embs)}
    return float_query_embedding


async def query_vespa_nearest_neighbor(
    app: Vespa,
    query: str,
    q_emb: torch.Tensor,
    target_hits_per_query_tensor: int = 20,
    hits: int = 3,
    timeout: str = "10s",
    **kwargs,
) -> dict:
    # target_hits_per_query_tensor is a hyperparameter that trades speed for accuracy
    async with app.asyncio(connections=1, total_timeout=180) as session:
        float_query_embedding = format_q_embs(q_emb)
        binary_query_embeddings = float_to_binary_embedding(float_query_embedding)

        # Mixed tensors for MaxSim calculations
        query_tensors = {
            "input.query(qtb)": binary_query_embeddings,
            "input.query(qt)": float_query_embedding,
        }
        nn_string, nn_query_dict = create_nn_query_strings(
            binary_query_embeddings, target_hits_per_query_tensor
        )
        query_tensors.update(nn_query_dict)
        response: VespaQueryResponse = await session.query(
            body={
                **query_tensors,
                "presentation.timing": True,
                "yql": f"select id,title,text,url,image,page_number from pdf_page where {nn_string}",
                "ranking.profile": "retrieval-and-rerank",
                "timeout": timeout,
                "hits": hits,
                **kwargs,
            },
        )
        assert response.is_successful(), response.json
        return format_query_results(query, response)

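# Note on query_vespa_nearest_neighbor above: candidate retrieval uses the
# per-token binary tensors (rq{i}) through nearestNeighbor, while both the
# binary (qtb) and float (qt) query tensors are passed along so that the
# "retrieval-and-rerank" rank profile (defined in the Vespa application schema,
# not in this file) can re-score the candidates with a full MaxSim pass.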
def is_special_token(token: str) -> bool:
    # Pattern for tokens that start with '<', numbers, whitespace, or single characters
    pattern = re.compile(r"^<.*$|^\d+$|^\s+$|^.$")
    if pattern.match(token):
        return True
    return False

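# Illustration for is_special_token above (not executed): it returns True for
# e.g. "<bos>", "2024", a lone "?" or any other single character, and False for
# ordinary word pieces such as "water". Tokens flagged here are skipped when
# generating per-token similarity maps below.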
async def get_result_from_query(
    app: Vespa,
    processor: ColPaliProcessor,
    model: ColPali,
    query: str,
    nn=False,
    gen_sim_map=False,
):
    # Get the query embeddings and token map
    print(query)
    q_embs, token_to_idx = get_query_embeddings_and_token_map(
        processor, model, query, dummy_image
    )
    print(token_to_idx)

    if nn:
        result = await query_vespa_nearest_neighbor(app, query, q_embs)
    else:
        result = await query_vespa_default(app, query, q_embs)
    # Print score, title and id of the results
    for idx, child in enumerate(result["root"]["children"]):
        print(
            f"Result {idx+1}: {child['relevance']}, {child['fields']['title']}, {child['fields']['id']}"
        )

    if gen_sim_map:
        for single_result in result["root"]["children"]:
            img = single_result["fields"]["image"]
            for token in token_to_idx:
                if is_special_token(token):
                    print(f"Skipping special token: {token}")
                    continue
                fig, ax = gen_similarity_map_new(
                    processor,
                    model,
                    model.device,
                    load_vit_config(model),
                    query,
                    q_embs,
                    token_to_idx,
                    token,
                    img,
                )
                # Render the figure before reading the canvas, then base64-encode
                # the raw RGB buffer for downstream display
                fig.canvas.draw()
                sim_map = base64.b64encode(fig.canvas.tostring_rgb()).decode("utf-8")
                single_result["fields"][f"sim_map_{token}"] = sim_map
    return result

def get_result_dummy(query: str, nn: bool = False):
    result = {}
    result["timing"] = {}
    result["timing"]["querytime"] = 0.23700000000000002
    result["timing"]["summaryfetchtime"] = 0.001
    result["timing"]["searchtime"] = 0.23900000000000002
    result["root"] = {}
    result["root"]["id"] = "toplevel"
    result["root"]["relevance"] = 1
    result["root"]["fields"] = {}
    result["root"]["fields"]["totalCount"] = 59
    result["root"]["coverage"] = {}
    result["root"]["coverage"]["coverage"] = 100
    result["root"]["coverage"]["documents"] = 155
    result["root"]["coverage"]["full"] = True
    result["root"]["coverage"]["nodes"] = 1
    result["root"]["coverage"]["results"] = 1
    result["root"]["coverage"]["resultsFull"] = 1
    result["root"]["children"] = []
    elt0 = {}
    elt0["id"] = "index:colpalidemo_content/0/424c85e7dece761d226f060f"
    elt0["relevance"] = 2354.050122871995
    elt0["source"] = "colpalidemo_content"
    elt0["fields"] = {}
    elt0["fields"]["id"] = "a767cb1868be9a776cd56b768347b089"
    elt0["fields"]["url"] = (
        "https://static.conocophillips.com/files/resources/conocophillips-2023-sustainability-report.pdf"
    )
    elt0["fields"]["title"] = "ConocoPhillips 2023 Sustainability Report"
    elt0["fields"]["page_number"] = 50
    elt0["fields"]["image"] = "empty for now - is base64 encoded image"
    result["root"]["children"].append(elt0)
    elt1 = {}
    elt1["id"] = "index:colpalidemo_content/0/b927c4979f0beaf0d7fab8e9"
    elt1["relevance"] = 2313.7529950886965
    elt1["source"] = "colpalidemo_content"
    elt1["fields"] = {}
    elt1["fields"]["id"] = "9f2fc0aa02c9561adfaa1451c875658f"
    elt1["fields"]["url"] = (
        "https://static.conocophillips.com/files/resources/conocophillips-2023-managing-climate-related-risks.pdf"
    )
    elt1["fields"]["title"] = "ConocoPhillips Managing Climate Related Risks"
    elt1["fields"]["page_number"] = 44
    elt1["fields"]["image"] = "empty for now - is base64 encoded image"
    result["root"]["children"].append(elt1)
    elt2 = {}
    elt2["id"] = "index:colpalidemo_content/0/9632d72238829d6afefba6c9"
    elt2["relevance"] = 2312.230182081461
    elt2["source"] = "colpalidemo_content"
    elt2["fields"] = {}
    elt2["fields"]["id"] = "d638ded1ddcb446268b289b3f65430fd"
    elt2["fields"]["url"] = (
        "https://static.conocophillips.com/files/resources/24-0976-sustainability-highlights_nature.pdf"
    )
    elt2["fields"]["title"] = (
        "ConocoPhillips Sustainability Highlights - Nature (24-0976)"
    )
    elt2["fields"]["page_number"] = 0
    elt2["fields"]["image"] = "empty for now - is base64 encoded image"
    result["root"]["children"].append(elt2)
    return result


if __name__ == "__main__":
    model, processor = load_model()
    vit_config = load_vit_config(model)
    query = "How many percent of source water is fresh water?"
    image_filepath = (
        Path(__file__).parent.parent
        / "static"
        / "assets"
        / "ConocoPhillips Sustainability Highlights - Nature (24-0976).png"
    )
    gen_similarity_map(
        model, processor, model.device, vit_config, query=query, image=image_filepath
    )
    result = get_result_dummy("dummy query")
    print(result)
    print("Done")