praeclarumjj3 committed on
Commit
9fa3d89
1 Parent(s): d62bbd6

:zap: add code

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. README.md +4 -4
  2. app.py +471 -48
  3. demo.py +486 -0
  4. ola_vlm/.DS_Store +0 -0
  5. ola_vlm/__init__.py +2 -0
  6. ola_vlm/constants.py +13 -0
  7. ola_vlm/conversation.py +255 -0
  8. ola_vlm/eval/.DS_Store +0 -0
  9. ola_vlm/eval/eval_cv_bench.py +78 -0
  10. ola_vlm/eval/eval_mmstar.py +17 -0
  11. ola_vlm/eval/eval_probe_task.py +223 -0
  12. ola_vlm/eval/eval_sherlock_dsg.py +282 -0
  13. ola_vlm/eval/get_all_stats.py +132 -0
  14. ola_vlm/eval/get_probe_task_scores.py +197 -0
  15. ola_vlm/eval/get_sherlock_dsg_scores.py +49 -0
  16. ola_vlm/eval/merge_json.py +30 -0
  17. ola_vlm/eval/mmstar/evaluate/__init__.py +1 -0
  18. ola_vlm/eval/mmstar/evaluate/__pycache__/__init__.cpython-310.pyc +0 -0
  19. ola_vlm/eval/mmstar/evaluate/__pycache__/mmstar.cpython-310.pyc +0 -0
  20. ola_vlm/eval/mmstar/evaluate/mmstar.py +87 -0
  21. ola_vlm/eval/mmstar/smp/__init__.py +3 -0
  22. ola_vlm/eval/mmstar/smp/__pycache__/__init__.cpython-310.pyc +0 -0
  23. ola_vlm/eval/mmstar/smp/__pycache__/file.cpython-310.pyc +0 -0
  24. ola_vlm/eval/mmstar/smp/__pycache__/log.cpython-310.pyc +0 -0
  25. ola_vlm/eval/mmstar/smp/__pycache__/misc.cpython-310.pyc +0 -0
  26. ola_vlm/eval/mmstar/smp/__pycache__/vlm.cpython-310.pyc +0 -0
  27. ola_vlm/eval/mmstar/smp/file.py +147 -0
  28. ola_vlm/eval/mmstar/smp/log.py +43 -0
  29. ola_vlm/eval/mmstar/smp/misc.py +174 -0
  30. ola_vlm/eval/model_cvbench_loader.py +166 -0
  31. ola_vlm/eval/model_mmstar_loader.py +164 -0
  32. ola_vlm/mm_utils.py +398 -0
  33. ola_vlm/model/.DS_Store +0 -0
  34. ola_vlm/model/__init__.py +5 -0
  35. ola_vlm/model/apply_delta.py +48 -0
  36. ola_vlm/model/aux_heads/.DS_Store +0 -0
  37. ola_vlm/model/aux_heads/__init__.py +3 -0
  38. ola_vlm/model/aux_heads/da_v2_head.py +457 -0
  39. ola_vlm/model/aux_heads/depth_anything_v2/dinov2.py +415 -0
  40. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/__init__.py +11 -0
  41. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/attention.py +83 -0
  42. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/block.py +252 -0
  43. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
  44. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/layer_scale.py +28 -0
  45. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/mlp.py +41 -0
  46. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/patch_embed.py +90 -0
  47. ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/swiglu_ffn.py +63 -0
  48. ola_vlm/model/aux_heads/depth_anything_v2/dpt.py +219 -0
  49. ola_vlm/model/aux_heads/depth_anything_v2/util/blocks.py +148 -0
  50. ola_vlm/model/aux_heads/depth_anything_v2/util/transform.py +158 -0
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: OLA VLM
- emoji: 💬
- colorFrom: yellow
+ title: OLA-VLM
+ emoji: 🔍
+ colorFrom: blue
  colorTo: purple
  sdk: gradio
- sdk_version: 5.0.1
+ sdk_version: 4.16.0
  app_file: app.py
  pinned: false
  license: apache-2.0
app.py CHANGED
@@ -1,64 +1,487 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
          temperature=temperature,
          top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
  """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
-
- if __name__ == "__main__":
-     demo.launch()
1
  import gradio as gr
2
+ import spaces
3
+ import torch
4
+ import numpy as np
5
 
6
+ from ola_vlm.constants import DEFAULT_IMAGE_TOKEN
7
+
8
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
9
+ from ola_vlm.conversation import conv_templates, SeparatorStyle
10
+ from ola_vlm.model.builder import load_pretrained_model
11
+ from ola_vlm.mm_utils import tokenizer_image_token, get_model_name_from_path, process_images
12
+
13
+ from diffusers import StableUnCLIPImg2ImgPipeline
14
+ from diffusers import DPMSolverMultistepScheduler
15
+ from transformers import OneFormerProcessor
16
+ from ola_vlm.model.aux_heads.oneformer_head import OneFormerHead
17
+ from ola_vlm.ola_utils import visualize_oneformer_masks_on_image, oneformer_prepare_panoptic_instance_prediction
18
+ import matplotlib
19
+ from PIL import Image, ImageDraw, ImageFont
20
+ import argparse
21
+ import math
22
+
23
+ from transformers import TextIteratorStreamer
24
+ from threading import Thread
25
+
26
+ def make_grid(pil_images, layer_indices=None):
27
+ new_images = []
28
+ new_captions = []
29
+
30
+ # Resize images and prepare captions
31
+ for i, pil_image in enumerate(pil_images):
32
+ pil_image = pil_image.resize((256, 256))
33
+ new_images.append(pil_image)
34
+ if layer_indices is not None:
35
+ new_captions.append(f"Layer: {layer_indices[i]}")
36
+ else:
37
+ new_captions.append(f"Layer: {i+1}")
38
+
39
+ images = new_images
40
+ captions = new_captions
41
+
42
+ width, height = images[0].size
43
+ font_size = 18
44
+
45
+ # Calculate the number of rows and columns for the grid
46
+ images_per_row = min(len(images), 4) # Max 4 images per row
47
+ row_count = math.ceil(len(images) / images_per_row)
48
+ total_width = width * images_per_row
49
+ total_height = height * row_count
50
+
51
+ # Create a new blank image
52
+ new_image = Image.new("RGB", (total_width, total_height), "white")
53
+ draw = ImageDraw.Draw(new_image)
54
+
55
+ # Load a default font
56
+ try:
57
+ font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
58
+ except:
59
+ font = ImageFont.load_default()
60
+
61
+ # Place images and captions in the grid
62
+ for i, (image, caption) in enumerate(zip(images, captions)):
63
+ row = i // images_per_row
64
+ col = i % images_per_row
65
+ x_offset = col * width
66
+ y_offset = row * height
67
+
68
+ # Paste the image
69
+ new_image.paste(image, (x_offset, y_offset))
70
+
71
+ # Calculate text and background positions
72
+ text_width, text_height = draw.textsize(caption, font=font)
73
+ text_position = (x_offset + 10, y_offset + height - text_height - 10)
74
+ background_position = (
75
+ text_position[0] - 5,
76
+ text_position[1] - 5,
77
+ text_position[0] + text_width + 5,
78
+ text_position[1] + text_height + 5,
79
+ )
80
+
81
+ # Draw background rectangle and text
82
+ draw.rectangle(background_position, fill="white", outline="black")
83
+ draw.text(text_position, caption, fill="black", font=font)
84
+
85
+ return new_image
86
+
87
+ def reload_from_ckpt(model_path, model, cache_dir=None):
88
+ import os
89
+ from safetensors import safe_open
90
+ from huggingface_hub import hf_hub_download, list_repo_files
91
+
92
+ state_dict = {}
93
+
94
+ # Check if the path is a local directory or HF Hub model
95
+ if os.path.isdir(model_path):
96
+ # Local directory: Load safetensors files
97
+ safetensors_paths = [os.path.join(model_path, f) for f in os.listdir(model_path) if f.endswith('.safetensors')]
98
+ else:
99
+ # HF Hub: Get list of safetensors files and download them
100
+ repo_files = list_repo_files(model_path)
101
+ safetensors_paths = [
102
+ hf_hub_download(model_path, file_name, cache_dir=cache_dir)
103
+ for file_name in repo_files if file_name.endswith('.safetensors')
104
+ ]
105
+
106
+ # Load safetensors files into the state_dict
107
+ for path in safetensors_paths:
108
+ with safe_open(path, framework="pt", device="cpu") as f:
109
+ for key in f.keys():
110
+ state_dict[key] = f.get_tensor(key)
111
+
112
+ # Load the state dict into the model
113
+ model.load_state_dict(state_dict, strict=False)
114
+ return model
115
+
116
+ # os.environ['GRADIO_TEMP_DIR'] = './gradio_tmp'
117
+ no_change_btn = gr.Button()
118
+ enable_btn = gr.Button(interactive=True)
119
+ disable_btn = gr.Button(interactive=False)
120
+
121
+ argparser = argparse.ArgumentParser()
122
+ argparser.add_argument("--server_name", default="0.0.0.0", type=str)
123
+ argparser.add_argument("--port", default="6324", type=str)
124
+ argparser.add_argument("--model-path", default="shi-labs/pretrain_dsg_OLA-VLM-CLIP-ViT-Llama3-8b", type=str)
125
+ argparser.add_argument("--model-base", type=str, default=None)
126
+ argparser.add_argument("--num-gpus", type=int, default=1)
127
+ argparser.add_argument("--conv-mode", type=str, default="llava_llama_3")
128
+ argparser.add_argument("--temperature", type=float, default=0.2)
129
+ argparser.add_argument("--max-new-tokens", type=int, default=512)
130
+ argparser.add_argument("--num_frames", type=int, default=16)
131
+ argparser.add_argument("--load-8bit", action="store_true")
132
+ argparser.add_argument("--load-4bit", action="store_true")
133
+ argparser.add_argument("--debug", action="store_true")
134
+
135
+ args = argparser.parse_args()
136
+ model_path = args.model_path
137
+ conv_mode = args.conv_mode
138
+ filt_invalid="cut"
139
+ model_name = get_model_name_from_path(args.model_path)
140
+ tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit)
141
+ model = reload_from_ckpt("shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b", model)
142
+ our_chatbot = None
143
+
144
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(f"stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variant="fp16")
145
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
146
+ pipe = pipe.to("cuda")
147
+
148
+ oneformer_processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_coco_swin_large")
149
+ oneformer = OneFormerHead.from_pretrained("shi-labs/oneformer_coco_swin_large").to("cuda")
150
+
151
+ gen_layer_indices = model.config.image_gen["img_layer_indices"].split("-")
152
+ seg_layer_indices = model.config.image_seg["seg_layer_indices"].split("-")
153
+ depth_layer_indices = model.config.image_depth["depth_layer_indices"].split("-")
154
+
155
+
156
+ def clear_history():
157
+ state = conv_templates[conv_mode].copy()
158
+ return (state, state.to_gradio_chatbot(), "", None, None, None, None) + (disable_btn,) * 5
159
+
160
+ def add_text(state, imagebox, textbox, image_process_mode):
161
+ if state is None:
162
+ state = conv_templates[conv_mode].copy()
163
+
164
+ if imagebox is not None:
165
+ textbox = DEFAULT_IMAGE_TOKEN + '\n' + textbox
166
+ image = Image.open(imagebox).convert('RGB')
167
+
168
+ if imagebox is not None:
169
+ textbox = (textbox, image, image_process_mode)
170
+
171
+ state.append_message(state.roles[0], textbox)
172
+ state.append_message(state.roles[1], None)
173
+
174
+ yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
175
+
176
+ def get_gen_images(out):
177
+ img_embeds = out.image_embs
178
+ if len(img_embeds) == 0:
179
+ return None
180
+ images = []
181
+ for img_embed in img_embeds:
182
+ gen_image = pipe(image_embeds=img_embed.squeeze(1),
183
+ num_inference_steps=25,
184
+ ).images[0]
185
+ images.append(gen_image)
186
+ grid_image = make_grid(images, gen_layer_indices)
187
+ return grid_image
188
+
189
+ def get_depth_images(out, org_size):
190
+ depth_preds = out.depth_preds
191
 
192
+ if len(depth_preds) == 0:
193
+ return None
194
+ depths = []
195
 
196
+ for i, depth_pred in enumerate(depth_preds):
197
+ depth = (depth_pred - depth_pred.min()) / (depth_pred.max() - depth_pred.min()) * 255.0
198
+ depth = depth.squeeze(0).cpu().numpy()
199
+ depth = depth.astype(np.uint8)
200
+ cmap = matplotlib.colormaps.get_cmap('Spectral_r')
201
+ depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
202
+ depth = Image.fromarray(depth)
203
+ depth = depth.resize(org_size)
204
+ depths.append(depth)
205
+ grid_image = make_grid(depths, depth_layer_indices)
206
+ return grid_image
207
 
208
+ def get_seg_images(out, image):
209
+ seg_embs = out.seg_embs
210
+
211
+ if len(seg_embs) == 0:
212
+ return None
213
+
214
+ seg_preds = []
215
+ inputs = oneformer_processor(image, ["semantic"], return_tensors="pt")
216
+ inputs["pixel_values"] = inputs["pixel_values"].to(out.logits.device, out.logits.dtype)
217
+ inputs["task_inputs"] = inputs["task_inputs"].to(out.logits.device, out.logits.dtype)
218
+ backbone_features = oneformer.get_backbone_feats(**inputs)
219
+ for i, seg_emb in enumerate(seg_embs):
220
+ pred = oneformer.get_masks(**inputs, backbone_last_feature=seg_emb.float(), all_backbone_features=backbone_features)
221
+ pred = oneformer_processor.post_process_panoptic_segmentation(
222
+ pred, target_sizes=[image.size[::-1]]
223
+ )[0]
224
+ pred_msk, pred_cls = oneformer_prepare_panoptic_instance_prediction(**pred, oneformer=oneformer)
225
+ pred = visualize_oneformer_masks_on_image(image, pred_msk, pred_cls)
226
+ seg_preds.append(pred)
227
+ grid_image = make_grid(seg_preds, seg_layer_indices)
228
+ return grid_image
229
 
230
+ def delete_text(state, image_process_mode):
231
+ state.messages[-1][-1] = None
232
+ prev_human_msg = state.messages[-2]
233
+ if type(prev_human_msg[1]) in (tuple, list):
234
+ prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
235
+ yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
236
 
237
+ def regenerate(state, image_process_mode):
238
+ state.messages[-1][-1] = None
239
+ prev_human_msg = state.messages[-2]
240
+ if type(prev_human_msg[1]) in (tuple, list):
241
+ prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
242
+ state.skip_next = False
243
+ return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
244
 
245
+ @spaces.GPU
246
+ def get_interm_outs(state):
247
+ prompt = state.get_prompt()
248
+ images = state.get_images(return_pil=True)
249
+ #prompt, image_args = process_image(prompt, images)
250
+
251
+ if images is not None and len(images) > 0:
252
+ if len(images) > 0:
253
+ if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
254
+ raise ValueError("Number of images does not match number of <image> tokens in prompt")
255
+
256
+ #images = [load_image_from_base64(image) for image in images]
257
+ image_sizes = [image.size for image in images]
258
+ inp_images = process_images(images, image_processor, model.config)
259
+
260
+ if type(inp_images) is list:
261
+ inp_images = [image.to(model.device, dtype=torch.float16) for image in images]
262
+ else:
263
+ inp_images = inp_images.to(model.device, dtype=torch.float16)
264
+ else:
265
+ inp_images = None
266
+ image_sizes = None
267
+ image_args = {"images": inp_images, "image_sizes": image_sizes}
268
+ else:
269
+ inp_images = None
270
+ image_args = {}
271
+
272
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
273
+
274
+ interm_outs = model.get_visual_interpretations(
275
+ input_ids,
276
+ **image_args
277
+ )
278
+
279
+ depth_outs = get_depth_images(interm_outs, image_sizes[0])
280
+ seg_outs = get_seg_images(interm_outs, images[0])
281
+ gen_outs = get_gen_images(interm_outs)
282
+
283
+ return depth_outs, seg_outs, gen_outs
284
+
285
+ @spaces.GPU
286
+ def generate(state, temperature, top_p, max_output_tokens):
287
+ prompt = state.get_prompt()
288
+ images = state.get_images(return_pil=True)
289
+ #prompt, image_args = process_image(prompt, images)
290
+
291
+ ori_prompt = prompt
292
+ num_image_tokens = 0
293
+
294
+ if images is not None and len(images) > 0:
295
+ if len(images) > 0:
296
+ if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
297
+ raise ValueError("Number of images does not match number of <image> tokens in prompt")
298
+
299
+ #images = [load_image_from_base64(image) for image in images]
300
+ image_sizes = [image.size for image in images]
301
+ images = process_images(images, image_processor, model.config)
302
+
303
+ if type(images) is list:
304
+ images = [image.to(model.device, dtype=torch.float16) for image in images]
305
+ else:
306
+ images = images.to(model.device, dtype=torch.float16)
307
+ else:
308
+ images = None
309
+ image_sizes = None
310
+ image_args = {"images": images, "image_sizes": image_sizes}
311
+ else:
312
+ images = None
313
+ image_args = {}
314
+
315
+ max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
316
+ max_new_tokens = max_output_tokens
317
+ do_sample = True if temperature > 0.001 else False
318
+ stop_str = state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2
319
+
320
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
321
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
322
+
323
+ max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)
324
+
325
+ if max_new_tokens < 1:
326
+ return
327
+
328
+ thread = Thread(target=model.generate, kwargs=dict(
329
+ inputs=input_ids,
330
+ do_sample=do_sample,
331
  temperature=temperature,
332
  top_p=top_p,
333
+ max_new_tokens=max_new_tokens,
334
+ streamer=streamer,
335
+ use_cache=True,
336
+ pad_token_id=tokenizer.eos_token_id,
337
+ **image_args
338
+ ))
339
+ thread.start()
340
+ generated_text = ''
341
+ for new_text in streamer:
342
+ generated_text += new_text
343
+ if generated_text.endswith(stop_str):
344
+ generated_text = generated_text[:-len(stop_str)]
345
+ state.messages[-1][-1] = generated_text
346
+ yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
347
+
348
+ yield (state, state.to_gradio_chatbot(), "", None) + (enable_btn,) * 5
349
+
350
+ torch.cuda.empty_cache()
351
+
352
+ txt = gr.Textbox(
353
+ scale=4,
354
+ show_label=False,
355
+ placeholder="Enter text and press enter.",
356
+ container=False,
357
+ )
358
 
 
 
359
 
360
+ title = "<h1 style='margin-bottom: -10px; text-align: center'>OLA-VLM: Optimizing Language Model Representations for Enhanced Visual Quality and Alignment</h1>"
361
+ description = "<p style='font-size: 16px; margin: 5px; font-weight: w300; text-align: center'> <a href='https://praeclarumjj3.github.io/' style='text-decoration:none' target='_blank'>Jitesh Jain</a> &nbsp;&nbsp <a href='https://zyang-ur.github.io/' style='text-decoration:none' target='_blank'>Zhengyuan Yang</a> &nbsp;&nbsp <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Humphrey Shi<sup>*</sup></a> &nbsp;&nbsp <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Jianfeng Gao<sup>*</sup></a> &nbsp;&nbsp <a href='https://jwyang.github.io/' style='text-decoration:none' target='_blank'>Jianwei Yang<sup>*</sup></a></p>" \
362
+ + "<p style='font-size: 12px; margin: 5px; font-weight: w300; text-align: center'><sup>*</sup>Equal Advising</p>" \
363
+ + "<p style='font-size: 16px; margin: 5px; font-weight: w600; text-align: center'> <a href='https://praeclarumjj3.github.io/ola_vlm/' target='_blank'>Project Page</a> | <a href='https://youtu.be/' target='_blank'>Video</a> | <a href='https://arxiv.org/abs/' target='_blank'>ArXiv</a> | <a href='https://github.com/SHI-Labs/OLA-VLM' target='_blank'>Github</a></p>"
364
 
365
+ tos_markdown = ("""
366
+ ### Terms of use
367
+ By using this service, users are required to agree to the following terms:
368
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes.
369
+ """)
370
+
371
+
372
+ learn_more_markdown = ("""
373
+ ### License
374
+ The service is a research preview intended for non-commercial use only, subject to the [License](https://huggingface.co/lmsys/vicuna-7b-v1.5) of Vicuna-v1.5, [License](https://github.com/haotian-liu/LLaVA/blob/main/LICENSE) of LLaVA, [Terms of Use](https://cocodataset.org/#termsofuse) of the COCO dataset, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
375
+ """)
376
+
377
+ block_css = """
378
+ #buttons button {
379
+ min-width: min(120px,100%);
380
+ }
381
  """
382
 
383
 
384
+ textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
385
+ with gr.Blocks(title="OLA-VLM", theme=gr.themes.Default(), css=block_css) as demo:
386
+ state = gr.State()
387
+
388
+ gr.Markdown(title)
389
+ gr.Markdown(description)
390
+
391
+ with gr.Row():
392
+ with gr.Column(scale=4):
393
+ imagebox = gr.Image(label="Input Image", type="filepath")
394
+ image_process_mode = gr.Radio(
395
+ ["Crop", "Resize", "Pad", "Default"],
396
+ value="Default",
397
+ label="Preprocess for non-square image", visible=False)
398
+
399
+ # with gr.Accordion("Parameters", open=False) as parameter_row:
400
+ with gr.Row():
401
+ temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
402
+ top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
403
+ max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)
404
+
405
+ with gr.Column(scale=8):
406
+ chatbot = gr.Chatbot(
407
+ elem_id="chatbot",
408
+ label="OLA-VLM",
409
+ height=300,
410
+ layout="panel",
411
+ )
412
+ textbox.render()
413
+ with gr.Row(elem_id="buttons") as button_row:
414
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=False, visible=False)
415
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=False, visible=False)
416
+ flag_btn = gr.Button(value="⚠️ Flag", interactive=False, visible=False)
417
+ #stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
418
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
419
+ clear_btn = gr.Button(value="🗑️ Clear", interactive=False)
420
+ submit_btn = gr.Button(value="Send", variant="primary")
421
+
422
+ with gr.Accordion("Representations from selected layers of the LLM (expects only a single image input)", open=False) as interm_out:
423
+ inter_vis_btn = gr.Button(value="✨ Visualize")
424
+ with gr.Row():
425
+ depth_box = gr.Image(label="depth", type="pil", visible=True)
426
+ seg_box = gr.Image(label="seg", type="pil", visible=True)
427
+ gen_box = gr.Image(label="gen", type="pil", visible=True)
428
+
429
+ gr.Examples(examples=[
430
+ [f"assets/cars.jpg", "Which car is in front: the blue or the brown one?"],
431
+ [f"assets/pb.jpg", "Where is the bulding located with respect to the man?"],
432
+ ], inputs=[imagebox, textbox], cache_examples=False)
433
+
434
+ # gr.Markdown(tos_markdown)
435
+ # gr.Markdown(learn_more_markdown)
436
+ # url_params = gr.JSON(visible=False)
437
+
438
+ # Register listeners
439
+ btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
440
+
441
+ inter_vis_btn.click(
442
+ get_interm_outs,
443
+ [state],
444
+ [depth_box, seg_box, gen_box],
445
+ )
446
+
447
+ clear_btn.click(
448
+ clear_history,
449
+ None,
450
+ [state, chatbot, textbox, imagebox, depth_box, gen_box, seg_box] + btn_list,
451
+ queue=False
452
+ )
453
+
454
+ regenerate_btn.click(
455
+ delete_text,
456
+ [state, image_process_mode],
457
+ [state, chatbot, textbox, imagebox] + btn_list,
458
+ ).then(
459
+ generate,
460
+ [state, temperature, top_p, max_output_tokens],
461
+ [state, chatbot, textbox, imagebox] + btn_list,
462
+ )
463
+ textbox.submit(
464
+ add_text,
465
+ [state, imagebox, textbox, image_process_mode],
466
+ [state, chatbot, textbox, imagebox] + btn_list,
467
+ ).then(
468
+ generate,
469
+ [state, temperature, top_p, max_output_tokens],
470
+ [state, chatbot, textbox, imagebox] + btn_list,
471
+ )
472
+
473
+ submit_btn.click(
474
+ add_text,
475
+ [state, imagebox, textbox, image_process_mode],
476
+ [state, chatbot, textbox, imagebox] + btn_list,
477
+ ).then(
478
+ generate,
479
+ [state, temperature, top_p, max_output_tokens],
480
+ [state, chatbot, textbox, imagebox] + btn_list,
481
+ )
482
+
483
+ demo.queue(
484
+ status_update_rate=10,
485
+ api_open=False
486
+ ).launch(share=False)
487
+ demo.queue()
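Note on the new `generate()` above: it uses the standard `transformers` streaming recipe of running `model.generate` on a worker thread and iterating a `TextIteratorStreamer` on the main thread, which is what lets the Gradio chatbot update token by token. A minimal, self-contained sketch of just that pattern, using `gpt2` as a stand-in model (the real app streams from the OLA-VLM checkpoint and additionally passes image tensors through `**image_args`):

```python
# Minimal sketch of the threaded streaming pattern used by generate() above.
# "gpt2" is a placeholder model, not the OLA-VLM checkpoint.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "A chat between a curious user and an artificial intelligence assistant. USER: Hello! ASSISTANT:"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# skip_prompt drops the echoed prompt; the timeout guards against a stalled worker.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

thread = Thread(target=model.generate, kwargs=dict(
    inputs=input_ids,
    do_sample=False,
    max_new_tokens=64,
    streamer=streamer,
    use_cache=True,
    pad_token_id=tokenizer.eos_token_id,
))
thread.start()

generated_text = ""
for new_text in streamer:          # yields decoded chunks as the worker produces them
    generated_text += new_text
    print(new_text, end="", flush=True)
thread.join()
```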
demo.py ADDED
@@ -0,0 +1,486 @@
1
+ import gradio as gr
2
+ import os
3
+ import torch
4
+ import numpy as np
5
+
6
+ from ola_vlm.constants import DEFAULT_IMAGE_TOKEN
7
+
8
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
9
+ from ola_vlm.conversation import conv_templates, SeparatorStyle
10
+ from ola_vlm.model.builder import load_pretrained_model
11
+ from ola_vlm.mm_utils import tokenizer_image_token, get_model_name_from_path, process_images
12
+
13
+ from diffusers import StableUnCLIPImg2ImgPipeline
14
+ from diffusers import DPMSolverMultistepScheduler
15
+ from transformers import OneFormerProcessor
16
+ from ola_vlm.model.aux_heads.oneformer_head import OneFormerHead
17
+ from ola_vlm.ola_utils import visualize_oneformer_masks_on_image, oneformer_prepare_panoptic_instance_prediction
18
+ import matplotlib
19
+ from PIL import Image, ImageDraw, ImageFont
20
+ import argparse
21
+ import math
22
+
23
+ from transformers import TextIteratorStreamer
24
+ from threading import Thread
25
+
26
+ def make_grid(pil_images, layer_indices=None):
27
+ new_images = []
28
+ new_captions = []
29
+
30
+ # Resize images and prepare captions
31
+ for i, pil_image in enumerate(pil_images):
32
+ pil_image = pil_image.resize((256, 256))
33
+ new_images.append(pil_image)
34
+ if layer_indices is not None:
35
+ new_captions.append(f"Layer: {layer_indices[i]}")
36
+ else:
37
+ new_captions.append(f"Layer: {i+1}")
38
+
39
+ images = new_images
40
+ captions = new_captions
41
+
42
+ width, height = images[0].size
43
+ font_size = 18
44
+
45
+ # Calculate the number of rows and columns for the grid
46
+ images_per_row = min(len(images), 4) # Max 4 images per row
47
+ row_count = math.ceil(len(images) / images_per_row)
48
+ total_width = width * images_per_row
49
+ total_height = height * row_count
50
+
51
+ # Create a new blank image
52
+ new_image = Image.new("RGB", (total_width, total_height), "white")
53
+ draw = ImageDraw.Draw(new_image)
54
+
55
+ # Load a default font
56
+ try:
57
+ font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
58
+ except:
59
+ font = ImageFont.load_default()
60
+
61
+ # Place images and captions in the grid
62
+ for i, (image, caption) in enumerate(zip(images, captions)):
63
+ row = i // images_per_row
64
+ col = i % images_per_row
65
+ x_offset = col * width
66
+ y_offset = row * height
67
+
68
+ # Paste the image
69
+ new_image.paste(image, (x_offset, y_offset))
70
+
71
+ # Calculate text and background positions
72
+ text_width, text_height = draw.textsize(caption, font=font)
73
+ text_position = (x_offset + 10, y_offset + height - text_height - 10)
74
+ background_position = (
75
+ text_position[0] - 5,
76
+ text_position[1] - 5,
77
+ text_position[0] + text_width + 5,
78
+ text_position[1] + text_height + 5,
79
+ )
80
+
81
+ # Draw background rectangle and text
82
+ draw.rectangle(background_position, fill="white", outline="black")
83
+ draw.text(text_position, caption, fill="black", font=font)
84
+
85
+ return new_image
86
+
87
+ def reload_from_ckpt(model_path, model, cache_dir=None):
88
+ import os
89
+ from safetensors import safe_open
90
+ from huggingface_hub import hf_hub_download, list_repo_files
91
+
92
+ state_dict = {}
93
+
94
+ # Check if the path is a local directory or HF Hub model
95
+ if os.path.isdir(model_path):
96
+ # Local directory: Load safetensors files
97
+ safetensors_paths = [os.path.join(model_path, f) for f in os.listdir(model_path) if f.endswith('.safetensors')]
98
+ else:
99
+ # HF Hub: Get list of safetensors files and download them
100
+ repo_files = list_repo_files(model_path)
101
+ safetensors_paths = [
102
+ hf_hub_download(model_path, file_name, cache_dir=cache_dir)
103
+ for file_name in repo_files if file_name.endswith('.safetensors')
104
+ ]
105
+
106
+ # Load safetensors files into the state_dict
107
+ for path in safetensors_paths:
108
+ with safe_open(path, framework="pt", device="cpu") as f:
109
+ for key in f.keys():
110
+ state_dict[key] = f.get_tensor(key)
111
+
112
+ # Load the state dict into the model
113
+ model.load_state_dict(state_dict, strict=False)
114
+ return model
115
+
116
+ # os.environ['GRADIO_TEMP_DIR'] = './gradio_tmp'
117
+ no_change_btn = gr.Button()
118
+ enable_btn = gr.Button(interactive=True)
119
+ disable_btn = gr.Button(interactive=False)
120
+
121
+ argparser = argparse.ArgumentParser()
122
+ argparser.add_argument("--server_name", default="0.0.0.0", type=str)
123
+ argparser.add_argument("--port", default="6324", type=str)
124
+ argparser.add_argument("--model-path", default="shi-labs/pretrain_dsg_OLA-VLM-CLIP-ViT-Llama3-8b", type=str)
125
+ argparser.add_argument("--model-base", type=str, default=None)
126
+ argparser.add_argument("--num-gpus", type=int, default=1)
127
+ argparser.add_argument("--conv-mode", type=str, default="llava_llama_3")
128
+ argparser.add_argument("--temperature", type=float, default=0.2)
129
+ argparser.add_argument("--max-new-tokens", type=int, default=512)
130
+ argparser.add_argument("--num_frames", type=int, default=16)
131
+ argparser.add_argument("--load-8bit", action="store_true")
132
+ argparser.add_argument("--load-4bit", action="store_true")
133
+ argparser.add_argument("--debug", action="store_true")
134
+
135
+ args = argparser.parse_args()
136
+ model_path = args.model_path
137
+ conv_mode = args.conv_mode
138
+ filt_invalid="cut"
139
+ model_name = get_model_name_from_path(args.model_path)
140
+ tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit)
141
+ model = reload_from_ckpt("shi-labs/OLA-VLM-CLIP-ViT-Llama3-8b", model)
142
+ our_chatbot = None
143
+
144
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(f"stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variant="fp16")
145
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
146
+ pipe = pipe.to("cuda")
147
+
148
+ oneformer_processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_coco_swin_large")
149
+ oneformer = OneFormerHead.from_pretrained("shi-labs/oneformer_coco_swin_large").to("cuda")
150
+
151
+ gen_layer_indices = model.config.image_gen["img_layer_indices"].split("-")
152
+ seg_layer_indices = model.config.image_seg["seg_layer_indices"].split("-")
153
+ depth_layer_indices = model.config.image_depth["depth_layer_indices"].split("-")
154
+
155
+
156
+ def clear_history():
157
+ state = conv_templates[conv_mode].copy()
158
+ return (state, state.to_gradio_chatbot(), "", None, None, None, None) + (disable_btn,) * 5
159
+
160
+ def add_text(state, imagebox, textbox, image_process_mode):
161
+ if state is None:
162
+ state = conv_templates[conv_mode].copy()
163
+
164
+ if imagebox is not None:
165
+ textbox = DEFAULT_IMAGE_TOKEN + '\n' + textbox
166
+ image = Image.open(imagebox).convert('RGB')
167
+
168
+ if imagebox is not None:
169
+ textbox = (textbox, image, image_process_mode)
170
+
171
+ state.append_message(state.roles[0], textbox)
172
+ state.append_message(state.roles[1], None)
173
+
174
+ yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
175
+
176
+ def get_gen_images(out):
177
+ img_embeds = out.image_embs
178
+ if len(img_embeds) == 0:
179
+ return None
180
+ images = []
181
+ for img_embed in img_embeds:
182
+ gen_image = pipe(image_embeds=img_embed.squeeze(1),
183
+ num_inference_steps=25,
184
+ ).images[0]
185
+ images.append(gen_image)
186
+ grid_image = make_grid(images, gen_layer_indices)
187
+ return grid_image
188
+
189
+ def get_depth_images(out, org_size):
190
+ depth_preds = out.depth_preds
191
+
192
+ if len(depth_preds) == 0:
193
+ return None
194
+ depths = []
195
+
196
+ for i, depth_pred in enumerate(depth_preds):
197
+ depth = (depth_pred - depth_pred.min()) / (depth_pred.max() - depth_pred.min()) * 255.0
198
+ depth = depth.squeeze(0).cpu().numpy()
199
+ depth = depth.astype(np.uint8)
200
+ cmap = matplotlib.colormaps.get_cmap('Spectral_r')
201
+ depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
202
+ depth = Image.fromarray(depth)
203
+ depth = depth.resize(org_size)
204
+ depths.append(depth)
205
+ grid_image = make_grid(depths, depth_layer_indices)
206
+ return grid_image
207
+
208
+ def get_seg_images(out, image):
209
+ seg_embs = out.seg_embs
210
+
211
+ if len(seg_embs) == 0:
212
+ return None
213
+
214
+ seg_preds = []
215
+ inputs = oneformer_processor(image, ["semantic"], return_tensors="pt")
216
+ inputs["pixel_values"] = inputs["pixel_values"].to(out.logits.device, out.logits.dtype)
217
+ inputs["task_inputs"] = inputs["task_inputs"].to(out.logits.device, out.logits.dtype)
218
+ backbone_features = oneformer.get_backbone_feats(**inputs)
219
+ for i, seg_emb in enumerate(seg_embs):
220
+ pred = oneformer.get_masks(**inputs, backbone_last_feature=seg_emb.float(), all_backbone_features=backbone_features)
221
+ pred = oneformer_processor.post_process_panoptic_segmentation(
222
+ pred, target_sizes=[image.size[::-1]]
223
+ )[0]
224
+ pred_msk, pred_cls = oneformer_prepare_panoptic_instance_prediction(**pred, oneformer=oneformer)
225
+ pred = visualize_oneformer_masks_on_image(image, pred_msk, pred_cls)
226
+ seg_preds.append(pred)
227
+ grid_image = make_grid(seg_preds, seg_layer_indices)
228
+ return grid_image
229
+
230
+ def delete_text(state, image_process_mode):
231
+ state.messages[-1][-1] = None
232
+ prev_human_msg = state.messages[-2]
233
+ if type(prev_human_msg[1]) in (tuple, list):
234
+ prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
235
+ yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
236
+
237
+ def regenerate(state, image_process_mode):
238
+ state.messages[-1][-1] = None
239
+ prev_human_msg = state.messages[-2]
240
+ if type(prev_human_msg[1]) in (tuple, list):
241
+ prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
242
+ state.skip_next = False
243
+ return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
244
+
245
+ def get_interm_outs(state):
246
+ prompt = state.get_prompt()
247
+ images = state.get_images(return_pil=True)
248
+ #prompt, image_args = process_image(prompt, images)
249
+
250
+ if images is not None and len(images) > 0:
251
+ if len(images) > 0:
252
+ if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
253
+ raise ValueError("Number of images does not match number of <image> tokens in prompt")
254
+
255
+ #images = [load_image_from_base64(image) for image in images]
256
+ image_sizes = [image.size for image in images]
257
+ inp_images = process_images(images, image_processor, model.config)
258
+
259
+ if type(inp_images) is list:
260
+ inp_images = [image.to(model.device, dtype=torch.float16) for image in images]
261
+ else:
262
+ inp_images = inp_images.to(model.device, dtype=torch.float16)
263
+ else:
264
+ inp_images = None
265
+ image_sizes = None
266
+ image_args = {"images": inp_images, "image_sizes": image_sizes}
267
+ else:
268
+ inp_images = None
269
+ image_args = {}
270
+
271
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
272
+
273
+ interm_outs = model.get_visual_interpretations(
274
+ input_ids,
275
+ **image_args
276
+ )
277
+
278
+ depth_outs = get_depth_images(interm_outs, image_sizes[0])
279
+ seg_outs = get_seg_images(interm_outs, images[0])
280
+ gen_outs = get_gen_images(interm_outs)
281
+
282
+ return depth_outs, seg_outs, gen_outs
283
+
284
+ # @spaces.GPU
285
+ def generate(state, temperature, top_p, max_output_tokens):
286
+ prompt = state.get_prompt()
287
+ images = state.get_images(return_pil=True)
288
+ #prompt, image_args = process_image(prompt, images)
289
+
290
+ ori_prompt = prompt
291
+ num_image_tokens = 0
292
+
293
+ if images is not None and len(images) > 0:
294
+ if len(images) > 0:
295
+ if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
296
+ raise ValueError("Number of images does not match number of <image> tokens in prompt")
297
+
298
+ #images = [load_image_from_base64(image) for image in images]
299
+ image_sizes = [image.size for image in images]
300
+ images = process_images(images, image_processor, model.config)
301
+
302
+ if type(images) is list:
303
+ images = [image.to(model.device, dtype=torch.float16) for image in images]
304
+ else:
305
+ images = images.to(model.device, dtype=torch.float16)
306
+ else:
307
+ images = None
308
+ image_sizes = None
309
+ image_args = {"images": images, "image_sizes": image_sizes}
310
+ else:
311
+ images = None
312
+ image_args = {}
313
+
314
+ max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
315
+ max_new_tokens = max_output_tokens
316
+ do_sample = True if temperature > 0.001 else False
317
+ stop_str = state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2
318
+
319
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
320
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
321
+
322
+ max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)
323
+
324
+ if max_new_tokens < 1:
325
+ return
326
+
327
+ thread = Thread(target=model.generate, kwargs=dict(
328
+ inputs=input_ids,
329
+ do_sample=do_sample,
330
+ temperature=temperature,
331
+ top_p=top_p,
332
+ max_new_tokens=max_new_tokens,
333
+ streamer=streamer,
334
+ use_cache=True,
335
+ pad_token_id=tokenizer.eos_token_id,
336
+ **image_args
337
+ ))
338
+ thread.start()
339
+ generated_text = ''
340
+ for new_text in streamer:
341
+ generated_text += new_text
342
+ if generated_text.endswith(stop_str):
343
+ generated_text = generated_text[:-len(stop_str)]
344
+ state.messages[-1][-1] = generated_text
345
+ yield (state, state.to_gradio_chatbot(), "", None) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
346
+
347
+ yield (state, state.to_gradio_chatbot(), "", None) + (enable_btn,) * 5
348
+
349
+ torch.cuda.empty_cache()
350
+
351
+ txt = gr.Textbox(
352
+ scale=4,
353
+ show_label=False,
354
+ placeholder="Enter text and press enter.",
355
+ container=False,
356
+ )
357
+
358
+
359
+ title = "<h1 style='margin-bottom: -10px; text-align: center'>OLA-VLM: Optimizing Language Model Representations for Enhanced Visual Quality and Alignment</h1>"
360
+ description = "<p style='font-size: 16px; margin: 5px; font-weight: w300; text-align: center'> <a href='https://praeclarumjj3.github.io/' style='text-decoration:none' target='_blank'>Jitesh Jain</a> &nbsp;&nbsp <a href='https://zyang-ur.github.io/' style='text-decoration:none' target='_blank'>Zhengyuan Yang</a> &nbsp;&nbsp <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Humphrey Shi<sup>*</sup></a> &nbsp;&nbsp <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Jianfeng Gao<sup>*</sup></a> &nbsp;&nbsp <a href='https://jwyang.github.io/' style='text-decoration:none' target='_blank'>Jianwei Yang<sup>*</sup></a></p>" \
361
+ + "<p style='font-size: 12px; margin: 5px; font-weight: w300; text-align: center'><sup>*</sup>Equal Advising</p>" \
362
+ + "<p style='font-size: 16px; margin: 5px; font-weight: w600; text-align: center'> <a href='https://praeclarumjj3.github.io/ola_vlm/' target='_blank'>Project Page</a> | <a href='https://youtu.be/' target='_blank'>Video</a> | <a href='https://arxiv.org/abs/' target='_blank'>ArXiv</a> | <a href='https://github.com/SHI-Labs/OLA-VLM' target='_blank'>Github</a></p>"
363
+
364
+ tos_markdown = ("""
365
+ ### Terms of use
366
+ By using this service, users are required to agree to the following terms:
367
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes.
368
+ """)
369
+
370
+
371
+ learn_more_markdown = ("""
372
+ ### License
373
+ The service is a research preview intended for non-commercial use only, subject to the [License](https://huggingface.co/lmsys/vicuna-7b-v1.5) of Vicuna-v1.5, [License](https://github.com/haotian-liu/LLaVA/blob/main/LICENSE) of LLaVA, [Terms of Use](https://cocodataset.org/#termsofuse) of the COCO dataset, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
374
+ """)
375
+
376
+ block_css = """
377
+ #buttons button {
378
+ min-width: min(120px,100%);
379
+ }
380
+ """
381
+
382
+
383
+ textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
384
+ with gr.Blocks(title="OLA-VLM", theme=gr.themes.Default(), css=block_css) as demo:
385
+ state = gr.State()
386
+
387
+ gr.Markdown(title)
388
+ gr.Markdown(description)
389
+
390
+ with gr.Row():
391
+ with gr.Column(scale=4):
392
+ imagebox = gr.Image(label="Input Image", type="filepath")
393
+ image_process_mode = gr.Radio(
394
+ ["Crop", "Resize", "Pad", "Default"],
395
+ value="Default",
396
+ label="Preprocess for non-square image", visible=False)
397
+
398
+ # with gr.Accordion("Parameters", open=False) as parameter_row:
399
+ with gr.Row():
400
+ temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
401
+ top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
402
+ max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)
403
+
404
+ with gr.Column(scale=8):
405
+ chatbot = gr.Chatbot(
406
+ elem_id="chatbot",
407
+ label="OLA-VLM",
408
+ height=300,
409
+ layout="panel",
410
+ )
411
+ textbox.render()
412
+ with gr.Row(elem_id="buttons") as button_row:
413
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=False, visible=False)
414
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=False, visible=False)
415
+ flag_btn = gr.Button(value="⚠️ Flag", interactive=False, visible=False)
416
+ #stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
417
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
418
+ clear_btn = gr.Button(value="🗑️ Clear", interactive=False)
419
+ submit_btn = gr.Button(value="Send", variant="primary")
420
+
421
+ with gr.Accordion("Representations from selected layers of the LLM (expects only a single image input)", open=False) as interm_out:
422
+ inter_vis_btn = gr.Button(value="✨ Visualize")
423
+ with gr.Row():
424
+ depth_box = gr.Image(label="depth", type="pil", visible=True)
425
+ seg_box = gr.Image(label="seg", type="pil", visible=True)
426
+ gen_box = gr.Image(label="gen", type="pil", visible=True)
427
+
428
+ gr.Examples(examples=[
429
+ [f"assets/cars.jpg", "Which car is in front: the blue or the brown one?"],
430
+ [f"assets/pb.jpg", "Where is the bulding located with respect to the man?"],
431
+ ], inputs=[imagebox, textbox], cache_examples=False)
432
+
433
+ # gr.Markdown(tos_markdown)
434
+ # gr.Markdown(learn_more_markdown)
435
+ # url_params = gr.JSON(visible=False)
436
+
437
+ # Register listeners
438
+ btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
439
+
440
+ inter_vis_btn.click(
441
+ get_interm_outs,
442
+ [state],
443
+ [depth_box, seg_box, gen_box],
444
+ )
445
+
446
+ clear_btn.click(
447
+ clear_history,
448
+ None,
449
+ [state, chatbot, textbox, imagebox, depth_box, gen_box, seg_box] + btn_list,
450
+ queue=False
451
+ )
452
+
453
+ regenerate_btn.click(
454
+ delete_text,
455
+ [state, image_process_mode],
456
+ [state, chatbot, textbox, imagebox] + btn_list,
457
+ ).then(
458
+ generate,
459
+ [state, temperature, top_p, max_output_tokens],
460
+ [state, chatbot, textbox, imagebox] + btn_list,
461
+ )
462
+ textbox.submit(
463
+ add_text,
464
+ [state, imagebox, textbox, image_process_mode],
465
+ [state, chatbot, textbox, imagebox] + btn_list,
466
+ ).then(
467
+ generate,
468
+ [state, temperature, top_p, max_output_tokens],
469
+ [state, chatbot, textbox, imagebox] + btn_list,
470
+ )
471
+
472
+ submit_btn.click(
473
+ add_text,
474
+ [state, imagebox, textbox, image_process_mode],
475
+ [state, chatbot, textbox, imagebox] + btn_list,
476
+ ).then(
477
+ generate,
478
+ [state, temperature, top_p, max_output_tokens],
479
+ [state, chatbot, textbox, imagebox] + btn_list,
480
+ )
481
+
482
+ demo.queue(
483
+ status_update_rate=10,
484
+ api_open=False
485
+ ).launch(share=True)
486
+ demo.queue()
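The demo.py added above is the local twin of app.py: it drops the ZeroGPU hooks (`import spaces` and the `@spaces.GPU` decorators) and launches with `share=True` instead of `share=False`. For context, a minimal sketch of the ZeroGPU pattern that app.py relies on; the function body is a placeholder, and the `spaces` package is only available inside a Hugging Face Space:

```python
# Sketch of the ZeroGPU pattern that separates app.py from demo.py.
# On a ZeroGPU Space, @spaces.GPU attaches a GPU only for the duration of the
# decorated call; demo.py simply omits the decorator for local runs.
import spaces  # provided inside Hugging Face Spaces; assumed unavailable locally
import torch

@spaces.GPU  # a duration can also be requested, e.g. @spaces.GPU(duration=120)
def describe_device(prompt: str) -> str:
    # placeholder body; the real GPU work happens in generate() / get_interm_outs()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return f"[{device}] {prompt}"

if __name__ == "__main__":
    print(describe_device("hello"))
```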
ola_vlm/.DS_Store ADDED
Binary file (6.15 kB).
 
ola_vlm/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .model import LlavaLlamaForCausalLM
+ from .model import LlavaPhi3ForCausalLM
ola_vlm/constants.py ADDED
@@ -0,0 +1,13 @@
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
+ WORKER_HEART_BEAT_INTERVAL = 15
+
+ LOGDIR = "."
+
+ # Model Constants
+ IGNORE_INDEX = -100
+ IMAGE_TOKEN_INDEX = -200
+ DEFAULT_IMAGE_TOKEN = "<image>"
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+ DEFAULT_IM_START_TOKEN = "<im_start>"
+ DEFAULT_IM_END_TOKEN = "<im_end>"
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
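The two constants that matter most downstream are `DEFAULT_IMAGE_TOKEN` and `IMAGE_TOKEN_INDEX`: prompts carry the literal `<image>` marker, and the tokenized sequence carries `-200` at the positions where image features are later spliced in. The real logic lives in `ola_vlm/mm_utils.tokenizer_image_token` (not shown in this 50-file view); the sketch below is a simplified LLaVA-style illustration, not the exact implementation:

```python
# Simplified illustration of how DEFAULT_IMAGE_TOKEN / IMAGE_TOKEN_INDEX are used.
# Not the exact ola_vlm.mm_utils.tokenizer_image_token implementation.
from typing import Callable, List

IMAGE_TOKEN_INDEX = -200          # values from ola_vlm/constants.py above
DEFAULT_IMAGE_TOKEN = "<image>"

def splice_image_tokens(prompt: str, tokenize: Callable[[str], List[int]]) -> List[int]:
    """Tokenize the text around each <image> marker and splice in the placeholder id."""
    chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
    input_ids: List[int] = []
    for i, chunk in enumerate(chunks):
        input_ids.extend(tokenize(chunk))
        if i < len(chunks) - 1:   # one placeholder id per <image> occurrence
            input_ids.append(IMAGE_TOKEN_INDEX)
    return input_ids

# Toy word-level tokenizer, purely for illustration.
vocab: dict = {}
toy_tokenize = lambda text: [vocab.setdefault(w, len(vocab)) for w in text.split()]
print(splice_image_tokens("<image>\nWhich car is in front?", toy_tokenize))
# -> [-200, 0, 1, 2, 3, 4]
```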
ola_vlm/conversation.py ADDED
@@ -0,0 +1,255 @@
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Tuple
4
+ import base64
5
+ from io import BytesIO
6
+ from PIL import Image
7
+
8
+
9
+ class SeparatorStyle(Enum):
10
+ """Different separator style."""
11
+ SINGLE = auto()
12
+ TWO = auto()
13
+ MPT = auto()
14
+ PLAIN = auto()
15
+ LLAMA_3 = auto()
16
+
17
+
18
+ @dataclasses.dataclass
19
+ class Conversation:
20
+ """A class that keeps all conversation history."""
21
+ system: str
22
+ roles: List[str]
23
+ messages: List[List[str]]
24
+ offset: int
25
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
26
+ sep: str = "###"
27
+ sep2: str = None
28
+ version: str = "Unknown"
29
+
30
+ skip_next: bool = False
31
+
32
+ def get_prompt(self):
33
+ messages = self.messages
34
+ if len(messages) > 0 and type(messages[0][1]) is tuple:
35
+ messages = self.messages.copy()
36
+ init_role, init_msg = messages[0].copy()
37
+ init_msg = init_msg[0].replace("<image>", "").strip()
38
+ if 'mmtag' in self.version:
39
+ messages[0] = (init_role, init_msg)
40
+ messages.insert(0, (self.roles[0], "<Image><image></Image>"))
41
+ messages.insert(1, (self.roles[1], "Received."))
42
+ else:
43
+ messages[0] = (init_role, "<image>\n" + init_msg)
44
+
45
+ if self.sep_style == SeparatorStyle.SINGLE:
46
+ ret = self.system + self.sep
47
+ for role, message in messages:
48
+ if message:
49
+ if type(message) is tuple:
50
+ message, _, _ = message
51
+ ret += role + ": " + message + self.sep
52
+ else:
53
+ ret += role + ":"
54
+ elif self.sep_style == SeparatorStyle.TWO:
55
+ seps = [self.sep, self.sep2]
56
+ ret = self.system + seps[0]
57
+ for i, (role, message) in enumerate(messages):
58
+ if message:
59
+ if type(message) is tuple:
60
+ message, _, _ = message
61
+ ret += role + ": " + message + seps[i % 2]
62
+ else:
63
+ ret += role + ":"
64
+ elif self.sep_style == SeparatorStyle.MPT:
65
+ ret = self.system + self.sep
66
+ for role, message in messages:
67
+ if message:
68
+ if type(message) is tuple:
69
+ message, _, _ = message
70
+ ret += role + message + self.sep
71
+ else:
72
+ ret += role
73
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
74
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
75
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
76
+ ret = ""
77
+
78
+ for i, (role, message) in enumerate(messages):
79
+ if i == 0:
80
+ assert message, "first message should not be none"
81
+ assert role == self.roles[0], "first message should come from user"
82
+ if message:
83
+ if type(message) is tuple:
84
+ message, _, _ = message
85
+ if i == 0: message = wrap_sys(self.system) + message
86
+ if i % 2 == 0:
87
+ message = wrap_inst(message)
88
+ ret += self.sep + message
89
+ else:
90
+ ret += " " + message + " " + self.sep2
91
+ else:
92
+ ret += ""
93
+ ret = ret.lstrip(self.sep)
94
+ elif self.sep_style == SeparatorStyle.CHATML:
95
+ ret = "" if self.system == "" else self.system + self.sep + "\n"
96
+ for role, message in messages:
97
+ if message:
98
+ if type(message) is tuple:
99
+ message, images, _ = message
100
+ message = "<image>" * len(images) + message
101
+ ret += role + "\n" + message + self.sep + "\n"
102
+ else:
103
+ ret += role + "\n"
104
+ return ret
105
+ else:
106
+ raise ValueError(f"Invalid style: {self.sep_style}")
107
+
108
+ return ret
109
+
110
+ def append_message(self, role, message):
111
+ if isinstance(self.messages, tuple):
112
+ self.messages = list(self.messages)
113
+ self.messages.append([role, message])
114
+
115
+ def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
116
+ if image_process_mode == "Pad":
117
+ def expand2square(pil_img, background_color=(122, 116, 104)):
118
+ width, height = pil_img.size
119
+ if width == height:
120
+ return pil_img
121
+ elif width > height:
122
+ result = Image.new(pil_img.mode, (width, width), background_color)
123
+ result.paste(pil_img, (0, (width - height) // 2))
124
+ return result
125
+ else:
126
+ result = Image.new(pil_img.mode, (height, height), background_color)
127
+ result.paste(pil_img, ((height - width) // 2, 0))
128
+ return result
129
+ image = expand2square(image)
130
+ elif image_process_mode in ["Default", "Crop"]:
131
+ pass
132
+ elif image_process_mode == "Resize":
133
+ image = image.resize((336, 336))
134
+ else:
135
+ raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
136
+ if max(image.size) > max_len:
137
+ max_hw, min_hw = max(image.size), min(image.size)
138
+ aspect_ratio = max_hw / min_hw
139
+ shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
140
+ longest_edge = int(shortest_edge * aspect_ratio)
141
+ W, H = image.size
142
+ if H > W:
143
+ H, W = longest_edge, shortest_edge
144
+ else:
145
+ H, W = shortest_edge, longest_edge
146
+ image = image.resize((W, H))
147
+ if return_pil:
148
+ return image
149
+ else:
150
+ buffered = BytesIO()
151
+ image.save(buffered, format=image_format)
152
+ img_b64_str = base64.b64encode(buffered.getvalue()).decode()
153
+ return img_b64_str
154
+
155
+ def get_images(self, return_pil=False):
156
+ images = []
157
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
158
+ if i % 2 == 0:
159
+ if type(msg) is tuple:
160
+ msg, image, image_process_mode = msg
161
+ image = self.process_image(image, image_process_mode, return_pil=return_pil)
162
+ images.append(image)
163
+ return images
164
+
165
+ def to_gradio_chatbot(self):
166
+ ret = []
167
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
168
+ if i % 2 == 0:
169
+ if type(msg) is tuple:
170
+ msg, image, image_process_mode = msg
171
+ img_b64_str = self.process_image(
172
+ image, "Default", return_pil=False,
173
+ image_format='JPEG')
174
+ img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
175
+ msg = img_str + msg.replace('<image>', '').strip()
176
+ ret.append([msg, None])
177
+ else:
178
+ ret.append([msg, None])
179
+ else:
180
+ ret[-1][-1] = msg
181
+ return ret
182
+
183
+ def copy(self):
184
+ return Conversation(
185
+ system=self.system,
186
+ roles=self.roles,
187
+ messages=[[x, y] for x, y in self.messages],
188
+ offset=self.offset,
189
+ sep_style=self.sep_style,
190
+ sep=self.sep,
191
+ sep2=self.sep2,
192
+ version=self.version)
193
+
194
+ def dict(self):
195
+ if len(self.get_images()) > 0:
196
+ return {
197
+ "system": self.system,
198
+ "roles": self.roles,
199
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
200
+ "offset": self.offset,
201
+ "sep": self.sep,
202
+ "sep2": self.sep2,
203
+ }
204
+ return {
205
+ "system": self.system,
206
+ "roles": self.roles,
207
+ "messages": self.messages,
208
+ "offset": self.offset,
209
+ "sep": self.sep,
210
+ "sep2": self.sep2,
211
+ }
212
+
213
+ conv_vicuna_v1 = Conversation(
214
+ system="A chat between a curious user and an artificial intelligence assistant. "
215
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
216
+ roles=("USER", "ASSISTANT"),
217
+ version="v1",
218
+ messages=(),
219
+ offset=0,
220
+ sep_style=SeparatorStyle.TWO,
221
+ sep=" ",
222
+ sep2="</s>",
223
+ )
224
+
225
+ conv_llava_llama_3 = Conversation(
226
+ system="""<|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""",
227
+ roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
228
+ version="llama3",
229
+ messages=(),
230
+ offset=0,
231
+ sep_style=SeparatorStyle.MPT,
232
+ sep="<|eot_id|>",
233
+ )
234
+
235
+ conv_llava_phi_3 = Conversation(
236
+ system="""<|system|>\nYou are a helpful AI assistant.""",
237
+ roles=("\n<|user|>\n", "\n<|assistant|>\n"),
238
+ version="phi3",
239
+ messages=(),
240
+ offset=0,
241
+ sep_style=SeparatorStyle.MPT,
242
+ sep="<|end|>",
243
+ )
244
+
245
+ default_conversation = conv_llava_phi_3
246
+ conv_templates = {
247
+ "v1": conv_vicuna_v1,
248
+ "vicuna_v1": conv_vicuna_v1,
249
+ "llava_phi_3": conv_llava_phi_3,
250
+ "llava_llama_3": conv_llava_llama_3,
251
+ }
252
+
253
+
254
+ if __name__ == "__main__":
255
+ print(default_conversation.get_prompt())
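A minimal sketch of how the templates defined above are driven (the eval scripts later in this diff follow the same pattern); the question text is invented, while the imports and method names come from this file:

from ola_vlm.conversation import conv_templates

conv = conv_templates["llava_phi_3"].copy()   # copy() so the shared template object stays empty
conv.append_message(conv.roles[0], "<image>\nWhat is shown in this image?")
conv.append_message(conv.roles[1], None)      # None marks the slot the assistant will fill
prompt = conv.get_prompt()                    # serialized with the phi-3 separators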
ola_vlm/eval/.DS_Store ADDED
Binary file (6.15 kB).
 
ola_vlm/eval/eval_cv_bench.py ADDED
@@ -0,0 +1,78 @@
1
+ import pandas as pd
2
+ import json
3
+ import argparse
4
+
5
+ def load_jsonl(f):
6
+ lines = open(f, encoding='utf-8').readlines()
7
+ lines = [x.strip() for x in lines]
8
+ if lines[-1] == '':
9
+ lines = lines[:-1]
10
+ data = [json.loads(x) for x in lines]
11
+ return data
12
+
13
+ if __name__ == '__main__':
14
+
15
+ parser = argparse.ArgumentParser()
16
+ parser.add_argument("--results_file", type=str, default="cv-bench_answer.jsonl")
17
+ args = parser.parse_args()
18
+
19
+ answers = load_jsonl(args.results_file)
20
+
21
+ data = {
22
+ "source": [],
23
+ "result": [],
24
+ "task": [],
25
+ }
26
+ import re
27
+ for a in answers:
28
+ data["source"].append(a["source"][0])
29
+ if "(" in a["prediction"]:
30
+ match = re.search(r'\(([A-Z])\)', a["prediction"])
31
+ if match:
32
+ pred = "(" + match.group(1) + ")"
33
+ else:
34
+ pred = "(" + a["prediction"][0] + ")"
35
+ data["result"].append(pred == a["answer"][0])
36
+ data["task"].append(a["task"][0])
37
+
38
+ df = pd.DataFrame(data)
39
+
40
+ def calculate_accuracy(df, source):
41
+ source_df = df[df['source'] == source]
42
+ accuracy = (source_df['result']).mean()
43
+ return accuracy
44
+
45
+ def calculate_task_accuracy(df, task):
46
+ source_df = df[df['task'] == task]
47
+ accuracy = (source_df['result']).mean()
48
+ return accuracy
49
+
50
+ accuracy_2d_ade = calculate_accuracy(df, 'ADE20K')
51
+ accuracy_2d_coco = calculate_accuracy(df, 'COCO')
52
+ accuracy_3d_omni = calculate_accuracy(df, 'Omni3D')
53
+
54
+ tasks = ["Count", "Depth", "Relation", "Distance"]
55
+
56
+ scores = {}
57
+
58
+ accuracy_2d = (accuracy_2d_ade + accuracy_2d_coco) / 2
59
+ accuracy_3d = accuracy_3d_omni
60
+
61
+ combined_accuracy = (accuracy_2d + accuracy_3d) / 2
62
+
63
+ scores["Overall"] = combined_accuracy
64
+
65
+ scores["3D"] = accuracy_3d
66
+ scores["2D"] = accuracy_2d
67
+
68
+ for t in tasks:
69
+ accuracy = calculate_task_accuracy(df, t)
70
+ scores[t] = accuracy
71
+
72
+ print("\n=========================CV-Bench Scores===============================")
73
+ for key, value in scores.items():
74
+ print(f"{key} -> {value}")
75
+ print("================================================================")
76
+
77
+ with open(args.results_file.replace('.jsonl', '_score.json'), "w") as f:
78
+ json.dump(scores, f, indent=2)
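A worked example of the aggregation above, with invented accuracies, to make the weighting explicit: the 2D score is the mean of ADE20K and COCO, and the overall score is the mean of the 2D and 3D results.

accuracy_2d_ade, accuracy_2d_coco, accuracy_3d_omni = 0.60, 0.70, 0.58

accuracy_2d = (accuracy_2d_ade + accuracy_2d_coco) / 2   # 0.65
accuracy_3d = accuracy_3d_omni                           # 0.58
overall = (accuracy_2d + accuracy_3d) / 2                # 0.615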
ola_vlm/eval/eval_mmstar.py ADDED
@@ -0,0 +1,17 @@
1
+ import os
2
+ import argparse
3
+ import json
4
+
5
+ from ola_vlm.eval.mmstar.evaluate import MMStar_eval
6
+
7
+
8
+ def parse_args():
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument('--results_file', type=str, default="./playground/data/eval/mmstar_results.jsonl")
11
+ return parser.parse_args()
12
+
13
+
14
+ if __name__ == '__main__':
15
+
16
+ args = parse_args()
17
+ MMStar_eval(args.results_file)
ola_vlm/eval/eval_probe_task.py ADDED
@@ -0,0 +1,223 @@
1
+ import argparse
2
+ import torch
3
+
4
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
5
+ from ola_vlm.conversation import conv_templates
6
+ from ola_vlm.model.builder import load_pretrained_model
7
+ from ola_vlm.utils import disable_torch_init
8
+ from ola_vlm.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
9
+ from ola_vlm.model.aux_heads.oneformer_head import OneFormerHead
10
+ from transformers import OneFormerProcessor
11
+
12
+ from PIL import Image
13
+ import json
14
+ import os
15
+ from tqdm import tqdm
16
+ from icecream import ic
17
+ import warnings
18
+ warnings.filterwarnings("ignore")
19
+ import random
20
+ import numpy as np
21
+ from analyze.analyze_utils import prepare_coco, prepare_da2k
22
+ import math
23
+ from diffusers import StableUnCLIPImg2ImgPipeline
24
+ from diffusers import DPMSolverMultistepScheduler
25
+
26
+
27
+ def split_list(lst, n):
28
+ """Split a list into n (roughly) equal-sized chunks"""
29
+ chunk_size = math.ceil(len(lst) / n)  # ceiling division, so the last chunk may be shorter
30
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
31
+
32
+
33
+ def get_chunk(lst, n, k):
34
+ chunks = split_list(lst, n)
35
+ return chunks[k]
36
+
37
+ def set_seed(seed):
38
+ random.seed(seed)
39
+ np.random.seed(seed)
40
+ torch.manual_seed(seed)
41
+ torch.cuda.manual_seed_all(seed)
42
+
43
+ def load_image(image_file):
44
+ image = Image.open(image_file).convert('RGB')
45
+ return image
46
+
47
+ import glob
48
+
49
+ def list_image_files(directory):
50
+ image_extensions = ['*.png', '*.jpg', '*.jpeg', '*.gif', '*.bmp', '*.tiff']
51
+ image_files = []
52
+ for extension in image_extensions:
53
+ image_files.extend(glob.glob(os.path.join(directory, extension)))
54
+ return image_files
55
+
56
+ def prep_seginw(dir):
57
+ image_files = list_image_files(dir)
58
+ prompts = []
59
+ for image_file in image_files:
60
+ prompts.append("Describe the image")
61
+ return image_files, prompts, prompts
62
+
63
+ def predict(args):
64
+
65
+ mode = args.mode
66
+
67
+ name = args.model_path.split("/")[-1]
68
+ os.makedirs(f"plots/probes_task/{name}/", exist_ok=True)
69
+
70
+ # Model
71
+ disable_torch_init()
72
+
73
+ if mode == 'gen' or mode == 'seg':
74
+ images, prompts, answers = prepare_coco(args.json_file)
75
+ elif mode == 'depth':
76
+ images, prompts, answers = prepare_da2k("/mnt/vlpdatasets/sherlock/eval/DA-2K/DA-2K/images", is_eval=True)
77
+
78
+ images = get_chunk(images, args.num_chunks, args.chunk_idx)
79
+ prompts = get_chunk(prompts, args.num_chunks, args.chunk_idx)
80
+ answers = get_chunk(answers, args.num_chunks, args.chunk_idx)
81
+
82
+ model_name = get_model_name_from_path(args.model_path)
83
+ tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)
84
+
85
+ if mode == "gen":
86
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(f"playground/jiteshjain_sherlock/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variant="fp16")
87
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
88
+ pipe = pipe.to("cuda")
89
+
90
+ elif mode == "seg":
91
+ oneformer_processor = OneFormerProcessor.from_pretrained("/mnt/projects4jw/jiteshjain_sherlock/oneformer_coco_swin_large")
92
+ oneformer = OneFormerHead.from_pretrained("/mnt/projects4jw/jiteshjain_sherlock/oneformer_coco_swin_large")
93
+ oneformer = oneformer.to("cuda")
94
+
95
+ if "mistral" in model_name.lower():
96
+ conv_mode = "mistral_instruct"
97
+ elif "v1.6-34b" in model_name.lower():
98
+ conv_mode = "chatml_direct"
99
+ elif "llama3" in model_name.lower():
100
+ conv_mode = "llava_llama_3"
101
+ elif "qwen" in model_name.lower():
102
+ conv_mode = "qwen_1_5"
103
+ elif "v1" in model_name.lower():
104
+ conv_mode = "llava_v1"
105
+ elif "phi" in model_name.lower():
106
+ conv_mode = "llava_phi_3"
107
+
108
+ set_seed(42)
109
+
110
+ if mode == "gen":
111
+ try:
112
+ layers = model.config.image_gen["layer_indices"]
113
+ except:
114
+ layers = [i+1 for i in range(32)]
115
+ elif mode == "depth":
116
+ try:
117
+ layers = model.config.image_depth["layer_indices"]
118
+ except:
119
+ layers = [i+1 for i in range(32)]
120
+ elif mode == "seg":
121
+ try:
122
+ layers = model.config.image_seg["layer_indices"]
123
+ except:
124
+ layers = [i+1 for i in range(32)]
125
+
126
+ from tqdm import tqdm
127
+ for fname, prompt, answer in tqdm(zip(images, prompts, answers), total=len(prompts)):
128
+
129
+ conv = conv_templates[conv_mode].copy()
130
+ im = fname.split("/")[-1].split(".")[0]
131
+
132
+ image = load_image(fname)
133
+
134
+ image_size = image.size
135
+ image_tensor = process_images([image], image_processor, model.config)
136
+ if type(image_tensor) is list:
137
+ image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor]
138
+ else:
139
+ image_tensor = image_tensor.to(model.device, dtype=torch.float16)
140
+
141
+ inp = prompt
142
+ if image is not None:
143
+ if model.config.mm_use_im_start_end:
144
+ inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
145
+ else:
146
+ inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
147
+
148
+ conv.append_message(conv.roles[0], inp)
149
+ conv.append_message(conv.roles[1], None)
150
+ prompt = conv.get_prompt()
151
+
152
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
153
+
154
+ with torch.inference_mode():
155
+ out = model.get_visual_interpretations(
156
+ input_ids,
157
+ images=image_tensor,
158
+ image_sizes=[image_size],  # wrapped in a list to match the call in eval_sherlock_dsg.py
159
+ )
160
+
161
+ if mode == "seg":
162
+ seg_embs = out.seg_embs
163
+ inputs = oneformer_processor(image, ["semantic"], return_tensors="pt")
164
+ inputs["pixel_values"] = inputs["pixel_values"].to(out.logits.device, out.logits.dtype)
165
+ inputs["task_inputs"] = inputs["task_inputs"].to(out.logits.device, out.logits.dtype)
166
+ backbone_features = oneformer.get_backbone_feats(**inputs)
167
+ for i, seg_emb in enumerate(seg_embs):
168
+ pred = oneformer.get_masks(**inputs, backbone_last_feature=seg_emb.float(), all_backbone_features=backbone_features)
169
+ pred = oneformer_processor.post_process_semantic_segmentation(
170
+ pred, target_sizes=[image.size[::-1]]
171
+ )[0]
172
+ pred = pred.squeeze().cpu().numpy().astype(np.uint8)
173
+ pred = Image.fromarray(pred)
174
+ if not os.path.exists(f"plots/probes_task/{name}/seg/layer_{layers[i]}"):
175
+ os.makedirs(f"plots/probes_task/{name}/seg/layer_{layers[i]}", exist_ok=True)
176
+ save_path = os.path.join(f"plots/probes_task/{name}/seg/layer_{layers[i]}", fname.split("/")[-1].replace("jpg", "png"))
177
+ pred.save(save_path)
178
+
179
+
180
+ elif mode == "gen":
181
+ img_embeds = out.image_embs
182
+ gen_images = []  # keep the decoded images separate from the dataset image list
183
+
184
+ for img_emb in img_embeds:
185
+ gen_image = pipe(image_embeds=img_emb.squeeze(1),
186
+ num_inference_steps=25,
187
+ ).images[0]
188
+ gen_images.append(gen_image)
189
+
190
+ for i, image in enumerate(gen_images):
191
+ image = image.resize((256, 256), Image.LANCZOS)
192
+ if not os.path.exists(f"plots/probes_task/{name}/gen/layer_{layers[i]}"):
193
+ os.makedirs(f"plots/probes_task/{name}/gen/layer_{layers[i]}", exist_ok=True)
194
+ save_path = os.path.join(f"plots/probes_task/{name}/gen/layer_{layers[i]}", fname.split("/")[-1])
195
+ image.save(save_path)
196
+
197
+ elif mode == "depth":
198
+ depth_preds = out.depth_preds
199
+
200
+ for i, depth_pred in enumerate(depth_preds):
201
+ if not os.path.exists(f"plots/probes_task/{name}/depth/layer_{layers[i]}"):
202
+ os.makedirs(f"plots/probes_task/{name}/depth/layer_{layers[i]}", exist_ok=True)
203
+ depth = depth_pred.squeeze(0).cpu().numpy() * 255.0
204
+ depth = depth.astype(np.uint8)
205
+ depth = Image.fromarray(depth)
206
+ save_path = os.path.join(f"plots/probes_task/{name}/depth/layer_{layers[i]}", fname.split("/")[-1])
207
+ depth.save(save_path)
208
+
209
+ if __name__ == "__main__":
210
+ parser = argparse.ArgumentParser()
211
+ parser.add_argument("--model-path", type=str, default="/mnt/projects4jw/jiteshjain_sherlock/llava-v1.5-7b")
212
+ parser.add_argument("--model-base", type=str, default=None)
213
+ parser.add_argument("--json-file", type=str, default="/mnt/projects4jw/jiteshjain_sherlock/datasets/coco/annotations/captions_val2017.json")
214
+ parser.add_argument("--device", type=str, default="cuda")
215
+ parser.add_argument("--temperature", type=float, default=0.2)
216
+ parser.add_argument("--max-new-tokens", type=int, default=10)
217
+ parser.add_argument("--load-8bit", action="store_true")
218
+ parser.add_argument("--load-4bit", action="store_true")
219
+ parser.add_argument("--mode", type=str, default="gen")
220
+ parser.add_argument("--num-chunks", type=int, default=1)
221
+ parser.add_argument("--chunk-idx", type=int, default=0)
222
+ args = parser.parse_args()
223
+ predict(args)
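The --num-chunks / --chunk-idx flags shard the evaluation set with the split_list / get_chunk helpers defined above; a self-contained sketch of the resulting split, with invented file names:

import math

files = [f"img_{i}.jpg" for i in range(10)]
chunk_size = math.ceil(len(files) / 4)   # 3
chunks = [files[i:i + chunk_size] for i in range(0, len(files), chunk_size)]
# chunks[1] == ['img_3.jpg', 'img_4.jpg', 'img_5.jpg'], i.e. get_chunk(files, 4, 1)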
ola_vlm/eval/eval_sherlock_dsg.py ADDED
@@ -0,0 +1,282 @@
1
+ import argparse
2
+ import torch
3
+
4
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
5
+ from ola_vlm.conversation import conv_templates
6
+ from ola_vlm.model.builder import load_pretrained_model
7
+ from ola_vlm.utils import disable_torch_init
8
+ from ola_vlm.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
9
+ from ola_vlm.model.aux_heads.sam_utils.build_sam import sam_model_registry
10
+ from ola_vlm.model.aux_heads.sam_utils.automatic_mask_generator import SamAutomaticMaskGenerator
11
+ from ola_vlm.model.aux_heads.oneformer_head import OneFormerHead, OneFormerSegHead, OneFormerTaskTokenSegHead
12
+ from ola_vlm.model.aux_heads.depth_anything_v2.dpt import DepthAnythingV2
13
+ from transformers import OneFormerProcessor
14
+
15
+ from diffusers import (
16
+ DPMSolverMultistepScheduler,
17
+ StableUnCLIPImg2ImgPipeline,
18
+ )
19
+
20
+ from PIL import Image
21
+ import json
22
+ import os
23
+ from tqdm import tqdm
24
+ from icecream import ic
25
+ import warnings
26
+ warnings.filterwarnings("ignore")
27
+ import random
28
+ import numpy as np
29
+ from analyze.analyze_utils import prepare_coco
30
+ import math
31
+
32
+ def split_list(lst, n):
33
+ """Split a list into n (roughly) equal-sized chunks"""
34
+ chunk_size = math.ceil(len(lst) / n)  # ceiling division, so the last chunk may be shorter
35
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
36
+
37
+
38
+ def get_chunk(lst, n, k):
39
+ chunks = split_list(lst, n)
40
+ return chunks[k]
41
+
42
+ def set_seed(seed):
43
+ random.seed(seed)
44
+ np.random.seed(seed)
45
+ torch.manual_seed(seed)
46
+ torch.cuda.manual_seed_all(seed)
47
+
48
+ def load_image(image_file):
49
+ image = Image.open(image_file).convert('RGB')
50
+ return image
51
+
52
+ import glob
53
+
54
+ def list_image_files(directory):
55
+ image_extensions = ['*.png', '*.jpg', '*.jpeg', '*.gif', '*.bmp', '*.tiff']
56
+ image_files = []
57
+ for extension in image_extensions:
58
+ image_files.extend(glob.glob(os.path.join(directory, extension)))
59
+ return image_files
60
+
61
+ def get_gen_feats(pipe, image):
62
+ with torch.no_grad():
63
+ clip_ims = pipe.feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
64
+ feat = pipe.image_encoder(clip_ims).image_embeds
65
+ return feat
66
+
67
+ def get_dav2_feats(dav2, image):
68
+ image = image.resize((336, 336))
69
+ image = np.array(image)
70
+ with torch.no_grad():
71
+ feat = dav2.infer_image(image, is_dsg=True)
72
+ return feat[-1][0]
73
+
74
+ def get_seg_feats(mask_generator, oneformer, oneformer_processor, seg_teacher, image):
75
+ if seg_teacher == "oneformer":
76
+ img = image.resize((768, 768))
77
+ inputs = oneformer_processor(img, ["panoptic"], return_tensors="pt")
78
+ inputs["pixel_values"] = inputs["pixel_values"].to("cuda")
79
+ with torch.no_grad():
80
+ feats = oneformer.forward_features(**inputs)
81
+ else:
82
+ img = np.array(image)
83
+ with torch.no_grad():
84
+ mask_generator.predictor.set_image(img)
85
+ feats = mask_generator.predictor.features
86
+ mask_generator.predictor.reset_image()
87
+ return feats
88
+
89
+
90
+ def predict(args):
91
+
92
+ mode = args.mode
93
+
94
+ name = args.model_path.split("/")[-1]
95
+ os.makedirs(f"plots/probe_scores/{name}/", exist_ok=True)
96
+
97
+ if "cambrian" in name:
98
+ from ola_vlm.cambrian.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
99
+ from ola_vlm.cambrian.conversation import conv_templates, SeparatorStyle
100
+ from ola_vlm.cambrian.model.builder import load_pretrained_model
101
+ from ola_vlm.cambrian.utils import disable_torch_init
102
+ from ola_vlm.cambrian.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
103
+
104
+ disable_torch_init()
105
+ model_name = get_model_name_from_path(args.model_path)
106
+ tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)
107
+
108
+ if 'llama-2' in model_name.lower():
109
+ conv_mode = "cambrian_llama_2"
110
+ elif "v1" in model_name.lower():
111
+ conv_mode = "cambrian_v1"
112
+ elif "mpt" in model_name.lower():
113
+ conv_mode = "mpt"
114
+ else:
115
+ conv_mode = "cambrian_v0"
116
+
117
+ else:
118
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
119
+ from ola_vlm.conversation import conv_templates
120
+ from ola_vlm.model.builder import load_pretrained_model
121
+ from ola_vlm.utils import disable_torch_init
122
+ from ola_vlm.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
123
+
124
+ disable_torch_init()
125
+ model_name = get_model_name_from_path(args.model_path)
126
+ tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)
127
+ if "mistral" in model_name.lower():
128
+ conv_mode = "mistral_instruct"
129
+ elif "v1.6-34b" in model_name.lower():
130
+ conv_mode = "chatml_direct"
131
+ elif "llama3" in model_name.lower():
132
+ conv_mode = "llava_llama_3"
133
+ elif "qwen" in model_name.lower():
134
+ conv_mode = "llava_qwen"
135
+ elif "v1" in model_name.lower():
136
+ conv_mode = "llava_v1"
137
+ elif "phi" in model_name.lower():
138
+ conv_mode = "llava_phi_3"
139
+
140
+ images, prompts, answers = prepare_coco(args.json_file)
141
+
142
+ images = get_chunk(images, args.num_chunks, args.chunk_idx)
143
+ prompts = get_chunk(prompts, args.num_chunks, args.chunk_idx)
144
+ answers = get_chunk(answers, args.num_chunks, args.chunk_idx)
145
+
146
+ if mode == "gen":
147
+ pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(f"playground/jiteshjain_sherlock/stable-diffusion-2-1-unclip", torch_dtype=torch.float16, variant="fp16")
148
+ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
149
+ pipe = pipe.to("cuda")
150
+
151
+ elif mode == "seg":
152
+ oneformer_processor, oneformer, mask_generator = None, None, None
153
+ seg_teacher = model.config.image_seg.get("seg_teacher", "sam")
154
+ if seg_teacher == "sam":
155
+ sam = sam_model_registry["vit_l"](checkpoint="/mnt/projects4jw/jiteshjain_sherlock/oneformer_coco_swin_large")
156
+ sam = sam.to("cuda")
157
+ mask_generator = SamAutomaticMaskGenerator(sam.float())
158
+ else:
159
+ oneformer_processor = OneFormerProcessor.from_pretrained("/mnt/projects4jw/jiteshjain_sherlock/oneformer_coco_swin_large")
160
+ oneformer = OneFormerHead.from_pretrained("/mnt/projects4jw/jiteshjain_sherlock/oneformer_coco_swin_large")
161
+ oneformer = oneformer.to("cuda")
162
+
163
+ elif mode == "depth":
164
+ dav2_cfg = {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
165
+ dav2_backbone = DepthAnythingV2(**dav2_cfg)
166
+ dav2_backbone.load_state_dict(torch.load("/mnt/projects4jw/jiteshjain_sherlock/depth_anything_v2_vitl.pth", map_location='cpu'))
167
+ dav2_backbone = dav2_backbone.to("cuda")
168
+
169
+
170
+ set_seed(42)
171
+
172
+ if mode == "gen":
173
+ try:
174
+ layers = model.config.image_gen["layer_indices"]
175
+ except:
176
+ layers = [i+1 for i in range(32)]
177
+ elif mode == "depth":
178
+ try:
179
+ layers = model.config.image_depth["layer_indices"]
180
+ except:
181
+ layers = [i+1 for i in range(32)]
182
+ elif mode == "seg":
183
+ try:
184
+ layers = model.config.image_seg["layer_indices"]
185
+ except:
186
+ layers = [i+1 for i in range(32)]
187
+
188
+
189
+ os.makedirs(f"plots/probe_scores/{name}/{mode}/", exist_ok=True)
190
+
191
+ if os.path.exists(f"plots/probe_scores/{name}/{mode}/{args.num_chunks}_{args.chunk_idx}.json"):
192
+ with open(f"plots/probe_scores/{name}/{mode}/{args.num_chunks}_{args.chunk_idx}.json", 'r') as f:
193
+ diff_dict = json.load(f)
194
+ else:
195
+ diff_dict = {}
196
+
197
+ img_idx = 0  # image counter for periodic checkpointing (kept separate from the per-layer loop index below)
198
+ from tqdm import tqdm
199
+ for fname, prompt, answer in tqdm(zip(images, prompts, answers), total=len(prompts)):
200
+
201
+ # if fname.split("/")[-1] in diff_dict.keys():
202
+ # continue
203
+
204
+ conv = conv_templates[conv_mode].copy()
205
+ image = load_image(fname)
206
+ image = image.resize((640, 640))
207
+
208
+ image_size = image.size
209
+
210
+ image_tensor = process_images([image], image_processor, model.config)
211
+ if type(image_tensor) is list:
212
+ image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor]
213
+ else:
214
+ image_tensor = image_tensor.to(model.device, dtype=torch.float16)
215
+
216
+ inp = prompt
217
+ if image is not None:
218
+ if model.config.mm_use_im_start_end:
219
+ inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
220
+ else:
221
+ inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
222
+
223
+ conv.append_message(conv.roles[0], inp)
224
+ conv.append_message(conv.roles[1], None)
225
+ prompt = conv.get_prompt()
226
+
227
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
228
+
229
+ with torch.inference_mode():
230
+ out = model.get_visual_interpretations(
231
+ input_ids,
232
+ images=image_tensor,
233
+ image_sizes=[image_size],
234
+ )
235
+
236
+ if mode == "gen":
237
+ embeds = out.image_embs
238
+ feats = get_gen_feats(pipe, image)
239
+ elif mode == "depth":
240
+ embeds = out.depth_embs
241
+ embeds = [emb[0][0] for emb in embeds]
242
+ feats = get_dav2_feats(dav2_backbone, image)
243
+ elif mode == "seg":
244
+ embeds = out.seg_embs
245
+ feats = get_seg_feats(mask_generator, oneformer, oneformer_processor, seg_teacher, image)
246
+
247
+ layer_diff = {}
248
+ for li, emb in enumerate(embeds):
+     emb = emb.to("cuda")
+     # cosine-embedding loss against the teacher feature (target +1, i.e. 1 - cosine similarity)
+     layer_diff[layers[li]] = torch.nn.CosineEmbeddingLoss(reduction="mean")(
+         emb.reshape(1, -1).float(), feats.reshape(1, -1).float(),
+         torch.ones(len(emb)).to(feats.device)
+     ).cpu().item()
+     ic(layer_diff[layers[li]])
256
+ diff_dict[fname.split("/")[-1]] = layer_diff
257
+
258
+ if img_idx % 200 == 0:
259
+ # Save progress intermittently
260
+ with open(f"plots/probe_scores/{name}/{mode}/{args.num_chunks}_{args.chunk_idx}.json", 'w') as f:
261
+ json.dump(diff_dict, f, indent=2)
262
+
263
+ img_idx += 1
264
+
265
+ with open(f"plots/probe_scores/{name}/{mode}/{args.num_chunks}_{args.chunk_idx}.json", 'w') as f:
266
+ json.dump(diff_dict, f, indent=2)
267
+
268
+ if __name__ == "__main__":
269
+ parser = argparse.ArgumentParser()
270
+ parser.add_argument("--model-path", type=str, default="/mnt/projects4jw/jiteshjain_sherlock/llava-v1.5-7b")
271
+ parser.add_argument("--model-base", type=str, default=None)
272
+ parser.add_argument("--json-file", type=str, default="/mnt/projects4jw/jiteshjain_sherlock/datasets/coco/annotations/captions_val2017.json")
273
+ parser.add_argument("--device", type=str, default="cuda")
274
+ parser.add_argument("--temperature", type=float, default=0.2)
275
+ parser.add_argument("--max-new-tokens", type=int, default=10)
276
+ parser.add_argument("--load-8bit", action="store_true")
277
+ parser.add_argument("--load-4bit", action="store_true")
278
+ parser.add_argument("--mode", type=str, default="gen")
279
+ parser.add_argument("--num-chunks", type=int, default=1)
280
+ parser.add_argument("--chunk-idx", type=int, default=0)
281
+ args = parser.parse_args()
282
+ predict(args)
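The per-layer probe score computed above is a cosine-embedding loss against the teacher feature with a target of +1, which equals 1 - cosine similarity: lower is better and 0.0 is a perfect match. A self-contained illustration with random tensors:

import torch

emb, feat = torch.randn(1, 1024), torch.randn(1, 1024)
loss_fn = torch.nn.CosineEmbeddingLoss(reduction="mean")

mismatch = loss_fn(emb, feat, torch.ones(1))   # close to 1.0 for unrelated random vectors
perfect = loss_fn(feat, feat, torch.ones(1))   # tensor(0.) when the embedding equals the feature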
ola_vlm/eval/get_all_stats.py ADDED
@@ -0,0 +1,132 @@
1
+ import json
2
+ import argparse
3
+ from icecream import ic
4
+ import os
5
+ import numpy as np
6
+
7
+
8
+ if __name__ == "__main__":
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument("--results_folder", type=str, default="./playground/data/eval/results")
11
+ parser.add_argument("--ckpt", type=str)
12
+ args = parser.parse_args()
13
+
14
+ scores = {}
15
+
16
+ dirs = os.listdir(f"{args.results_folder}/{args.ckpt}")
17
+ for dir in dirs:
18
+ if args.ckpt in dir and dir not in args.ckpt:
19
+ break
20
+
21
+
22
+ try:
23
+ with open(f"{args.results_folder}/{args.ckpt}/mmstar/merge_score.json", "r") as f:
24
+ data = json.load(f)
25
+ scores["MMStar"] = round(data.get("final score", 0)*100, 1) if data.get("final score") is not None else None
26
+ except:
27
+ scores["MMStar"] = None
28
+
29
+ cv_scores = {}
30
+
31
+ with open(f"{args.results_folder}/{args.ckpt}/cv-bench/merge_score.json", "r") as f:
32
+ data = json.load(f)
33
+ scores["CV-Bench"] = round(data.get("Overall", 0)*100, 1) if data.get("Overall") is not None else None
34
+ cv_scores["CV-Bench (2D)"] = round(data.get("2D", 0)*100, 1) if data.get("2D") is not None else None
35
+ cv_scores["CV-Bench (3D)"] = round(data.get("3D", 0)*100, 1) if data.get("3D") is not None else None
36
+ cv_scores["CV-Bench (Count)"] = round(data.get("Count", 0)*100, 1) if data.get("Count") is not None else None
37
+ cv_scores["CV-Bench (Depth)"] = round(data.get("Depth", 0)*100, 1) if data.get("Depth") is not None else None
38
+ cv_scores["CV-Bench (Relation)"] = round(data.get("Relation", 0)*100, 1) if data.get("Relation") is not None else None
39
+ cv_scores["CV-Bench (Distance)"] = round(data.get("Distance", 0)*100, 1) if data.get("Distance") is not None else None
40
+
41
+
42
+ with open(f"{args.results_folder}/{args.ckpt}/{dir}/results.json", "r") as f:
43
+ results = json.load(f).get("results", {})
44
+ # scores["MME-Cognition"] = round(results.get("mme", {}).get("mme_cognition_score,none", 0), 1) if results.get("mme", {}).get("mme_cognition_score,none") is not None else None
45
+ # scores["MME-Perception"] = round(results.get("mme", {}).get("mme_percetion_score,none", 0), 1) if results.get("mme", {}).get("mme_percetion_score,none") is not None else None
46
+
47
+ scores["Realworld-QA"] = round(results.get("realworldqa", {}).get("exact_match,flexible-extract", 0)*100, 1) if results.get("realworldqa", {}).get("exact_match,flexible-extract") is not None else None
48
+ scores["VizWiz-VQA-Val"] = round(results.get("vizwiz_vqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vizwiz_vqa_val", {}).get("exact_match,none") is not None else None
49
+ # scores["SEEDBench-Image"] = round(results.get("seedbench", {}).get("seed_image,none", 0)*100, 1) if results.get("seedbench", {}).get("seed_image,none") is not None else None
50
+ # scores["VQAv2-Val"] = round(results.get("vqav2_val", {}).get("exact_match,none", 0)*100, 1) if results.get("vqav2_val", {}).get("exact_match,none") is not None else None
51
+
52
+ # scores["Science-QA-Img"] = round(results.get("scienceqa_img", {}).get("exact_match,none", 0)*100, 1) if results.get("scienceqa_img", {}).get("exact_match,none") is not None else None
53
+ scores["MMMU-Val"] = round(results.get("mmmu_val", {}).get("mmmu_acc,none", 0)*100, 1) if results.get("mmmu_val", {}).get("mmmu_acc,none") is not None else None
54
+ # scores["MMBench"] = round(results.get("mmbench_en_dev", {}).get("gpt_eval_score,none", 0), 1) if results.get("mmbench_en_dev", {}).get("gpt_eval_score,none") is not None else None
55
+
56
+ # scores["NaturalBench"] = round(results.get("naturalbench", {}).get("mme_score,none", 0)*100, 1) if results.get("naturalbench", {}).get("mme_score,none") is not None else None
57
+
58
+ # scores["GQA"] = round(results.get("gqa", {}).get("exact_match,none", 0)*100, 1) if results.get("gqa", {}).get("exact_match,none") is not None else None
59
+ scores["POPE"] = round(results.get("pope", {}).get("pope_accuracy,none", 0)*100, 1) if results.get("pope", {}).get("pope_accuracy,none") is not None else None
60
+ scores["MMVet"] = round(results.get("mmvet", {}).get("gpt_eval_score", 0)*100, 1) if results.get("mmvet", {}).get("gpt_eval_score") is not None else None
61
+ scores["OK-VQA"] = round(results.get("ok_vqa", {}).get("exact_match,none", 0)*100, 1) if results.get("ok_vqa", {}).get("exact_match,none") is not None else None
62
+ # scores["ChartQA"] = round(results.get("chartqa", {}).get("relaxed_overall,none", 0)*100, 1) if results.get("chartqa", {}).get("relaxed_overall,none") is not None else None
63
+ # scores["DocVQA"] = round(results.get("docvqa_val", {}).get("anls,none", 0)*100, 1) if results.get("docvqa_val", {}).get("anls,none") is not None else None
64
+ # scores["TextVQA"] = round(results.get("textvqa_val", {}).get("exact_match,none", 0)*100, 1) if results.get("textvqa_val", {}).get("exact_match,none") is not None else None
65
+
66
+ try:
67
+ with open(f"{args.results_folder}/{args.ckpt}/mmvp/merge_score.json", "r") as f:
68
+ data = json.load(f)
69
+ scores["MMVP"] = round(data.get("mmvp", 0)*100, 1) if data.get("mmvp") is not None else None
70
+ except:
71
+ scores["MMVP"] = None
72
+
73
+ keys = list(scores.keys())
74
+ str_scores = [str(scores[key]) if scores[key] is not None else 'None' for key in keys]
75
+
76
+ abl_keys = ["CV-Bench", "MMStar", "VizWiz-VQA-Val", "MMVet", "MMVP", "MMMU-Val"]
77
+
78
+ abl_scores = [scores[key] for key in abl_keys if scores[key] is not None]
79
+
80
+ small_abl_keys = ["CV-Bench", "MMStar", "OK-VQA", "MMMU-Val"]
81
+ small_abl_scores = [scores[key] for key in small_abl_keys if scores[key] is not None]
82
+
83
+ cv_bench_keys = ["CV-Bench (2D)", "CV-Bench (3D)", "CV-Bench (Count)", "CV-Bench (Depth)", "CV-Bench (Relation)", "CV-Bench (Distance)"]
84
+ cv_bench_scores = [cv_scores[key] for key in cv_bench_keys if cv_scores[key] is not None]
85
+
86
+ # cat_scores = {}
87
+ # if os.path.exists(f"{args.results_folder}/{args.ckpt}/categorized_scores.json"):
88
+ # with open(f"{args.results_folder}/{args.ckpt}/categorized_scores.json", "r") as f:
89
+ # cat_scores = json.load(f)
90
+ # cat_scores.pop("Both")
91
+
92
+ print("\n====================All-Scores===========================================")
93
+ print(" & ".join(keys))
94
+ print(" & ".join(str_scores))
95
+ if abl_scores:
96
+ print("\n====================Abl-Scores===========================================")
97
+ print(" & ".join(abl_keys))
98
+ print(" & ".join([str(a) for a in abl_scores]))
99
+ print(f"Ablation Avg: {round(np.mean(abl_scores), 1)}")
100
+ else:
101
+ print("Ablation Avg: None")
102
+
103
+ if small_abl_scores:
104
+ print("\n====================Small-Abl-Scores===========================================")
105
+ print(" & ".join(small_abl_keys))
106
+ print(" & ".join([str(a) for a in small_abl_scores]))
107
+ print(f"Small-Ablation Avg: {round(np.mean(small_abl_scores), 1)}")
108
+ else:
109
+ print("Small-Ablation Avg: None")
110
+
111
+ if cv_bench_scores:
112
+ print("\n====================CV-Bench-Scores===========================================")
113
+ print(" & ".join(cv_bench_keys))
114
+ print(" & ".join([str(c) for c in cv_bench_scores]))
115
+ print(f"CV-Bench Overall: {round(np.mean(cv_bench_scores[:2]), 1)}")
116
+ else:
117
+ print("CV-Bench Avg: None")
118
+
119
+ # if cat_scores is not None:
120
+ # print("\n====================Categorized-Scores===========================================")
121
+ # cats = []
122
+ # class_scores = []
123
+ # benches = []
124
+ # for k, v in cat_scores.items():
125
+ # cats.append(k)
126
+ # for bench, score in v.items():
127
+ # benches.append(bench)
128
+ # class_scores.append(round(score*100, 1))
129
+ # print(" & ".join(cats))
130
+ # print(" & ".join(benches))
131
+ # print(" & ".join([str(c) for c in class_scores]))
132
+ # print("================================================================")
ola_vlm/eval/get_probe_task_scores.py ADDED
@@ -0,0 +1,197 @@
1
+ import argparse
2
+ import torch
3
+ from PIL import Image
4
+ import json
5
+ import os
6
+ from tqdm import tqdm
7
+ import warnings
8
+ import random
9
+ import numpy as np
10
+ import multiprocessing as mp
11
+ from ola_vlm.eval.probe_metrics.fid_score import compute_fid
12
+ from analyze.analyze_utils import prepare_coco, prepare_da2k, parse_json
13
+ from multiprocessing import Pool
14
+ warnings.filterwarnings("ignore")
15
+
16
+ def set_seed(seed):
17
+ random.seed(seed)
18
+ np.random.seed(seed)
19
+ torch.manual_seed(seed)
20
+ torch.cuda.manual_seed_all(seed)
21
+
22
+ def load_image(image_file):
23
+ image = Image.open(image_file)
24
+ return image
25
+
26
+ def mask_iou(gt, pred):
27
+ gt = np.array(gt).astype(np.uint8)
28
+ pred = np.array(pred).astype(np.uint8)
29
+
30
+ iou_scores = []
31
+ for category in np.unique(gt):
32
+ if category == 255:
33
+ continue
34
+ gt_mask = (gt == category)
35
+ pred_mask = (pred == category)
36
+
37
+ intersection = np.logical_and(gt_mask, pred_mask)
38
+ union = np.logical_or(gt_mask, pred_mask)
39
+ if np.sum(union) == 0:
40
+ iou_scores.append(1.0)
41
+ else:
42
+ iou_scores.append(np.sum(intersection) / np.sum(union))
43
+
44
+ return np.mean(iou_scores)
45
+
46
+ def load_json(path):
47
+ with open(path) as f:
48
+ data = json.load(f)
49
+ return data
50
+
51
+ # Helper function for multiprocessing in evaluate_seg
52
+ def process_iou(args):
53
+ gt_path, layer_folder, dir, fname = args
54
+ gt_data = load_image(os.path.join(gt_path, fname.replace("jpg", "png")))
55
+ pred = load_image(os.path.join(layer_folder, dir, fname))
56
+ return mask_iou(gt_data, pred)
57
+
58
+ def evaluate_seg(args):
59
+ images, _, _ = prepare_coco("/mnt/vlpdatasets/coco/annotations/captions_val2017.json")
60
+ fnames = [img.split("/")[-1] for img in images][:8]
61
+
62
+ name = args.ckpt
63
+ gt_path = "/mnt/vlpdatasets/sherlock/eval/coco/annotations/panoptic_semseg_val2017"
64
+ layer_folder = f"plots/probes_task/{name}/seg"
65
+
66
+ scores = {"m_iou": []}
67
+ dirs = os.listdir(layer_folder)
68
+
69
+ with mp.Pool() as pool:
70
+ for dir in dirs:
71
+ print(f"Evaluating mask iou for {dir}")
72
+ args_list = [(gt_path, layer_folder, dir, fname) for fname in fnames]
73
+ m_iou = list(tqdm(pool.imap(process_iou, args_list), total=len(args_list), desc=f"Processing {dir}"))
74
+ scores["m_iou"].append({dir: round(np.mean(m_iou) * 100, 2)})
75
+
76
+ return scores
77
+
78
+ # Helper function for multiprocessing in evaluate_depth
79
+ def process_depth(args):
80
+ depth_map, point_1, point_2, answer = args
81
+ return score_points(depth_map, point_1, point_2, answer)
82
+
83
+ def score_points(depth_map, point_1, point_2, answer):
84
+ pt1_depth = depth_map[point_1[0], point_1[1]]
85
+ pt2_depth = depth_map[point_2[0], point_2[1]]
86
+
87
+ if isinstance(pt1_depth, np.ndarray):
88
+ pt1_depth = pt1_depth.mean()
89
+ if isinstance(pt2_depth, np.ndarray):
90
+ pt2_depth = pt2_depth.mean()
91
+
92
+ return (answer == "point2") if pt1_depth < pt2_depth else (answer == "point1")
93
+
94
+ def load_and_process_image(args):
95
+ folder, fname, entry = args
96
+ gt_path = os.path.join("/mnt/vlpdatasets/sherlock/plots/dav2_da2k", fname.split("/")[-1].split(".")[0] + ".jpg")
97
+ pred_path = os.path.join(folder, fname.split("/")[-1])
98
+
99
+ gt = load_image(gt_path)
100
+ pred = load_image(pred_path)
101
+ pred = pred.resize(gt.size)
102
+ pred = np.array(pred) / 255.0
103
+
104
+ # Process depth for each entry within the image
105
+ return [process_depth((pred, entry["point1"], entry["point2"], entry["closer_point"])) for entry in entry["entries"]]
106
+
107
+ def score_da2k_parallel(folder, anns):
108
+ pred_scores = []
109
+ tasks = [(folder, fname, {"entries": entries}) for fname, entries in anns.items()]
110
+
111
+ with Pool() as pool:
112
+ results = list(tqdm(pool.imap(load_and_process_image, tasks), total=len(tasks), desc="Processing images"))
113
+ for res in results:
114
+ if res is not None:
115
+ pred_scores.extend(res)
116
+
117
+ return np.mean(pred_scores) if pred_scores else 0
118
+
119
+ def evaluate_depth(args):
120
+ anns = parse_json("/mnt/vlpdatasets/sherlock/eval/DA-2K/DA-2K/annotations.json")
121
+
122
+ name = args.ckpt
123
+ layer_folder = f"plots/probes_task/{name}/depth"
124
+
125
+ scores = {"da2k_acc": []}
126
+ dirs = os.listdir(layer_folder)
127
+
128
+ for dir in dirs:
129
+ print(f"Evaluating da2k_acc for {dir}")
130
+ pred_scores = score_da2k_parallel(os.path.join(layer_folder, dir), anns)
131
+ scores["da2k_acc"].append({dir: round(pred_scores * 100, 2)})
132
+
133
+ return scores
134
+
135
+ def evaluate_fid(args):
136
+ name = args.ckpt
137
+ gt_path = os.path.join("plots/coco_gt")
138
+ layer_folder = f"plots/probes_task/{name}/gen"
139
+
140
+ scores = {"fid": []}
141
+ dirs = os.listdir(layer_folder)
142
+
143
+ for dir in dirs:
144
+ print(f"Evaluating fid for {dir}")
145
+ paths = [gt_path, os.path.join(layer_folder, dir)]
146
+ fid_score = compute_fid(paths)
147
+ scores["fid"].append({dir.replace("_", "-"): round(fid_score, 2)})
148
+
149
+ return scores
150
+
151
+ import re
152
+
153
+ def print_sorted_scores(scores, metric_name):
154
+ # Extract numeric part from layer names for sorting
155
+ sorted_scores = sorted(scores[metric_name], key=lambda x: int(re.search(r'\d+', list(x.keys())[0]).group()))
156
+
157
+ layers = [list(score.keys())[0] for score in sorted_scores]
158
+ values = [list(score.values())[0] for score in sorted_scores]
159
+
160
+ # Print sorted layers and scores in the requested format
161
+ print("\n=========================Results===============================")
162
+ print(" & ".join(layers))
163
+ print(" & ".join([f"{value}" for value in values]))
164
+ print(f"Average score: {round(np.mean(values), 2)}")
165
+ print("================================================================")
166
+
167
+ if __name__ == "__main__":
168
+ parser = argparse.ArgumentParser()
169
+ parser.add_argument("--ckpt", type=str, default="llava-1.5-7b")
170
+ parser.add_argument("--mode", type=str, default="gen")
171
+ args = parser.parse_args()
172
+
173
+ mode = args.mode
174
+
175
+ if mode == "gen":
176
+ scores = evaluate_fid(args)
177
+
178
+ print("\n=========================FID-Scores===============================")
179
+ for score in scores["fid"]:
180
+ for key, value in score.items():
181
+ print(f"{key} -> {value}")
182
+ print("================================================================")
183
+
184
+ elif mode == "seg":
185
+ scores = evaluate_seg(args)
186
+
187
+ print("\n=========================Mask-IOU===============================")
188
+ print_sorted_scores(scores, "m_iou")
189
+
190
+ elif mode == "depth":
191
+ scores = evaluate_depth(args)
192
+
193
+ print("\n=========================DA2K-Acc===============================")
194
+ print_sorted_scores(scores, "da2k_acc")
195
+
196
+ else:
197
+ print("Invalid mode. Choose from [gen, seg, depth]")
ola_vlm/eval/get_sherlock_dsg_scores.py ADDED
@@ -0,0 +1,49 @@
1
+ import argparse
2
+ import torch
3
+
4
+ import json
5
+ import os
6
+ from tqdm import tqdm
7
+ from icecream import ic
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
10
+ import random
11
+ import numpy as np
12
+
13
+
14
+ def set_seed(seed):
15
+ random.seed(seed)
16
+ np.random.seed(seed)
17
+ torch.manual_seed(seed)
18
+ torch.cuda.manual_seed_all(seed)
19
+
20
+ if __name__ == "__main__":
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument("--ckpt", type=str, default="llava-1.5-7b")
23
+ parser.add_argument("--mode", type=str, default="gen")
24
+ args = parser.parse_args()
25
+
26
+ mode = args.mode
27
+ name = args.ckpt.split("/")[-1]
28
+
29
+ with open(f'plots/probe_scores/{name}/{args.mode}.json') as file:
30
+ scores = json.load(file)
31
+
32
+ layer_scores = {}
33
+
34
+ for img, v in tqdm(scores.items()):
35
+ for layer, score in v.items():
36
+ if layer not in layer_scores:
37
+ layer_scores[layer] = []
38
+ layer_scores[layer].append(score)
39
+
40
+ for layer, scores in layer_scores.items():
41
+ layer_scores[layer] = np.mean(scores)
42
+
43
+ with open(f"plots/probe_scores/{name}/{mode}_scores.json", "w") as f:
44
+ json.dump(layer_scores, f, indent=2)
45
+
46
+ print(f"================Scores: {mode}===============")
47
+ for layer, score in layer_scores.items():
48
+ print(f"Layer: {layer}, Score: {score}")
49
+ print("===========================================")
ola_vlm/eval/merge_json.py ADDED
@@ -0,0 +1,30 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+
5
+ parser = argparse.ArgumentParser(
6
+ description='Probe eval')
7
+ parser.add_argument('--ckpt',
8
+ help='ckpt',
9
+ default='probe_llava-1.5-vicuna-7b-lr-1e-3')
10
+ parser.add_argument('--mode',
11
+ help='mode',
12
+ default='gen')
13
+ parser.add_argument("--num-chunks", type=int, default=1)
14
+
15
+
16
+ def save_merged_json(data, output_file):
17
+ with open(output_file, 'w') as file:
18
+ json.dump(data, file, indent=4)
19
+
20
+ if __name__ == "__main__":
21
+ args = parser.parse_args()
22
+ merge_data = {}
23
+ name = args.ckpt.split("/")[-1]
24
+
25
+ for i in range(args.num_chunks):
26
+ with open(f'plots/probe_scores/{name}/{args.mode}/{args.num_chunks}_{i}.json', 'r') as file:
27
+ data = json.load(file)
28
+ merge_data.update(data)
29
+
30
+ save_merged_json(merge_data, f'plots/probe_scores/{name}/{args.mode}.json')
ola_vlm/eval/mmstar/evaluate/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .mmstar import MMStar_eval
ola_vlm/eval/mmstar/evaluate/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (183 Bytes).
 
ola_vlm/eval/mmstar/evaluate/__pycache__/mmstar.cpython-310.pyc ADDED
Binary file (2.45 kB).
 
ola_vlm/eval/mmstar/evaluate/mmstar.py ADDED
@@ -0,0 +1,87 @@
1
+ from ola_vlm.eval.mmstar.smp import *
2
+ from copy import deepcopy
3
+
4
+
5
+ def MMStar_eval(eval_file):
6
+ MMStar_score_l2 = {
7
+ 'coarse perception': {
8
+ 'image scene and topic': 0,
9
+ 'image style & quality': 0,
10
+ 'image emotion': 0
11
+ },
12
+ 'fine-grained perception': {
13
+ 'object counting': 0,
14
+ 'recognition': 0,
15
+ 'localization': 0
16
+ },
17
+ 'instance reasoning': {
18
+ 'single-instance reasoning': 0,
19
+ 'cross-instance attribute reasoning': 0,
20
+ 'cross-instance relation reasoning': 0
21
+ },
22
+ 'logical reasoning': {
23
+ 'code & sequence reasoning': 0,
24
+ 'diagram reasoning': 0,
25
+ 'common reasoning': 0
26
+ },
27
+ 'science & technology': {
28
+ 'biology & chemistry & physics': 0,
29
+ 'electronics & energy & mechanical eng.': 0,
30
+ 'geography & earth science & agriculture': 0
31
+ },
32
+ 'math': {
33
+ 'geometry': 0,
34
+ 'numeric commonsense and calculation': 0,
35
+ 'statistical reasoning': 0
36
+ },
37
+ }
38
+ MMStar_counter = deepcopy(MMStar_score_l2)
39
+ logger = get_logger('Evaluation')
40
+
41
+ data = load(eval_file)
42
+ lt = len(data)
43
+ lines = [data[i] for i in range(lt)]
44
+ for i in tqdm(range(len(lines))):
45
+ line = lines[i]
46
+ predict = str(line['prediction'])
47
+ answers = str(line['answer'])
48
+ category = str(line['category'])
49
+ l2_category = str(line['l2_category'])
50
+ MMStar_counter[category][l2_category] += 1
51
+
52
+ answer = answers.lower().strip().replace('\n', ' ')
53
+ predict = predict.lower().strip().replace('\n', ' ')
54
+
55
+ try:
56
+ if answer == predict[0]:
57
+ MMStar_score_l2[category][l2_category] += 1
58
+ elif predict[0] == '(' and answer == predict[1]:
59
+ MMStar_score_l2[category][l2_category] += 1
60
+ elif predict[0:7] == 'option ' and answer == predict[7]:
61
+ MMStar_score_l2[category][l2_category] += 1
62
+ elif predict[0:14] == 'the answer is ' and answer == predict[14]:
63
+ MMStar_score_l2[category][l2_category] += 1
64
+ except Exception as e:
65
+ pass
66
+
67
+ MMStar_score = {}
68
+ MMStar_score['final score'] = 0
69
+ for k, v in MMStar_score_l2.items():
70
+ MMStar_score[k] = 0
71
+ for l2_k, l2_v in v.items():
72
+ MMStar_score[f'{k}({l2_k})'] = float(l2_v) / \
73
+ float(MMStar_counter[k][l2_k])
74
+ MMStar_score[k] += l2_v
75
+ MMStar_score['final score'] += MMStar_score[k]
76
+ MMStar_score[k] = float(MMStar_score[k]) / 250.0
77
+ MMStar_score['final score'] = float(MMStar_score['final score']) / 1500.0
78
+
79
+ score_pth = eval_file.replace('.jsonl', '_score.json')
80
+ dump(MMStar_score, score_pth)
81
+ logger.info(
82
+ f'MMStar_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
83
+ logger.info('Score: ')
84
+ for key, value in MMStar_score.items():
85
+ logger.info('{}:{}'.format(key, value))
86
+
87
+ return MMStar_score
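For reference, the shape of one record that MMStar_eval reads from the results JSONL (field values invented). With this record the second matching rule fires: the lower-cased prediction starts with '(' and its second character equals the answer.

record = {
    "prediction": "(B) a cat sitting on a sofa",
    "answer": "B",
    "category": "coarse perception",
    "l2_category": "image scene and topic",
}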
ola_vlm/eval/mmstar/smp/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .file import *
2
+ from .misc import *
3
+ from .log import *
ola_vlm/eval/mmstar/smp/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (188 Bytes).
 
ola_vlm/eval/mmstar/smp/__pycache__/file.cpython-310.pyc ADDED
Binary file (7.12 kB).
 
ola_vlm/eval/mmstar/smp/__pycache__/log.cpython-310.pyc ADDED
Binary file (1.02 kB).
 
ola_vlm/eval/mmstar/smp/__pycache__/misc.cpython-310.pyc ADDED
Binary file (5.18 kB).
 
ola_vlm/eval/mmstar/smp/__pycache__/vlm.cpython-310.pyc ADDED
Binary file (4.99 kB).
 
ola_vlm/eval/mmstar/smp/file.py ADDED
@@ -0,0 +1,147 @@
1
+ import csv
2
+ import hashlib
3
+ import json
4
+ import os
5
+ import os.path as osp
6
+ import pickle
7
+ import time
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+
13
+ class NumpyEncoder(json.JSONEncoder):
14
+ def default(self, obj):
15
+ if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
16
+ np.int16, np.int32, np.int64, np.uint8,
17
+ np.uint16, np.uint32, np.uint64)):
18
+ return int(obj)
19
+ elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
20
+ return float(obj)
21
+ elif isinstance(obj, (np.complex_, np.complex64, np.complex128)):
22
+ return {'real': obj.real, 'imag': obj.imag}
23
+ elif isinstance(obj, (np.ndarray,)):
24
+ return obj.tolist()
25
+ elif isinstance(obj, (np.bool_)):
26
+ return bool(obj)
27
+ elif isinstance(obj, (np.void)):
28
+ return None
29
+ return json.JSONEncoder.default(self, obj)
30
+
31
+ # LOAD & DUMP
32
+ def dump(data, f, **kwargs):
33
+ def dump_pkl(data, pth, **kwargs):
34
+ pickle.dump(data, open(pth, 'wb'))
35
+
36
+ def dump_json(data, pth, **kwargs):
37
+ json.dump(data, open(pth, 'w'), indent=4, ensure_ascii=False, cls=NumpyEncoder)
38
+
39
+ def dump_jsonl(data, f, **kwargs):
40
+ lines = [json.dumps(x, ensure_ascii=False, cls=NumpyEncoder) for x in data]
41
+ with open(f, 'w', encoding='utf8') as fout:
42
+ fout.write('\n'.join(lines))
43
+
44
+ def dump_xlsx(data, f, **kwargs):
45
+ data.to_excel(f, index=False, engine='xlsxwriter')
46
+
47
+ def dump_csv(data, f, quoting=csv.QUOTE_ALL):
48
+ data.to_csv(f, index=False, encoding='utf-8', quoting=quoting)
49
+
50
+ def dump_tsv(data, f, quoting=csv.QUOTE_ALL):
51
+ data.to_csv(f, sep='\t', index=False, encoding='utf-8', quoting=quoting)
52
+
53
+ handlers = dict(pkl=dump_pkl, json=dump_json, jsonl=dump_jsonl, xlsx=dump_xlsx, csv=dump_csv, tsv=dump_tsv)
54
+ suffix = f.split('.')[-1]
55
+ return handlers[suffix](data, f, **kwargs)
56
+
57
+ def load(f):
58
+ def load_pkl(pth):
59
+ return pickle.load(open(pth, 'rb'))
60
+
61
+ def load_json(pth):
62
+ return json.load(open(pth, 'r', encoding='utf-8'))
63
+
64
+ def load_jsonl(f):
65
+ lines = open(f, encoding='utf-8').readlines()
66
+ lines = [x.strip() for x in lines]
67
+ if lines[-1] == '':
68
+ lines = lines[:-1]
69
+ data = [json.loads(x) for x in lines]
70
+ return data
71
+
72
+ def load_xlsx(f):
73
+ return pd.read_excel(f)
74
+
75
+ def load_csv(f):
76
+ return pd.read_csv(f)
77
+
78
+ def load_tsv(f):
79
+ return pd.read_csv(f, sep='\t')
80
+
81
+ handlers = dict(pkl=load_pkl, json=load_json, jsonl=load_jsonl, xlsx=load_xlsx, csv=load_csv, tsv=load_tsv)
82
+ suffix = f.split('.')[-1]
83
+ return handlers[suffix](f)
84
+
85
+ def download_file(url, filename=None):
86
+ import urllib.request
87
+
88
+ from tqdm import tqdm
89
+
90
+ class DownloadProgressBar(tqdm):
91
+ def update_to(self, b=1, bsize=1, tsize=None):
92
+ if tsize is not None:
93
+ self.total = tsize
94
+ self.update(b * bsize - self.n)
95
+
96
+ if filename is None:
97
+ filename = url.split('/')[-1]
98
+
99
+ with DownloadProgressBar(unit='B', unit_scale=True,
100
+ miniters=1, desc=url.split('/')[-1]) as t:
101
+ urllib.request.urlretrieve(url, filename=filename, reporthook=t.update_to)
102
+ return filename
103
+
104
+ def ls(dirname='.', match='', mode='all', level=1):
105
+ if dirname == '.':
106
+ ans = os.listdir(dirname)
107
+ else:
108
+ ans = [osp.join(dirname, x) for x in os.listdir(dirname)]
109
+ assert mode in ['all', 'dir', 'file']
110
+ assert level >= 1 and isinstance(level, int)
111
+ if level == 1:
112
+ ans = [x for x in ans if match in x]
113
+ if mode == 'dir':
114
+ ans = [x for x in ans if osp.isdir(x)]
115
+ elif mode == 'file':
116
+ ans = [x for x in ans if not osp.isdir(x)]
117
+ else:
118
+ ans = [x for x in ans if osp.isdir(x)]
119
+ res = []
120
+ for d in ans:
121
+ res.extend(ls(d, match=match, mode=mode, level=level-1))
122
+ ans = res
123
+ return ans
124
+
125
+ def mrlines(fname, sp='\n'):
126
+ f = open(fname).read().split(sp)
127
+ while f != [] and f[-1] == '':
128
+ f = f[:-1]
129
+ return f
130
+
131
+ def mwlines(lines, fname):
132
+ with open(fname, 'w') as fout:
133
+ fout.write('\n'.join(lines))
134
+
135
+ def md5(file_pth):
136
+ with open(file_pth, 'rb') as f:
137
+ hash = hashlib.new('md5')
138
+ for chunk in iter(lambda: f.read(2**20), b''):
139
+ hash.update(chunk)
140
+ return str(hash.hexdigest())
141
+
142
+ def last_modified(pth):
143
+ stamp = osp.getmtime(pth)
144
+ m_ti = time.ctime(stamp)
145
+ t_obj = time.strptime(m_ti)
146
+ t = time.strftime('%Y%m%d%H%M%S', t_obj)[2:]
147
+ return t
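dump and load above dispatch purely on the file suffix, so callers stay format-agnostic; a short usage sketch with an invented file name:

from ola_vlm.eval.mmstar.smp.file import dump, load

dump({"final score": 0.432}, "mmstar_score.json")   # routed to dump_json by the ".json" suffix
scores = load("mmstar_score.json")                  # routed to load_json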
ola_vlm/eval/mmstar/smp/log.py ADDED
@@ -0,0 +1,43 @@
1
+ import logging
2
+
3
+ logger_initialized = {}
4
+
5
+ def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
6
+ logger = logging.getLogger(name)
7
+ if name in logger_initialized:
8
+ return logger
9
+
10
+ for logger_name in logger_initialized:
11
+ if name.startswith(logger_name):
12
+ return logger
13
+
14
+ stream_handler = logging.StreamHandler()
15
+ handlers = [stream_handler]
16
+
17
+ try:
18
+ import torch.distributed as dist
19
+ if dist.is_available() and dist.is_initialized():
20
+ rank = dist.get_rank()
21
+ else:
22
+ rank = 0
23
+ except ImportError:
24
+ rank = 0
25
+
26
+ if rank == 0 and log_file is not None:
27
+ file_handler = logging.FileHandler(log_file, file_mode)
28
+ handlers.append(file_handler)
29
+
30
+ formatter = logging.Formatter(
31
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
32
+ for handler in handlers:
33
+ handler.setFormatter(formatter)
34
+ handler.setLevel(log_level)
35
+ logger.addHandler(handler)
36
+
37
+ if rank == 0:
38
+ logger.setLevel(log_level)
39
+ else:
40
+ logger.setLevel(logging.ERROR)
41
+
42
+ logger_initialized[name] = True
43
+ return logger
ola_vlm/eval/mmstar/smp/misc.py ADDED
@@ -0,0 +1,174 @@
1
+ # flake8: noqa: F401, F403
2
+ import abc
3
+ import argparse
4
+ import copy as cp
5
+ import csv
6
+ import datetime
7
+ import multiprocessing as mp
8
+ import os
9
+ import os.path as osp
10
+ import random as rd
11
+ import shutil
12
+ import subprocess
13
+ import warnings
14
+ from collections import OrderedDict, defaultdict
15
+ from multiprocessing import Pool, current_process
16
+
17
+ import matplotlib.pyplot as plt
18
+ import pandas as pd
19
+ import requests
20
+ import seaborn as sns
21
+ from huggingface_hub import scan_cache_dir
22
+ from sty import bg, ef, fg, rs
23
+ from tabulate import tabulate, tabulate_formats
24
+ from tqdm import tqdm
25
+
26
+
27
+ def process_punctuation(inText):
28
+ import re
29
+ outText = inText
30
+ punct = [
31
+ ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-',
32
+ '>', '<', '@', '`', ',', '?', '!'
33
+ ]
34
+ commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605
35
+ periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605
36
+ for p in punct:
37
+ if (p + ' ' in inText or ' ' + p in inText) or (re.search(
38
+ commaStrip, inText) is not None):
39
+ outText = outText.replace(p, '')
40
+ else:
41
+ outText = outText.replace(p, ' ')
42
+ outText = periodStrip.sub('', outText, re.UNICODE)
43
+ return outText
44
+
45
+
46
+ def h2r(value):
47
+ if value[0] == '#':
48
+ value = value[1:]
49
+ assert len(value) == 6
50
+ return tuple(int(value[i:i + 2], 16) for i in range(0, 6, 2))
51
+
52
+
53
+ def r2h(rgb):
54
+ return '#%02x%02x%02x' % rgb
55
+
56
+
57
+ def colored(s, color):
58
+ if isinstance(color, str):
59
+ if hasattr(fg, color):
60
+ return getattr(fg, color) + s + fg.rs
61
+ color = h2r(color)
62
+ return fg(*color) + s + fg.rs
63
+
64
+
65
+ def istype(s, type):
66
+ if isinstance(s, type):
67
+ return True
68
+ try:
69
+ return isinstance(eval(s), type)
70
+ except Exception as _:
71
+ return False
72
+
73
+
74
+ def bincount(lst):
75
+ bins = defaultdict(lambda: 0)
76
+ for item in lst:
77
+ bins[item] += 1
78
+ return bins
79
+
80
+
81
+ def get_cache_path(repo_id):
82
+ hf_cache_info = scan_cache_dir()
83
+ repos = list(hf_cache_info.repos)
84
+ repo = None
85
+ for r in repos:
86
+ if r.repo_id == repo_id:
87
+ repo = r
88
+ break
89
+ if repo is None:
90
+ return None
91
+ revs = list(repo.revisions)
92
+ rev2keep, last_modified = None, 0
93
+ for rev in revs:
94
+ if rev.last_modified > last_modified:
95
+ rev2keep, last_modified = rev, rev.last_modified
96
+ if rev2keep is None:
97
+ return None
98
+ return str(rev2keep.snapshot_path)
99
+
100
+
101
+ def proxy_set(s):
102
+ import os
103
+ for key in ['http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY']:
104
+ os.environ[key] = s
105
+
106
+
107
+ def get_rank_and_world_size():
108
+ local_rank = int(os.environ.get("RANK", 0))
109
+ world_size = int(os.environ.get("WORLD_SIZE", 1))
110
+ return local_rank, world_size
111
+
112
+
113
+ def get_local_rank_and_world_size():
114
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
115
+ world_size = int(os.environ.get("WORLD_SIZE", 1))
116
+ return local_rank, world_size
117
+
118
+
119
+ def splitlen(s, sym='/'):
120
+ return len(s.split(sym))
121
+
122
+
123
+ def listinstr(lst, s):
124
+ assert isinstance(lst, list)
125
+ for item in lst:
126
+ if item in s:
127
+ return True
128
+ return False
129
+
130
+
131
+ def d2df(D):
132
+ return pd.DataFrame({x: [D[x]] for x in D})
133
+
134
+
135
+ def cn_string(s):
136
+ import re
137
+ if re.search(u'[\u4e00-\u9fff]', s):
138
+ return True
139
+ return False
140
+
141
+
142
+ try:
143
+ import decord
144
+ except ImportError:
145
+ pass
146
+
147
+
148
+ def timestr(second=True, minute=False):
149
+ s = datetime.datetime.now().strftime('%Y%m%d%H%M%S')[2:]
150
+ if second:
151
+ return s
152
+ elif minute:
153
+ return s[:-2]
154
+ else:
155
+ return s[:-4]
156
+
157
+
158
+ def dict_merge(dct, merge_dct):
159
+ for k, _ in merge_dct.items():
160
+ if (k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], dict)): # noqa
161
+ dict_merge(dct[k], merge_dct[k])
162
+ else:
163
+ dct[k] = merge_dct[k]
164
+
165
+
166
+ def youtube_dl(idx):
167
+ cmd = f'youtube-dl -f best -f mp4 "{idx}" -o {idx}.mp4'
168
+ os.system(cmd)
169
+
170
+
171
+ def run_command(cmd):
172
+ if isinstance(cmd, str):
173
+ cmd = cmd.split()
174
+ return subprocess.check_output(cmd)
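A few of the small helpers above in action (a minimal sketch; importing misc.py pulls in the heavier dependencies listed at its top, e.g. pandas, seaborn and sty, which are assumed to be installed):

from ola_vlm.eval.mmstar.smp.misc import listinstr, bincount, d2df, timestr

listinstr(["cat", "dog"], "the dog sat down")   # True: "dog" occurs in the string
bincount(["A", "B", "A"])                       # per-item counts, e.g. {"A": 2, "B": 1}
d2df({"model": "ola-vlm", "score": 0.5})        # one-row pandas DataFrame
timestr()                                       # 'YYMMDDHHMMSS'-style timestamp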
ola_vlm/eval/model_cvbench_loader.py ADDED
@@ -0,0 +1,166 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from ola_vlm.conversation import conv_templates, SeparatorStyle
10
+ from ola_vlm.model.builder import load_pretrained_model
11
+ from ola_vlm.utils import disable_torch_init
12
+ from ola_vlm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+ from torch.utils.data import Dataset, DataLoader
14
+ from datasets import load_dataset
15
+ from PIL import Image
16
+ import math
17
+
18
+
19
+ def split_list(lst, n):
20
+ """Split a list into n (roughly) equal-sized chunks"""
21
+ chunk_size = math.ceil(len(lst) / n) # ceiling division
22
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
+ def get_chunk(lst, n, k):
26
+ chunks = split_list(lst, n)
27
+ return chunks[k]
28
+
29
+ def load_jsonl(f):
30
+ lines = open(f, encoding='utf-8').readlines()
31
+ lines = [x.strip() for x in lines]
32
+ if lines[-1] == '':
33
+ lines = lines[:-1]
34
+ data = [json.loads(x) for x in lines]
35
+ return data
36
+
37
+ def prepare_CVBench(path):
38
+ dataset = load_jsonl(os.path.join(path, 'test.jsonl'))
39
+ data = []
40
+ for i in range(len(dataset)):
41
+ d = {
42
+ "image": os.path.join(path, dataset[i]["filename"]),
43
+ "question": dataset[i]["prompt"] + "\nOnly answer the option as the output. For example, if your answer is the option A, answer (A).",
44
+ "answer": dataset[i]["answer"],
45
+ "task": dataset[i]["task"],
46
+ "source": dataset[i]["source"]
47
+ }
48
+ data.append(d)
49
+ return data
50
+
51
+
52
+ # Custom dataset class
53
+ class CustomDataset(Dataset):
54
+ def __init__(self, data, tokenizer, image_processor, model_config):
55
+ self.questions = data
56
+ self.tokenizer = tokenizer
57
+ self.image_processor = image_processor
58
+ self.model_config = model_config
59
+
60
+ def __getitem__(self, index):
61
+ d = self.questions[index]
62
+ qs = d["question"]
63
+ image_file = d["image"]
64
+ ans = d["answer"]
65
+ task = d["task"]
66
+ source = d["source"]
67
+
68
+ if self.model_config.mm_use_im_start_end:
69
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
70
+ else:
71
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
72
+
73
+ conv = conv_templates[args.conv_mode].copy()
74
+ conv.append_message(conv.roles[0], qs)
75
+ conv.append_message(conv.roles[1], None)
76
+ prompt = conv.get_prompt()
77
+
78
+ image = Image.open(image_file).convert('RGB')
79
+ image_tensor = process_images([image], self.image_processor, self.model_config)[0]
80
+
81
+ input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
82
+
83
+ return input_ids, image_tensor, image.size, ans, task, source
84
+
85
+ def __len__(self):
86
+ return len(self.questions)
87
+
88
+
89
+ def collate_fn(batch):
90
+ input_ids, image_tensors, image_sizes, answers, cats, cats_l2 = zip(*batch)
91
+ input_ids = torch.stack(input_ids, dim=0)
92
+ image_tensors = torch.stack(image_tensors, dim=0)
93
+ return input_ids, image_tensors, image_sizes, answers, cats, cats_l2
94
+
95
+
96
+ # DataLoader
97
+ def create_data_loader(questions, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
98
+ assert batch_size == 1, "batch_size must be 1"
99
+ dataset = CustomDataset(questions, tokenizer, image_processor, model_config)
100
+ data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn)
101
+ return data_loader
102
+
103
+
104
+ def eval_model(args):
105
+ # Model
106
+ disable_torch_init()
107
+ model_path = os.path.expanduser(args.model_path)
108
+ model_name = get_model_name_from_path(model_path)
109
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
110
+
111
+ questions = prepare_CVBench(args.path)
112
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
113
+ answers_file = os.path.expanduser(args.answers_file)
114
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
115
+ ans_file = open(answers_file, "w")
116
+
117
+ if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
118
+ args.conv_mode = args.conv_mode + '_mmtag'
119
+ print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
120
+
121
+ data_loader = create_data_loader(questions, tokenizer, image_processor, model.config)
122
+
123
+ for (input_ids, image_tensor, image_sizes, answer, task, source), line in tqdm(zip(data_loader, questions), total=len(questions)):
124
+ input_ids = input_ids.to(device='cuda', non_blocking=True)
125
+
126
+ with torch.inference_mode():
127
+ output_ids = model.generate(
128
+ input_ids,
129
+ images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
130
+ image_sizes=image_sizes,
131
+ do_sample=True if args.temperature > 0 else False,
132
+ temperature=args.temperature,
133
+ top_p=args.top_p,
134
+ num_beams=args.num_beams,
135
+ max_new_tokens=args.max_new_tokens,
136
+ use_cache=True)
137
+
138
+ if not isinstance(output_ids, torch.Tensor):
139
+ output_ids = output_ids.sequences
140
+
141
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
142
+
143
+ ans_file.write(json.dumps({"prediction": outputs,
144
+ "answer": answer,
145
+ "question": line,
146
+ "source": source,
147
+ "task": task}) + "\n")
148
+ # ans_file.flush()
149
+ ans_file.close()
150
+
151
+ if __name__ == "__main__":
152
+ parser = argparse.ArgumentParser()
153
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
154
+ parser.add_argument("--model-base", type=str, default=None)
155
+ parser.add_argument("--path", type=str, default="CV-Bench")
156
+ parser.add_argument("--answers-file", type=str, default="cv-bench_answer.jsonl")
157
+ parser.add_argument("--conv-mode", type=str, default="llava_phi_3")
158
+ parser.add_argument("--num-chunks", type=int, default=1)
159
+ parser.add_argument("--chunk-idx", type=int, default=0)
160
+ parser.add_argument("--temperature", type=float, default=0.2)
161
+ parser.add_argument("--top_p", type=float, default=None)
162
+ parser.add_argument("--num_beams", type=int, default=1)
163
+ parser.add_argument("--max_new_tokens", type=int, default=128)
164
+ args = parser.parse_args()
165
+
166
+ eval_model(args)
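The --num-chunks/--chunk-idx pair shards the benchmark across processes via split_list and get_chunk defined above; a small sketch of that sharding (illustrative, mirroring the functions in this file):

import math

def split_list(lst, n):
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

questions = list(range(10))
chunks = split_list(questions, 3)   # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
chunks[1]                           # the slice that --num-chunks 3 --chunk-idx 1 would evaluate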
ola_vlm/eval/model_mmstar_loader.py ADDED
@@ -0,0 +1,164 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from ola_vlm.conversation import conv_templates, SeparatorStyle
10
+ from ola_vlm.model.builder import load_pretrained_model
11
+ from ola_vlm.utils import disable_torch_init
12
+ from ola_vlm.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+ from torch.utils.data import Dataset, DataLoader
14
+ from datasets import load_dataset
15
+ from PIL import Image
16
+ import math
17
+
18
+
19
+ def split_list(lst, n):
20
+ """Split a list into n (roughly) equal-sized chunks"""
21
+ chunk_size = math.ceil(len(lst) / n) # integer division
22
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
+ def get_chunk(lst, n, k):
26
+ chunks = split_list(lst, n)
27
+ return chunks[k]
28
+
29
+
30
+ def prepare_MMStar(path):
31
+ os.makedirs(f"{path}/images", exist_ok=True)
32
+ dataset = load_dataset(path, "val")
33
+ dataset = dataset["val"]
34
+ data = []
35
+ for i in range(len(dataset)):
36
+ if not os.path.exists(f"{path}/images/{i}.jpeg"):
37
+ dataset[i]["image"].save(f"{path}/images/{i}.jpeg")
38
+ prompt = dataset[i]["question"] + "\n"
39
+ prompt += "Answer with the option's letter from the given choices directly, such as answer letter 'A' only. \n"
40
+
41
+ d = {
42
+ "image": f"{path}/images/{i}.jpeg",
43
+ "question": prompt,
44
+ "answer": dataset[i]["answer"],
45
+ "category": dataset[i]["category"],
46
+ "l2_category": dataset[i]["l2_category"]
47
+ }
48
+ data.append(d)
49
+ return data
50
+
51
+
52
+ # Custom dataset class
53
+ class CustomDataset(Dataset):
54
+ def __init__(self, data, tokenizer, image_processor, model_config):
55
+ self.questions = data
56
+ self.tokenizer = tokenizer
57
+ self.image_processor = image_processor
58
+ self.model_config = model_config
59
+
60
+ def __getitem__(self, index):
61
+ d = self.questions[index]
62
+ qs = d["question"]
63
+ image_file = d["image"]
64
+ ans = d["answer"]
65
+
66
+ if self.model_config.mm_use_im_start_end:
67
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
68
+ else:
69
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
70
+
71
+ conv = conv_templates[args.conv_mode].copy()
72
+ conv.append_message(conv.roles[0], qs)
73
+ conv.append_message(conv.roles[1], None)
74
+ prompt = conv.get_prompt()
75
+
76
+ image = Image.open(image_file).convert('RGB')
77
+ image_tensor = process_images([image], self.image_processor, self.model_config)[0]
78
+
79
+ input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
80
+
81
+ return input_ids, image_tensor, image.size, ans, d["category"], d["l2_category"]
82
+
83
+ def __len__(self):
84
+ return len(self.questions)
85
+
86
+
87
+ def collate_fn(batch):
88
+ input_ids, image_tensors, image_sizes, answers, cats, cats_l2 = zip(*batch)
89
+ input_ids = torch.stack(input_ids, dim=0)
90
+ image_tensors = torch.stack(image_tensors, dim=0)
91
+ return input_ids, image_tensors, image_sizes, answers, cats, cats_l2
92
+
93
+
94
+ # DataLoader
95
+ def create_data_loader(questions, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
96
+ assert batch_size == 1, "batch_size must be 1"
97
+ dataset = CustomDataset(questions, tokenizer, image_processor, model_config)
98
+ data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn)
99
+ return data_loader
100
+
101
+
102
+ def eval_model(args):
103
+ # Model
104
+ disable_torch_init()
105
+ model_path = os.path.expanduser(args.model_path)
106
+ model_name = get_model_name_from_path(model_path)
107
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
108
+
109
+ questions = prepare_MMStar(args.path)
110
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
111
+ answers_file = os.path.expanduser(args.answers_file)
112
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
113
+ ans_file = open(answers_file, "w")
114
+
115
+ if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
116
+ args.conv_mode = args.conv_mode + '_mmtag'
117
+ print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
118
+
119
+ data_loader = create_data_loader(questions, tokenizer, image_processor, model.config)
120
+
121
+ for (input_ids, image_tensor, image_sizes, answer, cat, cat_l2), line in tqdm(zip(data_loader, questions), total=len(questions)):
122
+ input_ids = input_ids.to(device='cuda', non_blocking=True)
123
+
124
+ with torch.inference_mode():
125
+ output_ids = model.generate(
126
+ input_ids,
127
+ images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
128
+ image_sizes=image_sizes,
129
+ do_sample=True if args.temperature > 0 else False,
130
+ temperature=args.temperature,
131
+ top_p=args.top_p,
132
+ num_beams=args.num_beams,
133
+ max_new_tokens=args.max_new_tokens,
134
+ use_cache=True)
135
+
136
+ if not isinstance(output_ids, torch.Tensor):
137
+ output_ids = output_ids.sequences
138
+
139
+ outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
140
+
141
+ ans_file.write(json.dumps({"prediction": outputs,
142
+ "answer": answer[0],
143
+ "question": line,
144
+ "category": cat[0],
145
+ "l2_category": cat_l2[0]}) + "\n")
146
+ # ans_file.flush()
147
+ ans_file.close()
148
+
149
+ if __name__ == "__main__":
150
+ parser = argparse.ArgumentParser()
151
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
152
+ parser.add_argument("--model-base", type=str, default=None)
153
+ parser.add_argument("--path", type=str, default="MMStar")
154
+ parser.add_argument("--answers-file", type=str, default="mmstar_answer.jsonl")
155
+ parser.add_argument("--conv-mode", type=str, default="llava_phi_3")
156
+ parser.add_argument("--num-chunks", type=int, default=1)
157
+ parser.add_argument("--chunk-idx", type=int, default=0)
158
+ parser.add_argument("--temperature", type=float, default=0.2)
159
+ parser.add_argument("--top_p", type=float, default=None)
160
+ parser.add_argument("--num_beams", type=int, default=1)
161
+ parser.add_argument("--max_new_tokens", type=int, default=128)
162
+ args = parser.parse_args()
163
+
164
+ eval_model(args)
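Each run writes one JSON record per question to --answers-file; a naive sanity check over that file (illustrative only, not the repository's MMStar scorer):

import json

correct = total = 0
with open("mmstar_answer.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        total += 1
        correct += int(rec["prediction"].strip().upper().startswith(rec["answer"].strip().upper()))
print(f"naive letter-match accuracy: {correct / max(total, 1):.3f}")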
ola_vlm/mm_utils.py ADDED
@@ -0,0 +1,398 @@
1
+ from PIL import Image
2
+ from io import BytesIO
3
+ import base64
4
+ import torch
5
+ import math
6
+ import ast
7
+ import re
8
+ from transformers import StoppingCriteria
9
+ from ola_vlm.constants import IMAGE_TOKEN_INDEX
10
+
11
+ ###########################################
12
+
13
+ def resize_and_center_crop(image, shortest_edge_length):
14
+ # Calculate new dimensions and resize
15
+ aspect_ratio = float(image.width) / float(image.height)
16
+ if aspect_ratio > 1:
17
+ new_width = int(shortest_edge_length * aspect_ratio)
18
+ new_height = shortest_edge_length
19
+ else:
20
+ new_width = shortest_edge_length
21
+ new_height = int(shortest_edge_length / aspect_ratio)
22
+ resized_image = image.resize((new_width, new_height), Image.ANTIALIAS)
23
+
24
+ # Calculate the position and perform the center crop
25
+ left = (new_width - shortest_edge_length) / 2
26
+ top = (new_height - shortest_edge_length) / 2
27
+ right = (new_width + shortest_edge_length) / 2
28
+ bottom = (new_height + shortest_edge_length) / 2
29
+ cropped_image = resized_image.crop((left, top, right, bottom))
30
+
31
+ return cropped_image
32
+
33
+
34
+ def auto_pad_images(image, grid_params):
35
+ assert isinstance(image, Image.Image), "Input should be a Pillow Image"
36
+ assert len(grid_params) > 0, "Grid parameters should not be empty"
37
+
38
+ # Step 1: Calculate and find the closest aspect ratio
39
+ input_width, input_height = image.size
40
+ input_aspect_ratio = input_width / input_height
41
+ candidate_resolutions = [(w / h, w, h) for w in grid_params for h in grid_params]
42
+ closest_aspect_ratio = min(candidate_resolutions, key=lambda x: abs(input_aspect_ratio - x[0]))
43
+
44
+ candidate_resolutions = [(x[1], x[2]) for x in candidate_resolutions if abs(x[0] - closest_aspect_ratio[0]) < 1e-3]
45
+
46
+ target_resolution = min(candidate_resolutions, key=lambda res: abs(max(input_width, input_height) / max(res) - 1))
47
+
48
+ resize_width, resize_height = target_resolution
49
+ if input_width > input_height:
50
+ resize_height = int(resize_width / input_aspect_ratio)
51
+ else:
52
+ resize_width = int(resize_height * input_aspect_ratio)
53
+ resized_image = image.resize((resize_width, resize_height), Image.ANTIALIAS)
54
+
55
+ # Step 5: Pad the resized image if necessary to match the target resolution
56
+ pad_width = target_resolution[0] - resize_width
57
+ pad_height = target_resolution[1] - resize_height
58
+ padded_image = Image.new("RGB", target_resolution, color=(0, 0, 0))
59
+ padded_image.paste(resized_image, (pad_width // 2, pad_height // 2))
60
+
61
+ return padded_image
62
+
63
+
64
+ def extract_patches(image, patch_size, overlap_ratio):
65
+ assert isinstance(image, Image.Image), "Input should be a Pillow Image"
66
+ assert patch_size > 0, "Patch size should be greater than 0"
67
+ assert 0 <= overlap_ratio < 1, "Overlap ratio should be between 0 and 1"
68
+
69
+ W, H = image.size
70
+ patches = []
71
+
72
+ stride = int(patch_size * (1 - overlap_ratio))
73
+
74
+ num_patches_y = (H - patch_size) // stride + 1
75
+ num_patches_x = (W - patch_size) // stride + 1
76
+
77
+ y_start = (H - (num_patches_y - 1) * stride - patch_size) // 2
78
+ x_start = (W - (num_patches_x - 1) * stride - patch_size) // 2
79
+
80
+ for y in range(y_start, y_start + num_patches_y * stride, stride):
81
+ for x in range(x_start, x_start + num_patches_x * stride, stride):
82
+ patch = image.crop((x, y, x + patch_size, y + patch_size))
83
+ patches.append(patch)
84
+
85
+ return patches
86
+
87
+
88
+ def process_highres_image_crop_split(image, data_args, processor=None):
89
+ crop_resolution = data_args.image_crop_resolution
90
+ split_resolution = data_args.image_split_resolution
91
+ if processor is None:
92
+ processor = data_args.image_processor
93
+ image_crop = resize_and_center_crop(image, crop_resolution)
94
+ image_patches = extract_patches(image_crop, patch_size=split_resolution, overlap_ratio=0)
95
+ image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
96
+ return torch.stack(image_patches, dim=0)
97
+
98
+
99
+ def process_highres_image(image, processor, grid_pinpoints):
100
+ grid_params = [int(x) for x in grid_pinpoints.split(",")]
101
+ width_height = max(image.size)
102
+ fit_grid_params = [x for x in grid_params if x >= width_height]
103
+ if len(fit_grid_params) == 0:
104
+ select_size = max(grid_params)
105
+ else:
106
+ select_size = min(fit_grid_params)
107
+ # FIXME: always select the 448
108
+ select_size = max(grid_params)
109
+ image_padded = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
110
+
111
+ # FIXME: this seems to be a bug that it always resizes instead of padding
112
+ image_original_resize = image.resize((processor.size["shortest_edge"], processor.size["shortest_edge"]))
113
+ image_padded = image_padded.resize((select_size, select_size))
114
+ image_patches = extract_patches(image_padded, patch_size=processor.size["shortest_edge"], overlap_ratio=0)
115
+ image_patches = [image_original_resize] + image_patches
116
+ image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
117
+ return torch.stack(image_patches, dim=0)
118
+
119
+ ########################################
120
+
121
+ def select_best_resolution(original_size, possible_resolutions):
122
+ """
123
+ Selects the best resolution from a list of possible resolutions based on the original size.
124
+
125
+ Args:
126
+ original_size (tuple): The original size of the image in the format (width, height).
127
+ possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
128
+
129
+ Returns:
130
+ tuple: The best fit resolution in the format (width, height).
131
+ """
132
+ original_width, original_height = original_size
133
+ best_fit = None
134
+ max_effective_resolution = 0
135
+ min_wasted_resolution = float('inf')
136
+
137
+ for width, height in possible_resolutions:
138
+ scale = min(width / original_width, height / original_height)
139
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
140
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
141
+ wasted_resolution = (width * height) - effective_resolution
142
+
143
+ if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
144
+ max_effective_resolution = effective_resolution
145
+ min_wasted_resolution = wasted_resolution
146
+ best_fit = (width, height)
147
+
148
+ return best_fit
149
+
150
+
151
+ def resize_and_pad_image(image, target_resolution):
152
+ """
153
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
154
+
155
+ Args:
156
+ image (PIL.Image.Image): The input image.
157
+ target_resolution (tuple): The target resolution (width, height) of the image.
158
+
159
+ Returns:
160
+ PIL.Image.Image: The resized and padded image.
161
+ """
162
+ original_width, original_height = image.size
163
+ target_width, target_height = target_resolution
164
+
165
+ scale_w = target_width / original_width
166
+ scale_h = target_height / original_height
167
+
168
+ if scale_w < scale_h:
169
+ new_width = target_width
170
+ new_height = min(math.ceil(original_height * scale_w), target_height)
171
+ else:
172
+ new_height = target_height
173
+ new_width = min(math.ceil(original_width * scale_h), target_width)
174
+
175
+ # Resize the image
176
+ resized_image = image.resize((new_width, new_height))
177
+
178
+ new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
179
+ paste_x = (target_width - new_width) // 2
180
+ paste_y = (target_height - new_height) // 2
181
+ new_image.paste(resized_image, (paste_x, paste_y))
182
+
183
+ return new_image
184
+
185
+
186
+ def divide_to_patches(image, patch_size):
187
+ """
188
+ Divides an image into patches of a specified size.
189
+
190
+ Args:
191
+ image (PIL.Image.Image): The input image.
192
+ patch_size (int): The size of each patch.
193
+
194
+ Returns:
195
+ list: A list of PIL.Image.Image objects representing the patches.
196
+ """
197
+ patches = []
198
+ width, height = image.size
199
+ for i in range(0, height, patch_size):
200
+ for j in range(0, width, patch_size):
201
+ box = (j, i, j + patch_size, i + patch_size)
202
+ patch = image.crop(box)
203
+ patches.append(patch)
204
+
205
+ return patches
206
+
207
+
208
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
209
+ """
210
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
211
+
212
+ Args:
213
+ image_size (tuple): The size of the input image in the format (width, height).
214
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
215
+ patch_size (int): The size of each image patch.
216
+
217
+ Returns:
218
+ tuple: The shape of the image patch grid in the format (width, height).
219
+ """
220
+ if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
221
+ assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
222
+ # Use regex to extract the range from the input string
223
+ matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
224
+ range_start = tuple(map(int, matches[0]))
225
+ range_end = tuple(map(int, matches[-1]))
226
+ # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
227
+ grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
228
+ # Multiply all elements by patch_size
229
+ grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
230
+ if type(grid_pinpoints) is list:
231
+ possible_resolutions = grid_pinpoints
232
+ else:
233
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
234
+ width, height = select_best_resolution(image_size, possible_resolutions)
235
+ return width // patch_size, height // patch_size
236
+
237
+
238
+ def process_anyres_image(image, processor, grid_pinpoints):
239
+ """
240
+ Process an image with variable resolutions.
241
+
242
+ Args:
243
+ image (PIL.Image.Image): The input image to be processed.
244
+ processor: The image processor object.
245
+ grid_pinpoints (str): A string representation of a list of possible resolutions.
246
+
247
+ Returns:
248
+ torch.Tensor: A tensor containing the processed image patches.
249
+ """
250
+ # Convert grid_pinpoints from string to list
251
+ if isinstance(grid_pinpoints, str) and "x" in grid_pinpoints:
252
+ try:
253
+ patch_size = processor.size[0]
254
+ except Exception as e:
255
+ patch_size = processor.size["shortest_edge"]
256
+ assert patch_size in [224, 336, 384, 448, 512], "patch_size should be in [224, 336, 384, 448, 512]"
257
+ # Use regex to extract the range from the input string
258
+ matches = re.findall(r"\((\d+)x(\d+)\)", grid_pinpoints)
259
+ range_start = tuple(map(int, matches[0]))
260
+ range_end = tuple(map(int, matches[-1]))
261
+ # Generate a matrix of tuples from (range_start[0], range_start[1]) to (range_end[0], range_end[1])
262
+ grid_pinpoints = [(i, j) for i in range(range_start[0], range_end[0] + 1) for j in range(range_start[1], range_end[1] + 1)]
263
+ # Multiply all elements by patch_size
264
+ grid_pinpoints = [[dim * patch_size for dim in pair] for pair in grid_pinpoints]
265
+
266
+ if type(grid_pinpoints) is list:
267
+ possible_resolutions = grid_pinpoints
268
+ else:
269
+ possible_resolutions = ast.literal_eval(grid_pinpoints)
270
+ best_resolution = select_best_resolution(image.size, possible_resolutions)
271
+ image_padded = resize_and_pad_image(image, best_resolution)
272
+
273
+ patches = divide_to_patches(image_padded, processor.crop_size["height"])
274
+
275
+ # FIXME: this seems to be a bug that it resizes instead of pad.
276
+ # but to keep it consistent with previous, i will keep it as it is
277
+ # TODO: uncomment below to ablate with the padding
278
+ if isinstance(processor.size, dict):
279
+ shortest_edge = processor.size["shortest_edge"]
280
+ else:
281
+ shortest_edge = min(processor.size)
282
+ image_original_resize = image.resize((shortest_edge, shortest_edge))
283
+ # image_padded_square = expand2square(image, tuple(int(x*255) for x in processor.image_mean))
284
+ # image_original_resize = image_padded_square.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
285
+
286
+ image_patches = [image_original_resize] + patches
287
+ image_patches = [processor.preprocess(image_patch, return_tensors="pt")["pixel_values"][0] for image_patch in image_patches]
288
+ return torch.stack(image_patches, dim=0)
289
+
290
+
291
+ def load_image_from_base64(image):
292
+ return Image.open(BytesIO(base64.b64decode(image)))
293
+
294
+
295
+ def expand2square(pil_img, background_color):
296
+ width, height = pil_img.size
297
+ if width == height:
298
+ return pil_img
299
+ elif width > height:
300
+ result = Image.new(pil_img.mode, (width, width), background_color)
301
+ result.paste(pil_img, (0, (width - height) // 2))
302
+ return result
303
+ else:
304
+ result = Image.new(pil_img.mode, (height, height), background_color)
305
+ result.paste(pil_img, ((height - width) // 2, 0))
306
+ return result
307
+
308
+
309
+ def process_images(images, image_processor, model_cfg):
310
+ image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
311
+ new_images = []
312
+ if image_aspect_ratio == "highres":
313
+ for image in images:
314
+ image = process_highres_image(image, image_processor, model_cfg.image_grid_pinpoints)
315
+ new_images.append(image)
316
+ elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
317
+ for image in images:
318
+ image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
319
+ new_images.append(image)
320
+ elif image_aspect_ratio == "crop_split":
321
+ for image in images:
322
+ image = process_highres_image_crop_split(image, model_cfg, image_processor)
323
+ new_images.append(image)
324
+ elif image_aspect_ratio == "pad":
325
+ for image in images:
326
+ image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
327
+ image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
328
+ new_images.append(image)
329
+ else:
330
+ return image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
331
+ if all(x.shape == new_images[0].shape for x in new_images):
332
+ new_images = torch.stack(new_images, dim=0)
333
+ return new_images
334
+
335
+
336
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
337
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
338
+
339
+ def insert_separator(X, sep):
340
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
341
+
342
+ input_ids = []
343
+ offset = 0
344
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
345
+ offset = 1
346
+ input_ids.append(prompt_chunks[0][0])
347
+
348
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
349
+ input_ids.extend(x[offset:])
350
+
351
+ if return_tensors is not None:
352
+ if return_tensors == 'pt':
353
+ return torch.tensor(input_ids, dtype=torch.long)
354
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
355
+ return input_ids
356
+
357
+
358
+ def get_model_name_from_path(model_path):
359
+ model_path = model_path.strip("/")
360
+ model_paths = model_path.split("/")
361
+ if model_paths[-1].startswith('checkpoint-'):
362
+ return model_paths[-2] + "_" + model_paths[-1]
363
+ else:
364
+ return model_paths[-1]
365
+
366
+ class KeywordsStoppingCriteria(StoppingCriteria):
367
+ def __init__(self, keywords, tokenizer, input_ids):
368
+ self.keywords = keywords
369
+ self.keyword_ids = []
370
+ self.max_keyword_len = 0
371
+ for keyword in keywords:
372
+ cur_keyword_ids = tokenizer(keyword).input_ids
373
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
374
+ cur_keyword_ids = cur_keyword_ids[1:]
375
+ if len(cur_keyword_ids) > self.max_keyword_len:
376
+ self.max_keyword_len = len(cur_keyword_ids)
377
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
378
+ self.tokenizer = tokenizer
379
+ self.start_len = input_ids.shape[1]
380
+
381
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
382
+ offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
383
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
384
+ for keyword_id in self.keyword_ids:
385
+ truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
386
+ if torch.equal(truncated_output_ids, keyword_id):
387
+ return True
388
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
389
+ for keyword in self.keywords:
390
+ if keyword in outputs:
391
+ return True
392
+ return False
393
+
394
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
395
+ outputs = []
396
+ for i in range(output_ids.shape[0]):
397
+ outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
398
+ return all(outputs)
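A minimal sketch of how the two most frequently used helpers above fit together; `tokenizer` and `image_processor` are assumed to come from load_pretrained_model, as in the eval loaders earlier in this diff:

from PIL import Image
from ola_vlm.mm_utils import expand2square, tokenizer_image_token
from ola_vlm.constants import IMAGE_TOKEN_INDEX

image = Image.open("example.jpg").convert("RGB")   # hypothetical input image
padded = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
pixel_values = image_processor.preprocess(padded, return_tensors="pt")["pixel_values"][0]

prompt = "USER: <image>\nWhat is shown here? ASSISTANT:"   # illustrative prompt
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
# every "<image>" in the prompt becomes IMAGE_TOKEN_INDEX in the returned ids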
ola_vlm/model/.DS_Store ADDED
Binary file (6.15 kB).
 
ola_vlm/model/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
2
+ from .language_model.llava_phi3 import LlavaPhi3ForCausalLM, LlavaPhi3Config
3
+ from .language_model.ola_llama import OlaLlavaLlamaForCausalLM, OlaLlavaLlamaConfig
4
+ from .language_model.ola_phi3 import OlaLlavaPhi3ForCausalLM, OlaLlavaPhi3Config
5
+ from .language_model.probe_llava_llama import ProbeDSGLlavaLlamaForCausalLM, ProbeDSGLlavaLlamaConfig
ola_vlm/model/apply_delta.py ADDED
@@ -0,0 +1,48 @@
1
+ """
2
+ Usage:
3
+ python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
4
+ """
5
+ import argparse
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ from llava import LlavaLlamaForCausalLM
11
+
12
+
13
+ def apply_delta(base_model_path, target_model_path, delta_path):
14
+ print("Loading base model")
15
+ base = AutoModelForCausalLM.from_pretrained(
16
+ base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17
+
18
+ print("Loading delta")
19
+ delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
20
+ delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
21
+
22
+ print("Applying delta")
23
+ for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
24
+ if name not in base.state_dict():
25
+ assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
26
+ continue
27
+ if param.data.shape == base.state_dict()[name].shape:
28
+ param.data += base.state_dict()[name]
29
+ else:
30
+ assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
31
+ f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
32
+ bparam = base.state_dict()[name]
33
+ param.data[:bparam.shape[0], :bparam.shape[1]] += bparam
34
+
35
+ print("Saving target model")
36
+ delta.save_pretrained(target_model_path)
37
+ delta_tokenizer.save_pretrained(target_model_path)
38
+
39
+
40
+ if __name__ == "__main__":
41
+ parser = argparse.ArgumentParser()
42
+ parser.add_argument("--base-model-path", type=str, required=True)
43
+ parser.add_argument("--target-model-path", type=str, required=True)
44
+ parser.add_argument("--delta-path", type=str, required=True)
45
+
46
+ args = parser.parse_args()
47
+
48
+ apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
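The heart of the script is element-wise addition of the base weights into the delta weights; a toy illustration of that arithmetic (not a substitute for running the script):

import torch

base_w = torch.tensor([1.0, 2.0, 3.0])    # a weight from the base model
delta_w = torch.tensor([0.1, -0.2, 0.3])  # the same weight as stored in the delta checkpoint
delta_w += base_w                         # what `param.data += base.state_dict()[name]` does per tensor
# delta_w now holds the reconstructed target weight: tensor([1.1000, 1.8000, 3.3000])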
ola_vlm/model/aux_heads/.DS_Store ADDED
Binary file (6.15 kB).
 
ola_vlm/model/aux_heads/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .da_v2_head import DepthHead, DAv2_Head, DepthProbeHead, TaskTokenDepthHead
2
+ from .oneformer_head import OneFormerSegHead, OneFormerTaskTokenSegHead
3
+ from .gen_head import GenHead, TaskTokenGenHead
ola_vlm/model/aux_heads/da_v2_head.py ADDED
@@ -0,0 +1,457 @@
1
+ import cv2
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from ola_vlm.model.multimodal_projector.resampler import Resampler, TaskTokenResampler
6
+
7
+
8
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
9
+ scratch = nn.Module()
10
+
11
+ out_shape1 = out_shape
12
+ out_shape2 = out_shape
13
+ out_shape3 = out_shape
14
+ if len(in_shape) >= 4:
15
+ out_shape4 = out_shape
16
+
17
+ if expand:
18
+ out_shape1 = out_shape
19
+ out_shape2 = out_shape * 2
20
+ out_shape3 = out_shape * 4
21
+ if len(in_shape) >= 4:
22
+ out_shape4 = out_shape * 8
23
+
24
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
25
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
26
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
27
+ if len(in_shape) >= 4:
28
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
29
+
30
+ return scratch
31
+
32
+
33
+ class ResidualConvUnit(nn.Module):
34
+ """Residual convolution module.
35
+ """
36
+
37
+ def __init__(self, features, activation, bn):
38
+ """Init.
39
+
40
+ Args:
41
+ features (int): number of features
42
+ """
43
+ super().__init__()
44
+
45
+ self.bn = bn
46
+
47
+ self.groups=1
48
+
49
+ self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
50
+
51
+ self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
52
+
53
+ if self.bn == True:
54
+ self.bn1 = nn.BatchNorm2d(features)
55
+ self.bn2 = nn.BatchNorm2d(features)
56
+
57
+ self.activation = activation
58
+
59
+ self.skip_add = nn.quantized.FloatFunctional()
60
+
61
+ def forward(self, x):
62
+ """Forward pass.
63
+
64
+ Args:
65
+ x (tensor): input
66
+
67
+ Returns:
68
+ tensor: output
69
+ """
70
+
71
+ out = self.activation(x)
72
+ out = self.conv1(out)
73
+ if self.bn == True:
74
+ out = self.bn1(out)
75
+
76
+ out = self.activation(out)
77
+ out = self.conv2(out)
78
+ if self.bn == True:
79
+ out = self.bn2(out)
80
+
81
+ if self.groups > 1:
82
+ out = self.conv_merge(out)
83
+
84
+ return self.skip_add.add(out, x)
85
+
86
+
87
+ class FeatureFusionBlock(nn.Module):
88
+ """Feature fusion block.
89
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ features,
94
+ activation,
95
+ deconv=False,
96
+ bn=False,
97
+ expand=False,
98
+ align_corners=True,
99
+ size=None
100
+ ):
101
+ """Init.
102
+
103
+ Args:
104
+ features (int): number of features
105
+ """
106
+ super(FeatureFusionBlock, self).__init__()
107
+
108
+ self.deconv = deconv
109
+ self.align_corners = align_corners
110
+
111
+ self.groups=1
112
+
113
+ self.expand = expand
114
+ out_features = features
115
+ if self.expand == True:
116
+ out_features = features // 2
117
+
118
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
119
+
120
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
121
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
122
+
123
+ self.skip_add = nn.quantized.FloatFunctional()
124
+
125
+ self.size=size
126
+
127
+ def forward(self, *xs, size=None):
128
+ """Forward pass.
129
+
130
+ Returns:
131
+ tensor: output
132
+ """
133
+ output = xs[0]
134
+
135
+ if len(xs) == 2:
136
+ res = self.resConfUnit1(xs[1])
137
+ output = self.skip_add.add(output, res)
138
+
139
+ output = self.resConfUnit2(output)
140
+
141
+ if (size is None) and (self.size is None):
142
+ modifier = {"scale_factor": 2}
143
+ elif size is None:
144
+ modifier = {"size": self.size}
145
+ else:
146
+ modifier = {"size": size}
147
+
148
+ output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
149
+
150
+ output = self.out_conv(output)
151
+
152
+ return output
153
+
154
+
155
+ def _make_fusion_block(features, use_bn, size=None):
156
+ return FeatureFusionBlock(
157
+ features,
158
+ nn.ReLU(False),
159
+ deconv=False,
160
+ bn=use_bn,
161
+ expand=False,
162
+ align_corners=True,
163
+ size=size,
164
+ )
165
+
166
+
167
+ class ConvBlock(nn.Module):
168
+ def __init__(self, in_feature, out_feature):
169
+ super().__init__()
170
+
171
+ self.conv_block = nn.Sequential(
172
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
173
+ nn.BatchNorm2d(out_feature),
174
+ nn.ReLU(True)
175
+ )
176
+
177
+ def forward(self, x):
178
+ return self.conv_block(x)
179
+
180
+
181
+ class DPTHead(nn.Module):
182
+ def __init__(
183
+ self,
184
+ in_channels,
185
+ features=256,
186
+ use_bn=False,
187
+ out_channels=[256, 512, 1024, 1024],
188
+ use_clstoken=False
189
+ ):
190
+ super(DPTHead, self).__init__()
191
+
192
+ self.use_clstoken = use_clstoken
193
+
194
+ self.projects = nn.ModuleList([
195
+ nn.Conv2d(
196
+ in_channels=in_channels,
197
+ out_channels=out_channel,
198
+ kernel_size=1,
199
+ stride=1,
200
+ padding=0,
201
+ ) for out_channel in out_channels
202
+ ])
203
+
204
+ self.resize_layers = nn.ModuleList([
205
+ nn.ConvTranspose2d(
206
+ in_channels=out_channels[0],
207
+ out_channels=out_channels[0],
208
+ kernel_size=4,
209
+ stride=4,
210
+ padding=0),
211
+ nn.ConvTranspose2d(
212
+ in_channels=out_channels[1],
213
+ out_channels=out_channels[1],
214
+ kernel_size=2,
215
+ stride=2,
216
+ padding=0),
217
+ nn.Identity(),
218
+ nn.Conv2d(
219
+ in_channels=out_channels[3],
220
+ out_channels=out_channels[3],
221
+ kernel_size=3,
222
+ stride=2,
223
+ padding=1)
224
+ ])
225
+
226
+ if use_clstoken:
227
+ self.readout_projects = nn.ModuleList()
228
+ for _ in range(len(self.projects)):
229
+ self.readout_projects.append(
230
+ nn.Sequential(
231
+ nn.Linear(2 * in_channels, in_channels),
232
+ nn.GELU()))
233
+
234
+ self.scratch = _make_scratch(
235
+ out_channels,
236
+ features,
237
+ groups=1,
238
+ expand=False,
239
+ )
240
+
241
+ self.scratch.stem_transpose = None
242
+
243
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
244
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
245
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
246
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
247
+
248
+ head_features_1 = features
249
+ head_features_2 = 32
250
+
251
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
252
+ self.scratch.output_conv2 = nn.Sequential(
253
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
254
+ nn.ReLU(True),
255
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
256
+ nn.ReLU(True),
257
+ nn.Identity(),
258
+ )
259
+
260
+ def forward(self, out_features, patch_h, patch_w):
261
+ out = []
262
+ for i, x in enumerate(out_features):
263
+ if self.use_clstoken:
264
+ x, cls_token = x[0], x[1]
265
+ readout = cls_token.unsqueeze(1).expand_as(x)
266
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
267
+ else:
268
+ x = x[0]
269
+
270
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
271
+
272
+ x = self.projects[i](x)
273
+ x = self.resize_layers[i](x)
274
+
275
+ out.append(x)
276
+
277
+ layer_1, layer_2, layer_3, layer_4 = out
278
+
279
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
280
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
281
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
282
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
283
+
284
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
285
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
286
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
287
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
288
+
289
+ out = self.scratch.output_conv1(path_1)
290
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
291
+ out = self.scratch.output_conv2(out)
292
+
293
+ return out
294
+
295
+
296
+ class DAv2_Head(nn.Module):
297
+ def __init__(
298
+ self,
299
+ encoder='vitl',
300
+ features=256,
301
+ out_channels=[256, 512, 1024, 1024],
302
+ use_bn=False,
303
+ use_clstoken=False
304
+ ):
305
+ super(DAv2_Head, self).__init__()
306
+
307
+ self.embd_dims = {
308
+ 'vits': 1024,
309
+ 'vitb': 1024,
310
+ 'vitl': 1024,
311
+ 'vitg': 1024,
312
+ }
313
+
314
+ self.depth_head = DPTHead(self.embd_dims[encoder], features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
315
+
316
+ def forward(self, features):
317
+ patch_h, patch_w = 336 // 14, 336 // 14
318
+ depth = self.depth_head(features, patch_h, patch_w)
319
+ depth = F.relu(depth)
320
+
321
+ return depth.squeeze(1)
322
+
323
+ @torch.no_grad()
324
+ def infer_feats(self, feats, image_size=(336, 336)):
325
+ h, w = image_size
326
+ depth = self.forward(feats)
327
+
328
+ depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
329
+ return depth.cpu().numpy()
330
+
331
+ def build_mlp(in_hidden_size, hidden_size):
332
+ modules = [nn.Linear(in_hidden_size, hidden_size)]
333
+ modules.append(nn.ReLU())
334
+ modules.append(nn.Linear(hidden_size, hidden_size))
335
+ return nn.Sequential(*modules)
336
+
337
+ def build_expand_mlp(in_hidden_size, hidden_size, out_size):
338
+ modules = [nn.Linear(in_hidden_size, hidden_size)]
339
+ modules.append(nn.ReLU())
340
+ modules.append(nn.Linear(hidden_size, hidden_size))
341
+ modules.append(nn.ReLU())
342
+ modules.append(nn.Linear(hidden_size, out_size))
343
+ return nn.Sequential(*modules)
344
+
345
+ class DepthProbeHead(nn.Module):
346
+ def __init__(
347
+ self,
348
+ llm_hidden_size=4096,
349
+ proj_config=None,
350
+ ):
351
+ super(DepthProbeHead, self).__init__()
352
+
353
+ self.linear_1 = build_mlp(llm_hidden_size, proj_config["output_dim"])
354
+ self.linear_2 = build_mlp(llm_hidden_size, proj_config["output_dim"])
355
+ self.linear_3 = build_mlp(llm_hidden_size, proj_config["output_dim"])
356
+ self.linear_4 = build_mlp(llm_hidden_size, proj_config["output_dim"])
357
+
358
+ # self._init_weights()
359
+
360
+ # def _init_weights(self):
361
+ # for m in self.modules():
362
+ # if isinstance(m, nn.Linear):
363
+ # nn.init.xavier_uniform_(m.weight)
364
+ # if m.bias is not None:
365
+ # nn.init.constant_(m.bias, 0)
366
+
367
+ def forward(self, llm_feats):
368
+
369
+ features = [(self.linear_1(llm_feats), None),
370
+ (self.linear_1(llm_feats), None),
371
+ (self.linear_2(llm_feats), None),
372
+ (self.linear_3(llm_feats), None)
373
+ ]
374
+
375
+ return features
376
+
377
+ class DepthHead(nn.Module):
378
+ def __init__(
379
+ self,
380
+ llm_hidden_size=4096,
381
+ proj_config=None,
382
+ use_intermediate_depth=False,
383
+ ):
384
+ super(DepthHead, self).__init__()
385
+
386
+ self.projector = Resampler(
387
+ dim=proj_config["output_dim"],
388
+ depth=proj_config["depth"],
389
+ dim_head=proj_config["dim_head"],
390
+ heads=proj_config["num_heads"],
391
+ num_queries=proj_config["num_tokens"],
392
+ embedding_dim=llm_hidden_size,
393
+ output_dim=proj_config["output_dim"],
394
+ ff_mult=proj_config["ff_mult"],
395
+ )
396
+
397
+ self.use_intermediate_depth = use_intermediate_depth
398
+
399
+ if self.use_intermediate_depth:
400
+ self.linear_1 = build_mlp(proj_config["output_dim"], proj_config["output_dim"])
401
+ self.linear_2 = build_mlp(proj_config["output_dim"], proj_config["output_dim"])
402
+ self.linear_3 = build_mlp(proj_config["output_dim"], proj_config["output_dim"])
403
+
404
+ def forward(self, llm_feats):
405
+ visual_feats = self.projector(llm_feats)
406
+
407
+ features = []
408
+
409
+ if self.use_intermediate_depth:
410
+ features.append((self.linear_1(visual_feats), None))
411
+ features.append((self.linear_2(visual_feats), None))
412
+ features.append((self.linear_3(visual_feats), None))
413
+
414
+ features.append((visual_feats, None))
415
+
416
+ return features
417
+
418
+ class TaskTokenDepthHead(nn.Module):
419
+ def __init__(
420
+ self,
421
+ proj_config=None,
422
+ llm_hidden_size=4096,
423
+ use_intermediate_depth=False,
424
+ ):
425
+ super(TaskTokenDepthHead, self).__init__()
426
+
427
+ self.projector = TaskTokenResampler(
428
+ dim=llm_hidden_size,
429
+ depth=proj_config["depth"],
430
+ dim_head=proj_config["dim_head"],
431
+ heads=proj_config["num_heads"],
432
+ num_queries=proj_config["num_tokens"],
433
+ embedding_dim=llm_hidden_size,
434
+ output_dim=proj_config["output_dim"],
435
+ ff_mult=proj_config["ff_mult"],
436
+ )
437
+ self.use_intermediate_depth = use_intermediate_depth
438
+
439
+ if self.use_intermediate_depth:
440
+ self.linear_1 = build_mlp(proj_config["output_dim"], proj_config["output_dim"])
441
+ self.linear_2 = build_mlp(proj_config["output_dim"], proj_config["output_dim"])
442
+ self.linear_3 = build_mlp(proj_config["output_dim"], proj_config["output_dim"])
443
+
444
+ def forward(self, llm_feats, latents):
445
+
446
+ visual_feats = self.projector(llm_feats, latents)
447
+
448
+ features = []
449
+
450
+ if self.use_intermediate_depth:
451
+ features.append((self.linear_1(visual_feats), None))
452
+ features.append((self.linear_2(visual_feats), None))
453
+ features.append((self.linear_3(visual_feats), None))
454
+
455
+ features.append((visual_feats, None))
456
+
457
+ return features
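As a quick shape check for the DPT-style head above (a sketch with random tensors, assuming the repository is importable; 336 / 14 = 24 patches per side and 1024-dim features match the defaults in this file):

import torch
from ola_vlm.model.aux_heads.da_v2_head import DAv2_Head

head = DAv2_Head(encoder="vitl")                      # expects 1024-dim features at every level
feats = [(torch.randn(1, 24 * 24, 1024), None)] * 4   # four (tokens, cls_token) tuples, 24x24 patch grid
depth = head(feats)                                   # -> tensor of shape [1, 336, 336]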
ola_vlm/model/aux_heads/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # skip past the leading nn.Identity() placeholders
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
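Editor's note: a minimal usage sketch for the `DINOv2` factory above, assuming the module is importable at the path shown in this diff (the import path is an assumption). It builds the ViT-L backbone for 518x518 inputs with 14x14 patches and pulls patch tokens from the last four blocks used by the depth head.

```python
import torch
from ola_vlm.model.aux_heads.depth_anything_v2.dinov2 import DINOv2  # path assumed from this diff

backbone = DINOv2("vitl").eval()        # img_size=518, patch_size=14, block_chunks=0
x = torch.randn(1, 3, 518, 518)         # H and W must be multiples of 14
with torch.no_grad():
    feats = backbone.get_intermediate_layers(x, n=[4, 11, 17, 23], return_class_token=True)
patch_tokens, cls_token = feats[-1]
print(patch_tokens.shape, cls_token.shape)  # torch.Size([1, 1369, 1024]) torch.Size([1, 1024])
```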
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
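Editor's note: a quick shape check for the attention modules above (a sketch, assuming this file's classes are in scope). `Attention` is the plain softmax path; `MemEffAttention` only diverges from it when xFormers is installed.

```python
import torch

attn = Attention(dim=64, num_heads=8, qkv_bias=True)
tokens = torch.randn(2, 10, 64)      # (batch, sequence, dim)
out = attn(tokens)
assert out.shape == tokens.shape     # attention preserves the token shape
```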
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path2(ffn_residual_func(x))
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
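Editor's note: a minimal sketch of the two code paths in `NestedTensorBlock` (assuming this file's classes are in scope). A plain tensor runs the ordinary `Block.forward`; a list of tensors takes `forward_nested`, which additionally requires xFormers and `attn_class=MemEffAttention`.

```python
import torch

blk = NestedTensorBlock(dim=64, num_heads=4)   # default attn_class=Attention
x = torch.randn(2, 16, 64)
assert blk(x).shape == x.shape                 # plain-tensor path, no xFormers needed
# blk([x_a, x_b]) would take forward_nested, which asserts XFORMERS_AVAILABLE.
```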
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super(DropPath, self).__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)
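Editor's note: behavioural sketch of `DropPath` (stochastic depth). Whole samples are zeroed during training and the survivors are rescaled; evaluation is a no-op.

```python
import torch

dp = DropPath(drop_prob=0.2).train()
x = torch.ones(8, 4, 16)
y = dp(x)                       # roughly 20% of the 8 samples zeroed, rest scaled by 1/0.8
dp.eval()
assert torch.equal(dp(x), x)    # identity at inference time
```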
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
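Editor's note: `LayerScale` simply multiplies each channel by a learnable vector initialised to `init_values`; a one-line check (sketch):

```python
import torch

ls = LayerScale(dim=16, init_values=1e-5)
x = torch.randn(2, 4, 16)
assert torch.allclose(ls(x), x * 1e-5)   # gamma starts as 1e-5 * ones(16)
```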
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/mlp.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+
12
+ from typing import Callable, Optional
13
+
14
+ from torch import Tensor, nn
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features: int,
21
+ hidden_features: Optional[int] = None,
22
+ out_features: Optional[int] = None,
23
+ act_layer: Callable[..., nn.Module] = nn.GELU,
24
+ drop: float = 0.0,
25
+ bias: bool = True,
26
+ ) -> None:
27
+ super().__init__()
28
+ out_features = out_features or in_features
29
+ hidden_features = hidden_features or in_features
30
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31
+ self.act = act_layer()
32
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33
+ self.drop = nn.Dropout(drop)
34
+
35
+ def forward(self, x: Tensor) -> Tensor:
36
+ x = self.fc1(x)
37
+ x = self.act(x)
38
+ x = self.drop(x)
39
+ x = self.fc2(x)
40
+ x = self.drop(x)
41
+ return x
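Editor's note: `Mlp` is the standard two-layer transformer feed-forward block; a shape check (sketch):

```python
import torch

mlp = Mlp(in_features=64, hidden_features=256)   # 4x expansion with GELU in between
x = torch.randn(2, 10, 64)
assert mlp(x).shape == x.shape
```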
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/patch_embed.py ADDED
@@ -0,0 +1,90 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ from typing import Callable, Optional, Tuple, Union
12
+
13
+ from torch import Tensor
14
+ import torch.nn as nn
15
+
16
+
17
+ def make_2tuple(x):
18
+ if isinstance(x, tuple):
19
+ assert len(x) == 2
20
+ return x
21
+
22
+ assert isinstance(x, int)
23
+ return (x, x)
24
+
25
+
26
+ class PatchEmbed(nn.Module):
27
+ """
28
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29
+
30
+ Args:
31
+ img_size: Image size.
32
+ patch_size: Patch token size.
33
+ in_chans: Number of input image channels.
34
+ embed_dim: Number of linear projection output channels.
35
+ norm_layer: Normalization layer.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ img_size: Union[int, Tuple[int, int]] = 224,
41
+ patch_size: Union[int, Tuple[int, int]] = 16,
42
+ in_chans: int = 3,
43
+ embed_dim: int = 768,
44
+ norm_layer: Optional[Callable] = None,
45
+ flatten_embedding: bool = True,
46
+ ) -> None:
47
+ super().__init__()
48
+
49
+ image_HW = make_2tuple(img_size)
50
+ patch_HW = make_2tuple(patch_size)
51
+ patch_grid_size = (
52
+ image_HW[0] // patch_HW[0],
53
+ image_HW[1] // patch_HW[1],
54
+ )
55
+
56
+ self.img_size = image_HW
57
+ self.patch_size = patch_HW
58
+ self.patches_resolution = patch_grid_size
59
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60
+
61
+ self.in_chans = in_chans
62
+ self.embed_dim = embed_dim
63
+
64
+ self.flatten_embedding = flatten_embedding
65
+
66
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ _, _, H, W = x.shape
71
+ patch_H, patch_W = self.patch_size
72
+
73
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
75
+
76
+ x = x.to(self.proj.bias.dtype)
77
+ x = self.proj(x) # B C H W
78
+ H, W = x.size(2), x.size(3)
79
+ x = x.flatten(2).transpose(1, 2) # B HW C
80
+ x = self.norm(x)
81
+ if not self.flatten_embedding:
82
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
83
+ return x
84
+
85
+ def flops(self) -> float:
86
+ Ho, Wo = self.patches_resolution
87
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
88
+ if self.norm is not None:
89
+ flops += Ho * Wo * self.embed_dim
90
+ return flops
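Editor's note: `PatchEmbed` turns an image into a flat sequence of patch tokens via a strided convolution; each spatial side must be divisible by the patch size (sketch):

```python
import torch

pe = PatchEmbed(img_size=518, patch_size=14, in_chans=3, embed_dim=1024)
x = torch.randn(1, 3, 518, 518)
tokens = pe(x)
assert tokens.shape == (1, 37 * 37, 1024)   # (518 // 14) ** 2 = 1369 patch tokens
```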
ola_vlm/model/aux_heads/depth_anything_v2/dinov2_layers/swiglu_ffn.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, Optional
8
+
9
+ from torch import Tensor, nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ class SwiGLUFFN(nn.Module):
14
+ def __init__(
15
+ self,
16
+ in_features: int,
17
+ hidden_features: Optional[int] = None,
18
+ out_features: Optional[int] = None,
19
+ act_layer: Callable[..., nn.Module] = None,
20
+ drop: float = 0.0,
21
+ bias: bool = True,
22
+ ) -> None:
23
+ super().__init__()
24
+ out_features = out_features or in_features
25
+ hidden_features = hidden_features or in_features
26
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28
+
29
+ def forward(self, x: Tensor) -> Tensor:
30
+ x12 = self.w12(x)
31
+ x1, x2 = x12.chunk(2, dim=-1)
32
+ hidden = F.silu(x1) * x2
33
+ return self.w3(hidden)
34
+
35
+
36
+ try:
37
+ from xformers.ops import SwiGLU
38
+
39
+ XFORMERS_AVAILABLE = True
40
+ except ImportError:
41
+ SwiGLU = SwiGLUFFN
42
+ XFORMERS_AVAILABLE = False
43
+
44
+
45
+ class SwiGLUFFNFused(SwiGLU):
46
+ def __init__(
47
+ self,
48
+ in_features: int,
49
+ hidden_features: Optional[int] = None,
50
+ out_features: Optional[int] = None,
51
+ act_layer: Callable[..., nn.Module] = None,
52
+ drop: float = 0.0,
53
+ bias: bool = True,
54
+ ) -> None:
55
+ out_features = out_features or in_features
56
+ hidden_features = hidden_features or in_features
57
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58
+ super().__init__(
59
+ in_features=in_features,
60
+ hidden_features=hidden_features,
61
+ out_features=out_features,
62
+ bias=bias,
63
+ )
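Editor's note: `SwiGLUFFNFused` shrinks the requested hidden width to 2/3 and rounds it up to a multiple of 8 before delegating to xFormers' `SwiGLU` (or the pure-PyTorch `SwiGLUFFN` fallback); the token shape is preserved either way (sketch):

```python
import torch

ffn = SwiGLUFFNFused(in_features=1536, hidden_features=4 * 1536)
x = torch.randn(2, 8, 1536)
assert ffn(x).shape == x.shape
# effective hidden width: int(6144 * 2 / 3) = 4096, already a multiple of 8
```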
ola_vlm/model/aux_heads/depth_anything_v2/dpt.py ADDED
@@ -0,0 +1,219 @@
1
+ import cv2
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torchvision.transforms import Compose
6
+
7
+ from .dinov2 import DINOv2
8
+ from .util.blocks import FeatureFusionBlock, _make_scratch
9
+ from .util.transform import Resize, NormalizeImage, PrepareForNet
10
+
11
+
12
+ def _make_fusion_block(features, use_bn, size=None):
13
+ return FeatureFusionBlock(
14
+ features,
15
+ nn.ReLU(False),
16
+ deconv=False,
17
+ bn=use_bn,
18
+ expand=False,
19
+ align_corners=True,
20
+ size=size,
21
+ )
22
+
23
+
24
+ class ConvBlock(nn.Module):
25
+ def __init__(self, in_feature, out_feature):
26
+ super().__init__()
27
+
28
+ self.conv_block = nn.Sequential(
29
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
30
+ nn.BatchNorm2d(out_feature),
31
+ nn.ReLU(True)
32
+ )
33
+
34
+ def forward(self, x):
35
+ return self.conv_block(x)
36
+
37
+
38
+ class DPTHead(nn.Module):
39
+ def __init__(
40
+ self,
41
+ in_channels,
42
+ features=256,
43
+ use_bn=False,
44
+ out_channels=[256, 512, 1024, 1024],
45
+ use_clstoken=False
46
+ ):
47
+ super(DPTHead, self).__init__()
48
+
49
+ self.use_clstoken = use_clstoken
50
+
51
+ self.projects = nn.ModuleList([
52
+ nn.Conv2d(
53
+ in_channels=in_channels,
54
+ out_channels=out_channel,
55
+ kernel_size=1,
56
+ stride=1,
57
+ padding=0,
58
+ ) for out_channel in out_channels
59
+ ])
60
+
61
+ self.resize_layers = nn.ModuleList([
62
+ nn.ConvTranspose2d(
63
+ in_channels=out_channels[0],
64
+ out_channels=out_channels[0],
65
+ kernel_size=4,
66
+ stride=4,
67
+ padding=0),
68
+ nn.ConvTranspose2d(
69
+ in_channels=out_channels[1],
70
+ out_channels=out_channels[1],
71
+ kernel_size=2,
72
+ stride=2,
73
+ padding=0),
74
+ nn.Identity(),
75
+ nn.Conv2d(
76
+ in_channels=out_channels[3],
77
+ out_channels=out_channels[3],
78
+ kernel_size=3,
79
+ stride=2,
80
+ padding=1)
81
+ ])
82
+
83
+ if use_clstoken:
84
+ self.readout_projects = nn.ModuleList()
85
+ for _ in range(len(self.projects)):
86
+ self.readout_projects.append(
87
+ nn.Sequential(
88
+ nn.Linear(2 * in_channels, in_channels),
89
+ nn.GELU()))
90
+
91
+ self.scratch = _make_scratch(
92
+ out_channels,
93
+ features,
94
+ groups=1,
95
+ expand=False,
96
+ )
97
+
98
+ self.scratch.stem_transpose = None
99
+
100
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
101
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
102
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
103
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
104
+
105
+ head_features_1 = features
106
+ head_features_2 = 32
107
+
108
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
109
+ self.scratch.output_conv2 = nn.Sequential(
110
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
111
+ nn.ReLU(True),
112
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
113
+ nn.ReLU(True),
114
+ nn.Identity(),
115
+ )
116
+
117
+ def forward(self, out_features, patch_h, patch_w):
118
+ out = []
119
+ for i, x in enumerate(out_features):
120
+ if self.use_clstoken:
121
+ x, cls_token = x[0], x[1]
122
+ readout = cls_token.unsqueeze(1).expand_as(x)
123
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
124
+ else:
125
+ x = x[0]
126
+
127
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
128
+
129
+ x = self.projects[i](x)
130
+ x = self.resize_layers[i](x)
131
+
132
+ out.append(x)
133
+
134
+ layer_1, layer_2, layer_3, layer_4 = out
135
+
136
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
137
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
138
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
139
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
140
+
141
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
142
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
143
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
144
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
145
+
146
+ out = self.scratch.output_conv1(path_1)
147
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
148
+ out = self.scratch.output_conv2(out)
149
+
150
+ return out
151
+
152
+
153
+ class DepthAnythingV2(nn.Module):
154
+ def __init__(
155
+ self,
156
+ encoder='vitl',
157
+ features=256,
158
+ out_channels=[256, 512, 1024, 1024],
159
+ use_bn=False,
160
+ use_clstoken=False
161
+ ):
162
+ super(DepthAnythingV2, self).__init__()
163
+
164
+ self.intermediate_layer_idx = {
165
+ 'vits': [2, 5, 8, 11],
166
+ 'vitb': [2, 5, 8, 11],
167
+ 'vitl': [4, 11, 17, 23],
168
+ 'vitg': [9, 19, 29, 39]
169
+ }
170
+
171
+ self.encoder = encoder
172
+ self.pretrained = DINOv2(model_name=encoder)
173
+
174
+ self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
175
+
176
+ def forward(self, x):
177
+ patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
178
+ features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True)
179
+
180
+ return features
181
+
182
+ @torch.no_grad()
183
+ def infer_image(self, raw_image, input_size=336, is_dsg=False):
184
+ image, (h, w) = self.image2tensor(raw_image, input_size)
185
+
186
+ features = self.forward(image)
187
+ if is_dsg:
188
+ return features
189
+ # feats = torch.cat([f[0] for f in features], dim=2)
190
+ feats = features[-1][0]
191
+
192
+ return feats
193
+
194
+ def image2tensor(self, raw_image, input_size=518):
195
+ transform = Compose([
196
+ Resize(
197
+ width=input_size,
198
+ height=input_size,
199
+ resize_target=False,
200
+ keep_aspect_ratio=True,
201
+ ensure_multiple_of=14,
202
+ resize_method='lower_bound',
203
+ image_interpolation_method=cv2.INTER_CUBIC,
204
+ ),
205
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
206
+ PrepareForNet(),
207
+ ])
208
+
209
+ h, w = raw_image.shape[:2]
210
+
211
+ image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
212
+
213
+ image = transform({'image': image})['image']
214
+ image = torch.from_numpy(image).unsqueeze(0)
215
+
216
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
217
+ image = image.to(DEVICE)
218
+
219
+ return image, (h, w)
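Editor's note: in this repository's copy, `DepthAnythingV2.forward` returns the DINOv2 intermediate features and never runs the DPT head, since the model serves as a feature target rather than a depth predictor. A usage sketch with randomly initialised weights; the image path is hypothetical, and a checkpoint would need to be loaded via `load_state_dict` for meaningful features.

```python
import cv2
import torch

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model = DepthAnythingV2(encoder="vitl").to(device).eval()

raw = cv2.imread("example.jpg")                  # hypothetical BGR uint8 image
feats = model.infer_image(raw, input_size=336)   # last-level DINOv2 patch tokens
print(feats.shape)                               # (1, num_patches, 1024) for vitl
```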
ola_vlm/model/aux_heads/depth_anything_v2/util/blocks.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5
+ scratch = nn.Module()
6
+
7
+ out_shape1 = out_shape
8
+ out_shape2 = out_shape
9
+ out_shape3 = out_shape
10
+ if len(in_shape) >= 4:
11
+ out_shape4 = out_shape
12
+
13
+ if expand:
14
+ out_shape1 = out_shape
15
+ out_shape2 = out_shape * 2
16
+ out_shape3 = out_shape * 4
17
+ if len(in_shape) >= 4:
18
+ out_shape4 = out_shape * 8
19
+
20
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
21
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
22
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
23
+ if len(in_shape) >= 4:
24
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
25
+
26
+ return scratch
27
+
28
+
29
+ class ResidualConvUnit(nn.Module):
30
+ """Residual convolution module.
31
+ """
32
+
33
+ def __init__(self, features, activation, bn):
34
+ """Init.
35
+
36
+ Args:
37
+ features (int): number of features
38
+ """
39
+ super().__init__()
40
+
41
+ self.bn = bn
42
+
43
+ self.groups=1
44
+
45
+ self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
46
+
47
+ self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
48
+
49
+ if self.bn == True:
50
+ self.bn1 = nn.BatchNorm2d(features)
51
+ self.bn2 = nn.BatchNorm2d(features)
52
+
53
+ self.activation = activation
54
+
55
+ self.skip_add = nn.quantized.FloatFunctional()
56
+
57
+ def forward(self, x):
58
+ """Forward pass.
59
+
60
+ Args:
61
+ x (tensor): input
62
+
63
+ Returns:
64
+ tensor: output
65
+ """
66
+
67
+ out = self.activation(x)
68
+ out = self.conv1(out)
69
+ if self.bn == True:
70
+ out = self.bn1(out)
71
+
72
+ out = self.activation(out)
73
+ out = self.conv2(out)
74
+ if self.bn == True:
75
+ out = self.bn2(out)
76
+
77
+ if self.groups > 1:
78
+ out = self.conv_merge(out)
79
+
80
+ return self.skip_add.add(out, x)
81
+
82
+
83
+ class FeatureFusionBlock(nn.Module):
84
+ """Feature fusion block.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ features,
90
+ activation,
91
+ deconv=False,
92
+ bn=False,
93
+ expand=False,
94
+ align_corners=True,
95
+ size=None
96
+ ):
97
+ """Init.
98
+
99
+ Args:
100
+ features (int): number of features
101
+ """
102
+ super(FeatureFusionBlock, self).__init__()
103
+
104
+ self.deconv = deconv
105
+ self.align_corners = align_corners
106
+
107
+ self.groups=1
108
+
109
+ self.expand = expand
110
+ out_features = features
111
+ if self.expand == True:
112
+ out_features = features // 2
113
+
114
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
115
+
116
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
117
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
118
+
119
+ self.skip_add = nn.quantized.FloatFunctional()
120
+
121
+ self.size=size
122
+
123
+ def forward(self, *xs, size=None):
124
+ """Forward pass.
125
+
126
+ Returns:
127
+ tensor: output
128
+ """
129
+ output = xs[0]
130
+
131
+ if len(xs) == 2:
132
+ res = self.resConfUnit1(xs[1])
133
+ output = self.skip_add.add(output, res)
134
+
135
+ output = self.resConfUnit2(output)
136
+
137
+ if (size is None) and (self.size is None):
138
+ modifier = {"scale_factor": 2}
139
+ elif size is None:
140
+ modifier = {"size": self.size}
141
+ else:
142
+ modifier = {"size": size}
143
+
144
+ output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
145
+
146
+ output = self.out_conv(output)
147
+
148
+ return output
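Editor's note: a shape sketch of the fusion path (assuming this file's classes are in scope). `_make_scratch` builds the per-level 3x3 reduction convs, and `FeatureFusionBlock` merges a lateral feature into the decoder path and upsamples to `size`.

```python
import torch
import torch.nn as nn

scratch = _make_scratch([256, 512, 1024, 1024], 256)   # builds layer1_rn .. layer4_rn 3x3 convs
fuse = FeatureFusionBlock(256, nn.ReLU(False), size=(64, 64))
decoder = torch.randn(1, 256, 32, 32)    # coarser decoder path
lateral = torch.randn(1, 256, 32, 32)    # skip feature at the same resolution
out = fuse(decoder, lateral)             # fused, then upsampled to `size`
assert out.shape == (1, 256, 64, 64)
```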
ola_vlm/model/aux_heads/depth_anything_v2/util/transform.py ADDED
@@ -0,0 +1,158 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
+
5
+ class Resize(object):
6
+ """Resize sample to given size (width, height).
7
+ """
8
+
9
+ def __init__(
10
+ self,
11
+ width,
12
+ height,
13
+ resize_target=True,
14
+ keep_aspect_ratio=False,
15
+ ensure_multiple_of=1,
16
+ resize_method="lower_bound",
17
+ image_interpolation_method=cv2.INTER_AREA,
18
+ ):
19
+ """Init.
20
+
21
+ Args:
22
+ width (int): desired output width
23
+ height (int): desired output height
24
+ resize_target (bool, optional):
25
+ True: Resize the full sample (image, mask, target).
26
+ False: Resize image only.
27
+ Defaults to True.
28
+ keep_aspect_ratio (bool, optional):
29
+ True: Keep the aspect ratio of the input sample.
30
+ Output sample might not have the given width and height, and
31
+ resize behaviour depends on the parameter 'resize_method'.
32
+ Defaults to False.
33
+ ensure_multiple_of (int, optional):
34
+ Output width and height is constrained to be multiple of this parameter.
35
+ Defaults to 1.
36
+ resize_method (str, optional):
37
+ "lower_bound": Output will be at least as large as the given size.
38
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
39
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
40
+ Defaults to "lower_bound".
41
+ """
42
+ self.__width = width
43
+ self.__height = height
44
+
45
+ self.__resize_target = resize_target
46
+ self.__keep_aspect_ratio = keep_aspect_ratio
47
+ self.__multiple_of = ensure_multiple_of
48
+ self.__resize_method = resize_method
49
+ self.__image_interpolation_method = image_interpolation_method
50
+
51
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
52
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
53
+
54
+ if max_val is not None and y > max_val:
55
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
56
+
57
+ if y < min_val:
58
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
59
+
60
+ return y
61
+
62
+ def get_size(self, width, height):
63
+ # determine new height and width
64
+ scale_height = self.__height / height
65
+ scale_width = self.__width / width
66
+
67
+ if self.__keep_aspect_ratio:
68
+ if self.__resize_method == "lower_bound":
69
+ # scale such that output size is lower bound
70
+ if scale_width > scale_height:
71
+ # fit width
72
+ scale_height = scale_width
73
+ else:
74
+ # fit height
75
+ scale_width = scale_height
76
+ elif self.__resize_method == "upper_bound":
77
+ # scale such that output size is upper bound
78
+ if scale_width < scale_height:
79
+ # fit width
80
+ scale_height = scale_width
81
+ else:
82
+ # fit height
83
+ scale_width = scale_height
84
+ elif self.__resize_method == "minimal":
85
+ # scale as little as possible
86
+ if abs(1 - scale_width) < abs(1 - scale_height):
87
+ # fit width
88
+ scale_height = scale_width
89
+ else:
90
+ # fit height
91
+ scale_width = scale_height
92
+ else:
93
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
94
+
95
+ if self.__resize_method == "lower_bound":
96
+ new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
97
+ new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
98
+ elif self.__resize_method == "upper_bound":
99
+ new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
100
+ new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
101
+ elif self.__resize_method == "minimal":
102
+ new_height = self.constrain_to_multiple_of(scale_height * height)
103
+ new_width = self.constrain_to_multiple_of(scale_width * width)
104
+ else:
105
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
106
+
107
+ return (new_width, new_height)
108
+
109
+ def __call__(self, sample):
110
+ width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
111
+
112
+ # resize sample
113
+ sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
114
+
115
+ if self.__resize_target:
116
+ if "depth" in sample:
117
+ sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
118
+
119
+ if "mask" in sample:
120
+ sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)
121
+
122
+ return sample
123
+
124
+
125
+ class NormalizeImage(object):
126
+ """Normlize image by given mean and std.
127
+ """
128
+
129
+ def __init__(self, mean, std):
130
+ self.__mean = mean
131
+ self.__std = std
132
+
133
+ def __call__(self, sample):
134
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
135
+
136
+ return sample
137
+
138
+
139
+ class PrepareForNet(object):
140
+ """Prepare sample for usage as network input.
141
+ """
142
+
143
+ def __init__(self):
144
+ pass
145
+
146
+ def __call__(self, sample):
147
+ image = np.transpose(sample["image"], (2, 0, 1))
148
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
149
+
150
+ if "depth" in sample:
151
+ depth = sample["depth"].astype(np.float32)
152
+ sample["depth"] = np.ascontiguousarray(depth)
153
+
154
+ if "mask" in sample:
155
+ sample["mask"] = sample["mask"].astype(np.float32)
156
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
157
+
158
+ return sample
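Editor's note: these three transforms are composed exactly as in `DepthAnythingV2.image2tensor`; a standalone sketch with a synthetic image (the input must already be RGB with values in [0, 1]):

```python
import cv2
import numpy as np
from torchvision.transforms import Compose

transform = Compose([
    Resize(width=518, height=518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method="lower_bound",
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

rgb = np.random.rand(480, 640, 3).astype(np.float32)   # stand-in for an RGB image in [0, 1]
out = transform({"image": rgb})["image"]
print(out.shape)   # (3, 518, 686): each side >= 518, a multiple of 14, aspect ratio kept
```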