karamjotsingh
/

MultiModalPositionIdDebug

Model card Files Files and versions

xet

Community

karamjotsingh committed 9 days ago

Commit

b174e81

verified ·

1 Parent(s): ee474a4

Upload position_ids_debug.ipynb with huggingface_hub

Browse files

Files changed (1) hide show

position_ids_debug.ipynb +266 -0

position_ids_debug.ipynb ADDED Viewed

	@@ -0,0 +1,266 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6511a91c-ed20-41ff-befb-699bda1912a3",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-03-25T05:42:29.023013Z",
+     "iopub.status.busy": "2026-03-25T05:42:29.022863Z",
+     "iopub.status.idle": "2026-03-25T05:42:40.880280Z",
+     "shell.execute_reply": "2026-03-25T05:42:40.879248Z",
+     "shell.execute_reply.started": "2026-03-25T05:42:29.022998Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8310548c3b0d460899adcb96ee4af2e1",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (incomplete total...): 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "663ea1161c934235a53948b93d224495",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "667df34dda224931ac9ccd442a5d42f0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading weights:   0%|          | 0/824 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[text] batch=0, tokens=4, pos=0..3 (t=h=w): [0, 1, 2, 3]\n",
+      "get_vision_position_ids: grid_thw=tensor([ 1, 18, 18], device='cuda:0'), llm_grid_thw=(1, 9, 9), start_position=4\n",
+      "  temp_merge_size=1, spatial_merge_size=2\n",
+      "  image_seq_length=81\n",
+      "  position_width (repeat)=[4, 5, 6, 7, 8, 9, 10, 11, 12, 4]...[12, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n",
+      "  position_height (repeat_interleave)=[4, 4, 4, 4, 4, 4, 4, 4, 4, 5]...[11, 12, 12, 12, 12, 12, 12, 12, 12, 12]\n",
+      "  position_temporal (torch.full) (before spacing)=[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]...[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]\n",
+      "  time_interval=2\n",
+      "  position_temporal (after spacing)=[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]...[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]\n",
+      "[vision pos] grid_thw=tensor([ 1, 18, 18], device='cuda:0'), start=4\n",
+      "  t: [8, 8, 8, 8, 8, 8, 8, 8, 8, 8]...[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]\n",
+      "  h: [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]...[11, 12, 12, 12, 12, 12, 12, 12, 12, 12]\n",
+      "  w: [4, 5, 6, 7, 8, 9, 10, 11, 12, 4]...[12, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n",
+      "[text] batch=0, tokens=9, pos=13..21 (t=h=w): [13, 14, 15, 16, 17, 18, 19, 20, 21]\n",
+      "[LLM prefill] position_ids shape: torch.Size([3, 1, 94])  (3=t/h/w, bs, seq_len)\n",
+      "  batch 0 (shape: 94):\n",
+      "    t: [0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n",
+      "    h: [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n",
+      "    w: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n"
+     ]
+    },
+    {
+     "ename": "SystemExit",
+     "evalue": "Debugging: Terminate after 1st decoder saved cos and sin tensors.",
+     "output_type": "error",
+     "traceback": [
+      "An exception has occurred, use %tb to see the full traceback.\n",
+      "\u001b[31mSystemExit\u001b[39m\u001b[31m:\u001b[39m Debugging: Terminate after 1st decoder saved cos and sin tensors.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ubuntu/miniconda3/envs/dc_airnd/lib/python3.12/site-packages/IPython/core/interactiveshell.py:3755: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n",
+      "  warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor\n",
+    "from qwen_vl_utils import process_vision_info\n",
+    "\n",
+    "# 1. Load Model and Processor\n",
+    "model_name = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
+    "model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
+    "    model_name, torch_dtype=torch.float16, device_map=\"auto\"\n",
+    ")\n",
+    "processor = AutoProcessor.from_pretrained(model_name)\n",
+    "\n",
+    "# 2. Define your inputs manually\n",
+    "image_url = \"./car-1_256_0.jpg\"\n",
+    "user_query = \"Describe the image\"\n",
+    "\n",
+    "# 3. Construct the prompt string manually\n",
+    "# Qwen2.5-VL expects specific tokens to wrap system, user, and assistant roles.\n",
+    "# Note: <|vision_start|> and <|vision_end|> delimit the image span; the processor\n",
+    "# expands the <|image_pad|> placeholder between them into one token per merged image patch.\n",
+    "prompt = (\n",
+    "    \"<|im_start|>user\\n\"\n",
+    "    \"<|vision_start|><|image_pad|><|vision_end|>\"\n",
+    "    f\"{user_query}<|im_end|>\\n\"\n",
+    "    \"<|im_start|>assistant\\n\"\n",
+    ")\n",
+    "\n",
+    "# 4. Process the vision information\n",
+    "# We still use this utility to fetch the image and handle resizing logic\n",
+    "messages = [{\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_url}]}]\n",
+    "image_inputs, _ = process_vision_info(messages)\n",
+    "\n",
+    "# 5. Tokenize and Prepare Tensors\n",
+    "inputs = processor(\n",
+    "    text=[prompt],\n",
+    "    images=image_inputs,\n",
+    "    videos=None,\n",
+    "    padding=True,\n",
+    "    return_tensors=\"pt\",\n",
+    ")\n",
+    "inputs = inputs.to(model.device)\n",
+    "\n",
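+    "# 6. Generate (in the recorded run this call ends with SystemExit after the 1st decoder saves cos and sin tensors, so the steps below never ran)\n",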
+    "generated_ids = model.generate(**inputs, max_new_tokens=100)\n",
+    "\n",
+    "# Trim the prompt tokens from the result\n",
+    "generated_ids_trimmed = [\n",
+    "    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
+    "]\n",
+    "\n",
+    "output_text = processor.batch_decode(\n",
+    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nManual Prompt Response: {output_text[0]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f45df021-6302-4f47-9e06-8070577885a2",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-03-25T04:36:13.766580Z",
+     "iopub.status.busy": "2026-03-25T04:36:13.766400Z",
+     "iopub.status.idle": "2026-03-25T04:36:13.770145Z",
+     "shell.execute_reply": "2026-03-25T04:36:13.769588Z",
+     "shell.execute_reply.started": "2026-03-25T04:36:13.766563Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe the image<|im_end|>\\n<|im_start|>assistant\\n'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "504fa71b-42b4-4f53-8988-25fcfba38d13",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-03-25T05:43:53.839325Z",
+     "iopub.status.busy": "2026-03-25T05:43:53.839044Z",
+     "iopub.status.idle": "2026-03-25T05:43:53.843214Z",
+     "shell.execute_reply": "2026-03-25T05:43:53.842555Z",
+     "shell.execute_reply.started": "2026-03-25T05:43:53.839304Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "cos = torch.load('cos.pt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "642d9dcf-e591-4d70-96af-b69bf955d9e1",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-03-25T05:43:54.296041Z",
+     "iopub.status.busy": "2026-03-25T05:43:54.295869Z",
+     "iopub.status.idle": "2026-03-25T05:43:54.299276Z",
+     "shell.execute_reply": "2026-03-25T05:43:54.298634Z",
+     "shell.execute_reply.started": "2026-03-25T05:43:54.296029Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(torch.Size([1, 1, 94, 128]), torch.float16)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cos.shape, cos.dtype"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f44460e3-58e9-4fd2-898a-06e8a00f9365",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}