karamjotsingh committed on
Commit
b174e81
·
verified ·
1 Parent(s): ee474a4

Upload position_ids_debug.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. position_ids_debug.ipynb +266 -0
position_ids_debug.ipynb ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "6511a91c-ed20-41ff-befb-699bda1912a3",
7
+ "metadata": {
8
+ "execution": {
9
+ "iopub.execute_input": "2026-03-25T05:42:29.023013Z",
10
+ "iopub.status.busy": "2026-03-25T05:42:29.022863Z",
11
+ "iopub.status.idle": "2026-03-25T05:42:40.880280Z",
12
+ "shell.execute_reply": "2026-03-25T05:42:40.879248Z",
13
+ "shell.execute_reply.started": "2026-03-25T05:42:29.022998Z"
14
+ },
15
+ "scrolled": true
16
+ },
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "application/vnd.jupyter.widget-view+json": {
21
+ "model_id": "8310548c3b0d460899adcb96ee4af2e1",
22
+ "version_major": 2,
23
+ "version_minor": 0
24
+ },
25
+ "text/plain": [
26
+ "Downloading (incomplete total...): 0.00B [00:00, ?B/s]"
27
+ ]
28
+ },
29
+ "metadata": {},
30
+ "output_type": "display_data"
31
+ },
32
+ {
33
+ "data": {
34
+ "application/vnd.jupyter.widget-view+json": {
35
+ "model_id": "663ea1161c934235a53948b93d224495",
36
+ "version_major": 2,
37
+ "version_minor": 0
38
+ },
39
+ "text/plain": [
40
+ "Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s]"
41
+ ]
42
+ },
43
+ "metadata": {},
44
+ "output_type": "display_data"
45
+ },
46
+ {
47
+ "data": {
48
+ "application/vnd.jupyter.widget-view+json": {
49
+ "model_id": "667df34dda224931ac9ccd442a5d42f0",
50
+ "version_major": 2,
51
+ "version_minor": 0
52
+ },
53
+ "text/plain": [
54
+ "Loading weights: 0%| | 0/824 [00:00<?, ?it/s]"
55
+ ]
56
+ },
57
+ "metadata": {},
58
+ "output_type": "display_data"
59
+ },
60
+ {
61
+ "name": "stdout",
62
+ "output_type": "stream",
63
+ "text": [
64
+ "[text] batch=0, tokens=4, pos=0..3 (t=h=w): [0, 1, 2, 3]\n",
65
+ "get_vision_position_ids: grid_thw=tensor([ 1, 18, 18], device='cuda:0'), llm_grid_thw=(1, 9, 9), start_position=4\n",
66
+ " temp_merge_size=1, spatial_merge_size=2\n",
67
+ " image_seq_length=81\n",
68
+ " position_width (repeat)=[4, 5, 6, 7, 8, 9, 10, 11, 12, 4]...[12, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n",
69
+ " position_height (repeat_interleave)=[4, 4, 4, 4, 4, 4, 4, 4, 4, 5]...[11, 12, 12, 12, 12, 12, 12, 12, 12, 12]\n",
70
+ " position_temporal (torch.full) (before spacing)=[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]...[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]\n",
71
+ " time_interval=2\n",
72
+ " position_temporal (after spacing)=[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]...[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]\n",
73
+ "[vision pos] grid_thw=tensor([ 1, 18, 18], device='cuda:0'), start=4\n",
74
+ " t: [8, 8, 8, 8, 8, 8, 8, 8, 8, 8]...[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]\n",
75
+ " h: [4, 4, 4, 4, 4, 4, 4, 4, 4, 5]...[11, 12, 12, 12, 12, 12, 12, 12, 12, 12]\n",
76
+ " w: [4, 5, 6, 7, 8, 9, 10, 11, 12, 4]...[12, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n",
77
+ "[text] batch=0, tokens=9, pos=13..21 (t=h=w): [13, 14, 15, 16, 17, 18, 19, 20, 21]\n",
78
+ "[LLM prefill] position_ids shape: torch.Size([3, 1, 94]) (3=t/h/w, bs, seq_len)\n",
79
+ " batch 0 (shape: 94):\n",
80
+ " t: [0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n",
81
+ " h: [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n",
82
+ " w: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] \n"
83
+ ]
84
+ },
85
+ {
86
+ "ename": "SystemExit",
87
+ "evalue": "Debugging: Terminate after 1st decoder saved cos and sin tensors.",
88
+ "output_type": "error",
89
+ "traceback": [
90
+ "An exception has occurred, use %tb to see the full traceback.\n",
91
+ "\u001b[31mSystemExit\u001b[39m\u001b[31m:\u001b[39m Debugging: Terminate after 1st decoder saved cos and sin tensors.\n"
92
+ ]
93
+ },
94
+ {
95
+ "name": "stderr",
96
+ "output_type": "stream",
97
+ "text": [
98
+ "/home/ubuntu/miniconda3/envs/dc_airnd/lib/python3.12/site-packages/IPython/core/interactiveshell.py:3755: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n",
99
+ " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n"
100
+ ]
101
+ }
102
+ ],
103
+ "source": [
104
+ "import torch\n",
105
+ "from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor\n",
106
+ "from qwen_vl_utils import process_vision_info\n",
107
+ "\n",
108
+ "# 1. Load Model and Processor\n",
109
+ "model_name = \"Qwen/Qwen2.5-VL-3B-Instruct\"\n",
110
+ "model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
111
+ " model_name, torch_dtype=torch.float16, device_map=\"auto\"\n",
112
+ ")\n",
113
+ "processor = AutoProcessor.from_pretrained(model_name)\n",
114
+ "\n",
115
+ "# 2. Define your inputs manually\n",
116
+ "image_url = \"./car-1_256_0.jpg\"\n",
117
+ "user_query = \"Describe the image\"\n",
118
+ "\n",
119
+ "# 3. Construct the prompt string manually\n",
120
+ "# Qwen2.5-VL expects specific tokens to wrap system, user, and assistant roles.\n",
121
+ "# Note: The <|vision_start|> and <|vision_end|> tags tell the processor \n",
122
+ "# where to inject the image features.\n",
123
+ "prompt = (\n",
124
+ " \"<|im_start|>user\\n\"\n",
125
+ " \"<|vision_start|><|image_pad|><|vision_end|>\"\n",
126
+ " f\"{user_query}<|im_end|>\\n\"\n",
127
+ " \"<|im_start|>assistant\\n\"\n",
128
+ ")\n",
129
+ "\n",
130
+ "# 4. Process the vision information\n",
131
+ "# We still use this utility to fetch the image and handle resizing logic\n",
132
+ "messages = [{\"role\": \"user\", \"content\": [{\"type\": \"image\", \"image\": image_url}]}]\n",
133
+ "image_inputs, _ = process_vision_info(messages)\n",
134
+ "\n",
135
+ "# 5. Tokenize and Prepare Tensors\n",
136
+ "inputs = processor(\n",
137
+ " text=[prompt],\n",
138
+ " images=image_inputs,\n",
139
+ " videos=None,\n",
140
+ " padding=True,\n",
141
+ " return_tensors=\"pt\",\n",
142
+ ")\n",
143
+ "inputs = inputs.to(model.device)\n",
144
+ "\n",
145
+ "# 6. Generate\n",
146
+ "generated_ids = model.generate(**inputs, max_new_tokens=100)\n",
147
+ "\n",
148
+ "# Trim the prompt tokens from the result\n",
149
+ "generated_ids_trimmed = [\n",
150
+ " out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
151
+ "]\n",
152
+ "\n",
153
+ "output_text = processor.batch_decode(\n",
154
+ " generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
155
+ ")\n",
156
+ "\n",
157
+ "print(f\"\\nManual Prompt Response: {output_text[0]}\")"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 4,
163
+ "id": "f45df021-6302-4f47-9e06-8070577885a2",
164
+ "metadata": {
165
+ "execution": {
166
+ "iopub.execute_input": "2026-03-25T04:36:13.766580Z",
167
+ "iopub.status.busy": "2026-03-25T04:36:13.766400Z",
168
+ "iopub.status.idle": "2026-03-25T04:36:13.770145Z",
169
+ "shell.execute_reply": "2026-03-25T04:36:13.769588Z",
170
+ "shell.execute_reply.started": "2026-03-25T04:36:13.766563Z"
171
+ }
172
+ },
173
+ "outputs": [
174
+ {
175
+ "data": {
176
+ "text/plain": [
177
+ "'<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe the image<|im_end|>\\n<|im_start|>assistant\\n'"
178
+ ]
179
+ },
180
+ "execution_count": 4,
181
+ "metadata": {},
182
+ "output_type": "execute_result"
183
+ }
184
+ ],
185
+ "source": [
186
+ "prompt"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 2,
192
+ "id": "504fa71b-42b4-4f53-8988-25fcfba38d13",
193
+ "metadata": {
194
+ "execution": {
195
+ "iopub.execute_input": "2026-03-25T05:43:53.839325Z",
196
+ "iopub.status.busy": "2026-03-25T05:43:53.839044Z",
197
+ "iopub.status.idle": "2026-03-25T05:43:53.843214Z",
198
+ "shell.execute_reply": "2026-03-25T05:43:53.842555Z",
199
+ "shell.execute_reply.started": "2026-03-25T05:43:53.839304Z"
200
+ }
201
+ },
202
+ "outputs": [],
203
+ "source": [
204
+ "cos = torch.load('cos.pt')"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 3,
210
+ "id": "642d9dcf-e591-4d70-96af-b69bf955d9e1",
211
+ "metadata": {
212
+ "execution": {
213
+ "iopub.execute_input": "2026-03-25T05:43:54.296041Z",
214
+ "iopub.status.busy": "2026-03-25T05:43:54.295869Z",
215
+ "iopub.status.idle": "2026-03-25T05:43:54.299276Z",
216
+ "shell.execute_reply": "2026-03-25T05:43:54.298634Z",
217
+ "shell.execute_reply.started": "2026-03-25T05:43:54.296029Z"
218
+ }
219
+ },
220
+ "outputs": [
221
+ {
222
+ "data": {
223
+ "text/plain": [
224
+ "(torch.Size([1, 1, 94, 128]), torch.float16)"
225
+ ]
226
+ },
227
+ "execution_count": 3,
228
+ "metadata": {},
229
+ "output_type": "execute_result"
230
+ }
231
+ ],
232
+ "source": [
233
+ "cos.shape, cos.dtype"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "code",
238
+ "execution_count": null,
239
+ "id": "f44460e3-58e9-4fd2-898a-06e8a00f9365",
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": []
243
+ }
244
+ ],
245
+ "metadata": {
246
+ "kernelspec": {
247
+ "display_name": "Python 3 (ipykernel)",
248
+ "language": "python",
249
+ "name": "python3"
250
+ },
251
+ "language_info": {
252
+ "codemirror_mode": {
253
+ "name": "ipython",
254
+ "version": 3
255
+ },
256
+ "file_extension": ".py",
257
+ "mimetype": "text/x-python",
258
+ "name": "python",
259
+ "nbconvert_exporter": "python",
260
+ "pygments_lexer": "ipython3",
261
+ "version": "3.12.12"
262
+ }
263
+ },
264
+ "nbformat": 4,
265
+ "nbformat_minor": 5
266
+ }