Upload test.ipynb with huggingface_hub
test.ipynb  +155 -0
test.ipynb
ADDED
@@ -0,0 +1,155 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "eb580f54d0ec4077b5c39da99f35c4f0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards: 0%| | 0/38 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
+    "from qwen_vl_utils import process_vision_info\n",
+    "import torch\n",
+    "\n",
+    "#torch_device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "\n",
+    "local_model_path = \"models/QVQ-72B-Preview\"\n",
+    "\n",
+    "# default: Load the model on the available device(s)\n",
+    "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
+    "    #\"Qwen/QVQ-72B-Preview\", \n",
+    "    local_model_path,\n",
+    "    torch_dtype=\"auto\", \n",
+    "    device_map=\"auto\"\n",
+    ")\n",
+    "#model.to(\"cuda\") \n",
+    "# default processor\n",
+    "processor = AutoProcessor.from_pretrained(local_model_path)\n",
+    "\n",
+    "# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.\n",
+    "# min_pixels = 256*28*28\n",
+    "# max_pixels = 1280*28*28\n",
+    "# processor = AutoProcessor.from_pretrained(\"Qwen/QVQ-72B-Preview\", min_pixels=min_pixels, max_pixels=max_pixels)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "######################################################\n",
+      "{'visual': 0, 'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.layers.32': 1, 'model.layers.33': 1, 'model.layers.34': 1, 'model.layers.35': 1, 'model.layers.36': 1, 'model.layers.37': 1, 'model.layers.38': 1, 'model.layers.39': 1, 'model.layers.40': 2, 'model.layers.41': 2, 'model.layers.42': 2, 'model.layers.43': 2, 'model.layers.44': 2, 'model.layers.45': 2, 'model.layers.46': 2, 'model.layers.47': 2, 'model.layers.48': 2, 'model.layers.49': 2, 'model.layers.50': 2, 'model.layers.51': 2, 'model.layers.52': 2, 'model.layers.53': 2, 'model.layers.54': 2, 'model.layers.55': 2, 'model.layers.56': 2, 'model.layers.57': 2, 'model.layers.58': 2, 'model.layers.59': 2, 'model.layers.60': 2, 'model.layers.61': 2, 'model.layers.62': 3, 'model.layers.63': 3, 'model.layers.64': 3, 'model.layers.65': 3, 'model.layers.66': 3, 'model.layers.67': 3, 'model.layers.68': 3, 'model.layers.69': 3, 'model.layers.70': 3, 'model.layers.71': 3, 'model.layers.72': 3, 'model.layers.73': 3, 'model.layers.74': 3, 'model.layers.75': 3, 'model.layers.76': 3, 'model.layers.77': 3, 'model.layers.78': 3, 'model.layers.79': 3, 'model.norm': 3, 'model.rotary_emb': 3, 'lm_head': 3}\n",
+      "cuda:0\n",
+      "input_ids: cuda:0\n",
+      "attention_mask: cuda:0\n",
+      "pixel_values: cuda:0\n",
+      "image_grid_thw: cuda:0\n",
+      "######################################################\n",
+      "[\"So I've got this puzzle here with emojis representing numbers, and I need to figure out what goes in the blank space. Let's see, there are four equations, and each one uses hearts, bows, and dogs as symbols. I need to assign numbers to these symbols based on the equations provided.\\n\\nFirst equation: four hearts added together equal 24. So, 4 hearts = 24. That seems straightforward. If I divide both sides by 4, then one heart equals 6. Okay, so heart = 6.\\n\\nSecond equation: one heart minus one bow equals 1. So, heart - bow = 1. I already know that heart is 6, so 6 - bow = 1. To find bow, I can subtract 6 from both sides, but wait, 6 - bow = 1 would mean bow is 5, because 6 - 5 = 1. Yeah, that makes sense.\\n\\nThird equation: one heart plus one bow plus one dog equals 19. So, heart + bow + dog = 19. I know heart is 6 and bow is 5, so 6 + 5 + dog = 19. That means 11 + dog = 19. Subtracting 11 from both sides, dog = 8. Okay, dog = 8.\\n\\nNow, the fourth equation is: one heart plus one bow times one dog equals what? So, heart + bow × dog = ?. Plugging in the values I have: 6 + 5 × 8 = ?\\n\\nWait a minute, I need to remember the order of operations here. Multiplication comes before addition in PEMDAS, so I should do the multiplication first and then add.\\n\\nSo, 5 × 8 is 40, and then 6 + 40 is 46. Therefore, the blank space should be 46.\\n\\nLet me double-check to make sure I didn't make any mistakes. Starting with heart = 6, bow = 5, and dog = 8.\\n\\nFirst equation: 4 hearts = 24. 4 × 6 = 24. Correct.\\n\\nSecond equation: 6 - 5 = 1. Correct.\\n\\nThird equation: 6 + 5 + 8 = 19. Correct.\\n\\nFourth equation: 6 + (5 × 8) = 46. That seems right.\\n\\nI think that's the answer\"]\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"system\",\n",
+    "        \"content\": [\n",
+    "            {\"type\": \"text\", \"text\": \"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.\"}\n",
+    "        ],\n",
+    "    },\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\n",
+    "                \"type\": \"image\",\n",
+    "                \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png\",\n",
+    "            },\n",
+    "            {\"type\": \"text\", \"text\": \"What value should be filled in the blank space?\"},\n",
+    "        ],\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "# Preparation for inference\n",
+    "text = processor.apply_chat_template(\n",
+    "    messages, tokenize=False, add_generation_prompt=True\n",
+    ")\n",
+    "\n",
+    "image_inputs, video_inputs = process_vision_info(messages)\n",
+    "inputs = processor(\n",
+    "    text=[text] ,\n",
+    "    images=image_inputs ,\n",
+    "    videos=video_inputs ,\n",
+    "    padding=True,\n",
+    "    return_tensors=\"pt\",\n",
+    ")\n",
+    "inputs = inputs.to(\"cuda\")\n",
+    "torch.set_num_threads(16)\n",
+    "print(\"######################################################\")\n",
+    "print(model.hf_device_map)\n",
+    "print(next(model.parameters()).device)\n",
+    "for key, value in inputs.items():\n",
+    "    if isinstance(value, torch.Tensor):\n",
+    "        print(f\"{key}: {value.device}\")\n",
+    "print(\"######################################################\")\n",
+    "# Inference: Generation of the output\n",
+    "\n",
+    "generated_ids = model.generate(**inputs, max_new_tokens=512)\n",
+    "\n",
+    "generated_ids_trimmed = [\n",
+    "    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
+    "]\n",
+    "output_text = processor.batch_decode(\n",
+    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
+    ")\n",
+    "print(output_text)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}