{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "eb580f54d0ec4077b5c39da99f35c4f0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/38 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor\n",
"from qwen_vl_utils import process_vision_info\n",
"import torch\n",
"\n",
"#torch_device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"local_model_path = \"models/QVQ-72B-Preview\"\n",
"\n",
"# default: Load the model on the available device(s)\n",
"model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
" #\"Qwen/QVQ-72B-Preview\", \n",
" local_model_path,\n",
" torch_dtype=\"auto\", \n",
" device_map=\"auto\"\n",
")\n",
"#model.to(\"cuda\") \n",
"# default processer\n",
"processor = AutoProcessor.from_pretrained(local_model_path)\n",
"\n",
"# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.\n",
"# min_pixels = 256*28*28\n",
"# max_pixels = 1280*28*28\n",
"# processor = AutoProcessor.from_pretrained(\"Qwen/QVQ-72B-Preview\", min_pixels=min_pixels, max_pixels=max_pixels)\n"
]
},
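{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch, not run in this notebook: device_map=\"auto\" decides the\n",
"# layer-to-GPU split on its own (the resulting hf_device_map is printed in the\n",
"# next cell). If one GPU runs short of memory, from_pretrained also accepts an\n",
"# accelerate-style max_memory budget per device. The GiB values below are\n",
"# illustrative assumptions, not measured requirements for QVQ-72B-Preview.\n",
"# model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
"#     local_model_path,\n",
"#     torch_dtype=\"auto\",\n",
"#     device_map=\"auto\",\n",
"#     max_memory={0: \"40GiB\", 1: \"40GiB\", 2: \"40GiB\", 3: \"40GiB\", \"cpu\": \"96GiB\"},\n",
"# )\n"
]
},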
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"######################################################\n",
"{'visual': 0, 'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.layers.32': 1, 'model.layers.33': 1, 'model.layers.34': 1, 'model.layers.35': 1, 'model.layers.36': 1, 'model.layers.37': 1, 'model.layers.38': 1, 'model.layers.39': 1, 'model.layers.40': 2, 'model.layers.41': 2, 'model.layers.42': 2, 'model.layers.43': 2, 'model.layers.44': 2, 'model.layers.45': 2, 'model.layers.46': 2, 'model.layers.47': 2, 'model.layers.48': 2, 'model.layers.49': 2, 'model.layers.50': 2, 'model.layers.51': 2, 'model.layers.52': 2, 'model.layers.53': 2, 'model.layers.54': 2, 'model.layers.55': 2, 'model.layers.56': 2, 'model.layers.57': 2, 'model.layers.58': 2, 'model.layers.59': 2, 'model.layers.60': 2, 'model.layers.61': 2, 'model.layers.62': 3, 'model.layers.63': 3, 'model.layers.64': 3, 'model.layers.65': 3, 'model.layers.66': 3, 'model.layers.67': 3, 'model.layers.68': 3, 'model.layers.69': 3, 'model.layers.70': 3, 'model.layers.71': 3, 'model.layers.72': 3, 'model.layers.73': 3, 'model.layers.74': 3, 'model.layers.75': 3, 'model.layers.76': 3, 'model.layers.77': 3, 'model.layers.78': 3, 'model.layers.79': 3, 'model.norm': 3, 'model.rotary_emb': 3, 'lm_head': 3}\n",
"cuda:0\n",
"input_ids: cuda:0\n",
"attention_mask: cuda:0\n",
"pixel_values: cuda:0\n",
"image_grid_thw: cuda:0\n",
"######################################################\n",
"[\"So I've got this puzzle here with emojis representing numbers, and I need to figure out what goes in the blank space. Let's see, there are four equations, and each one uses hearts, bows, and dogs as symbols. I need to assign numbers to these symbols based on the equations provided.\\n\\nFirst equation: four hearts added together equal 24. So, 4 hearts = 24. That seems straightforward. If I divide both sides by 4, then one heart equals 6. Okay, so heart = 6.\\n\\nSecond equation: one heart minus one bow equals 1. So, heart - bow = 1. I already know that heart is 6, so 6 - bow = 1. To find bow, I can subtract 6 from both sides, but wait, 6 - bow = 1 would mean bow is 5, because 6 - 5 = 1. Yeah, that makes sense.\\n\\nThird equation: one heart plus one bow plus one dog equals 19. So, heart + bow + dog = 19. I know heart is 6 and bow is 5, so 6 + 5 + dog = 19. That means 11 + dog = 19. Subtracting 11 from both sides, dog = 8. Okay, dog = 8.\\n\\nNow, the fourth equation is: one heart plus one bow times one dog equals what? So, heart + bow × dog = ?. Plugging in the values I have: 6 + 5 × 8 = ?\\n\\nWait a minute, I need to remember the order of operations here. Multiplication comes before addition in PEMDAS, so I should do the multiplication first and then add.\\n\\nSo, 5 × 8 is 40, and then 6 + 40 is 46. Therefore, the blank space should be 46.\\n\\nLet me double-check to make sure I didn't make any mistakes. Starting with heart = 6, bow = 5, and dog = 8.\\n\\nFirst equation: 4 hearts = 24. 4 × 6 = 24. Correct.\\n\\nSecond equation: 6 - 5 = 1. Correct.\\n\\nThird equation: 6 + 5 + 8 = 19. Correct.\\n\\nFourth equation: 6 + (5 × 8) = 46. That seems right.\\n\\nI think that's the answer\"]\n"
]
}
],
"source": [
"\n",
"messages = [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": [\n",
" {\"type\": \"text\", \"text\": \"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.\"}\n",
" ],\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png\",\n",
" },\n",
" {\"type\": \"text\", \"text\": \"What value should be filled in the blank space?\"},\n",
" ],\n",
" }\n",
"]\n",
"\n",
"# Preparation for inference\n",
"text = processor.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
")\n",
"\n",
"image_inputs, video_inputs = process_vision_info(messages)\n",
"inputs = processor(\n",
" text=[text] ,\n",
" images=image_inputs ,\n",
" videos=video_inputs ,\n",
" padding=True,\n",
" return_tensors=\"pt\",\n",
")\n",
"inputs = inputs.to(\"cuda\")\n",
"torch.set_num_threads(16)\n",
"print(\"######################################################\")\n",
"print(model.hf_device_map)\n",
"print(next(model.parameters()).device)\n",
"for key, value in inputs.items():\n",
" if isinstance(value, torch.Tensor):\n",
" print(f\"{key}: {value.device}\")\n",
"print(\"######################################################\")\n",
"# Inference: Generation of the output\n",
"\n",
"generated_ids = model.generate(**inputs, max_new_tokens=512)\n",
"\n",
"generated_ids_trimmed = [\n",
" out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
"]\n",
"output_text = processor.batch_decode(\n",
" generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
")\n",
"print(output_text)\n"
]
}
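,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch, not executed here: stream the step-by-step reasoning to\n",
"# stdout token by token instead of waiting for the full 512-token completion.\n",
"# Uses transformers' TextStreamer; processor.tokenizer is assumed to expose\n",
"# the underlying Qwen2 tokenizer.\n",
"from transformers import TextStreamer\n",
"\n",
"streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"_ = model.generate(**inputs, streamer=streamer, max_new_tokens=512)\n"
]
}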
],
"metadata": {
"kernelspec": {
"display_name": "test_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}