czczup committed
Commit ef7aaf8
Parent: ab60f11

Upload folder using huggingface_hub

Files changed (1):
1. README.md (+118, −53)

README.md CHANGED
@@ -55,10 +55,98 @@ Limitations: Although we have made efforts to ensure the safety of the model dur

 We provide an example code to run Mini-InternVL-Chat-4B-V1-5 using `transformers`.

- We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/). Currently, due to the limited GPU resources with public IP addresses, we can only deploy models up to a maximum of 26B. We will expand soon and deploy larger models to the online demo.

 > Please use transformers==4.37.2 to ensure the model works normally.

 ```python
 import numpy as np
 import torch
@@ -71,7 +159,6 @@ from transformers import AutoModel, AutoTokenizer
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)

-
 def build_transform(input_size):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
     transform = T.Compose([
@@ -82,7 +169,6 @@ def build_transform(input_size):
     ])
     return transform

-
 def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
     best_ratio_diff = float('inf')
     best_ratio = (1, 1)
@@ -98,8 +184,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_
                 best_ratio = ratio
     return best_ratio

-
- def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
     orig_width, orig_height = image.size
     aspect_ratio = orig_width / orig_height

@@ -137,8 +222,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai
         processed_images.append(thumbnail_img)
     return processed_images

-
- def load_image(image_file, input_size=448, max_num=6):
     image = Image.open(image_file).convert('RGB')
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
@@ -146,70 +230,60 @@ def load_image(image_file, input_size=448, max_num=6):
     pixel_values = torch.stack(pixel_values)
     return pixel_values

-
 path = 'OpenGVLab/Mini-InternVL-Chat-4B-V1-5'
 model = AutoModel.from_pretrained(
     path,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()

- tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
 # set the max number of tiles in `max_num`
- pixel_values = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
-
- generation_config = dict(
-     num_beams=1,
-     max_new_tokens=1024,
-     do_sample=False,
- )

 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 # single-image single-round conversation (单图单轮对话)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 # single-image multi-round conversation (单图多轮对话)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)

 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

@@ -217,19 +291,17 @@ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detai
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 # batch inference, single image per sample (单图批处理)
- pixel_values1 = load_image('./examples/image1.jpg', max_num=6).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=6).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

@@ -239,8 +311,7 @@ responses = model.batch_chat(tokenizer, pixel_values,
                              questions=questions,
                              generation_config=generation_config)
 for question, response in zip(questions, responses):
-     print(f'User: {question}')
-     print(f'Assistant: {response}')

 # video multi-round conversation (视频多轮对话)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -275,29 +346,23 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
     pixel_values = torch.cat(pixel_values_list)
     return pixel_values, num_patches_list

-
 video_path = './examples/red-panda.mp4'
- # pixel_values, num_patches_list = load_video(video_path, num_segments=32, max_num=1)
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
 video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
 question = video_prefix + 'What is the red panda doing?'
- # Frame1: <image>\nFrame2: <image>\n...\nFrame31: <image>\n{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=None, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')

 question = 'Describe this video in detail. Don\'t repeat.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=history, return_history=True)
- print(f'User: {question}')
- print(f'Assistant: {response}')
 ```

- ### Streaming output

 Besides this method, you can also use the following code to get streamed output.

@@ -308,7 +373,7 @@ from threading import Thread
 # Initialize the streamer
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
 # Define the generation configuration
- generation_config = dict(num_beams=1, max_new_tokens=1024, do_sample=False, streamer=streamer)
 # Start the model chat in a separate thread
 thread = Thread(target=model.chat, kwargs=dict(
     tokenizer=tokenizer, pixel_values=pixel_values, question=question,
 
 We provide an example code to run Mini-InternVL-Chat-4B-V1-5 using `transformers`.

+ We also welcome you to experience the InternVL2 series models in our [online demo](https://internvl.opengvlab.com/).

 > Please use transformers==4.37.2 to ensure the model works normally.
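Because the example depends on that exact pinned version, a quick sanity check at the top of your script can save debugging time. This is an illustrative snippet, not part of the README being changed:

```python
import transformers

# The README pins transformers==4.37.2; the custom remote code bundled with the
# checkpoint may not work with other versions.
assert transformers.__version__ == "4.37.2", (
    f"Found transformers {transformers.__version__}; "
    "please install 4.37.2 (pip install transformers==4.37.2)."
)
```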
 
+ ### Model Loading
+
+ #### 16-bit (bf16 / fp16)
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/Mini-InternVL-Chat-4B-V1-5"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval().cuda()
+ ```
+
+ #### BNB 8-bit Quantization
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/Mini-InternVL-Chat-4B-V1-5"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     load_in_8bit=True,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval()
+ ```
+
+ #### BNB 4-bit Quantization
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ path = "OpenGVLab/Mini-InternVL-Chat-4B-V1-5"
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     load_in_4bit=True,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True).eval()
+ ```
+
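A practical note on the two quantized variants above: the `load_in_8bit` / `load_in_4bit` flags rely on the `bitsandbytes` package and a CUDA GPU. On the pinned transformers version, the same thing can be expressed through an explicit `BitsAndBytesConfig`; the following is a hedged sketch of that equivalent form, not something from the original README:

```python
import torch
from transformers import AutoModel, BitsAndBytesConfig

path = "OpenGVLab/Mini-InternVL-Chat-4B-V1-5"
# Equivalent 4-bit setup via an explicit quantization config (requires bitsandbytes).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModel.from_pretrained(
    path,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval()
```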
+ #### Multiple GPUs
+
+ The reason for writing the code this way is to avoid errors during multi-GPU inference caused by tensors not being on the same device. By keeping the first and last layers of the large language model (LLM) on the same device, we prevent such errors.
+
+ ```python
+ import math
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+
+ def split_model(model_name):
+     device_map = {}
+     world_size = torch.cuda.device_count()
+     num_layers = {'Mini-InternVL-2B-V1-5': 24, 'Mini-InternVL-4B-V1-5': 32, 'InternVL-Chat-V1-5': 48}[model_name]
+     # Since the first GPU will be used for ViT, treat it as half a GPU.
+     num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+     num_layers_per_gpu = [num_layers_per_gpu] * world_size
+     num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+     layer_cnt = 0
+     for i, num_layer in enumerate(num_layers_per_gpu):
+         for j in range(num_layer):
+             device_map[f'language_model.model.layers.{layer_cnt}'] = i
+             layer_cnt += 1
+     device_map['vision_model'] = 0
+     device_map['mlp1'] = 0
+     device_map['language_model.model.tok_embeddings'] = 0
+     device_map['language_model.model.embed_tokens'] = 0
+     device_map['language_model.output'] = 0
+     device_map['language_model.model.norm'] = 0
+     device_map['language_model.lm_head'] = 0
+     device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
+
+     return device_map
+
+ path = "OpenGVLab/Mini-InternVL-Chat-4B-V1-5"
+ device_map = split_model('Mini-InternVL-4B-V1-5')  # use a key that exists in num_layers above
+ model = AutoModel.from_pretrained(
+     path,
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True,
+     device_map=device_map).eval()
+ ```
+
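After loading with this `device_map`, usage is unchanged. As a quick hedged sanity check (not part of the README), a text-only round trip confirms the sharded model responds before moving on to images:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
generation_config = dict(max_new_tokens=512, do_sample=False)
# A pure-text question needs no image input, so pass None for pixel_values.
response = model.chat(tokenizer, None, 'Hello, who are you?', generation_config)
print(response)
```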
+ ### Inference with Transformers

 ```python
 import numpy as np
 import torch

 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)

 def build_transform(input_size):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
     transform = T.Compose([

     ])
     return transform

 def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
     best_ratio_diff = float('inf')
     best_ratio = (1, 1)

                 best_ratio = ratio
     return best_ratio

+ def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
     orig_width, orig_height = image.size
     aspect_ratio = orig_width / orig_height

         processed_images.append(thumbnail_img)
     return processed_images

+ def load_image(image_file, input_size=448, max_num=12):
     image = Image.open(image_file).convert('RGB')
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)

     pixel_values = torch.stack(pixel_values)
     return pixel_values

+ # If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
 path = 'OpenGVLab/Mini-InternVL-Chat-4B-V1-5'
 model = AutoModel.from_pretrained(
     path,
     torch_dtype=torch.bfloat16,
     low_cpu_mem_usage=True,
     trust_remote_code=True).eval().cuda()
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

 # set the max number of tiles in `max_num`
+ pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ generation_config = dict(max_new_tokens=1024, do_sample=False)

 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 # single-image single-round conversation (单图单轮对话)
 question = '<image>\nPlease describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
+ print(f'User: {question}\nAssistant: {response}')

 # single-image multi-round conversation (单图多轮对话)
 question = '<image>\nPlease describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

 question = '<image>\nDescribe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 # batch inference, single image per sample (单图批处理)
+ pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
+ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

                              questions=questions,
                              generation_config=generation_config)
 for question, response in zip(questions, responses):
+     print(f'User: {question}\nAssistant: {response}')

 # video multi-round conversation (视频多轮对话)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):

     pixel_values = torch.cat(pixel_values_list)
     return pixel_values, num_patches_list

 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
 video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
 question = video_prefix + 'What is the red panda doing?'
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=None, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')

 question = 'Describe this video in detail. Don\'t repeat.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
+                                num_patches_list=num_patches_list, history=history, return_history=True)
+ print(f'User: {question}\nAssistant: {response}')
 ```
+ #### Streaming output

 Besides this method, you can also use the following code to get streamed output.

 # Initialize the streamer
 streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
 # Define the generation configuration
+ generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
 # Start the model chat in a separate thread
 thread = Thread(target=model.chat, kwargs=dict(
     tokenizer=tokenizer, pixel_values=pixel_values, question=question,
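The diff hunk stops mid-snippet. As a minimal sketch of how such a streamer is typically consumed on the main thread (assuming the `thread` and `streamer` objects above and a completed `model.chat` call; not part of the shown diff):

```python
thread.start()

# Consume tokens from the streamer as the generation thread produces them.
generated_text = ''
for new_text in streamer:
    generated_text += new_text
    print(new_text, end='', flush=True)

thread.join()
```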