Visual Question Answering
Transformers
TensorBoard
Safetensors
internvl_chat
feature-extraction
custom_code
czczup committed
Commit 21a393e
1 Parent(s): 043cbb5

Upload folder using huggingface_hub

configuration_internvl_chat.py CHANGED
@@ -27,11 +27,10 @@ class InternVLChatConfig(PretrainedConfig):
         use_backbone_lora=0,
         use_llm_lora=0,
         pad2square=False,
-        select_layer=-4,
+        select_layer=-1,
         force_image_size=None,
         downsample_ratio=0.5,
         template=None,
-        image_fold=False,
         dynamic_image_size=False,
         use_thumbnail=False,
         ps_version='v1',
@@ -62,7 +61,6 @@ class InternVLChatConfig(PretrainedConfig):
         self.force_image_size = force_image_size
         self.downsample_ratio = downsample_ratio
         self.template = template
-        self.image_fold = image_fold
         self.dynamic_image_size = dynamic_image_size
         self.use_thumbnail = use_thumbnail
         self.ps_version = ps_version  # pixel shuffle version
@@ -70,7 +68,6 @@ class InternVLChatConfig(PretrainedConfig):
         self.max_dynamic_patch = max_dynamic_patch

         logger.info(f'vision_select_layer: {self.select_layer}')
-        logger.info(f'image_fold: {self.image_fold}')
         logger.info(f'ps_version: {self.ps_version}')
         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
         logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
@@ -93,7 +90,6 @@ class InternVLChatConfig(PretrainedConfig):
         output['force_image_size'] = self.force_image_size
         output['downsample_ratio'] = self.downsample_ratio
         output['template'] = self.template
-        output['image_fold'] = self.image_fold
         output['dynamic_image_size'] = self.dynamic_image_size
         output['use_thumbnail'] = self.use_thumbnail
         output['ps_version'] = self.ps_version
 
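Net effect of this file's change: the `select_layer` default moves from -4 (fourth-from-last ViT layer) to -1 (last layer), and the experimental `image_fold` option disappears from the constructor, its logging, and the serialized output. A minimal sketch of how this shows up when loading the config, assuming a local checkout of this repo at `./internvl-chat` (the hub repo id is not shown on this page):

    # Hedged sketch: the path and repo layout are assumptions, not from this commit.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained('./internvl-chat', trust_remote_code=True)
    print(config.select_layer)            # -1 after this commit (was -4)
    print(hasattr(config, 'image_fold'))  # False: the field no longer exists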
modeling_internvl_chat.py CHANGED
@@ -23,40 +23,6 @@ from .modeling_internlm2 import InternLM2ForCausalLM
 logger = logging.get_logger(__name__)


-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, C, H, W)
-        window_size (int): window size, assuming square window
-
-    Returns:
-        windows: (num_windows*B, C, window_size, window_size)
-    """
-    B, C, H, W = x.shape
-    assert H % window_size == 0 and W % window_size == 0, 'H and W must be divisible by window_size'
-
-    x = x.view(B, C, H // window_size, window_size, W // window_size, window_size)
-    windows = x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, C, window_size, window_size)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-
-    Returns:
-        x: (B, H * W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H * W, -1)
-    return x
-
-
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
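Both helpers existed only to serve `image_fold`, so they are deleted with it. For reference, a self-contained shape sketch of the partitioning step (the function body is lifted verbatim from the removed code; `torch` is assumed available):

    import torch

    def window_partition(x, window_size):
        # Split each (C, H, W) image into non-overlapping square windows.
        B, C, H, W = x.shape
        assert H % window_size == 0 and W % window_size == 0
        x = x.view(B, C, H // window_size, window_size, W // window_size, window_size)
        return x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, C, window_size, window_size)

    x = torch.randn(2, 3, 448, 448)      # B=2 images (sizes assumed for illustration)
    windows = window_partition(x, 224)   # fold factor 2 -> a 2x2 grid per image
    print(windows.shape)                 # torch.Size([8, 3, 224, 224]): 4 windows per image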
@@ -72,7 +38,6 @@ class InternVLChatModel(PreTrainedModel):
         self.template = config.template
         self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
         self.downsample_ratio = config.downsample_ratio
-        self.image_fold = config.image_fold
         self.ps_version = config.ps_version

         logger.info(f'num_image_token: {self.num_image_token}')
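The surviving `num_image_token` line fixes how many LLM tokens each image occupies. With the typical InternVL values of a 448-px input and 14-px patches (assumed here; they are not part of this diff) and the `downsample_ratio=0.5` default from the config above, the arithmetic works out to 256 tokens:

    # image_size=448 and patch_size=14 are assumed typical values, not shown in this diff.
    image_size, patch_size, downsample_ratio = 448, 14, 0.5
    num_image_token = int((image_size // patch_size) ** 2 * downsample_ratio ** 2)
    print(num_image_token)  # (448//14)**2 = 1024 ViT patches -> 256 tokens after pixel shuffle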
@@ -242,10 +207,6 @@ class InternVLChatModel(PreTrainedModel):
         return vit_embeds + noise

     def extract_feature(self, pixel_values):
-        if self.image_fold:
-            image_size = pixel_values.size(-1)  # B, C, H, W
-            pixel_values = window_partition(pixel_values, window_size=image_size // self.image_fold)  # 4B, C, H/2, W/2
-
         if self.select_layer == -1:
             vit_embeds = self.vision_model(
                 pixel_values=pixel_values,
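Only the `select_layer == -1` branch is visible in this hunk's context. Presumably (an assumption; the rest of the method is unchanged and not shown here) the non-default branch indexes into `hidden_states`, which is why moving the config default from -4 to -1 lets the vision tower skip materializing every intermediate layer:

    # Assumed shape of the dispatch; only the first branch appears in this diff.
    if self.select_layer == -1:
        vit_embeds = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=False,
            return_dict=True).last_hidden_state
    else:
        vit_embeds = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=True,
            return_dict=True).hidden_states[self.select_layer]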
@@ -261,21 +222,55 @@ class InternVLChatModel(PreTrainedModel):
         if self.training and self.neftune_alpha is not None:
             vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha)

-        if self.image_fold:
-            vit_embeds = window_reverse(vit_embeds, window_size=image_size // (self.image_fold * self.patch_size),
-                                        H=image_size // self.patch_size, W=image_size // self.patch_size)
-
-        # if torch.distributed.get_rank() == 0:
-        #     print("before pixel shuffle:", vit_embeds.shape)
         h = w = int(vit_embeds.shape[1] ** 0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
         vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
-        # if torch.distributed.get_rank() == 0:
-        #     print("after pixel shuffle:", vit_embeds.shape)
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds

+    def batch_chat(self, tokenizer, pixel_values, image_counts, questions, generation_config, history=None,
+                   return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
+        if history is not None or return_history:
+            print("Now multi-turn chat is not supported in batch_chat.")
+            raise NotImplementedError
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+        if tokenizer.convert_tokens_to_ids('<|im_end|>') != 0:
+            eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>')  # 92542, InternLM2
+        else:
+            eos_token_id = tokenizer.eos_token_id
+
+        from .conversation import get_conv_template
+
+        queries = []
+        image_bs = pixel_values.shape[0]
+        print(f'dynamic ViT batch size: {image_bs}, image_counts: {image_counts}')
+        for idx, image_count in enumerate(image_counts):
+            image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * image_count + IMG_END_TOKEN
+            question = image_token + '\n' + questions[idx]
+            template = get_conv_template(self.template)
+            template.append_message(template.roles[0], question)
+            template.append_message(template.roles[1], None)
+            query = template.get_prompt()
+            queries.append(query)
+        tokenizer.padding_side = 'left'
+        model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
+        input_ids = model_inputs['input_ids'].cuda()
+        attention_mask = model_inputs['attention_mask'].cuda()
+        generation_config['eos_token_id'] = eos_token_id
+
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
+        responses = [response.split('<|im_end|>')[0].strip() for response in responses]  # for InternLM2
+        return responses
+
     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
              IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
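With the fold/unfold branches and the commented-out debug prints removed, the pixel-shuffle path is now unconditional. A shape-level walkthrough of the surviving lines, assuming a 32x32 patch grid with C=1024 (typical values, not stated in this diff); the real `self.pixel_shuffle` body lives elsewhere in the file, so a space-to-depth stand-in is used here purely to show the shapes:

    import torch

    B, C = 2, 1024
    vit_embeds = torch.randn(B, 1024, C)     # 32*32 = 1024 ViT patch tokens per image
    h = w = int(vit_embeds.shape[1] ** 0.5)  # 32
    x = vit_embeds.reshape(B, h, w, C)
    # Stand-in for self.pixel_shuffle(x, scale_factor=0.5): merge each 2x2 block into channels.
    x = x.reshape(B, h // 2, 2, w // 2, 2, C).permute(0, 1, 3, 2, 4, 5).reshape(B, h // 2, w // 2, 4 * C)
    x = x.reshape(B, -1, x.shape[-1])
    print(x.shape)                           # torch.Size([2, 256, 4096]): 4x fewer tokens into mlp1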
 
 
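The headline addition is `batch_chat`, which answers several single-turn image questions in one left-padded `generate` call (passing `history` or `return_history` raises NotImplementedError). A usage sketch, assuming an already loaded `model` and `tokenizer` plus a `load_image` preprocessing helper, none of which are shown on this page:

    # Hedged usage sketch: model/tokenizer loading and load_image() are assumptions.
    import torch

    pixel_values1 = load_image('image1.jpg')  # e.g. (n1, 3, 448, 448) dynamic tiles
    pixel_values2 = load_image('image2.jpg')  # e.g. (n2, 3, 448, 448)
    pixel_values = torch.cat([pixel_values1, pixel_values2], dim=0).to(torch.bfloat16).cuda()
    image_counts = [pixel_values1.size(0), pixel_values2.size(0)]

    questions = ['Describe the image in detail.', 'What is unusual about this image?']
    generation_config = dict(max_new_tokens=512, do_sample=False)
    responses = model.batch_chat(tokenizer, pixel_values, image_counts,
                                 questions, generation_config)
    for question, response in zip(questions, responses):
        print(question, '->', response)

The method sets `tokenizer.padding_side = 'left'` before batching so that every prompt's final token sits directly against its generated continuation, which is what makes batched decoding with unequal prompt lengths work.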