czczup committed
Commit 12e33f0
1 Parent(s): acd8abb

Upload folder using huggingface_hub

README.md CHANGED
@@ -39,7 +39,7 @@ For lmdeploy v0.5.0, please configure the chat template config first. Create the
 ```json
 {
     "model_name":"internlm2",
-    "meta_instruction":"你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。",
+    "meta_instruction":"我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。",
     "stop_words":["<|im_start|>", "<|im_end|>"]
 }
 ```
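
For readers wiring this up, here is a minimal sketch of supplying the same system prompt to lmdeploy programmatically through `ChatTemplateConfig` instead of a JSON file. The repo id `OpenGVLab/InternVL2-2B` and the exact field names are assumptions based on lmdeploy's usual pipeline API, not something this commit adds; check the lmdeploy docs for your installed version.

```python
# Hedged sketch: Python equivalent of the chat_template.json shown above.
from lmdeploy import pipeline, ChatTemplateConfig

# Assumed field names; they mirror the JSON keys ("model_name", "meta_instruction").
chat_template_config = ChatTemplateConfig(model_name='internlm2')
chat_template_config.meta_instruction = (
    '我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。'
    '人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。'
)

# Assumed repo id; point this at wherever the checkpoint is hosted.
pipe = pipeline('OpenGVLab/InternVL2-2B', chat_template_config=chat_template_config)
print(pipe('Hello, who are you?').text)
```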
config.json CHANGED
@@ -1,6 +1,5 @@
 {
   "_commit_hash": null,
-  "_name_or_path": "/nvme/shared/InternVL2-2B",
   "architectures": [
     "InternVLChatModel"
   ],
@@ -13,7 +12,7 @@
   "dynamic_image_size": true,
   "force_image_size": 448,
   "llm_config": {
-    "_name_or_path": "./pretrained/internlm2-chat-1_8b",
+    "_name_or_path": "internlm/internlm2-chat-1_8b",
     "add_cross_attention": false,
     "architectures": [
       "InternLM2ForCausalLM"
@@ -96,7 +95,7 @@
     "tie_word_embeddings": false,
     "tokenizer_class": null,
     "top_k": 50,
-    "top_p": 1.0,
+    "top_p": null,
     "torch_dtype": "bfloat16",
     "torchscript": false,
     "transformers_version": "4.40.1",
@@ -108,96 +107,40 @@
   "max_dynamic_patch": 12,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
-  "pad2square": false,
   "ps_version": "v2",
   "select_layer": -1,
   "template": "internlm2-chat",
   "torch_dtype": "float16",
-  "transformers_version": null,
   "use_backbone_lora": 0,
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
     ],
     "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "drop_path_rate": 0.1,
+    "drop_path_rate": 0.0,
     "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "hidden_act": "gelu",
     "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 448,
     "initializer_factor": 1.0,
     "initializer_range": 0.02,
     "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
     "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "model_type": "intern_vit_6b",
-    "no_repeat_ngram_size": 0,
     "norm_type": "layer_norm",
     "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
     "num_channels": 3,
     "num_hidden_layers": 24,
-    "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
     "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
     "qk_normalization": false,
     "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
     "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
     "transformers_version": "4.40.1",
-    "typical_p": 1.0,
     "use_bfloat16": true,
     "use_flash_attn": true
   }
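
A quick way to confirm these config changes after pulling the new revision; this is an illustrative sketch (the repo id is an assumption) rather than part of the commit:

```python
# Sketch: inspect the updated nested configs with transformers.
from transformers import AutoConfig

config = AutoConfig.from_pretrained('OpenGVLab/InternVL2-2B', trust_remote_code=True)
print(config.llm_config._name_or_path)      # expected: internlm/internlm2-chat-1_8b
print(config.llm_config.top_p)              # expected: None (was 1.0)
print(config.vision_config.drop_path_rate)  # expected: 0.0 (no stochastic depth at inference)
print(hasattr(config, 'pad2square'))        # expected: False after this commit
```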
configuration_internvl_chat.py CHANGED
@@ -26,7 +26,6 @@ class InternVLChatConfig(PretrainedConfig):
             llm_config=None,
             use_backbone_lora=0,
             use_llm_lora=0,
-            pad2square=False,
             select_layer=-1,
             force_image_size=None,
             downsample_ratio=0.5,
@@ -56,7 +55,6 @@
             raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
-        self.pad2square = pad2square
         self.select_layer = select_layer
         self.force_image_size = force_image_size
         self.downsample_ratio = downsample_ratio
@@ -85,7 +83,6 @@
         output['model_type'] = self.__class__.model_type
         output['use_backbone_lora'] = self.use_backbone_lora
         output['use_llm_lora'] = self.use_llm_lora
-        output['pad2square'] = self.pad2square
         output['select_layer'] = self.select_layer
         output['force_image_size'] = self.force_image_size
         output['downsample_ratio'] = self.downsample_ratio
conversation.py CHANGED
@@ -2,7 +2,7 @@
 Conversation prompt templates.

 We kindly request that you import fastchat instead of copying this file if you wish to use it.
-If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
+If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
 """

 import dataclasses
@@ -330,10 +330,13 @@ def get_conv_template(name: str) -> Conversation:
     return conv_templates[name].copy()


+# Note that for inference, using the Hermes-2 and internlm2-chat templates is equivalent.
 register_conv_template(
     Conversation(
         name='Hermes-2',
         system_template='<|im_start|>system\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
+        # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
@@ -343,7 +346,7 @@ register_conv_template(
             6,
             7,
             8,
-        ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
+        ],
         stop_str='<|endoftext|>',
     )
 )
@@ -353,6 +356,8 @@ register_conv_template(
     Conversation(
         name='internlm2-chat',
         system_template='<|im_start|>system\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
+        # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
@@ -370,6 +375,8 @@ register_conv_template(
     Conversation(
         name='phi3-chat',
         system_template='<|system|>\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
+        # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。人工智能实验室致力于原始技术创新,开源开放,共享共创,推动科技进步和产业发展。',
         system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
         roles=('<|user|>\n', '<|assistant|>\n'),
         sep_style=SeparatorStyle.MPT,
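
To see what these templates produce at inference time, here is a small usage sketch mirroring how modeling_internvl_chat.py builds prompts; the import path assumes you are working inside this repo's directory, otherwise adjust it:

```python
# Sketch: build an internlm2-chat prompt the same way chat()/batch_chat() do.
from conversation import get_conv_template  # inside the package it is `from .conversation import ...`

template = get_conv_template('internlm2-chat')
template.append_message(template.roles[0], '<image>\nDescribe this image in detail.')
template.append_message(template.roles[1], None)  # leave the assistant turn open for generation
prompt = template.get_prompt()

# The prompt starts with the system block defined above:
# <|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,...<|im_end|>\n<|im_start|>user\n...
print(prompt)
```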
modeling_internlm2.py CHANGED
@@ -709,6 +709,7 @@ class InternLM2PreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ['InternLM2DecoderLayer']
     _skip_keys_device_placement = 'past_key_values'
+    _supports_flash_attn_2 = True

     def _init_weights(self, module):
         std = self.config.initializer_range
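
Setting `_supports_flash_attn_2 = True` lets Transformers accept the FlashAttention-2 code path for the InternLM2 language model. A hedged loading sketch follows; the repo id and the `use_flash_attn` kwarg follow InternVL's usual model-card recipe and are assumptions here, not part of this commit:

```python
# Sketch: load the chat model with FlashAttention enabled (requires flash-attn installed).
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    'OpenGVLab/InternVL2-2B',      # assumed repo id
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,           # forwarded to the custom InternVL/InternLM2 code
    trust_remote_code=True,
).eval().cuda()
```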
modeling_internvl_chat.py CHANGED
@@ -7,6 +7,7 @@ import warnings
 from typing import Any, List, Optional, Tuple, Union

 import torch.utils.checkpoint
+import transformers
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
@@ -23,6 +24,14 @@ from .modeling_internlm2 import InternLM2ForCausalLM
 logger = logging.get_logger(__name__)


+def version_cmp(v1, v2, op='eq'):
+    import operator
+
+    from packaging import version
+    op_func = getattr(operator, op)
+    return op_func(version.parse(v1), version.parse(v2))
+
+
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
@@ -31,6 +40,7 @@ class InternVLChatModel(PreTrainedModel):
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)

+        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
@@ -183,36 +193,44 @@
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds

-    def batch_chat(self, tokenizer, pixel_values, num_patches_list, questions, generation_config, history=None,
-                   return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
-                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False):
+    def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                   history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
         if history is not None or return_history:
             print('Now multi-turn chat is not supported in batch_chat.')
             raise NotImplementedError
+
+        if image_counts is not None:
+            num_patches_list = image_counts
+            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
+
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id

-        from .conversation import get_conv_template
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')

         queries = []
-        if verbose:
-            image_bs = pixel_values.shape[0]
-            print(f'dynamic ViT batch size: {image_bs}, num_patches_list: {num_patches_list}')
         for idx, num_patches in enumerate(num_patches_list):
-            image_token = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
-            question = image_token + '\n' + questions[idx]
+            question = questions[idx]
+            if pixel_values is not None and '<image>' not in question:
+                question = '<image>\n' + question
             template = get_conv_template(self.template)
             template.append_message(template.roles[0], question)
             template.append_message(template.roles[1], None)
             query = template.get_prompt()
+
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
             queries.append(query)
+
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].cuda()
         attention_mask = model_inputs['attention_mask'].cuda()
         eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
-
         generation_output = self.generate(
             pixel_values=pixel_values,
             input_ids=input_ids,
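
Finally, a hedged sketch of calling the updated `batch_chat` signature, where `num_patches_list` is now an optional keyword argument and `image_counts` is kept only as a deprecated alias. The repo id is an assumption, and the random tensors stand in for the model card's real image preprocessing (448x448 tiles, ImageNet-normalized):

```python
# Sketch: batched inference with the new batch_chat signature.
import torch
from transformers import AutoModel, AutoTokenizer

path = 'OpenGVLab/InternVL2-2B'  # assumed repo id
model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16,
                                  trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Dummy pixel values standing in for two images split into 7 and 5 tiles.
pixel_values_1 = torch.randn(7, 3, 448, 448, dtype=torch.bfloat16).cuda()
pixel_values_2 = torch.randn(5, 3, 448, 448, dtype=torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values_1, pixel_values_2), dim=0)
num_patches_list = [pixel_values_1.size(0), pixel_values_2.size(0)]

questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
                             questions=questions,
                             generation_config=dict(max_new_tokens=256, do_sample=False),
                             num_patches_list=num_patches_list)
for question, response in zip(questions, responses):
    print(question, '->', response)
```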