Update model code & README.md
- config.json +4 -63
- configuration_intern_vit.py +1 -0
- configuration_internvl_chat.py +5 -5
- conversation.py +15 -17
- generation_config.json +5 -1
- modeling_intern_vit.py +1 -0
- modeling_internvl_chat.py +14 -14
- preprocessor_config.json +19 -0
- tokenization_internlm2_fast.py +211 -0
config.json
CHANGED
@@ -1,6 +1,5 @@
 {
   "_commit_hash": null,
-  "_name_or_path": "ckpt/OpenGVLab/InternVL2-8B",
   "architectures": [
     "InternVLChatModel"
   ],
@@ -12,9 +11,8 @@
   "downsample_ratio": 0.5,
   "dynamic_image_size": true,
   "force_image_size": 448,
-  "hidden_size": 4096,
   "llm_config": {
-    "_name_or_path": "
+    "_name_or_path": "internlm/internlm2_5-7b-chat",
     "add_cross_attention": false,
     "architectures": [
       "InternLM2ForCausalLM"
@@ -94,109 +92,52 @@
     "tie_word_embeddings": false,
     "tokenizer_class": null,
     "top_k": 50,
-    "top_p":
+    "top_p": 1.0,
     "torch_dtype": "bfloat16",
     "torchscript": false,
     "transformers_version": "4.37.2",
     "typical_p": 1.0,
     "use_bfloat16": true,
-    "use_cache":
+    "use_cache": true,
     "vocab_size": 92553
   },
   "max_dynamic_patch": 12,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
-  "pad2square": false,
   "ps_version": "v2",
   "select_layer": -1,
   "template": "internlm2-chat",
-  "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": null,
   "use_backbone_lora": 0,
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
     ],
     "attention_dropout": 0.0,
-    "
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "drop_path_rate": 0.1,
+    "drop_path_rate": 0.0,
     "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
     "hidden_act": "gelu",
     "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
     "image_size": 448,
     "initializer_factor": 1.0,
     "initializer_range": 0.02,
     "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
     "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "model_type": "intern_vit_6b",
-    "no_repeat_ngram_size": 0,
     "norm_type": "layer_norm",
     "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
     "num_channels": 3,
     "num_hidden_layers": 24,
-    "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
     "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
     "qk_normalization": false,
     "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
     "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": null,
     "torch_dtype": "bfloat16",
-    "torchscript": false,
     "transformers_version": "4.37.2",
-    "typical_p": 1.0,
     "use_bfloat16": true,
     "use_flash_attn": true
   }
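The cleanup above drops the serialized `generate()` defaults from `llm_config` and `vision_config`; the nested configs themselves are unchanged. A quick check, as a sketch (the checkpoint id is illustrative, and it assumes the repo's `auto_map` wiring, which is not shown in these hunks, so `trust_remote_code=True` resolves the custom `InternVLChatConfig` class):

```python
# Sketch: confirm the nested configs still resolve after the cleanup.
from transformers import AutoConfig

config = AutoConfig.from_pretrained('OpenGVLab/InternVL2-8B', trust_remote_code=True)
print(config.llm_config.architectures)     # ['InternLM2ForCausalLM']
print(config.vision_config.architectures)  # ['InternVisionModel']
print(config.llm_config.use_cache)         # True (now serialized explicitly)
```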
configuration_intern_vit.py
CHANGED
@@ -3,6 +3,7 @@
 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
+
 import os
 from typing import Union
 
configuration_internvl_chat.py
CHANGED
@@ -39,20 +39,20 @@ class InternVLChatConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
         if vision_config is None:
-            vision_config = {}
+            vision_config = {'architectures': ['InternVisionModel']}
             logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
 
         if llm_config is None:
-            llm_config = {}
+            llm_config = {'architectures': ['InternLM2ForCausalLM']}
             logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
 
         self.vision_config = InternVisionConfig(**vision_config)
-        if llm_config
+        if llm_config.get('architectures')[0] == 'LlamaForCausalLM':
             self.llm_config = LlamaConfig(**llm_config)
-        elif llm_config
+        elif llm_config.get('architectures')[0] == 'InternLM2ForCausalLM':
             self.llm_config = InternLM2Config(**llm_config)
         else:
-            raise ValueError('Unsupported architecture: {}'.format(llm_config
+            raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0]))
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.select_layer = select_layer
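The practical effect of the new defaults: an `InternVLChatConfig` created without explicit sub-configs now carries an `architectures` entry, so the dispatch above resolves to `InternLM2Config` instead of failing on a missing key. A standalone sketch of that dispatch (the return values stand in for the config classes imported by `configuration_internvl_chat.py`):

```python
# Standalone sketch of the architecture-based dispatch above.
def pick_llm_config(llm_config=None):
    if llm_config is None:
        llm_config = {'architectures': ['InternLM2ForCausalLM']}  # new default
    arch = llm_config.get('architectures')[0]
    if arch == 'LlamaForCausalLM':
        return 'LlamaConfig'
    elif arch == 'InternLM2ForCausalLM':
        return 'InternLM2Config'
    raise ValueError('Unsupported architecture: {}'.format(arch))

print(pick_llm_config())                                         # InternLM2Config
print(pick_llm_config({'architectures': ['LlamaForCausalLM']}))  # LlamaConfig
```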
conversation.py
CHANGED
@@ -3,11 +3,13 @@ Conversation prompt templates.
 
 We kindly request that you import fastchat instead of copying this file if you wish to use it.
 If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
+
+Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 """
 
 import dataclasses
 from enum import IntEnum, auto
-from typing import
+from typing import Dict, List, Tuple, Union
 
 
 class SeparatorStyle(IntEnum):
@@ -344,12 +346,6 @@ register_conv_template(
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
         sep='<|im_end|>',
-        stop_token_ids=[
-            2,
-            6,
-            7,
-            8,
-        ],
         stop_str='<|endoftext|>',
     )
 )
@@ -365,11 +361,6 @@ register_conv_template(
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
         sep='<|im_end|>',
-        stop_token_ids=[
-            2,
-            92543,
-            92542
-        ]
     )
 )
 
@@ -384,10 +375,17 @@ register_conv_template(
         roles=('<|user|>\n', '<|assistant|>\n'),
         sep_style=SeparatorStyle.MPT,
         sep='<|end|>',
-
-
-
-
-
+    )
+)
+
+
+register_conv_template(
+    Conversation(
+        name='internvl2_5',
+        system_template='<|im_start|>system\n{system_message}',
+        system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
+        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
+        sep_style=SeparatorStyle.MPT,
+        sep='<|im_end|>\n',
     )
 )
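The newly registered `internvl2_5` template is consumed through `get_conv_template`, the same way the existing templates are. A usage sketch (assumes `conversation.py` is importable from the working directory; `append_message`/`get_prompt` come from the FastChat-style `Conversation` class in that file, and the exact rendered prompt depends on its implementation):

```python
# Sketch: build an InternVL2.5-style prompt from the new template.
from conversation import get_conv_template

template = get_conv_template('internvl2_5')
template.append_message(template.roles[0], 'Describe this image.\n<image>')
template.append_message(template.roles[1], None)
print(template.get_prompt())
# <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n
```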
generation_config.json
CHANGED
@@ -1,4 +1,8 @@
 {
   "_from_model_config": true,
-  "transformers_version": "4.37.2"
+  "transformers_version": "4.37.2",
+  "eos_token_id": [
+    92542,
+    92543
+  ]
 }
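With a list-valued `eos_token_id`, `generate()` stops as soon as either token is produced. A small sketch of how the file is interpreted (that 92542/92543 correspond to `<|im_end|>`/`<|im_start|>` in the InternLM2 vocabulary is an assumption, not stated in the diff):

```python
# Sketch: round-trip the new generation_config.json through transformers.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_dict({
    '_from_model_config': True,
    'transformers_version': '4.37.2',
    'eos_token_id': [92542, 92543],  # assumed: <|im_end|> and <|im_start|>
})
print(gen_cfg.eos_token_id)  # [92542, 92543] -> generation halts on either id
```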
modeling_intern_vit.py
CHANGED
@@ -3,6 +3,7 @@
 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
+
 from typing import Optional, Tuple, Union
 
 import torch
modeling_internvl_chat.py
CHANGED
@@ -3,8 +3,9 @@
 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
+
 import warnings
-from typing import
+from typing import List, Optional, Tuple, Union
 
 import torch.utils.checkpoint
 import transformers
@@ -35,13 +36,14 @@ def version_cmp(v1, v2, op='eq'):
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
+    base_model_prefix = 'language_model'
     _supports_flash_attn_2 = True
     _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer']
 
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
         super().__init__(config)
 
-        assert version_cmp(transformers.__version__, '4.
+        assert version_cmp(transformers.__version__, '4.37.0', 'ge')
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
@@ -101,7 +103,7 @@ class InternVLChatModel(PreTrainedModel):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         image_flags = image_flags.squeeze(-1)
-        input_embeds = self.language_model.get_input_embeddings()(input_ids)
+        input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
 
         vit_embeds = self.extract_feature(pixel_values)
         vit_embeds = vit_embeds[image_flags == 1]
@@ -110,7 +112,7 @@ class InternVLChatModel(PreTrainedModel):
         B, N, C = input_embeds.shape
         input_embeds = input_embeds.reshape(B * N, C)
 
-        if torch.distributed.get_rank() == 0:
+        if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
             print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
 
         input_ids = input_ids.reshape(B * N)
@@ -234,9 +236,9 @@ class InternVLChatModel(PreTrainedModel):
 
         tokenizer.padding_side = 'left'
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
-        input_ids = model_inputs['input_ids'].
-        attention_mask = model_inputs['attention_mask'].
-        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
             pixel_values=pixel_values,
@@ -245,7 +247,7 @@ class InternVLChatModel(PreTrainedModel):
             **generation_config
         )
         responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
-        responses = [response.split(template.sep)[0].strip() for response in responses]
+        responses = [response.split(template.sep.strip())[0].strip() for response in responses]
         return responses
 
     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
@@ -264,7 +266,7 @@ class InternVLChatModel(PreTrainedModel):
 
         template = get_conv_template(self.template)
         template.system_message = self.system_message
-        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
 
         history = [] if history is None else history
         for (old_question, old_answer) in history:
@@ -283,8 +285,8 @@ class InternVLChatModel(PreTrainedModel):
         query = query.replace('<image>', image_tokens, 1)
 
         model_inputs = tokenizer(query, return_tensors='pt')
-        input_ids = model_inputs['input_ids'].
-        attention_mask = model_inputs['attention_mask'].
+        input_ids = model_inputs['input_ids'].to(self.device)
+        attention_mask = model_inputs['attention_mask'].to(self.device)
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
             pixel_values=pixel_values,
@@ -293,7 +295,7 @@ class InternVLChatModel(PreTrainedModel):
             **generation_config
         )
         response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
-        response = response.split(template.sep)[0].strip()
+        response = response.split(template.sep.strip())[0].strip()
         history.append((question, response))
         if return_history:
             return response, history
@@ -313,7 +315,6 @@ class InternVLChatModel(PreTrainedModel):
         visual_features: Optional[torch.FloatTensor] = None,
         generation_config: Optional[GenerationConfig] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
         **generate_kwargs,
     ) -> torch.LongTensor:
 
@@ -341,7 +342,6 @@ class InternVLChatModel(PreTrainedModel):
             attention_mask=attention_mask,
             generation_config=generation_config,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
             use_cache=True,
             **generate_kwargs,
         )
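The `.to(self.device)` and `template.sep.strip()` changes mean `chat()` no longer assumes CUDA-only tensors or a separator without a trailing newline. A usage sketch in the style of the model card (the checkpoint id, dtype, and text-only call are illustrative assumptions, not part of this commit):

```python
import torch
from transformers import AutoModel, AutoTokenizer

path = 'OpenGVLab/InternVL2-8B'  # illustrative checkpoint id
model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16,
                                  trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Text-only turn; pixel_values=None exercises the pure-LLM path of chat().
generation_config = dict(max_new_tokens=64, do_sample=False)
response = model.chat(tokenizer, None, 'Hello, who are you?', generation_config)
print(response)
```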
preprocessor_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "crop_size": 448,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 3,
+  "size": 448
+}
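These values encode the standard ImageNet normalization with bicubic resizing to 448×448 (`"resample": 3` is PIL's BICUBIC). An equivalent torchvision pipeline, as a sketch (the model card's own preprocessing additionally builds dynamic tiles on top of this):

```python
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

transform = T.Compose([
    T.Resize((448, 448), interpolation=InterpolationMode.BICUBIC),       # "size": 448, "resample": 3
    T.ToTensor(),                                                         # HWC uint8 -> CHW float in [0, 1]
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),   # ImageNet statistics
])
```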
tokenization_internlm2_fast.py
ADDED
@@ -0,0 +1,211 @@
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization Fast class for InternLM."""
+import os
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple
+
+from tokenizers import Tokenizer, decoders, normalizers, processors
+from tokenizers.models import BPE
+from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
+                                                 SentencePieceExtractor,
+                                                 SpmConverter)
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+
+from .tokenization_internlm2 import InternLM2Tokenizer
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
+
+
+# Modified from transformers.convert_slow_tokenizer.LlamaConverter
+class InternLM2Converter(SpmConverter):
+    handle_byte_fallback = True
+
+    def vocab(self, proto):
+        vocab = [
+            ('<unk>', 0.0),
+            ('<s>', 0.0),
+            ('</s>', 0.0),
+        ]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
+        return vocab
+
+    def unk_id(self, proto):
+        unk_id = 0
+        return unk_id
+
+    def decoder(self, replacement, add_prefix_space):
+        return decoders.Sequence(
+            [
+                decoders.Replace('▁', ' '),
+                decoders.ByteFallback(),
+                decoders.Fuse(),
+                decoders.Strip(content=' ', left=1),
+            ]
+        )
+
+    def tokenizer(self, proto):
+        model_type = proto.trainer_spec.model_type
+        vocab_scores = self.vocab(proto)
+        # special tokens
+        added_tokens = self.original_tokenizer.added_tokens_decoder
+        for i in range(len(vocab_scores)):
+            piece, score = vocab_scores[i]
+            if i in added_tokens:
+                vocab_scores[i] = (added_tokens[i].content, score)
+        if model_type == 1:
+            raise RuntimeError('InternLM2 is supposed to be a BPE model!')
+
+        elif model_type == 2:
+            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
+            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
+            tokenizer = Tokenizer(
+                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
+            )
+            tokenizer.add_special_tokens(
+                [added_token for index, added_token in added_tokens.items()]
+            )
+        else:
+            raise Exception(
+                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
+            )
+
+        return tokenizer
+
+    def normalizer(self, proto):
+        normalizers_list = []
+        if proto.normalizer_spec.add_dummy_prefix:
+            normalizers_list.append(normalizers.Prepend(prepend='▁'))
+        normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
+        return normalizers.Sequence(normalizers_list)
+
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        return None
+
+
+SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
+
+
+# Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = InternLM2Tokenizer
+    padding_side = 'left'
+    model_input_names = ['input_ids', 'attention_mask']
+    _auto_class = 'AutoTokenizer'
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token='<unk>',
+        bos_token='<s>',
+        eos_token='</s>',
+        pad_token='</s>',
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            sp_model_kwargs=sp_model_kwargs,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            decode_with_prefix_space=decode_with_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()
+        self.vocab_file = vocab_file
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError('add_bos_token = True but bos_token = None')
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError('add_eos_token = True but eos_token = None')
+
+        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
+
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    @property
+    def add_eos_token(self):
+        return self._add_eos_token
+
+    @property
+    def add_bos_token(self):
+        return self._add_bos_token
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+                'tokenizer.'
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
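Adding this file lets the repo serve a fast (Rust-backed) tokenizer converted from the SentencePiece model. A loading sketch (assumes the repo's `tokenizer_config.json` maps `AutoTokenizer` to `InternLM2TokenizerFast` via `auto_map`, which is not shown in this commit; the checkpoint id is illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/InternVL2-8B',
                                          trust_remote_code=True, use_fast=True)
print(tokenizer.is_fast)  # True once the fast class is picked up
ids = tokenizer('<|im_start|>user\nHello<|im_end|>').input_ids
print(tokenizer.convert_ids_to_tokens(ids))
```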