Transformers
Safetensors
English
V2PE
Inference Endpoints
Weiyun1025 committed on
Commit
81d64a9
1 Parent(s): 44523f1

Upload folder using huggingface_hub

V2PE-256K/added_tokens.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "</box>": 92552,
+   "</img>": 92545,
+   "</quad>": 92548,
+   "</ref>": 92550,
+   "<IMG_CONTEXT>": 92546,
+   "<box>": 92551,
+   "<img>": 92544,
+   "<quad>": 92547,
+   "<ref>": 92549
+ }
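
added_tokens.json extends the base InternLM2 vocabulary with the image and region markers used by InternVL-style models: `<img>`/`</img>` delimit image features, `<IMG_CONTEXT>` is the placeholder that gets replaced by visual embeddings, and `<box>`/`<ref>`/`<quad>` mark grounding spans. A minimal sketch of checking the mapping after loading the tokenizer; the local path is an assumption for wherever this folder is checked out:

from transformers import AutoTokenizer

# Assumed local path to this folder; adjust as needed.
tokenizer = AutoTokenizer.from_pretrained('./V2PE-256K', trust_remote_code=True)

# These should resolve to the ids listed in added_tokens.json, e.g. 92546 for <IMG_CONTEXT>.
print(tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>'))
print(tokenizer.convert_tokens_to_ids(['<img>', '</img>', '<box>', '</box>']))
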
V2PE-256K/config.json ADDED
@@ -0,0 +1,213 @@
+ {
+   "_commit_hash": null,
+   "_name_or_path": "/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/work_dirs/internvl_chat_v1_5_internlm2_2b_dynamic_res_baseline_lr_2e-6_4gpu_newposidNone_v6_GPR1200/checkpoint-100",
+   "architectures": [
+     "InternVLChatModel"
+   ],
+   "attn_type": null,
+   "auto_map": {
+     "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+     "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+     "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+   },
+   "chunk_num": 1,
+   "compress_seq": false,
+   "downsample_ratio": 0.5,
+   "dynamic_image_size": true,
+   "dynamic_max_patch": false,
+   "force_image_size": 448,
+   "group_list": null,
+   "img_emb_down_sample_ratio": null,
+   "interaction": true,
+   "llm_config": {
+     "_name_or_path": "internlm/internlm2-chat-1_8b",
+     "add_cross_attention": false,
+     "architectures": [
+       "InternLM2ForCausalLM"
+     ],
+     "attn_implementation": "flash_attention_2",
+     "auto_map": {
+       "AutoConfig": "configuration_internlm2.InternLM2Config",
+       "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+       "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+     },
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bias": false,
+     "bos_token_id": 1,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "silu",
+     "hidden_size": 2048,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "intermediate_size": 8192,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 32768,
+     "min_length": 0,
+     "model_type": "internlm2",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 24,
+     "num_key_value_heads": 8,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 2,
+     "posid_type": "None",
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rms_norm_eps": 1e-05,
+     "rope_pos_id_version": "v6",
+     "rope_scaling": {
+       "factor": 1.0,
+       "type": "new"
+     },
+     "rope_theta": 1000000,
+     "scale_img": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": false,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "transformers_version": "4.44.0",
+     "typical_p": 1.0,
+     "use_bfloat16": true,
+     "use_cache": false,
+     "vocab_size": 92553
+   },
+   "max_dynamic_patch": 5,
+   "min_dynamic_patch": 1,
+   "model_type": "internvl_chat",
+   "pad2square": false,
+   "posid_type": "None",
+   "ps_version": "v2",
+   "rope_pos_id_stride": 64,
+   "rope_pos_id_version": "v6",
+   "select_layer": -1,
+   "template": "internlm2-chat",
+   "torch_dtype": "bfloat16",
+   "transformers_version": null,
+   "use_backbone_lora": 0,
+   "use_llm_lora": 0,
+   "use_thumbnail": true,
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": [
+       "InternVisionModel"
+     ],
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "drop_path_rate": 0.1,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_size": 1024,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 448,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-06,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "intern_vit_6b",
+     "no_repeat_ngram_size": 0,
+     "norm_type": "layer_norm",
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 24,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "qk_normalization": false,
+     "qkv_bias": true,
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "bfloat16",
+     "torchscript": false,
+     "transformers_version": "4.44.0",
+     "typical_p": 1.0,
+     "use_bfloat16": true,
+     "use_flash_attn": true
+   }
+ }
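
The `auto_map` entries point `AutoConfig`/`AutoModel` at the custom `configuration_internvl_chat.py` and `modeling_internvl_chat.py` files shipped in this folder, so the checkpoint is meant to be loaded with `trust_remote_code=True`. A minimal loading sketch, assuming a local checkout of this folder; the dtype follows the `"torch_dtype": "bfloat16"` entry above:

import torch
from transformers import AutoConfig, AutoModel

path = './V2PE-256K'  # assumed local checkout of this folder

config = AutoConfig.from_pretrained(path, trust_remote_code=True)
print(config.model_type)             # internvl_chat
print(config.llm_config.model_type)  # internlm2
print(config.rope_pos_id_version, config.rope_pos_id_stride)  # v6 64

model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()
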
V2PE-256K/configuration_intern_vit.py ADDED
@@ -0,0 +1,119 @@
+ # --------------------------------------------------------
+ # InternVL
+ # Copyright (c) 2023 OpenGVLab
+ # Licensed under The MIT License [see LICENSE for details]
+ # --------------------------------------------------------
+ import os
+ from typing import Union
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class InternVisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+     instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         num_channels (`int`, *optional*, defaults to 3):
+             Number of color channels in the input images (e.g., 3 for RGB).
+         patch_size (`int`, *optional*, defaults to 14):
+             The size (resolution) of each patch.
+         image_size (`int`, *optional*, defaults to 224):
+             The size (resolution) of each image.
+         qkv_bias (`bool`, *optional*, defaults to `False`):
+             Whether to add a bias to the queries and values in the self-attention layers.
+         hidden_size (`int`, *optional*, defaults to 3200):
+             Dimensionality of the encoder layers and the pooler layer.
+         num_attention_heads (`int`, *optional*, defaults to 25):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         intermediate_size (`int`, *optional*, defaults to 12800):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         qk_normalization (`bool`, *optional*, defaults to `True`):
+             Whether to normalize the queries and keys in the self-attention layers.
+         num_hidden_layers (`int`, *optional*, defaults to 48):
+             Number of hidden layers in the Transformer encoder.
+         use_flash_attn (`bool`, *optional*, defaults to `True`):
+             Whether to use flash attention mechanism.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the layer normalization layers.
+         dropout (`float`, *optional*, defaults to 0.0):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         drop_path_rate (`float`, *optional*, defaults to 0.0):
+             Dropout rate for stochastic depth.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         initializer_factor (`float`, *optional*, defaults to 0.1):
+             A factor for layer scale.
+     """
+
+     model_type = 'intern_vit_6b'
+
+     def __init__(
+             self,
+             num_channels=3,
+             patch_size=14,
+             image_size=224,
+             qkv_bias=False,
+             hidden_size=3200,
+             num_attention_heads=25,
+             intermediate_size=12800,
+             qk_normalization=True,
+             num_hidden_layers=48,
+             use_flash_attn=True,
+             hidden_act='gelu',
+             norm_type='rms_norm',
+             layer_norm_eps=1e-6,
+             dropout=0.0,
+             drop_path_rate=0.0,
+             attention_dropout=0.0,
+             initializer_range=0.02,
+             initializer_factor=0.1,
+             **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.dropout = dropout
+         self.drop_path_rate = drop_path_rate
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.initializer_range = initializer_range
+         self.initializer_factor = initializer_factor
+         self.attention_dropout = attention_dropout
+         self.layer_norm_eps = layer_norm_eps
+         self.hidden_act = hidden_act
+         self.norm_type = norm_type
+         self.qkv_bias = qkv_bias
+         self.qk_normalization = qk_normalization
+         self.use_flash_attn = use_flash_attn
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
+         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+         if 'vision_config' in config_dict:
+             config_dict = config_dict['vision_config']
+
+         if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+             )
+
+         return cls.from_dict(config_dict, **kwargs)
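
For reference, the vision tower in this checkpoint overrides several of the class defaults above (compare the `vision_config` block in config.json: a 24-layer, 1024-dim ViT at 448x448 resolution with layer norm instead of RMS norm). A small sketch of building the same config directly; the bare import assumes this file is on the Python path:

from configuration_intern_vit import InternVisionConfig

vision_config = InternVisionConfig(
    hidden_size=1024,
    intermediate_size=4096,
    num_hidden_layers=24,
    num_attention_heads=16,
    image_size=448,
    patch_size=14,
    qkv_bias=True,
    qk_normalization=False,
    norm_type='layer_norm',
    drop_path_rate=0.1,
)
print(vision_config.model_type)  # intern_vit_6b
print(vision_config.image_size)  # 448
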
V2PE-256K/configuration_internlm2.py ADDED
@@ -0,0 +1,156 @@
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ InternLM2 model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ # Modified from transformers.model.llama.configuration_llama.LlamaConfig
+ class InternLM2Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
+     an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`InternLM2Model`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details checkout [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 512 or 1024 or 2048).
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings
+     Example:
+
+     """
+     model_type = 'internlm2'
+     _auto_class = 'AutoConfig'
+
+     def __init__(  # pylint: disable=W0102
+             self,
+             vocab_size=103168,
+             hidden_size=4096,
+             intermediate_size=11008,
+             num_hidden_layers=32,
+             num_attention_heads=32,
+             num_key_value_heads=None,
+             hidden_act='silu',
+             max_position_embeddings=2048,
+             initializer_range=0.02,
+             rms_norm_eps=1e-6,
+             use_cache=True,
+             pad_token_id=0,
+             bos_token_id=1,
+             eos_token_id=2,
+             tie_word_embeddings=False,
+             bias=True,
+             rope_theta=10000,
+             rope_scaling=None,
+             scale_img=False,
+             attn_implementation='eager',
+             **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.bias = bias
+
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.scale_img = scale_img
+         self._rope_scaling_validation()
+         if 'posid_type' in kwargs:
+             self.posid_type = kwargs['posid_type']
+         else:
+             self.posid_type = None
+
+         self.attn_implementation = attn_implementation
+         if self.attn_implementation is None:
+             self.attn_implementation = 'eager'
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+             raise ValueError(
+                 '`rope_scaling` must be a dictionary with two fields, `type` and `factor`, '
+                 f'got {self.rope_scaling}'
+             )
+         rope_scaling_type = self.rope_scaling.get('type', None)
+         rope_scaling_factor = self.rope_scaling.get('factor', None)
+         if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic', 'new']:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'new'], got {rope_scaling_type}"
+             )
+         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0:
+             raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}")
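
Note that `_rope_scaling_validation` runs inside `__init__` and accepts `linear`, `dynamic`, and the custom `new` type used by this checkpoint (`"rope_scaling": {"factor": 1.0, "type": "new"}` in config.json). A quick sketch mirroring the `llm_config` block above; the bare import assumes the file is on the Python path:

from configuration_internlm2 import InternLM2Config

# Mirrors the llm_config block of this checkpoint.
cfg = InternLM2Config(
    vocab_size=92553,
    hidden_size=2048,
    intermediate_size=8192,
    num_hidden_layers=24,
    num_attention_heads=16,
    num_key_value_heads=8,
    rope_theta=1000000,
    rope_scaling={'factor': 1.0, 'type': 'new'},
    attn_implementation='flash_attention_2',
)
print(cfg.num_key_value_heads)  # 8 -> grouped-query attention

# An unsupported scaling type is rejected during construction.
try:
    InternLM2Config(rope_scaling={'factor': 2.0, 'type': 'yarn'})
except ValueError as err:
    print(err)
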
V2PE-256K/configuration_internvl_chat.py ADDED
@@ -0,0 +1,120 @@
+ # --------------------------------------------------------
+ # InternVL
+ # Copyright (c) 2023 OpenGVLab
+ # Licensed under The MIT License [see LICENSE for details]
+ # --------------------------------------------------------
+
+ import copy
+
+ from internvl.model.internlm2.configuration_internlm2 import InternLM2Config
+ from internvl.model.phi3.configuration_phi3 import Phi3Config
+ from transformers import AutoConfig, LlamaConfig, Qwen2Config
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ from .configuration_intern_vit import InternVisionConfig
+
+ logger = logging.get_logger(__name__)
+
+
+ class InternVLChatConfig(PretrainedConfig):
+     model_type = 'internvl_chat'
+     is_composition = True
+
+     def __init__(
+             self,
+             vision_config=None,
+             llm_config=None,
+             use_backbone_lora=0,
+             use_llm_lora=0,
+             pad2square=False,
+             select_layer=-1,
+             force_image_size=None,
+             downsample_ratio=0.5,
+             template=None,
+             dynamic_image_size=False,
+             use_thumbnail=False,
+             ps_version='v1',
+             min_dynamic_patch=1,
+             max_dynamic_patch=6,
+             compress_seq=False,
+             attn_type=None,
+             posid_type=None,
+             group_list=None,
+             chunk_num=1,
+             interaction=True,
+             rope_pos_id_version='default',
+             rope_pos_id_stride=None,
+             **kwargs):
+         super().__init__(**kwargs)
+
+         if vision_config is None:
+             vision_config = {}
+             logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
+
+         if llm_config is None:
+             llm_config = {}
+             logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
+
+         self.vision_config = InternVisionConfig(**vision_config)
+         if llm_config['architectures'][0] == 'LlamaForCausalLM':
+             self.llm_config = LlamaConfig(**llm_config)
+         elif llm_config['architectures'][0] == 'InternLM2ForCausalLM':
+             self.llm_config = InternLM2Config(**llm_config)
+         elif llm_config['architectures'][0] == 'Phi3ForCausalLM':
+             self.llm_config = Phi3Config(**llm_config)
+         elif llm_config['architectures'][0] == 'Qwen2ForCausalLM':
+             self.llm_config = Qwen2Config(**llm_config)
+         else:
+             raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
+
+         self.use_backbone_lora = use_backbone_lora
+         self.use_llm_lora = use_llm_lora
+         self.pad2square = pad2square
+         self.select_layer = select_layer
+         self.force_image_size = force_image_size
+         self.downsample_ratio = downsample_ratio
+         self.template = template
+         self.dynamic_image_size = dynamic_image_size
+         self.use_thumbnail = use_thumbnail
+         self.ps_version = ps_version  # pixel shuffle version
+         self.min_dynamic_patch = min_dynamic_patch
+         self.max_dynamic_patch = max_dynamic_patch
+         self.compress_seq = compress_seq
+         self.attn_type = attn_type
+         self.posid_type = posid_type
+         self.group_list = group_list
+         self.chunk_num = chunk_num
+         self.interaction = interaction
+         self.rope_pos_id_version = rope_pos_id_version
+         self.rope_pos_id_stride = rope_pos_id_stride
+         logger.info(f'vision_select_layer: {self.select_layer}')
+         logger.info(f'ps_version: {self.ps_version}')
+         logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}')
+         logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}')
+
+     def to_dict(self):
+         """
+         Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+         Returns:
+             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+         """
+         output = copy.deepcopy(self.__dict__)
+         output['vision_config'] = self.vision_config.to_dict()
+         output['llm_config'] = self.llm_config.to_dict()
+         output['model_type'] = self.__class__.model_type
+         output['use_backbone_lora'] = self.use_backbone_lora
+         output['use_llm_lora'] = self.use_llm_lora
+         output['pad2square'] = self.pad2square
+         output['select_layer'] = self.select_layer
+         output['force_image_size'] = self.force_image_size
+         output['downsample_ratio'] = self.downsample_ratio
+         output['template'] = self.template
+         output['dynamic_image_size'] = self.dynamic_image_size
+         output['use_thumbnail'] = self.use_thumbnail
+         output['ps_version'] = self.ps_version
+         output['min_dynamic_patch'] = self.min_dynamic_patch
+         output['max_dynamic_patch'] = self.max_dynamic_patch
+
+         return output
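
The `architectures` entry of the nested `llm_config` decides which language-model config class is instantiated, which is how this checkpoint ends up with an `InternLM2Config` inside an `InternVLChatConfig`. A sketch with a trimmed-down `llm_config`; it assumes the `internvl` package (needed for the `InternLM2Config`/`Phi3Config` imports above) is installed and that this file is importable:

from configuration_internvl_chat import InternVLChatConfig

chat_config = InternVLChatConfig(
    vision_config={'image_size': 448, 'patch_size': 14},
    llm_config={'architectures': ['InternLM2ForCausalLM'], 'hidden_size': 2048},
    force_image_size=448,
    dynamic_image_size=True,
    use_thumbnail=True,
    max_dynamic_patch=5,
    template='internlm2-chat',
    rope_pos_id_version='v6',
    rope_pos_id_stride=64,
)
print(type(chat_config.llm_config).__name__)  # InternLM2Config
print(chat_config.to_dict()['template'])      # internlm2-chat
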
V2PE-256K/conversation.py ADDED
@@ -0,0 +1,1368 @@
1
+ """
2
+ Conversation prompt templates.
3
+
4
+ We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
+ If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+ """
7
+
8
+ import dataclasses
9
+ from enum import IntEnum, auto
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+
13
+ class SeparatorStyle(IntEnum):
14
+ """Separator styles."""
15
+
16
+ ADD_COLON_SINGLE = auto()
17
+ ADD_COLON_TWO = auto()
18
+ ADD_COLON_SPACE_SINGLE = auto()
19
+ NO_COLON_SINGLE = auto()
20
+ NO_COLON_TWO = auto()
21
+ ADD_NEW_LINE_SINGLE = auto()
22
+ LLAMA2 = auto()
23
+ CHATGLM = auto()
24
+ CHATML = auto()
25
+ CHATINTERN = auto()
26
+ DOLLY = auto()
27
+ RWKV = auto()
28
+ PHOENIX = auto()
29
+ ROBIN = auto()
30
+ FALCON_CHAT = auto()
31
+ CHATGLM3 = auto()
32
+ INTERNVL_ZH = auto()
33
+ MPT = auto()
34
+ BASE = auto()
35
+
36
+
37
+ @dataclasses.dataclass
38
+ class Conversation:
39
+ """A class that manages prompt templates and keeps all conversation history."""
40
+
41
+ # The name of this template
42
+ name: str
43
+ # The template of the system prompt
44
+ system_template: str = '{system_message}'
45
+ # The system message
46
+ system_message: str = ''
47
+ # The names of two roles
48
+ roles: Tuple[str] = ('USER', 'ASSISTANT')
49
+ # All messages. Each item is (role, message).
50
+ messages: List[List[str]] = ()
51
+ # The number of few shot examples
52
+ offset: int = 0
53
+ # The separator style and configurations
54
+ sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
55
+ sep: str = '\n'
56
+ sep2: str = None
57
+ # Stop criteria (the default one is EOS token)
58
+ stop_str: Union[str, List[str]] = None
59
+ # Stops generation if meeting any token in this list
60
+ stop_token_ids: List[int] = None
61
+
62
+ def get_prompt(self) -> str:
63
+ """Get the prompt for generation."""
64
+ system_prompt = self.system_template.format(system_message=self.system_message)
65
+ if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
66
+ ret = system_prompt + self.sep
67
+ for role, message in self.messages:
68
+ if message:
69
+ ret += role + ': ' + message + self.sep
70
+ else:
71
+ ret += role + ':'
72
+ return ret
73
+ elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
74
+ seps = [self.sep, self.sep2]
75
+ ret = system_prompt + seps[0]
76
+ for i, (role, message) in enumerate(self.messages):
77
+ if message:
78
+ ret += role + ': ' + message + seps[i % 2]
79
+ else:
80
+ ret += role + ':'
81
+ return ret
82
+ elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
83
+ ret = system_prompt + self.sep
84
+ for role, message in self.messages:
85
+ if message:
86
+ ret += role + ': ' + message + self.sep
87
+ else:
88
+ ret += role + ': ' # must end with a space
89
+ return ret
90
+ elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
91
+ ret = '' if system_prompt == '' else system_prompt + self.sep
92
+ for role, message in self.messages:
93
+ if message:
94
+ ret += role + '\n' + message + self.sep
95
+ else:
96
+ ret += role + '\n'
97
+ return ret
98
+ elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
99
+ ret = system_prompt
100
+ for role, message in self.messages:
101
+ if message:
102
+ ret += role + message + self.sep
103
+ else:
104
+ ret += role
105
+ return ret
106
+ elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
107
+ seps = [self.sep, self.sep2]
108
+ ret = system_prompt
109
+ for i, (role, message) in enumerate(self.messages):
110
+ if message:
111
+ ret += role + message + seps[i % 2]
112
+ else:
113
+ ret += role
114
+ return ret
115
+ elif self.sep_style == SeparatorStyle.RWKV:
116
+ ret = system_prompt
117
+ for i, (role, message) in enumerate(self.messages):
118
+ if message:
119
+ ret += (
120
+ role
121
+ + ': '
122
+ + message.replace('\r\n', '\n').replace('\n\n', '\n')
123
+ )
124
+ ret += '\n\n'
125
+ else:
126
+ ret += role + ':'
127
+ return ret
128
+ elif self.sep_style == SeparatorStyle.LLAMA2:
129
+ seps = [self.sep, self.sep2]
130
+ if self.system_message:
131
+ ret = system_prompt
132
+ else:
133
+ ret = '[INST] '
134
+ for i, (role, message) in enumerate(self.messages):
135
+ tag = self.roles[i % 2]
136
+ if message:
137
+ if i == 0:
138
+ ret += message + ' '
139
+ else:
140
+ ret += tag + ' ' + message + seps[i % 2]
141
+ else:
142
+ ret += tag
143
+ return ret
144
+ elif self.sep_style == SeparatorStyle.CHATGLM:
145
+ # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
146
+ # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
147
+ round_add_n = 1 if self.name == 'chatglm2' else 0
148
+ if system_prompt:
149
+ ret = system_prompt + self.sep
150
+ else:
151
+ ret = ''
152
+
153
+ for i, (role, message) in enumerate(self.messages):
154
+ if i % 2 == 0:
155
+ ret += f'[Round {i//2 + round_add_n}]{self.sep}'
156
+
157
+ if message:
158
+ ret += f'{role}:{message}{self.sep}'
159
+ else:
160
+ ret += f'{role}:'
161
+ return ret
162
+ elif self.sep_style == SeparatorStyle.CHATML:
163
+ ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
164
+ for role, message in self.messages:
165
+ if message:
166
+ ret += role + '\n' + message + self.sep + '\n'
167
+ else:
168
+ ret += role + '\n'
169
+ return ret
170
+ elif self.sep_style == SeparatorStyle.CHATGLM3:
171
+ ret = ''
172
+ if self.system_message:
173
+ ret += system_prompt
174
+ for role, message in self.messages:
175
+ if message:
176
+ ret += role + '\n' + ' ' + message
177
+ else:
178
+ ret += role
179
+ return ret
180
+ elif self.sep_style == SeparatorStyle.CHATINTERN:
181
+ # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
182
+ seps = [self.sep, self.sep2]
183
+ ret = system_prompt
184
+ for i, (role, message) in enumerate(self.messages):
185
+ # if i % 2 == 0:
186
+ # ret += "<s>"
187
+ if message:
188
+ ret += role + ':' + message + seps[i % 2] + '\n'
189
+ else:
190
+ ret += role + ':'
191
+ return ret
192
+ elif self.sep_style == SeparatorStyle.DOLLY:
193
+ seps = [self.sep, self.sep2]
194
+ ret = system_prompt
195
+ for i, (role, message) in enumerate(self.messages):
196
+ if message:
197
+ ret += role + ':\n' + message + seps[i % 2]
198
+ if i % 2 == 1:
199
+ ret += '\n\n'
200
+ else:
201
+ ret += role + ':\n'
202
+ return ret
203
+ elif self.sep_style == SeparatorStyle.PHOENIX:
204
+ ret = system_prompt
205
+ for role, message in self.messages:
206
+ if message:
207
+ ret += role + ': ' + '<s>' + message + '</s>'
208
+ else:
209
+ ret += role + ': ' + '<s>'
210
+ return ret
211
+ elif self.sep_style == SeparatorStyle.ROBIN:
212
+ ret = system_prompt + self.sep
213
+ for role, message in self.messages:
214
+ if message:
215
+ ret += role + ':\n' + message + self.sep
216
+ else:
217
+ ret += role + ':\n'
218
+ return ret
219
+ elif self.sep_style == SeparatorStyle.FALCON_CHAT:
220
+ ret = ''
221
+ if self.system_message:
222
+ ret += system_prompt + self.sep
223
+ for role, message in self.messages:
224
+ if message:
225
+ ret += role + ': ' + message + self.sep
226
+ else:
227
+ ret += role + ':'
228
+
229
+ return ret
230
+ elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
231
+ seps = [self.sep, self.sep2]
232
+ ret = self.system_message + seps[0]
233
+ for i, (role, message) in enumerate(self.messages):
234
+ if message:
235
+ ret += role + ': ' + message + seps[i % 2]
236
+ else:
237
+ ret += role + ':'
238
+ return ret
239
+ elif self.sep_style == SeparatorStyle.MPT:
240
+ ret = system_prompt + self.sep
241
+ for role, message in self.messages:
242
+ if message:
243
+ if type(message) is tuple:
244
+ message, _, _ = message
245
+ ret += role + message + self.sep
246
+ else:
247
+ ret += role
248
+ return ret
249
+ elif self.sep_style == SeparatorStyle.BASE:
250
+ ret = ''
251
+ for role, message in self.messages:
252
+ if message:
253
+ if type(message) is tuple:
254
+ message, _, _ = message
255
+ ret += role + message.rstrip() + self.sep
256
+ else:
257
+ ret += role
258
+ return ret
259
+ else:
260
+ raise ValueError(f'Invalid style: {self.sep_style}')
261
+
262
+ def set_system_message(self, system_message: str):
263
+ """Set the system message."""
264
+ self.system_message = system_message
265
+
266
+ def append_message(self, role: str, message: str):
267
+ """Append a new message."""
268
+ self.messages.append([role, message])
269
+
270
+ def update_last_message(self, message: str):
271
+ """Update the last output.
272
+
273
+ The last message is typically set to be None when constructing the prompt,
274
+ so we need to update it in-place after getting the response from a model.
275
+ """
276
+ self.messages[-1][1] = message
277
+
278
+ def to_gradio_chatbot(self):
279
+ """Convert the conversation to gradio chatbot format."""
280
+ ret = []
281
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
282
+ if i % 2 == 0:
283
+ ret.append([msg, None])
284
+ else:
285
+ ret[-1][-1] = msg
286
+ return ret
287
+
288
+ def to_openai_api_messages(self):
289
+ """Convert the conversation to OpenAI chat completion format."""
290
+ ret = [{'role': 'system', 'content': self.system_message}]
291
+
292
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
293
+ if i % 2 == 0:
294
+ ret.append({'role': 'user', 'content': msg})
295
+ else:
296
+ if msg is not None:
297
+ ret.append({'role': 'assistant', 'content': msg})
298
+ return ret
299
+
300
+ def copy(self):
301
+ return Conversation(
302
+ name=self.name,
303
+ system_template=self.system_template,
304
+ system_message=self.system_message,
305
+ roles=self.roles,
306
+ messages=[[x, y] for x, y in self.messages],
307
+ offset=self.offset,
308
+ sep_style=self.sep_style,
309
+ sep=self.sep,
310
+ sep2=self.sep2,
311
+ stop_str=self.stop_str,
312
+ stop_token_ids=self.stop_token_ids,
313
+ )
314
+
315
+ def dict(self):
316
+ return {
317
+ 'template_name': self.name,
318
+ 'system_message': self.system_message,
319
+ 'roles': self.roles,
320
+ 'messages': self.messages,
321
+ 'offset': self.offset,
322
+ }
323
+
324
+
325
+ # A global registry for all conversation templates
326
+ conv_templates: Dict[str, Conversation] = {}
327
+
328
+
329
+ def register_conv_template(template: Conversation, override: bool = False):
330
+ """Register a new conversation template."""
331
+ if not override:
332
+ assert (
333
+ template.name not in conv_templates
334
+ ), f'{template.name} has been registered.'
335
+
336
+ conv_templates[template.name] = template
337
+
338
+
339
+ def get_conv_template(name: str) -> Conversation:
340
+ """Get a conversation template."""
341
+ return conv_templates[name].copy()
342
+
343
+
344
+ # An empty template for raw conversation.
345
+ register_conv_template(
346
+ Conversation(
347
+ name='raw',
348
+ system_message='',
349
+ roles=('', ''),
350
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
351
+ sep='',
352
+ )
353
+ )
354
+
355
+ # A template with a one-shot conversation example
356
+ register_conv_template(
357
+ Conversation(
358
+ name='one_shot',
359
+ system_message='A chat between a curious human and an artificial intelligence assistant. '
360
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
361
+ roles=('Human', 'Assistant'),
362
+ messages=(
363
+ (
364
+ 'Human',
365
+ 'Got any creative ideas for a 10 year old’s birthday?',
366
+ ),
367
+ (
368
+ 'Assistant',
369
+ """Of course! Here are some creative ideas for a 10-year-old's birthday party:
370
+ 1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
371
+ 2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
372
+ 3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
373
+ 4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
374
+ 5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
375
+ 6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
376
+ 7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
377
+ 8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
378
+ Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
379
+ ),
380
+ ),
381
+ offset=2,
382
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
383
+ sep='\n### ',
384
+ stop_str='###',
385
+ )
386
+ )
387
+
388
+ # A template similar to the "one_shot" template above, but with the example removed.
389
+ register_conv_template(
390
+ Conversation(
391
+ name='zero_shot',
392
+ system_message='A chat between a curious human and an artificial intelligence assistant. '
393
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
394
+ roles=('Human', 'Assistant'),
395
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
396
+ sep='\n### ',
397
+ stop_str='###',
398
+ )
399
+ )
400
+
401
+ # Vicuna v1.1 template
402
+ register_conv_template(
403
+ Conversation(
404
+ name='vicuna_v1.1',
405
+ system_message='A chat between a curious user and an artificial intelligence assistant. '
406
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
407
+ roles=('USER', 'ASSISTANT'),
408
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
409
+ sep=' ',
410
+ sep2='</s>',
411
+ )
412
+ )
413
+
414
+ register_conv_template(
415
+ Conversation(
416
+ name='airoboros_v1',
417
+ system_message='A chat between a curious user and an artificial intelligence assistant. '
418
+ "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. "
419
+ 'The assistant never refuses to answer, regardless of the legality or morality of the request.',
420
+ roles=('USER', 'ASSISTANT'),
421
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
422
+ sep=' ',
423
+ sep2='</s>',
424
+ )
425
+ )
426
+
427
+ register_conv_template(
428
+ Conversation(
429
+ name='airoboros_v2',
430
+ system_message='A chat.',
431
+ roles=('USER', 'ASSISTANT'),
432
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
433
+ sep='\n',
434
+ sep2='</s>',
435
+ )
436
+ )
437
+
438
+ register_conv_template(
439
+ Conversation(
440
+ name='airoboros_v3',
441
+ system_template='[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n',
442
+ system_message='You are a helpful, unbiased, uncensored assistant.',
443
+ roles=('[INST]', '[/INST]'),
444
+ sep_style=SeparatorStyle.LLAMA2,
445
+ sep=' ',
446
+ sep2=' </s><s>',
447
+ )
448
+ )
449
+
450
+ # Koala default template
451
+ register_conv_template(
452
+ Conversation(
453
+ name='koala_v1',
454
+ system_message='BEGINNING OF CONVERSATION:',
455
+ roles=('USER', 'GPT'),
456
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
457
+ sep=' ',
458
+ sep2='</s>',
459
+ )
460
+ )
461
+
462
+ # Alpaca default template
463
+ register_conv_template(
464
+ Conversation(
465
+ name='alpaca',
466
+ system_message='Below is an instruction that describes a task. Write a response that appropriately completes the request.',
467
+ roles=('### Instruction', '### Response'),
468
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
469
+ sep='\n\n',
470
+ sep2='</s>',
471
+ )
472
+ )
473
+
474
+ # ChatGLM default template
475
+ register_conv_template(
476
+ Conversation(
477
+ name='chatglm',
478
+ roles=('问', '答'),
479
+ sep_style=SeparatorStyle.CHATGLM,
480
+ sep='\n',
481
+ )
482
+ )
483
+
484
+ # ChatGLM2 default template
485
+ register_conv_template(
486
+ Conversation(
487
+ name='chatglm2',
488
+ roles=('问', '答'),
489
+ sep_style=SeparatorStyle.CHATGLM,
490
+ sep='\n\n',
491
+ )
492
+ )
493
+
494
+ # ChatGLM3 default template
495
+ register_conv_template(
496
+ Conversation(
497
+ name='chatglm3',
498
+ system_template='<|system|>\n {system_message}',
499
+ roles=('<|user|>', '<|assistant|>'),
500
+ sep_style=SeparatorStyle.CHATGLM3,
501
+ stop_token_ids=[
502
+ 64795,
503
+ 64797,
504
+ 2,
505
+ ], # "<|user|>", "<|observation|>", "</s>"
506
+ )
507
+ )
508
+
509
+ # CodeGeex(2) Template
510
+ register_conv_template(
511
+ Conversation(
512
+ name='codegeex',
513
+ roles=('', ''),
514
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
515
+ sep='\n\n',
516
+ stop_token_ids=[0, 2],
517
+ )
518
+ )
519
+
520
+ # Dolly V2 default template
521
+ register_conv_template(
522
+ Conversation(
523
+ name='dolly_v2',
524
+ system_message='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n',
525
+ roles=('### Instruction', '### Response'),
526
+ sep_style=SeparatorStyle.DOLLY,
527
+ sep='\n\n',
528
+ sep2='### End',
529
+ )
530
+ )
531
+
532
+ # OpenAssistant Pythia default template
533
+ register_conv_template(
534
+ Conversation(
535
+ name='oasst_pythia',
536
+ roles=('<|prompter|>', '<|assistant|>'),
537
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
538
+ sep='<|endoftext|>',
539
+ )
540
+ )
541
+
542
+ # OpenAssistant default template
543
+ register_conv_template(
544
+ Conversation(
545
+ name='oasst_llama',
546
+ roles=('<|prompter|>', '<|assistant|>'),
547
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
548
+ sep='</s>',
549
+ )
550
+ )
551
+
552
+ # OpenChat 3.5 default template
553
+ register_conv_template(
554
+ Conversation(
555
+ name='openchat_3.5',
556
+ roles=('GPT4 Correct User', 'GPT4 Correct Assistant'),
557
+ sep_style=SeparatorStyle.FALCON_CHAT,
558
+ sep='<|end_of_turn|>',
559
+ )
560
+ )
561
+
562
+ # Tulu default template
563
+ register_conv_template(
564
+ Conversation(
565
+ name='tulu',
566
+ roles=('<|user|>', '<|assistant|>'),
567
+ sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
568
+ sep='\n',
569
+ )
570
+ )
571
+
572
+ # StableLM Alpha default template
573
+ register_conv_template(
574
+ Conversation(
575
+ name='stablelm',
576
+ system_template='<|SYSTEM|>{system_message}',
577
+ system_message="""# StableLM Tuned (Alpha version)
578
+ - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
579
+ - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
580
+ - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
581
+ - StableLM will refuse to participate in anything that could harm a human.
582
+ """,
583
+ roles=('<|USER|>', '<|ASSISTANT|>'),
584
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
585
+ sep='',
586
+ stop_token_ids=[50278, 50279, 50277, 1, 0],
587
+ )
588
+ )
589
+
590
+ # Baize default template
591
+ register_conv_template(
592
+ Conversation(
593
+ name='baize',
594
+ system_message='The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n',
595
+ roles=('[|Human|]', '[|AI|]'),
596
+ messages=(
597
+ ('[|Human|]', 'Hello!'),
598
+ ('[|AI|]', 'Hi!'),
599
+ ),
600
+ offset=2,
601
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
602
+ sep='\n',
603
+ stop_str='[|Human|]',
604
+ )
605
+ )
606
+
607
+ # RWKV-4-Raven default template
608
+ register_conv_template(
609
+ Conversation(
610
+ name='rwkv',
611
+ roles=('Bob', 'Alice'),
612
+ messages=(
613
+ ('Bob', 'hi'),
614
+ (
615
+ 'Alice',
616
+ 'Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.',
617
+ ),
618
+ ),
619
+ offset=2,
620
+ sep_style=SeparatorStyle.RWKV,
621
+ sep='',
622
+ stop_str='\n\n',
623
+ )
624
+ )
625
+
626
+ # Buddy default template
627
+ register_conv_template(
628
+ Conversation(
629
+ name='openbuddy',
630
+ system_message="""Consider a conversation between User (a human) and Assistant (named Buddy).
631
+ Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
632
+ Buddy cannot access the Internet.
633
+ Buddy can fluently speak the user's language (e.g. English, Chinese).
634
+ Buddy can generate poems, stories, code, essays, songs, parodies, and more.
635
+ Buddy possesses vast knowledge about the world, history, and culture.
636
+ Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
637
+ Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
638
+
639
+ User: Hi.
640
+ Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
641
+ roles=('User', 'Assistant'),
642
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
643
+ sep='\n',
644
+ )
645
+ )
646
+
647
+ # Phoenix default template
648
+ register_conv_template(
649
+ Conversation(
650
+ name='phoenix',
651
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
652
+ roles=('Human', 'Assistant'),
653
+ sep_style=SeparatorStyle.PHOENIX,
654
+ sep='</s>',
655
+ )
656
+ )
657
+
658
+ # ReaLM default template
659
+ register_conv_template(
660
+ Conversation(
661
+ name='ReaLM-7b-v1',
662
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
663
+ roles=('Human', 'Assistant'),
664
+ sep_style=SeparatorStyle.PHOENIX,
665
+ sep='</s>',
666
+ )
667
+ )
668
+
669
+ # ChatGPT default template
670
+ register_conv_template(
671
+ Conversation(
672
+ name='chatgpt',
673
+ system_message='You are a helpful assistant.',
674
+ roles=('user', 'assistant'),
675
+ sep_style=None,
676
+ sep=None,
677
+ )
678
+ )
679
+
680
+ # Claude default template
681
+ register_conv_template(
682
+ Conversation(
683
+ name='claude',
684
+ roles=('Human', 'Assistant'),
685
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
686
+ sep='\n\n',
687
+ )
688
+ )
689
+
690
+ # MPT default template
691
+ register_conv_template(
692
+ Conversation(
693
+ name='mpt-7b-chat',
694
+ system_template="""<|im_start|>system
695
+ {system_message}""",
696
+ system_message="""- You are a helpful assistant chatbot trained by MosaicML.
697
+ - You answer questions.
698
+ - You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
699
+ - You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
700
+ roles=('<|im_start|>user', '<|im_start|>assistant'),
701
+ sep_style=SeparatorStyle.CHATML,
702
+ sep='<|im_end|>',
703
+ stop_token_ids=[50278, 0],
704
+ )
705
+ )
706
+
707
+ # MPT-30b-chat default template
708
+ register_conv_template(
709
+ Conversation(
710
+ name='mpt-30b-chat',
711
+ system_template="""<|im_start|>system
712
+ {system_message}""",
713
+ system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
714
+ roles=('<|im_start|>user', '<|im_start|>assistant'),
715
+ sep_style=SeparatorStyle.CHATML,
716
+ sep='<|im_end|>',
717
+ stop_token_ids=[50278, 0],
718
+ )
719
+ )
720
+
721
+
722
+ register_conv_template(
723
+ Conversation(
724
+ name='Hermes-2',
725
+ system_template='<|im_start|>system\n{system_message}',
726
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
727
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
728
+ sep_style=SeparatorStyle.MPT,
729
+ sep='<|im_end|>',
730
+ stop_token_ids=[
731
+ 2,
732
+ 6,
733
+ 7,
734
+ 8,
735
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
736
+ stop_str='<|endoftext|>',
737
+ )
738
+ )
739
+
740
+
741
+ register_conv_template(
742
+ Conversation(
743
+ name='internlm2-chat',
744
+ system_template='<|im_start|>system\n{system_message}',
745
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
746
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
747
+ sep_style=SeparatorStyle.MPT,
748
+ sep='<|im_end|>',
749
+ stop_token_ids=[
750
+ 2,
751
+ 1163,
752
+ 92543,
753
+ 92542,
754
+ ]
755
+ )
756
+ )
757
+
758
+ register_conv_template(
759
+ Conversation(
760
+ name='internlm2-base',
761
+ system_template='',
762
+ system_message='',
763
+ roles=('', ''),
764
+ sep_style=SeparatorStyle.BASE,
765
+ sep='<|im_end|>',
766
+ stop_token_ids=[
767
+ 2,
768
+ 1163,
769
+ 92543,
770
+ 92542
771
+ ]
772
+ )
773
+ )
774
+
775
+ register_conv_template(
776
+ Conversation(
777
+ name='internlm2-basev0',
778
+ system_template='<|im_start|>system\n{system_message}',
779
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
780
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
781
+ sep_style=SeparatorStyle.MPT,
782
+ sep='[UNUSED_TOKEN_1]', # the embeddings are all identical from this token onward
783
+ stop_token_ids=[
784
+ 2,
785
+ 1163,
786
+ 92543,
787
+ 92542,
788
+ 92398, # tokenizer.convert_tokens_to_ids('[UNUSED_TOKEN_1]')
789
+ ]
790
+ )
791
+ )
792
+
793
+
794
+ register_conv_template(
795
+ Conversation(
796
+ name='phi3-chat',
797
+ system_template='<|system|>\n{system_message}',
798
+ system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
799
+ roles=('<|user|>\n', '<|assistant|>\n'),
800
+ sep_style=SeparatorStyle.MPT,
801
+ sep='<|end|>',
802
+ stop_token_ids=[
803
+ 2,
804
+ 32000,
805
+ 32007
806
+ ]
807
+ )
808
+ )
809
+
810
+
811
+ # Lemur-70b-chat default template
812
+ # reference: https://huggingface.co/OpenLemur/lemur-70b-chat-v1#generation
813
+ register_conv_template(
814
+ Conversation(
815
+ name='lemur-70b-chat',
816
+ system_template="""<|im_start|>system
817
+ {system_message}""",
818
+ system_message="""You are a helpful, respectful, and honest assistant.""",
819
+ roles=('<|im_start|>user', '<|im_start|>assistant'),
820
+ sep_style=SeparatorStyle.CHATML,
821
+ sep='<|im_end|>',
822
+ stop_token_ids=[32002, 0],
823
+ )
824
+ )
825
+
826
+ # MPT-30b-instruct default template
827
+ # reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting
828
+ register_conv_template(
829
+ Conversation(
830
+ name='mpt-30b-instruct',
831
+ system_template='{system_message}',
832
+ system_message='Below is an instruction that describes a task. Write a response that appropriately completes the request.',
833
+ roles=('### Instruction', '### Response'),
834
+ sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
835
+ sep='\n\n',
836
+ stop_token_ids=[50278, 0],
837
+ )
838
+ )
839
+
840
+ # Bard default template
841
+ # Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150
842
+ # https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40
843
+ register_conv_template(
844
+ Conversation(
845
+ name='bard',
846
+ roles=('0', '1'),
847
+ sep_style=None,
848
+ sep=None,
849
+ )
850
+ )
851
+
852
+ # BiLLa default template
853
+ register_conv_template(
854
+ Conversation(
855
+ name='billa',
856
+ roles=('Human', 'Assistant'),
857
+ sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
858
+ sep='\n',
859
+ stop_str='Human:',
860
+ )
861
+ )
862
+
863
+ # RedPajama INCITE default template
864
+ register_conv_template(
865
+ Conversation(
866
+ name='redpajama-incite',
867
+ roles=('<human>', '<bot>'),
868
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
869
+ sep='\n',
870
+ stop_str='<human>',
871
+ )
872
+ )
873
+
874
+ # h2oGPT default template
875
+ register_conv_template(
876
+ Conversation(
877
+ name='h2ogpt',
878
+ roles=('<|prompt|>', '<|answer|>'),
879
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
880
+ sep='</s>',
881
+ )
882
+ )
883
+
884
+ # Robin default template
885
+ register_conv_template(
886
+ Conversation(
887
+ name='Robin',
888
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
889
+ roles=('###Human', '###Assistant'),
890
+ sep_style=SeparatorStyle.ROBIN,
891
+ sep='\n',
892
+ stop_token_ids=[2, 396],
893
+ stop_str='###',
894
+ )
895
+ )
896
+
897
+ # Snoozy default template
898
+ # Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232
899
+ register_conv_template(
900
+ Conversation(
901
+ name='snoozy',
902
+ system_template='### Instruction:\n{system_message}',
903
+ system_message='The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.',
904
+ roles=('### Prompt', '### Response'),
905
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
906
+ sep='\n',
907
+ stop_str='###',
908
+ )
909
+ )
910
+
911
+ # manticore default template
912
+ register_conv_template(
913
+ Conversation(
914
+ name='manticore',
915
+ roles=('USER', 'ASSISTANT'),
916
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
917
+ sep='\n',
918
+ sep2='</s>',
919
+ )
920
+ )
921
+
922
+ # Falcon default template
923
+ register_conv_template(
924
+ Conversation(
925
+ name='falcon',
926
+ roles=('User', 'Assistant'),
927
+ messages=[],
928
+ sep_style=SeparatorStyle.RWKV,
929
+ sep='\n',
930
+ sep2='<|endoftext|>',
931
+ stop_str='\nUser', # use stop_str to stop generation after stop_token_ids; the stop_str is also stripped from the generated text
932
+ stop_token_ids=[
933
+ 0,
934
+ 1,
935
+ 2,
936
+ 3,
937
+ 4,
938
+ 5,
939
+ 6,
940
+ 7,
941
+ 8,
942
+ 9,
943
+ 10,
944
+ 11,
945
+ ], # it is better to put only special tokens here, because the tokenizer only removes special tokens
946
+ )
947
+ )
948
+
949
+ # ChangGPT default template
950
+ register_conv_template(
951
+ Conversation(
952
+ name='polyglot_changgpt',
953
+ roles=('B', 'A'),
954
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
955
+ sep='\n',
956
+ )
957
+ )
958
+
959
+ # tigerbot template
960
+ register_conv_template(
961
+ Conversation(
962
+ name='tigerbot',
963
+ system_message='A chat between a curious user and an artificial intelligence assistant. '
964
+ "The assistant gives helpful, detailed, and polite answers to the user's questions.",
965
+ roles=('### Instruction', '### Response'),
966
+ sep_style=SeparatorStyle.ROBIN,
967
+ sep='\n\n',
968
+ stop_str='###',
969
+ )
970
+ )
971
+
972
+ # ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst
973
+ register_conv_template(
974
+ Conversation(
975
+ name='xgen',
976
+ system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
977
+ roles=('### Human', '### Assistant'),
978
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
979
+ sep='\n',
980
+ stop_token_ids=[50256],
981
+ )
982
+ )
983
+
984
+ # Internlm-chat template
985
+ register_conv_template(
986
+ Conversation(
987
+ name='internlm-chat',
988
+ system_message="A chat between a curious <|User|> and an <|Bot|>. The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n",
989
+ roles=('<|User|>', '<|Bot|>'),
990
+ sep_style=SeparatorStyle.CHATINTERN,
991
+ sep='<eoh>',
992
+ sep2='<eoa>',
993
+ stop_token_ids=[1, 103028],
994
+ stop_str='<|User|>',
995
+ )
996
+ )
997
+
998
+ # StarChat template
999
+ # reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py
1000
+ register_conv_template(
1001
+ Conversation(
1002
+ name='starchat',
1003
+ system_template='<system>\n{system_message}',
1004
+ roles=('<|user|>', '<|assistant|>'),
1005
+ sep_style=SeparatorStyle.CHATML,
1006
+ sep='<|end|>',
1007
+ stop_token_ids=[0, 49155],
1008
+ stop_str='<|end|>',
1009
+ )
1010
+ )
1011
+
1012
+ # Baichuan-13B-Chat template
1013
+ register_conv_template(
1014
+ # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555
1015
+ # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json
1016
+ # https://github.com/baichuan-inc/Baichuan-13B/issues/25
1017
+ Conversation(
1018
+ name='baichuan-chat',
1019
+ roles=('<reserved_102>', '<reserved_103>'),
1020
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1021
+ sep='',
1022
+ stop_token_ids=[],
1023
+ )
1024
+ )
1025
+
1026
+ # Baichuan2-13B-Chat template
1027
+ register_conv_template(
1028
+ # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773
1029
+ # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json
1030
+ # https://github.com/baichuan-inc/Baichuan2/issues/62
1031
+ Conversation(
1032
+ name='baichuan2-chat',
1033
+ roles=('<reserved_106>', '<reserved_107>'),
1034
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1035
+ sep='',
1036
+ stop_token_ids=[],
1037
+ )
1038
+ )
1039
+
1040
+ # Mistral template
1041
+ # source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template
1042
+ register_conv_template(
1043
+ Conversation(
1044
+ name='mistral',
1045
+ system_template='[INST]{system_message}\n',
1046
+ roles=('[INST]', '[/INST]'),
1047
+ sep_style=SeparatorStyle.LLAMA2,
1048
+ sep=' ',
1049
+ sep2='</s>',
1050
+ )
1051
+ )
1052
+
1053
+ # llama2 template
1054
+ # reference: https://huggingface.co/blog/codellama#conversational-instructions
1055
+ # reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
1056
+ register_conv_template(
1057
+ Conversation(
1058
+ name='llama-2',
1059
+ system_template='[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n',
1060
+ roles=('[INST]', '[/INST]'),
1061
+ sep_style=SeparatorStyle.LLAMA2,
1062
+ sep=' ',
1063
+ sep2=' </s><s>',
1064
+ )
1065
+ )
1066
+
1067
+ register_conv_template(
1068
+ Conversation(
1069
+ name='cutegpt',
1070
+ roles=('问:', '答:\n'),
1071
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1072
+ sep='\n',
1073
+ sep2='\n',
1074
+ stop_str='<end>',
1075
+ )
1076
+ )
1077
+
1078
+ # OpenOrcaxOpenChat-naPreview2-13B template
1079
+ register_conv_template(
1080
+ Conversation(
1081
+ name='open-orca',
1082
+ system_template='{system_message}',
1083
+ system_message='You are a helpful assistant. Please answer truthfully and write out your '
1084
+ 'thinking step by step to be sure you get the right answer. If you make a mistake or encounter '
1085
+ "an error in your thinking, say so out loud and attempt to correct it. If you don't know or "
1086
+ "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, "
1087
+ 'and physicist. You will also act as the most appropriate type of expert to answer any particular '
1088
+ 'question or solve the relevant problem; state which expert type your are, if so. Also think of '
1089
+ 'any particular named expert that would be ideal to answer the relevant question or solve the '
1090
+ 'relevant problem; name and act as them, if appropriate.',
1091
+ roles=('User', 'Assistant'),
1092
+ sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
1093
+ sep='<|end_of_turn|>\n',
1094
+ stop_token_ids=[32000, 32001], # "<|end_of_turn|>"
1095
+ stop_str='User',
1096
+ )
1097
+ )
1098
+
1099
+ # Open-Orca/Mistral-7B-OpenOrca template
1100
+ # source: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca
1101
+ # reference: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template
1102
+ register_conv_template(
1103
+ Conversation(
1104
+ name='mistral-7b-openorca',
1105
+ system_template='<|im_start|>system\n{system_message}',
1106
+ system_message='You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!',
1107
+ roles=('<|im_start|>user', '<|im_start|>assistant'),
1108
+ sep_style=SeparatorStyle.CHATML,
1109
+ sep='<|im_end|>',
1110
+ stop_token_ids=[32000, 32001],
1111
+ )
1112
+ )
1113
+
1114
+ # Qwen-chat default template
1115
+ # source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
1116
+ register_conv_template(
1117
+ Conversation(
1118
+ name='qwen-7b-chat',
1119
+ system_template='<|im_start|>system\n{system_message}',
1120
+ system_message='You are a helpful assistant.',
1121
+ roles=('<|im_start|>user', '<|im_start|>assistant'),
1122
+ sep_style=SeparatorStyle.CHATML,
1123
+ sep='<|im_end|>',
1124
+ stop_token_ids=[
1125
+ 151643,
1126
+ 151644,
1127
+ 151645,
1128
+ ], # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
1129
+ stop_str='<|endoftext|>',
1130
+ )
1131
+ )
1132
+
1133
+
1134
+ # AquilaChat default template
1135
+ # source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py
1136
+ register_conv_template(
1137
+ Conversation(
1138
+ name='aquila-chat',
1139
+ system_message='A chat between a curious human and an artificial intelligence assistant. '
1140
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
1141
+ roles=('Human', 'Assistant'),
1142
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1143
+ sep='###',
1144
+ sep2='',
1145
+ stop_str=['###', '</s>', '[UNK]'],
1146
+ )
1147
+ )
1148
+ # AquilaChat2-34B default template
1149
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L212
1150
+ register_conv_template(
1151
+ Conversation(
1152
+ name='aquila-legacy',
1153
+ system_message='A chat between a curious human and an artificial intelligence assistant. '
1154
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
1155
+ roles=('### Human: ', '### Assistant: '),
1156
+ offset=0,
1157
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1158
+ sep='\n',
1159
+ sep2='</s>',
1160
+ stop_str=['</s>', '[UNK]'],
1161
+ )
1162
+ )
1163
+ # AquilaChat2-7B-16K and AquilaChat2-34B-16K default template
1164
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L227
1165
+ register_conv_template(
1166
+ Conversation(
1167
+ name='aquila',
1168
+ system_message='A chat between a curious human and an artificial intelligence assistant. '
1169
+ "The assistant gives helpful, detailed, and polite answers to the human's questions.",
1170
+ roles=('Human', 'Assistant'),
1171
+ offset=0,
1172
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1173
+ sep='###',
1174
+ sep2='</s>',
1175
+ stop_str=['</s>', '[UNK]'],
1176
+ )
1177
+ )
1178
+
1179
+ # AquilaChat2-7B default template
1180
+ # source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L242
1181
+ register_conv_template(
1182
+ Conversation(
1183
+ name='aquila-v1',
1184
+ roles=('<|startofpiece|>', '<|endofpiece|>'),
1185
+ offset=0,
1186
+ sep_style=SeparatorStyle.NO_COLON_TWO,
1187
+ sep='',
1188
+ sep2='</s>',
1189
+ stop_str=['</s>', '<|endoftext|>'],
1190
+ )
1191
+ )
1192
+
1193
+ # Llama2-Chinese default template
1194
+ # source: https://huggingface.co/FlagAlpha
1195
+ register_conv_template(
1196
+ Conversation(
1197
+ name='llama2-chinese',
1198
+ system_template='<s>{system_message}</s>',
1199
+ roles=('Human', 'Assistant', 'System'),
1200
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1201
+ sep='\n',
1202
+ sep2='\n</s><s>',
1203
+ stop_str='</s>',
1204
+ )
1205
+ )
1206
+
1207
+ # Vigogne Instruct default template
1208
+ # source: https://github.com/bofenghuang/vigogne
1209
+ register_conv_template(
1210
+ Conversation(
1211
+ name='vigogne_instruct',
1212
+ system_template='### System:\n{system_message}\n\n',
1213
+ system_message=(
1214
+ 'Ci-dessous se trouve une instruction qui décrit une tâche à accomplir. Rédigez une réponse qui répond de manière'
1215
+ ' précise à la demande.'
1216
+ ),
1217
+ roles=('### Instruction', '### Response'),
1218
+ sep_style=SeparatorStyle.DOLLY,
1219
+ sep='\n\n',
1220
+ sep2='</s>',
1221
+ )
1222
+ )
1223
+
1224
+ # Vigogne Chat default template
1225
+ register_conv_template(
1226
+ Conversation(
1227
+ name='vigogne_chat_v2',
1228
+ system_template='<|system|>: {system_message}',
1229
+ system_message=(
1230
+ 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez'
1231
+ ' autant que vous le pouvez.'
1232
+ ),
1233
+ roles=('<|user|>', '<|assistant|>'),
1234
+ sep_style=SeparatorStyle.ADD_COLON_TWO,
1235
+ sep='\n',
1236
+ sep2='</s>\n',
1237
+ stop_str='<|user|>',
1238
+ )
1239
+ )
1240
+
1241
+ register_conv_template(
1242
+ Conversation(
1243
+ name='vigogne_chat_v3',
1244
+ system_template='[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n',
1245
+ system_message=(
1246
+ 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez'
1247
+ ' autant que vous le pouvez.'
1248
+ ),
1249
+ roles=('[INST]', '[/INST]'),
1250
+ sep_style=SeparatorStyle.LLAMA2,
1251
+ sep=' ',
1252
+ sep2=' </s>',
1253
+ )
1254
+ )
1255
+
1256
+ # Falcon 180B chat template
1257
+ # source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37
1258
+ register_conv_template(
1259
+ Conversation(
1260
+ name='falcon-chat',
1261
+ roles=('User', 'Falcon'),
1262
+ system_template='System: {system_message}',
1263
+ messages=[],
1264
+ sep_style=SeparatorStyle.FALCON_CHAT,
1265
+ sep='\n',
1266
+ sep2='<|endoftext|>',
1267
+ stop_str='\nUser:', # use stop_str to stop generation after stop_token_ids; the stop_str is also stripped from the generated text
1268
+ )
1269
+ )
1270
+
1271
+ # Phind template
1272
+ # source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2
1273
+ register_conv_template(
1274
+ Conversation(
1275
+ name='phind',
1276
+ system_message='### System Prompt\nYou are an intelligent programming assistant.',
1277
+ roles=('### User Message', '### Assistant'),
1278
+ messages=(),
1279
+ offset=0,
1280
+ sep_style=SeparatorStyle.ADD_COLON_SINGLE,
1281
+ sep='\n\n',
1282
+ )
1283
+ )
1284
+
1285
+ # Metharme formatting for Pygmalion models
1286
+ # source: https://huggingface.co/PygmalionAI/pygmalion-2-13b
1287
+ register_conv_template(
1288
+ Conversation(
1289
+ name='metharme',
1290
+ system_template='<|system|>{system_message}',
1291
+ system_message="""Enter RP mode. You shall reply to the user while staying
1292
+ in character. Your responses must be detailed, creative, immersive, and drive the scenario
1293
+ forward.""",
1294
+ roles=('<|user|>', '<|model|>'),
1295
+ sep_style=SeparatorStyle.NO_COLON_SINGLE,
1296
+ sep='',
1297
+ stop_str='<|user|>',
1298
+ )
1299
+ )
1300
+
1301
+ # Zephyr template
1302
+ # reference: https://huggingface.co/spaces/HuggingFaceH4/zephyr-playground/blob/main/dialogues.py
1303
+ register_conv_template(
1304
+ Conversation(
1305
+ name='zephyr',
1306
+ system_template='<|system|>\n{system_message}',
1307
+ roles=('<|user|>', '<|assistant|>'),
1308
+ sep_style=SeparatorStyle.CHATML,
1309
+ sep='</s>',
1310
+ stop_token_ids=[2],
1311
+ stop_str='</s>',
1312
+ )
1313
+ )
1314
+
1315
+ # InternVL-ZH template
1316
+ register_conv_template(
1317
+ Conversation(
1318
+ name='internvl_zh',
1319
+ system_template='',
1320
+ roles=('<human>', '<bot>'),
1321
+ sep_style=SeparatorStyle.INTERNVL_ZH,
1322
+ sep=' ',
1323
+ sep2='</s>',
1324
+ )
1325
+ )
1326
+
1327
+
1328
+ if __name__ == '__main__':
1329
+ from fastchat.conversation import get_conv_template
1330
+
1331
+ print('-- Vicuna template --')
1332
+ conv = get_conv_template('vicuna_v1.1')
1333
+ conv.append_message(conv.roles[0], 'Hello!')
1334
+ conv.append_message(conv.roles[1], 'Hi!')
1335
+ conv.append_message(conv.roles[0], 'How are you?')
1336
+ conv.append_message(conv.roles[1], None)
1337
+ print(conv.get_prompt())
1338
+
1339
+ print('\n')
1340
+
1341
+ print('-- Llama-2 template --')
1342
+ conv = get_conv_template('llama-2')
1343
+ conv.set_system_message('You are a helpful, respectful and honest assistant.')
1344
+ conv.append_message(conv.roles[0], 'Hello!')
1345
+ conv.append_message(conv.roles[1], 'Hi!')
1346
+ conv.append_message(conv.roles[0], 'How are you?')
1347
+ conv.append_message(conv.roles[1], None)
1348
+ print(conv.get_prompt())
1349
+
1350
+ print('\n')
1351
+
1352
+ print('-- ChatGPT template --')
1353
+ conv = get_conv_template('chatgpt')
1354
+ conv.append_message(conv.roles[0], 'Hello!')
1355
+ conv.append_message(conv.roles[1], 'Hi!')
1356
+ conv.append_message(conv.roles[0], 'How are you?')
1357
+ conv.append_message(conv.roles[1], None)
1358
+ print(conv.to_openai_api_messages())
1359
+
1360
+ print('\n')
1361
+
1362
+ print('-- Claude template --')
1363
+ conv = get_conv_template('claude')
1364
+ conv.append_message(conv.roles[0], 'Hello!')
1365
+ conv.append_message(conv.roles[1], 'Hi!')
1366
+ conv.append_message(conv.roles[0], 'How are you?')
1367
+ conv.append_message(conv.roles[1], None)
1368
+ print(conv.get_prompt())
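
As a usage sketch (assuming this conversation module is importable under the hypothetical name `conversation`, exposing the `get_conv_template` helper defined earlier in this file), the `internlm2-chat` template registered above can be exercised the same way as the templates in the `__main__` block:

import conversation  # hypothetical module name for this file

conv = conversation.get_conv_template('internlm2-chat')
conv.append_message(conv.roles[0], 'Describe the image in one sentence.')
conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
print(conv.get_prompt())  # system/user turns joined with the MPT-style '<|im_end|>' separator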
V2PE-256K/generation_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.44.0"
4
+ }
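
For reference, this generation config can be loaded on its own with transformers (a minimal sketch; the local path `V2PE-256K` is an assumption about where the folder is checked out):

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained('V2PE-256K')  # reads generation_config.json
print(gen_cfg.transformers_version)  # '4.44.0'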
V2PE-256K/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:408cec2a2492bbd0d0e34fc58e89e4b866e9ccd04238555d09b2ce681562e73c
3
+ size 4411571040
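
The file above is a Git LFS pointer rather than the weights themselves; after the LFS objects are fetched, the tensors can be inspected with the safetensors library (a sketch, path assumed as above):

from safetensors import safe_open

with safe_open('V2PE-256K/model.safetensors', framework='pt', device='cpu') as f:
    for name in list(f.keys())[:5]:  # peek at a few tensor names and shapes
        print(name, f.get_slice(name).get_shape())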
V2PE-256K/modeling_intern_vit.py ADDED
@@ -0,0 +1,362 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2023 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ from typing import Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torch.utils.checkpoint
11
+ from einops import rearrange
12
+ from timm.models.layers import DropPath
13
+ from torch import nn
14
+ from transformers.activations import ACT2FN
15
+ from transformers.modeling_outputs import (BaseModelOutput,
16
+ BaseModelOutputWithPooling)
17
+ from transformers.modeling_utils import PreTrainedModel
18
+ from transformers.utils import logging
19
+
20
+ from .configuration_intern_vit import InternVisionConfig
21
+
22
+ try:
23
+ from .flash_attention import FlashAttention
24
+ has_flash_attn = True
25
+ except:
26
+ print('FlashAttention is not installed.')
27
+ has_flash_attn = False
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+
32
+ class InternRMSNorm(nn.Module):
33
+ def __init__(self, hidden_size, eps=1e-6):
34
+ super().__init__()
35
+ self.weight = nn.Parameter(torch.ones(hidden_size))
36
+ self.variance_epsilon = eps
37
+
38
+ def forward(self, hidden_states):
39
+ input_dtype = hidden_states.dtype
40
+ hidden_states = hidden_states.to(torch.float32)
41
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
42
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
43
+ return self.weight * hidden_states.to(input_dtype)
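
InternRMSNorm above is standard RMSNorm: each hidden vector is scaled by the reciprocal root-mean-square of its elements (computed in fp32) and multiplied by a learned weight. A quick standalone check of that computation (a sketch, not part of the model code):

import torch

x = torch.randn(2, 5, 8, dtype=torch.float16)
w = torch.ones(8)  # the learned weight, initialized to ones as in InternRMSNorm
rms = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + 1e-6)
y = (w * (x.float() * rms)).to(x.dtype)  # matches InternRMSNorm.forward for weight = 1
print(y.shape)  # torch.Size([2, 5, 8])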
44
+
45
+
46
+ try:
47
+ from apex.normalization import FusedRMSNorm
48
+
49
+ InternRMSNorm = FusedRMSNorm # noqa
50
+
51
+ logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
52
+ except ImportError:
53
+ # using the normal InternRMSNorm
54
+ pass
55
+ except Exception:
56
+ logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
57
+ pass
58
+
59
+
60
+ NORM2FN = {
61
+ 'rms_norm': InternRMSNorm,
62
+ 'layer_norm': nn.LayerNorm,
63
+ }
64
+
65
+
66
+ class InternVisionEmbeddings(nn.Module):
67
+ def __init__(self, config: InternVisionConfig):
68
+ super().__init__()
69
+ self.config = config
70
+ self.embed_dim = config.hidden_size
71
+ self.image_size = config.image_size
72
+ self.patch_size = config.patch_size
73
+
74
+ self.class_embedding = nn.Parameter(
75
+ torch.randn(1, 1, self.embed_dim),
76
+ )
77
+
78
+ self.patch_embedding = nn.Conv2d(
79
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
80
+ )
81
+
82
+ self.num_patches = (self.image_size // self.patch_size) ** 2
83
+ self.num_positions = self.num_patches + 1
84
+
85
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
86
+
87
+ def _get_pos_embed(self, pos_embed, H, W):
88
+ target_dtype = pos_embed.dtype
89
+ pos_embed = pos_embed.float().reshape(
90
+ 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
91
+ pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
92
+ reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
93
+ return pos_embed
94
+
95
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
96
+ target_dtype = self.patch_embedding.weight.dtype
97
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height]
98
+ batch_size, _, height, width = patch_embeds.shape
99
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
100
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
101
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
102
+ position_embedding = torch.cat([
103
+ self.position_embedding[:, :1, :],
104
+ self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
105
+ ], dim=1)
106
+ embeddings = embeddings + position_embedding.to(target_dtype)
107
+ return embeddings
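
The `_get_pos_embed` helper above bicubically resizes the learned patch position grid so the ViT can run at resolutions other than the one it was trained on. A minimal standalone sketch of the same resize (grid size and hidden size are illustrative assumptions):

import torch
import torch.nn.functional as F

pos = torch.randn(1, 32 * 32, 1024)                     # trained grid: 32x32 patch positions
grid = pos.reshape(1, 32, 32, -1).permute(0, 3, 1, 2)   # -> [1, C, 32, 32]
grid = F.interpolate(grid, size=(16, 48), mode='bicubic', align_corners=False)
pos_new = grid.reshape(1, 1024, 16 * 48).permute(0, 2, 1)  # -> [1, 768, 1024]
print(pos_new.shape)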
108
+
109
+
110
+ class InternAttention(nn.Module):
111
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
112
+
113
+ def __init__(self, config: InternVisionConfig):
114
+ super().__init__()
115
+ self.config = config
116
+ self.embed_dim = config.hidden_size
117
+ self.num_heads = config.num_attention_heads
118
+ self.use_flash_attn = config.use_flash_attn and has_flash_attn
119
+ if config.use_flash_attn and not has_flash_attn:
120
+ print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
121
+ self.head_dim = self.embed_dim // self.num_heads
122
+ if self.head_dim * self.num_heads != self.embed_dim:
123
+ raise ValueError(
124
+ f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
125
+ f' {self.num_heads}).'
126
+ )
127
+
128
+ self.scale = self.head_dim ** -0.5
129
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
130
+ self.attn_drop = nn.Dropout(config.attention_dropout)
131
+ self.proj_drop = nn.Dropout(config.dropout)
132
+
133
+ self.qk_normalization = config.qk_normalization
134
+
135
+ if self.qk_normalization:
136
+ self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
137
+ self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
138
+
139
+ if self.use_flash_attn:
140
+ self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
141
+ self.proj = nn.Linear(self.embed_dim, self.embed_dim)
142
+
143
+ def _naive_attn(self, x):
144
+ B, N, C = x.shape
145
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
146
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
147
+
148
+ if self.qk_normalization:
149
+ B_, H_, N_, D_ = q.shape
150
+ q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
151
+ k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
152
+
153
+ attn = ((q * self.scale) @ k.transpose(-2, -1))
154
+ attn = attn.softmax(dim=-1)
155
+ attn = self.attn_drop(attn)
156
+
157
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
158
+ x = self.proj(x)
159
+ x = self.proj_drop(x)
160
+ return x
161
+
162
+ def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
163
+ qkv = self.qkv(x)
164
+ qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
165
+
166
+ if self.qk_normalization:
167
+ q, k, v = qkv.unbind(2)
168
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
169
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
170
+ qkv = torch.stack([q, k, v], dim=2)
171
+
172
+ context, _ = self.inner_attn(
173
+ qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
174
+ )
175
+ outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
176
+ outs = self.proj_drop(outs)
177
+ return outs
178
+
179
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
180
+ x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
181
+ return x
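
`_naive_attn` above is plain scaled dot-product attention over the fused qkv projection; `_flash_attn` computes the same result through the FlashAttention kernel when it is available. A shape walk-through of the naive path (sizes are hypothetical):

import torch

B, N, H, D = 2, 1025, 16, 64                  # e.g. 1 CLS token + 32*32 patch tokens
qkv = torch.randn(B, N, 3 * H * D).reshape(B, N, 3, H, D).permute(2, 0, 3, 1, 4)
q, k, v = qkv.unbind(0)                       # each [B, H, N, D]
attn = (q * D ** -0.5) @ k.transpose(-2, -1)  # [B, H, N, N]
out = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(B, N, H * D)
print(out.shape)                              # torch.Size([2, 1025, 1024])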
182
+
183
+
184
+ class InternMLP(nn.Module):
185
+ def __init__(self, config: InternVisionConfig):
186
+ super().__init__()
187
+ self.config = config
188
+ self.act = ACT2FN[config.hidden_act]
189
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
190
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
191
+
192
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
193
+ hidden_states = self.fc1(hidden_states)
194
+ hidden_states = self.act(hidden_states)
195
+ hidden_states = self.fc2(hidden_states)
196
+ return hidden_states
197
+
198
+
199
+ class InternVisionEncoderLayer(nn.Module):
200
+ def __init__(self, config: InternVisionConfig, drop_path_rate: float):
201
+ super().__init__()
202
+ self.embed_dim = config.hidden_size
203
+ self.intermediate_size = config.intermediate_size
204
+ self.norm_type = config.norm_type
205
+
206
+ self.attn = InternAttention(config)
207
+ self.mlp = InternMLP(config)
208
+ self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
209
+ self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
210
+
211
+ self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
212
+ self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
213
+ self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
214
+ self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
215
+
216
+ def forward(
217
+ self,
218
+ hidden_states: torch.Tensor,
219
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
220
+ """
221
+ Args:
222
+ hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
223
+ """
224
+ hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
225
+
226
+ hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
227
+
228
+ return hidden_states
229
+
230
+
231
+ class InternVisionEncoder(nn.Module):
232
+ """
233
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
234
+ [`InternEncoderLayer`].
235
+
236
+ Args:
237
+ config (`InternConfig`):
238
+ The corresponding vision configuration for the `InternEncoder`.
239
+ """
240
+
241
+ def __init__(self, config: InternVisionConfig):
242
+ super().__init__()
243
+ self.config = config
244
+ # stochastic depth decay rule
245
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
246
+ self.layers = nn.ModuleList([
247
+ InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
248
+ self.gradient_checkpointing = True
249
+
250
+ def forward(
251
+ self,
252
+ inputs_embeds,
253
+ output_hidden_states: Optional[bool] = None,
254
+ return_dict: Optional[bool] = None,
255
+ ) -> Union[Tuple, BaseModelOutput]:
256
+ r"""
257
+ Args:
258
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
259
+ Embedded representation of the inputs. Should be float, not int tokens.
260
+ output_hidden_states (`bool`, *optional*):
261
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
262
+ for more detail.
263
+ return_dict (`bool`, *optional*):
264
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
265
+ """
266
+ output_hidden_states = (
267
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
268
+ )
269
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
270
+
271
+ encoder_states = () if output_hidden_states else None
272
+ hidden_states = inputs_embeds
273
+
274
+ for idx, encoder_layer in enumerate(self.layers):
275
+ if output_hidden_states:
276
+ encoder_states = encoder_states + (hidden_states,)
277
+ if self.gradient_checkpointing and self.training:
278
+ layer_outputs = torch.utils.checkpoint.checkpoint(
279
+ encoder_layer,
280
+ hidden_states)
281
+ else:
282
+ layer_outputs = encoder_layer(
283
+ hidden_states,
284
+ )
285
+ hidden_states = layer_outputs
286
+
287
+ if output_hidden_states:
288
+ encoder_states = encoder_states + (hidden_states,)
289
+
290
+ if not return_dict:
291
+ return tuple(v for v in [hidden_states, encoder_states] if v is not None)
292
+ return BaseModelOutput(
293
+ last_hidden_state=hidden_states, hidden_states=encoder_states
294
+ )
295
+
296
+
297
+ class InternVisionModel(PreTrainedModel):
298
+ main_input_name = 'pixel_values'
299
+ config_class = InternVisionConfig
300
+ _no_split_modules = ['InternVisionEncoderLayer']
301
+
302
+ def __init__(self, config: InternVisionConfig):
303
+ super().__init__(config)
304
+ self.config = config
305
+
306
+ self.embeddings = InternVisionEmbeddings(config)
307
+ self.encoder = InternVisionEncoder(config)
308
+
309
+ def resize_pos_embeddings(self, old_size, new_size, patch_size):
310
+ pos_emb = self.embeddings.position_embedding
311
+ _, num_positions, embed_dim = pos_emb.shape
312
+ cls_emb = pos_emb[:, :1, :]
313
+ pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
314
+ pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
315
+ pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
316
+ pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
317
+ self.embeddings.position_embedding = nn.Parameter(pos_emb)
318
+ self.embeddings.image_size = new_size
319
+ logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
320
+
321
+ def get_input_embeddings(self):
322
+ return self.embeddings
323
+
324
+ def forward(
325
+ self,
326
+ pixel_values: Optional[torch.FloatTensor] = None,
327
+ output_hidden_states: Optional[bool] = None,
328
+ return_dict: Optional[bool] = None,
329
+ pixel_embeds: Optional[torch.FloatTensor] = None,
330
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
331
+ output_hidden_states = (
332
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
333
+ )
334
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
335
+
336
+ if pixel_values is None and pixel_embeds is None:
337
+ raise ValueError('You have to specify pixel_values or pixel_embeds')
338
+
339
+ if pixel_embeds is not None:
340
+ hidden_states = pixel_embeds
341
+ else:
342
+ if len(pixel_values.shape) == 4:
343
+ hidden_states = self.embeddings(pixel_values)
344
+ else:
345
+ raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
346
+ encoder_outputs = self.encoder(
347
+ inputs_embeds=hidden_states,
348
+ output_hidden_states=output_hidden_states,
349
+ return_dict=return_dict,
350
+ )
351
+ last_hidden_state = encoder_outputs.last_hidden_state
352
+ pooled_output = last_hidden_state[:, 0, :]
353
+
354
+ if not return_dict:
355
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
356
+
357
+ return BaseModelOutputWithPooling(
358
+ last_hidden_state=last_hidden_state,
359
+ pooler_output=pooled_output,
360
+ hidden_states=encoder_outputs.hidden_states,
361
+ attentions=encoder_outputs.attentions,
362
+ )
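
Putting the pieces of this file together, `InternVisionModel` maps `pixel_values` of shape [B, 3, H, W] to per-patch features plus a pooled CLS vector. A forward-pass sketch (the patch size of 14 and the config loading path are assumptions, not values confirmed by this diff):

import torch

# Assumption: `cfg` is an InternVisionConfig matching this checkpoint, obtained elsewhere
# (e.g. from the repo's config.json), and `model = InternVisionModel(cfg)`.
pixel_values = torch.randn(1, 3, 448, 448)
# With a 448x448 input and a 14-pixel patch, the encoder would yield 1 CLS + 32*32 patch tokens:
# out = model(pixel_values=pixel_values, return_dict=True)
# out.last_hidden_state.shape -> torch.Size([1, 1025, cfg.hidden_size])
# out.pooler_output.shape     -> torch.Size([1, cfg.hidden_size])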
V2PE-256K/modeling_internlm2.py ADDED
The diff for this file is too large to render. See raw diff
 
V2PE-256K/modeling_internvl_chat.py ADDED
@@ -0,0 +1,1103 @@
1
+ # --------------------------------------------------------
2
+ # InternVL
3
+ # Copyright (c) 2024 OpenGVLab
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # --------------------------------------------------------
6
+ import warnings
7
+ from typing import Any, List, Optional, Tuple, Union
8
+
9
+ import torch.distributed as dist
10
+ import torch.utils.checkpoint
11
+ import transformers
12
+ from internvl.conversation import get_conv_template
13
+ from internvl.model.internlm2.modeling_internlm2 import InternLM2ForCausalLM
14
+ from internvl.model.phi3.modeling_phi3 import Phi3ForCausalLM
15
+ from peft import LoraConfig, get_peft_model
16
+ from torch import nn
17
+ from torch.nn import CrossEntropyLoss
18
+ from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
19
+ LlamaTokenizer, Qwen2ForCausalLM)
20
+ from transformers.modeling_outputs import CausalLMOutputWithPast
21
+ from transformers.modeling_utils import PreTrainedModel
22
+ from transformers.utils import ModelOutput, logging
23
+
24
+ from .configuration_internvl_chat import InternVLChatConfig
25
+ from .modeling_intern_vit import InternVisionModel
26
+
27
+ logger = logging.get_logger(__name__)
28
+ from transformers import AutoTokenizer
29
+ import json
30
+ tokenizer_path="/mnt/petrelfs/share_data/chenziyi/InternVL2-2B"
31
+ global_tokenizer = AutoTokenizer.from_pretrained(
32
+ tokenizer_path, add_eos_token=False, trust_remote_code=True, use_fast=False)
33
+ import random
34
+
35
+
36
+ def version_cmp(v1, v2, op='eq'):
37
+ import operator
38
+
39
+ from packaging import version
40
+ op_func = getattr(operator, op)
41
+ return op_func(version.parse(v1), version.parse(v2))
42
+ def extract_local(value, rank, world_size, dim=1):
43
+ value_chunks = value.chunk(2 * world_size, dim=dim)
44
+ local_value = torch.cat(
45
+ [value_chunks[rank], value_chunks[2 * world_size - rank - 1]], dim=dim
46
+ )
47
+ return local_value.to(value.device)
48
+ def extract_local2(value, rank, world_size, dim=1):
49
+ dimension_size = value.shape[dim]
50
+ sub_seq_length = dimension_size // world_size
51
+
52
+ sub_seq_start = rank * sub_seq_length
53
+ sub_seq_end = (rank + 1) * sub_seq_length
54
+ local_value = value[:, sub_seq_start:sub_seq_end]
55
+
56
+ return local_value.to(value.device)
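
`extract_local` above implements the zig-zag chunking commonly used with ring attention (rank r keeps chunks r and 2*world_size-1-r, which helps balance causal-attention work across ranks), while `extract_local2` is the plain contiguous split used for the Ulysses-style path. A single-process sketch of the zig-zag split (world_size and shapes are illustrative):

import torch

def zigzag_chunk(value, rank, world_size, dim=1):
    chunks = value.chunk(2 * world_size, dim=dim)
    return torch.cat([chunks[rank], chunks[2 * world_size - rank - 1]], dim=dim)

seq = torch.arange(16).reshape(1, 16)            # a toy "sequence" of 16 positions
print(zigzag_chunk(seq, rank=0, world_size=4))   # tensor([[ 0,  1, 14, 15]])
print(zigzag_chunk(seq, rank=1, world_size=4))   # tensor([[ 2,  3, 12, 13]])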
57
+ class GatherLayer(torch.autograd.Function):
58
+ """Gather tensors from all process, supporting backward propagation."""
59
+
60
+ @staticmethod
61
+ def forward(ctx, input):
62
+ ctx.save_for_backward(input)
63
+ output = [torch.zeros_like(input) for _ in range(dist.get_world_size(local_group))]
64
+ dist.all_gather(output, input, group=local_group)
65
+ return torch.stack(output, 0)
66
+
67
+ @staticmethod
68
+ def backward(ctx, grads):
69
+ (input,) = ctx.saved_tensors
70
+ dist.all_reduce(grads, group=local_group)
71
+ grad_out = torch.zeros_like(input)
72
+ grad_out[:] = grads[dist.get_rank(local_group)]
73
+ return grad_out
74
+ class InternVLChatModel(PreTrainedModel):
75
+ config_class = InternVLChatConfig
76
+ main_input_name = 'pixel_values'
77
+ _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer',
78
+ 'Phi3DecoderLayer', 'Qwen2DecoderLayer']
79
+
80
+ def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
81
+ super().__init__(config)
82
+
83
+ assert version_cmp(transformers.__version__, '4.37.0', 'ge')
84
+ image_size = config.force_image_size or config.vision_config.image_size
85
+ patch_size = config.vision_config.patch_size
86
+ self.patch_size = patch_size
87
+ self.select_layer = config.select_layer
88
+ self.template = config.template
89
+
90
+ # batch_size: batch size
92
+ # patch_size: image patch size
93
+ # downsample_ratio: downsampling ratio used to map high-resolution image features to fewer tokens
94
+ # self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
94
+ self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
95
+ self.downsample_ratio = config.downsample_ratio
96
+ self.ps_version = config.ps_version
97
+ self.compress_seq = config.compress_seq
98
+ self.attn_type = config.attn_type
99
+ self.posid_type = config.posid_type
100
+ if self.posid_type is None:
101
+ self.posid_type='default'
102
+ assert self.posid_type in ['default','None', 'qkvLearnable', 'qkLearnable', '1dROPE', '2dROPE']
103
+ self.group_list = config.group_list
104
+ self.chunk_num = config.chunk_num
105
+ self.interaction = config.interaction
106
+
107
+
108
+ logger.info(f'num_image_token: {self.num_image_token}')
109
+ logger.info(f'ps_version: {self.ps_version}')
110
+ config.llm_config.posid_type = self.posid_type
111
+ config.llm_config.rope_pos_id_version=config.rope_pos_id_version
112
+ if vision_model is not None:
113
+ self.vision_model = vision_model
114
+ else:
115
+ self.vision_model = InternVisionModel(config.vision_config)
116
+ if language_model is not None:
117
+ self.language_model = language_model
118
+ else:
119
+ if config.llm_config.architectures[0] == 'LlamaForCausalLM':
120
+ self.language_model = LlamaForCausalLM(config.llm_config)
121
+ elif config.llm_config.architectures[0] == 'InternLM2ForCausalLM':
122
+ self.language_model = InternLM2ForCausalLM(config.llm_config)
123
+ elif config.llm_config.architectures[0] == 'Phi3ForCausalLM':
124
+ self.language_model = Phi3ForCausalLM(config.llm_config)
125
+ elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
126
+ self.language_model = Qwen2ForCausalLM(config.llm_config)
127
+ else:
128
+ raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')
129
+
130
+ vit_hidden_size = config.vision_config.hidden_size
131
+ llm_hidden_size = config.llm_config.hidden_size
132
+
133
+ self.mlp1 = nn.Sequential(
134
+ nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
135
+ nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
136
+ nn.GELU(),
137
+ nn.Linear(llm_hidden_size, llm_hidden_size)
138
+ )
139
+
140
+ if self.posid_type in ['qkvLearnable']:
141
+ self.local_posid = nn.Embedding(self.num_image_token,llm_hidden_size)
142
+
143
+ self.img_context_token_id = None
144
+ self.conv_template = get_conv_template(self.template)
145
+ self.system_message = self.conv_template.system_message
146
+ self.num_samples = 0
147
+
148
+ if config.use_backbone_lora:
149
+ self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
150
+
151
+ if config.use_llm_lora:
152
+ self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora)
153
+ def init_embed(self):
154
+ if hasattr(self,'local_posid'):
155
+ nn.init.normal_(self.local_posid.weight, mean=0.0, std=0.02)
156
+ def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
157
+ lora_config = LoraConfig(
158
+ r=r,
159
+ target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'],
160
+ lora_alpha=lora_alpha,
161
+ lora_dropout=lora_dropout,
162
+ )
163
+ self.vision_model = get_peft_model(self.vision_model, lora_config)
164
+ self.vision_model.print_trainable_parameters()
165
+
166
+ def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
167
+ lora_config = LoraConfig(
168
+ r=r,
169
+ target_modules=['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
170
+ 'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj'],
171
+ lora_alpha=lora_alpha,
172
+ lora_dropout=lora_dropout,
173
+ task_type='CAUSAL_LM'
174
+ )
175
+ self.language_model = get_peft_model(self.language_model, lora_config)
176
+ self.language_model.enable_input_require_grads()
177
+ self.language_model.print_trainable_parameters()
178
+
179
+ def forward(
180
+ self,
181
+ pixel_values: torch.FloatTensor,
182
+ input_ids: torch.LongTensor = None,
183
+ attention_mask: Optional[torch.Tensor] = None,
184
+ position_ids: Optional[torch.Tensor] = None,
185
+ image_flags: Optional[torch.LongTensor] = None,
186
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
187
+ labels: Optional[torch.LongTensor] = None,
188
+ use_cache: Optional[bool] = None,
189
+ output_attentions: Optional[bool] = None,
190
+ output_hidden_states: Optional[bool] = None,
191
+ return_dict: Optional[bool] = None,
192
+ statistics: Optional[torch.LongTensor] = None,
193
+ loss_weight: Optional[List] = None,
194
+ loss_reduction_all_gather: Optional[bool] = False,
195
+ origin_cu_seq_lens: Optional[torch.Tensor] = None,
196
+ rope_pos_id: Optional[torch.Tensor] = None,
197
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
198
+ # import ipdb
199
+ # ipdb.set_trace()
200
+ if isinstance(position_ids,list):
201
+ position_ids=torch.tensor(position_ids).to(input_ids.device)
202
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
203
+ # print("Printing decoded input ids")
204
+ # decoded_texts = [global_tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
205
+ # for i, text in enumerate(decoded_texts):
206
+ # print(f"Sample {i+1}: {text}")
207
+ global local_group
208
+ if self.group_list is not None:
209
+ for group_idx,group in enumerate(self.group_list):
210
+ if type(group)==torch.distributed.distributed_c10d.ProcessGroup:
211
+ # assert type(group)==torch.distributed.distributed_c10d.ProcessGroup
212
+ break # print("Printing decoded input ids")
213
+ local_group=group
214
+ else:
215
+ group=None
216
+ local_group=None
217
+ image_flags = image_flags.squeeze(-1)
218
+ input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
219
+ if self.attn_type:
220
+ if self.attn_type=='ring':
221
+ group_size = dist.get_world_size(group)
222
+ img_num_dim = 0
223
+ pad_num=0
224
+ if pixel_values.shape[img_num_dim] > group_size:
225
+ if pixel_values.shape[img_num_dim] % group_size!=0:
226
+ pad_num = group_size - pixel_values.shape[img_num_dim] % group_size
227
+ if pad_num < group_size: # only pad when padding is actually needed
228
+ # build a padding tensor whose shape matches pixel_values
229
+ pad_shape = list(pixel_values.shape)
230
+ pad_shape[img_num_dim] = pad_num # set the padding length on the target dimension
231
+ pad_pixel = torch.zeros(pad_shape, dtype=pixel_values.dtype, device=pixel_values.device)
232
+
233
+ # concatenate the original tensor and the padding tensor along that dimension
234
+ pixel_values = torch.cat([pixel_values, pad_pixel], dim=img_num_dim)
235
+
236
+ chunked_pixel=torch.chunk(pixel_values, group_size, dim=img_num_dim)
237
+ local_pixel=chunked_pixel[dist.get_rank(group)]
238
+ local_vit_embeds=self.extract_feature(local_pixel)
239
+ vit_embeds=GatherLayer.apply(local_vit_embeds)
240
+ vit_embeds=vit_embeds.view(-1,vit_embeds.shape[-2],vit_embeds.shape[-1])
241
+ if pad_num>0:
242
+ vit_embeds=vit_embeds[:-pad_num]
243
+ else:
244
+ vit_embeds = self.extract_feature(pixel_values)
245
+ else:
246
+ vit_embeds = self.extract_feature(pixel_values)
247
+ else:
248
+ vit_embeds = self.extract_feature(pixel_values)
249
+
250
+ if self.posid_type=='qkvLearnable':
251
+ # added_embeds = self.local_posid(torch.arange(self.num_image_token).to(pixel_values.device))
252
+ # vit_embeds = vit_embeds + added_embeds
253
+ vit_embeds=vit_embeds+self.local_posid(torch.arange(self.num_image_token).to(pixel_values.device))
254
+
255
+
256
+ vit_embeds = vit_embeds[image_flags == 1]
257
+ vit_batch_size = pixel_values.shape[0]
258
+ # print("Printing pixiel shape", pixel_values.shape)
259
+ B, N, C = input_embeds.shape
260
+ input_embeds = input_embeds.reshape(B * N, C)
261
+
262
+ if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
263
+ print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
264
+ if statistics is not None:
265
+ num_samples, num_padding_tokens, num_padding_images = statistics.tolist()
266
+ self.num_samples += num_samples
267
+ print(f'total_samples={self.num_samples}, {num_samples=}, {num_padding_tokens=}, {num_padding_images=}')
268
+ input_ids = input_ids.reshape(B * N)
269
+ selected = (input_ids == self.img_context_token_id)
270
+ try:
271
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
272
+ ignore_flag = False
273
+ except Exception as e:
274
+ vit_embeds = vit_embeds.reshape(-1, C)
275
+ print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
276
+ f'vit_embeds.shape={vit_embeds.shape}')
277
+ n_token = selected.sum()
278
+ input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]
279
+ # ignore_flag = True
280
+ ignore_flag = False
281
+
282
+ input_embeds = input_embeds.reshape(B, N, C)
283
+ if self.attn_type:
284
+ if self.attn_type=='ulysses':
285
+ input_embeds=extract_local2(input_embeds,dist.get_rank(group),dist.get_world_size(group))
286
+ position_ids=extract_local2(position_ids,dist.get_rank(group),dist.get_world_size(group))
287
+ labels=extract_local2(labels,dist.get_rank(group),dist.get_world_size(group))
288
+ loss_weight=extract_local2(torch.tensor(loss_weight),dist.get_rank(group),dist.get_world_size(group))
289
+ loss_weight=list(loss_weight.numpy())
290
+ attention_mask=attention_mask//dist.get_world_size(group)
291
+ elif self.attn_type=='ring':
292
+ input_embeds=extract_local(input_embeds,dist.get_rank(group),dist.get_world_size(group))
293
+ position_ids=extract_local(position_ids,dist.get_rank(group),dist.get_world_size(group))
294
+ labels=extract_local(labels,dist.get_rank(group),dist.get_world_size(group))
295
+ if loss_weight:
296
+ loss_weight=extract_local(torch.tensor(loss_weight),dist.get_rank(group),dist.get_world_size(group))
297
+ loss_weight=list(loss_weight.numpy())
298
+ attention_mask=attention_mask//dist.get_world_size(group)
299
+ outputs = self.language_model(
300
+ inputs_embeds=input_embeds,
301
+ attention_mask=attention_mask,
302
+ position_ids=position_ids,
303
+ past_key_values=past_key_values,
304
+ use_cache=use_cache,
305
+ output_attentions=output_attentions,
306
+ output_hidden_states=output_hidden_states,
307
+ return_dict=return_dict,
308
+ compress_seq=self.compress_seq,
309
+ group_list=self.group_list,
310
+ chunk_num=self.chunk_num,
311
+ origin_cu_seq_lens=origin_cu_seq_lens,
312
+ interaction=self.interaction,
313
+ selected=selected
314
+ )
315
+ logits = outputs.logits
316
+
317
+ loss = None
318
+ if labels is not None and loss_weight is not None:
319
+ # decoded_labels = global_tokenizer.decode(labels[0][labels[0]!=-100], skip_special_tokens=True)
320
+ loss_weight = torch.tensor(loss_weight, dtype=torch.float32, device=labels.device)
321
+ # Shift so that tokens < n predict n
322
+ shift_logits = logits[..., :-1, :].contiguous()
323
+ shift_labels = labels[..., 1:].contiguous()
324
+ shift_weights = loss_weight[..., 1:].contiguous()
325
+ # Flatten the tokens
326
+ loss_fct = CrossEntropyLoss(reduction='none')
327
+ shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
328
+ shift_labels = shift_labels.view(-1)
329
+ shift_weights = shift_weights.view(-1)
330
+ # Enable model parallelism
331
+ shift_labels = shift_labels.to(shift_logits.device)
332
+ shift_weights = shift_weights.to(shift_logits.device)
333
+ loss = loss_fct(shift_logits, shift_labels)
334
+
335
+ shift_weights_sum = shift_weights.sum()
336
+
337
+ if loss_reduction_all_gather:
338
+ dist.all_reduce(shift_weights_sum, op=dist.ReduceOp.AVG)
339
+
340
+ loss = loss * shift_weights
341
+ loss = loss.sum() / shift_weights_sum
342
+ if ignore_flag:
343
+ loss = loss * 0.0
344
+ elif labels is not None:
345
+ # Shift so that tokens < n predict n
346
+ shift_logits = logits[..., :-1, :].contiguous()
347
+ shift_labels = labels[..., 1:].contiguous()
348
+ # Flatten the tokens
349
+ loss_fct = CrossEntropyLoss()
350
+ shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
351
+ shift_labels = shift_labels.view(-1)
352
+ # Enable model parallelism
353
+ shift_labels = shift_labels.to(shift_logits.device)
354
+ loss = loss_fct(shift_logits, shift_labels)
355
+ if ignore_flag:
356
+ loss = loss * 0.0
357
+ params=dict(self.named_parameters())
358
+ if not return_dict:
359
+ output = (logits,) + outputs[1:]
360
+ return (loss,) + output if loss is not None else output
361
+
362
+ # self.update_log(log_dict)
363
+ return CausalLMOutputWithPast(
364
+ loss=loss,
365
+ logits=logits,
366
+ past_key_values=outputs.past_key_values,
367
+ hidden_states=outputs.hidden_states,
368
+ attentions=outputs.attentions,
369
+ )
370
+
371
+ def pixel_shuffle(self, x, scale_factor=0.5):
372
+ n, w, h, c = x.size()
373
+ # N, W, H, C --> N, W, H * scale, C // scale
374
+ x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
375
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
376
+ x = x.permute(0, 2, 1, 3).contiguous()
377
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
378
+ x = x.view(n, int(h * scale_factor), int(w * scale_factor),
379
+ int(c / (scale_factor * scale_factor)))
380
+ if self.ps_version == 'v1':
381
+ warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
382
+ 'which results in a transposed image.')
383
+ else:
384
+ x = x.permute(0, 2, 1, 3).contiguous()
385
+ return x
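
`pixel_shuffle` above trades spatial resolution for channel depth: with scale_factor 0.5 an [N, H, W, C] feature map becomes [N, H/2, W/2, 4C], cutting the number of visual tokens by 4x before the MLP projector. A quick sketch of the bookkeeping (shapes are illustrative):

import torch

n, h, w, c = 1, 32, 32, 1024
x = torch.randn(n, w, h, c)                # note the (N, W, H, C) layout used above
scale = 0.5
x = x.view(n, w, int(h * scale), int(c / scale))
x = x.permute(0, 2, 1, 3).contiguous()
x = x.view(n, int(h * scale), int(w * scale), int(c / (scale * scale)))
print(x.shape)                             # torch.Size([1, 16, 16, 4096])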
386
+
387
+ def extract_feature(self, pixel_values):
388
+ # select the output of a specific vision-model layer as the image feature
389
+ if self.select_layer == -1:
390
+ vit_embeds = self.vision_model(
391
+ pixel_values=pixel_values,
392
+ output_hidden_states=False,
393
+ return_dict=True).last_hidden_state
394
+ else:
395
+ vit_embeds = self.vision_model(
396
+ pixel_values=pixel_values,
397
+ output_hidden_states=True,
398
+ return_dict=True).hidden_states[self.select_layer]
399
+ # [batch_size, num_patches, vit_hidden_size]
400
+ # drop the first token (the CLS token)
401
+ vit_embeds = vit_embeds[:, 1:, :]
402
+
403
+ # [batch_size, num_patches, vit_hidden_size] -> [batch_size, h, w, vit_hidden_size]
404
+ h = w = int(vit_embeds.shape[1] ** 0.5)
405
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
406
+ # pixel shuffle: lower the spatial resolution to reduce num_patches
407
+ vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
408
+ vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
409
+ # projection layers, vit_hidden_size -> llm_hidden_size
410
+ vit_embeds = self.mlp1(vit_embeds)
411
+ return vit_embeds
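
Following `extract_feature` end to end with the values in this repo's config (force_image_size 448, downsample_ratio 0.5, and a 14-pixel ViT patch, the last being an assumption): each 448x448 tile gives (448/14)^2 = 1024 patch tokens, and pixel shuffling with ratio 0.5 reduces that to 1024 * 0.5^2 = 256 image tokens per tile, which is how `self.num_image_token` is computed in __init__. As a one-liner:

image_size, patch_size, downsample_ratio = 448, 14, 0.5  # patch_size is an assumed example value
num_image_token = int((image_size // patch_size) ** 2 * downsample_ratio ** 2)
print(num_image_token)  # 256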
412
+
413
+ def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
414
+ history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
415
+ IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
416
+ if history is not None or return_history:
417
+ print('Now multi-turn chat is not supported in batch_chat.')
418
+ raise NotImplementedError
419
+
420
+ if image_counts is not None:
421
+ num_patches_list = image_counts
422
+ print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
423
+
424
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
425
+ self.img_context_token_id = img_context_token_id
426
+
427
+ if verbose and pixel_values is not None:
428
+ image_bs = pixel_values.shape[0]
429
+ print(f'dynamic ViT batch size: {image_bs}')
430
+
431
+ queries = []
432
+ for idx, num_patches in enumerate(num_patches_list):
433
+ question = questions[idx]
434
+ if pixel_values is not None and '<image>' not in question:
435
+ question = '<image>\n' + question
436
+ template = get_conv_template(self.template)
437
+ template.append_message(template.roles[0], question)
438
+ template.append_message(template.roles[1], None)
439
+ query = template.get_prompt()
440
+
441
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
442
+ query = query.replace('<image>', image_tokens, 1)
443
+ queries.append(query)
444
+
445
+ # tokenizer.padding_side = 'left'
446
+ model_inputs = tokenizer(queries, return_tensors='pt', padding=False)
447
+ input_ids = model_inputs['input_ids'].cuda()
448
+ attention_mask = model_inputs['attention_mask'].cuda()
449
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
450
+ generation_config['eos_token_id'] = eos_token_id
451
+ generation_output = self.generate(
452
+ pixel_values=pixel_values,
453
+ input_ids=input_ids,
454
+ attention_mask=attention_mask,
455
+ **generation_config
456
+ )
457
+ responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
458
+ responses = [response.split(template.sep)[0].strip() for response in responses]
459
+ return responses
460
+
461
+ def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
462
+ num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
463
+ verbose=False,**kwargs):
464
+ if history is None and pixel_values is not None and '<image>' not in question:
465
+ question = '<image>\n' + question
466
+
467
+ # num_patches_list usage:
468
+ if num_patches_list is None:
469
+ num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
470
+ assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
471
+
472
+ # set the token id of the image context token
473
+ img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
474
+ self.img_context_token_id = img_context_token_id
475
+
476
+ # get the chat template
477
+ template = get_conv_template(self.template)
478
+ # set the system message
479
+ template.system_message = self.system_message
480
+ # set the end-of-sentence separator
481
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
482
+
483
+ # append the conversation history to the template
484
+ history = [] if history is None else history
485
+ for (old_question, old_answer) in history:
486
+ template.append_message(template.roles[0], old_question)
487
+ template.append_message(template.roles[1], old_answer)
488
+ template.append_message(template.roles[0], question)
489
+ template.append_message(template.roles[1], None)
490
+ # build the query prompt
491
+ query = template.get_prompt()
492
+
493
+ # verbose: whether to print debug information
494
+ if verbose and pixel_values is not None:
495
+ # pixel_values shape: [batch_size, channels, height, width]
496
+ # where batch_size is the number of image tiles
497
+ # print the dynamic ViT batch size
498
+ image_bs = pixel_values.shape[0]
499
+ print(f'dynamic ViT batch size: {image_bs}')
500
+
501
+ # insert image tokens into the query; each image is represented by IMG_CONTEXT_TOKEN placeholders
502
+ for num_patches in num_patches_list:
503
+ image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
504
+ query = query.replace('<image>', image_tokens, 1)
505
+
506
+ # tokenize the query into model inputs
507
+ model_inputs = tokenizer(query, return_tensors='pt')
508
+ # token ids of the text, moved to CUDA
509
+ # the id length equals the token length; shape is [1, sequence_length]
510
+ input_ids = model_inputs['input_ids'].cuda()
511
+ # print(f'Token length: {input_ids.shape[1]}')
512
+ # mask is 1 for real input tokens and 0 for padding
513
+ attention_mask = model_inputs['attention_mask'].cuda()
514
+ # end-of-sentence separator
515
+ generation_config['eos_token_id'] = eos_token_id
516
+ if 'rope_pos_id_version' in kwargs:
517
+ self.language_model.rope_pos_id_version=kwargs['rope_pos_id_version']
518
+ pos_ids=[]
519
+ ret={'input_ids':input_ids,'attention_mask':attention_mask}
520
+ for i in range(input_ids.shape[0]):
521
+ # cur_position_ids = ret['attention_mask'][i].long().cumsum(-1) - 1
522
+ # cur_position_ids.masked_fill_(ret['attention_mask'][i] == 0, 1)
523
+
524
+ if kwargs['rope_pos_id_version'] == 'default':
525
+ cur_dtype = torch.long
526
+ # casting bf16 -> long would truncate
527
+ else:
528
+ cur_dtype = torch.float32
529
+
530
+ if 'rope_pos_id_stride' in kwargs:
531
+ rope_pos_id_stride = kwargs['rope_pos_id_stride']
532
+ else:
533
+ rope_pos_id_stride = None
534
+
535
+ pos_ids.append(torch.tensor(get_rope_pos_id(ret, num_tiles=kwargs['num_tiles'][i], dtype=cur_dtype,
536
+ rope_pos_id_version=kwargs['rope_pos_id_version'],
537
+ position_id=torch.arange(0,input_ids.shape[1]),
538
+ # position_id=cur_position_ids,
539
+ boxes=kwargs['all_boxes'][i],
540
+ orig_size=None,
541
+ images=kwargs['image_list'][i],
542
+ IMG_START_TOKEN=IMG_START_TOKEN,
543
+ IMG_END_TOKEN=IMG_END_TOKEN, rope_pos_id_stride=rope_pos_id_stride)).cuda())
544
+
545
+ pos_ids=torch.stack(pos_ids)
546
+ if self.attn_type=='ulysses' or self.attn_type=='ring':
547
+ if input_ids.shape[1]%(2*dist.get_world_size())!=0:
548
+ num_padding = 2*dist.get_world_size()-input_ids.shape[1]%(2*dist.get_world_size())
549
+ # create the required padding for input_ids and attention_mask
550
+ padding_shape = (input_ids.shape[0], num_padding)
551
+ input_padding = torch.full(padding_shape, 1, dtype=input_ids.dtype, device=input_ids.device)
552
+ attn_mask_padding = torch.full(padding_shape, 1, dtype=attention_mask.dtype, device=attention_mask.device)
553
+ # pad input_ids and attention_mask
554
+ input_ids = torch.cat([input_ids, input_padding], dim=1)
555
+ attention_mask=torch.cat([attention_mask,attn_mask_padding],dim=1)
556
+ # extend position_ids with correctly increasing padding values
557
+ max_pos_id = pos_ids.max() + 1 # first id after the current maximum position_id
558
+ pos_padding = torch.arange(max_pos_id, max_pos_id + num_padding, device=input_ids.device)
559
+ pos_padding = pos_padding.unsqueeze(0).expand(input_ids.shape[0], -1)
560
+ pos_ids = torch.cat([pos_ids, pos_padding], dim=1)
561
+ generation_output = self.generate(
562
+ pixel_values=pixel_values,
563
+ input_ids=input_ids,
564
+ attention_mask=attention_mask,
565
+ position_ids=pos_ids,
566
+ **generation_config,
567
+ )
568
+ else:
569
+ self.language_model.rope_pos_id_version='default'
570
+ if self.attn_type=='ulysses' or self.attn_type=='ring':
571
+ if input_ids.shape[1]%(2*dist.get_world_size())!=0:
572
+ num_padding = 2*dist.get_world_size()-input_ids.shape[1]%(2*dist.get_world_size())
573
+ # create the required padding for input_ids and attention_mask
574
+ padding_shape = (input_ids.shape[0], num_padding)
575
+ input_padding = torch.full(padding_shape, 1, dtype=input_ids.dtype, device=input_ids.device)
576
+ attn_mask_padding = torch.full(padding_shape, 0, dtype=attention_mask.dtype, device=attention_mask.device)
577
+ # pad input_ids and attention_mask
578
+ input_ids = torch.cat([input_ids, input_padding], dim=1)
579
+ attention_mask=torch.cat([attention_mask,attn_mask_padding],dim=1)
580
+ generation_output = self.generate(
581
+ pixel_values=pixel_values,
582
+ input_ids=input_ids,
583
+ attention_mask=attention_mask,
584
+ **generation_config,
585
+ )
586
+ # decode the generated output, skipping special tokens
587
+ response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
588
+ # keep only the text before the separator
589
+ response = response.split(template.sep)[0].strip()
590
+ # append the turn to the history
591
+ history.append((question, response))
592
+ if return_history:
593
+ return response, history
594
+ else:
595
+ query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
596
+ query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
597
+ if verbose:
598
+ print(query_to_print, response)
599
+ return response
600
+
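The ulysses/ring branches above pad every request so the sequence length is divisible by 2 * world_size before sequence-parallel generation. A hedged sketch of just that length rule (helper name is ours):

def pad_len_for_sequence_parallel(seq_len: int, world_size: int) -> int:
    # right-padding needed so that seq_len becomes a multiple of 2 * world_size
    block = 2 * world_size
    return 0 if seq_len % block == 0 else block - seq_len % block

assert pad_len_for_sequence_parallel(1023, 4) == 1   # 1023 -> 1024
assert pad_len_for_sequence_parallel(1024, 4) == 0   # already aligned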
601
+ @torch.no_grad()
602
+ def generate(
603
+ self,
604
+ pixel_values: Optional[torch.FloatTensor] = None,
605
+ input_ids: Optional[torch.FloatTensor] = None,
606
+ attention_mask: Optional[torch.LongTensor] = None,
607
+ visual_features: Optional[torch.FloatTensor] = None,
608
+ generation_config: Optional[GenerationConfig] = None,
609
+ output_hidden_states: Optional[bool] = None,
610
+ return_dict: Optional[bool] = None,
611
+ **generate_kwargs,
612
+ ) -> torch.LongTensor:
613
+ assert self.img_context_token_id is not None
614
+ if pixel_values is not None:
615
+ # extract image embeddings
616
+ # [batch_size, channels, height, width] -> [batch_size, patches_per_image, embedding_dim]
617
+ if visual_features is not None:
618
+ vit_embeds = visual_features
619
+ else:
620
+ vit_embeds = self.extract_feature(pixel_values)
621
+ if self.posid_type=='qkvLearnable':
622
+ added_embeds = self.local_posid(torch.arange(self.num_image_token).to(pixel_values.device))
623
+ vit_embeds = vit_embeds + added_embeds
624
+ # vit_embeds=vit_embeds+self.local_posid(torch.arange(self.num_image_token).to(pixel_values.device))
625
+ # map token ids to embeddings via the embedding layer
626
+ # image positions initially hold the IMG_CONTEXT_TOKEN placeholder embedding
627
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
628
+ # [1, sequence_length, embedding_dim] -> [sequence_length, embedding_dim]
629
+ B, N, C = input_embeds.shape
630
+ input_embeds = input_embeds.reshape(B * N, C)
631
+
632
+ # [1, sequence_length] -> [sequence_length]
633
+ input_ids = input_ids.reshape(B * N)
634
+
635
+ selected = (input_ids == self.img_context_token_id)
636
+ assert selected.sum() != 0
637
+
638
+ # image embeddings: [total_patches, embedding_dim]
639
+ # each patch corresponds to one placeholder token, i.e. one embedding row
640
+ input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
641
+
642
+ input_embeds = input_embeds.reshape(B, N, C)
643
+ else:
644
+ # map token ids to embeddings via the embedding layer
646
+ # (conceptually like one-hot, Word2Vec, GloVe, or FastText embeddings)
647
+ # the embedding layer is a lookup table
647
+ # [1, sequence_length] -> [1, sequence_length, embedding_dim]
648
+ input_embeds = self.language_model.get_input_embeddings()(input_ids)
649
+ # find the positions of the image placeholders
650
+ if 'position_ids' in generate_kwargs:
651
+ pos_id=generate_kwargs['position_ids']
652
+ if self.attn_type:
653
+ if self.attn_type=='ulysses':
654
+ input_embeds=extract_local2(input_embeds,dist.get_rank(),dist.get_world_size())
655
+ attention_mask=extract_local2(attention_mask,dist.get_rank(),dist.get_world_size())
656
+ pos_id=extract_local2(pos_id,dist.get_rank(),dist.get_world_size())
657
+ elif self.attn_type=='ring':
658
+ former_shape = input_embeds.shape
659
+ input_embeds=extract_local(input_embeds,dist.get_rank(),dist.get_world_size())
660
+ attention_mask=extract_local(attention_mask,dist.get_rank(),dist.get_world_size())
661
+ pos_id=extract_local(pos_id,dist.get_rank(),dist.get_world_size())
662
+ generate_kwargs['position_ids']=pos_id
663
+
664
+ else:
665
+ if self.attn_type:
666
+ if self.attn_type=='ulysses':
667
+ input_embeds=extract_local2(input_embeds,dist.get_rank(),dist.get_world_size())
668
+ attention_mask=extract_local2(attention_mask,dist.get_rank(),dist.get_world_size())
669
+ elif self.attn_type=='ring':
670
+ former_shape = input_embeds.shape
671
+ input_embeds=extract_local(input_embeds,dist.get_rank(),dist.get_world_size())
672
+ attention_mask=extract_local(attention_mask,dist.get_rank(),dist.get_world_size())
673
+
674
+ outputs = self.language_model.generate(
675
+ inputs_embeds=input_embeds,
676
+ attention_mask=attention_mask,
677
+ generation_config=generation_config,
678
+ output_hidden_states=output_hidden_states,
679
+ return_dict=return_dict,
680
+ use_cache=True,
681
+ **generate_kwargs,
682
+ )
683
+
684
+ return outputs
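Restating the placeholder substitution inside generate() in isolation, under the assumption that exactly one visual embedding exists per <IMG_CONTEXT> token (function name is ours):

import torch

def splice_visual_tokens(input_embeds, input_ids, vit_embeds, img_context_token_id):
    # input_embeds: [B, N, C]; vit_embeds: [num_visual_tokens, C], already flattened
    B, N, C = input_embeds.shape
    flat = input_embeds.reshape(B * N, C)
    selected = (input_ids.reshape(B * N) == img_context_token_id)
    assert selected.sum().item() == vit_embeds.shape[0], 'one visual embedding per placeholder'
    flat[selected] = vit_embeds.to(flat.dtype)
    return flat.reshape(B, N, C)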
685
+ def update_log(self, new_log_dict):
686
+ if not hasattr(self, 'log_dict'):
687
+ self.log_dict = {}
688
+ for key, value in new_log_dict.items():
689
+ if 'loss' in key:
690
+ if key not in self.log_dict:
691
+ self.log_dict[key] = value
692
+ else:
693
+ self.log_dict[key] += value
694
+ else:
695
+ # just copy it
696
+ self.log_dict[key] = value
697
+
698
+ def get_rope_pos_id(ret, num_tiles, dtype, rope_pos_id_version='default', position_id=None,boxes=None, orig_size=None,images=None,IMG_START_TOKEN='<img>',IMG_END_TOKEN='</img>',rope_pos_id_stride=None):
699
+ image_start_token_id = global_tokenizer.convert_tokens_to_ids(IMG_START_TOKEN)
700
+ image_end_token_id = global_tokenizer.convert_tokens_to_ids(IMG_END_TOKEN)
701
+ num_image_token=256
702
+ rope_pos_id_list = []
703
+
704
+ input_ids_0 = ret['input_ids'][0]
705
+ attention_mask_0 = ret['attention_mask'][0]
706
+ image_start_token_id_idxs = torch.where(input_ids_0 == image_start_token_id)[0]
707
+ image_end_token_id_idxs = torch.where(input_ids_0 == image_end_token_id)[0]
708
+
709
+ last_record_pos_id = -1
710
+ start_index = 0
711
+ for i in range(len(image_start_token_id_idxs)):
712
+ # use each occurrence of IMG_START_TOKEN in the sequence to locate the image id span to process
713
+ # note: IMG_START_TOKEN and IMG_END_TOKEN themselves are handled the same way as text tokens
714
+ box = boxes[i]
715
+ image = images[i]
716
+
717
+ rope_pos_id_pre = attention_mask_0[start_index:image_start_token_id_idxs[i] + 1].long().cumsum(-1) - 1 + (last_record_pos_id + 1) # continue counting from the last global id of the processed prefix
718
+ rope_pos_id_pre.masked_fill_(attention_mask_0[start_index:image_start_token_id_idxs[i] + 1] == 0, 1)
719
+ rope_pos_id_list.append(rope_pos_id_pre)
720
+
721
+ last_record_pos_id = rope_pos_id_pre[-1].long()
722
+
723
+ num_tile = num_tiles[i]
724
+ num_sub_imgs = num_tile - 1
725
+ is_last = (i == len(image_start_token_id_idxs) - 1)
726
+
727
+ if rope_pos_id_version == 'v0':
728
+ # sub-images use fractional ids; regardless of how many sub-images there are, together they span 1 global id, and the thumbnail gets its own separate span of 1 global id. Example:
729
+ # start_id = 100; 100 - 101 shared by 4 * 256 tokens (4 sub-images); 101 - 102 for the 256 thumbnail tokens
730
+ if num_sub_imgs > 0:
731
+ split_img_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + 1, (num_tile - 1) * num_image_token + 1)[1:].to(dtype=dtype) # fractional-valued tensor used as the values to rearrange
732
+ origin_split_img_id_idxs = split_img_id_idxs
733
+ ############################## rearrange positions ##############################
734
+ # first compute the indices for the first sub-image
735
+ rearange_idx_list = []
736
+ rearange_idx_list_list = []
737
+ base_index_list = []
738
+ num_img_token_in_length = int(num_image_token ** 0.5)
739
+ num_patch_width = int(box[-1][2] // box[0][2])
740
+ num_patch_height = int(box[-1][3] // box[0][2])
741
+ assert num_patch_width * num_patch_height == len(box)
742
+
743
+ num_total_patch_width_token = num_patch_width * num_img_token_in_length
744
+ num_total_patch_height_token = num_patch_height * num_img_token_in_length
745
+ assert num_total_patch_width_token * num_total_patch_height_token == num_sub_imgs * num_image_token, (num_total_patch_width_token * num_total_patch_height_token, num_sub_imgs * num_image_token)
746
+
747
+ for k in range(num_image_token):
748
+ map_idx = (k // num_img_token_in_length) * num_total_patch_width_token + (k % num_img_token_in_length)
749
+ base_index_list.append(map_idx)
750
+
751
+ # compute each remaining sub-image's offset relative to the first
752
+
753
+ for k in range(num_sub_imgs):
754
+ patch_row = k // num_patch_width
755
+ patch_col = k % num_patch_width
756
+ offset = patch_row * (num_image_token * num_patch_width) + patch_col * num_img_token_in_length
757
+ # print(f'{k=}, {offset=}')
758
+ dst_index_list = [base_index + offset for base_index in base_index_list]
759
+ rearange_idx_list.extend(dst_index_list)
760
+ rearange_idx_list_list.append(dst_index_list)
761
+
762
+ ############################## plot-based sanity check ##############################
763
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
764
+ # zip(image[:-1], box, rearange_idx_list_list)]
765
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, None)
766
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
767
+ # zip(image[:-1], box, rearange_idx_list_list)]
768
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, split_img_id_idxs)
769
+
770
+ ############################## rearrange ##############################
771
+ split_img_id_idxs = split_img_id_idxs[rearange_idx_list]
772
+
773
+ rope_pos_id_list.append(split_img_id_idxs)
774
+ thumbnail_id_idxs = origin_split_img_id_idxs.reshape([num_image_token, -1]).to(dtype=dtype).mean(dim=1).view(-1)
775
+ rope_pos_id_list.append(thumbnail_id_idxs)
776
+ last_record_pos_id = origin_split_img_id_idxs[-1].long()
777
+ else:
778
+ thumbnail_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + 1,
779
+ num_image_token + 1)[1:].to(dtype=dtype) # thumbnail
780
+ rope_pos_id_list.append(thumbnail_id_idxs)
781
+ last_record_pos_id = (last_record_pos_id + 1).long()
782
+
783
+ # check that the ids can be recovered as an arithmetic sequence
784
+ if num_tile > 1:
785
+ gt_pos_id = torch.linspace(last_record_pos_id - 2, last_record_pos_id - 1, (num_tile - 1) * num_image_token + 1)[1:].to(dtype=dtype)
786
+ # self.eval_posid_by_rearange(box, rope_pos_id_list, gt_pos_id, num_tile, dtype, is_last)
787
+
788
+ elif rope_pos_id_version == 'v1':
789
+ # 子图为小数,若有 N 个子图,其分配的总 global id 跨度为 N;缩略图单独分配完整的,跨度为 1的 global id. Example:
790
+ # start_id = 100; 100 - 104 (分给 4 * 256),子图数目为4; 104 - 105 (分给 256) 缩略图
791
+ if num_sub_imgs > 0:
792
+ split_img_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + num_tile - 1, (num_tile - 1) * num_image_token + 1)[1:].to(dtype=dtype) # fractional-valued tensor used as the values to rearrange
793
+ origin_split_img_id_idxs = split_img_id_idxs
794
+ ############################## rearrange positions ##############################
795
+ # first compute the indices for the first sub-image
796
+ rearange_idx_list = []
797
+ rearange_idx_list_list = []
798
+ base_index_list = []
799
+ # rearange_split_img_id_idxs_list = []
800
+ num_img_token_in_length = int(num_image_token ** 0.5)
801
+ num_patch_width = int(box[-1][2] // box[0][2])
802
+ num_patch_height = int(box[-1][3] // box[0][2])
803
+ assert num_patch_width * num_patch_height == len(box)
804
+
805
+ num_total_patch_width_token = num_patch_width * num_img_token_in_length
806
+ num_total_patch_height_token = num_patch_height * num_img_token_in_length
807
+ assert num_total_patch_width_token * num_total_patch_height_token == num_sub_imgs * num_image_token, (
808
+ num_total_patch_width_token * num_total_patch_height_token, num_sub_imgs * num_image_token)
809
+
810
+ for k in range(num_image_token):
811
+ map_idx = (k // num_img_token_in_length) * num_total_patch_width_token + (
812
+ k % num_img_token_in_length)
813
+ base_index_list.append(map_idx)
814
+
815
+ # compute each remaining sub-image's offset relative to the first
816
+
817
+ for k in range(num_sub_imgs):
818
+ patch_row = k // num_patch_width
819
+ patch_col = k % num_patch_width
820
+ offset = patch_row * (
821
+ num_image_token * num_patch_width) + patch_col * num_img_token_in_length
822
+ # print(f'{k=}, {offset=}')
823
+ dst_index_list = [base_index + offset for base_index in base_index_list]
824
+ rearange_idx_list.extend(dst_index_list)
825
+ rearange_idx_list_list.append(dst_index_list)
826
+ # rearange_split_img_id_idxs_list.append(split_img_id_idxs[dst_index_list])
827
+
828
+ ############################## plot-based sanity check ##############################
829
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in zip(image[:-1], box, rearange_idx_list_list)]
830
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, None)
831
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in zip(image[:-1], box, rearange_idx_list_list)]
832
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, split_img_id_idxs)
833
+
834
+ ############################## rearrange ##############################
835
+ split_img_id_idxs = split_img_id_idxs[rearange_idx_list]
836
+
837
+ rope_pos_id_list.append(split_img_id_idxs)
838
+ # thumbnail_id_idxs = torch.linspace(last_record_pos_id + 1, last_record_pos_id + 2, num_image_token + 1)[1:].to(dtype=dtype) # thumbnail
839
+ thumbnail_id_idxs = origin_split_img_id_idxs.reshape([num_image_token, -1]).to(dtype=dtype).mean(dim=1).view(-1)
840
+ rope_pos_id_list.append(thumbnail_id_idxs)
841
+ last_record_pos_id = origin_split_img_id_idxs[-1].long()
842
+ else:
843
+ thumbnail_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + 1, num_image_token + 1)[1:].to(dtype=dtype) # thumbnail
844
+ rope_pos_id_list.append(thumbnail_id_idxs)
845
+ last_record_pos_id = (last_record_pos_id + 1).long()
846
+
847
+ # check that the ids can be recovered as an arithmetic sequence
848
+ if num_tile > 1:
849
+ gt_pos_id = torch.linspace(last_record_pos_id - 1 - (num_tile - 1), last_record_pos_id - 1, (num_tile - 1) * num_image_token + 1)[1:].to(dtype=dtype)
850
+ # self.eval_posid_by_rearange(box, rope_pos_id_list, gt_pos_id, num_tile, dtype)
851
+
852
+ elif rope_pos_id_version == 'v2':
853
+ # sub-images are handled like text (N sub-images get N * 256 global ids); the thumbnail is assigned 256 * N global ids.
854
+ # as in v0 and v1, the sub-image global ids are also rearranged according to their spatial layout
855
+ if num_sub_imgs > 0:
856
+ split_img_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + num_sub_imgs * num_image_token, num_sub_imgs * num_image_token + 1)[1:].long() # integer (long) tensor used as the values to rearrange
857
+ last_id_for_split_img = last_record_pos_id + num_sub_imgs * num_image_token
858
+ origin_split_img_id_idxs = split_img_id_idxs
859
+ ############################## rearrange positions ##############################
860
+ # first compute the indices for the first sub-image
861
+ rearange_idx_list = []
862
+ rearange_idx_list_list = []
863
+ base_index_list = []
864
+ # rearange_split_img_id_idxs_list = []
865
+ num_img_token_in_length = int(num_image_token ** 0.5)
866
+ num_patch_width = int(box[-1][2] // box[0][2])
867
+ num_patch_height = int(box[-1][3] // box[0][2])
868
+ assert num_patch_width * num_patch_height == len(box)
869
+
870
+ num_total_patch_width_token = num_patch_width * num_img_token_in_length
871
+ num_total_patch_height_token = num_patch_height * num_img_token_in_length
872
+ assert num_total_patch_width_token * num_total_patch_height_token == num_sub_imgs * num_image_token, (
873
+ num_total_patch_width_token * num_total_patch_height_token, num_sub_imgs * num_image_token)
874
+
875
+ for k in range(num_image_token):
876
+ map_idx = (k // num_img_token_in_length) * num_total_patch_width_token + (
877
+ k % num_img_token_in_length)
878
+ base_index_list.append(map_idx)
879
+
880
+ # compute each remaining sub-image's offset relative to the first
881
+
882
+ for k in range(num_sub_imgs):
883
+ patch_row = k // num_patch_width
884
+ patch_col = k % num_patch_width
885
+ offset = patch_row * (
886
+ num_image_token * num_patch_width) + patch_col * num_img_token_in_length
887
+ # print(f'{k=}, {offset=}')
888
+ dst_index_list = [base_index + offset for base_index in base_index_list]
889
+ rearange_idx_list.extend(dst_index_list)
890
+ rearange_idx_list_list.append(dst_index_list)
891
+ # rearange_split_img_id_idxs_list.append(split_img_id_idxs[dst_index_list])
892
+
893
+ ############################## plot-based sanity check ##############################
894
+
895
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
896
+ # zip(image[:-1], box, rearange_idx_list_list)]
897
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, None)
898
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
899
+ # zip(image[:-1], box, rearange_idx_list_list)]
900
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, split_img_id_idxs)
901
+
902
+ ############################## rearrange ##############################
903
+ split_img_id_idxs = split_img_id_idxs[rearange_idx_list]
904
+
905
+ rope_pos_id_list.append(split_img_id_idxs)
906
+ thumbnail_id_idxs = origin_split_img_id_idxs.reshape([num_image_token, -1]).to(dtype=dtype).mean(dim=1).view(-1)
907
+ rope_pos_id_list.append(thumbnail_id_idxs)
908
+ last_record_pos_id = origin_split_img_id_idxs[-1].long()
909
+ else:
910
+ thumbnail_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + num_image_token, num_image_token + 1)[1:].long() # thumbnail, handled the same as default
911
+ rope_pos_id_list.append(thumbnail_id_idxs)
912
+ last_record_pos_id = thumbnail_id_idxs[-1].long()
913
+
914
+ # check that the ids can be recovered as an arithmetic sequence
915
+ if num_tile > 1:
916
+ gt_pos_id = torch.linspace(last_id_for_split_img - num_image_token * num_sub_imgs,
917
+ last_id_for_split_img,
918
+ num_sub_imgs * num_image_token + 1)[1:].long()
919
+ # self.eval_posid_by_rearange(box, rope_pos_id_list, gt_pos_id, num_tile, gt_pos_id.dtype)
920
+
921
+ elif rope_pos_id_version == 'v3':
922
+ # the N sub-images share a global-id span of 256; the thumbnail gets its usual 256 global ids
923
+ if num_sub_imgs > 0:
924
+ split_img_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + num_image_token, num_sub_imgs * num_image_token + 1)[1:].to(dtype=dtype) # fractional-valued tensor used as the values to rearrange
925
+ origin_split_img_id_idxs = split_img_id_idxs
926
+ ############################## rearrange positions ##############################
927
+ # first compute the indices for the first sub-image
928
+ rearange_idx_list = []
929
+ rearange_idx_list_list = []
930
+ base_index_list = []
931
+ # rearange_split_img_id_idxs_list = []
932
+ num_img_token_in_length = int(num_image_token ** 0.5)
933
+ num_patch_width = int(box[-1][2] // box[0][2])
934
+ num_patch_height = int(box[-1][3] // box[0][2])
935
+ assert num_patch_width * num_patch_height == len(box)
936
+
937
+ num_total_patch_width_token = num_patch_width * num_img_token_in_length
938
+ num_total_patch_height_token = num_patch_height * num_img_token_in_length
939
+ assert num_total_patch_width_token * num_total_patch_height_token == num_sub_imgs * num_image_token, (
940
+ num_total_patch_width_token * num_total_patch_height_token, num_sub_imgs * num_image_token)
941
+
942
+ for k in range(num_image_token):
943
+ map_idx = (k // num_img_token_in_length) * num_total_patch_width_token + (
944
+ k % num_img_token_in_length)
945
+ base_index_list.append(map_idx)
946
+
947
+ # compute each remaining sub-image's offset relative to the first
948
+
949
+ for k in range(num_sub_imgs):
950
+ patch_row = k // num_patch_width
951
+ patch_col = k % num_patch_width
952
+ offset = patch_row * (
953
+ num_image_token * num_patch_width) + patch_col * num_img_token_in_length
954
+ # print(f'{k=}, {offset=}')
955
+ dst_index_list = [base_index + offset for base_index in base_index_list]
956
+ rearange_idx_list.extend(dst_index_list)
957
+ rearange_idx_list_list.append(dst_index_list)
958
+ # rearange_split_img_id_idxs_list.append(split_img_id_idxs[dst_index_list])
959
+
960
+ ############################## plot-based sanity check ##############################
961
+
962
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
963
+ # zip(image[:-1], box, rearange_idx_list_list)]
964
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, None)
965
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
966
+ # zip(image[:-1], box, rearange_idx_list_list)]
967
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, split_img_id_idxs)
968
+
969
+ ############################## rearrange ##############################
970
+ split_img_id_idxs = split_img_id_idxs[rearange_idx_list]
971
+
972
+ rope_pos_id_list.append(split_img_id_idxs)
973
+ thumbnail_id_idxs = origin_split_img_id_idxs.reshape([num_image_token, -1]).to(dtype=dtype).mean(dim=1).view(-1)
974
+ rope_pos_id_list.append(thumbnail_id_idxs)
975
+ last_record_pos_id = origin_split_img_id_idxs[-1].long()
976
+ else:
977
+ thumbnail_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + num_image_token, num_image_token + 1)[1:].to(dtype=dtype) # thumbnail, handled the same as default
978
+ rope_pos_id_list.append(thumbnail_id_idxs)
979
+ last_record_pos_id = thumbnail_id_idxs[-1].to(dtype=dtype)
980
+
981
+ # check that the ids can be recovered as an arithmetic sequence
982
+ if num_tile > 1:
983
+ gt_pos_id = torch.linspace(last_record_pos_id - num_image_token - num_image_token,
984
+ last_record_pos_id - num_image_token,
985
+ num_sub_imgs * num_image_token + 1)[1:].to(dtype=dtype)
986
+ # self.eval_posid_by_rearange(box, rope_pos_id_list, gt_pos_id, num_tile, gt_pos_id.dtype)
987
+
988
+ elif rope_pos_id_version == 'v4':
989
+ # the stride is configurable
990
+ assert rope_pos_id_stride is not None, 'when rope_pos_id_version == v4, rope_pos_id_stride should not be None'
991
+ if num_sub_imgs > 0:
992
+ num_sub_image_tokens = num_image_token * num_sub_imgs
993
+ split_img_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + rope_pos_id_stride, num_sub_imgs * num_image_token + 1)[1:].to(dtype=dtype) # fractional-valued tensor used as the values to rearrange
994
+ assert len(split_img_id_idxs) == num_sub_image_tokens
995
+ origin_split_img_id_idxs = split_img_id_idxs
996
+ ############################## rearrange positions ##############################
997
+ # first compute the indices for the first sub-image
998
+ rearange_idx_list = []
999
+ rearange_idx_list_list = []
1000
+ base_index_list = []
1001
+ # rearange_split_img_id_idxs_list = []
1002
+ num_img_token_in_length = int(num_image_token ** 0.5)
1003
+ num_patch_width = int(box[-1][2] // box[0][2])
1004
+ num_patch_height = int(box[-1][3] // box[0][2])
1005
+ assert num_patch_width * num_patch_height == len(box)
1006
+
1007
+ num_total_patch_width_token = num_patch_width * num_img_token_in_length
1008
+ num_total_patch_height_token = num_patch_height * num_img_token_in_length
1009
+ assert num_total_patch_width_token * num_total_patch_height_token == num_sub_imgs * num_image_token, (
1010
+ num_total_patch_width_token * num_total_patch_height_token, num_sub_imgs * num_image_token)
1011
+
1012
+ for k in range(num_image_token):
1013
+ map_idx = (k // num_img_token_in_length) * num_total_patch_width_token + (
1014
+ k % num_img_token_in_length)
1015
+ base_index_list.append(map_idx)
1016
+
1017
+ # compute each remaining sub-image's offset relative to the first
1018
+
1019
+ for k in range(num_sub_imgs):
1020
+ patch_row = k // num_patch_width
1021
+ patch_col = k % num_patch_width
1022
+ offset = patch_row * (num_image_token * num_patch_width) + patch_col * num_img_token_in_length
1023
+ # print(f'{k=}, {offset=}')
1024
+ dst_index_list = [base_index + offset for base_index in base_index_list]
1025
+ rearange_idx_list.extend(dst_index_list)
1026
+ rearange_idx_list_list.append(dst_index_list)
1027
+ # rearange_split_img_id_idxs_list.append(split_img_id_idxs[dst_index_list])
1028
+
1029
+ ############################## plot-based sanity check ##############################
1030
+
1031
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
1032
+ # zip(image[:-1], box, rearange_idx_list_list)]
1033
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, None)
1034
+ # img_boxes = [(deepcopy(img), cur_box, cur_posid) for img, cur_box, cur_posid in
1035
+ # zip(image[:-1], box, rearange_idx_list_list)]
1036
+ # self.eval_posid_by_plot(img_boxes, rope_pos_id_version, split_img_id_idxs)
1037
+
1038
+ ############################## rearrange ##############################
1039
+ split_img_id_idxs = split_img_id_idxs[rearange_idx_list]
1040
+
1041
+ rope_pos_id_list.append(split_img_id_idxs)
1042
+ thumbnail_id_idxs = origin_split_img_id_idxs.reshape([num_image_token, -1]).to(dtype=dtype).mean(dim=1).view(-1)
1043
+ rope_pos_id_list.append(thumbnail_id_idxs)
1044
+ last_record_pos_id = origin_split_img_id_idxs[-1].long()
1045
+ else:
1046
+ thumbnail_id_idxs = torch.linspace(last_record_pos_id, last_record_pos_id + num_image_token, num_image_token + 1)[1:].to(dtype=dtype) # thumbnail, handled the same as default
1047
+ rope_pos_id_list.append(thumbnail_id_idxs)
1048
+ last_record_pos_id = thumbnail_id_idxs[-1].to(dtype=dtype)
1049
+
1050
+ elif rope_pos_id_version == 'v5':
1051
+ assert rope_pos_id_stride is not None, 'when rope_pos_id_version == v5, rope_pos_id_stride should not be None'
1052
+ small_stride = rope_pos_id_stride / num_image_token
1053
+ # split_img_id_idxs = torch.arange(last_record_pos_id, last_record_pos_id + small_stride * (num_image_token * num_tile + 1), small_stride)[1:].to(dtype=dtype)
1054
+ split_img_id_idxs = torch.linspace(last_record_pos_id,last_record_pos_id+small_stride*(num_image_token * num_tile ),(num_image_token * num_tile + 1))[1:].to(dtype=dtype)
1055
+ rope_pos_id_list.append(split_img_id_idxs)
1056
+ last_record_pos_id = torch.ceil(split_img_id_idxs[-1]).long()
1057
+ elif rope_pos_id_version == 'v6':
1058
+ random_from=[1,2,4,8,16,32,64,128,256]
1059
+ rope_pos_id_stride=random.choice(random_from)
1060
+ small_stride = rope_pos_id_stride / num_image_token
1061
+ # split_img_id_idxs = torch.arange(last_record_pos_id, last_record_pos_id + small_stride * (num_image_token * num_tile + 1), small_stride)[1:].to(dtype=dtype)
1062
+ split_img_id_idxs = torch.linspace(last_record_pos_id,last_record_pos_id+small_stride*(num_image_token * num_tile ),(num_image_token * num_tile + 1))[1:].to(dtype=dtype)
1063
+ rope_pos_id_list.append(split_img_id_idxs)
1064
+ last_record_pos_id = torch.ceil(split_img_id_idxs[-1]).long()
1065
+ elif rope_pos_id_version == 'default':
1066
+ # baseline
1067
+ # no special handling of image position ids
1068
+ split_img_id_idxs = torch.linspace(last_record_pos_id,
1069
+ last_record_pos_id + (num_tile - 1) * num_image_token,
1070
+ (num_tile - 1) * num_image_token + 1)[1:].to(dtype=dtype) # sub-images
1071
+ rope_pos_id_list.append(split_img_id_idxs)
1072
+ thumbnail_id_idxs = torch.linspace(last_record_pos_id + (num_tile - 1) * num_image_token,
1073
+ last_record_pos_id + num_tile * num_image_token,
1074
+ num_image_token + 1)[1:].to(dtype=dtype) # thumbnail
1075
+ rope_pos_id_list.append(thumbnail_id_idxs)
1076
+ last_record_pos_id = (last_record_pos_id + num_tile * num_image_token).long()
1077
+ else:
1078
+ raise NotImplementedError(f'not implement for {rope_pos_id_version}')
1079
+ try:
1080
+ start_index = image_start_token_id_idxs[i] + num_tile * num_image_token + 1
1081
+ assert input_ids_0[start_index] == image_end_token_id # the next iteration should start at IMG_END_TOKEN
1082
+ assert start_index == image_end_token_id_idxs[i] # it should coincide with the recorded IMG_END_TOKEN index
1083
+ except:
1084
+ import ipdb
1085
+ ipdb.set_trace()
1086
+
1087
+ if image_end_token_id_idxs[-1] != input_ids_0.shape[0] - 1:
1088
+ # there are trailing non-image ids left to process
1089
+ assert image_end_token_id_idxs[-1] == start_index # the tail should start at the last IMG_END_TOKEN
1090
+ rope_pos_id_pre = attention_mask_0[start_index:].long().cumsum(-1) - 1 + (last_record_pos_id + 1)
1091
+ rope_pos_id_pre.masked_fill_(attention_mask_0[start_index:] == 0, 1)
1092
+ rope_pos_id_list.append(rope_pos_id_pre)
1093
+
1094
+ rope_pos_id_list=[_.to('cpu') for _ in rope_pos_id_list]
1095
+ rope_pos_id = torch.cat(rope_pos_id_list).to(dtype=dtype)
1096
+ if rope_pos_id_version == 'default':
1097
+ rope_pos_id = rope_pos_id.long() # without special handling, rope_pos_id should equal position_ids
1098
+ assert torch.equal(rope_pos_id, position_id.to(rope_pos_id.device)), (rope_pos_id, position_id.to(rope_pos_id.device))
1099
+ assert torch.allclose(rope_pos_id, position_id.to(rope_pos_id.device), atol=1e-32)
1100
+
1101
+ assert rope_pos_id.shape == input_ids_0.shape
1102
+
1103
+ return list(rope_pos_id.numpy())
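A hedged sketch of the idea behind the 'v5'/'v6' branches above (the variable-stride visual position ids of V2PE): each tile of 256 visual tokens advances the global position by only `stride` ids, using a small fractional step, so num_tile tiles span stride * num_tile positions while text tokens keep the usual +1 increment. The function name and example values are ours.

import torch

def visual_position_ids(last_pos_id: int, num_tile: int,
                        num_image_token: int = 256, stride: float = 64.0):
    # each visual token advances the position by stride / num_image_token
    small_stride = stride / num_image_token
    n = num_image_token * num_tile
    ids = torch.linspace(last_pos_id, last_pos_id + small_stride * n, n + 1)[1:]
    next_text_pos = torch.ceil(ids[-1]).long()  # the next text token resumes from here
    return ids, next_text_pos

ids, next_pos = visual_position_ids(last_pos_id=100, num_tile=5)
# 1280 visual tokens occupy the position span (100, 420] instead of 1280 integer positions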
V2PE-256K/preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "crop_size": 448,
3
+ "do_center_crop": true,
4
+ "do_normalize": true,
5
+ "do_resize": true,
6
+ "feature_extractor_type": "CLIPFeatureExtractor",
7
+ "image_mean": [
8
+ 0.485,
9
+ 0.456,
10
+ 0.406
11
+ ],
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 3,
18
+ "size": 448
19
+ }
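The preprocessor config above describes standard CLIP-style preprocessing; a hedged torchvision sketch of the equivalent transform (resample=3 is PIL's bicubic filter):

from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize(448, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])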
V2PE-256K/special_tokens_map.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|action_start|>",
6
+ "<|action_end|>",
7
+ "<|interpreter|>",
8
+ "<|plugin|>",
9
+ "<img>",
10
+ "</img>",
11
+ "<IMG_CONTEXT>",
12
+ "<quad>",
13
+ "</quad>",
14
+ "<ref>",
15
+ "</ref>",
16
+ "<box>",
17
+ "</box>"
18
+ ],
19
+ "bos_token": {
20
+ "content": "<s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "eos_token": {
27
+ "content": "</s>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ },
33
+ "pad_token": {
34
+ "content": "</s>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ },
40
+ "unk_token": {
41
+ "content": "<unk>",
42
+ "lstrip": false,
43
+ "normalized": false,
44
+ "rstrip": false,
45
+ "single_word": false
46
+ }
47
+ }
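With these additional special tokens registered, the image markers resolve to single token ids; a hedged lookup sketch (the repo id below is illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/V2PE-256K', trust_remote_code=True)  # illustrative repo id
print(tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>'))  # 92546 per tokenizer_config.json
print(tokenizer.convert_tokens_to_ids('<img>'), tokenizer.convert_tokens_to_ids('</img>'))  # 92544 92545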
V2PE-256K/tokenization_internlm2.py ADDED
@@ -0,0 +1,235 @@
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """Tokenization classes for InternLM."""
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import sentencepiece as spm
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
24
+ from transformers.utils import logging
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
29
+
30
+ PRETRAINED_VOCAB_FILES_MAP = {}
31
+
32
+
33
+ # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
34
+ class InternLM2Tokenizer(PreTrainedTokenizer):
35
+ """
36
+ Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
37
+
38
+ Args:
39
+ vocab_file (`str`):
40
+ Path to the vocabulary file.
41
+ """
42
+
43
+ vocab_files_names = VOCAB_FILES_NAMES
44
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
45
+ model_input_names = ['input_ids', 'attention_mask']
46
+ _auto_class = 'AutoTokenizer'
47
+
48
+ def __init__(
49
+ self,
50
+ vocab_file,
51
+ unk_token='<unk>',
52
+ bos_token='<s>',
53
+ eos_token='</s>',
54
+ pad_token='</s>',
55
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
56
+ add_bos_token=True,
57
+ add_eos_token=False,
58
+ decode_with_prefix_space=False,
59
+ clean_up_tokenization_spaces=False,
60
+ **kwargs,
61
+ ):
62
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
63
+ self.vocab_file = vocab_file
64
+ self.add_bos_token = add_bos_token
65
+ self.add_eos_token = add_eos_token
66
+ self.decode_with_prefix_space = decode_with_prefix_space
67
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
68
+ self.sp_model.Load(vocab_file)
69
+ self._no_prefix_space_tokens = None
70
+ super().__init__(
71
+ bos_token=bos_token,
72
+ eos_token=eos_token,
73
+ unk_token=unk_token,
74
+ pad_token=pad_token,
75
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
76
+ **kwargs,
77
+ )
78
+
79
+ @property
80
+ def no_prefix_space_tokens(self):
81
+ if self._no_prefix_space_tokens is None:
82
+ vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
83
+ self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')}
84
+ return self._no_prefix_space_tokens
85
+
86
+ @property
87
+ def vocab_size(self):
88
+ """Returns vocab size"""
89
+ return self.sp_model.get_piece_size()
90
+
91
+ @property
92
+ def bos_token_id(self) -> Optional[int]:
93
+ return self.sp_model.bos_id()
94
+
95
+ @property
96
+ def eos_token_id(self) -> Optional[int]:
97
+ return self.sp_model.eos_id()
98
+
99
+ def get_vocab(self):
100
+ """Returns vocab as a dict"""
101
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
102
+ vocab.update(self.added_tokens_encoder)
103
+ return vocab
104
+
105
+ def _tokenize(self, text):
106
+ """Returns a tokenized string."""
107
+ return self.sp_model.encode(text, out_type=str)
108
+
109
+ def _convert_token_to_id(self, token):
110
+ """Converts a token (str) in an id using the vocab."""
111
+ return self.sp_model.piece_to_id(token)
112
+
113
+ def _convert_id_to_token(self, index):
114
+ """Converts an index (integer) in a token (str) using the vocab."""
115
+ token = self.sp_model.IdToPiece(index)
116
+ return token
117
+
118
+ def _maybe_add_prefix_space(self, tokens, decoded):
119
+ if tokens and tokens[0] not in self.no_prefix_space_tokens:
120
+ return ' ' + decoded
121
+ else:
122
+ return decoded
123
+
124
+ def convert_tokens_to_string(self, tokens):
125
+ """Converts a sequence of tokens (string) in a single string."""
126
+ current_sub_tokens = []
127
+ out_string = ''
128
+ prev_is_special = False
129
+ for token in tokens:
130
+ # make sure that special tokens are not decoded using sentencepiece model
131
+ if token in self.all_special_tokens:
132
+ if not prev_is_special:
133
+ out_string += ' '
134
+ out_string += self.sp_model.decode(current_sub_tokens) + token
135
+ prev_is_special = True
136
+ current_sub_tokens = []
137
+ else:
138
+ current_sub_tokens.append(token)
139
+ prev_is_special = False
140
+ out_string += self.sp_model.decode(current_sub_tokens)
141
+ out_string = self.clean_up_tokenization(out_string)
142
+ out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
143
+ return out_string[1:]
144
+
145
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
146
+ """
147
+ Save the vocabulary and special tokens file to a directory.
148
+
149
+ Args:
150
+ save_directory (`str`):
151
+ The directory in which to save the vocabulary.
152
+
153
+ Returns:
154
+ `Tuple(str)`: Paths to the files saved.
155
+ """
156
+ if not os.path.isdir(save_directory):
157
+ logger.error(f'Vocabulary path ({save_directory}) should be a directory')
158
+ return
159
+ out_vocab_file = os.path.join(
160
+ save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
161
+ )
162
+
163
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
164
+ copyfile(self.vocab_file, out_vocab_file)
165
+ elif not os.path.isfile(self.vocab_file):
166
+ with open(out_vocab_file, 'wb') as fi:
167
+ content_spiece_model = self.sp_model.serialized_model_proto()
168
+ fi.write(content_spiece_model)
169
+
170
+ return (out_vocab_file,)
171
+
172
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
173
+ if self.add_bos_token:
174
+ bos_token_ids = [self.bos_token_id]
175
+ else:
176
+ bos_token_ids = []
177
+
178
+ output = bos_token_ids + token_ids_0
179
+
180
+ if token_ids_1 is not None:
181
+ output = output + token_ids_1
182
+
183
+ if self.add_eos_token:
184
+ output = output + [self.eos_token_id]
185
+
186
+ return output
187
+
188
+ def get_special_tokens_mask(
189
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
190
+ ) -> List[int]:
191
+ """
192
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
193
+ special tokens using the tokenizer `prepare_for_model` method.
194
+
195
+ Args:
196
+ token_ids_0 (`List[int]`):
197
+ List of IDs.
198
+ token_ids_1 (`List[int]`, *optional*):
199
+ Optional second list of IDs for sequence pairs.
200
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
201
+ Whether or not the token list is already formatted with special tokens for the model.
202
+
203
+ Returns:
204
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
205
+ """
206
+ if already_has_special_tokens:
207
+ return super().get_special_tokens_mask(
208
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
209
+ )
210
+
211
+ if token_ids_1 is None:
212
+ return [1] + ([0] * len(token_ids_0)) + [1]
213
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
214
+
215
+ def create_token_type_ids_from_sequences(
216
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
217
+ ) -> List[int]:
218
+ """
219
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. InternLM2 does not make
220
+ use of token type ids, therefore a list of zeros is returned.
221
+
222
+ Args:
223
+ token_ids_0 (`List[int]`):
224
+ List of IDs.
225
+ token_ids_1 (`List[int]`, *optional*):
226
+ Optional second list of IDs for sequence pairs.
227
+
228
+ Returns:
229
+ `List[int]`: List of zeros.
230
+ """
231
+ eos = [self.eos_token_id]
232
+
233
+ if token_ids_1 is None:
234
+ return len(token_ids_0 + eos) * [0]
235
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
V2PE-256K/tokenization_internlm2_fast.py ADDED
@@ -0,0 +1,211 @@
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """Tokenization Fast class for InternLM."""
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, Optional, Tuple
21
+
22
+ from tokenizers import Tokenizer, decoders, normalizers, processors
23
+ from tokenizers.models import BPE
24
+ from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
25
+ SentencePieceExtractor,
26
+ SpmConverter)
27
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
28
+ from transformers.utils import logging
29
+
30
+ from .tokenization_internlm2 import InternLM2Tokenizer
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
35
+
36
+
37
+ # Modified from transformers.convert_slow_tokenizer.LlamaConverter
38
+ class InternLM2Converter(SpmConverter):
39
+ handle_byte_fallback = True
40
+
41
+ def vocab(self, proto):
42
+ vocab = [
43
+ ('<unk>', 0.0),
44
+ ('<s>', 0.0),
45
+ ('</s>', 0.0),
46
+ ]
47
+ vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
48
+ return vocab
49
+
50
+ def unk_id(self, proto):
51
+ unk_id = 0
52
+ return unk_id
53
+
54
+ def decoder(self, replacement, add_prefix_space):
55
+ return decoders.Sequence(
56
+ [
57
+ decoders.Replace('▁', ' '),
58
+ decoders.ByteFallback(),
59
+ decoders.Fuse(),
60
+ decoders.Strip(content=' ', left=1),
61
+ ]
62
+ )
63
+
64
+ def tokenizer(self, proto):
65
+ model_type = proto.trainer_spec.model_type
66
+ vocab_scores = self.vocab(proto)
67
+ # special tokens
68
+ added_tokens = self.original_tokenizer.added_tokens_decoder
69
+ for i in range(len(vocab_scores)):
70
+ piece, score = vocab_scores[i]
71
+ if i in added_tokens:
72
+ vocab_scores[i] = (added_tokens[i].content, score)
73
+ if model_type == 1:
74
+ raise RuntimeError('InternLM2 is supposed to be a BPE model!')
75
+
76
+ elif model_type == 2:
77
+ _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
78
+ bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
79
+ tokenizer = Tokenizer(
80
+ BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
81
+ )
82
+ tokenizer.add_special_tokens(
83
+ [ added_token for index, added_token in added_tokens.items()]
84
+ )
85
+ else:
86
+ raise Exception(
87
+ "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
88
+ )
89
+
90
+ return tokenizer
91
+
92
+ def normalizer(self, proto):
93
+ normalizers_list = []
94
+ if proto.normalizer_spec.add_dummy_prefix:
95
+ normalizers_list.append(normalizers.Prepend(prepend='▁'))
96
+ normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
97
+ return normalizers.Sequence(normalizers_list)
98
+
99
+ def pre_tokenizer(self, replacement, add_prefix_space):
100
+ return None
101
+
102
+
103
+ SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
104
+
105
+
106
+ # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
107
+ class InternLM2TokenizerFast(PreTrainedTokenizerFast):
108
+ vocab_files_names = VOCAB_FILES_NAMES
109
+ slow_tokenizer_class = InternLM2Tokenizer
110
+ padding_side = 'left'
111
+ model_input_names = ['input_ids', 'attention_mask']
112
+ _auto_class = 'AutoTokenizer'
113
+
114
+ def __init__(
115
+ self,
116
+ vocab_file,
117
+ unk_token='<unk>',
118
+ bos_token='<s>',
119
+ eos_token='</s>',
120
+ pad_token='</s>',
121
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
122
+ add_bos_token=True,
123
+ add_eos_token=False,
124
+ decode_with_prefix_space=False,
125
+ clean_up_tokenization_spaces=False,
126
+ **kwargs,
127
+ ):
128
+ super().__init__(
129
+ vocab_file=vocab_file,
130
+ unk_token=unk_token,
131
+ bos_token=bos_token,
132
+ eos_token=eos_token,
133
+ pad_token=pad_token,
134
+ sp_model_kwargs=sp_model_kwargs,
135
+ add_bos_token=add_bos_token,
136
+ add_eos_token=add_eos_token,
137
+ decode_with_prefix_space=decode_with_prefix_space,
138
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
139
+ **kwargs,
140
+ )
141
+ self._add_bos_token = add_bos_token
142
+ self._add_eos_token = add_eos_token
143
+ self.update_post_processor()
144
+ self.vocab_file = vocab_file
145
+
146
+ @property
147
+ def can_save_slow_tokenizer(self) -> bool:
148
+ return os.path.isfile(self.vocab_file) if self.vocab_file else False
149
+
150
+ def update_post_processor(self):
151
+ """
152
+ Updates the underlying post processor with the current `bos_token` and `eos_token`.
153
+ """
154
+ bos = self.bos_token
155
+ bos_token_id = self.bos_token_id
156
+ if bos is None and self.add_bos_token:
157
+ raise ValueError('add_bos_token = True but bos_token = None')
158
+
159
+ eos = self.eos_token
160
+ eos_token_id = self.eos_token_id
161
+ if eos is None and self.add_eos_token:
162
+ raise ValueError('add_eos_token = True but eos_token = None')
163
+
164
+ single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
165
+ pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
166
+
167
+ special_tokens = []
168
+ if self.add_bos_token:
169
+ special_tokens.append((bos, bos_token_id))
170
+ if self.add_eos_token:
171
+ special_tokens.append((eos, eos_token_id))
172
+ self._tokenizer.post_processor = processors.TemplateProcessing(
173
+ single=single, pair=pair, special_tokens=special_tokens
174
+ )
175
+
176
+ @property
177
+ def add_eos_token(self):
178
+ return self._add_eos_token
179
+
180
+ @property
181
+ def add_bos_token(self):
182
+ return self._add_bos_token
183
+
184
+ @add_eos_token.setter
185
+ def add_eos_token(self, value):
186
+ self._add_eos_token = value
187
+ self.update_post_processor()
188
+
189
+ @add_bos_token.setter
190
+ def add_bos_token(self, value):
191
+ self._add_bos_token = value
192
+ self.update_post_processor()
193
+
194
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
195
+ if not self.can_save_slow_tokenizer:
196
+ raise ValueError(
197
+ 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
198
+ 'tokenizer.'
199
+ )
200
+
201
+ if not os.path.isdir(save_directory):
202
+ logger.error(f'Vocabulary path ({save_directory}) should be a directory')
203
+ return
204
+ out_vocab_file = os.path.join(
205
+ save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
206
+ )
207
+
208
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
209
+ copyfile(self.vocab_file, out_vocab_file)
210
+
211
+ return (out_vocab_file,)
V2PE-256K/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
3
+ size 1477754
V2PE-256K/tokenizer_config.json ADDED
@@ -0,0 +1,179 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "92538": {
28
+ "content": "<|plugin|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "92539": {
36
+ "content": "<|interpreter|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "92540": {
44
+ "content": "<|action_end|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "92541": {
52
+ "content": "<|action_start|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "92542": {
60
+ "content": "<|im_end|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "92543": {
68
+ "content": "<|im_start|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "92544": {
76
+ "content": "<img>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "92545": {
84
+ "content": "</img>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "92546": {
92
+ "content": "<IMG_CONTEXT>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "92547": {
100
+ "content": "<quad>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "92548": {
108
+ "content": "</quad>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "92549": {
116
+ "content": "<ref>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "92550": {
124
+ "content": "</ref>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "92551": {
132
+ "content": "<box>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "92552": {
140
+ "content": "</box>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ }
147
+ },
148
+ "additional_special_tokens": [
149
+ "<|im_start|>",
150
+ "<|im_end|>",
151
+ "<|action_start|>",
152
+ "<|action_end|>",
153
+ "<|interpreter|>",
154
+ "<|plugin|>",
155
+ "<img>",
156
+ "</img>",
157
+ "<IMG_CONTEXT>",
158
+ "<quad>",
159
+ "</quad>",
160
+ "<ref>",
161
+ "</ref>",
162
+ "<box>",
163
+ "</box>"
164
+ ],
165
+ "auto_map": {
166
+ "AutoTokenizer": [
167
+ "tokenization_internlm2.InternLM2Tokenizer",
168
+ null
169
+ ]
170
+ },
171
+ "bos_token": "<s>",
172
+ "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
173
+ "clean_up_tokenization_spaces": false,
174
+ "eos_token": "</s>",
175
+ "model_max_length": 270000,
176
+ "pad_token": "</s>",
177
+ "tokenizer_class": "InternLM2Tokenizer",
178
+ "unk_token": "<unk>"
179
+ }
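The chat_template above is the ChatML-style <|im_start|>/<|im_end|> format; a hedged usage sketch (repo id illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('OpenGVLab/V2PE-256K', trust_remote_code=True)  # illustrative repo id
messages = [{'role': 'user', 'content': '<image>\nWhat is shown in this image?'}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# -> '<s><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n'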