Visual Question Answering
Transformers
PyTorch
internvl_chat
feature-extraction
custom_code
czczup committed on
Commit
2a99b9c
1 Parent(s): 2914b34

Upload folder using huggingface_hub

configuration_intern_vit.py ADDED
@@ -0,0 +1,117 @@
+ # --------------------------------------------------------
+ # InternVL
+ # Copyright (c) 2023 OpenGVLab
+ # Licensed under The MIT License [see LICENSE for details]
+ # --------------------------------------------------------
+ import os
+ from typing import Union
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+
+ class InternVisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+     instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         num_channels (`int`, *optional*, defaults to 3):
+             Number of color channels in the input images (e.g., 3 for RGB).
+         patch_size (`int`, *optional*, defaults to 14):
+             The size (resolution) of each patch.
+         image_size (`int`, *optional*, defaults to 224):
+             The size (resolution) of each image.
+         qkv_bias (`bool`, *optional*, defaults to `False`):
+             Whether to add a bias to the queries and values in the self-attention layers.
+         hidden_size (`int`, *optional*, defaults to 3200):
+             Dimensionality of the encoder layers and the pooler layer.
+         num_attention_heads (`int`, *optional*, defaults to 25):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         intermediate_size (`int`, *optional*, defaults to 12800):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         qk_normalization (`bool`, *optional*, defaults to `True`):
+             Whether to normalize the queries and keys in the self-attention layers.
+         num_hidden_layers (`int`, *optional*, defaults to 48):
+             Number of hidden layers in the Transformer encoder.
+         use_flash_attn (`bool`, *optional*, defaults to `True`):
+             Whether to use flash attention mechanism.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the layer normalization layers.
+         dropout (`float`, *optional*, defaults to 0.0):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         drop_path_rate (`float`, *optional*, defaults to 0.0):
+             Dropout rate for stochastic depth.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         initializer_factor (`float`, *optional*, defaults to 0.1):
+             A factor for layer scale.
+     """
+
+     model_type = 'intern_vit_6b'
+
+     def __init__(
+             self,
+             num_channels=3,
+             patch_size=14,
+             image_size=224,
+             qkv_bias=False,
+             hidden_size=3200,
+             num_attention_heads=25,
+             intermediate_size=12800,
+             qk_normalization=True,
+             num_hidden_layers=48,
+             use_flash_attn=True,
+             hidden_act='gelu',
+             layer_norm_eps=1e-6,
+             dropout=0.0,
+             drop_path_rate=0.0,
+             attention_dropout=0.0,
+             initializer_range=0.02,
+             initializer_factor=0.1,
+             **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.dropout = dropout
+         self.drop_path_rate = drop_path_rate
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.initializer_range = initializer_range
+         self.initializer_factor = initializer_factor
+         self.attention_dropout = attention_dropout
+         self.layer_norm_eps = layer_norm_eps
+         self.hidden_act = hidden_act
+         self.qkv_bias = qkv_bias
+         self.qk_normalization = qk_normalization
+         self.use_flash_attn = use_flash_attn
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
+         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+         if 'vision_config' in config_dict:
+             config_dict = config_dict['vision_config']
+
+         if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+             )
+
+         return cls.from_dict(config_dict, **kwargs)
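A quick illustration (not part of the commit): the configuration class added above can be constructed directly, and its from_pretrained override will pull the nested vision_config section out of a full InternVL-Chat checkpoint before building the object. The standalone import path and the tiny hyperparameters below are assumptions made only for this sketch.

import sys
sys.path.append('.')  # assume configuration_intern_vit.py from this repo sits in the working directory
from configuration_intern_vit import InternVisionConfig

# The defaults reproduce the InternViT-6B geometry documented in the docstring above.
config = InternVisionConfig()
assert config.hidden_size == 3200 and config.num_hidden_layers == 48

# Hypothetical miniature variant, e.g. for smoke-testing the modeling code on CPU.
tiny = InternVisionConfig(hidden_size=256, num_attention_heads=4,
                          intermediate_size=1024, num_hidden_layers=2)
print(tiny.to_json_string())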
configuration_internvl_chat.py CHANGED
@@ -4,121 +4,16 @@
  # Licensed under The MIT License [see LICENSE for details]
  # --------------------------------------------------------

- import os
  import copy
- from typing import Union

  from transformers import LlamaConfig
  from transformers.configuration_utils import PretrainedConfig
  from transformers.utils import logging

+ from .configuration_intern_vit import InternVisionConfig
- logger = logging.get_logger(__name__)
-
-
- class InternVisionConfig(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
-     instantiate a vision encoder according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         num_channels (`int`, *optional*, defaults to 3):
-             Number of color channels in the input images (e.g., 3 for RGB).
-         patch_size (`int`, *optional*, defaults to 14):
-             The size (resolution) of each patch.
-         image_size (`int`, *optional*, defaults to 224):
-             The size (resolution) of each image.
-         qkv_bias (`bool`, *optional*, defaults to `False`):
-             Whether to add a bias to the queries and values in the self-attention layers.
-         hidden_size (`int`, *optional*, defaults to 3200):
-             Dimensionality of the encoder layers and the pooler layer.
-         num_attention_heads (`int`, *optional*, defaults to 25):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         intermediate_size (`int`, *optional*, defaults to 12800):
-             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-         qk_normalization (`bool`, *optional*, defaults to `True`):
-             Whether to normalize the queries and keys in the self-attention layers.
-         num_hidden_layers (`int`, *optional*, defaults to 48):
-             Number of hidden layers in the Transformer encoder.
-         use_flash_attn (`bool`, *optional*, defaults to `True`):
-             Whether to use flash attention mechanism.
-         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
-             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-             `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
-         layer_norm_eps (`float`, *optional*, defaults to 1e-6):
-             The epsilon used by the layer normalization layers.
-         dropout (`float`, *optional*, defaults to 0.0):
-             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-         drop_path_rate (`float`, *optional*, defaults to 0.0):
-             Dropout rate for stochastic depth.
-         attention_dropout (`float`, *optional*, defaults to 0.0):
-             The dropout ratio for the attention probabilities.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         initializer_factor (`float`, *optional*, defaults to 0.1):
-             A factor for layer scale.
-     """
-
-     model_type = 'intern_vit_6b'
-
-     def __init__(
-             self,
-             num_channels=3,
-             patch_size=14,
-             image_size=224,
-             qkv_bias=False,
-             hidden_size=3200,
-             num_attention_heads=25,
-             intermediate_size=12800,
-             qk_normalization=True,
-             num_hidden_layers=48,
-             use_flash_attn=True,
-             hidden_act='gelu',
-             layer_norm_eps=1e-6,
-             dropout=0.0,
-             drop_path_rate=0.0,
-             attention_dropout=0.0,
-             initializer_range=0.02,
-             initializer_factor=0.1,
-             **kwargs,
-     ):
-         super().__init__(**kwargs)
-
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.dropout = dropout
-         self.drop_path_rate = drop_path_rate
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.num_channels = num_channels
-         self.patch_size = patch_size
-         self.image_size = image_size
-         self.initializer_range = initializer_range
-         self.initializer_factor = initializer_factor
-         self.attention_dropout = attention_dropout
-         self.layer_norm_eps = layer_norm_eps
-         self.hidden_act = hidden_act
-         self.qkv_bias = qkv_bias
-         self.qk_normalization = qk_normalization
-         self.use_flash_attn = use_flash_attn
-
-     @classmethod
-     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
-         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
-         if 'vision_config' in config_dict:
-             config_dict = config_dict['vision_config']
-
-         if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
-             logger.warning(
-                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                 f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
-             )
-
-         return cls.from_dict(config_dict, **kwargs)


+ logger = logging.get_logger(__name__)


  class InternVLChatConfig(PretrainedConfig):
modeling_intern_vit.py CHANGED
@@ -20,7 +20,13 @@ from transformers.utils import logging
  from .configuration_intern_vit import InternVisionConfig

  try:
-     from .flash_attention import FlashAttention
+     try:  # v1
+         from flash_attn.flash_attn_interface import \
+             flash_attn_unpadded_qkvpacked_func
+     except:  # v2
+         from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
+
+     from flash_attn.bert_padding import pad_input, unpad_input
      has_flash_attn = True
  except:
      print('FlashAttention is not installed.')
@@ -30,6 +36,70 @@ except:
  logger = logging.get_logger(__name__)


+ class FlashAttention(nn.Module):
+     """Implement the scaled dot product attention with softmax.
+     Arguments
+     ---------
+         softmax_scale: The temperature to use for the softmax attention.
+                        (default: 1/sqrt(d_keys) where d_keys is computed at
+                        runtime)
+         attention_dropout: The dropout rate to apply to the attention
+                            (default: 0.0)
+     """
+
+     def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
+         super().__init__()
+         self.softmax_scale = softmax_scale
+         self.dropout_p = attention_dropout
+
+     def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
+                 max_s=None, need_weights=False):
+         """Implements the multihead softmax attention.
+         Arguments
+         ---------
+             qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
+                 if unpadded: (nnz, 3, h, d)
+             key_padding_mask: a bool tensor of shape (B, S)
+         """
+         assert not need_weights
+         assert qkv.dtype in [torch.float16, torch.bfloat16]
+         assert qkv.is_cuda
+
+         if cu_seqlens is None:
+             batch_size = qkv.shape[0]
+             seqlen = qkv.shape[1]
+             if key_padding_mask is None:
+                 qkv = rearrange(qkv, 'b s ... -> (b s) ...')
+                 max_s = seqlen
+                 cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
+                                           device=qkv.device)
+                 output = flash_attn_unpadded_qkvpacked_func(
+                     qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                     softmax_scale=self.softmax_scale, causal=causal
+                 )
+                 output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
+             else:
+                 nheads = qkv.shape[-2]
+                 x = rearrange(qkv, 'b s three h d -> b s (three h d)')
+                 x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
+                 x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
+                 output_unpad = flash_attn_unpadded_qkvpacked_func(
+                     x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                     softmax_scale=self.softmax_scale, causal=causal
+                 )
+                 output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
+                                              indices, batch_size, seqlen),
+                                    'b s (h d) -> b s h d', h=nheads)
+         else:
+             assert max_s is not None
+             output = flash_attn_unpadded_qkvpacked_func(
+                 qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                 softmax_scale=self.softmax_scale, causal=causal
+             )
+
+         return output, None
+
+
  class InternRMSNorm(nn.Module):
      def __init__(self, hidden_size, eps=1e-6):
          super().__init__()
@@ -279,6 +349,7 @@ class InternVisionEncoder(nn.Module):
  class InternVisionModel(PreTrainedModel):
      main_input_name = 'pixel_values'
      config_class = InternVisionConfig
+     _no_split_modules = ['InternAttention']

      def __init__(self, config: InternVisionConfig):
          super().__init__(config)
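For orientation (again, not part of the commit): a sketch of how the FlashAttention wrapper above expects its packed input, assuming a CUDA device with the flash-attn package installed; the batch and sequence sizes are illustrative only.

import torch
from modeling_intern_vit import FlashAttention  # the class inlined above, assuming the repo files are importable locally

batch, heads, head_dim = 2, 25, 128      # 25 heads x 128 dims matches hidden_size 3200
seqlen = 257                             # 1 class token + (224 / 14) ** 2 patches
qkv = torch.randn(batch, seqlen, 3, heads, head_dim,
                  dtype=torch.float16, device='cuda')  # packed (B, S, 3, H, D) layout

attn = FlashAttention(softmax_scale=None, attention_dropout=0.0)
out, _ = attn(qkv, key_padding_mask=None, causal=False)  # out: (B, S, H, D)
print(out.shape)

Inlining the class here, instead of importing it from a separate flash_attention.py as before, presumably keeps the repository self-contained when it is loaded with trust_remote_code.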
modeling_internvl_chat.py CHANGED
@@ -23,6 +23,7 @@ logger = logging.get_logger(__name__)
  class InternVLChatModel(PreTrainedModel):
      config_class = InternVLChatConfig
      main_input_name = 'pixel_values'
+     _no_split_modules = ['InternAttention', 'LlamaDecoderLayer', 'LlamaForCausalLM']

      def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
          super().__init__(config)
@@ -193,7 +194,6 @@ class InternVLChatModel(PreTrainedModel):

          img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
          self.img_context_token_id = img_context_token_id
-
          from .conversation import get_conv_template

          template = get_conv_template(self.template)
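The _no_split_modules entries added to both model classes tell Accelerate which submodules must stay on a single device when the checkpoint is sharded with a device_map. A hedged sketch of the loading path this enables; the repository id and dtype below are placeholders, not taken from this commit.

import torch
from transformers import AutoModel

repo_id = 'OpenGVLab/InternVL-Chat-...'  # placeholder: substitute the id of this repository
model = AutoModel.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,   # the modeling files in this commit are the custom code being trusted
    device_map='auto',        # Accelerate spreads layers across devices but never splits the listed modules
)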
preprocessor_config.json CHANGED
@@ -16,4 +16,4 @@
    ],
    "resample": 3,
    "size": 448
- }
+ }
special_tokens_map.json CHANGED
@@ -1,70 +1,31 @@
  {
    "additional_special_tokens": [
+     "<human>",
+     "<bot>",
+     "<img>",
+     "</img>",
+     "<vid>",
+     "</vid>",
+     "<box>",
+     "</box>",
+     "<ref>",
+     "</ref>",
      {
-       "content": "<human>",
+       "content": "<IMG_CONTEXT>",
        "lstrip": false,
        "normalized": false,
        "rstrip": false,
        "single_word": false
      },
      {
-       "content": "<bot>",
+       "content": "<quad>",
        "lstrip": false,
        "normalized": false,
        "rstrip": false,
        "single_word": false
      },
      {
-       "content": "<img>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "</img>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "<vid>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "</vid>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "<box>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "</box>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "<ref>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false
-     },
-     {
-       "content": "</ref>",
+       "content": "</quad>",
        "lstrip": false,
        "normalized": false,
        "rstrip": false,
@@ -85,7 +46,13 @@
      "rstrip": false,
      "single_word": false
    },
-   "pad_token": "<unk>",
+   "pad_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
    "unk_token": {
      "content": "<unk>",
      "lstrip": false,
@@ -105,6 +105,30 @@
105
  "rstrip": false,
106
  "single_word": false,
107
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  }
109
  },
110
  "additional_special_tokens": [
@@ -117,47 +141,22 @@
117
  "<box>",
118
  "</box>",
119
  "<ref>",
120
- "</ref>"
 
 
 
121
  ],
122
- "bos_token": {
123
- "__type": "AddedToken",
124
- "content": "<s>",
125
- "lstrip": false,
126
- "normalized": true,
127
- "rstrip": false,
128
- "single_word": false
129
- },
130
  "clean_up_tokenization_spaces": false,
131
- "eos_token": {
132
- "__type": "AddedToken",
133
- "content": "</s>",
134
- "lstrip": false,
135
- "normalized": true,
136
- "rstrip": false,
137
- "single_word": false
138
- },
139
  "legacy": true,
140
  "model_max_length": 768,
141
- "pad_token": {
142
- "__type": "AddedToken",
143
- "content": "<unk>",
144
- "lstrip": false,
145
- "normalized": true,
146
- "rstrip": false,
147
- "single_word": false
148
- },
149
  "padding_side": "right",
150
  "sp_model_kwargs": {},
151
  "spaces_between_special_tokens": false,
152
  "tokenizer_class": "LlamaTokenizer",
153
- "unk_token": {
154
- "__type": "AddedToken",
155
- "content": "<unk>",
156
- "lstrip": false,
157
- "normalized": true,
158
- "rstrip": false,
159
- "single_word": false
160
- },
161
  "use_default_system_prompt": true,
162
  "use_fast": true
163
  }
 
105
  "rstrip": false,
106
  "single_word": false,
107
  "special": true
108
+ },
109
+ "41916": {
110
+ "content": "<IMG_CONTEXT>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "41917": {
118
+ "content": "<quad>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "41918": {
126
+ "content": "</quad>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
  }
133
  },
134
  "additional_special_tokens": [
 
141
  "<box>",
142
  "</box>",
143
  "<ref>",
144
+ "</ref>",
145
+ "<IMG_CONTEXT>",
146
+ "<quad>",
147
+ "</quad>"
148
  ],
149
+ "bos_token": "<s>",
 
 
 
 
 
 
 
150
  "clean_up_tokenization_spaces": false,
151
+ "eos_token": "</s>",
 
 
 
 
 
 
 
152
  "legacy": true,
153
  "model_max_length": 768,
154
+ "pad_token": "<unk>",
 
 
 
 
 
 
 
155
  "padding_side": "right",
156
  "sp_model_kwargs": {},
157
  "spaces_between_special_tokens": false,
158
  "tokenizer_class": "LlamaTokenizer",
159
+ "unk_token": "<unk>",
 
 
 
 
 
 
 
160
  "use_default_system_prompt": true,
161
  "use_fast": true
162
  }
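To sanity-check the tokenizer changes, a small sketch; the local path is a placeholder, and the expected ids simply follow the added_tokens_decoder entries introduced above.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('.')  # placeholder: a directory containing this commit's tokenizer files
print(tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>'))               # expected: 41916
print(tokenizer.convert_tokens_to_ids(['<quad>', '</quad>']))         # expected: [41917, 41918]
print(tokenizer.pad_token, tokenizer.bos_token, tokenizer.eos_token)  # '<unk>', '<s>', '</s>' per tokenizer_config.json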