rkazants committed
Commit 4b8510e · verified · 1 Parent(s): 01a97ef

Upload 11 files

chat_template.jinja ADDED
@@ -0,0 +1,118 @@
+ {%- set tools_system_message_prefix = 'You are a helpful assistant with access to the following tools. You may call one or more tools to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' %}
+ {%- set tools_system_message_suffix = '\n</tools>\n\nFor each tool call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.' %}
+ {%- set documents_system_message_prefix = 'You are a helpful assistant with access to the following documents. You may use one or more documents to assist with the user query.\n\nYou are given a list of documents within <documents></documents> XML tags:\n<documents>' %}
+ {%- set documents_system_message_suffix = '\n</documents>\n\nWrite the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.' %}
+ {%- set g4_default_system_message = 'You are a helpful assistant. Please ensure responses are professional, accurate, and safe.' %}
+ {%- if available_tools is defined and available_tools %}
+ {%- set tools = available_tools %}
+ {%- endif %}
+ {%- set ns = namespace(tools_system_message=tools_system_message_prefix,
+                        documents_system_message=documents_system_message_prefix,
+                        default_system_message=g4_default_system_message,
+                        system_message=''
+                        ) %}
+ {%- if tools %}
+ {%- for tool in tools %}
+ {%- set ns.tools_system_message = ns.tools_system_message + '\n' + (tool | tojson) %}
+ {%- endfor %}
+ {%- set ns.tools_system_message = ns.tools_system_message + tools_system_message_suffix %}
+ {%- else %}
+ {%- set ns.tools_system_message = '' %}
+ {%- endif %}
+ {%- if documents %}
+ {%- for document in documents %}
+ {%- set ns.documents_system_message = ns.documents_system_message + '\n' + (document | tojson) %}
+ {%- endfor %}
+ {%- set ns.documents_system_message = ns.documents_system_message + documents_system_message_suffix %}
+ {%- else %}
+ {%- set ns.documents_system_message = '' %}
+ {%- endif %}
+ {%- if messages[0].role == 'system' %}
+ {%- if messages[0].content is string %}
+ {%- set ns.system_message = messages[0].content %}
+ {%- elif messages[0].content is iterable %}
+ {%- for entry in messages[0].content %}
+ {%- if entry.type == 'text' %}
+ {%- if ns.system_message != '' %}
+ {%- set ns.system_message = ns.system_message + '\n' %}
+ {%- endif %}
+ {%- set ns.system_message = ns.system_message + entry.text %}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {%- if tools and documents %}
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message + '\n\n' + ns.documents_system_message %}
+ {%- elif tools %}
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message %}
+ {%- elif documents %}
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.documents_system_message %}
+ {%- endif %}
+ {%- else %}
+ {%- if tools and documents %}
+ {%- set ns.system_message = ns.tools_system_message + '\n\n' + ns.documents_system_message %}
+ {%- elif tools %}
+ {%- set ns.system_message = ns.tools_system_message %}
+ {%- elif documents %}
+ {%- set ns.system_message = ns.documents_system_message %}
+ {%- endif %}
+ {%- endif %}
+ {%- if ns.system_message %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + ns.system_message + '<|end_of_text|>\n' }}
+ {%- else %}
+ {{- '<|start_of_role|>system<|end_of_role|>' + ns.default_system_message + '<|end_of_text|>\n' }}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- set content = namespace(val='') %}
+ {%- if message.content is string %}
+ {%- set content.val = message.content %}
+ {%- else %}
+ {%- if message.content is iterable %}
+ {%- for entry in message.content %}
+ {%- if entry.type == 'text' %}
+ {%- if content.val != '' %}
+ {%- set content.val = content.val + '\n' %}
+ {%- endif %}
+ {%- set content.val = content.val + entry.text %}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {%- endif %}
+ {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) %}
+ {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val + '<|end_of_text|>\n' }}
+ {%- elif message.role == 'assistant' %}
+ {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val }}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content.val) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|end_of_text|>\n' }}
+ {%- elif message.role == 'tool' %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
+ {{- '<|start_of_role|>user<|end_of_role|>' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- content.val }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
+ {{- '<|end_of_text|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|start_of_role|>assistant<|end_of_role|>' }}
+ {%- endif %}
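
For reference, a minimal sketch of exercising this template through transformers once the files in this commit are in place; the checkpoint path and the weather tool below are placeholders, not part of the commit:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/checkpoint")  # hypothetical local path

# A JSON-schema tool definition; the template serializes each entry with
# `tojson` inside the <tools>...</tools> section of the system message.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

messages = [{"role": "user", "content": "What is the weather in Paris?"}]
prompt = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    add_generation_prompt=True,  # appends '<|start_of_role|>assistant<|end_of_role|>'
    tokenize=False,
)
print(prompt)

Note how the tool branch of the template folds consecutive tool results into a single user turn of <tool_response> blocks.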
config.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "architectures": [
+     "AfmoeForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "dtype": "float32",
+   "global_attn_every_n_layers": 4,
+   "head_dim": 16,
+   "hidden_act": "silu",
+   "hidden_size": 32,
+   "initializer_range": 0.02,
+   "intermediate_size": 6144,
+   "layer_types": [
+     "sliding_attention",
+     "full_attention",
+     "sliding_attention",
+     "full_attention"
+   ],
+   "max_position_embeddings": 16384,
+   "model_type": "afmoe",
+   "moe_intermediate_size": 16,
+   "mup_enabled": false,
+   "n_group": 1,
+   "num_attention_heads": 2,
+   "num_dense_layers": 2,
+   "num_expert_groups": 1,
+   "num_experts": 4,
+   "num_experts_per_tok": 2,
+   "num_hidden_layers": 4,
+   "num_key_value_heads": 2,
+   "num_limited_groups": 1,
+   "num_shared_experts": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "route_norm": true,
+   "route_scale": 1.0,
+   "score_func": "sigmoid",
+   "sliding_window": 1024,
+   "tie_word_embeddings": false,
+   "topk_group": 1,
+   "transformers_version": "4.57.3",
+   "use_cache": true,
+   "vocab_size": 100352
+ }
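
Because config.json names AfmoeForCausalLM but declares no auto_map, the Auto* classes will not resolve the custom architecture on their own; a minimal loading sketch (checkpoint path hypothetical) imports it from the files in this commit directly:

from modeling_afmoe import AfmoeForCausalLM

model = AfmoeForCausalLM.from_pretrained("path/to/checkpoint")
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")  # tiny test-sized model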
configuration_afmoe.py ADDED
@@ -0,0 +1,133 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+ 
+ logger = logging.get_logger(__name__)
+ 
+ 
+ class AfmoeConfig(PretrainedConfig):
+     """
+     n_group (`int`, *optional*, defaults to 1):
+         Number of groups for routed experts.
+     topk_group (`int`, *optional*, defaults to 1):
+         Number of selected groups for each token (ensuring the selected experts fall only within `topk_group` groups).
+     """
+ 
+     model_type = "afmoe"
+     base_model_pp_plan = {
+         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+         "norm": (["hidden_states"], ["hidden_states"]),
+     }
+ 
+     def __init__(
+         self,
+         num_hidden_layers: int = 32,
+         vocab_size: int = 200192,
+         hidden_size: int = 2048,
+         intermediate_size: int = 6144,
+         moe_intermediate_size=1408,
+         num_dense_layers=1,
+         num_attention_heads=16,
+         num_key_value_heads=None,
+         head_dim=128,
+         hidden_act="silu",
+         max_position_embeddings=16384,
+         initializer_range=0.02,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         num_experts=64,
+         num_experts_per_tok=6,
+         num_shared_experts=2,
+         num_expert_groups=1,
+         num_limited_groups=1,
+         score_func="sigmoid",
+         route_norm=True,
+         route_scale=1.0,
+         global_attn_every_n_layers=4,
+         sliding_window=1024,
+         mup_enabled=False,
+         layer_types=None,
+         attention_dropout: float = 0.0,
+         n_group: int = 1,
+         topk_group: int = 1,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_dense_layers = num_dense_layers
+         self.num_attention_heads = num_attention_heads
+         self.head_dim = head_dim
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+ 
+         # MoE specific
+         self.moe_intermediate_size = moe_intermediate_size
+         self.num_experts_per_tok = num_experts_per_tok
+         self.n_group = n_group
+         self.topk_group = topk_group
+         self.num_experts = num_experts
+         self.num_shared_experts = num_shared_experts
+         self.num_expert_groups = num_expert_groups
+         self.num_limited_groups = num_limited_groups
+         self.score_func = score_func
+         self.route_norm = route_norm
+         self.route_scale = route_scale
+ 
+         # Attention specific
+         self.attention_dropout = attention_dropout
+         self.global_attn_every_n_layers = global_attn_every_n_layers
+         self.sliding_window = sliding_window
+         self.layer_types = layer_types
+         if self.layer_types is None:
+             # Every `global_attn_every_n_layers`-th layer uses full attention; the rest use sliding-window attention.
+             self.layer_types = [
+                 "sliding_attention" if bool((i + 1) % global_attn_every_n_layers) else "full_attention"
+                 for i in range(self.num_hidden_layers)
+             ]
+         layer_type_validation(self.layer_types)
+ 
+         # muP specific
+         self.mup_enabled = mup_enabled
+ 
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+ 
+         # Validate rope configs
+         if self.rope_scaling is not None and "type" in self.rope_scaling:
+             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         rope_config_validation(self)
+ 
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+ 
+ 
+ __all__ = ["AfmoeConfig"]
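
A worked instance of the layer_types default above, assuming only the formula in __init__: with global_attn_every_n_layers=4, every fourth layer gets full attention and the rest use sliding windows.

# Reproduces the default derivation in AfmoeConfig.__init__ for a 4-layer model.
global_attn_every_n_layers = 4
layer_types = [
    "sliding_attention" if (i + 1) % global_attn_every_n_layers else "full_attention"
    for i in range(4)
]
print(layer_types)
# ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']

The config.json in this commit instead pins an explicitly alternating list, which takes precedence because layer_types is not None.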
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.57.3"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c70f8f4de11e44cdf5b00d0d25d78b78d73453211b22c73e40d59a896c6e062e
+ size 30565352
modeling_afmoe.py ADDED
@@ -0,0 +1,680 @@
+ from typing import Callable, Optional, Tuple, Union
+ 
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ 
+ from transformers.activations import ACT2FN
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_outputs import (
+     MoeCausalLMOutputWithPast,
+     MoeModelOutputWithPast,
+ )
+ from transformers.modeling_utils import PreTrainedModel, ALL_ATTENTION_FUNCTIONS
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+ from transformers.masking_utils import (
+     create_causal_mask,
+     create_sliding_window_causal_mask,
+ )
+ from transformers.modeling_layers import GradientCheckpointingLayer
+ from transformers.processing_utils import Unpack
+ from transformers.utils import TransformersKwargs
+ from transformers.cache_utils import Cache, DynamicCache
+ from transformers.integrations import use_kernel_forward_from_hub
+ 
+ try:
+     from .configuration_afmoe import AfmoeConfig
+ except ImportError:
+     from configuration_afmoe import AfmoeConfig
+ 
+ 
+ class AfmoeRotaryEmbedding(nn.Module):
+     def __init__(self, config: AfmoeConfig, device=None):
+         super().__init__()
+         # BC: "rope_type" was originally "type"
+         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+             self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+         else:
+             self.rope_type = "default"
+         self.max_seq_len_cached = config.max_position_embeddings
+         self.original_max_seq_len = config.max_position_embeddings
+ 
+         self.config = config
+         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+ 
+         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+         self.original_inv_freq = self.inv_freq
+ 
+     def _dynamic_frequency_update(self, position_ids, device):
+         """
+         dynamic RoPE layers should recompute `inv_freq` in the following situations:
+         1 - growing beyond the cached sequence length (allow scaling)
+         2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+         """
+         seq_len = torch.max(position_ids) + 1
+         if seq_len > self.max_seq_len_cached:  # growth
+             inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+             self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+             self.max_seq_len_cached = seq_len
+ 
+         if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+             # This .to() is needed if the model has been moved to a device after being initialized (because
+             # the buffer is automatically moved, but not the original copy)
+             self.original_inv_freq = self.original_inv_freq.to(device)
+             self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+             self.max_seq_len_cached = self.original_max_seq_len
+ 
+     @torch.no_grad()
+     def forward(self, x, position_ids):
+         if "dynamic" in self.rope_type:
+             self._dynamic_frequency_update(position_ids, device=x.device)
+ 
+         # Core RoPE block
+         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+         position_ids_expanded = position_ids[:, None, :].float()
+         # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+         device_type = x.device.type
+         device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+         with torch.autocast(device_type=device_type, enabled=False):
+             freqs = (inv_freq_expanded.float().to(x.device) @ position_ids_expanded.float()).transpose(1, 2)
+             emb = torch.cat((freqs, freqs), dim=-1)
+             cos = emb.cos()
+             sin = emb.sin()
+ 
+         # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+         cos = cos * self.attention_scaling
+         sin = sin * self.attention_scaling
+ 
+         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+ 
+ 
+ def rotate_half(x):
+     """Rotates half the hidden dims of the input."""
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
+ 
+ 
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+     """Applies Rotary Position Embedding to the query and key tensors.
+ 
+     Args:
+         q (`torch.Tensor`): The query tensor.
+         k (`torch.Tensor`): The key tensor.
+         cos (`torch.Tensor`): The cosine part of the rotary embedding.
+         sin (`torch.Tensor`): The sine part of the rotary embedding.
+         position_ids (`torch.Tensor`, *optional*):
+             Deprecated and unused.
+         unsqueeze_dim (`int`, *optional*, defaults to 1):
+             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+     Returns:
+         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+     """
+     cos = cos.unsqueeze(unsqueeze_dim)
+     sin = sin.unsqueeze(unsqueeze_dim)
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+     return q_embed, k_embed
+ 
+ 
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+     """
+     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+     """
+     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+     if n_rep == 1:
+         return hidden_states
+     hidden_states = hidden_states[:, :, None, :, :].expand(
+         batch, num_key_value_heads, n_rep, slen, head_dim
+     )
+     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+ 
+ 
+ @use_kernel_forward_from_hub("RMSNorm")
+ class AfmoeRMSNorm(nn.Module):
+     def __init__(self, hidden_size: int, eps: float):
+         """
+         AfmoeRMSNorm is equivalent to T5LayerNorm
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+ 
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+ 
+     def extra_repr(self):
+         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+ 
+ 
+ def eager_attention_forward(
+     module: nn.Module,
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attention_mask: Optional[torch.Tensor],
+     scaling: float,
+     dropout: float = 0.0,
+     **kwargs,
+ ):
+     key_states = repeat_kv(key, module.num_key_value_groups)
+     value_states = repeat_kv(value, module.num_key_value_groups)
+ 
+     attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+     if attention_mask is not None:
+         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+         attn_weights = attn_weights + causal_mask
+ 
+     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+     attn_output = torch.matmul(attn_weights, value_states)
+     attn_output = attn_output.transpose(1, 2).contiguous()
+ 
+     return attn_output, attn_weights
+ 
+ 
+ class AfmoeMLP(nn.Module):
+     def __init__(self, config, intermediate_size=None):
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         self.intermediate_size = intermediate_size or config.intermediate_size
+         self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+         self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+         self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+         self.act_fn = ACT2FN[config.hidden_act]
+ 
+     def forward(self, x):
+         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ 
+ 
+ class AfmoeTokenChoiceRouter(nn.Module):
+     """Token-choice top-K router for MoE routing."""
+ 
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.top_k = config.num_experts_per_tok
+         self.num_experts = config.num_experts
+         self.score_func = config.score_func
+         self.route_norm = config.route_norm
+         self.route_scale = config.route_scale
+         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
+ 
+     def forward(self, hidden_states, expert_bias: torch.Tensor | None):
+         _, _, hidden_dim = hidden_states.shape
+         hidden_states = hidden_states.view(-1, hidden_dim)
+ 
+         scores = self.gate(hidden_states)
+ 
+         # Apply scoring function in float32 for stability
+         if self.score_func == "sigmoid":
+             scores = torch.sigmoid(scores.to(torch.float32))
+         else:
+             scores = F.softmax(scores.to(torch.float32), dim=-1)
+ 
+         # The (non-trainable) expert bias only influences expert *selection*;
+         # routing weights are gathered from the unbiased scores.
+         if expert_bias is not None:
+             _, selected_experts = torch.topk(scores + expert_bias, k=self.top_k, dim=1)
+             top_scores = scores.gather(dim=1, index=selected_experts)
+         else:
+             top_scores, selected_experts = torch.topk(scores, k=self.top_k, dim=1)
+ 
+         # Normalize weights if using sigmoid
+         if self.score_func == "sigmoid" and self.route_norm:
+             denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
+             top_scores = top_scores / denominator
+ 
+         top_scores = top_scores * self.route_scale
+         return top_scores, selected_experts
+ 
+ 
+ class AfmoeMoE(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.router = AfmoeTokenChoiceRouter(config)
+ 
+         self.shared_experts = None
+         if config.num_shared_experts > 0:
+             self.shared_experts = AfmoeMLP(
+                 config, config.moe_intermediate_size * config.num_shared_experts
+             )
+         self.experts = nn.ModuleList(
+             [AfmoeMLP(config, intermediate_size=config.moe_intermediate_size) for _ in range(config.num_experts)]
+         )
+         self.expert_bias = nn.Parameter(torch.zeros(config.num_experts, dtype=torch.float32), requires_grad=False)
+ 
+     def forward(self, hidden_states):
+         batch_size, seq_len, hidden_dim = hidden_states.shape
+         hidden_states_flat = hidden_states.view(-1, hidden_dim)
+ 
+         # Get routing decisions
+         top_scores, selected_experts = self.router(hidden_states, self.expert_bias)
+ 
+         # Process through shared experts
+         if self.shared_experts is not None:
+             shared_output = self.shared_experts(hidden_states_flat)
+         else:
+             shared_output = torch.zeros_like(hidden_states_flat)
+ 
+         # Reorder tokens by expert for efficient processing
+         token_indices_sorted = torch.argsort(selected_experts.view(-1), stable=True)
+         top_scores_sorted = top_scores.view(-1)[token_indices_sorted]
+         token_to_expert = selected_experts.view(-1)[token_indices_sorted]
+         token_indices_sorted = token_indices_sorted // self.config.num_experts_per_tok
+ 
+         # Gather input tokens
+         token_indices_expanded = token_indices_sorted.unsqueeze(-1).expand(-1, hidden_dim)
+         routed_input = torch.gather(hidden_states_flat, dim=0, index=token_indices_expanded)
+ 
+         routed_output = torch.zeros_like(routed_input)
+         for expert_id in range(self.config.num_experts):
+             mask = token_to_expert == expert_id
+             if mask.any():
+                 expert_input = routed_input[mask]
+                 expert_out = self.experts[expert_id](expert_input)
+                 routed_output[mask] = expert_out
+ 
+         routed_output = (routed_output.to(torch.float32) * top_scores_sorted.unsqueeze(-1)).to(hidden_states.dtype)
+ 
+         # Scatter back to original positions
+         output = shared_output.scatter_add(dim=0, index=token_indices_expanded, src=routed_output)
+ 
+         return output.view(batch_size, seq_len, hidden_dim)
+ 
+ 
+ class AfmoeAttention(nn.Module):
+     """Multi-headed attention with local/global pattern and gating."""
+ 
+     def __init__(self, config: AfmoeConfig, layer_idx: int):
+         super().__init__()
+         self.config = config
+         self.layer_idx = layer_idx
+         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+         self.num_heads = config.num_attention_heads
+         self.num_key_value_heads = config.num_key_value_heads
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ 
+         self.scaling = self.head_dim**-0.5
+         self.attention_dropout = config.attention_dropout
+         self.is_local_attention = config.layer_types[layer_idx] == "sliding_attention"
+         self.sliding_window = config.sliding_window if self.is_local_attention else None
+ 
+         self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
+         self.k_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.v_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)
+ 
+         self.q_norm = AfmoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+         self.k_norm = AfmoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+ 
+         self.gate_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
+ 
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         position_embeddings: tuple[torch.Tensor, torch.Tensor],
+         attention_mask: Optional[torch.Tensor],
+         past_key_value: Optional[Cache] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> torch.Tensor:
+         input_shape = hidden_states.shape[:-1]
+         hidden_shape = (*input_shape, -1, self.head_dim)
+ 
+         query_states = self.q_proj(hidden_states).view(hidden_shape)
+         key_states = self.k_proj(hidden_states).view(hidden_shape)
+         value_states = self.v_proj(hidden_states).view(hidden_shape)
+         gate_states = self.gate_proj(hidden_states)
+ 
+         query_states = self.q_norm(query_states)
+         key_states = self.k_norm(key_states)
+ 
+         query_states = query_states.transpose(1, 2)
+         key_states = key_states.transpose(1, 2)
+         value_states = value_states.transpose(1, 2)
+ 
+         # RoPE is applied only on sliding-window (local) layers; global layers attend position-free
+         if self.is_local_attention:
+             cos, sin = position_embeddings
+             query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+ 
+         if past_key_value is not None:
+             cache_kwargs = {"cache_position": cache_position}
+             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+ 
+         attention_interface: Callable = eager_attention_forward
+         if self.config._attn_implementation != "eager":
+             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+ 
+         output, _ = attention_interface(
+             self,
+             query_states,
+             key_states,
+             value_states,
+             attention_mask=attention_mask,
+             dropout=0.0 if not self.training else self.attention_dropout,
+             scaling=self.scaling,
+             sliding_window=self.sliding_window,
+             **kwargs,
+         )
+ 
+         output = output.view(*input_shape, -1).contiguous()
+         # Output gating: a sigmoid gate computed from the layer input modulates the attention output
+         output = output * F.sigmoid(gate_states)
+         return self.o_proj(output)
+ 
+ 
+ class AfmoeDecoderLayer(GradientCheckpointingLayer):
+     def __init__(self, config: AfmoeConfig, layer_idx: int):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+         self.layer_idx = layer_idx
+ 
+         self.self_attn = AfmoeAttention(config=config, layer_idx=layer_idx)
+         self.attention_type = config.layer_types[layer_idx]
+ 
+         # Dual normalization for attention
+         self.input_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ 
+         # Dual normalization for FFN
+         self.pre_mlp_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_mlp_layernorm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ 
+         # MoE or dense FFN: the first `num_dense_layers` layers stay dense
+         self.moe_enabled = layer_idx >= config.num_dense_layers
+         if self.moe_enabled:
+             self.mlp = AfmoeMoE(config)
+         else:
+             self.mlp = AfmoeMLP(config)
+ 
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Cache] = None,
+         use_cache: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> torch.FloatTensor:
+         residual = hidden_states
+ 
+         # Self Attention with dual normalization
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_value=past_key_value,
+             use_cache=use_cache,
+             cache_position=cache_position,
+             position_embeddings=position_embeddings,
+             **kwargs,
+         )
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = residual + hidden_states
+ 
+         # FFN with dual normalization (self.mlp is either AfmoeMoE or the dense AfmoeMLP)
+         residual = hidden_states
+         hidden_states = self.pre_mlp_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = self.post_mlp_layernorm(hidden_states)
+         hidden_states = residual + hidden_states
+         return hidden_states
+ 
+ 
+ class AfmoePreTrainedModel(PreTrainedModel):
+     config_class = AfmoeConfig
+     base_model_prefix = "model"
+     _no_split_modules = ["AfmoeDecoderLayer"]
+     _skip_keys_device_placement = ["past_key_values"]
+     _keep_in_fp32_modules = [
+         "input_layernorm",
+         "post_attention_layernorm",
+         "pre_mlp_layernorm",
+         "post_mlp_layernorm",
+         "q_norm",
+         "k_norm",
+         "norm",
+     ]
+     _supports_sdpa = True
+     _supports_attention_backend = True
+     supports_gradient_checkpointing = True
+ 
+ 
+ class AfmoeModel(AfmoePreTrainedModel):
+     _no_split_modules = ["AfmoeDecoderLayer"]
+ 
+     def __init__(self, config: AfmoeConfig):
+         super().__init__(config)
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+ 
+         self.embed_tokens = nn.Embedding(
+             config.vocab_size, config.hidden_size, self.padding_idx
+         )
+         self.layers = nn.ModuleList(
+             [
+                 AfmoeDecoderLayer(config, layer_idx)
+                 for layer_idx in range(config.num_hidden_layers)
+             ]
+         )
+         self.norm = AfmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.rotary_emb = AfmoeRotaryEmbedding(config=config)
+         self.gradient_checkpointing = False
+ 
+         self.post_init()
+ 
+     def get_input_embeddings(self):
+         return self.embed_tokens
+ 
+     def set_input_embeddings(self, value):
+         self.embed_tokens = value
+ 
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> MoeModelOutputWithPast:
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError(
+                 "You must specify exactly one of input_ids or inputs_embeds"
+             )
+ 
+         if use_cache and past_key_values is None:
+             past_key_values = DynamicCache()
+ 
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids)
+ 
+         if cache_position is None:
+             past_seen_tokens = (
+                 past_key_values.get_seq_length() if past_key_values is not None else 0
+             )
+             cache_position = torch.arange(
+                 past_seen_tokens,
+                 past_seen_tokens + inputs_embeds.shape[1],
+                 device=inputs_embeds.device,
+             )
+         if position_ids is None:
+             position_ids = cache_position.unsqueeze(0)
+ 
+         # It may already have been prepared by e.g. `generate`
+         if not isinstance(causal_mask_mapping := attention_mask, dict):
+             mask_kwargs = {
+                 "config": self.config,
+                 "input_embeds": inputs_embeds,
+                 "attention_mask": attention_mask,
+                 "cache_position": cache_position,
+                 "past_key_values": past_key_values,
+             }
+             causal_mask_mapping = {
+                 "full_attention": create_causal_mask(**mask_kwargs),
+                 "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+             }
+ 
+         hidden_states = inputs_embeds
+ 
+         # Apply muP input scaling if enabled
+         if self.config.mup_enabled:
+             hidden_states = hidden_states * (self.config.hidden_size**0.5)
+ 
+         position_embeddings = self.rotary_emb(hidden_states, position_ids)
+ 
+         for decoder_layer in self.layers:
+             hidden_states = decoder_layer(
+                 hidden_states,
+                 attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+                 position_ids=position_ids,
+                 past_key_value=past_key_values,
+                 use_cache=use_cache,
+                 cache_position=cache_position,
+                 position_embeddings=position_embeddings,
+                 **kwargs,
+             )
+ 
+         hidden_states = self.norm(hidden_states)
+         return MoeModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=past_key_values,
+         )
+ 
+ 
+ class AfmoeForCausalLM(AfmoePreTrainedModel, GenerationMixin):
+     _tied_weights_keys = ["lm_head.weight"]
+     _tp_plan = {"lm_head": "colwise_rep"}
+     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+ 
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = AfmoeModel(config)
+         self.vocab_size = config.vocab_size
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+ 
+         # Initialize weights and apply final processing
+         self.post_init()
+ 
+     def get_input_embeddings(self):
+         return self.model.embed_tokens
+ 
+     def set_input_embeddings(self, value):
+         self.model.embed_tokens = value
+ 
+     def get_output_embeddings(self):
+         return self.lm_head
+ 
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+ 
+     def set_decoder(self, decoder):
+         self.model = decoder
+ 
+     def get_decoder(self):
+         return self.model
+ 
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         logits_to_keep: Union[int, torch.Tensor] = 0,
+         token_type_ids: Optional[torch.Tensor] = None,  # will be ignored
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
+         outputs: MoeModelOutputWithPast = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             cache_position=cache_position,
+             **kwargs,
+         )
+ 
+         hidden_states = outputs.last_hidden_state
+         # Only compute necessary logits
+         slice_indices = (
+             slice(-logits_to_keep, None)
+             if isinstance(logits_to_keep, int)
+             else logits_to_keep
+         )
+         logits = self.lm_head(hidden_states[:, slice_indices, :])
+ 
+         loss = None
+         if labels is not None:
+             loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+ 
+         return MoeCausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+             router_logits=outputs.router_logits,
+         )
+ 
+ 
+ __all__ = [
+     "AfmoeForCausalLM",
+     "AfmoeModel",
+     "AfmoePreTrainedModel",
+ ]
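
A short smoke-test sketch tying the files in this commit together; it assumes they sit in one local directory ("." below) alongside the tokenizer files:

import torch
from transformers import AutoTokenizer
from modeling_afmoe import AfmoeForCausalLM

tokenizer = AutoTokenizer.from_pretrained(".")
model = AfmoeForCausalLM.from_pretrained(".").eval()

inputs = tokenizer("<|start_of_role|>user<|end_of_role|>Hi<|end_of_text|>\n", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=8, do_sample=False)
print(tokenizer.decode(out[0]))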
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|end_of_text|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|end_of_text|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|unk|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,783 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "100256": {
+       "content": "<|pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100257": {
+       "content": "<|end_of_text|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100258": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100259": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100260": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100261": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100262": {
+       "content": "<|filename|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100263": {
+       "content": "<|reponame|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100264": {
+       "content": "<|start_of_role|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100265": {
+       "content": "<|end_of_role|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100266": {
+       "content": "<|unused_1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100267": {
+       "content": "<|start_of_plugin|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100268": {
+       "content": "<|end_of_plugin|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100269": {
+       "content": "<|unk|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100270": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100271": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100272": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100273": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100274": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100275": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "100276": {
+       "content": "<think_on>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100277": {
+       "content": "<think_off>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100278": {
+       "content": "<schema>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100279": {
+       "content": "</schema>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100280": {
+       "content": "<tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100281": {
+       "content": "</tools>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100282": {
+       "content": "<documents>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100283": {
+       "content": "</documents>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100284": {
+       "content": "<|unused_15|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100285": {
+       "content": "<|unused_16|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100286": {
+       "content": "<|unused_17|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100287": {
+       "content": "<|unused_18|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100288": {
+       "content": "<|unused_19|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100289": {
+       "content": "<|unused_20|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100290": {
+       "content": "<|unused_21|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100291": {
+       "content": "<|unused_22|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100292": {
+       "content": "<|unused_23|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100293": {
+       "content": "<|unused_24|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100294": {
+       "content": "<|unused_25|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100295": {
+       "content": "<|unused_26|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100296": {
+       "content": "<|unused_27|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100297": {
+       "content": "<|unused_28|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100298": {
+       "content": "<|unused_29|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100299": {
+       "content": "<|unused_30|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100300": {
+       "content": "<|unused_31|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100301": {
+       "content": "<|unused_32|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100302": {
+       "content": "<|unused_33|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100303": {
+       "content": "<|unused_34|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100304": {
+       "content": "<|unused_35|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100305": {
+       "content": "<|unused_36|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100306": {
+       "content": "<|unused_37|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100307": {
+       "content": "<|unused_38|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100308": {
+       "content": "<|unused_39|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100309": {
+       "content": "<|unused_40|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100310": {
+       "content": "<|unused_41|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100311": {
+       "content": "<|unused_42|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100312": {
+       "content": "<|unused_43|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100313": {
+       "content": "<|unused_44|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100314": {
+       "content": "<|unused_45|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100315": {
+       "content": "<|unused_46|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100316": {
+       "content": "<|unused_47|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100317": {
+       "content": "<|unused_48|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100318": {
+       "content": "<|unused_49|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100319": {
+       "content": "<|unused_50|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100320": {
+       "content": "<|unused_51|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100321": {
+       "content": "<|unused_52|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100322": {
+       "content": "<|unused_53|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100323": {
+       "content": "<|unused_54|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100324": {
+       "content": "<|unused_55|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100325": {
+       "content": "<|unused_56|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100326": {
+       "content": "<|unused_57|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100327": {
+       "content": "<|unused_58|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100328": {
+       "content": "<|unused_59|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100329": {
+       "content": "<|unused_60|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100330": {
+       "content": "<|unused_61|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100331": {
+       "content": "<|unused_62|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100332": {
+       "content": "<|unused_63|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100333": {
+       "content": "<|unused_64|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100334": {
+       "content": "<|unused_65|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100335": {
+       "content": "<|unused_66|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100336": {
+       "content": "<|unused_67|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100337": {
+       "content": "<|unused_68|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100338": {
+       "content": "<|unused_69|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100339": {
+       "content": "<|unused_70|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100340": {
+       "content": "<|unused_71|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100341": {
+       "content": "<|unused_72|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100342": {
+       "content": "<|unused_73|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100343": {
+       "content": "<|unused_74|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100344": {
+       "content": "<|unused_75|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100345": {
+       "content": "<|unused_76|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100346": {
+       "content": "<|unused_77|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100347": {
+       "content": "<|unused_78|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100348": {
+       "content": "<|unused_79|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100349": {
+       "content": "<|unused_80|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100350": {
+       "content": "<|unused_81|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100351": {
+       "content": "<|unused_82|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|end_of_text|>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|end_of_text|>",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<|pad|>",
+   "padding_side": "left",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|unk|>"
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff