namespace-Pt committed on
Commit 0192243
1 Parent(s): 55963c0

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "_name_or_path": "ultragist-llama2-7b-chat",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_llama.LlamaConfig",
+     "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM"
+   },
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "ultragist_attend_prev": true,
+   "ultragist_attn": "step-expansion",
+   "ultragist_embed_init": "eos",
+   "ultragist_param": [
+     "q",
+     "k",
+     "v",
+     "o"
+   ],
+   "ultragist_ratio": [
+     2,
+     4,
+     8,
+     16,
+     32
+   ],
+   "ultragist_ratio_mix": "adapt-1024",
+   "ultragist_sink_size": 1,
+   "ultragist_stride": 1024,
+   "ultragist_window": 1024,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_position_embeddings": 4096,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 32,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.39.3",
+   "use_cache": true,
+   "vocab_size": 32000
+ }
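
Because `auto_map` in config.json points at the bundled configuration_llama.py and modeling_llama.py, loading this checkpoint goes through transformers' remote-code path. A minimal loading sketch; the repo id is an assumption inferred from the commit author and `_name_or_path`, while the API calls are standard transformers:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "namespace-Pt/ultragist-llama2-7b-chat"  # assumed repo id, substitute the actual one
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,       # required because auto_map points at the bundled .py files
    torch_dtype=torch.bfloat16,   # matches "torch_dtype": "bfloat16" in config.json
)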
configuration_llama.py ADDED
@@ -0,0 +1,217 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ LLaMA model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class LlamaConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the LLaMA-7B.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`LlamaModel`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
+             Llama 2 up to 4096, CodeLlama up to 16384.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             Padding token id.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             Beginning of stream token id.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             End of stream token id.
+         pretraining_tp (`int`, *optional*, defaults to 1):
+             Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+             document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+             necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+             issue](https://github.com/pytorch/pytorch/issues/76232).
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+             `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+             these scaling strategies behave:
+             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+             experimental feature, subject to breaking API changes in future versions.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import LlamaModel, LlamaConfig
+
+     >>> # Initializing a LLaMA llama-7b style configuration
+     >>> configuration = LlamaConfig()
+
+     >>> # Initializing a model from the llama-7b style configuration
+     >>> model = LlamaModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "llama"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         hidden_act="silu",
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=1,
+         eos_token_id=2,
+         pretraining_tp=1,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         attention_dropout=0.0,
+         ultragist_window=1024,
+         ultragist_stride=1024,
+         ultragist_attn="step-expansion",
+         ultragist_ratio=[2, 4, 8, 16, 32],
+         ultragist_ratio_mix="step-random",
+         ultragist_param=["q", "k", "v", "o"],
+         ultragist_embed_init="eos",
+         ultragist_sink_size=0,
+         ultragist_attend_prev=True,
+         retrieval_method=None,
+         retrieval_topk=None,
+         retrieval_key_length=None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.pretraining_tp = pretraining_tp
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self._rope_scaling_validation()
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+
+         self.ultragist_window = ultragist_window
+         self.ultragist_stride = ultragist_stride
+         self.ultragist_attn = ultragist_attn
+         self.ultragist_ratio = ultragist_ratio
+         self.ultragist_ratio_mix = ultragist_ratio_mix
+         self.ultragist_param = ultragist_param
+         self.ultragist_embed_init = ultragist_embed_init
+         self.ultragist_sink_size = ultragist_sink_size
+         self.ultragist_attend_prev = ultragist_attend_prev
+         self.retrieval_method = retrieval_method
+         self.retrieval_topk = retrieval_topk
+         self.retrieval_key_length = retrieval_key_length
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+             raise ValueError(
+                 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                 f"got {self.rope_scaling}"
+             )
+         rope_scaling_type = self.rope_scaling.get("type", None)
+         rope_scaling_factor = self.rope_scaling.get("factor", None)
+         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+             )
+         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+             raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
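
The class above is the stock transformers LlamaConfig extended with `ultragist_*` and `retrieval_*` fields, plus the usual `rope_scaling` validation. A short behavioural sketch, assuming the file is importable locally as `configuration_llama` (hypothetical import path):

from configuration_llama import LlamaConfig  # hypothetical local import of the file above

cfg = LlamaConfig(
    ultragist_ratio=[2, 4, 8, 16, 32],
    ultragist_ratio_mix="adapt-1024",  # the value this checkpoint's config.json stores
    ultragist_sink_size=1,
)
print(cfg.ultragist_attn)  # "step-expansion" (the default, also used by config.json)

# The rope_scaling validation rejects factors <= 1:
try:
    LlamaConfig(rope_scaling={"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)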
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "bos_token_id": 1,
+   "do_sample": true,
+   "eos_token_id": 2,
+   "max_length": 4096,
+   "pad_token_id": 0,
+   "temperature": 0.6,
+   "top_p": 0.9,
+   "transformers_version": "4.39.3"
+ }
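
These defaults are picked up by `generate()` unless overridden at call time, so sampling with temperature 0.6 and top-p 0.9 is the out-of-the-box behaviour. A sketch reusing the `model` and `tokenizer` objects from the loading sketch above (an assumption), with a hypothetical prompt:

prompt = "Summarize the key points of the meeting notes below."  # hypothetical prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# do_sample=True, temperature=0.6 and top_p=0.9 come from generation_config.json
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))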
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e724498a663c9192f2521771f8044dec09f1744f6f19ee970c5e654c8168b63
+ size 4932654472
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6dadb7e94374e54d0d1f80d386e823c528d738bfe33142f0240b198bf02e55a
+ size 4941051712
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82d498b097cc0925203403a79974562f65832e1543ebdaa9830eebbc8a389442
+ size 4974622888
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44d55d74a3b5a8283b72b5534846e020dff2fb92cbe115603cee0ae4220dbcb9
+ size 2923527888
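
The three-line files above are Git LFS pointers rather than the weights themselves: each records the SHA-256 (`oid`) and byte size of one shard. A small integrity-check sketch, assuming the real shard has been downloaded into the working directory (hypothetical local path):

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so multi-GB shards do not have to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "44d55d74a3b5a8283b72b5534846e020dff2fb92cbe115603cee0ae4220dbcb9"  # oid of shard 4 above
print(sha256_of("model-00004-of-00004.safetensors") == expected)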
model.safetensors.index.json ADDED
@@ -0,0 +1,427 @@
+ {
+   "metadata": {
+     "total_size": 17771806720
+   },
+   "weight_map": {
+     "lm_head.weight": "model-00004-of-00004.safetensors",
+     "model.ultragist_embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.norm.weight": "model-00004-of-00004.safetensors"
+   }
+ }
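
The index is plain JSON: `metadata.total_size` is the combined byte size of all shards (17,771,806,720 bytes) and `weight_map` maps every parameter name, including the extra `ultragist_*` projections, to the shard that stores it. transformers resolves sharded loading from this file automatically, but it can also be inspected directly; a sketch assuming a local copy of the file:

import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])  # 17771806720
print(index["weight_map"]["model.layers.0.self_attn.ultragist_q_proj.weight"])  # model-00001-of-00004.safetensors
print(Counter(index["weight_map"].values()))  # how many tensors live in each shard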
modeling_llama.py ADDED
@@ -0,0 +1,1461 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ PyTorch LLaMA model."""
+ import time
+ import math
+ import warnings
+ from typing import List, Optional, Tuple, Union, Mapping
+ from contextlib import nullcontext
+ from dataclasses import dataclass
+ from collections import defaultdict
+ from tqdm import tqdm
+ from accelerate import Accelerator
+
+ import os
+ import torch
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ from torch import nn
+
+ from transformers.activations import ACT2FN
+ from transformers.cache_utils import Cache
+ from transformers.modeling_attn_mask_utils import (
+     AttentionMaskConverter,
+     _prepare_4d_attention_mask,
+     _prepare_4d_causal_attention_mask,
+     _prepare_4d_causal_attention_mask_for_sdpa,
+ )
+ from transformers.modeling_outputs import BaseModelOutputWithPast
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+ from transformers.utils import (
+     add_start_docstrings,
+     add_start_docstrings_to_model_forward,
+     is_flash_attn_2_available,
+     is_flash_attn_greater_or_equal_2_10,
+     logging,
+     replace_return_docstrings,
+ )
+ from transformers.integrations import is_deepspeed_zero3_enabled
+ from transformers.utils.import_utils import is_torch_fx_available
+
+ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
+ # It means that the function will not be traced through and simply appear as a node in the graph.
+ if is_torch_fx_available():
+     if not is_torch_greater_or_equal_than_1_13:
+         import torch.fx
+
+     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+
+ from .configuration_llama import LlamaConfig
+ from .modeling_ultragist import Memory
+ from .modeling_utils import optional_grad_ctx, compute_loss, ModelOutput
+
+
+ logger = logging.get_logger(__name__)
+
+ _CONFIG_FOR_DOC = "LlamaConfig"
+
+
+ class LlamaRMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-6):
+         """
+         LlamaRMSNorm is equivalent to T5LayerNorm
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+
+
+ ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
+
+
+ class LlamaRotaryEmbedding(nn.Module):
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+         super().__init__()
+
+         self.dim = dim
+         self.max_position_embeddings = max_position_embeddings
+         self.base = base
+         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+         # Build here to make `torch.jit.trace` work.
+         self._set_cos_sin_cache(
+             seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+         )
+
+     def _set_cos_sin_cache(self, seq_len, device, dtype):
+         self.max_seq_len_cached = seq_len
+         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+         freqs = torch.outer(t, self.inv_freq)
+         # Different from paper, but it uses a different permutation in order to obtain the same calculation
+         emb = torch.cat((freqs, freqs), dim=-1)
+         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+     def forward(self, x, seq_len=None):
+         # x: [bs, num_attention_heads, seq_len, head_size]
+         if seq_len > self.max_seq_len_cached:
+             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+         return (
+             self.cos_cached[:seq_len].to(dtype=x.dtype),
+             self.sin_cached[:seq_len].to(dtype=x.dtype),
+         )
+
+
+ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
+     """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+         self.scaling_factor = scaling_factor
+         super().__init__(dim, max_position_embeddings, base, device)
+
+     def _set_cos_sin_cache(self, seq_len, device, dtype):
+         self.max_seq_len_cached = seq_len
+         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+         t = t / self.scaling_factor
+
+         freqs = torch.outer(t, self.inv_freq)
+         # Different from paper, but it uses a different permutation in order to obtain the same calculation
+         emb = torch.cat((freqs, freqs), dim=-1)
+         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+ class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
+     """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+         self.scaling_factor = scaling_factor
157
+ super().__init__(dim, max_position_embeddings, base, device)
158
+
159
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
160
+ self.max_seq_len_cached = seq_len
161
+
162
+ if seq_len > self.max_position_embeddings:
163
+ base = self.base * (
164
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
165
+ ) ** (self.dim / (self.dim - 2))
166
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
167
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
168
+
169
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
170
+
171
+ freqs = torch.outer(t, self.inv_freq)
172
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
173
+ emb = torch.cat((freqs, freqs), dim=-1)
174
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
175
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
176
+
177
+
178
+ def rotate_half(x):
179
+ """Rotates half the hidden dims of the input."""
180
+ x1 = x[..., : x.shape[-1] // 2]
181
+ x2 = x[..., x.shape[-1] // 2 :]
182
+ return torch.cat((-x2, x1), dim=-1)
183
+
184
+
185
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
186
+ """Applies Rotary Position Embedding to the query and key tensors.
187
+
188
+ Args:
189
+ q (`torch.Tensor`): The query tensor.
190
+ k (`torch.Tensor`): The key tensor.
191
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
192
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
193
+ position_ids (`torch.Tensor`):
194
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
195
+ used to pass offset position ids when working with a KV-cache.
196
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
197
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
198
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
199
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
200
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
201
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
202
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
203
+ Returns:
204
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
205
+ """
206
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
207
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
208
+ q_embed = (q * cos) + (rotate_half(q) * sin)
209
+ k_embed = (k * cos) + (rotate_half(k) * sin)
210
+ return q_embed, k_embed
211
+
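For intuition about how the rotation above broadcasts, here is a minimal, self-contained sketch with toy shapes (rotate_half is re-declared locally so the snippet runs on its own; none of this touches the model's weights):

    import torch

    def rotate_half(x):
        x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    bs, heads, seq_len, head_dim = 1, 2, 4, 8
    q = torch.randn(bs, heads, seq_len, head_dim)
    cos, sin = torch.randn(seq_len, head_dim), torch.randn(seq_len, head_dim)
    position_ids = torch.arange(seq_len).unsqueeze(0)    # [bs, seq_len]
    cos_sel = cos[position_ids].unsqueeze(1)             # [bs, 1, seq_len, head_dim], i.e. unsqueeze_dim=1
    sin_sel = sin[position_ids].unsqueeze(1)
    q_embed = q * cos_sel + rotate_half(q) * sin_sel     # broadcasts over the head dimension
    assert q_embed.shape == q.shape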
212
+ # Copied from streaming-llm
213
+ def apply_rotary_pos_emb_single(x, cos, sin, position_ids):
214
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
215
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
216
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
217
+ x_embed = (x * cos) + (rotate_half(x) * sin)
218
+ return x_embed
219
+
220
+
221
+ class LlamaMLP(nn.Module):
222
+ def __init__(self, config):
223
+ super().__init__()
224
+ self.config = config
225
+ self.hidden_size = config.hidden_size
226
+ self.intermediate_size = config.intermediate_size
227
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
228
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
229
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
230
+ self.act_fn = ACT2FN[config.hidden_act]
231
+
232
+ if "mlp" in config.ultragist_param:
233
+ self.ultragist_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
234
+ self.ultragist_up_proj.weight.data.zero_()
235
+ self.ultragist_up_proj._is_hf_initialized = True
236
+
237
+ self.ultragist_down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
238
+ self.ultragist_down_proj.weight.data.zero_()
239
+ self.ultragist_down_proj._is_hf_initialized = True
240
+
241
+ def _init_ultragist_proj(self, missing_keys):
242
+ """Initialize the ultragist projection weight with that of the ordinal projection."""
243
+ if "mlp" in self.config.ultragist_param:
244
+ if is_deepspeed_zero3_enabled():
245
+ import deepspeed
246
+ params = [self.up_proj.weight, self.down_proj.weight, self.ultragist_up_proj.weight, self.ultragist_down_proj.weight]
247
+ with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
248
+ if (self.ultragist_up_proj.weight.sum(-1) == 0).any():
249
+ self.ultragist_up_proj.weight.data[:] = self.up_proj.weight.data
250
+ self.ultragist_down_proj.weight.data[:] = self.down_proj.weight.data
251
+ else:
252
+ if any("ultragist_up_proj" in missing_key for missing_key in missing_keys):
253
+ # only copy the value in-place, without tying the weight
254
+ self.ultragist_up_proj.weight.data[:] = self.up_proj.weight.data
255
+ self.ultragist_down_proj.weight.data[:] = self.down_proj.weight.data
256
+
257
+ def forward(self, x, ultragist_size):
258
+ if self.config.pretraining_tp > 1:
259
+ # TODO: support pretraining_tp
260
+ raise NotImplementedError
261
+
262
+ slice = self.intermediate_size // self.config.pretraining_tp
263
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
264
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
265
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
266
+
267
+ gate_proj = torch.cat(
268
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
269
+ )
270
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
271
+
272
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
273
+ down_proj = [
274
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
275
+ ]
276
+ down_proj = sum(down_proj)
277
+
278
+ else:
279
+ if "mlp" in self.config.ultragist_param:
280
+ if ultragist_size > 0:
281
+ ordinal_hidden_states = x[:, :-ultragist_size]
282
+ ultragist_hidden_states = x[:, -ultragist_size:]
283
+
284
+ ordinal_down_proj = self.down_proj(self.act_fn(self.gate_proj(ordinal_hidden_states)) * self.up_proj(ordinal_hidden_states))
285
+ ultragist_down_proj = self.ultragist_down_proj(self.act_fn(self.gate_proj(ultragist_hidden_states)) * self.ultragist_up_proj(ultragist_hidden_states))
286
+ down_proj = torch.cat([ordinal_down_proj, ultragist_down_proj], dim=1)
287
+ else:
288
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
289
+ else:
290
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
291
+
292
+ return down_proj
293
+
294
+
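The ordinal/ultragist split used in the MLP above follows one pattern: the trailing ultragist_size positions go through the ultragist projections, everything else goes through the ordinary ones, and the two results are concatenated back along the sequence dimension. A minimal sketch with toy linear layers (not the model's actual weights):

    import torch

    hidden = torch.randn(1, 10, 16)       # 7 ordinal positions followed by 3 ultragist positions
    ultragist_size = 3
    up_proj = torch.nn.Linear(16, 32, bias=False)
    ultragist_up_proj = torch.nn.Linear(16, 32, bias=False)

    ordinal_out = up_proj(hidden[:, :-ultragist_size])
    ultragist_out = ultragist_up_proj(hidden[:, -ultragist_size:])
    out = torch.cat([ordinal_out, ultragist_out], dim=1)
    assert out.shape == (1, 10, 32)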
295
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
296
+ """
297
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
298
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
299
+ """
300
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
301
+ if n_rep == 1:
302
+ return hidden_states
303
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
304
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
305
+
306
+
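A quick numeric check of the equivalence stated in the repeat_kv docstring, with toy sizes:

    import torch

    bs, kv_heads, n_rep, seqlen, head_dim = 1, 2, 4, 3, 8
    kv = torch.randn(bs, kv_heads, seqlen, head_dim)
    expanded = kv[:, :, None, :, :].expand(bs, kv_heads, n_rep, seqlen, head_dim)
    expanded = expanded.reshape(bs, kv_heads * n_rep, seqlen, head_dim)
    assert torch.equal(expanded, torch.repeat_interleave(kv, repeats=n_rep, dim=1))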
307
+ class LlamaAttention(nn.Module):
308
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
309
+
310
+ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
311
+ super().__init__()
312
+ self.config = config
313
+ self.layer_idx = layer_idx
314
+ if layer_idx is None:
315
+ logger.warning_once(
316
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
317
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
318
+ "when creating this class."
319
+ )
320
+
321
+ self.attention_dropout = config.attention_dropout
322
+ self.hidden_size = config.hidden_size
323
+ self.num_heads = config.num_attention_heads
324
+ self.head_dim = self.hidden_size // self.num_heads
325
+ self.num_key_value_heads = config.num_key_value_heads
326
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
327
+ self.max_position_embeddings = config.max_position_embeddings
328
+ self.rope_theta = config.rope_theta
329
+ self.is_causal = True
330
+
331
+ if (self.head_dim * self.num_heads) != self.hidden_size:
332
+ raise ValueError(
333
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
334
+ f" and `num_heads`: {self.num_heads})."
335
+ )
336
+
337
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
338
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
339
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
340
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
341
+ self._init_rope()
342
+
343
+ # NOTE: add extra parameters for ultragist tokens
344
+ # skip post initialization to speed up loading
345
+ if "q" in config.ultragist_param:
346
+ self.ultragist_q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
347
+ # NOTE: initialize the ultragist parameters as zero
348
+ self.ultragist_q_proj.weight.data.zero_()
349
+ self.ultragist_q_proj._is_hf_initialized = True
350
+ if "k" in config.ultragist_param:
351
+ self.ultragist_k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
352
+ self.ultragist_k_proj.weight.data.zero_()
353
+ self.ultragist_k_proj._is_hf_initialized = True
354
+ if "v" in config.ultragist_param:
355
+ self.ultragist_v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
356
+ self.ultragist_v_proj.weight.data.zero_()
357
+ self.ultragist_v_proj._is_hf_initialized = True
358
+ if "o" in config.ultragist_param:
359
+ self.ultragist_o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
360
+ self.ultragist_o_proj.weight.data.zero_()
361
+ self.ultragist_o_proj._is_hf_initialized = True
362
+
363
+ def _init_rope(self):
364
+ if self.config.rope_scaling is None:
365
+ self.rotary_emb = LlamaRotaryEmbedding(
366
+ self.head_dim,
367
+ max_position_embeddings=self.max_position_embeddings,
368
+ base=self.rope_theta,
369
+ )
370
+ else:
371
+ scaling_type = self.config.rope_scaling["type"]
372
+ scaling_factor = self.config.rope_scaling["factor"]
373
+ if scaling_type == "linear":
374
+ self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
375
+ self.head_dim,
376
+ max_position_embeddings=self.max_position_embeddings,
377
+ scaling_factor=scaling_factor,
378
+ base=self.rope_theta,
379
+ )
380
+ elif scaling_type == "dynamic":
381
+ self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
382
+ self.head_dim,
383
+ max_position_embeddings=self.max_position_embeddings,
384
+ scaling_factor=scaling_factor,
385
+ base=self.rope_theta,
386
+ )
387
+ else:
388
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
389
+
390
+ def _init_ultragist_proj(self, missing_keys):
391
+ """Initialize the ultragist projection weight with that of the ordinal projection."""
392
+ ultragist_param = self.config.ultragist_param
393
+
394
+ if is_deepspeed_zero3_enabled():
395
+ import deepspeed
396
+ if "q" in ultragist_param:
397
+ with deepspeed.zero.GatheredParameters([self.ultragist_q_proj.weight, self.q_proj.weight], modifier_rank=0):
398
+ # FIXME: after deepspeed initialization, some weights become non-zero, but there are rows that are full of zeros
399
+ if (self.ultragist_q_proj.weight.sum(-1) == 0).any():
400
+ self.ultragist_q_proj.weight.data[:] = self.q_proj.weight.data
401
+ if "k" in ultragist_param:
402
+ with deepspeed.zero.GatheredParameters([self.ultragist_k_proj.weight, self.k_proj.weight], modifier_rank=0):
403
+ if (self.ultragist_k_proj.weight.sum(-1) == 0).any():
404
+ self.ultragist_k_proj.weight.data[:] = self.k_proj.weight.data
405
+ if "v" in ultragist_param:
406
+ with deepspeed.zero.GatheredParameters([self.ultragist_v_proj.weight, self.v_proj.weight], modifier_rank=0):
407
+ if (self.ultragist_v_proj.weight.sum(-1) == 0).any():
408
+ self.ultragist_v_proj.weight.data[:] = self.v_proj.weight.data
409
+ if "o" in ultragist_param:
410
+ with deepspeed.zero.GatheredParameters([self.ultragist_o_proj.weight, self.o_proj.weight], modifier_rank=0):
411
+ if (self.ultragist_o_proj.weight.sum(-1) == 0).any():
412
+ self.ultragist_o_proj.weight.data[:] = self.o_proj.weight.data
413
+ else:
414
+ # only copy the value in-place, without tying the weight
415
+ if "q" in ultragist_param and any("ultragist_q_proj" in missing_key for missing_key in missing_keys):
416
+ if (self.ultragist_q_proj.weight == 0).all():
417
+ self.ultragist_q_proj.weight.data[:] = self.q_proj.weight.data
418
+ if "k" in ultragist_param and any("ultragist_k_proj" in missing_key for missing_key in missing_keys):
419
+ if (self.ultragist_k_proj.weight == 0).all():
420
+ self.ultragist_k_proj.weight.data[:] = self.k_proj.weight.data
421
+ if "v" in ultragist_param and any("ultragist_v_proj" in missing_key for missing_key in missing_keys):
422
+ if (self.ultragist_v_proj.weight == 0).all():
423
+ self.ultragist_v_proj.weight.data[:] = self.v_proj.weight.data
424
+ if "o" in ultragist_param and any("ultragist_o_proj" in missing_key for missing_key in missing_keys):
425
+ if (self.ultragist_o_proj.weight == 0).all():
426
+ self.ultragist_o_proj.weight.data[:] = self.o_proj.weight.data
427
+
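The ultragist projections are created with zero weights and, when they are missing from the checkpoint, overwritten with the corresponding base weights by _init_ultragist_proj above, so a freshly extended model initially behaves exactly like the underlying projection. A minimal sketch of that copy-without-tying pattern with toy layers:

    import torch

    q_proj = torch.nn.Linear(8, 8, bias=False)
    ultragist_q_proj = torch.nn.Linear(8, 8, bias=False)
    ultragist_q_proj.weight.data.zero_()

    # copy values in place; the two parameters remain independent (no weight tying)
    if (ultragist_q_proj.weight == 0).all():
        ultragist_q_proj.weight.data[:] = q_proj.weight.data

    x = torch.randn(2, 8)
    assert torch.allclose(q_proj(x), ultragist_q_proj(x))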
428
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
429
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
430
+
431
+ def qkv_proj_with_ultragist(self, hidden_states, ultragist_size=0):
432
+ if ultragist_size > 0:
433
+ ordinal_hidden_states = hidden_states[:, :-ultragist_size]
434
+ ultragist_hidden_states = hidden_states[:, -ultragist_size:]
435
+
436
+ if "q" in self.config.ultragist_param:
437
+ ordinal_query_states = self.q_proj(ordinal_hidden_states)
438
+ ultragist_query_states = self.ultragist_q_proj(ultragist_hidden_states)
439
+ query_states = torch.cat([ordinal_query_states, ultragist_query_states], dim=1)
440
+ else:
441
+ query_states = self.q_proj(hidden_states)
442
+
443
+ if "k" in self.config.ultragist_param:
444
+ ordinal_key_states = self.k_proj(ordinal_hidden_states)
445
+ ultragist_key_states = self.ultragist_k_proj(ultragist_hidden_states)
446
+ key_states = torch.cat([ordinal_key_states, ultragist_key_states], dim=1)
447
+ else:
448
+ key_states = self.k_proj(hidden_states)
449
+
450
+ if "v" in self.config.ultragist_param:
451
+ ordinal_value_states = self.v_proj(ordinal_hidden_states)
452
+ ultragist_value_states = self.ultragist_v_proj(ultragist_hidden_states)
453
+ value_states = torch.cat([ordinal_value_states, ultragist_value_states], dim=1)
454
+ else:
455
+ value_states = self.v_proj(hidden_states)
456
+
457
+ else:
458
+ query_states = self.q_proj(hidden_states)
459
+ key_states = self.k_proj(hidden_states)
460
+ value_states = self.v_proj(hidden_states)
461
+
462
+ return query_states, key_states, value_states
463
+
464
+ def o_proj_with_ultragist(self, attn_output, ultragist_size=0):
465
+ if ultragist_size > 0:
466
+ if "o" in self.config.ultragist_param:
467
+ ordinal_attn_output = self.o_proj(attn_output[:, :-ultragist_size])
468
+ ultragist_attn_output = self.ultragist_o_proj(attn_output[:, -ultragist_size:])
469
+ attn_output = torch.cat([ordinal_attn_output, ultragist_attn_output], dim=1)
470
+ else:
471
+ attn_output = self.o_proj(attn_output)
472
+ else:
473
+ attn_output = self.o_proj(attn_output)
474
+ return attn_output
475
+
476
+ def forward(
477
+ self,
478
+ hidden_states: torch.Tensor,
479
+ attention_mask: Optional[torch.Tensor] = None,
480
+ position_ids: Optional[torch.LongTensor] = None,
481
+ past_key_value: Optional[Cache] = None,
482
+ output_attentions: bool = False,
483
+ use_cache: bool = False,
484
+ **kwargs,
485
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
486
+ if "padding_mask" in kwargs:
487
+ warnings.warn(
488
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
489
+ )
490
+
491
+ bsz, q_len, _ = hidden_states.size()
492
+ kv_seq_len = hidden_states.shape[-2]
493
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_value
494
+
495
+ if past_key is not None:
496
+ past_seq_len = past_key.shape[2]
497
+ kv_seq_len += past_seq_len
498
+ else:
499
+ past_seq_len = 0
500
+
501
+ query_states, key_states, value_states = self.qkv_proj_with_ultragist(hidden_states, total_ultragist_size)
502
+
503
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
504
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
505
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
506
+
507
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
508
+
509
+ # return keys and values before rope
510
+ # NOTE: incrementally return keys and values for efficiency
511
+ if window_size > 0:
512
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
513
+
514
+ if past_key is not None:
515
+ # reuse k, v, self_attention
516
+ key_states = torch.cat([past_key, key_states], dim=2)
517
+ value_states = torch.cat([past_value, value_states], dim=2)
518
+
519
+ # NOTE: window_size == 0 indicates the ultragist mechanism is disabled and the model works as is, so the new past_key_values should be concatenated with the old ones
520
+ if window_size == 0:
521
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
522
+
523
+ key_position_ids = position_ids
524
+ # align query position_ids with key
525
+ query_position_ids = key_position_ids[:, -q_len:]
526
+
527
+ key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
528
+ query_states = apply_rotary_pos_emb_single(query_states, cos, sin, query_position_ids)
529
+
530
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
531
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
532
+
533
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
534
+
535
+ # debug attention weights
536
+ # if q_len == 1:
537
+ # with open(f"data/debug/{self.layer_idx}.txt", "w") as f:
538
+ # torch.set_printoptions(profile="full",linewidth=10000000,precision=1,sci_mode=False)
539
+ # a = attn_weights.mean(1)
540
+ # f.write(f"past_length: {past_key.shape[2]}\nattn_weight: {a.shape}\n")
541
+ # f.write(str(a))
542
+ # torch.set_printoptions(profile="default")
543
+ # if self.layer_idx == self.config.num_hidden_layers - 1:
544
+ # print("this is time!!!")
545
+ # input()
546
+
547
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
548
+ raise ValueError(
549
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
550
+ f" {attn_weights.size()}"
551
+ )
552
+
553
+ if attention_mask is not None:
554
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
555
+ raise ValueError(
556
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
557
+ )
558
+ attn_weights = attn_weights + attention_mask
559
+
560
+ # upcast attention to fp32
561
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
562
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
563
+ attn_output = torch.matmul(attn_weights, value_states)
564
+
565
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
566
+ raise ValueError(
567
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
568
+ f" {attn_output.size()}"
569
+ )
570
+
571
+ # for debug
572
+ # if past_key.shape[2] == 128 and self.layer_idx == 0:
573
+ # torch.save({
574
+ # "hidden": hidden_states,
575
+ # "query": query_states,
576
+ # "key": key_states,
577
+ # "value": value_states,
578
+ # "output": attn_output,
579
+ # "query_position_ids": query_position_ids,
580
+ # "key_position_ids": key_position_ids,
581
+ # }, "attn-output.pt")
582
+
583
+ attn_output = attn_output.transpose(1, 2).contiguous()
584
+
585
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
586
+
587
+ attn_output = self.o_proj_with_ultragist(attn_output, total_ultragist_size)
588
+
589
+ if not output_attentions:
590
+ attn_weights = None
591
+
592
+ return attn_output, attn_weights, past_key_value
593
+
594
+
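The past_key_value consumed by the attention forward above is a plain per-layer 6-tuple rather than a transformers Cache object. A hedged sketch of its layout, inferred from how it is unpacked in this file (the variable names below are illustrative):

    # (past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
    #   past_key / past_value: [bs, num_kv_heads, past_len, head_dim], or None before anything is cached
    #   window_size == 0 disables the ultragist path and falls back to ordinary concatenated KV caching
    empty_layer_cache = (None, None, [0], 0, 0, 0)    # same default that _native_forward builds later
    num_hidden_layers = 32                            # assumption: one tuple per decoder layer
    past_key_values = [empty_layer_cache for _ in range(num_hidden_layers)]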
595
+ class LlamaSdpaAttention(LlamaAttention):
596
+ """
597
+ Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
598
+ `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
599
+ SDPA API.
600
+ """
601
+
602
+ # Adapted from LlamaAttention.forward
603
+ def forward(
604
+ self,
605
+ hidden_states: torch.Tensor,
606
+ attention_mask: Optional[torch.Tensor] = None,
607
+ position_ids: Optional[torch.LongTensor] = None,
608
+ past_key_value: Optional[Cache] = None,
609
+ output_attentions: bool = False,
610
+ use_cache: bool = False,
611
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
612
+ if output_attentions:
613
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
614
+ logger.warning_once(
615
+ "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
616
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
617
+ )
618
+ return super().forward(
619
+ hidden_states=hidden_states,
620
+ attention_mask=attention_mask,
621
+ position_ids=position_ids,
622
+ past_key_value=past_key_value,
623
+ output_attentions=output_attentions,
624
+ use_cache=use_cache,
625
+ )
626
+ bsz, q_len, _ = hidden_states.size()
627
+ kv_seq_len = hidden_states.shape[-2]
628
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_value
629
+ if past_key is not None:
630
+ past_seq_len = past_key.shape[2]
631
+ kv_seq_len += past_seq_len
632
+ else:
633
+ past_seq_len = 0
634
+
635
+ query_states, key_states, value_states = self.qkv_proj_with_ultragist(hidden_states, total_ultragist_size)
636
+
637
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
638
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
639
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
640
+
641
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
642
+
643
+ # return keys and values before rope
644
+ # NOTE: incrementally return keys and values for efficiency
645
+ if window_size > 0:
646
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
647
+
648
+ if past_key is not None:
649
+ # reuse k, v, self_attention
650
+ key_states = torch.cat([past_key, key_states], dim=2)
651
+ value_states = torch.cat([past_value, value_states], dim=2)
652
+
653
+ # NOTE: window_size == 0 indicates the ultragist mechanism is disabled and the model works as is, so the new past_key_values should be concatenated with the old ones
654
+ if window_size == 0:
655
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
656
+
657
+ key_position_ids = position_ids
658
+ # align query position_ids with key
659
+ query_position_ids = key_position_ids[:, -q_len:]
660
+
661
+ key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
662
+ query_states = apply_rotary_pos_emb_single(query_states, cos, sin, query_position_ids)
663
+
664
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
665
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
666
+
667
+ if attention_mask is not None:
668
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
669
+ raise ValueError(
670
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
671
+ )
672
+
673
+ # if self.layer_idx == 0 and past_key is None:
674
+ # with open(f"attention_mask.txt", "w") as f:
675
+ # torch.set_printoptions(profile="full",linewidth=10000000,precision=1,sci_mode=False)
676
+ # a = attention_mask
677
+ # f.write(str(a))
678
+ # torch.set_printoptions(profile="default")
679
+ # with open(f"position_ids.txt", "w") as f:
680
+ # torch.set_printoptions(profile="full",linewidth=10000000,precision=1,sci_mode=False)
681
+ # a = position_ids
682
+ # f.write(str(a))
683
+ # torch.set_printoptions(profile="default")
684
+
685
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
686
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
687
+ if query_states.device.type == "cuda" and attention_mask is not None:
688
+ query_states = query_states.contiguous()
689
+ key_states = key_states.contiguous()
690
+ value_states = value_states.contiguous()
691
+
692
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
693
+ query_states,
694
+ key_states,
695
+ value_states,
696
+ attn_mask=attention_mask,
697
+ dropout_p=self.attention_dropout if self.training else 0.0,
698
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
699
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
700
+ )
701
+
702
+ # for debug
703
+ # if past_key is not None and past_key.shape[2] == 128 and self.layer_idx == 0:
704
+ # torch.save({
705
+ # "hidden": hidden_states,
706
+ # "query": query_states,
707
+ # "key": key_states,
708
+ # "value": value_states,
709
+ # "output": attn_output,
710
+ # "query_position_ids": query_position_ids,
711
+ # "key_position_ids": key_position_ids,
712
+ # }, "attn-output.pt")
713
+
714
+ attn_output = attn_output.transpose(1, 2).contiguous()
715
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
716
+ attn_output = self.o_proj_with_ultragist(attn_output, total_ultragist_size)
717
+
718
+ return attn_output, None, past_key_value
719
+
720
+
721
+ LLAMA_ATTENTION_CLASSES = {
722
+ "eager": LlamaAttention,
723
+ "sdpa": LlamaSdpaAttention,
724
+ }
725
+
726
+
727
+ class LlamaDecoderLayer(nn.Module):
728
+ def __init__(self, config: LlamaConfig, layer_idx: int):
729
+ super().__init__()
730
+ self.hidden_size = config.hidden_size
731
+
732
+ self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
733
+
734
+ self.mlp = LlamaMLP(config)
735
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
736
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
737
+
738
+ def forward(
739
+ self,
740
+ hidden_states: torch.Tensor,
741
+ attention_mask: Optional[torch.Tensor] = None,
742
+ position_ids: Optional[torch.LongTensor] = None,
743
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
744
+ output_attentions: Optional[bool] = False,
745
+ use_cache: Optional[bool] = False,
746
+ **kwargs,
747
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
748
+ """
749
+ Args:
750
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
751
+ attention_mask (`torch.FloatTensor`, *optional*):
752
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
753
+ query_sequence_length, key_sequence_length)` if default attention is used.
754
+ output_attentions (`bool`, *optional*):
755
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
756
+ returned tensors for more detail.
757
+ use_cache (`bool`, *optional*):
758
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
759
+ (see `past_key_values`).
760
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
761
+ """
762
+ if "padding_mask" in kwargs:
763
+ warnings.warn(
764
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
765
+ )
766
+
767
+ # NOTE: get ultragist_size in case the mlp is included in ultragist_param
768
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_value
769
+
770
+ residual = hidden_states
771
+
772
+ hidden_states = self.input_layernorm(hidden_states)
773
+
774
+ # Self Attention
775
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
776
+ hidden_states=hidden_states,
777
+ attention_mask=attention_mask,
778
+ position_ids=position_ids,
779
+ past_key_value=past_key_value,
780
+ output_attentions=output_attentions,
781
+ use_cache=use_cache,
782
+ **kwargs,
783
+ )
784
+ hidden_states = residual + hidden_states
785
+
786
+ # Fully Connected
787
+ residual = hidden_states
788
+ hidden_states = self.post_attention_layernorm(hidden_states)
789
+ hidden_states = self.mlp(hidden_states, total_ultragist_size)
790
+ hidden_states = residual + hidden_states
791
+
792
+ outputs = (hidden_states,)
793
+
794
+ if output_attentions:
795
+ outputs += (self_attn_weights,)
796
+
797
+ if use_cache:
798
+ outputs += (present_key_value,)
799
+
800
+ return outputs
801
+
802
+
803
+ LLAMA_START_DOCSTRING = r"""
804
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
805
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
806
+ etc.)
807
+
808
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
809
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
810
+ and behavior.
811
+
812
+ Parameters:
813
+ config ([`LlamaConfig`]):
814
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
815
+ load the weights associated with the model, only the configuration. Check out the
816
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
817
+ """
818
+
819
+
820
+ @add_start_docstrings(
821
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
822
+ LLAMA_START_DOCSTRING,
823
+ )
824
+ class LlamaPreTrainedModel(PreTrainedModel):
825
+ config_class = LlamaConfig
826
+ base_model_prefix = "model"
827
+ supports_gradient_checkpointing = True
828
+ _no_split_modules = ["LlamaDecoderLayer"]
829
+ _skip_keys_device_placement = "past_key_values"
830
+ _supports_sdpa = True
831
+ _supports_cache_class = True
832
+
833
+ def _init_weights(self, module):
834
+ std = self.config.initializer_range
835
+ if isinstance(module, nn.Linear):
836
+ module.weight.data.normal_(mean=0.0, std=std)
837
+ if module.bias is not None:
838
+ module.bias.data.zero_()
839
+ elif isinstance(module, nn.Embedding):
840
+ module.weight.data.normal_(mean=0.0, std=std)
841
+ if module.padding_idx is not None:
842
+ module.weight.data[module.padding_idx].zero_()
843
+
844
+
845
+ LLAMA_INPUTS_DOCSTRING = r"""
846
+ Args:
847
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
848
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
849
+ it.
850
+
851
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
852
+ [`PreTrainedTokenizer.__call__`] for details.
853
+
854
+ [What are input IDs?](../glossary#input-ids)
855
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
856
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
857
+
858
+ - 1 for tokens that are **not masked**,
859
+ - 0 for tokens that are **masked**.
860
+
861
+ [What are attention masks?](../glossary#attention-mask)
862
+
863
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
864
+ [`PreTrainedTokenizer.__call__`] for details.
865
+
866
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
867
+ `past_key_values`).
868
+
869
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
870
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
871
+ information on the default strategy.
872
+
873
+ - 1 indicates the head is **not masked**,
874
+ - 0 indicates the head is **masked**.
875
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
876
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
877
+ config.n_positions - 1]`.
878
+
879
+ [What are position IDs?](../glossary#position-ids)
880
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
881
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
882
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
883
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
884
+
885
+ Two formats are allowed:
886
+ - a [`~cache_utils.Cache`] instance;
887
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
888
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
889
+ cache format.
890
+
891
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
892
+ legacy cache format will be returned.
893
+
894
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
895
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
896
+ of shape `(batch_size, sequence_length)`.
897
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
898
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
899
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
900
+ model's internal embedding lookup matrix.
901
+ use_cache (`bool`, *optional*):
902
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
903
+ `past_key_values`).
904
+ output_attentions (`bool`, *optional*):
905
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
906
+ tensors for more detail.
907
+ output_hidden_states (`bool`, *optional*):
908
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
909
+ more detail.
910
+ return_dict (`bool`, *optional*):
911
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
912
+ """
913
+
914
+
915
+ @add_start_docstrings(
916
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
917
+ LLAMA_START_DOCSTRING,
918
+ )
919
+ class LlamaModel(LlamaPreTrainedModel):
920
+ """
921
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
922
+
923
+ Args:
924
+ config: LlamaConfig
925
+ """
926
+
927
+ def __init__(self, config: LlamaConfig):
928
+ super().__init__(config)
929
+ self.padding_idx = config.pad_token_id
930
+ self.vocab_size = config.vocab_size
931
+
932
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
933
+
934
+ # ultragist: add ultragist embedding
935
+ self.ultragist_embed_tokens = nn.Embedding(1, config.hidden_size, self.padding_idx)
936
+ self.ultragist_embed_tokens._is_hf_initialized = True
937
+
938
+ self.layers = nn.ModuleList(
939
+ [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
940
+ )
941
+ self._use_sdpa = config._attn_implementation == "sdpa"
942
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
943
+
944
+ self.gradient_checkpointing = False
945
+ # Initialize weights and apply final processing
946
+ self.post_init()
947
+
948
+ def _init_ultragist_embed(self, missing_keys):
949
+ """Initialize the ultragist token embedding with that of the eos token."""
950
+ if is_deepspeed_zero3_enabled():
951
+ import deepspeed
952
+ params = [self.ultragist_embed_tokens.weight, self.embed_tokens.weight]
953
+ with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
954
+ # deepspeed will initialize the parameters to zero
955
+ if (self.ultragist_embed_tokens.weight == 0).all():
956
+ if self.config.ultragist_embed_init == "bos":
957
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.bos_token_id]
958
+ elif self.config.ultragist_embed_init == "eos":
959
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.eos_token_id]
960
+ else:
961
+ raise NotImplementedError(f"Make sure ultragist_embed_init is either eos or bos, found {self.config.ultragist_embed_init}")
962
+ else:
963
+ if any("ultragist_embed_tokens" in missing_key for missing_key in missing_keys):
964
+ if self.config.ultragist_embed_init == "bos":
965
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.bos_token_id]
966
+ elif self.config.ultragist_embed_init == "eos":
967
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.eos_token_id]
968
+ else:
969
+ raise NotImplementedError(f"Make sure ultragist_embed_init is either eos or bos, found {self.config.ultragist_embed_init}")
970
+
971
+ def get_input_embeddings(self):
972
+ return self.embed_tokens
973
+
974
+ def set_input_embeddings(self, value):
975
+ self.embed_tokens = value
976
+
977
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
978
+ def forward(
979
+ self,
980
+ input_ids: torch.LongTensor = None,
981
+ attention_mask: Optional[torch.Tensor] = None,
982
+ position_ids: Optional[torch.LongTensor] = None,
983
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
984
+ inputs_embeds: Optional[torch.FloatTensor] = None,
985
+ use_cache: Optional[bool] = None,
986
+ output_attentions: Optional[bool] = None,
987
+ output_hidden_states: Optional[bool] = None,
988
+ return_dict: Optional[bool] = None,
989
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
990
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
991
+ output_hidden_states = (
992
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
993
+ )
994
+ # ultragist: always use cache
995
+ use_cache = True
996
+
997
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
998
+
999
+ # retrieve input_ids and inputs_embeds
1000
+ if input_ids is not None and inputs_embeds is not None:
1001
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1002
+ elif input_ids is not None:
1003
+ batch_size, seq_length = input_ids.shape[:2]
1004
+ elif inputs_embeds is not None:
1005
+ batch_size, seq_length = inputs_embeds.shape[:2]
1006
+ else:
1007
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1008
+
1009
+ # ultragist: create position_ids for all keys including past_keys
1010
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1011
+ seq_length_with_past = seq_length
1012
+ past_key_values_length = 0
1013
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_values[0]
1014
+
1015
+ if past_key is not None:
1016
+ past_key_values_length = past_key.shape[2]
1017
+ seq_length_with_past = seq_length_with_past + past_key_values_length
1018
+
1019
+ # ultragist: separately embed ordinal tokens and ultragist tokens because ordinal tokens do not receive gradients
1020
+ if total_ultragist_size > 0:
1021
+ ordinal_input_ids = input_ids[:, :-total_ultragist_size]
1022
+ ultragist_input_ids = input_ids[:, -total_ultragist_size:]
1023
+ ordinal_inputs_embeds = self.embed_tokens(ordinal_input_ids)
1024
+ # offset the ultragist token ids by vocab_size so they index the newly initialized ultragist embedding table
1025
+ ultragist_input_embeds = self.ultragist_embed_tokens(ultragist_input_ids - self.config.vocab_size)
1026
+ inputs_embeds = torch.cat([ordinal_inputs_embeds, ultragist_input_embeds], dim=1)
1027
+ else:
1028
+ inputs_embeds = self.embed_tokens(input_ids)
1029
+
1030
+ # when total_ultragist_size > 0, we need to modify attention mask
1031
+ if self._use_sdpa and not output_attentions and total_ultragist_size == 0:
1032
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
1033
+ # the manual implementation that requires a 4D causal mask in all cases.
1034
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1035
+ attention_mask,
1036
+ (batch_size, seq_length),
1037
+ inputs_embeds,
1038
+ past_key_values_length,
1039
+ )
1040
+ else:
1041
+ # 4d mask is passed through the layers
1042
+ attention_mask = _prepare_4d_causal_attention_mask(
1043
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
1044
+ )
1045
+
1046
+ position_ids = torch.arange(seq_length_with_past, dtype=torch.long, device=device).repeat(batch_size, 1)
1047
+
1048
+ # prepare attention mask and position ids for ultragists
1049
+ # NOTE: we must modify the position_ids here instead of inside the self_attn forward, otherwise the tensor version of position_ids becomes incompatible when gradient checkpointing is enabled
1050
+ if total_ultragist_size > 0:
1051
+ # number of tokens to condense by the ultragists
1052
+ condensing_size = window_size - raw_size_to_cache
1053
+ # number of tokens in current window (containing cached raw activations)
1054
+ window_size_with_ultragist = window_size + total_ultragist_size
1055
+ # number of ultragists in cache
1056
+ memory_size = seq_length_with_past - window_size_with_ultragist
1057
+ min_value = torch.finfo(inputs_embeds.dtype).min
1058
+
1059
+ ultragist_start_idx = -total_ultragist_size
1060
+
1061
+ # batch_size, head_num, window_size
1062
+ reference_attention_mask = attention_mask[..., -total_ultragist_size - 1, -window_size_with_ultragist: -total_ultragist_size]
1063
+
1064
+ for ultragist_size in ultragist_sizes:
1065
+ # in this case, the activations of ordinal tokens are used as ultragist activations
1066
+ if ultragist_size < 0:
1067
+ continue
1068
+
1069
+ token_per_ultragist = condensing_size // ultragist_size
1070
+
1071
+ # the end_idx may be -0, in that case, use max instead
1072
+ ultragist_end_idx = ultragist_start_idx + ultragist_size
1073
+ if ultragist_end_idx == 0:
1074
+ ultragist_end_idx = torch.iinfo(torch.long).max
1075
+
1076
+ if self.config.ultragist_attn == "step-expansion":
1077
+ # each ultragist can attend to one more sub-interval than its predecessor
1078
+
1079
+ # token_per_ultragist, 2 * token_per_ultragist, ..., ultragist_size * token_per_ultragist
1080
+ ultragist_arange = torch.arange(1, ultragist_size + 1, device=device) * token_per_ultragist
1081
+ # 0, 1, 2, ..., window_size - 1
1082
+ ordinal_arange = torch.arange(window_size, device=device)
1083
+ # ultragist_size, window_size
1084
+ valid_pos = ordinal_arange.expand(ultragist_size, window_size) < ultragist_arange.unsqueeze(-1)
1085
+ # ultragist_size, window_size
1086
+ ordinal_attention_mask = torch.where(valid_pos, 0, min_value)
1087
+ # NOTE: add reference attention_mask so that padding tokens are considered
1088
+ ordinal_attention_mask = ordinal_attention_mask[None, None, ...] + reference_attention_mask.unsqueeze(-2)
1089
+
1090
+ if self.config.ultragist_attend_prev:
1091
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).triu(1)
1092
+ # the ultragist token is next to the last ordinal token it attends to
1093
+ ultragist_position_ids = torch.arange(token_per_ultragist, token_per_ultragist * ultragist_size + 1, token_per_ultragist) + memory_size
1094
+ ultragist_position_ids = ultragist_position_ids + torch.arange(ultragist_size)
1095
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1096
+ else:
1097
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).fill_diagonal_(0)
1098
+ # the ultragist token is next to the last ordinal token it attends to
1099
+ ultragist_position_ids = torch.arange(token_per_ultragist, token_per_ultragist * ultragist_size + 1, token_per_ultragist) + memory_size
1100
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1101
+
1102
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -window_size_with_ultragist: -total_ultragist_size] = ordinal_attention_mask
1103
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, ultragist_start_idx: ultragist_end_idx] = ultragist_attention_mask
1104
+ # ultragists of different ratios are blind to others
1105
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -total_ultragist_size: ultragist_start_idx] = min_value
1106
+
1107
+ elif self.config.ultragist_attn == "segmentation":
1108
+ # each ultragist can attend to its corresponding sub-interval
1109
+
1110
+ # ultragist_size, token_per_ultragist
1111
+ indices = torch.arange(token_per_ultragist * ultragist_size, device=device).view(ultragist_size, -1)
1112
+ # ultragist_size, window_size
1113
+ ordinal_attention_mask = attention_mask.new_full((ultragist_size, window_size), min_value)
1114
+ ordinal_attention_mask.scatter_(dim=-1, index=indices, value=0)
1115
+
1116
+ # NOTE: add reference attention_mask so that padding tokens are considered
1117
+ ordinal_attention_mask = ordinal_attention_mask[None, None, ...] + reference_attention_mask.unsqueeze(-2)
1118
+
1119
+ if self.config.ultragist_attend_prev:
1120
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).triu(1)
1121
+ # the ultragist token is next to the last ordinal token it attends to
1122
+ ultragist_position_ids = position_ids.new_full((ultragist_size,), fill_value=token_per_ultragist + memory_size)
1123
+ ultragist_position_ids = ultragist_position_ids + torch.arange(ultragist_size, device=device)
1124
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1125
+ else:
1126
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).fill_diagonal_(0)
1127
+ # the ultragist token is next to the last ordinal token it attends to
1128
+ ultragist_position_ids = position_ids.new_full((ultragist_size,), fill_value=token_per_ultragist + memory_size)
1129
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1130
+
1131
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -window_size_with_ultragist: -total_ultragist_size] = ordinal_attention_mask
1132
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, ultragist_start_idx: ultragist_end_idx] = ultragist_attention_mask
1133
+ # ultragists of different ratios are blind to others
1134
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -total_ultragist_size: ultragist_start_idx] = min_value
1135
+
1136
+ elif self.config.ultragist_attn == "full-coverage":
1137
+ pass
1138
+
1139
+ else:
1140
+ raise NotImplementedError
1141
+
1142
+ ultragist_start_idx = ultragist_end_idx
1143
+
1144
+ # print(f"total_ultragist_size: {total_ultragist_size}")
1145
+ # print(f"raw_size_to_cache: {raw_size_to_cache}")
1146
+ # print(f"position_ids: {position_ids}")
1147
+ # print(f"attention_mask:\n{attention_mask}")
1148
+ # x = input()
1149
+ # if x == "s":
1150
+ # return
1151
+
1152
+ # embed positions
1153
+ hidden_states = inputs_embeds
1154
+
1155
+ # decoder layers
1156
+ all_hidden_states = () if output_hidden_states else None
1157
+ all_self_attns = () if output_attentions else None
1158
+ # ultragist: still use tuple to organize cache
1159
+ next_decoder_cache = () if use_cache else None
1160
+
1161
+ for idx, decoder_layer in enumerate(self.layers):
1162
+ if output_hidden_states:
1163
+ all_hidden_states += (hidden_states,)
1164
+
1165
+ # ultragist: slice out the past_key_value of the corresponding layer
1166
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
1167
+
1168
+ if self.gradient_checkpointing and self.training:
1169
+ layer_outputs = self._gradient_checkpointing_func(
1170
+ decoder_layer.__call__,
1171
+ hidden_states,
1172
+ attention_mask,
1173
+ position_ids,
1174
+ past_key_value,
1175
+ output_attentions,
1176
+ use_cache,
1177
+ )
1178
+ else:
1179
+ layer_outputs = decoder_layer(
1180
+ hidden_states,
1181
+ attention_mask=attention_mask,
1182
+ position_ids=position_ids,
1183
+ past_key_value=past_key_value,
1184
+ output_attentions=output_attentions,
1185
+ use_cache=use_cache,
1186
+ )
1187
+
1188
+ hidden_states = layer_outputs[0]
1189
+
1190
+ if use_cache:
1191
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
1192
+
1193
+ if output_attentions:
1194
+ all_self_attns += (layer_outputs[1],)
1195
+
1196
+ hidden_states = self.norm(hidden_states)
1197
+
1198
+ # add hidden states from the last decoder layer
1199
+ if output_hidden_states:
1200
+ all_hidden_states += (hidden_states,)
1201
+
1202
+ next_cache = next_decoder_cache if use_cache else None
1203
+
1204
+ if not return_dict:
1205
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1206
+ return BaseModelOutputWithPast(
1207
+ last_hidden_state=hidden_states,
1208
+ past_key_values=next_cache,
1209
+ hidden_states=all_hidden_states,
1210
+ attentions=all_self_attns,
1211
+ )
1212
+
1213
+
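For intuition about the step-expansion branch in LlamaModel.forward above, a small numeric sketch of the ordinal part of the mask: ultragist i may attend to the first (i + 1) * token_per_ultragist ordinal tokens of the window. Toy sizes only; min_value stands in for the large negative additive mask value:

    import torch

    window_size, ultragist_size = 8, 4
    token_per_ultragist = window_size // ultragist_size   # here the whole window is condensed
    min_value = float("-inf")

    ultragist_arange = torch.arange(1, ultragist_size + 1) * token_per_ultragist   # 2, 4, 6, 8
    ordinal_arange = torch.arange(window_size)                                     # 0 .. 7
    valid_pos = ordinal_arange.expand(ultragist_size, window_size) < ultragist_arange.unsqueeze(-1)
    mask = torch.full((ultragist_size, window_size), min_value)
    mask[valid_pos] = 0.0
    print(mask)   # row i has zeros in its first (i + 1) * token_per_ultragist columns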
1214
+ class LlamaForCausalLM(LlamaPreTrainedModel):
1215
+ _tied_weights_keys = ["lm_head.weight"]
1216
+
1217
+ def __init__(self, config):
1218
+ super().__init__(config)
1219
+ self.model = LlamaModel(config)
1220
+ self.vocab_size = config.vocab_size
1221
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1222
+ # Initialize weights and apply final processing
1223
+ self.post_init()
1224
+
1225
+ def get_input_embeddings(self):
1226
+ return self.model.embed_tokens
1227
+
1228
+ def set_input_embeddings(self, value):
1229
+ self.model.embed_tokens = value
1230
+
1231
+ def get_output_embeddings(self):
1232
+ return self.lm_head
1233
+
1234
+ def set_output_embeddings(self, new_embeddings):
1235
+ self.lm_head = new_embeddings
1236
+
1237
+ def set_decoder(self, decoder):
1238
+ self.model = decoder
1239
+
1240
+ def get_decoder(self):
1241
+ return self.model
1242
+
1243
+ @classmethod
1244
+ def from_pretrained(cls, *args, **kwargs):
1245
+ """Override the default from_pretrained to extend vocab size according to ultragist_size."""
1246
+ kwargs.update(output_loading_info=True)
1247
+ model, loading_info = super().from_pretrained(*args, **kwargs)
1248
+
1249
+ # NOTE: set memory after from_pretrained because there may be another transformer model inside the Memory object, which may cause weird errors during loading
1250
+ config = model.config
1251
+ model.memory = Memory(
1252
+ model_config=config,
1253
+ k_seq_dim=2,
1254
+ v_seq_dim=2,
1255
+ )
1256
+
1257
+ missing_keys = loading_info["missing_keys"]
1258
+ # NOTE: the ultragist parameters may or may not be loaded from the checkpoint
1259
+ # if they are loaded from the checkpoint, we should not re-initialize them
1260
+ model.model._init_ultragist_embed(missing_keys)
1261
+ # initialize weights of possible q,k,v,o,mlp
1262
+ for layer in model.model.layers:
1263
+ layer.self_attn._init_ultragist_proj(missing_keys)
1264
+ layer.mlp._init_ultragist_proj(missing_keys)
1265
+
1266
+ return model
1267
+
1268
+ def _native_forward(
1269
+ self,
1270
+ input_ids: torch.LongTensor = None,
1271
+ attention_mask: Optional[torch.Tensor] = None,
1272
+ position_ids: Optional[torch.LongTensor] = None,
1273
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1274
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1275
+ labels: Optional[torch.LongTensor] = None,
1276
+ shift_labels: Optional[bool] = True,
1277
+ use_cache: Optional[bool] = None,
1278
+ output_attentions: Optional[bool] = None,
1279
+ output_hidden_states: Optional[bool] = None,
1280
+ return_dict: Optional[bool] = None,
1281
+ ) -> Union[Tuple, ModelOutput]:
1282
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1283
+ output_hidden_states = (
1284
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1285
+ )
1286
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1287
+
1288
+ # when we directly call _native_forward, the past_key_values would be None
1289
+ if past_key_values is None:
1290
+ # NOTE: set window size to 0, so that new past_key_values are returned properly, see LlamaAttention.forward
1291
+ past_key_values = [(None, None, [0], 0, 0, 0) for _ in range(self.config.num_hidden_layers)]
1292
+
1293
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1294
+ outputs = self.model(
1295
+ input_ids=input_ids,
1296
+ attention_mask=attention_mask,
1297
+ position_ids=position_ids,
1298
+ past_key_values=past_key_values,
1299
+ inputs_embeds=inputs_embeds,
1300
+ use_cache=use_cache,
1301
+ output_attentions=output_attentions,
1302
+ output_hidden_states=output_hidden_states,
1303
+ return_dict=return_dict,
1304
+ )
1305
+
1306
+ hidden_states = outputs[0]
1307
+ if self.config.pretraining_tp > 1:
1308
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1309
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
1310
+ logits = torch.cat(logits, dim=-1)
1311
+ else:
1312
+ logits = self.lm_head(hidden_states)
1313
+ logits = logits.float()
1314
+
1315
+ loss = None
1316
+ batch_loss = None
1317
+ valid_token_num = None
1318
+
1319
+ if labels is not None:
1320
+ loss, batch_loss, valid_token_num = compute_loss(logits, labels, shift=shift_labels)
1321
+
1322
+ if not return_dict:
1323
+ output = (logits,) + outputs[1:]
1324
+ return (loss,) + output if loss is not None else output
1325
+
1326
+ return ModelOutput(
1327
+ loss=loss,
1328
+ batch_loss=batch_loss,
1329
+ valid_token_num=valid_token_num,
1330
+ logits=logits,
1331
+ past_key_values=outputs.past_key_values,
1332
+ hidden_states=outputs.hidden_states,
1333
+ attentions=outputs.attentions,
1334
+ )
1335
+
1336
+ def _ultragist_forward(self,
1337
+ input_ids: torch.LongTensor = None,
1338
+ attention_mask: Optional[torch.Tensor] = None,
1339
+ position_ids: Optional[torch.LongTensor] = None,
1340
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1341
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1342
+ labels: Optional[torch.LongTensor] = None,
1343
+ use_cache: Optional[bool] = None,
1344
+ output_attentions: Optional[bool] = None,
1345
+ output_hidden_states: Optional[bool] = None,
1346
+ return_dict: Optional[bool] = None,
1347
+ ):
1348
+ # t1 = time.time()
1349
+ # initialize cache
1350
+ self.memory.prepare(
1351
+ input_ids=input_ids,
1352
+ attention_mask=attention_mask,
1353
+ labels=labels
1354
+ )
1355
+ # t2 = time.time()
1356
+ # print(f"{torch.distributed.get_rank()}: {input_ids.shape}")
1357
+
1358
+ # after the first window, one token at a time
1359
+ while not self.memory.finish:
1360
+ # for _ in range(2):
1361
+ # t3 = time.time()
1362
+
1363
+ input_ids, attention_mask, past_key_values, labels = self.memory.step()
1364
+
1365
+ # NOTE: the first window is encoded without ultragist parameters, so we skip it when computing loss
1366
+ if self.training and self.memory._step_idx == 1:
1367
+ labels[:] = -100
1368
+ # t4 = time.time()
1369
+
1370
+ outputs = self._native_forward(
1371
+ input_ids=input_ids,
1372
+ attention_mask=attention_mask,
1373
+ position_ids=position_ids,
1374
+ past_key_values=past_key_values,
1375
+ inputs_embeds=inputs_embeds,
1376
+ use_cache=use_cache,
1377
+ output_attentions=output_attentions,
1378
+ output_hidden_states=output_hidden_states,
1379
+ return_dict=return_dict,
1380
+ labels=labels,
1381
+ # NOTE: the labels have been shifted so that all tokens in the window have the proper loss
1382
+ shift_labels=False,
1383
+ )
1384
+ # t5 = time.time()
1385
+
1386
+ # update past_key_values
1387
+ self.memory.update_memory(outputs.past_key_values)
1388
+
1389
+ # t6 = time.time()
1390
+
1391
+ if labels is not None:
1392
+ # update loss
1393
+ self.memory.update_loss(outputs.batch_loss, outputs.valid_token_num)
1394
+
1395
+ # t7 = time.time()
1396
+
1397
+ # print(f"Loop step time: {t4-t3}")
1398
+ # print(f"Loop forward time: {t5-t4}")
1399
+ # print(f"Loop update time: {t6-t5}")
1400
+ # print(f"Loop loss time: {t7-t6}")
1401
+ # input()
1402
+
1403
+ # t8 = time.time()
1404
+
1405
+ # output loss, past_key_values, and perplexity
1406
+ outputs = self.memory.output(outputs)
1407
+
1408
+ # t9 = time.time()
1409
+ # print(f"Prepare time: {t2-t1}")
1410
+ # print(f"Output time: {t9-t8}")
1411
+ return outputs
1412
+
1413
+ def forward(self, **kwargs):
1414
+ """Forward computation over a batch of sequences.
1415
+ """
1416
+ # only allow gradient when training
1417
+ with optional_grad_ctx(with_grad=self.training):
1418
+ # we can disable ultragist to use the original llama
1419
+ if hasattr(self, "_enable_ultragist") and self._enable_ultragist == False:
1420
+ return self._native_forward(**kwargs)
1421
+ else:
1422
+ return self._ultragist_forward(**kwargs)
1423
+
1424
+ def prepare_inputs_for_generation(
1425
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1426
+ ):
1427
+ if past_key_values:
1428
+ input_ids = input_ids[:, -1:]
1429
+
1430
+ position_ids = kwargs.get("position_ids", None)
1431
+ if attention_mask is not None and position_ids is None:
1432
+ # create position_ids on the fly for batch generation
1433
+ position_ids = attention_mask.long().cumsum(-1) - 1
1434
+ position_ids.masked_fill_(attention_mask == 0, 1)
1435
+ if past_key_values:
1436
+ position_ids = position_ids[:, -1].unsqueeze(-1)
1437
+
1438
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1439
+ if inputs_embeds is not None and past_key_values is None:
1440
+ model_inputs = {"inputs_embeds": inputs_embeds}
1441
+ else:
1442
+ model_inputs = {"input_ids": input_ids}
1443
+
1444
+ model_inputs.update(
1445
+ {
1446
+ "position_ids": position_ids,
1447
+ "past_key_values": past_key_values,
1448
+ "use_cache": kwargs.get("use_cache"),
1449
+ "attention_mask": attention_mask,
1450
+ }
1451
+ )
1452
+ return model_inputs
1453
+
1454
+ @staticmethod
1455
+ def _reorder_cache(past_key_values, beam_idx):
1456
+ reordered_past = ()
1457
+ for layer_past in past_key_values:
1458
+ reordered_past += (
1459
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1460
+ )
1461
+ return reordered_past
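As a quick orientation for this diff, here is a minimal, hedged usage sketch of how the custom LlamaForCausalLM above would typically be loaded and run. The checkpoint path, dtype, and device are placeholder assumptions; memory.reset() and the _enable_ultragist switch come directly from the code in this file and from modeling_utils.py below.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "path/to/ultragist-llama2-7b-chat"  # placeholder: point this at the uploaded checkpoint

tokenizer = AutoTokenizer.from_pretrained(path)
# trust_remote_code=True lets transformers resolve auto_map to modeling_llama.LlamaForCausalLM
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16)
model = model.cuda().eval()

# the Memory object accumulates activations across windows, so clear it before each new sequence
model.memory.reset()

inputs = tokenizer("A very long document ...", return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**inputs)   # forward() loops over windows via _ultragist_forward

# set this flag to fall back to the plain Llama forward (_native_forward)
model._enable_ultragist = False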
modeling_ultragist.py ADDED
@@ -0,0 +1,711 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import torch.distributed as dist
5
+ from transformers.utils import logging
6
+ from transformers import AutoTokenizer
7
+ from itertools import cycle
8
+ from typing import List
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
+ class Memory(torch.nn.Module):
14
+ def __init__(
15
+ self,
16
+ model_config,
17
+ k_seq_dim:int=2,
18
+ v_seq_dim:int=2,
19
+ ):
20
+ """Setup necessary attributes."""
21
+ super().__init__()
22
+
23
+ self.model_config = model_config
24
+
25
+ # initialize necessary parameters
26
+ self.k_seq_dim = k_seq_dim
27
+ self.v_seq_dim = v_seq_dim
28
+ self.num_layers = model_config.num_hidden_layers
29
+ self.max_position_embeddings = model_config.max_position_embeddings
30
+ self.rng = np.random.default_rng(42)
31
+
32
+ self.ultragist_window = model_config.ultragist_window
33
+ self.ultragist_stride = model_config.ultragist_stride
34
+ self.ultragist_attn = model_config.ultragist_attn
35
+ self.ultragist_ratio = model_config.ultragist_ratio
36
+ self.ultragist_ratio_mix = model_config.ultragist_ratio_mix
37
+ self.ultragist_param = model_config.ultragist_param
38
+ self.ultragist_sink_size = model_config.ultragist_sink_size
39
+ self.ultragist_attend_prev = model_config.ultragist_attend_prev
40
+
41
+ self.ultragist_tokens = torch.zeros(1, dtype=torch.long) + model_config.vocab_size
42
+
43
+ self._post_validation()
44
+ self.reset()
45
+
46
+ def _post_validation(self, verbose=True):
47
+ assert self.ultragist_window >= self.ultragist_stride, f"Make sure the ultragist_window {self.ultragist_window} >= ultragist_stride {self.ultragist_stride}!"
48
+ for ratio in self.ultragist_ratio:
49
+ assert ratio >= 0, f"Make sure all ultragist ratios are greater than or equal to 0, found {self.ultragist_ratio}!"
50
+ assert self.ultragist_attn in ["segmentation", "step-expansion", "full-coverage"], f"ultragist_attn {self.ultragist_attn} not implemented!"
51
+ assert self.ultragist_ratio_mix in ["instance-random", "step-random", "sequence", "join"] or "adapt-" in self.ultragist_ratio_mix, f"ultragist_ratio_mix {self.ultragist_ratio_mix} not implemented!"
52
+ if self.ultragist_ratio_mix == "join":
53
+ # create another stream for moving gpu tensor to cpu
54
+ # self.stream = torch.cuda.Stream()
55
+ pass
56
+
57
+ self._cpu = torch.device("cpu")
58
+
59
+ if verbose:
60
+ info = f"applying ultragist on {self.ultragist_param} (the ultragist embedding is initialized from {'bos' if self.model_config.ultragist_embed_init == 'bos' else 'eos'} embedding), with window size {self.ultragist_window}, stride {self.ultragist_stride}, {self.ultragist_attn} attention{' (attending to previous ultragists)' if self.ultragist_attend_prev else ' (no attending to previous ultragists)'}, sink size {self.ultragist_sink_size}, condensing ratio {self.ultragist_ratio} (mixed by {self.ultragist_ratio_mix})..."
61
+ logger.info(info)
62
+
63
+ def set(self, verbose=True, **kwargs):
64
+ if "ultragist_ratio_mix" in kwargs and kwargs["ultragist_ratio_mix"] == "join" and self.ultragist_ratio_mix != "join":
65
+ raise ValueError(f"You cannot switch ultragist_ratio_mix from non-join strategy to join!")
66
+ if self.ultragist_ratio_mix == "join" and "ultragist_ratio" in kwargs and sorted(kwargs["ultragist_ratio"]) != sorted(self.ultragist_ratio):
67
+ raise ValueError(f"You cannot change ultragist_ratio given ultragist_ratio_mix=join!")
68
+ for k, v in kwargs.items():
69
+ setattr(self, k, v)
70
+ self._post_validation(verbose=verbose)
71
+
72
+ def reset(self):
73
+ """Initialize attributes for a new sequence."""
74
+ # the cursor pointing to the start of the current window
75
+ self._start_idx = 0
76
+ # the cursor pointing to the end of the current window
77
+ self._end_idx = 0
78
+ # the ultragist sizes of all strides
79
+ self._total_ultragist_sizes = []
80
+ # the ultragist ratios of all strides
81
+ self._main_ultragist_sizes = []
82
+ # the loss per batch
83
+ self._batch_loss = None
84
+ # the valid token number per batch
85
+ self._valid_token_num = None
86
+ # the step index for processing the input_ids
87
+ self._step_idx = 0
88
+
89
+ # used in set_compression_ratio
90
+ self._ratio = None
91
+ self._ultragist_ratio_iter = None
92
+
93
+ self.all_input_ids = torch.tensor([], dtype=torch.long)
94
+ self.all_attention_mask = torch.tensor([], dtype=torch.long)
95
+ if hasattr(self, "all_labels"):
96
+ del self.all_labels
97
+
98
+ # the raw activations of recent tokens
99
+ self.raw_activations = [(None, None) for _ in range(self.num_layers)]
100
+ # the attention sink activations
101
+ self.sink_activations = [(None, None) for _ in range(self.num_layers)]
102
+
103
+ # the ultragist activations
104
+ if self.ultragist_ratio_mix == "join":
105
+ self.l1_to_ln_ultragist_activations = [
106
+ [(None, None) for _ in range(self.num_layers)]
107
+ for _ in self.ultragist_ratio
108
+ ]
109
+ else:
110
+ self.l1_to_ln_ultragist_activations = [
111
+ [(None, None) for _ in range(self.num_layers)]
112
+ ]
113
+
114
+ def rewind(self, size=None, trim=False):
115
+ """
116
+ Rewind raw activations that have not been condensed yet.
117
+
118
+ Args:
119
+ trim: if true, the input_ids corresponding to the raw activations are trimmed.
120
+ """
121
+ raw_memory_size = self.get_memory_size()[1]
122
+ if size is None:
123
+ size = raw_memory_size
124
+ assert size <= raw_memory_size, f"Make sure the rewind size ({size}) is smaller or equal to the raw memory size ({raw_memory_size})!"
125
+
126
+ if size > 0:
127
+ self._end_idx -= size
128
+ for layer_idx, (key, value) in enumerate(self.raw_activations):
129
+ key = slice_tensor(key, end=-size, dim=self.k_seq_dim)
130
+ value = slice_tensor(value, end=-size, dim=self.v_seq_dim)
131
+ self.raw_activations[layer_idx] = (key, value)
132
+
133
+ if trim:
134
+ self.all_input_ids = self.all_input_ids[:, :-size]
135
+ self.all_attention_mask = self.all_attention_mask[:, :-size]
136
+ if hasattr(self, "all_labels"):
137
+ self.all_labels = self.all_labels[:, :-size]
138
+
139
+ @property
140
+ def finish(self):
141
+ is_finish = self._end_idx == self.all_sequence_length
142
+
143
+ # print(f"{dist.get_rank()} Finish: {self._end_idx}, {self.all_sequence_length}")
144
+ # if is_finish and hasattr(self, "stream"):
145
+ # self.stream.synchronize()
146
+ return is_finish
147
+
148
+ def get_memory_size(self):
149
+ ultragist_memory_size = 0
150
+ raw_memory_size = 0
151
+ sink_memory_size = 0
152
+ if self.l1_to_ln_ultragist_activations[0][0][0] is not None:
153
+ ultragist_memory_size += self.l1_to_ln_ultragist_activations[0][0][0].shape[self.k_seq_dim]
154
+ if self.raw_activations[0][0] is not None:
155
+ raw_memory_size += self.raw_activations[0][0].shape[self.k_seq_dim]
156
+ if self.sink_activations[0][0] is not None:
157
+ sink_memory_size += self.sink_activations[0][0].shape[self.k_seq_dim]
158
+ return ultragist_memory_size, raw_memory_size, sink_memory_size
159
+
160
+ def get_memory(self, ultragist_sizes=None, total_ultragist_size=None, raw_size_to_cache=None, window_size=None):
161
+ """
162
+ Get the compressed kv cache for generating next tokens.
163
+ """
164
+ past_key_values = []
165
+ for layer_idx in range(self.num_layers):
166
+ sink_key, sink_value = self.sink_activations[layer_idx]
167
+ ultragist_key, ultragist_value = self.l1_to_ln_ultragist_activations[0][layer_idx]
168
+ raw_key, raw_value = self.raw_activations[layer_idx]
169
+
170
+ key = cat_tensor([
171
+ sink_key, ultragist_key, raw_key,
172
+ ], dim=self.k_seq_dim)
173
+ value = cat_tensor([
174
+ sink_value, ultragist_value, raw_value,
175
+ ], dim=self.v_seq_dim)
176
+
177
+ if ultragist_sizes is not None:
178
+ layer_past_key_values = (key, value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
179
+ else:
180
+ layer_past_key_values = (key, value)
181
+
182
+ past_key_values.append(layer_past_key_values)
183
+ return past_key_values
184
+
185
+ def prepare(self, input_ids, attention_mask, labels):
186
+ """
187
+ Prepare inputs for the model. These inputs belong to the same sequence.
188
+ """
189
+ assert input_ids.shape[0] == 1, "Make sure the batch size is 1!"
190
+ assert attention_mask is None or (attention_mask == 1).all(), "Make sure there is no padding!"
191
+
192
+ if not hasattr(self, "_device"):
193
+ self._device = input_ids.device
194
+
195
+ # accumulate input_ids and attention_mask
196
+ self.all_input_ids = torch.cat([self.all_input_ids, input_ids.cpu()], dim=1)
197
+ if attention_mask is None:
198
+ attention_mask = torch.ones_like(input_ids)
199
+ self.all_attention_mask = torch.cat([self.all_attention_mask, attention_mask.cpu()], dim=1)
200
+ self.all_sequence_length = self.all_input_ids.shape[1]
201
+
202
+ if labels is not None:
203
+ # rotate labels in advance so that the loss of the last token is not ignored in every window
204
+ labels = torch.cat([labels[:, 1:].cpu(), torch.tensor([-100]).expand(labels.shape[0], 1)], dim=1)
205
+ if not hasattr(self, "all_labels"):
206
+ self.all_labels = labels
207
+ else:
208
+ self.all_labels = torch.cat([self.all_labels, labels], dim=1)
209
+ assert self.all_input_ids.shape[1] == self.all_labels.shape[1], f"Found inconsistent all_input_ids {self.all_input_ids.shape} and all_labels {self.all_labels.shape}!"
210
+
211
+ def set_compression_ratio(self, start_idx, end_idx):
212
+ """Choose a condensing ratio from self.ultragist_ratio"""
213
+ def filter_ratio(ratios, stride):
214
+ valid_ratios = []
215
+ for ratio in ratios:
216
+ # the stride must be at least as large as the condensing ratio because there must be at least one ultragist
217
+ if stride < ratio:
218
+ continue
219
+ # the stride must be evenly divisible by condensing ratio
220
+ if ratio > 0 and (stride % ratio) != 0:
221
+ continue
222
+ # when training, ratio=0 is valid if previous windows contain ultragist or later windows contain ultragist
223
+ if ratio == 0 and self.training:
224
+ previous_has_zero = -1 in self._main_ultragist_sizes
225
+ following_has_nonzero = (start_idx + stride + self.ultragist_window) <= self.all_sequence_length
226
+ if previous_has_zero or (not following_has_nonzero):
227
+ continue
228
+ valid_ratios.append(ratio)
229
+ assert len(valid_ratios), f"Cannot find valid condensing ratio (among {ratios}) for stride {stride}!"
230
+ return valid_ratios
231
+
232
+ def get_max_length(ratios):
233
+ max_lengths = []
234
+ for condensing_ratio in ratios:
235
+ if condensing_ratio > 0:
236
+ max_lengths.append((self.max_position_embeddings - self.ultragist_window) * condensing_ratio + self.ultragist_window)
237
+ else:
238
+ max_lengths.append(self.max_position_embeddings)
239
+ return max_lengths
240
+
241
+ if len(self.ultragist_ratio) == 1:
242
+ return [self.ultragist_ratio[0]]
243
+
244
+ ratio_mix = self.ultragist_ratio_mix
245
+
246
+ ultragist_ratio = filter_ratio(self.ultragist_ratio, self.ultragist_stride)
247
+
248
+ if ratio_mix == "instance-random":
249
+ if self._ratio is None:
250
+ ultragist_ratio = self.rng.choice(ultragist_ratio, size=1).tolist()
251
+ self._ratio = ultragist_ratio
252
+ else:
253
+ ultragist_ratio = self._ratio
254
+
255
+ elif ratio_mix == "step-random":
256
+ ultragist_ratio = self.rng.choice(ultragist_ratio, size=1).tolist()
257
+
258
+ elif ratio_mix == "sequence":
259
+ if self._ultragist_ratio_iter is None:
260
+ self._ultragist_ratio_iter = cycle(ultragist_ratio)
261
+ ultragist_ratio = [next(self._ultragist_ratio_iter)]
262
+
263
+ elif ratio_mix == "join":
264
+ ultragist_ratio = ultragist_ratio
265
+
266
+ elif "adapt" in ratio_mix:
267
+ if self._ratio is None:
268
+ future_length = int(ratio_mix.split("-")[1])
269
+ sequence_length = self.all_input_ids.shape[1] + future_length
270
+ max_lengths = get_max_length(ultragist_ratio)
271
+ # ascendingly sort the max lengths
272
+ valid_max_lengths_and_indices = [x for x in enumerate(max_lengths) if x[1] >= sequence_length]
273
+ if len(valid_max_lengths_and_indices):
274
+ minimum_length_index = min(valid_max_lengths_and_indices, key=lambda x: x[1])[0]
275
+ # use the minimal possible length for this sequence (the smallest fold ratio)
276
+ ultragist_ratio = [ultragist_ratio[minimum_length_index]]
277
+ else:
278
+ ultragist_ratio = [max(ultragist_ratio)]
279
+ # logger.warning(f"Failed to find valid fold window and size for sequence length {sequence_length}, as the maximum theoretical length is {max(max_lengths)}. Fall back to use the maximum one: {ultragist_ratio}.")
280
+ self._ratio = ultragist_ratio
281
+ else:
282
+ ultragist_ratio = self._ratio
283
+
284
+ return ultragist_ratio
285
+
286
+ def step(self):
287
+ """
288
+ Yield one window with the following logic:
289
+
290
+ The window size is L, the stride is S.
291
+ The window moves over S tokens at a time. The raw activations passed by the window are condensed according to a condensing_ratio.
292
+ The ultragists are added if and only if the raw activations fulfill the window.
293
+ In the future, we may switch window size to decrease cache size of raw activations.
294
+ """
295
+ # the starting position of the current window w.r.t. the start of the current input sequence
296
+ start_idx = self._start_idx
297
+ # the end position of the current window w.r.t. the start of the current input sequence
298
+ end_idx = start_idx + self.ultragist_window
299
+
300
+ # indicates if the current window is completely filled by raw activations and new tokens
301
+ # we only append ultragist tokens for full windows
302
+ if end_idx > self.all_sequence_length:
303
+ # the input is shorter than the initial window size
304
+ end_idx = self.all_sequence_length
305
+ is_full_window = False
306
+ else:
307
+ is_full_window = True
308
+
309
+ # NOTE: in training, the entire sequence is input to the model at once
310
+ # In the last window, we do not need to append ultragists because they will not be used at all
311
+ if self.training and end_idx == self.all_sequence_length:
312
+ is_full_window = False
313
+
314
+ # the real window size (remaining_size + new_token_size)
315
+ window_size = end_idx - start_idx
316
+
317
+ if is_full_window:
318
+ ultragist_stride = self.ultragist_stride
319
+ # a list of condensing ratios
320
+ compression_ratios = self.set_compression_ratio(start_idx=start_idx, end_idx=end_idx)
321
+
322
+ ultragist_sizes = []
323
+ for condensing_ratio in compression_ratios:
324
+ if condensing_ratio > 0:
325
+ # the stride must be evenly divisible by condensing_ratio
326
+ ultragist_sizes.append(ultragist_stride // condensing_ratio)
327
+ else:
328
+ # the raw activations are used as ultragist activations
329
+ ultragist_sizes.append(-1)
330
+ # forward start_idx and end_idx
331
+ next_start_idx = start_idx + ultragist_stride
332
+ # how many raw activations to save
333
+ raw_size_to_cache = end_idx - next_start_idx
334
+
335
+ else:
336
+ # no stride because the sequence has finished
337
+ next_start_idx = start_idx
338
+ # cache all recent raw activations to be used in the next window
339
+ raw_size_to_cache = window_size
340
+ ultragist_sizes = [0]
341
+ compression_ratios = [0]
342
+
343
+ total_ultragist_size = sum(s for s in ultragist_sizes if s >= 0)
344
+
345
+ past_key_values = self.get_memory(
346
+ ultragist_sizes,
347
+ total_ultragist_size,
348
+ raw_size_to_cache,
349
+ window_size
350
+ )
351
+
352
+ # streamingly add new input_ids
353
+ input_ids = self.all_input_ids[:, self._end_idx: end_idx].to(self._device)
354
+ attention_mask = self.all_attention_mask[:, self._end_idx: end_idx].to(self._device)
355
+ if hasattr(self, "all_labels"):
356
+ labels = self.all_labels[:, self._end_idx: end_idx].to(self._device)
357
+ else:
358
+ labels = None
359
+ batch_size = input_ids.shape[0]
360
+
361
+ # append ultragists if necessary
362
+ if is_full_window:
363
+ if total_ultragist_size > 0:
364
+ input_ids = torch.cat([input_ids, self.ultragist_tokens.expand(batch_size, total_ultragist_size).to(input_ids.device, dtype=input_ids.dtype)], dim=1)
365
+ # NOTE: append 1s to the attention_mask for the newly appended ultragist tokens (1s for the cached memory are prepended below)
366
+ attention_mask = torch.cat([attention_mask, attention_mask.new_ones(batch_size, total_ultragist_size)], dim=1)
367
+ if labels is not None:
368
+ labels = torch.cat([labels, labels.new_zeros(batch_size, total_ultragist_size) - 100], dim=1)
369
+
370
+ # prepend 1 to attention mask for previous memory
371
+ first_key = past_key_values[0][0]
372
+ memory_size = first_key.shape[self.k_seq_dim] if first_key is not None else 0
373
+ if memory_size > 0:
374
+ attention_mask = torch.cat([attention_mask.new_ones(batch_size, memory_size), attention_mask], dim=1)
375
+
376
+ # invoked in self.output()
377
+ self._total_ultragist_sizes.append(total_ultragist_size)
378
+ # invoked in self.set_compression_ratio
379
+ self._main_ultragist_sizes.append(ultragist_sizes[0])
380
+
381
+ # update the window cursors and the step counter
382
+ self._start_idx = next_start_idx
383
+ self._end_idx = end_idx
384
+ self._step_idx += 1
385
+
386
+ # print("****************************************")
387
+ # if is_full_window:
388
+ # print(f"stride: {ultragist_stride}")
389
+ # print(f"compression ratios: {compression_ratios}")
390
+ # print(f"ultragist_sizes: {ultragist_sizes}")
391
+ # print(f"input_ids: {input_ids.shape}")
392
+ # print(f"start_idx: {start_idx}")
393
+ # print(f"next_start_idx: {next_start_idx}")
394
+ # print(f"end_idx: {end_idx}")
395
+ # x = input()
396
+ # if x == "s":
397
+ # return
398
+
399
+ return input_ids, attention_mask, past_key_values, labels
400
+
401
+ def update_memory(self, past_key_values):
402
+ """
403
+ Accumulate ultragist activations and raw activations.
404
+ """
405
+ for layer_idx, (key, value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size) in enumerate(past_key_values):
406
+ # NOTE: the past_key_values are incrementally returned (only the new keys and values are returned)
407
+
408
+ # key/value: (num_layer, 2, batch_size, num_head, new_seq_len, head_dim)
409
+ # ultragist_size: how many ultragist activations are in key and value
410
+ # raw_size_to_cache: how many raw activations should be kept
411
+
412
+ previous_raw_key, previous_raw_value = self.raw_activations[layer_idx]
413
+
414
+ if self._step_idx == 1:
415
+ # save the sink activations
416
+ # NOTE: we do not slice the key/value activations, which may cause duplication when ultragist_ratio=-1 for the first window, but it's okay
417
+ self.sink_activations[layer_idx] = [
418
+ slice_tensor(key, end=self.ultragist_sink_size, dim=self.k_seq_dim),
419
+ slice_tensor(value, end=self.ultragist_sink_size, dim=self.v_seq_dim),
420
+ ]
421
+
422
+ if ultragist_sizes == [0]:
423
+ # this means the current input does not fulfill a window
424
+ # thus, the key and value are all raw activations, and we accumulate them until the window is fulfilled
425
+ assert raw_size_to_cache == window_size
426
+ raw_key = cat_tensor([
427
+ previous_raw_key,
428
+ key
429
+ ], dim=self.k_seq_dim)
430
+ raw_value = cat_tensor([
431
+ previous_raw_value,
432
+ value
433
+ ], dim=self.v_seq_dim)
434
+ self.raw_activations[layer_idx] = (raw_key, raw_value)
435
+
436
+ else:
437
+ for ultragist_size_idx, ultragist_size in enumerate(ultragist_sizes):
438
+ # NOTE: use the correct previous_ultragist_key and value!
439
+ previous_ultragist_key, previous_ultragist_value = self.l1_to_ln_ultragist_activations[ultragist_size_idx][layer_idx]
440
+
441
+ # if ultragist_size_idx == 0:
442
+ # ctx_manager = nullcontext()
443
+ # else:
444
+ # ctx_manager = torch.cuda.stream(self.stream)
445
+ # FIXME: only the first iteration works...
446
+ # with ctx_manager:
447
+
448
+ ultragist_key, ultragist_value, raw_key, raw_value = self._extract_ultragist_and_raw_memory(key, value, previous_ultragist_key, previous_ultragist_value, previous_raw_key, previous_raw_value, raw_size_to_cache, total_ultragist_size, ultragist_sizes, ultragist_size_idx)
449
+
450
+ self.l1_to_ln_ultragist_activations[ultragist_size_idx][layer_idx] = (ultragist_key, ultragist_value)
451
+ if ultragist_size_idx == 0:
452
+ self.raw_activations[layer_idx] = (raw_key, raw_value)
453
+
454
+ # if ultragist_size_idx != 0:
455
+ # print(self.stream.query())
456
+
457
+ def update_loss(self, batch_loss, valid_token_num):
458
+ """
459
+ Accumulate loss for later perplexity computation and the backward pass.
460
+ """
461
+ # print(f"process {dist.get_rank()}: valid_token_num: {valid_token_num}; loss {batch_loss}")
462
+ if self._batch_loss is None:
463
+ # NOTE: multiply valid_token_num because batch_loss is divided by it in advance
464
+ self._batch_loss = batch_loss * valid_token_num
465
+ self._valid_token_num = valid_token_num
466
+ else:
467
+ # NOTE: avoid in-place operations, otherwise there will be gradient errors in training
468
+ self._batch_loss = self._batch_loss + batch_loss * valid_token_num
469
+ self._valid_token_num = self._valid_token_num + valid_token_num
470
+
471
+ def output(self, model_outputs):
472
+ """
473
+ Override loss with accumulated loss.
474
+ """
475
+ # override loss
476
+ if self._batch_loss is not None:
477
+ # here the batch_loss is the summation of all token losses in each element
478
+ loss = self._batch_loss.sum() / self._valid_token_num.sum()
479
+
480
+ # NOTE: prevent nan
481
+ batch_loss = self._batch_loss / self._valid_token_num
482
+ if (self._valid_token_num == 0).any():
483
+ batch_loss = batch_loss.masked_fill(self._valid_token_num == 0, 0.)
484
+
485
+ # NOTE: we must use dict to override values, otherwise trainer cannot find loss
486
+ model_outputs["loss"] = loss
487
+ model_outputs["batch_loss"] = batch_loss
488
+ model_outputs["valid_token_num"] = self._valid_token_num
489
+
490
+ # override last_hidden_states (used in generation)
491
+ ultragist_size = self._total_ultragist_sizes[-1]
492
+ # remove logits corresponding to ultragist tokens
493
+ if ultragist_size > 0:
494
+ model_outputs["logits"] = model_outputs["logits"][:, :-ultragist_size]
495
+
496
+ return model_outputs
497
+
498
+ def _extract_ultragist_and_raw_memory(self, key, value, previous_ultragist_key, previous_ultragist_value, previous_raw_key, previous_raw_value, raw_size_to_cache, total_ultragist_size, ultragist_sizes, ultragist_size_idx):
499
+ """Extract ultragist and raw memory from the returned key and value. The raw memory is computed only if the ultragist_size_idx == 0."""
500
+ ultragist_size = ultragist_sizes[ultragist_size_idx]
501
+ # NOTE: ignore -1
502
+ previous_ultragist_size = sum(x for x in ultragist_sizes[:ultragist_size_idx] if x > 0)
503
+
504
+ if previous_ultragist_key is not None:
505
+ target_device = previous_ultragist_key.device
506
+ else:
507
+ if ultragist_size_idx == 0:
508
+ target_device = self._device
509
+ else:
510
+ target_device = self._cpu
511
+
512
+ if ultragist_size == -1:
513
+ actual_ultragist_size = self.ultragist_window - raw_size_to_cache
514
+
515
+ # the raw activations are used as ultragist activations
516
+ concat_raw_key = cat_tensor([
517
+ previous_raw_key,
518
+ key
519
+ ], dim=self.k_seq_dim)
520
+ concat_raw_value = cat_tensor([
521
+ previous_raw_value,
522
+ value
523
+ ], dim=self.v_seq_dim)
524
+
525
+ ultragist_key = cat_tensor([
526
+ previous_ultragist_key,
527
+ slice_tensor(concat_raw_key, end=actual_ultragist_size, dim=self.k_seq_dim).to(target_device, non_blocking=True)
528
+ ], dim=self.k_seq_dim)
529
+ ultragist_value = cat_tensor([
530
+ previous_ultragist_value,
531
+ slice_tensor(concat_raw_value, end=actual_ultragist_size, dim=self.v_seq_dim).to(target_device, non_blocking=True)
532
+ ], dim=self.v_seq_dim)
533
+
534
+ if ultragist_size_idx == 0:
535
+ raw_key = slice_tensor(concat_raw_key, start=actual_ultragist_size, end=self.ultragist_window, dim=self.k_seq_dim)
536
+ raw_value = slice_tensor(concat_raw_value, start=actual_ultragist_size, end=self.ultragist_window, dim=self.v_seq_dim)
537
+
538
+ else:
539
+ # [-ultragist_size:] activations are from ultragists, need to be accumulated
540
+ # [-raw_cache_size-ultragist_size:-ultragist_size] raw activations will be cached; if they are shorter than raw_cache_size, part of the previous raw activations will also be kept
541
+
542
+ ultragist_start_idx = - total_ultragist_size + previous_ultragist_size
543
+ ultragist_end_idx = ultragist_start_idx + ultragist_size
544
+
545
+ # NOTE: avoid end=0 for slicing
546
+ if ultragist_end_idx == 0:
547
+ ultragist_end_idx = None
548
+
549
+ ultragist_key = cat_tensor([
550
+ previous_ultragist_key,
551
+ slice_tensor(key, start=ultragist_start_idx, end=ultragist_end_idx, dim=self.k_seq_dim).to(target_device, non_blocking=True)
552
+ ], dim=self.k_seq_dim)
553
+ ultragist_value = cat_tensor([
554
+ previous_ultragist_value,
555
+ slice_tensor(value, start=ultragist_start_idx, end=ultragist_end_idx, dim=self.v_seq_dim).to(target_device, non_blocking=True)
556
+ ], dim=self.v_seq_dim)
557
+
558
+ # the raw activations are only updated once
559
+ if ultragist_size_idx == 0:
560
+ if key.shape[self.k_seq_dim] < raw_size_to_cache + ultragist_size:
561
+ concat_raw_key = cat_tensor([
562
+ previous_raw_key,
563
+ key
564
+ ], dim=self.k_seq_dim)
565
+ concat_raw_value = cat_tensor([
566
+ previous_raw_value,
567
+ value
568
+ ], dim=self.v_seq_dim)
569
+ raw_key = slice_tensor(concat_raw_key, start=self.ultragist_window - raw_size_to_cache, end=self.ultragist_window, dim=self.k_seq_dim)
570
+ raw_value = slice_tensor(concat_raw_value, start=self.ultragist_window - raw_size_to_cache, end=self.ultragist_window, dim=self.v_seq_dim)
571
+ else:
572
+ # becomes None when raw_size_to_cache = 0
573
+ raw_key = slice_tensor(key, start=ultragist_start_idx - raw_size_to_cache, end=ultragist_start_idx, dim=self.k_seq_dim)
574
+ raw_value = slice_tensor(value, start=ultragist_start_idx - raw_size_to_cache, end=ultragist_start_idx, dim=self.v_seq_dim)
575
+
576
+ if ultragist_size_idx == 0:
577
+ return ultragist_key, ultragist_value, raw_key, raw_value
578
+ else:
579
+ # NOTE: only l1 ultragist activations are kept on GPU
580
+ return ultragist_key.detach().to(target_device, non_blocking=True), ultragist_value.detach().to(target_device, non_blocking=True), None, None
581
+ # return ultragist_key, ultragist_value, None, None
582
+
583
+
584
+ def slice_tensor(x, start=None, end=None, dim=2):
585
+ if x is None:
586
+ return None
587
+ if end == 0:
588
+ return None
589
+ if start == x.shape[dim]:
590
+ return None
591
+ if start == end:
592
+ return None
593
+ if dim == 2:
594
+ if start is None and end is not None:
595
+ return x[:, :, :end, ...]
596
+ elif start is not None and end is None:
597
+ return x[:, :, start:, ...]
598
+ elif start is not None and end is not None:
599
+ return x[:, :, start:end, ...]
600
+ elif dim == 1:
601
+ if start is None and end is not None:
602
+ return x[:, :end, ...]
603
+ elif start is not None and end is None:
604
+ return x[:, start:, ...]
605
+ elif start is not None and end is not None:
606
+ return x[:, start:end, ...]
607
+ else:
608
+ raise NotImplementedError
609
+
610
+ def cat_tensor(list_of_tensors, dim=-1):
611
+ list_of_tensors = [t for t in list_of_tensors if t is not None]
612
+ if len(list_of_tensors) > 1:
613
+ result = torch.cat(list_of_tensors, dim=dim)
614
+ elif len(list_of_tensors) == 1:
615
+ result = list_of_tensors[0]
616
+ else:
617
+ result = None
618
+ return result
619
+
620
+ def slice_activations(activations, start=None, end=None, k_seq_dim=2, v_seq_dim=2):
621
+ new_activations = []
622
+ for key, value in activations:
623
+ new_key = slice_tensor(key, start=start, end=end, dim=k_seq_dim)
624
+ new_value = slice_tensor(value, start=start, end=end, dim=v_seq_dim)
625
+ new_activations.append([new_key, new_value])
626
+ return new_activations
627
+
628
+ def cat_activations(list_of_activations, k_seq_dim=2, v_seq_dim=2):
629
+ assert all(len(x) == len(list_of_activations[0]) for x in list_of_activations), f"Make sure all activations have the same number of layers! Found {[len(x) for x in list_of_activations]}."
630
+
631
+ new_activations = []
632
+ for layer_idx in range(len(list_of_activations[0])):
633
+ keys = [x[layer_idx][0] for x in list_of_activations]
634
+ values = [x[layer_idx][1] for x in list_of_activations]
635
+
636
+ new_key = cat_tensor(keys, dim=k_seq_dim)
637
+ new_value = cat_tensor(values, dim=v_seq_dim)
638
+ new_activations.append([new_key, new_value])
639
+ return new_activations
640
+
641
+ def interleave_activations(main_activations, augment_activations, main_spans, augment_spans, k_seq_dim=2, v_seq_dim=2, device=torch.device("cuda")):
642
+ """ Interleave main_activations and augment_activations according to main_span and augment_span.
643
+
644
+ Args:
645
+ main_span: a list of tuples (start_idx, end_idx); when both start_idx and end_idx are None, the augment_activations are plugged in at that slot.
646
+ augment_span: a list of tuples (start_idx, end_idx)
647
+ """
648
+ assert len(main_activations) == len(augment_activations) , f"Make sure main and augment activations have the same number of layers! Found {len(main_activations)} and {len(augment_activations)}!"
649
+ assert sum(x[0] is None and x[1] is None for x in main_spans) == len(augment_spans), f"Make sure the number of slots for augmentation (start_idx=None and end_idx=None in main_spans) matches the number of augmentations. Found {sum(x[0] is None and x[1] is None for x in main_spans)} slots but {len(augment_spans)} augmentations!"
650
+
651
+ new_activations = []
652
+ for layer_idx in range(len(main_activations)):
653
+ main_key, main_value = main_activations[layer_idx]
654
+ augment_key, augment_value = augment_activations[layer_idx]
655
+
656
+ sliced_keys = []
657
+ sliced_values = []
658
+
659
+ augment_idx = 0
660
+ for start, end in main_spans:
661
+ if start is None and end is None:
662
+ # this means the augment key/value should be plugged in
663
+ augment_start, augment_end = augment_spans[augment_idx]
664
+ sliced_key = slice_tensor(
665
+ augment_key,
666
+ start=augment_start,
667
+ end=augment_end,
668
+ dim=k_seq_dim
669
+ ).to(device)
670
+ sliced_value = slice_tensor(
671
+ augment_value,
672
+ start=augment_start,
673
+ end=augment_end,
674
+ dim=v_seq_dim
675
+ ).to(device)
676
+
677
+ else:
678
+ sliced_key = slice_tensor(
679
+ main_key,
680
+ start=start,
681
+ end=end,
682
+ dim=k_seq_dim
683
+ )
684
+ sliced_value = slice_tensor(
685
+ main_value,
686
+ start=start,
687
+ end=end,
688
+ dim=v_seq_dim
689
+ )
690
+
691
+ sliced_keys.append(sliced_key)
692
+ sliced_values.append(sliced_value)
693
+
694
+ new_key = cat_tensor(sliced_keys, dim=k_seq_dim)
695
+ new_value = cat_tensor(sliced_values, dim=v_seq_dim)
696
+ new_activations.append([new_key, new_value])
697
+
698
+ return new_activations
699
+
700
+ def softmax(x:np.ndarray, axis=-1, temperature=1):
701
+ if isinstance(x, list):
702
+ x = np.array(x)
703
+ x = x / temperature
704
+ x = x - x.max(axis=axis, keepdims=True)
705
+ y = np.exp(x)
706
+ return y / y.sum(axis=axis, keepdims=True)
707
+
708
+ def l1_norm(x):
709
+ sum_x = sum(x)
710
+ x = [y/sum_x for y in x]
711
+ return x
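The bookkeeping in step() and get_max_length() above is easiest to see with concrete numbers. The sketch below is illustrative only; the window, stride, and ratio values are assumptions chosen for readability, not values read from this checkpoint.

# illustrative arithmetic for Memory.step() / get_max_length(); hypothetical settings
window, stride, max_position_embeddings = 1024, 1024, 4096

for ratio in (2, 4, 8):
    # a full window advances by `stride` raw tokens, which are condensed into
    # stride // ratio ultragist tokens (the stride must be divisible by the ratio)
    ultragist_size = stride // ratio
    # theoretical maximum sequence length for this ratio, as in get_max_length():
    # everything beyond the local window is held as condensed ultragist activations
    max_length = (max_position_embeddings - window) * ratio + window
    print(f"ratio={ratio:2d}: {ultragist_size:4d} ultragists per stride, max length {max_length}")

# expected output:
# ratio= 2:  512 ultragists per stride, max length 7168
# ratio= 4:  256 ultragists per stride, max length 13312
# ratio= 8:  128 ultragists per stride, max length 25600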
modeling_utils.py ADDED
@@ -0,0 +1,215 @@
1
+ import math
2
+ import torch
3
+ from tqdm import tqdm
4
+ from dataclasses import dataclass
5
+ from contextlib import nullcontext
6
+ from typing import Mapping, Optional, Tuple
7
+ from accelerate import Accelerator
8
+ from collections import defaultdict
9
+ from transformers.modeling_outputs import BaseModelOutputWithPast
10
+
11
+
12
+ def optional_grad_ctx(with_grad=False):
13
+ if with_grad:
14
+ return nullcontext()
15
+ else:
16
+ return torch.no_grad()
17
+
18
+ def move_to_device(data, device):
19
+ """
20
+ Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
21
+ """
22
+ if isinstance(data, Mapping):
23
+ return type(data)({k: move_to_device(v, device) for k, v in data.items()})
24
+ elif isinstance(data, (tuple, list)):
25
+ return type(data)(move_to_device(v, device) for v in data)
26
+ elif isinstance(data, torch.Tensor):
27
+ kwargs = {"device": device}
28
+ return data.to(**kwargs)
29
+ else:
30
+ return data
31
+
32
+ def compute_loss(logits, labels, shift=False):
33
+ """
34
+ Returns:
35
+ token_loss: batch_size, seq_length
36
+ """
37
+ if shift:
38
+ logits = logits[:, :-1, :].contiguous()
39
+ labels = labels[:, 1:].contiguous()
40
+
41
+ labels = labels.to(logits.device)
42
+ batch_size = logits.shape[0]
43
+
44
+ # NOTE: the loss on -100 labels is 0 by default
45
+ token_loss = torch.nn.functional.cross_entropy(
46
+ logits.flatten(0, 1),
47
+ labels.reshape(-1),
48
+ reduction="none"
49
+ ).reshape(batch_size, -1) # batch_size, seq_len
50
+
51
+ valid_token_num = (labels != -100).sum(-1) # batch_size
52
+ all_valid_token_num = valid_token_num.sum()
53
+
54
+ if all_valid_token_num > 0:
55
+ loss = token_loss.sum() / valid_token_num.sum()
56
+ else:
57
+ loss = token_loss.sum()
58
+
59
+ batch_loss = token_loss.sum(-1) / valid_token_num
60
+ # prevent nan
61
+ if (valid_token_num == 0).any():
62
+ batch_loss = batch_loss.masked_fill(valid_token_num == 0, 0.)
63
+
64
+ return loss, batch_loss, valid_token_num
65
+
66
+
67
+ @torch.no_grad()
68
+ def evaluate_perplexity(model, dataloader, accelerator:Optional[Accelerator]=None):
69
+ if accelerator is not None and type(dataloader) == torch.utils.data.DataLoader:
70
+ # if the dataloader has been prepared, we shall not prepare it twice, especially in case of deepspeed
71
+ dataloader = accelerator.prepare(dataloader)
72
+
73
+ # if accelerator.process_index == 0:
74
+ # for name, x in model.named_parameters():
75
+ # print(f"{name: ^80} {x.dtype}")
76
+
77
+ all_loss = defaultdict(list)
78
+ for i, x in enumerate(tqdm(dataloader, desc="Computing Perplexity")):
79
+ # NOTE: important to reset memory for every batch
80
+ if hasattr(model, "memory"):
81
+ model.memory.reset()
82
+
83
+ # the seq id
84
+ index = x.pop("index")
85
+ # length is used to group training data, no use here
86
+ length = x.pop("length", None)
87
+
88
+ output = model(**x)
89
+
90
+ # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
91
+ if hasattr(output, "batch_loss"):
92
+ # output from our model has batch_loss by default
93
+ batch_loss = output.batch_loss
94
+ valid_token_num = output.valid_token_num
95
+ else:
96
+ # output from other models does not
97
+ loss, batch_loss, valid_token_num = compute_loss(output.logits, x["labels"], shift=True)
98
+
99
+ if accelerator is not None and accelerator.num_processes > 1:
100
+ # num_device * batch_size
101
+ index = accelerator.gather_for_metrics(index)
102
+ batch_loss = accelerator.gather_for_metrics(batch_loss)
103
+ valid_token_num = accelerator.gather_for_metrics(valid_token_num)
104
+
105
+ for _id, _loss, _num in zip(index.tolist(), batch_loss.tolist(), valid_token_num.tolist()):
106
+ # loss times num is the total loss of all valid tokens
107
+ all_loss[_id].append((_loss * _num, _num))
108
+
109
+ all_loss = dict(all_loss)
110
+ for _id, loss_and_num in all_loss.items():
111
+ # sum up the loss for all valid tokens in the entire sequence, and divide the number of valid tokens
112
+ all_loss[_id] = sum([x[0] for x in loss_and_num]) / sum(x[1] for x in loss_and_num)
113
+
114
+ # average across sequences, then take exp to get perplexity
115
+ perplexity = math.exp(sum(all_loss.values()) / len(all_loss))
116
+ return perplexity
117
+
118
+
119
+ @torch.no_grad()
120
+ def evaluate_generation(model, dataloader, accelerator:Optional[Accelerator]=None, tokenizer=None, return_new_tokens_only=True, return_decoded=True, **generation_config):
121
+ if accelerator is not None and type(dataloader) == torch.utils.data.DataLoader:
122
+ # if the dataloader has been prepared, we shall not prepare it twice, especially in case of deepspeed
123
+ dataloader = accelerator.prepare(dataloader)
124
+
125
+ all_indices = []
126
+ all_outputs = []
127
+
128
+ for i, x in enumerate(tqdm(dataloader, desc="Computing Generation")):
129
+ # if i > 3:
130
+ # break
131
+
132
+ # NOTE: important to reset memory for every batch
133
+ if hasattr(model, "memory"):
134
+ model.memory.reset()
135
+
136
+ indices = x.pop("index")
137
+ # length is used to group training data, no use here
138
+ length = x.pop("length", None)
139
+
140
+ outputs = model.generate(**x, **generation_config)
141
+ if return_new_tokens_only:
142
+ start_idx = x["input_ids"].shape[1]
143
+ outputs = outputs[:, start_idx:]
144
+
145
+ if accelerator is not None and accelerator.num_processes > 1:
146
+ # must be contiguous
147
+ outputs = accelerator.pad_across_processes(outputs.contiguous(), pad_index=tokenizer.pad_token_id, dim=1)
148
+ outputs = accelerator.gather_for_metrics(outputs)
149
+ indices = accelerator.gather_for_metrics(indices)
150
+
151
+ outputs = outputs.tolist()
152
+ indices = indices.tolist()
153
+ if return_decoded:
154
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
155
+ all_indices.extend(indices)
156
+ all_outputs.extend(outputs)
157
+
158
+ return all_indices, all_outputs
159
+
160
+
161
+ @torch.no_grad()
162
+ def evaluate_nll(model, dataloader, accelerator:Optional[Accelerator]=None):
163
+ if accelerator is not None and type(dataloader) == torch.utils.data.DataLoader:
164
+ # if the dataloader has been prepared, we shall not prepare it twice, especially in case of deepspeed
165
+ dataloader = accelerator.prepare(dataloader)
166
+
167
+ # if accelerator.process_index == 0:
168
+ # for name, x in model.named_parameters():
169
+ # print(f"{name: ^80} {x.dtype}")
170
+
171
+ all_loss = defaultdict(list)
172
+ for i, x in enumerate(tqdm(dataloader, desc="Computing Perplexity")):
173
+ # NOTE: important to reset memory for every batch
174
+ if hasattr(model, "memory"):
175
+ model.memory.reset()
176
+
177
+ # the seq id
178
+ index = x.pop("index")
179
+ # length is used to group training data, no use here
180
+ length = x.pop("length", None)
181
+
182
+ output = model(**x)
183
+
184
+ # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
185
+ if hasattr(output, "batch_loss"):
186
+ # output from our model has batch_loss by default
187
+ batch_loss = output.batch_loss
188
+ valid_token_num = output.valid_token_num
189
+ else:
190
+ # output from other models does not
191
+ loss, batch_loss, valid_token_num = compute_loss(output.logits, x["labels"], shift=True)
192
+
193
+ if accelerator is not None and accelerator.num_processes > 1:
194
+ # num_device * batch_size
195
+ index = accelerator.gather_for_metrics(index)
196
+ batch_loss = accelerator.gather_for_metrics(batch_loss)
197
+ valid_token_num = accelerator.gather_for_metrics(valid_token_num)
198
+
199
+ for _id, _loss in zip(index.tolist(), batch_loss.tolist()):
200
+ # store the per-sequence average loss of this chunk
201
+ all_loss[_id].append(_loss)
202
+
203
+ return all_loss
204
+
205
+
206
+
207
+ @dataclass
208
+ class ModelOutput(BaseModelOutputWithPast):
209
+ loss: Optional[torch.FloatTensor] = None
210
+ batch_loss: Optional[torch.FloatTensor] = None
211
+ valid_token_num: Optional[torch.LongTensor] = None
212
+ logits: torch.FloatTensor = None
213
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
214
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
215
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
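To make the two-stage averaging in evaluate_perplexity above concrete, here is a toy sketch with invented numbers: chunk-level mean losses for the same sequence id are re-weighted by their valid-token counts, reduced to one average loss per sequence, and only then averaged across sequences and exponentiated.

import math
from collections import defaultdict

# (sequence id, mean loss over this chunk, number of valid tokens in this chunk) -- made-up numbers
chunks = [(0, 2.0, 100), (0, 3.0, 50), (1, 2.5, 80)]

all_loss = defaultdict(list)
for _id, _loss, _num in chunks:
    # loss times num recovers the total loss over the chunk's valid tokens
    all_loss[_id].append((_loss * _num, _num))

for _id, loss_and_num in all_loss.items():
    # token-weighted average per sequence
    all_loss[_id] = sum(x[0] for x in loss_and_num) / sum(x[1] for x in loss_and_num)

# average across sequences, then exponentiate
perplexity = math.exp(sum(all_loss.values()) / len(all_loss))
print(round(perplexity, 2))  # sequence 0 -> ~2.33, sequence 1 -> 2.5, exp(2.42) is roughly 11.2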
nqa.json ADDED
@@ -0,0 +1 @@
1
+ {"input": "How long had Mortimer Trefinnis' sister been dead when the doctor examined the body?", "context": "Produced by David Brannan. HTML version by Al Haines.\n\n\n\n\n\n\n\n\n\nThe Adventure of the Devil's Foot\n\n\nBy\n\nSir Arthur Conan Doyle\n\n\n\n\nIn recording from time to time some of the curious experiences and\ninteresting recollections which I associate with my long and intimate\nfriendship with Mr. Sherlock Holmes, I have continually been faced by\ndifficulties caused by his own aversion to publicity. To his sombre\nand cynical spirit all popular applause was always abhorrent, and\nnothing amused him more at the end of a successful case than to hand\nover the actual exposure to some orthodox official, and to listen with\na mocking smile to the general chorus of misplaced congratulation. It\nwas indeed this attitude upon the part of my friend and certainly not\nany lack of interesting material which has caused me of late years to\nlay very few of my records before the public. My participation in some\nof his adventures was always a privilege which entailed discretion and\nreticence upon me.\n\nIt was, then, with considerable surprise that I received a telegram\nfrom Holmes last Tuesday--he has never been known to write where a\ntelegram would serve--in the following terms:\n\nWhy not tell them of the Cornish horror--strangest case I have handled.\n\nI have no idea what backward sweep of memory had brought the matter\nfresh to his mind, or what freak had caused him to desire that I should\nrecount it; but I hasten, before another cancelling telegram may\narrive, to hunt out the notes which give me the exact details of the\ncase and to lay the narrative before my readers.\n\nIt was, then, in the spring of the year 1897 that Holmes's iron\nconstitution showed some symptoms of giving way in the face of constant\nhard work of a most exacting kind, aggravated, perhaps, by occasional\nindiscretions of his own. In March of that year Dr. Moore Agar, of\nHarley Street, whose dramatic introduction to Holmes I may some day\nrecount, gave positive injunctions that the famous private agent lay\naside all his cases and surrender himself to complete rest if he wished\nto avert an absolute breakdown. The state of his health was not a\nmatter in which he himself took the faintest interest, for his mental\ndetachment was absolute, but he was induced at last, on the threat of\nbeing permanently disqualified from work, to give himself a complete\nchange of scene and air. Thus it was that in the early spring of that\nyear we found ourselves together in a small cottage near Poldhu Bay, at\nthe further extremity of the Cornish peninsula.\n\nIt was a singular spot, and one peculiarly well suited to the grim\nhumour of my patient. From the windows of our little whitewashed\nhouse, which stood high upon a grassy headland, we looked down upon the\nwhole sinister semicircle of Mounts Bay, that old death trap of sailing\nvessels, with its fringe of black cliffs and surge-swept reefs on which\ninnumerable seamen have met their end. With a northerly breeze it lies\nplacid and sheltered, inviting the storm-tossed craft to tack into it\nfor rest and protection.\n\nThen come the sudden swirl round of the wind, the blistering gale from\nthe south-west, the dragging anchor, the lee shore, and the last battle\nin the creaming breakers. The wise mariner stands far out from that\nevil place.\n\nOn the land side our surroundings were as sombre as on the sea. 
It was\na country of rolling moors, lonely and dun-colored, with an occasional\nchurch tower to mark the site of some old-world village. In every\ndirection upon these moors there were traces of some vanished race\nwhich had passed utterly away, and left as its sole record strange\nmonuments of stone, irregular mounds which contained the burned ashes\nof the dead, and curious earthworks which hinted at prehistoric strife.\nThe glamour and mystery of the place, with its sinister atmosphere of\nforgotten nations, appealed to the imagination of my friend, and he\nspent much of his time in long walks and solitary meditations upon the\nmoor. The ancient Cornish language had also arrested his attention, and\nhe had, I remember, conceived the idea that it was akin to the\nChaldean, and had been largely derived from the Phoenician traders in\ntin. He had received a consignment of books upon philology and was\nsettling down to develop this thesis when suddenly, to my sorrow and to\nhis unfeigned delight, we found ourselves, even in that land of dreams,\nplunged into a problem at our very doors which was more intense, more\nengrossing, and infinitely more mysterious than any of those which had\ndriven us from London. Our simple life and peaceful, healthy routine\nwere violently interrupted, and we were precipitated into the midst of\na series of events which caused the utmost excitement not only in\nCornwall but throughout the whole west of England. Many of my readers\nmay retain some recollection of what was called at the time \"The\nCornish Horror,\" though a most imperfect account of the matter reached\nthe London press. Now, after thirteen years, I will give the true\ndetails of this inconceivable affair to the public.\n\nI have said that scattered towers marked the villages which dotted this\npart of Cornwall. The nearest of these was the hamlet of Tredannick\nWollas, where the cottages of a couple of hundred inhabitants clustered\nround an ancient, moss-grown church. The vicar of the parish, Mr.\nRoundhay, was something of an archaeologist, and as such Holmes had\nmade his acquaintance. He was a middle-aged man, portly and affable,\nwith a considerable fund of local lore. At his invitation we had taken\ntea at the vicarage and had come to know, also, Mr. Mortimer Tregennis,\nan independent gentleman, who increased the clergyman's scanty\nresources by taking rooms in his large, straggling house. The vicar,\nbeing a bachelor, was glad to come to such an arrangement, though he\nhad little in common with his lodger, who was a thin, dark, spectacled\nman, with a stoop which gave the impression of actual, physical\ndeformity. I remember that during our short visit we found the vicar\ngarrulous, but his lodger strangely reticent, a sad-faced,\nintrospective man, sitting with averted eyes, brooding apparently upon\nhis own affairs.\n\nThese were the two men who entered abruptly into our little\nsitting-room on Tuesday, March the 16th, shortly after our breakfast\nhour, as we were smoking together, preparatory to our daily excursion\nupon the moors.\n\n\"Mr. Holmes,\" said the vicar in an agitated voice, \"the most\nextraordinary and tragic affair has occurred during the night. It is\nthe most unheard-of business. 
We can only regard it as a special\nProvidence that you should chance to be here at the time, for in all\nEngland you are the one man we need.\"\n\nI glared at the intrusive vicar with no very friendly eyes; but Holmes\ntook his pipe from his lips and sat up in his chair like an old hound\nwho hears the view-halloa. He waved his hand to the sofa, and our\npalpitating visitor with his agitated companion sat side by side upon\nit. Mr. Mortimer Tregennis was more self-contained than the clergyman,\nbut the twitching of his thin hands and the brightness of his dark eyes\nshowed that they shared a common emotion.\n\n\"Shall I speak or you?\" he asked of the vicar.\n\n\"Well, as you seem to have made the discovery, whatever it may be, and\nthe vicar to have had it second-hand, perhaps you had better do the\nspeaking,\" said Holmes.\n\nI glanced at the hastily clad clergyman, with the formally dressed\nlodger seated beside him, and was amused at the surprise which Holmes's\nsimple deduction had brought to their faces.\n\n\"Perhaps I had best say a few words first,\" said the vicar, \"and then\nyou can judge if you will listen to the details from Mr. Tregennis, or\nwhether we should not hasten at once to the scene of this mysterious\naffair. I may explain, then, that our friend here spent last evening\nin the company of his two brothers, Owen and George, and of his sister\nBrenda, at their house of Tredannick Wartha, which is near the old\nstone cross upon the moor. He left them shortly after ten o'clock,\nplaying cards round the dining-room table, in excellent health and\nspirits. This morning, being an early riser, he walked in that\ndirection before breakfast and was overtaken by the carriage of Dr.\nRichards, who explained that he had just been sent for on a most urgent\ncall to Tredannick Wartha. Mr. Mortimer Tregennis naturally went with\nhim. When he arrived at Tredannick Wartha he found an extraordinary\nstate of things. His two brothers and his sister were seated round the\ntable exactly as he had left them, the cards still spread in front of\nthem and the candles burned down to their sockets. The sister lay back\nstone-dead in her chair, while the two brothers sat on each side of her\nlaughing, shouting, and singing, the senses stricken clean out of them.\nAll three of them, the dead woman and the two demented men, retained\nupon their faces an expression of the utmost horror--a convulsion of\nterror which was dreadful to look upon. There was no sign of the\npresence of anyone in the house, except Mrs. Porter, the old cook and\nhousekeeper, who declared that she had slept deeply and heard no sound\nduring the night. Nothing had been stolen or disarranged, and there is\nabsolutely no explanation of what the horror can be which has\nfrightened a woman to death and two strong men out of their senses.\nThere is the situation, Mr. Holmes, in a nutshell, and if you can help\nus to clear it up you will have done a great work.\"\n\nI had hoped that in some way I could coax my companion back into the\nquiet which had been the object of our journey; but one glance at his\nintense face and contracted eyebrows told me how vain was now the\nexpectation. He sat for some little time in silence, absorbed in the\nstrange drama which had broken in upon our peace.\n\n\"I will look into this matter,\" he said at last. \"On the face of it,\nit would appear to be a case of a very exceptional nature. Have you\nbeen there yourself, Mr. Roundhay?\"\n\n\"No, Mr. Holmes. Mr. 
Tregennis brought back the account to the\nvicarage, and I at once hurried over with him to consult you.\"\n\n\"How far is it to the house where this singular tragedy occurred?\"\n\n\"About a mile inland.\"\n\n\"Then we shall walk over together. But before we start I must ask you\na few questions, Mr. Mortimer Tregennis.\"\n\nThe other had been silent all this time, but I had observed that his\nmore controlled excitement was even greater than the obtrusive emotion\nof the clergyman. He sat with a pale, drawn face, his anxious gaze\nfixed upon Holmes, and his thin hands clasped convulsively together.\nHis pale lips quivered as he listened to the dreadful experience which\nhad befallen his family, and his dark eyes seemed to reflect something\nof the horror of the scene.\n\n\"Ask what you like, Mr. Holmes,\" said he eagerly. \"It is a bad thing\nto speak of, but I will answer you the truth.\"\n\n\"Tell me about last night.\"\n\n\"Well, Mr. Holmes, I supped there, as the vicar has said, and my elder\nbrother George proposed a game of whist afterwards. We sat down about\nnine o'clock. It was a quarter-past ten when I moved to go. I left\nthem all round the table, as merry as could be.\"\n\n\"Who let you out?\"\n\n\"Mrs. Porter had gone to bed, so I let myself out. I shut the hall\ndoor behind me. The window of the room in which they sat was closed,\nbut the blind was not drawn down. There was no change in door or\nwindow this morning, or any reason to think that any stranger had been\nto the house. Yet there they sat, driven clean mad with terror, and\nBrenda lying dead of fright, with her head hanging over the arm of the\nchair. I'll never get the sight of that room out of my mind so long as\nI live.\"\n\n\"The facts, as you state them, are certainly most remarkable,\" said\nHolmes. \"I take it that you have no theory yourself which can in any\nway account for them?\"\n\n\"It's devilish, Mr. Holmes, devilish!\" cried Mortimer Tregennis. \"It is\nnot of this world. Something has come into that room which has dashed\nthe light of reason from their minds. What human contrivance could do\nthat?\"\n\n\"I fear,\" said Holmes, \"that if the matter is beyond humanity it is\ncertainly beyond me. Yet we must exhaust all natural explanations\nbefore we fall back upon such a theory as this. As to yourself, Mr.\nTregennis, I take it you were divided in some way from your family,\nsince they lived together and you had rooms apart?\"\n\n\"That is so, Mr. Holmes, though the matter is past and done with. We\nwere a family of tin-miners at Redruth, but we sold our venture to a\ncompany, and so retired with enough to keep us. I won't deny that\nthere was some feeling about the division of the money and it stood\nbetween us for a time, but it was all forgiven and forgotten, and we\nwere the best of friends together.\"\n\n\"Looking back at the evening which you spent together, does anything\nstand out in your memory as throwing any possible light upon the\ntragedy? Think carefully, Mr. Tregennis, for any clue which can help\nme.\"\n\n\"There is nothing at all, sir.\"\n\n\"Your people were in their usual spirits?\"\n\n\"Never better.\"\n\n\"Were they nervous people? Did they ever show any apprehension of\ncoming danger?\"\n\n\"Nothing of the kind.\"\n\n\"You have nothing to add then, which could assist me?\"\n\nMortimer Tregennis considered earnestly for a moment.\n\n\"There is one thing occurs to me,\" said he at last. 
\"As we sat at the\ntable my back was to the window, and my brother George, he being my\npartner at cards, was facing it. I saw him once look hard over my\nshoulder, so I turned round and looked also. The blind was up and the\nwindow shut, but I could just make out the bushes on the lawn, and it\nseemed to me for a moment that I saw something moving among them. I\ncouldn't even say if it was man or animal, but I just thought there was\nsomething there. When I asked him what he was looking at, he told me\nthat he had the same feeling. That is all that I can say.\"\n\n\"Did you not investigate?\"\n\n\"No; the matter passed as unimportant.\"\n\n\"You left them, then, without any premonition of evil?\"\n\n\"None at all.\"\n\n\"I am not clear how you came to hear the news so early this morning.\"\n\n\"I am an early riser and generally take a walk before breakfast. This\nmorning I had hardly started when the doctor in his carriage overtook\nme. He told me that old Mrs. Porter had sent a boy down with an urgent\nmessage. I sprang in beside him and we drove on. When we got there we\nlooked into that dreadful room. The candles and the fire must have\nburned out hours before, and they had been sitting there in the dark\nuntil dawn had broken. The doctor said Brenda must have been dead at\nleast six hours. There were no signs of violence. She just lay across\nthe arm of the chair with that look on her face. George and Owen were\nsinging snatches of songs and gibbering like two great apes. Oh, it\nwas awful to see! I couldn't stand it, and the doctor was as white as\na sheet. Indeed, he fell into a chair in a sort of faint, and we\nnearly had him on our hands as well.\"\n\n\"Remarkable--most remarkable!\" said Holmes, rising and taking his hat.\n\"I think, perhaps, we had better go down to Tredannick Wartha without\nfurther delay. I confess that I have seldom known a case which at\nfirst sight presented a more singular problem.\"\n\n\nOur proceedings of that first morning did little to advance the\ninvestigation. It was marked, however, at the outset by an incident\nwhich left the most sinister impression upon my mind. The approach to\nthe spot at which the tragedy occurred is down a narrow, winding,\ncountry lane. While we made our way along it we heard the rattle of a\ncarriage coming towards us and stood aside to let it pass. As it drove\nby us I caught a glimpse through the closed window of a horribly\ncontorted, grinning face glaring out at us. Those staring eyes and\ngnashing teeth flashed past us like a dreadful vision.\n\n\"My brothers!\" cried Mortimer Tregennis, white to his lips. \"They are\ntaking them to Helston.\"\n\nWe looked with horror after the black carriage, lumbering upon its way.\nThen we turned our steps towards this ill-omened house in which they\nhad met their strange fate.\n\nIt was a large and bright dwelling, rather a villa than a cottage, with\na considerable garden which was already, in that Cornish air, well\nfilled with spring flowers. Towards this garden the window of the\nsitting-room fronted, and from it, according to Mortimer Tregennis,\nmust have come that thing of evil which had by sheer horror in a single\ninstant blasted their minds. Holmes walked slowly and thoughtfully\namong the flower-plots and along the path before we entered the porch.\nSo absorbed was he in his thoughts, I remember, that he stumbled over\nthe watering-pot, upset its contents, and deluged both our feet and the\ngarden path. 
Inside the house we were met by the elderly Cornish\nhousekeeper, Mrs. Porter, who, with the aid of a young girl, looked\nafter the wants of the family. She readily answered all Holmes's\nquestions. She had heard nothing in the night. Her employers had all\nbeen in excellent spirits lately, and she had never known them more\ncheerful and prosperous. She had fainted with horror upon entering the\nroom in the morning and seeing that dreadful company round the table.\nShe had, when she recovered, thrown open the window to let the morning\nair in, and had run down to the lane, whence she sent a farm-lad for\nthe doctor. The lady was on her bed upstairs if we cared to see her.\nIt took four strong men to get the brothers into the asylum carriage.\nShe would not herself stay in the house another day and was starting\nthat very afternoon to rejoin her family at St. Ives.\n\nWe ascended the stairs and viewed the body. Miss Brenda Tregennis had\nbeen a very beautiful girl, though now verging upon middle age. Her\ndark, clear-cut face was handsome, even in death, but there still\nlingered upon it something of that convulsion of horror which had been\nher last human emotion. From her bedroom we descended to the\nsitting-room, where this strange tragedy had actually occurred. The\ncharred ashes of the overnight fire lay in the grate. On the table\nwere the four guttered and burned-out candles, with the cards scattered\nover its surface. The chairs had been moved back against the walls,\nbut all else was as it had been the night before. Holmes paced with\nlight, swift steps about the room; he sat in the various chairs,\ndrawing them up and reconstructing their positions. He tested how much\nof the garden was visible; he examined the floor, the ceiling, and the\nfireplace; but never once did I see that sudden brightening of his eyes\nand tightening of his lips which would have told me that he saw some\ngleam of light in this utter darkness.\n\n\"Why a fire?\" he asked once. \"Had they always a fire in this small\nroom on a spring evening?\"\n\nMortimer Tregennis explained that the night was cold and damp. For that\nreason, after his arrival, the fire was lit. \"What are you going to do\nnow, Mr. Holmes?\" he asked.\n\nMy friend smiled and laid his hand upon my arm. \"I think, Watson, that\nI shall resume that course of tobacco-poisoning which you have so often\nand so justly condemned,\" said he. \"With your permission, gentlemen,\nwe will now return to our cottage, for I am not aware that any new\nfactor is likely to come to our notice here. I will turn the facts\nover in my mind, Mr. Tregennis, and should anything occur to me I will\ncertainly communicate with you and the vicar. In the meantime I wish\nyou both good-morning.\"\n\nIt was not until long after we were back in Poldhu Cottage that Holmes\nbroke his complete and absorbed silence. He sat coiled in his\narmchair, his haggard and ascetic face hardly visible amid the blue\nswirl of his tobacco smoke, his black brows drawn down, his forehead\ncontracted, his eyes vacant and far away. Finally he laid down his\npipe and sprang to his feet.\n\n\"It won't do, Watson!\" said he with a laugh. \"Let us walk along the\ncliffs together and search for flint arrows. We are more likely to\nfind them than clues to this problem. To let the brain work without\nsufficient material is like racing an engine. It racks itself to\npieces. 
The sea air, sunshine, and patience, Watson--all else will\ncome.\n\n\"Now, let us calmly define our position, Watson,\" he continued as we\nskirted the cliffs together. \"Let us get a firm grip of the very\nlittle which we DO know, so that when fresh facts arise we may be ready\nto fit them into their places. I take it, in the first place, that\nneither of us is prepared to admit diabolical intrusions into the\naffairs of men. Let us begin by ruling that entirely out of our minds.\nVery good. There remain three persons who have been grievously\nstricken by some conscious or unconscious human agency. That is firm\nground. Now, when did this occur? Evidently, assuming his narrative\nto be true, it was immediately after Mr. Mortimer Tregennis had left\nthe room. That is a very important point. The presumption is that it\nwas within a few minutes afterwards. The cards still lay upon the\ntable. It was already past their usual hour for bed. Yet they had not\nchanged their position or pushed back their chairs. I repeat, then,\nthat the occurrence was immediately after his departure, and not later\nthan eleven o'clock last night.\n\n\"Our next obvious step is to check, so far as we can, the movements of\nMortimer Tregennis after he left the room. In this there is no\ndifficulty, and they seem to be above suspicion. Knowing my methods as\nyou do, you were, of course, conscious of the somewhat clumsy water-pot\nexpedient by which I obtained a clearer impress of his foot than might\notherwise have been possible. The wet, sandy path took it admirably.\nLast night was also wet, you will remember, and it was not\ndifficult--having obtained a sample print--to pick out his track among\nothers and to follow his movements. He appears to have walked away\nswiftly in the direction of the vicarage.\n\n\"If, then, Mortimer Tregennis disappeared from the scene, and yet some\noutside person affected the card-players, how can we reconstruct that\nperson, and how was such an impression of horror conveyed? Mrs. Porter\nmay be eliminated. She is evidently harmless. Is there any evidence\nthat someone crept up to the garden window and in some manner produced\nso terrific an effect that he drove those who saw it out of their\nsenses? The only suggestion in this direction comes from Mortimer\nTregennis himself, who says that his brother spoke about some movement\nin the garden. That is certainly remarkable, as the night was rainy,\ncloudy, and dark. Anyone who had the design to alarm these people\nwould be compelled to place his very face against the glass before he\ncould be seen. There is a three-foot flower-border outside this\nwindow, but no indication of a footmark. It is difficult to imagine,\nthen, how an outsider could have made so terrible an impression upon\nthe company, nor have we found any possible motive for so strange and\nelaborate an attempt. You perceive our difficulties, Watson?\"\n\n\"They are only too clear,\" I answered with conviction.\n\n\"And yet, with a little more material, we may prove that they are not\ninsurmountable,\" said Holmes. 
\"I fancy that among your extensive\narchives, Watson, you may find some which were nearly as obscure.\nMeanwhile, we shall put the case aside until more accurate data are\navailable, and devote the rest of our morning to the pursuit of\nneolithic man.\"\n\nI may have commented upon my friend's power of mental detachment, but\nnever have I wondered at it more than upon that spring morning in\nCornwall when for two hours he discoursed upon celts, arrowheads, and\nshards, as lightly as if no sinister mystery were waiting for his\nsolution. It was not until we had returned in the afternoon to our\ncottage that we found a visitor awaiting us, who soon brought our minds\nback to the matter in hand. Neither of us needed to be told who that\nvisitor was. The huge body, the craggy and deeply seamed face with the\nfierce eyes and hawk-like nose, the grizzled hair which nearly brushed\nour cottage ceiling, the beard--golden at the fringes and white near\nthe lips, save for the nicotine stain from his perpetual cigar--all\nthese were as well known in London as in Africa, and could only be\nassociated with the tremendous personality of Dr. Leon Sterndale, the\ngreat lion-hunter and explorer.\n\nWe had heard of his presence in the district and had once or twice\ncaught sight of his tall figure upon the moorland paths. He made no\nadvances to us, however, nor would we have dreamed of doing so to him,\nas it was well known that it was his love of seclusion which caused him\nto spend the greater part of the intervals between his journeys in a\nsmall bungalow buried in the lonely wood of Beauchamp Arriance. Here,\namid his books and his maps, he lived an absolutely lonely life,\nattending to his own simple wants and paying little apparent heed to\nthe affairs of his neighbours. It was a surprise to me, therefore, to\nhear him asking Holmes in an eager voice whether he had made any\nadvance in his reconstruction of this mysterious episode. \"The county\npolice are utterly at fault,\" said he, \"but perhaps your wider\nexperience has suggested some conceivable explanation. My only claim\nto being taken into your confidence is that during my many residences\nhere I have come to know this family of Tregennis very well--indeed,\nupon my Cornish mother's side I could call them cousins--and their\nstrange fate has naturally been a great shock to me. I may tell you\nthat I had got as far as Plymouth upon my way to Africa, but the news\nreached me this morning, and I came straight back again to help in the\ninquiry.\"\n\nHolmes raised his eyebrows.\n\n\"Did you lose your boat through it?\"\n\n\"I will take the next.\"\n\n\"Dear me! that is friendship indeed.\"\n\n\"I tell you they were relatives.\"\n\n\"Quite so--cousins of your mother. Was your baggage aboard the ship?\"\n\n\"Some of it, but the main part at the hotel.\"\n\n\"I see. But surely this event could not have found its way into the\nPlymouth morning papers.\"\n\n\"No, sir; I had a telegram.\"\n\n\"Might I ask from whom?\"\n\nA shadow passed over the gaunt face of the explorer.\n\n\"You are very inquisitive, Mr. Holmes.\"\n\n\"It is my business.\"\n\nWith an effort Dr. Sterndale recovered his ruffled composure.\n\n\"I have no objection to telling you,\" he said. \"It was Mr. Roundhay,\nthe vicar, who sent me the telegram which recalled me.\"\n\n\"Thank you,\" said Holmes. \"I may say in answer to your original\nquestion that I have not cleared my mind entirely on the subject of\nthis case, but that I have every hope of reaching some conclusion. 
It\nwould be premature to say more.\"\n\n\"Perhaps you would not mind telling me if your suspicions point in any\nparticular direction?\"\n\n\"No, I can hardly answer that.\"\n\n\"Then I have wasted my time and need not prolong my visit.\" The famous\ndoctor strode out of our cottage in considerable ill-humour, and within\nfive minutes Holmes had followed him. I saw him no more until the\nevening, when he returned with a slow step and haggard face which\nassured me that he had made no great progress with his investigation.\nHe glanced at a telegram which awaited him and threw it into the grate.\n\n\"From the Plymouth hotel, Watson,\" he said. \"I learned the name of it\nfrom the vicar, and I wired to make certain that Dr. Leon Sterndale's\naccount was true. It appears that he did indeed spend last night\nthere, and that he has actually allowed some of his baggage to go on to\nAfrica, while he returned to be present at this investigation. What do\nyou make of that, Watson?\"\n\n\"He is deeply interested.\"\n\n\"Deeply interested--yes. There is a thread here which we had not yet\ngrasped and which might lead us through the tangle. Cheer up, Watson,\nfor I am very sure that our material has not yet all come to hand.\nWhen it does we may soon leave our difficulties behind us.\"\n\nLittle did I think how soon the words of Holmes would be realized, or\nhow strange and sinister would be that new development which opened up\nan entirely fresh line of investigation. I was shaving at my window in\nthe morning when I heard the rattle of hoofs and, looking up, saw a\ndog-cart coming at a gallop down the road. It pulled up at our door,\nand our friend, the vicar, sprang from it and rushed up our garden\npath. Holmes was already dressed, and we hastened down to meet him.\n\nOur visitor was so excited that he could hardly articulate, but at last\nin gasps and bursts his tragic story came out of him.\n\n\"We are devil-ridden, Mr. Holmes! My poor parish is devil-ridden!\" he\ncried. \"Satan himself is loose in it! We are given over into his\nhands!\" He danced about in his agitation, a ludicrous object if it\nwere not for his ashy face and startled eyes. Finally he shot out his\nterrible news.\n\n\"Mr. Mortimer Tregennis died during the night, and with exactly the\nsame symptoms as the rest of his family.\"\n\nHolmes sprang to his feet, all energy in an instant.\n\n\"Can you fit us both into your dog-cart?\"\n\n\"Yes, I can.\"\n\n\"Then, Watson, we will postpone our breakfast. Mr. Roundhay, we are\nentirely at your disposal. Hurry--hurry, before things get\ndisarranged.\"\n\nThe lodger occupied two rooms at the vicarage, which were in an angle\nby themselves, the one above the other. Below was a large\nsitting-room; above, his bedroom. They looked out upon a croquet lawn\nwhich came up to the windows. We had arrived before the doctor or the\npolice, so that everything was absolutely undisturbed. Let me describe\nexactly the scene as we saw it upon that misty March morning. It has\nleft an impression which can never be effaced from my mind.\n\nThe atmosphere of the room was of a horrible and depressing stuffiness.\nThe servant who had first entered had thrown up the window, or it would\nhave been even more intolerable. This might partly be due to the fact\nthat a lamp stood flaring and smoking on the centre table. 
Beside it\nsat the dead man, leaning back in his chair, his thin beard projecting,\nhis spectacles pushed up on to his forehead, and his lean dark face\nturned towards the window and twisted into the same distortion of\nterror which had marked the features of his dead sister. His limbs\nwere convulsed and his fingers contorted as though he had died in a\nvery paroxysm of fear. He was fully clothed, though there were signs\nthat his dressing had been done in a hurry. We had already learned\nthat his bed had been slept in, and that the tragic end had come to him\nin the early morning.\n\nOne realized the red-hot energy which underlay Holmes's phlegmatic\nexterior when one saw the sudden change which came over him from the\nmoment that he entered the fatal apartment. In an instant he was tense\nand alert, his eyes shining, his face set, his limbs quivering with\neager activity. He was out on the lawn, in through the window, round\nthe room, and up into the bedroom, for all the world like a dashing\nfoxhound drawing a cover. In the bedroom he made a rapid cast around\nand ended by throwing open the window, which appeared to give him some\nfresh cause for excitement, for he leaned out of it with loud\nejaculations of interest and delight. Then he rushed down the stair,\nout through the open window, threw himself upon his face on the lawn,\nsprang up and into the room once more, all with the energy of the\nhunter who is at the very heels of his quarry. The lamp, which was an\nordinary standard, he examined with minute care, making certain\nmeasurements upon its bowl. He carefully scrutinized with his lens the\ntalc shield which covered the top of the chimney and scraped off some\nashes which adhered to its upper surface, putting some of them into an\nenvelope, which he placed in his pocketbook. Finally, just as the\ndoctor and the official police put in an appearance, he beckoned to the\nvicar and we all three went out upon the lawn.\n\n\"I am glad to say that my investigation has not been entirely barren,\"\nhe remarked. \"I cannot remain to discuss the matter with the police,\nbut I should be exceedingly obliged, Mr. Roundhay, if you would give\nthe inspector my compliments and direct his attention to the bedroom\nwindow and to the sitting-room lamp. Each is suggestive, and together\nthey are almost conclusive. If the police would desire further\ninformation I shall be happy to see any of them at the cottage. And\nnow, Watson, I think that, perhaps, we shall be better employed\nelsewhere.\"\n\nIt may be that the police resented the intrusion of an amateur, or that\nthey imagined themselves to be upon some hopeful line of investigation;\nbut it is certain that we heard nothing from them for the next two\ndays. During this time Holmes spent some of his time smoking and\ndreaming in the cottage; but a greater portion in country walks which\nhe undertook alone, returning after many hours without remark as to\nwhere he had been. One experiment served to show me the line of his\ninvestigation. He had bought a lamp which was the duplicate of the one\nwhich had burned in the room of Mortimer Tregennis on the morning of\nthe tragedy. This he filled with the same oil as that used at the\nvicarage, and he carefully timed the period which it would take to be\nexhausted. 
Another experiment which he made was of a more unpleasant\nnature, and one which I am not likely ever to forget.\n\n\"You will remember, Watson,\" he remarked one afternoon, \"that there is\na single common point of resemblance in the varying reports which have\nreached us. This concerns the effect of the atmosphere of the room in\neach case upon those who had first entered it. You will recollect that\nMortimer Tregennis, in describing the episode of his last visit to his\nbrother's house, remarked that the doctor on entering the room fell\ninto a chair? You had forgotten? Well I can answer for it that it was\nso. Now, you will remember also that Mrs. Porter, the housekeeper, told\nus that she herself fainted upon entering the room and had afterwards\nopened the window. In the second case--that of Mortimer Tregennis\nhimself--you cannot have forgotten the horrible stuffiness of the room\nwhen we arrived, though the servant had thrown open the window. That\nservant, I found upon inquiry, was so ill that she had gone to her bed.\nYou will admit, Watson, that these facts are very suggestive. In each\ncase there is evidence of a poisonous atmosphere. In each case, also,\nthere is combustion going on in the room--in the one case a fire, in\nthe other a lamp. The fire was needed, but the lamp was lit--as a\ncomparison of the oil consumed will show--long after it was broad\ndaylight. Why? Surely because there is some connection between three\nthings--the burning, the stuffy atmosphere, and, finally, the madness\nor death of those unfortunate people. That is clear, is it not?\"\n\n\"It would appear so.\"\n\n\"At least we may accept it as a working hypothesis. We will suppose,\nthen, that something was burned in each case which produced an\natmosphere causing strange toxic effects. Very good. In the first\ninstance--that of the Tregennis family--this substance was placed in\nthe fire. Now the window was shut, but the fire would naturally carry\nfumes to some extent up the chimney. Hence one would expect the\neffects of the poison to be less than in the second case, where there\nwas less escape for the vapour. The result seems to indicate that it\nwas so, since in the first case only the woman, who had presumably the\nmore sensitive organism, was killed, the others exhibiting that\ntemporary or permanent lunacy which is evidently the first effect of\nthe drug. In the second case the result was complete. The facts,\ntherefore, seem to bear out the theory of a poison which worked by\ncombustion.\n\n\"With this train of reasoning in my head I naturally looked about in\nMortimer Tregennis's room to find some remains of this substance. The\nobvious place to look was the talc shelf or smoke-guard of the lamp.\nThere, sure enough, I perceived a number of flaky ashes, and round the\nedges a fringe of brownish powder, which had not yet been consumed.\nHalf of this I took, as you saw, and I placed it in an envelope.\"\n\n\"Why half, Holmes?\"\n\n\"It is not for me, my dear Watson, to stand in the way of the official\npolice force. I leave them all the evidence which I found. The poison\nstill remained upon the talc had they the wit to find it. Now, Watson,\nwe will light our lamp; we will, however, take the precaution to open\nour window to avoid the premature decease of two deserving members of\nsociety, and you will seat yourself near that open window in an\narmchair unless, like a sensible man, you determine to have nothing to\ndo with the affair. Oh, you will see it out, will you? 
I thought I\nknew my Watson. This chair I will place opposite yours, so that we may\nbe the same distance from the poison and face to face. The door we\nwill leave ajar. Each is now in a position to watch the other and to\nbring the experiment to an end should the symptoms seem alarming. Is\nthat all clear? Well, then, I take our powder--or what remains of\nit--from the envelope, and I lay it above the burning lamp. So! Now,\nWatson, let us sit down and await developments.\"\n\nThey were not long in coming. I had hardly settled in my chair before\nI was conscious of a thick, musky odour, subtle and nauseous. At the\nvery first whiff of it my brain and my imagination were beyond all\ncontrol. A thick, black cloud swirled before my eyes, and my mind told\nme that in this cloud, unseen as yet, but about to spring out upon my\nappalled senses, lurked all that was vaguely horrible, all that was\nmonstrous and inconceivably wicked in the universe. Vague shapes\nswirled and swam amid the dark cloud-bank, each a menace and a warning\nof something coming, the advent of some unspeakable dweller upon the\nthreshold, whose very shadow would blast my soul. A freezing horror\ntook possession of me. I felt that my hair was rising, that my eyes\nwere protruding, that my mouth was opened, and my tongue like leather.\nThe turmoil within my brain was such that something must surely snap.\nI tried to scream and was vaguely aware of some hoarse croak which was\nmy own voice, but distant and detached from myself. At the same moment,\nin some effort of escape, I broke through that cloud of despair and had\na glimpse of Holmes's face, white, rigid, and drawn with horror--the\nvery look which I had seen upon the features of the dead. It was that\nvision which gave me an instant of sanity and of strength. I dashed\nfrom my chair, threw my arms round Holmes, and together we lurched\nthrough the door, and an instant afterwards had thrown ourselves down\nupon the grass plot and were lying side by side, conscious only of the\nglorious sunshine which was bursting its way through the hellish cloud\nof terror which had girt us in. Slowly it rose from our souls like the\nmists from a landscape until peace and reason had returned, and we were\nsitting upon the grass, wiping our clammy foreheads, and looking with\napprehension at each other to mark the last traces of that terrific\nexperience which we had undergone.\n\n\"Upon my word, Watson!\" said Holmes at last with an unsteady voice, \"I\nowe you both my thanks and an apology. It was an unjustifiable\nexperiment even for one's self, and doubly so for a friend. I am\nreally very sorry.\"\n\n\"You know,\" I answered with some emotion, for I have never seen so much\nof Holmes's heart before, \"that it is my greatest joy and privilege to\nhelp you.\"\n\nHe relapsed at once into the half-humorous, half-cynical vein which was\nhis habitual attitude to those about him. \"It would be superfluous to\ndrive us mad, my dear Watson,\" said he. \"A candid observer would\ncertainly declare that we were so already before we embarked upon so\nwild an experiment. I confess that I never imagined that the effect\ncould be so sudden and so severe.\" He dashed into the cottage, and,\nreappearing with the burning lamp held at full arm's length, he threw\nit among a bank of brambles. \"We must give the room a little time to\nclear. 
I take it, Watson, that you have no longer a shadow of a doubt\nas to how these tragedies were produced?\"\n\n\"None whatever.\"\n\n\"But the cause remains as obscure as before. Come into the arbour here\nand let us discuss it together. That villainous stuff seems still to\nlinger round my throat. I think we must admit that all the evidence\npoints to this man, Mortimer Tregennis, having been the criminal in the\nfirst tragedy, though he was the victim in the second one. We must\nremember, in the first place, that there is some story of a family\nquarrel, followed by a reconciliation. How bitter that quarrel may\nhave been, or how hollow the reconciliation we cannot tell. When I\nthink of Mortimer Tregennis, with the foxy face and the small shrewd,\nbeady eyes behind the spectacles, he is not a man whom I should judge\nto be of a particularly forgiving disposition. Well, in the next place,\nyou will remember that this idea of someone moving in the garden, which\ntook our attention for a moment from the real cause of the tragedy,\nemanated from him. He had a motive in misleading us. Finally, if he\ndid not throw the substance into the fire at the moment of leaving the\nroom, who did do so? The affair happened immediately after his\ndeparture. Had anyone else come in, the family would certainly have\nrisen from the table. Besides, in peaceful Cornwall, visitors did not\narrive after ten o'clock at night. We may take it, then, that all the\nevidence points to Mortimer Tregennis as the culprit.\"\n\n\"Then his own death was suicide!\"\n\n\"Well, Watson, it is on the face of it a not impossible supposition.\nThe man who had the guilt upon his soul of having brought such a fate\nupon his own family might well be driven by remorse to inflict it upon\nhimself. There are, however, some cogent reasons against it.\nFortunately, there is one man in England who knows all about it, and I\nhave made arrangements by which we shall hear the facts this afternoon\nfrom his own lips. Ah! he is a little before his time. Perhaps you\nwould kindly step this way, Dr. Leon Sterndale. We have been conducing\na chemical experiment indoors which has left our little room hardly fit\nfor the reception of so distinguished a visitor.\"\n\nI had heard the click of the garden gate, and now the majestic figure\nof the great African explorer appeared upon the path. He turned in\nsome surprise towards the rustic arbour in which we sat.\n\n\"You sent for me, Mr. Holmes. I had your note about an hour ago, and I\nhave come, though I really do not know why I should obey your summons.\"\n\n\"Perhaps we can clear the point up before we separate,\" said Holmes.\n\"Meanwhile, I am much obliged to you for your courteous acquiescence.\nYou will excuse this informal reception in the open air, but my friend\nWatson and I have nearly furnished an additional chapter to what the\npapers call the Cornish Horror, and we prefer a clear atmosphere for\nthe present. Perhaps, since the matters which we have to discuss will\naffect you personally in a very intimate fashion, it is as well that we\nshould talk where there can be no eavesdropping.\"\n\nThe explorer took his cigar from his lips and gazed sternly at my\ncompanion.\n\n\"I am at a loss to know, sir,\" he said, \"what you can have to speak\nabout which affects me personally in a very intimate fashion.\"\n\n\"The killing of Mortimer Tregennis,\" said Holmes.\n\nFor a moment I wished that I were armed. 
Sterndale's fierce face\nturned to a dusky red, his eyes glared, and the knotted, passionate\nveins started out in his forehead, while he sprang forward with\nclenched hands towards my companion. Then he stopped, and with a\nviolent effort he resumed a cold, rigid calmness, which was, perhaps,\nmore suggestive of danger than his hot-headed outburst.\n\n\"I have lived so long among savages and beyond the law,\" said he, \"that\nI have got into the way of being a law to myself. You would do well,\nMr. Holmes, not to forget it, for I have no desire to do you an injury.\"\n\n\"Nor have I any desire to do you an injury, Dr. Sterndale. Surely the\nclearest proof of it is that, knowing what I know, I have sent for you\nand not for the police.\"\n\nSterndale sat down with a gasp, overawed for, perhaps, the first time\nin his adventurous life. There was a calm assurance of power in\nHolmes's manner which could not be withstood. Our visitor stammered\nfor a moment, his great hands opening and shutting in his agitation.\n\n\"What do you mean?\" he asked at last. \"If this is bluff upon your\npart, Mr. Holmes, you have chosen a bad man for your experiment. Let us\nhave no more beating about the bush. What DO you mean?\"\n\n\"I will tell you,\" said Holmes, \"and the reason why I tell you is that\nI hope frankness may beget frankness. What my next step may be will\ndepend entirely upon the nature of your own defence.\"\n\n\"My defence?\"\n\n\"Yes, sir.\"\n\n\"My defence against what?\"\n\n\"Against the charge of killing Mortimer Tregennis.\"\n\nSterndale mopped his forehead with his handkerchief. \"Upon my word,\nyou are getting on,\" said he. \"Do all your successes depend upon this\nprodigious power of bluff?\"\n\n\"The bluff,\" said Holmes sternly, \"is upon your side, Dr. Leon\nSterndale, and not upon mine. As a proof I will tell you some of the\nfacts upon which my conclusions are based. Of your return from\nPlymouth, allowing much of your property to go on to Africa, I will say\nnothing save that it first informed me that you were one of the factors\nwhich had to be taken into account in reconstructing this drama--\"\n\n\"I came back--\"\n\n\"I have heard your reasons and regard them as unconvincing and\ninadequate. We will pass that. You came down here to ask me whom I\nsuspected. I refused to answer you. You then went to the vicarage,\nwaited outside it for some time, and finally returned to your cottage.\"\n\n\"How do you know that?\"\n\n\"I followed you.\"\n\n\"I saw no one.\"\n\n\"That is what you may expect to see when I follow you. You spent a\nrestless night at your cottage, and you formed certain plans, which in\nthe early morning you proceeded to put into execution. Leaving your\ndoor just as day was breaking, you filled your pocket with some reddish\ngravel that was lying heaped beside your gate.\"\n\nSterndale gave a violent start and looked at Holmes in amazement.\n\n\"You then walked swiftly for the mile which separated you from the\nvicarage. You were wearing, I may remark, the same pair of ribbed\ntennis shoes which are at the present moment upon your feet. At the\nvicarage you passed through the orchard and the side hedge, coming out\nunder the window of the lodger Tregennis. It was now daylight, but the\nhousehold was not yet stirring. You drew some of the gravel from your\npocket, and you threw it up at the window above you.\"\n\nSterndale sprang to his feet.\n\n\"I believe that you are the devil himself!\" he cried.\n\nHolmes smiled at the compliment. 
\"It took two, or possibly three,\nhandfuls before the lodger came to the window. You beckoned him to\ncome down. He dressed hurriedly and descended to his sitting-room.\nYou entered by the window. There was an interview--a short one--during\nwhich you walked up and down the room. Then you passed out and closed\nthe window, standing on the lawn outside smoking a cigar and watching\nwhat occurred. Finally, after the death of Tregennis, you withdrew as\nyou had come. Now, Dr. Sterndale, how do you justify such conduct, and\nwhat were the motives for your actions? If you prevaricate or trifle\nwith me, I give you my assurance that the matter will pass out of my\nhands forever.\"\n\nOur visitor's face had turned ashen gray as he listened to the words of\nhis accuser. Now he sat for some time in thought with his face sunk in\nhis hands. Then with a sudden impulsive gesture he plucked a\nphotograph from his breast-pocket and threw it on the rustic table\nbefore us.\n\n\"That is why I have done it,\" said he.\n\nIt showed the bust and face of a very beautiful woman. Holmes stooped\nover it.\n\n\"Brenda Tregennis,\" said he.\n\n\"Yes, Brenda Tregennis,\" repeated our visitor. \"For years I have loved\nher. For years she has loved me. There is the secret of that Cornish\nseclusion which people have marvelled at. It has brought me close to\nthe one thing on earth that was dear to me. I could not marry her, for\nI have a wife who has left me for years and yet whom, by the deplorable\nlaws of England, I could not divorce. For years Brenda waited. For\nyears I waited. And this is what we have waited for.\" A terrible sob\nshook his great frame, and he clutched his throat under his brindled\nbeard. Then with an effort he mastered himself and spoke on:\n\n\"The vicar knew. He was in our confidence. He would tell you that she\nwas an angel upon earth. That was why he telegraphed to me and I\nreturned. What was my baggage or Africa to me when I learned that such\na fate had come upon my darling? There you have the missing clue to my\naction, Mr. Holmes.\"\n\n\"Proceed,\" said my friend.\n\nDr. Sterndale drew from his pocket a paper packet and laid it upon the\ntable. On the outside was written \"Radix pedis diaboli\" with a red\npoison label beneath it. He pushed it towards me. \"I understand that\nyou are a doctor, sir. Have you ever heard of this preparation?\"\n\n\"Devil's-foot root! No, I have never heard of it.\"\n\n\"It is no reflection upon your professional knowledge,\" said he, \"for I\nbelieve that, save for one sample in a laboratory at Buda, there is no\nother specimen in Europe. It has not yet found its way either into the\npharmacopoeia or into the literature of toxicology. The root is shaped\nlike a foot, half human, half goatlike; hence the fanciful name given\nby a botanical missionary. It is used as an ordeal poison by the\nmedicine-men in certain districts of West Africa and is kept as a\nsecret among them. This particular specimen I obtained under very\nextraordinary circumstances in the Ubangi country.\" He opened the\npaper as he spoke and disclosed a heap of reddish-brown, snuff-like\npowder.\n\n\"Well, sir?\" asked Holmes sternly.\n\n\"I am about to tell you, Mr. Holmes, all that actually occurred, for\nyou already know so much that it is clearly to my interest that you\nshould know all. I have already explained the relationship in which I\nstood to the Tregennis family. For the sake of the sister I was\nfriendly with the brothers. 
There was a family quarrel about money\nwhich estranged this man Mortimer, but it was supposed to be made up,\nand I afterwards met him as I did the others. He was a sly, subtle,\nscheming man, and several things arose which gave me a suspicion of\nhim, but I had no cause for any positive quarrel.\n\n\"One day, only a couple of weeks ago, he came down to my cottage and I\nshowed him some of my African curiosities. Among other things I\nexhibited this powder, and I told him of its strange properties, how it\nstimulates those brain centres which control the emotion of fear, and\nhow either madness or death is the fate of the unhappy native who is\nsubjected to the ordeal by the priest of his tribe. I told him also\nhow powerless European science would be to detect it. How he took it I\ncannot say, for I never left the room, but there is no doubt that it\nwas then, while I was opening cabinets and stooping to boxes, that he\nmanaged to abstract some of the devil's-foot root. I well remember how\nhe plied me with questions as to the amount and the time that was\nneeded for its effect, but I little dreamed that he could have a\npersonal reason for asking.\n\n\"I thought no more of the matter until the vicar's telegram reached me\nat Plymouth. This villain had thought that I would be at sea before\nthe news could reach me, and that I should be lost for years in Africa.\nBut I returned at once. Of course, I could not listen to the details\nwithout feeling assured that my poison had been used. I came round to\nsee you on the chance that some other explanation had suggested itself\nto you. But there could be none. I was convinced that Mortimer\nTregennis was the murderer; that for the sake of money, and with the\nidea, perhaps, that if the other members of his family were all insane\nhe would be the sole guardian of their joint property, he had used the\ndevil's-foot powder upon them, driven two of them out of their senses,\nand killed his sister Brenda, the one human being whom I have ever\nloved or who has ever loved me. There was his crime; what was to be\nhis punishment?\n\n\"Should I appeal to the law? Where were my proofs? I knew that the\nfacts were true, but could I help to make a jury of countrymen believe\nso fantastic a story? I might or I might not. But I could not afford\nto fail. My soul cried out for revenge. I have said to you once\nbefore, Mr. Holmes, that I have spent much of my life outside the law,\nand that I have come at last to be a law to myself. So it was even\nnow. I determined that the fate which he had given to others should be\nshared by himself. Either that or I would do justice upon him with my\nown hand. In all England there can be no man who sets less value upon\nhis own life than I do at the present moment.\n\n\"Now I have told you all. You have yourself supplied the rest. I did,\nas you say, after a restless night, set off early from my cottage. I\nforesaw the difficulty of arousing him, so I gathered some gravel from\nthe pile which you have mentioned, and I used it to throw up to his\nwindow. He came down and admitted me through the window of the\nsitting-room. I laid his offence before him. I told him that I had\ncome both as judge and executioner. The wretch sank into a chair,\nparalyzed at the sight of my revolver. I lit the lamp, put the powder\nabove it, and stood outside the window, ready to carry out my threat to\nshoot him should he try to leave the room. In five minutes he died.\nMy God! how he died! 
But my heart was flint, for he endured nothing\nwhich my innocent darling had not felt before him. There is my story,\nMr. Holmes. Perhaps, if you loved a woman, you would have done as much\nyourself. At any rate, I am in your hands. You can take what steps\nyou like. As I have already said, there is no man living who can fear\ndeath less than I do.\"\n\nHolmes sat for some little time in silence.\n\n\"What were your plans?\" he asked at last.\n\n\"I had intended to bury myself in central Africa. My work there is but\nhalf finished.\"\n\n\"Go and do the other half,\" said Holmes. \"I, at least, am not prepared\nto prevent you.\"\n\nDr. Sterndale raised his giant figure, bowed gravely, and walked from\nthe arbour. Holmes lit his pipe and handed me his pouch.\n\n\"Some fumes which are not poisonous would be a welcome change,\" said\nhe. \"I think you must agree, Watson, that it is not a case in which we\nare called upon to interfere. Our investigation has been independent,\nand our action shall be so also. You would not denounce the man?\"\n\n\"Certainly not,\" I answered.\n\n\"I have never loved, Watson, but if I did and if the woman I loved had\nmet such an end, I might act even as our lawless lion-hunter has done.\nWho knows? Well, Watson, I will not offend your intelligence by\nexplaining what is obvious. The gravel upon the window-sill was, of\ncourse, the starting-point of my research. It was unlike anything in\nthe vicarage garden. Only when my attention had been drawn to Dr.\nSterndale and his cottage did I find its counterpart. The lamp shining\nin broad daylight and the remains of powder upon the shield were\nsuccessive links in a fairly obvious chain. And now, my dear Watson, I\nthink we may dismiss the matter from our mind and go back with a clear\nconscience to the study of those Chaldean roots which are surely to be\ntraced in the Cornish branch of the great Celtic speech.\"\n\n\n\n\n\n\n\n\n\nEnd of the Project Gutenberg EBook of The Adventure of the Devil's Foot, by \nArthur Conan Doyle", "answers": ["Six hours."]}
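The long passage above is the raw content of a data file bundled with this upload: a JSON record whose string value carries the full Doyle story with escaped newlines, and whose visible tail ends in an "answers" field ("Six hours."), i.e. a long-context question-answering example. Below is a minimal sketch of how such a record might be inspected; the file path and the JSON Lines layout are assumptions for illustration, and only the "answers" key is taken from the visible content.

# Minimal sketch: inspecting a long-context QA record like the one above.
# Assumptions (not shown in this diff): the file is JSON Lines and lives at a
# hypothetical path; every field name except "answers" is treated generically.
import json

path = "data/longcontext_qa.jsonl"  # hypothetical path

with open(path, encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        answers = record.get("answers", [])            # e.g. ["Six hours."]
        other_fields = [k for k in record if k != "answers"]
        print(f"reference answers: {answers}; other fields: {other_fields}")
        break  # only look at the first record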
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
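The special_tokens_map.json above declares the standard Llama-2 special tokens: "<s>" as BOS, "</s>" as EOS, "<unk>" as UNK, and "</s>" reused as the padding token. A minimal sketch of checking these through the tokenizer follows; the repo id is an assumption inferred from the "_name_or_path" field in config.json.

# Sketch: confirm the special tokens declared in special_tokens_map.json.
# The repo id is an assumption; substitute a local path or the actual repo id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("namespace-Pt/ultragist-llama2-7b-chat")

print(tok.bos_token)  # "<s>"
print(tok.eos_token)  # "</s>"
print(tok.unk_token)  # "<unk>"
print(tok.pad_token)  # "</s>" -- padding reuses the EOS token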
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "bos_token": "<s>",
31
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": false,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "</s>",
37
+ "padding_side": "left",
38
+ "sp_model_kwargs": {},
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
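The tokenizer_config.json above configures a standard LlamaTokenizer with left-side padding, the EOS token doubling as the pad token, and a Llama-2 chat_template that wraps an optional system message in <<SYS>> ... <</SYS>> and each user turn in [INST] ... [/INST]. A minimal sketch of rendering a conversation through that template follows; the repo id is the same assumption as above.

# Sketch: render a conversation with the Llama-2 chat template declared above.
# The repo id is an assumption; adjust to a local copy if needed.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("namespace-Pt/ultragist-llama2-7b-chat")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the case in one sentence."},
]

# tokenize=False returns the formatted prompt string, roughly:
# "<s>[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\nSummarize ... [/INST]"
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)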