namespace-Pt committed on
Commit 0192243
1 Parent(s): 55963c0

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "_name_or_path": "ultragist-llama2-7b-chat",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_llama.LlamaConfig",
+     "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM"
+   },
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "ultragist_attend_prev": true,
+   "ultragist_attn": "step-expansion",
+   "ultragist_embed_init": "eos",
+   "ultragist_param": [
+     "q",
+     "k",
+     "v",
+     "o"
+   ],
+   "ultragist_ratio": [
+     2,
+     4,
+     8,
+     16,
+     32
+   ],
+   "ultragist_ratio_mix": "adapt-1024",
+   "ultragist_sink_size": 1,
+   "ultragist_stride": 1024,
+   "ultragist_window": 1024,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 4096,
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_position_embeddings": 4096,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 32,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.39.3",
+   "use_cache": true,
+   "vocab_size": 32000
+ }
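
Because `auto_map` in config.json points at the bundled configuration_llama.py and modeling_llama.py, loading this checkpoint goes through transformers' remote-code path. A minimal loading sketch; the repo id is an assumption inferred from the commit author and `_name_or_path`, while the API calls are standard transformers:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "namespace-Pt/ultragist-llama2-7b-chat"  # assumed repo id, substitute the actual one
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,       # required because auto_map points at the bundled .py files
    torch_dtype=torch.bfloat16,   # matches "torch_dtype": "bfloat16" in config.json
)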
configuration_llama.py ADDED
@@ -0,0 +1,217 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ LLaMA model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class LlamaConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the LLaMA-7B.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 32000):
+             Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`LlamaModel`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
+             Llama 2 up to 4096, CodeLlama up to 16384.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             Padding token id.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             Beginning of stream token id.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             End of stream token id.
+         pretraining_tp (`int`, *optional*, defaults to 1):
+             Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+             document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+             necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+             issue](https://github.com/pytorch/pytorch/issues/76232).
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+             `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+             these scaling strategies behave:
+             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+             experimental feature, subject to breaking API changes in future versions.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import LlamaModel, LlamaConfig
+
+     >>> # Initializing a LLaMA llama-7b style configuration
+     >>> configuration = LlamaConfig()
+
+     >>> # Initializing a model from the llama-7b style configuration
+     >>> model = LlamaModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "llama"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         hidden_act="silu",
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=1,
+         eos_token_id=2,
+         pretraining_tp=1,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         attention_dropout=0.0,
+         ultragist_window=1024,
+         ultragist_stride=1024,
+         ultragist_attn="step-expansion",
+         ultragist_ratio=[2, 4, 8, 16, 32],
+         ultragist_ratio_mix="step-random",
+         ultragist_param=["q", "k", "v", "o"],
+         ultragist_embed_init="eos",
+         ultragist_sink_size=0,
+         ultragist_attend_prev=True,
+         retrieval_method=None,
+         retrieval_topk=None,
+         retrieval_key_length=None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.pretraining_tp = pretraining_tp
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self._rope_scaling_validation()
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+
+         self.ultragist_window = ultragist_window
+         self.ultragist_stride = ultragist_stride
+         self.ultragist_attn = ultragist_attn
+         self.ultragist_ratio = ultragist_ratio
+         self.ultragist_ratio_mix = ultragist_ratio_mix
+         self.ultragist_param = ultragist_param
+         self.ultragist_embed_init = ultragist_embed_init
+         self.ultragist_sink_size = ultragist_sink_size
+         self.ultragist_attend_prev = ultragist_attend_prev
+         self.retrieval_method = retrieval_method
+         self.retrieval_topk = retrieval_topk
+         self.retrieval_key_length = retrieval_key_length
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+             raise ValueError(
+                 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                 f"got {self.rope_scaling}"
+             )
+         rope_scaling_type = self.rope_scaling.get("type", None)
+         rope_scaling_factor = self.rope_scaling.get("factor", None)
+         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+             )
+         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+             raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
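
The class above is the stock transformers LlamaConfig extended with `ultragist_*` and `retrieval_*` fields, plus the usual `rope_scaling` validation. A short behavioural sketch, assuming the file is importable locally as `configuration_llama` (hypothetical import path):

from configuration_llama import LlamaConfig  # hypothetical local import of the file above

cfg = LlamaConfig(
    ultragist_ratio=[2, 4, 8, 16, 32],
    ultragist_ratio_mix="adapt-1024",  # the value this checkpoint's config.json stores
    ultragist_sink_size=1,
)
print(cfg.ultragist_attn)  # "step-expansion" (the default, also used by config.json)

# The rope_scaling validation rejects factors <= 1:
try:
    LlamaConfig(rope_scaling={"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)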
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "bos_token_id": 1,
+   "do_sample": true,
+   "eos_token_id": 2,
+   "max_length": 4096,
+   "pad_token_id": 0,
+   "temperature": 0.6,
+   "top_p": 0.9,
+   "transformers_version": "4.39.3"
+ }
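
These defaults are picked up by `generate()` unless overridden at call time, so sampling with temperature 0.6 and top-p 0.9 is the out-of-the-box behaviour. A sketch reusing the `model` and `tokenizer` objects from the loading sketch above (an assumption), with a hypothetical prompt:

prompt = "Summarize the key points of the meeting notes below."  # hypothetical prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# do_sample=True, temperature=0.6 and top_p=0.9 come from generation_config.json
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))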
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e724498a663c9192f2521771f8044dec09f1744f6f19ee970c5e654c8168b63
+ size 4932654472
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6dadb7e94374e54d0d1f80d386e823c528d738bfe33142f0240b198bf02e55a
+ size 4941051712
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82d498b097cc0925203403a79974562f65832e1543ebdaa9830eebbc8a389442
+ size 4974622888
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44d55d74a3b5a8283b72b5534846e020dff2fb92cbe115603cee0ae4220dbcb9
+ size 2923527888
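
The three-line files above are Git LFS pointers rather than the weights themselves: each records the SHA-256 (`oid`) and byte size of one shard. A small integrity-check sketch, assuming the real shard has been downloaded into the working directory (hypothetical local path):

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so multi-GB shards do not have to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "44d55d74a3b5a8283b72b5534846e020dff2fb92cbe115603cee0ae4220dbcb9"  # oid of shard 4 above
print(sha256_of("model-00004-of-00004.safetensors") == expected)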
model.safetensors.index.json ADDED
@@ -0,0 +1,427 @@
+ {
+   "metadata": {
+     "total_size": 17771806720
+   },
+   "weight_map": {
+     "lm_head.weight": "model-00004-of-00004.safetensors",
+     "model.ultragist_embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.ultragist_v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+     "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.30.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.ultragist_v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+     "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.ultragist_v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+     "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.ultragist_v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+     "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+     "model.norm.weight": "model-00004-of-00004.safetensors"
+   }
+ }
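
The index is plain JSON: `metadata.total_size` is the combined byte size of all shards (17,771,806,720 bytes) and `weight_map` maps every parameter name, including the extra `ultragist_*` projections, to the shard that stores it. transformers resolves sharded loading from this file automatically, but it can also be inspected directly; a sketch assuming a local copy of the file:

import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])  # 17771806720
print(index["weight_map"]["model.layers.0.self_attn.ultragist_q_proj.weight"])  # model-00001-of-00004.safetensors
print(Counter(index["weight_map"].values()))  # how many tensors live in each shard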
modeling_llama.py ADDED
@@ -0,0 +1,1461 @@
+ # coding=utf-8
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ PyTorch LLaMA model."""
+ import time
+ import math
+ import warnings
+ from typing import List, Optional, Tuple, Union, Mapping
+ from contextlib import nullcontext
+ from dataclasses import dataclass
+ from collections import defaultdict
+ from tqdm import tqdm
+ from accelerate import Accelerator
+
+ import os
+ import torch
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ from torch import nn
+
+ from transformers.activations import ACT2FN
+ from transformers.cache_utils import Cache
+ from transformers.modeling_attn_mask_utils import (
+     AttentionMaskConverter,
+     _prepare_4d_attention_mask,
+     _prepare_4d_causal_attention_mask,
+     _prepare_4d_causal_attention_mask_for_sdpa,
+ )
+ from transformers.modeling_outputs import BaseModelOutputWithPast
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+ from transformers.utils import (
+     add_start_docstrings,
+     add_start_docstrings_to_model_forward,
+     is_flash_attn_2_available,
+     is_flash_attn_greater_or_equal_2_10,
+     logging,
+     replace_return_docstrings,
+ )
+ from transformers.integrations import is_deepspeed_zero3_enabled
+ from transformers.utils.import_utils import is_torch_fx_available
+
+ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
+ # It means that the function will not be traced through and simply appear as a node in the graph.
+ if is_torch_fx_available():
+     if not is_torch_greater_or_equal_than_1_13:
+         import torch.fx
+
+     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+
+ from .configuration_llama import LlamaConfig
+ from .modeling_ultragist import Memory
+ from .modeling_utils import optional_grad_ctx, compute_loss, ModelOutput
+
+
+ logger = logging.get_logger(__name__)
+
+ _CONFIG_FOR_DOC = "LlamaConfig"
+
+
+ class LlamaRMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-6):
+         """
+         LlamaRMSNorm is equivalent to T5LayerNorm
+         """
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+
+
+ ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
+
+
+ class LlamaRotaryEmbedding(nn.Module):
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+         super().__init__()
+
+         self.dim = dim
+         self.max_position_embeddings = max_position_embeddings
+         self.base = base
+         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+         # Build here to make `torch.jit.trace` work.
+         self._set_cos_sin_cache(
+             seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+         )
+
+     def _set_cos_sin_cache(self, seq_len, device, dtype):
+         self.max_seq_len_cached = seq_len
+         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+         freqs = torch.outer(t, self.inv_freq)
+         # Different from paper, but it uses a different permutation in order to obtain the same calculation
+         emb = torch.cat((freqs, freqs), dim=-1)
+         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+     def forward(self, x, seq_len=None):
+         # x: [bs, num_attention_heads, seq_len, head_size]
+         if seq_len > self.max_seq_len_cached:
+             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+         return (
+             self.cos_cached[:seq_len].to(dtype=x.dtype),
+             self.sin_cached[:seq_len].to(dtype=x.dtype),
+         )
+
+
+ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
+     """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+         self.scaling_factor = scaling_factor
+         super().__init__(dim, max_position_embeddings, base, device)
+
+     def _set_cos_sin_cache(self, seq_len, device, dtype):
+         self.max_seq_len_cached = seq_len
+         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+         t = t / self.scaling_factor
+
+         freqs = torch.outer(t, self.inv_freq)
+         # Different from paper, but it uses a different permutation in order to obtain the same calculation
+         emb = torch.cat((freqs, freqs), dim=-1)
+         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+ class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
+     """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+         self.scaling_factor = scaling_factor
157
+ super().__init__(dim, max_position_embeddings, base, device)
158
+
159
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
160
+ self.max_seq_len_cached = seq_len
161
+
162
+ if seq_len > self.max_position_embeddings:
163
+ base = self.base * (
164
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
165
+ ) ** (self.dim / (self.dim - 2))
166
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
167
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
168
+
169
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
170
+
171
+ freqs = torch.outer(t, self.inv_freq)
172
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
173
+ emb = torch.cat((freqs, freqs), dim=-1)
174
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
175
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
176
+
177
+
178
+ def rotate_half(x):
179
+ """Rotates half the hidden dims of the input."""
180
+ x1 = x[..., : x.shape[-1] // 2]
181
+ x2 = x[..., x.shape[-1] // 2 :]
182
+ return torch.cat((-x2, x1), dim=-1)
183
+
184
+
185
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
186
+ """Applies Rotary Position Embedding to the query and key tensors.
187
+
188
+ Args:
189
+ q (`torch.Tensor`): The query tensor.
190
+ k (`torch.Tensor`): The key tensor.
191
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
192
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
193
+ position_ids (`torch.Tensor`):
194
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
195
+ used to pass offset position ids when working with a KV-cache.
196
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
197
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
198
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
199
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
200
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
201
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
202
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
203
+ Returns:
204
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
205
+ """
206
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
207
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
208
+ q_embed = (q * cos) + (rotate_half(q) * sin)
209
+ k_embed = (k * cos) + (rotate_half(k) * sin)
210
+ return q_embed, k_embed
211
+
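For intuition about how the rotation above broadcasts, here is a minimal, self-contained sketch with toy shapes (rotate_half is re-declared locally so the snippet runs on its own; none of this touches the model's weights):

    import torch

    def rotate_half(x):
        x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    bs, heads, seq_len, head_dim = 1, 2, 4, 8
    q = torch.randn(bs, heads, seq_len, head_dim)
    cos, sin = torch.randn(seq_len, head_dim), torch.randn(seq_len, head_dim)
    position_ids = torch.arange(seq_len).unsqueeze(0)    # [bs, seq_len]
    cos_sel = cos[position_ids].unsqueeze(1)             # [bs, 1, seq_len, head_dim], i.e. unsqueeze_dim=1
    sin_sel = sin[position_ids].unsqueeze(1)
    q_embed = q * cos_sel + rotate_half(q) * sin_sel     # broadcasts over the head dimension
    assert q_embed.shape == q.shape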
212
+ # Copied from streaming-llm
213
+ def apply_rotary_pos_emb_single(x, cos, sin, position_ids):
214
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
215
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
216
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
217
+ x_embed = (x * cos) + (rotate_half(x) * sin)
218
+ return x_embed
219
+
220
+
221
+ class LlamaMLP(nn.Module):
222
+ def __init__(self, config):
223
+ super().__init__()
224
+ self.config = config
225
+ self.hidden_size = config.hidden_size
226
+ self.intermediate_size = config.intermediate_size
227
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
228
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
229
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
230
+ self.act_fn = ACT2FN[config.hidden_act]
231
+
232
+ if "mlp" in config.ultragist_param:
233
+ self.ultragist_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
234
+ self.ultragist_up_proj.weight.data.zero_()
235
+ self.ultragist_up_proj._is_hf_initialized = True
236
+
237
+ self.ultragist_down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
238
+ self.ultragist_down_proj.weight.data.zero_()
239
+ self.ultragist_down_proj._is_hf_initialized = True
240
+
241
+ def _init_ultragist_proj(self, missing_keys):
242
+ """Initialize the ultragist projection weight with that of the ordinal projection."""
243
+ if "mlp" in self.config.ultragist_param:
244
+ if is_deepspeed_zero3_enabled():
245
+ import deepspeed
246
+ params = [self.up_proj.weight, self.down_proj.weight, self.ultragist_up_proj.weight, self.ultragist_down_proj.weight]
247
+ with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
248
+ if (self.ultragist_up_proj.weight.sum(-1) == 0).any():
249
+ self.ultragist_up_proj.weight.data[:] = self.up_proj.weight.data
250
+ self.ultragist_down_proj.weight.data[:] = self.down_proj.weight.data
251
+ else:
252
+ if any("ultragist_up_proj" in missing_key for missing_key in missing_keys):
253
+ # only copy the value in-place, without tying the weight
254
+ self.ultragist_up_proj.weight.data[:] = self.up_proj.weight.data
255
+ self.ultragist_down_proj.weight.data[:] = self.down_proj.weight.data
256
+
257
+ def forward(self, x, ultragist_size):
258
+ if self.config.pretraining_tp > 1:
259
+ # TODO: support pretraining_tp
260
+ raise NotImplementedError
261
+
262
+ slice = self.intermediate_size // self.config.pretraining_tp
263
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
264
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
265
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
266
+
267
+ gate_proj = torch.cat(
268
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
269
+ )
270
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
271
+
272
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
273
+ down_proj = [
274
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
275
+ ]
276
+ down_proj = sum(down_proj)
277
+
278
+ else:
279
+ if "mlp" in self.config.ultragist_param:
280
+ if ultragist_size > 0:
281
+ ordinal_hidden_states = x[:, :-ultragist_size]
282
+ ultragist_hidden_states = x[:, -ultragist_size:]
283
+
284
+ ordinal_down_proj = self.down_proj(self.act_fn(self.gate_proj(ordinal_hidden_states)) * self.up_proj(ordinal_hidden_states))
285
+ ultragist_down_proj = self.ultragist_down_proj(self.act_fn(self.gate_proj(ultragist_hidden_states)) * self.ultragist_up_proj(ultragist_hidden_states))
286
+ down_proj = torch.cat([ordinal_down_proj, ultragist_down_proj], dim=1)
287
+ else:
288
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
289
+ else:
290
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
291
+
292
+ return down_proj
293
+
294
+
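The ordinal/ultragist split used in the MLP above follows one pattern: the trailing ultragist_size positions go through the ultragist projections, everything else goes through the ordinary ones, and the two results are concatenated back along the sequence dimension. A minimal sketch with toy linear layers (not the model's actual weights):

    import torch

    hidden = torch.randn(1, 10, 16)       # 7 ordinal positions followed by 3 ultragist positions
    ultragist_size = 3
    up_proj = torch.nn.Linear(16, 32, bias=False)
    ultragist_up_proj = torch.nn.Linear(16, 32, bias=False)

    ordinal_out = up_proj(hidden[:, :-ultragist_size])
    ultragist_out = ultragist_up_proj(hidden[:, -ultragist_size:])
    out = torch.cat([ordinal_out, ultragist_out], dim=1)
    assert out.shape == (1, 10, 32)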
295
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
296
+ """
297
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
298
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
299
+ """
300
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
301
+ if n_rep == 1:
302
+ return hidden_states
303
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
304
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
305
+
306
+
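A quick numeric check of the equivalence stated in the repeat_kv docstring, with toy sizes:

    import torch

    bs, kv_heads, n_rep, seqlen, head_dim = 1, 2, 4, 3, 8
    kv = torch.randn(bs, kv_heads, seqlen, head_dim)
    expanded = kv[:, :, None, :, :].expand(bs, kv_heads, n_rep, seqlen, head_dim)
    expanded = expanded.reshape(bs, kv_heads * n_rep, seqlen, head_dim)
    assert torch.equal(expanded, torch.repeat_interleave(kv, repeats=n_rep, dim=1))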
307
+ class LlamaAttention(nn.Module):
308
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
309
+
310
+ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
311
+ super().__init__()
312
+ self.config = config
313
+ self.layer_idx = layer_idx
314
+ if layer_idx is None:
315
+ logger.warning_once(
316
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
317
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
318
+ "when creating this class."
319
+ )
320
+
321
+ self.attention_dropout = config.attention_dropout
322
+ self.hidden_size = config.hidden_size
323
+ self.num_heads = config.num_attention_heads
324
+ self.head_dim = self.hidden_size // self.num_heads
325
+ self.num_key_value_heads = config.num_key_value_heads
326
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
327
+ self.max_position_embeddings = config.max_position_embeddings
328
+ self.rope_theta = config.rope_theta
329
+ self.is_causal = True
330
+
331
+ if (self.head_dim * self.num_heads) != self.hidden_size:
332
+ raise ValueError(
333
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
334
+ f" and `num_heads`: {self.num_heads})."
335
+ )
336
+
337
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
338
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
339
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
340
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
341
+ self._init_rope()
342
+
343
+ # NOTE: add extra parameters for ultragist tokens
344
+ # skip post initialization to speed up loading
345
+ if "q" in config.ultragist_param:
346
+ self.ultragist_q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
347
+ # NOTE: initialize the ultragist parameters as zero
348
+ self.ultragist_q_proj.weight.data.zero_()
349
+ self.ultragist_q_proj._is_hf_initialized = True
350
+ if "k" in config.ultragist_param:
351
+ self.ultragist_k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
352
+ self.ultragist_k_proj.weight.data.zero_()
353
+ self.ultragist_k_proj._is_hf_initialized = True
354
+ if "v" in config.ultragist_param:
355
+ self.ultragist_v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
356
+ self.ultragist_v_proj.weight.data.zero_()
357
+ self.ultragist_v_proj._is_hf_initialized = True
358
+ if "o" in config.ultragist_param:
359
+ self.ultragist_o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
360
+ self.ultragist_o_proj.weight.data.zero_()
361
+ self.ultragist_o_proj._is_hf_initialized = True
362
+
363
+ def _init_rope(self):
364
+ if self.config.rope_scaling is None:
365
+ self.rotary_emb = LlamaRotaryEmbedding(
366
+ self.head_dim,
367
+ max_position_embeddings=self.max_position_embeddings,
368
+ base=self.rope_theta,
369
+ )
370
+ else:
371
+ scaling_type = self.config.rope_scaling["type"]
372
+ scaling_factor = self.config.rope_scaling["factor"]
373
+ if scaling_type == "linear":
374
+ self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
375
+ self.head_dim,
376
+ max_position_embeddings=self.max_position_embeddings,
377
+ scaling_factor=scaling_factor,
378
+ base=self.rope_theta,
379
+ )
380
+ elif scaling_type == "dynamic":
381
+ self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
382
+ self.head_dim,
383
+ max_position_embeddings=self.max_position_embeddings,
384
+ scaling_factor=scaling_factor,
385
+ base=self.rope_theta,
386
+ )
387
+ else:
388
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
389
+
390
+ def _init_ultragist_proj(self, missing_keys):
391
+ """Initialize the ultragist projection weight with that of the ordinal projection."""
392
+ ultragist_param = self.config.ultragist_param
393
+
394
+ if is_deepspeed_zero3_enabled():
395
+ import deepspeed
396
+ if "q" in ultragist_param:
397
+ with deepspeed.zero.GatheredParameters([self.ultragist_q_proj.weight, self.q_proj.weight], modifier_rank=0):
398
+ # FIXME: after deepspeed initialization, some weights become non-zero, but there are rows that are full of zeros
399
+ if (self.ultragist_q_proj.weight.sum(-1) == 0).any():
400
+ self.ultragist_q_proj.weight.data[:] = self.q_proj.weight.data
401
+ if "k" in ultragist_param:
402
+ with deepspeed.zero.GatheredParameters([self.ultragist_k_proj.weight, self.k_proj.weight], modifier_rank=0):
403
+ if (self.ultragist_k_proj.weight.sum(-1) == 0).any():
404
+ self.ultragist_k_proj.weight.data[:] = self.k_proj.weight.data
405
+ if "v" in ultragist_param:
406
+ with deepspeed.zero.GatheredParameters([self.ultragist_v_proj.weight, self.v_proj.weight], modifier_rank=0):
407
+ if (self.ultragist_v_proj.weight.sum(-1) == 0).any():
408
+ self.ultragist_v_proj.weight.data[:] = self.v_proj.weight.data
409
+ if "o" in ultragist_param:
410
+ with deepspeed.zero.GatheredParameters([self.ultragist_o_proj.weight, self.o_proj.weight], modifier_rank=0):
411
+ if (self.ultragist_o_proj.weight.sum(-1) == 0).any():
412
+ self.ultragist_o_proj.weight.data[:] = self.o_proj.weight.data
413
+ else:
414
+ # only copy the value in-place, without tying the weight
415
+ if "q" in ultragist_param and any("ultragist_q_proj" in missing_key for missing_key in missing_keys):
416
+ if (self.ultragist_q_proj.weight == 0).all():
417
+ self.ultragist_q_proj.weight.data[:] = self.q_proj.weight.data
418
+ if "k" in ultragist_param and any("ultragist_k_proj" in missing_key for missing_key in missing_keys):
419
+ if (self.ultragist_k_proj.weight == 0).all():
420
+ self.ultragist_k_proj.weight.data[:] = self.k_proj.weight.data
421
+ if "v" in ultragist_param and any("ultragist_v_proj" in missing_key for missing_key in missing_keys):
422
+ if (self.ultragist_v_proj.weight == 0).all():
423
+ self.ultragist_v_proj.weight.data[:] = self.v_proj.weight.data
424
+ if "o" in ultragist_param and any("ultragist_o_proj" in missing_key for missing_key in missing_keys):
425
+ if (self.ultragist_o_proj.weight == 0).all():
426
+ self.ultragist_o_proj.weight.data[:] = self.o_proj.weight.data
427
+
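The ultragist projections are created with zero weights and, when they are missing from the checkpoint, overwritten with the corresponding base weights by _init_ultragist_proj above, so a freshly extended model initially behaves exactly like the underlying projection. A minimal sketch of that copy-without-tying pattern with toy layers:

    import torch

    q_proj = torch.nn.Linear(8, 8, bias=False)
    ultragist_q_proj = torch.nn.Linear(8, 8, bias=False)
    ultragist_q_proj.weight.data.zero_()

    # copy values in place; the two parameters remain independent (no weight tying)
    if (ultragist_q_proj.weight == 0).all():
        ultragist_q_proj.weight.data[:] = q_proj.weight.data

    x = torch.randn(2, 8)
    assert torch.allclose(q_proj(x), ultragist_q_proj(x))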
428
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
429
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
430
+
431
+ def qkv_proj_with_ultragist(self, hidden_states, ultragist_size=0):
432
+ if ultragist_size > 0:
433
+ ordinal_hidden_states = hidden_states[:, :-ultragist_size]
434
+ ultragist_hidden_states = hidden_states[:, -ultragist_size:]
435
+
436
+ if "q" in self.config.ultragist_param:
437
+ ordinal_query_states = self.q_proj(ordinal_hidden_states)
438
+ ultragist_query_states = self.ultragist_q_proj(ultragist_hidden_states)
439
+ query_states = torch.cat([ordinal_query_states, ultragist_query_states], dim=1)
440
+ else:
441
+ query_states = self.q_proj(hidden_states)
442
+
443
+ if "k" in self.config.ultragist_param:
444
+ ordinal_key_states = self.k_proj(ordinal_hidden_states)
445
+ ultragist_key_states = self.ultragist_k_proj(ultragist_hidden_states)
446
+ key_states = torch.cat([ordinal_key_states, ultragist_key_states], dim=1)
447
+ else:
448
+ key_states = self.k_proj(hidden_states)
449
+
450
+ if "v" in self.config.ultragist_param:
451
+ ordinal_value_states = self.v_proj(ordinal_hidden_states)
452
+ ultragist_value_states = self.ultragist_v_proj(ultragist_hidden_states)
453
+ value_states = torch.cat([ordinal_value_states, ultragist_value_states], dim=1)
454
+ else:
455
+ value_states = self.v_proj(hidden_states)
456
+
457
+ else:
458
+ query_states = self.q_proj(hidden_states)
459
+ key_states = self.k_proj(hidden_states)
460
+ value_states = self.v_proj(hidden_states)
461
+
462
+ return query_states, key_states, value_states
463
+
464
+ def o_proj_with_ultragist(self, attn_output, ultragist_size=0):
465
+ if ultragist_size > 0:
466
+ if "o" in self.config.ultragist_param:
467
+ ordinal_attn_output = self.o_proj(attn_output[:, :-ultragist_size])
468
+ ultragist_attn_output = self.ultragist_o_proj(attn_output[:, -ultragist_size:])
469
+ attn_output = torch.cat([ordinal_attn_output, ultragist_attn_output], dim=1)
470
+ else:
471
+ attn_output = self.o_proj(attn_output)
472
+ else:
473
+ attn_output = self.o_proj(attn_output)
474
+ return attn_output
475
+
476
+ def forward(
477
+ self,
478
+ hidden_states: torch.Tensor,
479
+ attention_mask: Optional[torch.Tensor] = None,
480
+ position_ids: Optional[torch.LongTensor] = None,
481
+ past_key_value: Optional[Cache] = None,
482
+ output_attentions: bool = False,
483
+ use_cache: bool = False,
484
+ **kwargs,
485
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
486
+ if "padding_mask" in kwargs:
487
+ warnings.warn(
488
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
489
+ )
490
+
491
+ bsz, q_len, _ = hidden_states.size()
492
+ kv_seq_len = hidden_states.shape[-2]
493
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_value
494
+
495
+ if past_key is not None:
496
+ past_seq_len = past_key.shape[2]
497
+ kv_seq_len += past_seq_len
498
+ else:
499
+ past_seq_len = 0
500
+
501
+ query_states, key_states, value_states = self.qkv_proj_with_ultragist(hidden_states, total_ultragist_size)
502
+
503
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
504
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
505
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
506
+
507
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
508
+
509
+ # return keys and values before rope
510
+ # NOTE: incrementally return keys and values for efficiency
511
+ if window_size > 0:
512
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
513
+
514
+ if past_key is not None:
515
+ # reuse k, v, self_attention
516
+ key_states = torch.cat([past_key, key_states], dim=2)
517
+ value_states = torch.cat([past_value, value_states], dim=2)
518
+
519
+ # NOTE: window_size == 0 indicates the ultragist mechanism is disabled and the model works as is, so the new past_key_values should be concatenated with the old ones
520
+ if window_size == 0:
521
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
522
+
523
+ key_position_ids = position_ids
524
+ # align query position_ids with key
525
+ query_position_ids = key_position_ids[:, -q_len:]
526
+
527
+ key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
528
+ query_states = apply_rotary_pos_emb_single(query_states, cos, sin, query_position_ids)
529
+
530
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
531
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
532
+
533
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
534
+
535
+ # debug attention weights
536
+ # if q_len == 1:
537
+ # with open(f"data/debug/{self.layer_idx}.txt", "w") as f:
538
+ # torch.set_printoptions(profile="full",linewidth=10000000,precision=1,sci_mode=False)
539
+ # a = attn_weights.mean(1)
540
+ # f.write(f"past_length: {past_key.shape[2]}\nattn_weight: {a.shape}\n")
541
+ # f.write(str(a))
542
+ # torch.set_printoptions(profile="default")
543
+ # if self.layer_idx == self.config.num_hidden_layers - 1:
544
+ # print("this is time!!!")
545
+ # input()
546
+
547
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
548
+ raise ValueError(
549
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
550
+ f" {attn_weights.size()}"
551
+ )
552
+
553
+ if attention_mask is not None:
554
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
555
+ raise ValueError(
556
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
557
+ )
558
+ attn_weights = attn_weights + attention_mask
559
+
560
+ # upcast attention to fp32
561
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
562
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
563
+ attn_output = torch.matmul(attn_weights, value_states)
564
+
565
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
566
+ raise ValueError(
567
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
568
+ f" {attn_output.size()}"
569
+ )
570
+
571
+ # for debug
572
+ # if past_key.shape[2] == 128 and self.layer_idx == 0:
573
+ # torch.save({
574
+ # "hidden": hidden_states,
575
+ # "query": query_states,
576
+ # "key": key_states,
577
+ # "value": value_states,
578
+ # "output": attn_output,
579
+ # "query_position_ids": query_position_ids,
580
+ # "key_position_ids": key_position_ids,
581
+ # }, "attn-output.pt")
582
+
583
+ attn_output = attn_output.transpose(1, 2).contiguous()
584
+
585
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
586
+
587
+ attn_output = self.o_proj_with_ultragist(attn_output, total_ultragist_size)
588
+
589
+ if not output_attentions:
590
+ attn_weights = None
591
+
592
+ return attn_output, attn_weights, past_key_value
593
+
594
+
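The past_key_value consumed by the attention forward above is a plain per-layer 6-tuple rather than a transformers Cache object. A hedged sketch of its layout, inferred from how it is unpacked in this file (the variable names below are illustrative):

    # (past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
    #   past_key / past_value: [bs, num_kv_heads, past_len, head_dim], or None before anything is cached
    #   window_size == 0 disables the ultragist path and falls back to ordinary concatenated KV caching
    empty_layer_cache = (None, None, [0], 0, 0, 0)    # same default that _native_forward builds later
    num_hidden_layers = 32                            # assumption: one tuple per decoder layer
    past_key_values = [empty_layer_cache for _ in range(num_hidden_layers)]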
595
+ class LlamaSdpaAttention(LlamaAttention):
596
+ """
597
+ Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
598
+ `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
599
+ SDPA API.
600
+ """
601
+
602
+ # Adapted from LlamaAttention.forward
603
+ def forward(
604
+ self,
605
+ hidden_states: torch.Tensor,
606
+ attention_mask: Optional[torch.Tensor] = None,
607
+ position_ids: Optional[torch.LongTensor] = None,
608
+ past_key_value: Optional[Cache] = None,
609
+ output_attentions: bool = False,
610
+ use_cache: bool = False,
611
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
612
+ if output_attentions:
613
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
614
+ logger.warning_once(
615
+ "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
616
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
617
+ )
618
+ return super().forward(
619
+ hidden_states=hidden_states,
620
+ attention_mask=attention_mask,
621
+ position_ids=position_ids,
622
+ past_key_value=past_key_value,
623
+ output_attentions=output_attentions,
624
+ use_cache=use_cache,
625
+ )
626
+ bsz, q_len, _ = hidden_states.size()
627
+ kv_seq_len = hidden_states.shape[-2]
628
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_value
629
+ if past_key is not None:
630
+ past_seq_len = past_key.shape[2]
631
+ kv_seq_len += past_seq_len
632
+ else:
633
+ past_seq_len = 0
634
+
635
+ query_states, key_states, value_states = self.qkv_proj_with_ultragist(hidden_states, total_ultragist_size)
636
+
637
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
638
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
639
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
640
+
641
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
642
+
643
+ # return keys and values before rope
644
+ # NOTE: incrementally return keys and values for efficiency
645
+ if window_size > 0:
646
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
647
+
648
+ if past_key is not None:
649
+ # reuse k, v, self_attention
650
+ key_states = torch.cat([past_key, key_states], dim=2)
651
+ value_states = torch.cat([past_value, value_states], dim=2)
652
+
653
+ # NOTE: window_size == 0 indicates the ultragist mechanism is disabled and the model works as is, so the new past_key_values should be concatenated with the old ones
654
+ if window_size == 0:
655
+ past_key_value = (key_states, value_states, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
656
+
657
+ key_position_ids = position_ids
658
+ # align query position_ids with key
659
+ query_position_ids = key_position_ids[:, -q_len:]
660
+
661
+ key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
662
+ query_states = apply_rotary_pos_emb_single(query_states, cos, sin, query_position_ids)
663
+
664
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
665
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
666
+
667
+ if attention_mask is not None:
668
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
669
+ raise ValueError(
670
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
671
+ )
672
+
673
+ # if self.layer_idx == 0 and past_key is None:
674
+ # with open(f"attention_mask.txt", "w") as f:
675
+ # torch.set_printoptions(profile="full",linewidth=10000000,precision=1,sci_mode=False)
676
+ # a = attention_mask
677
+ # f.write(str(a))
678
+ # torch.set_printoptions(profile="default")
679
+ # with open(f"position_ids.txt", "w") as f:
680
+ # torch.set_printoptions(profile="full",linewidth=10000000,precision=1,sci_mode=False)
681
+ # a = position_ids
682
+ # f.write(str(a))
683
+ # torch.set_printoptions(profile="default")
684
+
685
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
686
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
687
+ if query_states.device.type == "cuda" and attention_mask is not None:
688
+ query_states = query_states.contiguous()
689
+ key_states = key_states.contiguous()
690
+ value_states = value_states.contiguous()
691
+
692
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
693
+ query_states,
694
+ key_states,
695
+ value_states,
696
+ attn_mask=attention_mask,
697
+ dropout_p=self.attention_dropout if self.training else 0.0,
698
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
699
+ is_causal=self.is_causal and attention_mask is None and q_len > 1,
700
+ )
701
+
702
+ # for debug
703
+ # if past_key is not None and past_key.shape[2] == 128 and self.layer_idx == 0:
704
+ # torch.save({
705
+ # "hidden": hidden_states,
706
+ # "query": query_states,
707
+ # "key": key_states,
708
+ # "value": value_states,
709
+ # "output": attn_output,
710
+ # "query_position_ids": query_position_ids,
711
+ # "key_position_ids": key_position_ids,
712
+ # }, "attn-output.pt")
713
+
714
+ attn_output = attn_output.transpose(1, 2).contiguous()
715
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
716
+ attn_output = self.o_proj_with_ultragist(attn_output, total_ultragist_size)
717
+
718
+ return attn_output, None, past_key_value
719
+
720
+
721
+ LLAMA_ATTENTION_CLASSES = {
722
+ "eager": LlamaAttention,
723
+ "sdpa": LlamaSdpaAttention,
724
+ }
725
+
726
+
727
+ class LlamaDecoderLayer(nn.Module):
728
+ def __init__(self, config: LlamaConfig, layer_idx: int):
729
+ super().__init__()
730
+ self.hidden_size = config.hidden_size
731
+
732
+ self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
733
+
734
+ self.mlp = LlamaMLP(config)
735
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
736
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
737
+
738
+ def forward(
739
+ self,
740
+ hidden_states: torch.Tensor,
741
+ attention_mask: Optional[torch.Tensor] = None,
742
+ position_ids: Optional[torch.LongTensor] = None,
743
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
744
+ output_attentions: Optional[bool] = False,
745
+ use_cache: Optional[bool] = False,
746
+ **kwargs,
747
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
748
+ """
749
+ Args:
750
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
751
+ attention_mask (`torch.FloatTensor`, *optional*):
752
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
753
+ query_sequence_length, key_sequence_length)` if default attention is used.
754
+ output_attentions (`bool`, *optional*):
755
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
756
+ returned tensors for more detail.
757
+ use_cache (`bool`, *optional*):
758
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
759
+ (see `past_key_values`).
760
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
761
+ """
762
+ if "padding_mask" in kwargs:
763
+ warnings.warn(
764
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
765
+ )
766
+
767
+ # NOTE: get ultragist_size in case the mlp is included in ultragist_param
768
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_value
769
+
770
+ residual = hidden_states
771
+
772
+ hidden_states = self.input_layernorm(hidden_states)
773
+
774
+ # Self Attention
775
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
776
+ hidden_states=hidden_states,
777
+ attention_mask=attention_mask,
778
+ position_ids=position_ids,
779
+ past_key_value=past_key_value,
780
+ output_attentions=output_attentions,
781
+ use_cache=use_cache,
782
+ **kwargs,
783
+ )
784
+ hidden_states = residual + hidden_states
785
+
786
+ # Fully Connected
787
+ residual = hidden_states
788
+ hidden_states = self.post_attention_layernorm(hidden_states)
789
+ hidden_states = self.mlp(hidden_states, total_ultragist_size)
790
+ hidden_states = residual + hidden_states
791
+
792
+ outputs = (hidden_states,)
793
+
794
+ if output_attentions:
795
+ outputs += (self_attn_weights,)
796
+
797
+ if use_cache:
798
+ outputs += (present_key_value,)
799
+
800
+ return outputs
801
+
802
+
803
+ LLAMA_START_DOCSTRING = r"""
804
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
805
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
806
+ etc.)
807
+
808
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
809
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
810
+ and behavior.
811
+
812
+ Parameters:
813
+ config ([`LlamaConfig`]):
814
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
815
+ load the weights associated with the model, only the configuration. Check out the
816
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
817
+ """
818
+
819
+
820
+ @add_start_docstrings(
821
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
822
+ LLAMA_START_DOCSTRING,
823
+ )
824
+ class LlamaPreTrainedModel(PreTrainedModel):
825
+ config_class = LlamaConfig
826
+ base_model_prefix = "model"
827
+ supports_gradient_checkpointing = True
828
+ _no_split_modules = ["LlamaDecoderLayer"]
829
+ _skip_keys_device_placement = "past_key_values"
830
+ _supports_sdpa = True
831
+ _supports_cache_class = True
832
+
833
+ def _init_weights(self, module):
834
+ std = self.config.initializer_range
835
+ if isinstance(module, nn.Linear):
836
+ module.weight.data.normal_(mean=0.0, std=std)
837
+ if module.bias is not None:
838
+ module.bias.data.zero_()
839
+ elif isinstance(module, nn.Embedding):
840
+ module.weight.data.normal_(mean=0.0, std=std)
841
+ if module.padding_idx is not None:
842
+ module.weight.data[module.padding_idx].zero_()
843
+
844
+
845
+ LLAMA_INPUTS_DOCSTRING = r"""
846
+ Args:
847
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
848
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
849
+ it.
850
+
851
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
852
+ [`PreTrainedTokenizer.__call__`] for details.
853
+
854
+ [What are input IDs?](../glossary#input-ids)
855
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
856
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
857
+
858
+ - 1 for tokens that are **not masked**,
859
+ - 0 for tokens that are **masked**.
860
+
861
+ [What are attention masks?](../glossary#attention-mask)
862
+
863
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
864
+ [`PreTrainedTokenizer.__call__`] for details.
865
+
866
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
867
+ `past_key_values`).
868
+
869
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
870
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
871
+ information on the default strategy.
872
+
873
+ - 1 indicates the head is **not masked**,
874
+ - 0 indicates the head is **masked**.
875
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
876
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
877
+ config.n_positions - 1]`.
878
+
879
+ [What are position IDs?](../glossary#position-ids)
880
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
881
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
882
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
883
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
884
+
885
+ Two formats are allowed:
886
+ - a [`~cache_utils.Cache`] instance;
887
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
888
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
889
+ cache format.
890
+
891
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
892
+ legacy cache format will be returned.
893
+
894
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
895
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
896
+ of shape `(batch_size, sequence_length)`.
897
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
898
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
899
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
900
+ model's internal embedding lookup matrix.
901
+ use_cache (`bool`, *optional*):
902
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
903
+ `past_key_values`).
904
+ output_attentions (`bool`, *optional*):
905
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
906
+ tensors for more detail.
907
+ output_hidden_states (`bool`, *optional*):
908
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
909
+ more detail.
910
+ return_dict (`bool`, *optional*):
911
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
912
+ """
913
+
914
+
915
+ @add_start_docstrings(
916
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
917
+ LLAMA_START_DOCSTRING,
918
+ )
919
+ class LlamaModel(LlamaPreTrainedModel):
920
+ """
921
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
922
+
923
+ Args:
924
+ config: LlamaConfig
925
+ """
926
+
927
+ def __init__(self, config: LlamaConfig):
928
+ super().__init__(config)
929
+ self.padding_idx = config.pad_token_id
930
+ self.vocab_size = config.vocab_size
931
+
932
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
933
+
934
+ # ultragist: add ultragist embedding
935
+ self.ultragist_embed_tokens = nn.Embedding(1, config.hidden_size, self.padding_idx)
936
+ self.ultragist_embed_tokens._is_hf_initialized = True
937
+
938
+ self.layers = nn.ModuleList(
939
+ [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
940
+ )
941
+ self._use_sdpa = config._attn_implementation == "sdpa"
942
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
943
+
944
+ self.gradient_checkpointing = False
945
+ # Initialize weights and apply final processing
946
+ self.post_init()
947
+
948
+ def _init_ultragist_embed(self, missing_keys):
949
+ """Initialize the ultragist token embedding with that of the eos token."""
950
+ if is_deepspeed_zero3_enabled():
951
+ import deepspeed
952
+ params = [self.ultragist_embed_tokens.weight, self.embed_tokens.weight]
953
+ with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
954
+ # deepspeed will initialize the parameters to zero
955
+ if (self.ultragist_embed_tokens.weight == 0).all():
956
+ if self.config.ultragist_embed_init == "bos":
957
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.bos_token_id]
958
+ elif self.config.ultragist_embed_init == "eos":
959
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.eos_token_id]
960
+ else:
961
+ raise NotImplementedError(f"Make sure ultragist_embed_init is either eos or bos, found {self.config.ultragist_embed_init}")
962
+ else:
963
+ if any("ultragist_embed_tokens" in missing_key for missing_key in missing_keys):
964
+ if self.config.ultragist_embed_init == "bos":
965
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.bos_token_id]
966
+ elif self.config.ultragist_embed_init == "eos":
967
+ self.ultragist_embed_tokens.weight.data[:] = self.embed_tokens.weight.data[self.config.eos_token_id]
968
+ else:
969
+ raise NotImplementedError(f"Make sure ultragist_embed_init is either eos or bos, found {self.config.ultragist_embed_init}")
970
+
971
+ def get_input_embeddings(self):
972
+ return self.embed_tokens
973
+
974
+ def set_input_embeddings(self, value):
975
+ self.embed_tokens = value
976
+
977
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
978
+ def forward(
979
+ self,
980
+ input_ids: torch.LongTensor = None,
981
+ attention_mask: Optional[torch.Tensor] = None,
982
+ position_ids: Optional[torch.LongTensor] = None,
983
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
984
+ inputs_embeds: Optional[torch.FloatTensor] = None,
985
+ use_cache: Optional[bool] = None,
986
+ output_attentions: Optional[bool] = None,
987
+ output_hidden_states: Optional[bool] = None,
988
+ return_dict: Optional[bool] = None,
989
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
990
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
991
+ output_hidden_states = (
992
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
993
+ )
994
+ # ultragist: always use cache
995
+ use_cache = True
996
+
997
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
998
+
999
+ # retrieve input_ids and inputs_embeds
1000
+ if input_ids is not None and inputs_embeds is not None:
1001
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1002
+ elif input_ids is not None:
1003
+ batch_size, seq_length = input_ids.shape[:2]
1004
+ elif inputs_embeds is not None:
1005
+ batch_size, seq_length = inputs_embeds.shape[:2]
1006
+ else:
1007
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1008
+
1009
+ # ultragist: create position_ids for all keys including past_keys
1010
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1011
+ seq_length_with_past = seq_length
1012
+ past_key_values_length = 0
1013
+ past_key, past_value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size = past_key_values[0]
1014
+
1015
+ if past_key is not None:
1016
+ past_key_values_length = past_key.shape[2]
1017
+ seq_length_with_past = seq_length_with_past + past_key_values_length
1018
+
1019
+ # ultragist: separately embed ordinal tokens and ultragist tokens because ordinal tokens do not receive gradients
1020
+ if total_ultragist_size > 0:
1021
+ ordinal_input_ids = input_ids[:, :-total_ultragist_size]
1022
+ ultragist_input_ids = input_ids[:, -total_ultragist_size:]
1023
+ ordinal_inputs_embeds = self.embed_tokens(ordinal_input_ids)
1024
+ # offset the ultragist token ids by vocab_size so they index the newly initialized ultragist embedding table
1025
+ ultragist_input_embeds = self.ultragist_embed_tokens(ultragist_input_ids - self.config.vocab_size)
1026
+ inputs_embeds = torch.cat([ordinal_inputs_embeds, ultragist_input_embeds], dim=1)
1027
+ else:
1028
+ inputs_embeds = self.embed_tokens(input_ids)
1029
+
1030
+ # when total_ultragist_size > 0, we need to modify attention mask
1031
+ if self._use_sdpa and not output_attentions and total_ultragist_size == 0:
1032
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
1033
+ # the manual implementation that requires a 4D causal mask in all cases.
1034
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1035
+ attention_mask,
1036
+ (batch_size, seq_length),
1037
+ inputs_embeds,
1038
+ past_key_values_length,
1039
+ )
1040
+ else:
1041
+ # 4d mask is passed through the layers
1042
+ attention_mask = _prepare_4d_causal_attention_mask(
1043
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
1044
+ )
1045
+
1046
+ position_ids = torch.arange(seq_length_with_past, dtype=torch.long, device=device).repeat(batch_size, 1)
1047
+
1048
+ # prepare attention mask and position ids for ultragists
1049
+ # NOTE: we must modify the position_ids here instead of inside the self_attn forward, otherwise the tensor version of position_ids becomes incompatible when gradient checkpointing is enabled
1050
+ if total_ultragist_size > 0:
1051
+ # number of tokens to condense by the ultragists
1052
+ condensing_size = window_size - raw_size_to_cache
1053
+ # number of tokens in current window (containing cached raw activations)
1054
+ window_size_with_ultragist = window_size + total_ultragist_size
1055
+ # number of ultragists in cache
1056
+ memory_size = seq_length_with_past - window_size_with_ultragist
1057
+ min_value = torch.finfo(inputs_embeds.dtype).min
1058
+
1059
+ ultragist_start_idx = -total_ultragist_size
1060
+
1061
+ # batch_size, head_num, window_size
1062
+ reference_attention_mask = attention_mask[..., -total_ultragist_size - 1, -window_size_with_ultragist: -total_ultragist_size]
1063
+
1064
+ for ultragist_size in ultragist_sizes:
1065
+ # in this case, the activations of ordinal tokens are used as ultragist activations
1066
+ if ultragist_size < 0:
1067
+ continue
1068
+
1069
+ token_per_ultragist = condensing_size // ultragist_size
1070
+
1071
+ # the end_idx may be -0, in that case, use max instead
1072
+ ultragist_end_idx = ultragist_start_idx + ultragist_size
1073
+ if ultragist_end_idx == 0:
1074
+ ultragist_end_idx = torch.iinfo(torch.long).max
1075
+
1076
+ if self.config.ultragist_attn == "step-expansion":
1077
+ # each ultragist can attend to one more sub-interval than its predecessor
1078
+
1079
+ # token_per_ultragist, 2 * token_per_ultragist, ..., ultragist_size * token_per_ultragist
1080
+ ultragist_arange = torch.arange(1, ultragist_size + 1, device=device) * token_per_ultragist
1081
+ # 0, 1, 2, ..., window_size - 1
1082
+ ordinal_arange = torch.arange(window_size, device=device)
1083
+ # ultragist_size, window_size
1084
+ valid_pos = ordinal_arange.expand(ultragist_size, window_size) < ultragist_arange.unsqueeze(-1)
1085
+ # ultragist_size, window_size
1086
+ ordinal_attention_mask = torch.where(valid_pos, 0, min_value)
1087
+ # NOTE: add reference attention_mask so that padding tokens are considered
1088
+ ordinal_attention_mask = ordinal_attention_mask[None, None, ...] + reference_attention_mask.unsqueeze(-2)
1089
+
1090
+ if self.config.ultragist_attend_prev:
1091
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).triu(1)
1092
+ # the ultragist token is next to the last ordinal token it attends to
1093
+ ultragist_position_ids = torch.arange(token_per_ultragist, token_per_ultragist * ultragist_size + 1, token_per_ultragist) + memory_size
1094
+ ultragist_position_ids = ultragist_position_ids + torch.arange(ultragist_size)
1095
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1096
+ else:
1097
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).fill_diagonal_(0)
1098
+ # the ultragist token is next to the last ordinal token it attends to
1099
+ ultragist_position_ids = torch.arange(token_per_ultragist, token_per_ultragist * ultragist_size + 1, token_per_ultragist) + memory_size
1100
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1101
+
1102
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -window_size_with_ultragist: -total_ultragist_size] = ordinal_attention_mask
1103
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, ultragist_start_idx: ultragist_end_idx] = ultragist_attention_mask
1104
+ # ultragists of different ratios are blind to others
1105
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -total_ultragist_size: ultragist_start_idx] = min_value
1106
+
1107
+ elif self.config.ultragist_attn == "segmentation":
1108
+ # each ultragist can attend to its corresponding sub-interval
1109
+
1110
+ # ultragist_size, token_per_ultragist
1111
+ indices = torch.arange(token_per_ultragist * ultragist_size, device=device).view(ultragist_size, -1)
1112
+ # ultragist_size, window_size
1113
+ ordinal_attention_mask = attention_mask.new_full((ultragist_size, window_size), min_value)
1114
+ ordinal_attention_mask.scatter_(dim=-1, index=indices, value=0)
1115
+
1116
+ # NOTE: add reference attention_mask so that padding tokens are considered
1117
+ ordinal_attention_mask = ordinal_attention_mask[None, None, ...] + reference_attention_mask.unsqueeze(-2)
1118
+
1119
+ if self.config.ultragist_attend_prev:
1120
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).triu(1)
1121
+ # the ultragist token is next to the last ordinal token it attends to
1122
+ ultragist_position_ids = position_ids.new_full((ultragist_size,), fill_value=token_per_ultragist + memory_size)
1123
+ ultragist_position_ids = ultragist_position_ids + torch.arange(ultragist_size, device=device)
1124
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1125
+ else:
1126
+ ultragist_attention_mask = attention_mask.new_full((ultragist_size, ultragist_size), min_value).fill_diagonal_(0)
1127
+ # the ultragist token is next to the last ordinal token it attends to
1128
+ ultragist_position_ids = position_ids.new_full((ultragist_size,), fill_value=token_per_ultragist + memory_size)
1129
+ position_ids[:, ultragist_start_idx: ultragist_end_idx] = ultragist_position_ids
1130
+
1131
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -window_size_with_ultragist: -total_ultragist_size] = ordinal_attention_mask
1132
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, ultragist_start_idx: ultragist_end_idx] = ultragist_attention_mask
1133
+ # ultragists of different ratios are blind to others
1134
+ attention_mask[..., ultragist_start_idx: ultragist_end_idx, -total_ultragist_size: ultragist_start_idx] = min_value
1135
+
1136
+ elif self.config.ultragist_attn == "full-coverage":
1137
+ pass
1138
+
1139
+ else:
1140
+ raise NotImplementedError
1141
+
1142
+ ultragist_start_idx = ultragist_end_idx
1143
+
1144
+ # print(f"total_ultragist_size: {total_ultragist_size}")
1145
+ # print(f"raw_size_to_cache: {raw_size_to_cache}")
1146
+ # print(f"position_ids: {position_ids}")
1147
+ # print(f"attention_mask:\n{attention_mask}")
1148
+ # x = input()
1149
+ # if x == "s":
1150
+ # return
1151
+
1152
+ # embed positions
1153
+ hidden_states = inputs_embeds
1154
+
1155
+ # decoder layers
1156
+ all_hidden_states = () if output_hidden_states else None
1157
+ all_self_attns = () if output_attentions else None
1158
+ # ultragist: still use tuple to organize cache
1159
+ next_decoder_cache = () if use_cache else None
1160
+
1161
+ for idx, decoder_layer in enumerate(self.layers):
1162
+ if output_hidden_states:
1163
+ all_hidden_states += (hidden_states,)
1164
+
1165
+ # ultragist: slice out the past_key_value of the corresponding layer
1166
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
1167
+
1168
+ if self.gradient_checkpointing and self.training:
1169
+ layer_outputs = self._gradient_checkpointing_func(
1170
+ decoder_layer.__call__,
1171
+ hidden_states,
1172
+ attention_mask,
1173
+ position_ids,
1174
+ past_key_value,
1175
+ output_attentions,
1176
+ use_cache,
1177
+ )
1178
+ else:
1179
+ layer_outputs = decoder_layer(
1180
+ hidden_states,
1181
+ attention_mask=attention_mask,
1182
+ position_ids=position_ids,
1183
+ past_key_value=past_key_value,
1184
+ output_attentions=output_attentions,
1185
+ use_cache=use_cache,
1186
+ )
1187
+
1188
+ hidden_states = layer_outputs[0]
1189
+
1190
+ if use_cache:
1191
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
1192
+
1193
+ if output_attentions:
1194
+ all_self_attns += (layer_outputs[1],)
1195
+
1196
+ hidden_states = self.norm(hidden_states)
1197
+
1198
+ # add hidden states from the last decoder layer
1199
+ if output_hidden_states:
1200
+ all_hidden_states += (hidden_states,)
1201
+
1202
+ next_cache = next_decoder_cache if use_cache else None
1203
+
1204
+ if not return_dict:
1205
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1206
+ return BaseModelOutputWithPast(
1207
+ last_hidden_state=hidden_states,
1208
+ past_key_values=next_cache,
1209
+ hidden_states=all_hidden_states,
1210
+ attentions=all_self_attns,
1211
+ )
1212
+
1213
+
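For intuition about the step-expansion branch in LlamaModel.forward above, a small numeric sketch of the ordinal part of the mask: ultragist i may attend to the first (i + 1) * token_per_ultragist ordinal tokens of the window. Toy sizes only; min_value stands in for the large negative additive mask value:

    import torch

    window_size, ultragist_size = 8, 4
    token_per_ultragist = window_size // ultragist_size   # here the whole window is condensed
    min_value = float("-inf")

    ultragist_arange = torch.arange(1, ultragist_size + 1) * token_per_ultragist   # 2, 4, 6, 8
    ordinal_arange = torch.arange(window_size)                                     # 0 .. 7
    valid_pos = ordinal_arange.expand(ultragist_size, window_size) < ultragist_arange.unsqueeze(-1)
    mask = torch.full((ultragist_size, window_size), min_value)
    mask[valid_pos] = 0.0
    print(mask)   # row i has zeros in its first (i + 1) * token_per_ultragist columns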
1214
+ class LlamaForCausalLM(LlamaPreTrainedModel):
1215
+ _tied_weights_keys = ["lm_head.weight"]
1216
+
1217
+ def __init__(self, config):
1218
+ super().__init__(config)
1219
+ self.model = LlamaModel(config)
1220
+ self.vocab_size = config.vocab_size
1221
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1222
+ # Initialize weights and apply final processing
1223
+ self.post_init()
1224
+
1225
+ def get_input_embeddings(self):
1226
+ return self.model.embed_tokens
1227
+
1228
+ def set_input_embeddings(self, value):
1229
+ self.model.embed_tokens = value
1230
+
1231
+ def get_output_embeddings(self):
1232
+ return self.lm_head
1233
+
1234
+ def set_output_embeddings(self, new_embeddings):
1235
+ self.lm_head = new_embeddings
1236
+
1237
+ def set_decoder(self, decoder):
1238
+ self.model = decoder
1239
+
1240
+ def get_decoder(self):
1241
+ return self.model
1242
+
1243
+ @classmethod
1244
+ def from_pretrained(cls, *args, **kwargs):
1245
+ """Override the default from_pretrained to extend vocab size according to ultragist_size."""
1246
+ kwargs.update(output_loading_info=True)
1247
+ model, loading_info = super().from_pretrained(*args, **kwargs)
1248
+
1249
+ # NOTE: set memory after from_pretrained because there may be another transformer model inside the Memory object, which may cause weird errors during loading
1250
+ config = model.config
1251
+ model.memory = Memory(
1252
+ model_config=config,
1253
+ k_seq_dim=2,
1254
+ v_seq_dim=2,
1255
+ )
1256
+
1257
+ missing_keys = loading_info["missing_keys"]
1258
+ # NOTE: the ultragist parameters may or may not be loaded from the checkpoint
1259
+ # if they are loaded from the checkpoint, we should not re-initialize them
1260
+ model.model._init_ultragist_embed(missing_keys)
1261
+ # initialize weights of possible q,k,v,o,mlp
1262
+ for layer in model.model.layers:
1263
+ layer.self_attn._init_ultragist_proj(missing_keys)
1264
+ layer.mlp._init_ultragist_proj(missing_keys)
1265
+
1266
+ return model
1267
+
1268
+ def _native_forward(
1269
+ self,
1270
+ input_ids: torch.LongTensor = None,
1271
+ attention_mask: Optional[torch.Tensor] = None,
1272
+ position_ids: Optional[torch.LongTensor] = None,
1273
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1274
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1275
+ labels: Optional[torch.LongTensor] = None,
1276
+ shift_labels: Optional[bool] = True,
1277
+ use_cache: Optional[bool] = None,
1278
+ output_attentions: Optional[bool] = None,
1279
+ output_hidden_states: Optional[bool] = None,
1280
+ return_dict: Optional[bool] = None,
1281
+ ) -> Union[Tuple, ModelOutput]:
1282
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1283
+ output_hidden_states = (
1284
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1285
+ )
1286
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1287
+
1288
+ # when we directly call _native_forward, the past_key_values would be None
1289
+ if past_key_values is None:
1290
+ # NOTE: set window size to 0, so that new past_key_values are returned properly, see LlamaAttention.forward
1291
+ past_key_values = [(None, None, [0], 0, 0, 0) for _ in range(self.config.num_hidden_layers)]
1292
+
1293
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1294
+ outputs = self.model(
1295
+ input_ids=input_ids,
1296
+ attention_mask=attention_mask,
1297
+ position_ids=position_ids,
1298
+ past_key_values=past_key_values,
1299
+ inputs_embeds=inputs_embeds,
1300
+ use_cache=use_cache,
1301
+ output_attentions=output_attentions,
1302
+ output_hidden_states=output_hidden_states,
1303
+ return_dict=return_dict,
1304
+ )
1305
+
1306
+ hidden_states = outputs[0]
1307
+ if self.config.pretraining_tp > 1:
1308
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1309
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
1310
+ logits = torch.cat(logits, dim=-1)
1311
+ else:
1312
+ logits = self.lm_head(hidden_states)
1313
+ logits = logits.float()
1314
+
1315
+ loss = None
1316
+ batch_loss = None
1317
+ valid_token_num = None
1318
+
1319
+ if labels is not None:
1320
+ loss, batch_loss, valid_token_num = compute_loss(logits, labels, shift=shift_labels)
1321
+
1322
+ if not return_dict:
1323
+ output = (logits,) + outputs[1:]
1324
+ return (loss,) + output if loss is not None else output
1325
+
1326
+ return ModelOutput(
1327
+ loss=loss,
1328
+ batch_loss=batch_loss,
1329
+ valid_token_num=valid_token_num,
1330
+ logits=logits,
1331
+ past_key_values=outputs.past_key_values,
1332
+ hidden_states=outputs.hidden_states,
1333
+ attentions=outputs.attentions,
1334
+ )
1335
+
1336
+ def _ultragist_forward(self,
1337
+ input_ids: torch.LongTensor = None,
1338
+ attention_mask: Optional[torch.Tensor] = None,
1339
+ position_ids: Optional[torch.LongTensor] = None,
1340
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1341
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1342
+ labels: Optional[torch.LongTensor] = None,
1343
+ use_cache: Optional[bool] = None,
1344
+ output_attentions: Optional[bool] = None,
1345
+ output_hidden_states: Optional[bool] = None,
1346
+ return_dict: Optional[bool] = None,
1347
+ ):
1348
+ # t1 = time.time()
1349
+ # initialize cache
1350
+ self.memory.prepare(
1351
+ input_ids=input_ids,
1352
+ attention_mask=attention_mask,
1353
+ labels=labels
1354
+ )
1355
+ # t2 = time.time()
1356
+ # print(f"{torch.distributed.get_rank()}: {input_ids.shape}")
1357
+
1358
+ # after the first window, one token at a time
1359
+ while not self.memory.finish:
1360
+ # for _ in range(2):
1361
+ # t3 = time.time()
1362
+
1363
+ input_ids, attention_mask, past_key_values, labels = self.memory.step()
1364
+
1365
+ # NOTE: the first window is encoded without ultragist parameters, so we skip it when computing loss
1366
+ if self.training and self.memory._step_idx == 1:
1367
+ labels[:] = -100
1368
+ # t4 = time.time()
1369
+
1370
+ outputs = self._native_forward(
1371
+ input_ids=input_ids,
1372
+ attention_mask=attention_mask,
1373
+ position_ids=position_ids,
1374
+ past_key_values=past_key_values,
1375
+ inputs_embeds=inputs_embeds,
1376
+ use_cache=use_cache,
1377
+ output_attentions=output_attentions,
1378
+ output_hidden_states=output_hidden_states,
1379
+ return_dict=return_dict,
1380
+ labels=labels,
1381
+ # NOTE: the labels have been shifted so that all tokens in the window have the proper loss
1382
+ shift_labels=False,
1383
+ )
1384
+ # t5 = time.time()
1385
+
1386
+ # update past_key_values
1387
+ self.memory.update_memory(outputs.past_key_values)
1388
+
1389
+ # t6 = time.time()
1390
+
1391
+ if labels is not None:
1392
+ # update loss
1393
+ self.memory.update_loss(outputs.batch_loss, outputs.valid_token_num)
1394
+
1395
+ # t7 = time.time()
1396
+
1397
+ # print(f"Loop step time: {t4-t3}")
1398
+ # print(f"Loop forward time: {t5-t4}")
1399
+ # print(f"Loop update time: {t6-t5}")
1400
+ # print(f"Loop loss time: {t7-t6}")
1401
+ # input()
1402
+
1403
+ # t8 = time.time()
1404
+
1405
+ # output loss, past_key_values, and perplexity
1406
+ outputs = self.memory.output(outputs)
1407
+
1408
+ # t9 = time.time()
1409
+ # print(f"Prepare time: {t2-t1}")
1410
+ # print(f"Output time: {t9-t8}")
1411
+ return outputs
1412
+
1413
+ def forward(self, **kwargs):
1414
+ """Forward computation over a batch of sequences.
1415
+ """
1416
+ # only allow gradient when training
1417
+ with optional_grad_ctx(with_grad=self.training):
1418
+ # we can disable ultragist to use the original llama
1419
+ if hasattr(self, "_enable_ultragist") and self._enable_ultragist == False:
1420
+ return self._native_forward(**kwargs)
1421
+ else:
1422
+ return self._ultragist_forward(**kwargs)
1423
+
1424
+ def prepare_inputs_for_generation(
1425
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1426
+ ):
1427
+ if past_key_values:
1428
+ input_ids = input_ids[:, -1:]
1429
+
1430
+ position_ids = kwargs.get("position_ids", None)
1431
+ if attention_mask is not None and position_ids is None:
1432
+ # create position_ids on the fly for batch generation
1433
+ position_ids = attention_mask.long().cumsum(-1) - 1
1434
+ position_ids.masked_fill_(attention_mask == 0, 1)
1435
+ if past_key_values:
1436
+ position_ids = position_ids[:, -1].unsqueeze(-1)
1437
+
1438
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1439
+ if inputs_embeds is not None and past_key_values is None:
1440
+ model_inputs = {"inputs_embeds": inputs_embeds}
1441
+ else:
1442
+ model_inputs = {"input_ids": input_ids}
1443
+
1444
+ model_inputs.update(
1445
+ {
1446
+ "position_ids": position_ids,
1447
+ "past_key_values": past_key_values,
1448
+ "use_cache": kwargs.get("use_cache"),
1449
+ "attention_mask": attention_mask,
1450
+ }
1451
+ )
1452
+ return model_inputs
1453
+
1454
+ @staticmethod
1455
+ def _reorder_cache(past_key_values, beam_idx):
1456
+ reordered_past = ()
1457
+ for layer_past in past_key_values:
1458
+ reordered_past += (
1459
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1460
+ )
1461
+ return reordered_past
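As a quick orientation for this diff, here is a minimal, hedged usage sketch of how the custom LlamaForCausalLM above would typically be loaded and run. The checkpoint path, dtype, and device are placeholder assumptions; memory.reset() and the _enable_ultragist switch come directly from the code in this file and from modeling_utils.py below.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "path/to/ultragist-llama2-7b-chat"  # placeholder: point this at the uploaded checkpoint

tokenizer = AutoTokenizer.from_pretrained(path)
# trust_remote_code=True lets transformers resolve auto_map to modeling_llama.LlamaForCausalLM
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16)
model = model.cuda().eval()

# the Memory object accumulates activations across windows, so clear it before each new sequence
model.memory.reset()

inputs = tokenizer("A very long document ...", return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**inputs)   # forward() loops over windows via _ultragist_forward

# set this flag to fall back to the plain Llama forward (_native_forward)
model._enable_ultragist = False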
modeling_ultragist.py ADDED
@@ -0,0 +1,711 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import torch.distributed as dist
5
+ from transformers.utils import logging
6
+ from transformers import AutoTokenizer
7
+ from itertools import cycle
8
+ from typing import List
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
+ class Memory(torch.nn.Module):
14
+ def __init__(
15
+ self,
16
+ model_config,
17
+ k_seq_dim:int=2,
18
+ v_seq_dim:int=2,
19
+ ):
20
+ """Setup necessary attributes."""
21
+ super().__init__()
22
+
23
+ self.model_config = model_config
24
+
25
+ # initialize necessary parameters
26
+ self.k_seq_dim = k_seq_dim
27
+ self.v_seq_dim = v_seq_dim
28
+ self.num_layers = model_config.num_hidden_layers
29
+ self.max_position_embeddings = model_config.max_position_embeddings
30
+ self.rng = np.random.default_rng(42)
31
+
32
+ self.ultragist_window = model_config.ultragist_window
33
+ self.ultragist_stride = model_config.ultragist_stride
34
+ self.ultragist_attn = model_config.ultragist_attn
35
+ self.ultragist_ratio = model_config.ultragist_ratio
36
+ self.ultragist_ratio_mix = model_config.ultragist_ratio_mix
37
+ self.ultragist_param = model_config.ultragist_param
38
+ self.ultragist_sink_size = model_config.ultragist_sink_size
39
+ self.ultragist_attend_prev = model_config.ultragist_attend_prev
40
+
41
+ self.ultragist_tokens = torch.zeros(1, dtype=torch.long) + model_config.vocab_size
42
+
43
+ self._post_validation()
44
+ self.reset()
45
+
46
+ def _post_validation(self, verbose=True):
47
+ assert self.ultragist_window >= self.ultragist_stride, f"Make sure the ultragist_window {self.ultragist_window} >= ultragist_stride {self.ultragist_stride}!"
48
+ for ratio in self.ultragist_ratio:
49
+ assert ratio >= 0, f"Make sure all ultragist ratios are greater than or equal to 0, found {self.ultragist_ratio}!"
50
+ assert self.ultragist_attn in ["segmentation", "step-expansion", "full-coverage"], f"ultragist_attn {self.ultragist_attn} not implemented!"
51
+ assert self.ultragist_ratio_mix in ["instance-random", "step-random", "sequence", "join"] or "adapt-" in self.ultragist_ratio_mix, f"ultragist_ratio_mix {self.ultragist_ratio_mix} not implemented!"
52
+ if self.ultragist_ratio_mix == "join":
53
+ # create another stream for moving gpu tensor to cpu
54
+ # self.stream = torch.cuda.Stream()
55
+ pass
56
+
57
+ self._cpu = torch.device("cpu")
58
+
59
+ if verbose:
60
+ info = f"applying ultragist on {self.ultragist_param} (the ultragist embedding is initialized from {'bos' if self.model_config.ultragist_embed_init == 'bos' else 'eos'} embedding), with window size {self.ultragist_window}, stride {self.ultragist_stride}, {self.ultragist_attn} attention{' (attending to previous ultragists)' if self.ultragist_attend_prev else ' (no attending to previous ultragists)'}, sink size {self.ultragist_sink_size}, condensing ratio {self.ultragist_ratio} (mixed by {self.ultragist_ratio_mix})..."
61
+ logger.info(info)
62
+
63
+ def set(self, verbose=True, **kwargs):
64
+ if "ultragist_ratio_mix" in kwargs and kwargs["ultragist_ratio_mix"] == "join" and self.ultragist_ratio_mix != "join":
65
+ raise ValueError(f"You cannot switch ultragist_ratio_mix from non-join strategy to join!")
66
+ if self.ultragist_ratio_mix == "join" and "ultragist_ratio" in kwargs and sorted(kwargs["ultragist_ratio"]) != sorted(self.ultragist_ratio):
67
+ raise ValueError(f"You cannot change ultragist_ratio given ultragist_ratio_mix=join!")
68
+ for k, v in kwargs.items():
69
+ setattr(self, k, v)
70
+ self._post_validation(verbose=verbose)
71
+
72
+ def reset(self):
73
+ """Initialize attributes for a new sequence."""
74
+ # the cursor pointing to the start of the current window
75
+ self._start_idx = 0
76
+ # the cursor pointing to the end of the current window
77
+ self._end_idx = 0
78
+ # the ultragist sizes of all strides
79
+ self._total_ultragist_sizes = []
80
+ # the ultragist ratios of all strides
81
+ self._main_ultragist_sizes = []
82
+ # the loss per batch
83
+ self._batch_loss = None
84
+ # the valid token number per batch
85
+ self._valid_token_num = None
86
+ # the step index for processing the input_ids
87
+ self._step_idx = 0
88
+
89
+ # used in set_compression_ratio
90
+ self._ratio = None
91
+ self._ultragist_ratio_iter = None
92
+
93
+ self.all_input_ids = torch.tensor([], dtype=torch.long)
94
+ self.all_attention_mask = torch.tensor([], dtype=torch.long)
95
+ if hasattr(self, "all_labels"):
96
+ del self.all_labels
97
+
98
+ # the raw activations of recent tokens
99
+ self.raw_activations = [(None, None) for _ in range(self.num_layers)]
100
+ # the attention sink activations
101
+ self.sink_activations = [(None, None) for _ in range(self.num_layers)]
102
+
103
+ # the ultragist activations
104
+ if self.ultragist_ratio_mix == "join":
105
+ self.l1_to_ln_ultragist_activations = [
106
+ [(None, None) for _ in range(self.num_layers)]
107
+ for _ in self.ultragist_ratio
108
+ ]
109
+ else:
110
+ self.l1_to_ln_ultragist_activations = [
111
+ [(None, None) for _ in range(self.num_layers)]
112
+ ]
113
+
114
+ def rewind(self, size=None, trim=False):
115
+ """
116
+ Rewind raw activations that have not been condensed yet.
117
+
118
+ Args:
119
+ trim: if true, the input_ids corresponding to the raw activations are trimmed.
120
+ """
121
+ raw_memory_size = self.get_memory_size()[1]
122
+ if size is None:
123
+ size = raw_memory_size
124
+ assert size <= raw_memory_size, f"Make sure the rewind size ({size}) is smaller or equal to the raw memory size ({raw_memory_size})!"
125
+
126
+ if size > 0:
127
+ self._end_idx -= size
128
+ for layer_idx, (key, value) in enumerate(self.raw_activations):
129
+ key = slice_tensor(key, end=-size, dim=self.k_seq_dim)
130
+ value = slice_tensor(value, end=-size, dim=self.v_seq_dim)
131
+ self.raw_activations[layer_idx] = (key, value)
132
+
133
+ if trim:
134
+ self.all_input_ids = self.all_input_ids[:, :-size]
135
+ self.all_attention_mask = self.all_attention_mask[:, :-size]
136
+ if hasattr(self, "all_labels"):
137
+ self.all_labels = self.all_labels[:, :-size]
138
+
139
+ @property
140
+ def finish(self):
141
+ is_finish = self._end_idx == self.all_sequence_length
142
+
143
+ # print(f"{dist.get_rank()} Finish: {self._end_idx}, {self.all_sequence_length}")
144
+ # if is_finish and hasattr(self, "stream"):
145
+ # self.stream.synchronize()
146
+ return is_finish
147
+
148
+ def get_memory_size(self):
149
+ ultragist_memory_size = 0
150
+ raw_memory_size = 0
151
+ sink_memory_size = 0
152
+ if self.l1_to_ln_ultragist_activations[0][0][0] is not None:
153
+ ultragist_memory_size += self.l1_to_ln_ultragist_activations[0][0][0].shape[self.k_seq_dim]
154
+ if self.raw_activations[0][0] is not None:
155
+ raw_memory_size += self.raw_activations[0][0].shape[self.k_seq_dim]
156
+ if self.sink_activations[0][0] is not None:
157
+ sink_memory_size += self.sink_activations[0][0].shape[self.k_seq_dim]
158
+ return ultragist_memory_size, raw_memory_size, sink_memory_size
159
+
160
+ def get_memory(self, ultragist_sizes=None, total_ultragist_size=None, raw_size_to_cache=None, window_size=None):
161
+ """
162
+ Get the compressed kv cache for generating next tokens.
163
+ """
164
+ past_key_values = []
165
+ for layer_idx in range(self.num_layers):
166
+ sink_key, sink_value = self.sink_activations[layer_idx]
167
+ ultragist_key, ultragist_value = self.l1_to_ln_ultragist_activations[0][layer_idx]
168
+ raw_key, raw_value = self.raw_activations[layer_idx]
169
+
170
+ key = cat_tensor([
171
+ sink_key, ultragist_key, raw_key,
172
+ ], dim=self.k_seq_dim)
173
+ value = cat_tensor([
174
+ sink_value, ultragist_value, raw_value,
175
+ ], dim=self.v_seq_dim)
176
+
177
+ if ultragist_sizes is not None:
178
+ layer_past_key_values = (key, value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size)
179
+ else:
180
+ layer_past_key_values = (key, value)
181
+
182
+ past_key_values.append(layer_past_key_values)
183
+ return past_key_values
184
+
185
+ def prepare(self, input_ids, attention_mask, labels):
186
+ """
187
+ Prepare inputs for the model. These inputs belong to the same sequence.
188
+ """
189
+ assert input_ids.shape[0] == 1, "Make sure the batch size is 1!"
190
+ assert attention_mask is None or (attention_mask == 1).all(), "Make sure there is no padding!"
191
+
192
+ if not hasattr(self, "_device"):
193
+ self._device = input_ids.device
194
+
195
+ # accumulate input_ids and attention_mask
196
+ self.all_input_ids = torch.cat([self.all_input_ids, input_ids.cpu()], dim=1)
197
+ if attention_mask is None:
198
+ attention_mask = torch.ones_like(input_ids)
199
+ self.all_attention_mask = torch.cat([self.all_attention_mask, attention_mask.cpu()], dim=1)
200
+ self.all_sequence_length = self.all_input_ids.shape[1]
201
+
202
+ if labels is not None:
203
+ # rotate labels in advance so that the loss of the last token is not ignored in every window
204
+ labels = torch.cat([labels[:, 1:].cpu(), torch.tensor([-100]).expand(labels.shape[0], 1)], dim=1)
205
+ if not hasattr(self, "all_labels"):
206
+ self.all_labels = labels
207
+ else:
208
+ self.all_labels = torch.cat([self.all_labels, labels], dim=1)
209
+ assert self.all_input_ids.shape[1] == self.all_labels.shape[1], f"Found inconsistent all_input_ids {self.all_input_ids.shape} and all_labels {self.all_labels.shape}!"
210
+
211
+ def set_compression_ratio(self, start_idx, end_idx):
212
+ """Choose a condensing ratio from self.ultragist_ratio"""
213
+ def filter_ratio(ratios, stride):
214
+ valid_ratios = []
215
+ for ratio in ratios:
216
+ # the stride must be at least as large as the condensing ratio because there must be at least one ultragist
217
+ if stride < ratio:
218
+ continue
219
+ # the stride must be evenly divisible by condensing ratio
220
+ if ratio > 0 and (stride % ratio) != 0:
221
+ continue
222
+ # when training, ratio=0 is valid if previous windows contain ultragist or later windows contain ultragist
223
+ if ratio == 0 and self.training:
224
+ previous_has_zero = -1 in self._main_ultragist_sizes
225
+ following_has_nonzero = (start_idx + stride + self.ultragist_window) <= self.all_sequence_length
226
+ if previous_has_zero or (not following_has_nonzero):
227
+ continue
228
+ valid_ratios.append(ratio)
229
+ assert len(valid_ratios), f"Cannot find valid condensing ratio (among {ratios}) for stride {stride}!"
230
+ return valid_ratios
231
+
232
+ def get_max_length(ratios):
233
+ max_lengths = []
234
+ for condensing_ratio in ratios:
235
+ if condensing_ratio > 0:
236
+ max_lengths.append((self.max_position_embeddings - self.ultragist_window) * condensing_ratio + self.ultragist_window)
237
+ else:
238
+ max_lengths.append(self.max_position_embeddings)
239
+ return max_lengths
240
+
241
+ if len(self.ultragist_ratio) == 1:
242
+ return [self.ultragist_ratio[0]]
243
+
244
+ ratio_mix = self.ultragist_ratio_mix
245
+
246
+ ultragist_ratio = filter_ratio(self.ultragist_ratio, self.ultragist_stride)
247
+
248
+ if ratio_mix == "instance-random":
249
+ if self._ratio is None:
250
+ ultragist_ratio = self.rng.choice(ultragist_ratio, size=1).tolist()
251
+ self._ratio = ultragist_ratio
252
+ else:
253
+ ultragist_ratio = self._ratio
254
+
255
+ elif ratio_mix == "step-random":
256
+ ultragist_ratio = self.rng.choice(ultragist_ratio, size=1).tolist()
257
+
258
+ elif ratio_mix == "sequence":
259
+ if self._ultragist_ratio_iter is None:
260
+ self._ultragist_ratio_iter = cycle(ultragist_ratio)
261
+ ultragist_ratio = [next(self._ultragist_ratio_iter)]
262
+
263
+ elif ratio_mix == "join":
264
+ ultragist_ratio = ultragist_ratio
265
+
266
+ elif "adapt" in ratio_mix:
267
+ if self._ratio is None:
268
+ future_length = int(ratio_mix.split("-")[1])
269
+ sequence_length = self.all_input_ids.shape[1] + future_length
270
+ max_lengths = get_max_length(ultragist_ratio)
271
+ # ascendingly sort the max lengths
272
+ valid_max_lengths_and_indices = [x for x in enumerate(max_lengths) if x[1] >= sequence_length]
273
+ if len(valid_max_lengths_and_indices):
274
+ minimum_length_index = min(valid_max_lengths_and_indices, key=lambda x: x[1])[0]
275
+ # use the minimal possible length for this sequence (the smallest fold ratio)
276
+ ultragist_ratio = [ultragist_ratio[minimum_length_index]]
277
+ else:
278
+ ultragist_ratio = [max(ultragist_ratio)]
279
+ # logger.warning(f"Failed to find valid fold window and size for sequence length {sequence_length}, as the maximum theoretical length is {max(max_lengths)}. Fall back to use the maximum one: {ultragist_ratio}.")
280
+ self._ratio = ultragist_ratio
281
+ else:
282
+ ultragist_ratio = self._ratio
283
+
284
+ return ultragist_ratio
285
+
286
+ def step(self):
287
+ """
288
+ Yield one window with the following logic:
289
+
290
+ The window size is L, the stride is S.
291
+ The window moves over S tokens at a time. The raw activations passed by the window are condensed according to a condensing_ratio.
292
+ The ultragists are added if and only if the raw activations fulfill the window.
293
+ In the future, we may switch window size to decrease cache size of raw activations.
294
+ """
295
+ # the starting position of the current window w.r.t. the start of the current input sequence
296
+ start_idx = self._start_idx
297
+ # the end position of the current window w.r.t. the start of the current input sequence
298
+ end_idx = start_idx + self.ultragist_window
299
+
300
+ # indicates if the current window is completely filled by raw activations and new tokens
301
+ # we only append ultragist tokens for full windows
302
+ if end_idx > self.all_sequence_length:
303
+ # the input is shorter than the initial window size
304
+ end_idx = self.all_sequence_length
305
+ is_full_window = False
306
+ else:
307
+ is_full_window = True
308
+
309
+ # NOTE: in training, the entire sequence is input to the model at once
310
+ # In the last window, we do not need to append ultragists because they will not be used at all
311
+ if self.training and end_idx == self.all_sequence_length:
312
+ is_full_window = False
313
+
314
+ # the real window size (remaining_size + new_token_size)
315
+ window_size = end_idx - start_idx
316
+
317
+ if is_full_window:
318
+ ultragist_stride = self.ultragist_stride
319
+ # a list of condensing ratios
320
+ compression_ratios = self.set_compression_ratio(start_idx=start_idx, end_idx=end_idx)
321
+
322
+ ultragist_sizes = []
323
+ for condensing_ratio in compression_ratios:
324
+ if condensing_ratio > 0:
325
+ # the stride must be evenly divisible by condensing_ratio
326
+ ultragist_sizes.append(ultragist_stride // condensing_ratio)
327
+ else:
328
+ # the raw activations are used as ultragist activations
329
+ ultragist_sizes.append(-1)
330
+ # forward start_idx and end_idx
331
+ next_start_idx = start_idx + ultragist_stride
332
+ # how many raw activations to save
333
+ raw_size_to_cache = end_idx - next_start_idx
334
+
335
+ else:
336
+ # no stride because the sequence has finished
337
+ next_start_idx = start_idx
338
+ # cache all recent raw activations to be used in the next window
339
+ raw_size_to_cache = window_size
340
+ ultragist_sizes = [0]
341
+ compression_ratios = [0]
342
+
343
+ total_ultragist_size = sum(s for s in ultragist_sizes if s >= 0)
344
+
345
+ past_key_values = self.get_memory(
346
+ ultragist_sizes,
347
+ total_ultragist_size,
348
+ raw_size_to_cache,
349
+ window_size
350
+ )
351
+
352
+ # streamingly add new input_ids
353
+ input_ids = self.all_input_ids[:, self._end_idx: end_idx].to(self._device)
354
+ attention_mask = self.all_attention_mask[:, self._end_idx: end_idx].to(self._device)
355
+ if hasattr(self, "all_labels"):
356
+ labels = self.all_labels[:, self._end_idx: end_idx].to(self._device)
357
+ else:
358
+ labels = None
359
+ batch_size = input_ids.shape[0]
360
+
361
+ # append ultragists if necessary
362
+ if is_full_window:
363
+ if total_ultragist_size > 0:
364
+ input_ids = torch.cat([input_ids, self.ultragist_tokens.expand(batch_size, total_ultragist_size).to(input_ids.device, dtype=input_ids.dtype)], dim=1)
365
+ # NOTE: append 1s to the attention_mask for the newly appended ultragist tokens (1s for the cached memory are prepended below)
366
+ attention_mask = torch.cat([attention_mask, attention_mask.new_ones(batch_size, total_ultragist_size)], dim=1)
367
+ if labels is not None:
368
+ labels = torch.cat([labels, labels.new_zeros(batch_size, total_ultragist_size) - 100], dim=1)
369
+
370
+ # prepend 1 to attention mask for previous memory
371
+ first_key = past_key_values[0][0]
372
+ memory_size = first_key.shape[self.k_seq_dim] if first_key is not None else 0
373
+ if memory_size > 0:
374
+ attention_mask = torch.cat([attention_mask.new_ones(batch_size, memory_size), attention_mask], dim=1)
375
+
376
+ # invoked in self.output()
377
+ self._total_ultragist_sizes.append(total_ultragist_size)
378
+ # invoked in self.set_compression_ratio
379
+ self._main_ultragist_sizes.append(ultragist_sizes[0])
380
+
381
+ # update the window cursors and the step counter
382
+ self._start_idx = next_start_idx
383
+ self._end_idx = end_idx
384
+ self._step_idx += 1
385
+
386
+ # print("****************************************")
387
+ # if is_full_window:
388
+ # print(f"stride: {ultragist_stride}")
389
+ # print(f"compression ratios: {compression_ratios}")
390
+ # print(f"ultragist_sizes: {ultragist_sizes}")
391
+ # print(f"input_ids: {input_ids.shape}")
392
+ # print(f"start_idx: {start_idx}")
393
+ # print(f"next_start_idx: {next_start_idx}")
394
+ # print(f"end_idx: {end_idx}")
395
+ # x = input()
396
+ # if x == "s":
397
+ # return
398
+
399
+ return input_ids, attention_mask, past_key_values, labels
400
+
401
+ def update_memory(self, past_key_values):
402
+ """
403
+ Accumulate ultragist activations and raw activations.
404
+ """
405
+ for layer_idx, (key, value, ultragist_sizes, total_ultragist_size, raw_size_to_cache, window_size) in enumerate(past_key_values):
406
+ # NOTE: the past_key_values are incrementally returned (only the new keys and values are returned)
407
+
408
+ # key/value: (num_layer, 2, batch_size, num_head, new_seq_len, head_dim)
409
+ # ultragist_size: how many ultragist activations are in key and value
410
+ # raw_size_to_cache: how many raw activations should be kept
411
+
412
+ previous_raw_key, previous_raw_value = self.raw_activations[layer_idx]
413
+
414
+ if self._step_idx == 1:
415
+ # save the sink activations
416
+ # NOTE: we do not slice the key/value activations, which may cause duplication when ultragist_ratio=-1 for the first window, but it's okay
417
+ self.sink_activations[layer_idx] = [
418
+ slice_tensor(key, end=self.ultragist_sink_size, dim=self.k_seq_dim),
419
+ slice_tensor(value, end=self.ultragist_sink_size, dim=self.v_seq_dim),
420
+ ]
421
+
422
+ if ultragist_sizes == [0]:
423
+ # this means the current input does not fulfill a window
424
+ # thus, the key and value are all raw activations, and we accumulate them until the window is fulfilled
425
+ assert raw_size_to_cache == window_size
426
+ raw_key = cat_tensor([
427
+ previous_raw_key,
428
+ key
429
+ ], dim=self.k_seq_dim)
430
+ raw_value = cat_tensor([
431
+ previous_raw_value,
432
+ value
433
+ ], dim=self.v_seq_dim)
434
+ self.raw_activations[layer_idx] = (raw_key, raw_value)
435
+
436
+ else:
437
+ for ultragist_size_idx, ultragist_size in enumerate(ultragist_sizes):
438
+ # NOTE: use the correct previous_ultragist_key and value!
439
+ previous_ultragist_key, previous_ultragist_value = self.l1_to_ln_ultragist_activations[ultragist_size_idx][layer_idx]
440
+
441
+ # if ultragist_size_idx == 0:
442
+ # ctx_manager = nullcontext()
443
+ # else:
444
+ # ctx_manager = torch.cuda.stream(self.stream)
445
+ # FIXME: only the first iteration works...
446
+ # with ctx_manager:
447
+
448
+ ultragist_key, ultragist_value, raw_key, raw_value = self._extract_ultragist_and_raw_memory(key, value, previous_ultragist_key, previous_ultragist_value, previous_raw_key, previous_raw_value, raw_size_to_cache, total_ultragist_size, ultragist_sizes, ultragist_size_idx)
449
+
450
+ self.l1_to_ln_ultragist_activations[ultragist_size_idx][layer_idx] = (ultragist_key, ultragist_value)
451
+ if ultragist_size_idx == 0:
452
+ self.raw_activations[layer_idx] = (raw_key, raw_value)
453
+
454
+ # if ultragist_size_idx != 0:
455
+ # print(self.stream.query())
456
+
457
+ def update_loss(self, batch_loss, valid_token_num):
458
+ """
459
+ Accumulate loss for later perplexity computation and the backward pass.
460
+ """
461
+ # print(f"process {dist.get_rank()}: valid_token_num: {valid_token_num}; loss {batch_loss}")
462
+ if self._batch_loss is None:
463
+ # NOTE: multiply valid_token_num because batch_loss is divided by it in advance
464
+ self._batch_loss = batch_loss * valid_token_num
465
+ self._valid_token_num = valid_token_num
466
+ else:
467
+ # NOTE: avoid in-place operations, otherwise there will be gradient errors in training
468
+ self._batch_loss = self._batch_loss + batch_loss * valid_token_num
469
+ self._valid_token_num = self._valid_token_num + valid_token_num
470
+
471
+ def output(self, model_outputs):
472
+ """
473
+ Override loss with accumulated loss.
474
+ """
475
+ # override loss
476
+ if self._batch_loss is not None:
477
+ # here the batch_loss is the summation of all token losses in each element
478
+ loss = self._batch_loss.sum() / self._valid_token_num.sum()
479
+
480
+ # NOTE: prevent nan
481
+ batch_loss = self._batch_loss / self._valid_token_num
482
+ if (self._valid_token_num == 0).any():
483
+ batch_loss = batch_loss.masked_fill(self._valid_token_num == 0, 0.)
484
+
485
+ # NOTE: we must use dict to override values, otherwise trainer cannot find loss
486
+ model_outputs["loss"] = loss
487
+ model_outputs["batch_loss"] = batch_loss
488
+ model_outputs["valid_token_num"] = self._valid_token_num
489
+
490
+ # override last_hidden_states (used in generation)
491
+ ultragist_size = self._total_ultragist_sizes[-1]
492
+ # remove logits corresponding to ultragist tokens
493
+ if ultragist_size > 0:
494
+ model_outputs["logits"] = model_outputs["logits"][:, :-ultragist_size]
495
+
496
+ return model_outputs
497
+
498
+ def _extract_ultragist_and_raw_memory(self, key, value, previous_ultragist_key, previous_ultragist_value, previous_raw_key, previous_raw_value, raw_size_to_cache, total_ultragist_size, ultragist_sizes, ultragist_size_idx):
499
+ """Extract ultragist and raw memory from the returned key and value. The raw memory is computed only if the ultragist_size_idx == 0."""
500
+ ultragist_size = ultragist_sizes[ultragist_size_idx]
501
+ # NOTE: ignore -1
502
+ previous_ultragist_size = sum(x for x in ultragist_sizes[:ultragist_size_idx] if x > 0)
503
+
504
+ if previous_ultragist_key is not None:
505
+ target_device = previous_ultragist_key.device
506
+ else:
507
+ if ultragist_size_idx == 0:
508
+ target_device = self._device
509
+ else:
510
+ target_device = self._cpu
511
+
512
+ if ultragist_size == -1:
513
+ actual_ultragist_size = self.ultragist_window - raw_size_to_cache
514
+
515
+ # the raw activations are used as ultragist activations
516
+ concat_raw_key = cat_tensor([
517
+ previous_raw_key,
518
+ key
519
+ ], dim=self.k_seq_dim)
520
+ concat_raw_value = cat_tensor([
521
+ previous_raw_value,
522
+ value
523
+ ], dim=self.v_seq_dim)
524
+
525
+ ultragist_key = cat_tensor([
526
+ previous_ultragist_key,
527
+ slice_tensor(concat_raw_key, end=actual_ultragist_size, dim=self.k_seq_dim).to(target_device, non_blocking=True)
528
+ ], dim=self.k_seq_dim)
529
+ ultragist_value = cat_tensor([
530
+ previous_ultragist_value,
531
+ slice_tensor(concat_raw_value, end=actual_ultragist_size, dim=self.v_seq_dim).to(target_device, non_blocking=True)
532
+ ], dim=self.v_seq_dim)
533
+
534
+ if ultragist_size_idx == 0:
535
+ raw_key = slice_tensor(concat_raw_key, start=actual_ultragist_size, end=self.ultragist_window, dim=self.k_seq_dim)
536
+ raw_value = slice_tensor(concat_raw_value, start=actual_ultragist_size, end=self.ultragist_window, dim=self.v_seq_dim)
537
+
538
+ else:
539
+ # [-ultragist_size:] activations are from ultragists, need to be accumulated
540
+ # [-raw_cache_size-ultragist_size:-ultragist_size] raw activations will be cached; if they are shorter than raw_cache_size, part of the previous raw activations will also be kept
541
+
542
+ ultragist_start_idx = - total_ultragist_size + previous_ultragist_size
543
+ ultragist_end_idx = ultragist_start_idx + ultragist_size
544
+
545
+ # NOTE: avoid end=0 for slicing
546
+ if ultragist_end_idx == 0:
547
+ ultragist_end_idx = None
548
+
549
+ ultragist_key = cat_tensor([
550
+ previous_ultragist_key,
551
+ slice_tensor(key, start=ultragist_start_idx, end=ultragist_end_idx, dim=self.k_seq_dim).to(target_device, non_blocking=True)
552
+ ], dim=self.k_seq_dim)
553
+ ultragist_value = cat_tensor([
554
+ previous_ultragist_value,
555
+ slice_tensor(value, start=ultragist_start_idx, end=ultragist_end_idx, dim=self.v_seq_dim).to(target_device, non_blocking=True)
556
+ ], dim=self.v_seq_dim)
557
+
558
+ # the raw activations are only updated once
559
+ if ultragist_size_idx == 0:
560
+ if key.shape[self.k_seq_dim] < raw_size_to_cache + ultragist_size:
561
+ concat_raw_key = cat_tensor([
562
+ previous_raw_key,
563
+ key
564
+ ], dim=self.k_seq_dim)
565
+ concat_raw_value = cat_tensor([
566
+ previous_raw_value,
567
+ value
568
+ ], dim=self.v_seq_dim)
569
+ raw_key = slice_tensor(concat_raw_key, start=self.ultragist_window - raw_size_to_cache, end=self.ultragist_window, dim=self.k_seq_dim)
570
+ raw_value = slice_tensor(concat_raw_value, start=self.ultragist_window - raw_size_to_cache, end=self.ultragist_window, dim=self.v_seq_dim)
571
+ else:
572
+ # becomes None when raw_size_to_cache = 0
573
+ raw_key = slice_tensor(key, start=ultragist_start_idx - raw_size_to_cache, end=ultragist_start_idx, dim=self.k_seq_dim)
574
+ raw_value = slice_tensor(value, start=ultragist_start_idx - raw_size_to_cache, end=ultragist_start_idx, dim=self.v_seq_dim)
575
+
576
+ if ultragist_size_idx == 0:
577
+ return ultragist_key, ultragist_value, raw_key, raw_value
578
+ else:
579
+ # NOTE: only l1 ultragist activations are kept on GPU
580
+ return ultragist_key.detach().to(target_device, non_blocking=True), ultragist_value.detach().to(target_device, non_blocking=True), None, None
581
+ # return ultragist_key, ultragist_value, None, None
582
+
583
+
584
+ def slice_tensor(x, start=None, end=None, dim=2):
585
+ if x is None:
586
+ return None
587
+ if end == 0:
588
+ return None
589
+ if start == x.shape[dim]:
590
+ return None
591
+ if start == end:
592
+ return None
593
+ if dim == 2:
594
+ if start is None and end is not None:
595
+ return x[:, :, :end, ...]
596
+ elif start is not None and end is None:
597
+ return x[:, :, start:, ...]
598
+ elif start is not None and end is not None:
599
+ return x[:, :, start:end, ...]
600
+ elif dim == 1:
601
+ if start is None and end is not None:
602
+ return x[:, :end, ...]
603
+ elif start is not None and end is None:
604
+ return x[:, start:, ...]
605
+ elif start is not None and end is not None:
606
+ return x[:, start:end, ...]
607
+ else:
608
+ raise NotImplementedError
609
+
610
+ def cat_tensor(list_of_tensors, dim=-1):
611
+ list_of_tensors = [t for t in list_of_tensors if t is not None]
612
+ if len(list_of_tensors) > 1:
613
+ result = torch.cat(list_of_tensors, dim=dim)
614
+ elif len(list_of_tensors) == 1:
615
+ result = list_of_tensors[0]
616
+ else:
617
+ result = None
618
+ return result
619
+
620
+ def slice_activations(activations, start=None, end=None, k_seq_dim=2, v_seq_dim=2):
621
+ new_activations = []
622
+ for key, value in activations:
623
+ new_key = slice_tensor(key, start=start, end=end, dim=k_seq_dim)
624
+ new_value = slice_tensor(value, start=start, end=end, dim=v_seq_dim)
625
+ new_activations.append([new_key, new_value])
626
+ return new_activations
627
+
628
+ def cat_activations(list_of_activations, k_seq_dim=2, v_seq_dim=2):
629
+ assert all(len(x) == len(list_of_activations[0]) for x in list_of_activations), f"Make sure all activations have the same number of layers! Found {[len(x) for x in list_of_activations]}."
630
+
631
+ new_activations = []
632
+ for layer_idx in range(len(list_of_activations[0])):
633
+ keys = [x[layer_idx][0] for x in list_of_activations]
634
+ values = [x[layer_idx][1] for x in list_of_activations]
635
+
636
+ new_key = cat_tensor(keys, dim=k_seq_dim)
637
+ new_value = cat_tensor(values, dim=v_seq_dim)
638
+ new_activations.append([new_key, new_value])
639
+ return new_activations
640
+
641
+ def interleave_activations(main_activations, augment_activations, main_spans, augment_spans, k_seq_dim=2, v_seq_dim=2, device=torch.device("cuda")):
642
+ """ Interleave main_activations and augment_activations according to main_span and augment_span.
643
+
644
+ Args:
645
+ main_span: a list of tuples (start_idx, end_idx); when both start_idx and end_idx are None, the augment_activations are plugged in at that slot.
646
+ augment_span: a list of tuples (start_idx, end_idx)
647
+ """
648
+ assert len(main_activations) == len(augment_activations) , f"Make sure main and augment activations have the same number of layers! Found {len(main_activations)} and {len(augment_activations)}!"
649
+ assert sum(x[0] is None and x[1] is None for x in main_spans) == len(augment_spans), f"Make sure the number of slots for augmentation (start_idx=None and end_idx=None in main_spans) matches the number of augmentations. Found {sum(x[0] is None and x[1] is None for x in main_spans)} slots but {len(augment_spans)} augmentations!"
650
+
651
+ new_activations = []
652
+ for layer_idx in range(len(main_activations)):
653
+ main_key, main_value = main_activations[layer_idx]
654
+ augment_key, augment_value = augment_activations[layer_idx]
655
+
656
+ sliced_keys = []
657
+ sliced_values = []
658
+
659
+ augment_idx = 0
660
+ for start, end in main_spans:
661
+ if start is None and end is None:
662
+ # this means the augment key/value should be plugged in
663
+ augment_start, augment_end = augment_spans[augment_idx]
664
+ sliced_key = slice_tensor(
665
+ augment_key,
666
+ start=augment_start,
667
+ end=augment_end,
668
+ dim=k_seq_dim
669
+ ).to(device)
670
+ sliced_value = slice_tensor(
671
+ augment_value,
672
+ start=augment_start,
673
+ end=augment_end,
674
+ dim=v_seq_dim
675
+ ).to(device)
676
+
677
+ else:
678
+ sliced_key = slice_tensor(
679
+ main_key,
680
+ start=start,
681
+ end=end,
682
+ dim=k_seq_dim
683
+ )
684
+ sliced_value = slice_tensor(
685
+ main_value,
686
+ start=start,
687
+ end=end,
688
+ dim=v_seq_dim
689
+ )
690
+
691
+ sliced_keys.append(sliced_key)
692
+ sliced_values.append(sliced_value)
693
+
694
+ new_key = cat_tensor(sliced_keys, dim=k_seq_dim)
695
+ new_value = cat_tensor(sliced_values, dim=v_seq_dim)
696
+ new_activations.append([new_key, new_value])
697
+
698
+ return new_activations
699
+
700
+ def softmax(x:np.ndarray, axis=-1, temperature=1):
701
+ if isinstance(x, list):
702
+ x = np.array(x)
703
+ x = x / temperature
704
+ x = x - x.max(axis=axis, keepdims=True)
705
+ y = np.exp(x)
706
+ return y / y.sum(axis=axis, keepdims=True)
707
+
708
+ def l1_norm(x):
709
+ sum_x = sum(x)
710
+ x = [y/sum_x for y in x]
711
+ return x
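The bookkeeping in step() and get_max_length() above is easiest to see with concrete numbers. The sketch below is illustrative only; the window, stride, and ratio values are assumptions chosen for readability, not values read from this checkpoint.

# illustrative arithmetic for Memory.step() / get_max_length(); hypothetical settings
window, stride, max_position_embeddings = 1024, 1024, 4096

for ratio in (2, 4, 8):
    # a full window advances by `stride` raw tokens, which are condensed into
    # stride // ratio ultragist tokens (the stride must be divisible by the ratio)
    ultragist_size = stride // ratio
    # theoretical maximum sequence length for this ratio, as in get_max_length():
    # everything beyond the local window is held as condensed ultragist activations
    max_length = (max_position_embeddings - window) * ratio + window
    print(f"ratio={ratio:2d}: {ultragist_size:4d} ultragists per stride, max length {max_length}")

# expected output:
# ratio= 2:  512 ultragists per stride, max length 7168
# ratio= 4:  256 ultragists per stride, max length 13312
# ratio= 8:  128 ultragists per stride, max length 25600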
modeling_utils.py ADDED
@@ -0,0 +1,215 @@
1
+ import math
2
+ import torch
3
+ from tqdm import tqdm
4
+ from dataclasses import dataclass
5
+ from contextlib import nullcontext
6
+ from typing import Mapping, Optional, Tuple
7
+ from accelerate import Accelerator
8
+ from collections import defaultdict
9
+ from transformers.modeling_outputs import BaseModelOutputWithPast
10
+
11
+
12
+ def optional_grad_ctx(with_grad=False):
13
+ if with_grad:
14
+ return nullcontext()
15
+ else:
16
+ return torch.no_grad()
17
+
18
+ def move_to_device(data, device):
19
+ """
20
+ Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
21
+ """
22
+ if isinstance(data, Mapping):
23
+ return type(data)({k: move_to_device(v, device) for k, v in data.items()})
24
+ elif isinstance(data, (tuple, list)):
25
+ return type(data)(move_to_device(v, device) for v in data)
26
+ elif isinstance(data, torch.Tensor):
27
+ kwargs = {"device": device}
28
+ return data.to(**kwargs)
29
+ else:
30
+ return data
31
+
32
+ def compute_loss(logits, labels, shift=False):
33
+ """
34
+ Returns:
35
+ token_loss: batch_size, seq_length
36
+ """
37
+ if shift:
38
+ logits = logits[:, :-1, :].contiguous()
39
+ labels = labels[:, 1:].contiguous()
40
+
41
+ labels = labels.to(logits.device)
42
+ batch_size = logits.shape[0]
43
+
44
+ # NOTE: the loss on -100 labels is 0 by default
45
+ token_loss = torch.nn.functional.cross_entropy(
46
+ logits.flatten(0, 1),
47
+ labels.reshape(-1),
48
+ reduction="none"
49
+ ).reshape(batch_size, -1) # batch_size, seq_len
50
+
51
+ valid_token_num = (labels != -100).sum(-1) # batch_size
52
+ all_valid_token_num = valid_token_num.sum()
53
+
54
+ if all_valid_token_num > 0:
55
+ loss = token_loss.sum() / valid_token_num.sum()
56
+ else:
57
+ loss = token_loss.sum()
58
+
59
+ batch_loss = token_loss.sum(-1) / valid_token_num
60
+ # prevent nan
61
+ if (valid_token_num == 0).any():
62
+ batch_loss = batch_loss.masked_fill(valid_token_num == 0, 0.)
63
+
64
+ return loss, batch_loss, valid_token_num
65
+
66
+
67
+ @torch.no_grad()
68
+ def evaluate_perplexity(model, dataloader, accelerator:Optional[Accelerator]=None):
69
+ if accelerator is not None and type(dataloader) == torch.utils.data.DataLoader:
70
+ # if the dataloader has been prepared, we shall not prepare it twice, especially in case of deepspeed
71
+ dataloader = accelerator.prepare(dataloader)
72
+
73
+ # if accelerator.process_index == 0:
74
+ # for name, x in model.named_parameters():
75
+ # print(f"{name: ^80} {x.dtype}")
76
+
77
+ all_loss = defaultdict(list)
78
+ for i, x in enumerate(tqdm(dataloader, desc="Computing Perplexity")):
79
+ # NOTE: important to reset memory for every batch
80
+ if hasattr(model, "memory"):
81
+ model.memory.reset()
82
+
83
+ # the seq id
84
+ index = x.pop("index")
85
+ # length is used to group training data, no use here
86
+ length = x.pop("length", None)
87
+
88
+ output = model(**x)
89
+
90
+ # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
91
+ if hasattr(output, "batch_loss"):
92
+ # output from our model has batch_loss by default
93
+ batch_loss = output.batch_loss
94
+ valid_token_num = output.valid_token_num
95
+ else:
96
+ # output from other models does not
97
+ loss, batch_loss, valid_token_num = compute_loss(output.logits, x["labels"], shift=True)
98
+
99
+ if accelerator is not None and accelerator.num_processes > 1:
100
+ # num_device * batch_size
101
+ index = accelerator.gather_for_metrics(index)
102
+ batch_loss = accelerator.gather_for_metrics(batch_loss)
103
+ valid_token_num = accelerator.gather_for_metrics(valid_token_num)
104
+
105
+ for _id, _loss, _num in zip(index.tolist(), batch_loss.tolist(), valid_token_num.tolist()):
106
+ # loss times num is the total loss of all valid tokens
107
+ all_loss[_id].append((_loss * _num, _num))
108
+
109
+ all_loss = dict(all_loss)
110
+ for _id, loss_and_num in all_loss.items():
111
+ # sum up the loss for all valid tokens in the entire sequence, and divide the number of valid tokens
112
+ all_loss[_id] = sum([x[0] for x in loss_and_num]) / sum(x[1] for x in loss_and_num)
113
+
114
+ # average across sequences, then take exp to get perplexity
115
+ perplexity = math.exp(sum(all_loss.values()) / len(all_loss))
116
+ return perplexity
117
+
118
+
119
+ @torch.no_grad()
120
+ def evaluate_generation(model, dataloader, accelerator:Optional[Accelerator]=None, tokenizer=None, return_new_tokens_only=True, return_decoded=True, **generation_config):
121
+ if accelerator is not None and type(dataloader) == torch.utils.data.DataLoader:
122
+ # if the dataloader has been prepared, we shall not prepare it twice, especially in case of deepspeed
123
+ dataloader = accelerator.prepare(dataloader)
124
+
125
+ all_indices = []
126
+ all_outputs = []
127
+
128
+ for i, x in enumerate(tqdm(dataloader, desc="Computing Generation")):
129
+ # if i > 3:
130
+ # break
131
+
132
+ # NOTE: important to reset memory for every batch
133
+ if hasattr(model, "memory"):
134
+ model.memory.reset()
135
+
136
+ indices = x.pop("index")
137
+ # length is used to group training data, no use here
138
+ length = x.pop("length", None)
139
+
140
+ outputs = model.generate(**x, **generation_config)
141
+ if return_new_tokens_only:
142
+ start_idx = x["input_ids"].shape[1]
143
+ outputs = outputs[:, start_idx:]
144
+
145
+ if accelerator is not None and accelerator.num_processes > 1:
146
+ # must be contiguous
147
+ outputs = accelerator.pad_across_processes(outputs.contiguous(), pad_index=tokenizer.pad_token_id, dim=1)
148
+ outputs = accelerator.gather_for_metrics(outputs)
149
+ indices = accelerator.gather_for_metrics(indices)
150
+
151
+ outputs = outputs.tolist()
152
+ indices = indices.tolist()
153
+ if return_decoded:
154
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
155
+ all_indices.extend(indices)
156
+ all_outputs.extend(outputs)
157
+
158
+ return all_indices, all_outputs
159
+
160
+
161
+ @torch.no_grad()
162
+ def evaluate_nll(model, dataloader, accelerator:Optional[Accelerator]=None):
163
+ if accelerator is not None and type(dataloader) == torch.utils.data.DataLoader:
164
+ # if the dataloader has been prepared, we shall not prepare it twice, especially in case of deepspeed
165
+ dataloader = accelerator.prepare(dataloader)
166
+
167
+ # if accelerator.process_index == 0:
168
+ # for name, x in model.named_parameters():
169
+ # print(f"{name: ^80} {x.dtype}")
170
+
171
+ all_loss = defaultdict(list)
172
+ for i, x in enumerate(tqdm(dataloader, desc="Computing Perplexity")):
173
+ # NOTE: important to reset memory for every batch
174
+ if hasattr(model, "memory"):
175
+ model.memory.reset()
176
+
177
+ # the seq id
178
+ index = x.pop("index")
179
+ # length is used to group training data, no use here
180
+ length = x.pop("length", None)
181
+
182
+ output = model(**x)
183
+
184
+ # NOTE: we need the loss for each element in the batch for accurate computation, because the number of valid tokens may differ among elements
185
+ if hasattr(output, "batch_loss"):
186
+ # output from our model has batch_loss by default
187
+ batch_loss = output.batch_loss
188
+ valid_token_num = output.valid_token_num
189
+ else:
190
+ # output from other models does not
191
+ loss, batch_loss, valid_token_num = compute_loss(output.logits, x["labels"], shift=True)
192
+
193
+ if accelerator is not None and accelerator.num_processes > 1:
194
+ # num_device * batch_size
195
+ index = accelerator.gather_for_metrics(index)
196
+ batch_loss = accelerator.gather_for_metrics(batch_loss)
197
+ valid_token_num = accelerator.gather_for_metrics(valid_token_num)
198
+
199
+ for _id, _loss in zip(index.tolist(), batch_loss.tolist()):
200
+ # store the per-sequence average loss of this chunk
201
+ all_loss[_id].append(_loss)
202
+
203
+ return all_loss
204
+
205
+
206
+
207
+ @dataclass
208
+ class ModelOutput(BaseModelOutputWithPast):
209
+ loss: Optional[torch.FloatTensor] = None
210
+ batch_loss: Optional[torch.FloatTensor] = None
211
+ valid_token_num: Optional[torch.LongTensor] = None
212
+ logits: torch.FloatTensor = None
213
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
214
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
215
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
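To make the two-stage averaging in evaluate_perplexity above concrete, here is a toy sketch with invented numbers: chunk-level mean losses for the same sequence id are re-weighted by their valid-token counts, reduced to one average loss per sequence, and only then averaged across sequences and exponentiated.

import math
from collections import defaultdict

# (sequence id, mean loss over this chunk, number of valid tokens in this chunk) -- made-up numbers
chunks = [(0, 2.0, 100), (0, 3.0, 50), (1, 2.5, 80)]

all_loss = defaultdict(list)
for _id, _loss, _num in chunks:
    # loss times num recovers the total loss over the chunk's valid tokens
    all_loss[_id].append((_loss * _num, _num))

for _id, loss_and_num in all_loss.items():
    # token-weighted average per sequence
    all_loss[_id] = sum(x[0] for x in loss_and_num) / sum(x[1] for x in loss_and_num)

# average across sequences, then exponentiate
perplexity = math.exp(sum(all_loss.values()) / len(all_loss))
print(round(perplexity, 2))  # sequence 0 -> ~2.33, sequence 1 -> 2.5, exp(2.42) is roughly 11.2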
nqa.json ADDED
@@ -0,0 +1 @@
1
+ {"input": "How long had Mortimer Trefinnis' sister been dead when the doctor examined the body?", "context": "Produced by David Brannan. HTML version by Al Haines.\n\n\n\n\n\n\n\n\n\nThe Adventure of the Devil's Foot\n\n\nBy\n\nSir Arthur Conan Doyle\n\n\n\n\nIn recording from time to time some of the curious experiences and\ninteresting recollections which I associate with my long and intimate\nfriendship with Mr. Sherlock Holmes, I have continually been faced by\ndifficulties caused by his own aversion to publicity. To his sombre\nand cynical spirit all popular applause was always abhorrent, and\nnothing amused him more at the end of a successful case than to hand\nover the actual exposure to some orthodox official, and to listen with\na mocking smile to the general chorus of misplaced congratulation. It\nwas indeed this attitude upon the part of my friend and certainly not\nany lack of interesting material which has caused me of late years to\nlay very few of my records before the public. My participation in some\nof his adventures was always a privilege which entailed discretion and\nreticence upon me.\n\nIt was, then, with considerable surprise that I received a telegram\nfrom Holmes last Tuesday--he has never been known to write where a\ntelegram would serve--in the following terms:\n\nWhy not tell them of the Cornish horror--strangest case I have handled.\n\nI have no idea what backward sweep of memory had brought the matter\nfresh to his mind, or what freak had caused him to desire that I should\nrecount it; but I hasten, before another cancelling telegram may\narrive, to hunt out the notes which give me the exact details of the\ncase and to lay the narrative before my readers.\n\nIt was, then, in the spring of the year 1897 that Holmes's iron\nconstitution showed some symptoms of giving way in the face of constant\nhard work of a most exacting kind, aggravated, perhaps, by occasional\nindiscretions of his own. In March of that year Dr. Moore Agar, of\nHarley Street, whose dramatic introduction to Holmes I may some day\nrecount, gave positive injunctions that the famous private agent lay\naside all his cases and surrender himself to complete rest if he wished\nto avert an absolute breakdown. The state of his health was not a\nmatter in which he himself took the faintest interest, for his mental\ndetachment was absolute, but he was induced at last, on the threat of\nbeing permanently disqualified from work, to give himself a complete\nchange of scene and air. Thus it was that in the early spring of that\nyear we found ourselves together in a small cottage near Poldhu Bay, at\nthe further extremity of the Cornish peninsula.\n\nIt was a singular spot, and one peculiarly well suited to the grim\nhumour of my patient. From the windows of our little whitewashed\nhouse, which stood high upon a grassy headland, we looked down upon the\nwhole sinister semicircle of Mounts Bay, that old death trap of sailing\nvessels, with its fringe of black cliffs and surge-swept reefs on which\ninnumerable seamen have met their end. With a northerly breeze it lies\nplacid and sheltered, inviting the storm-tossed craft to tack into it\nfor rest and protection.\n\nThen come the sudden swirl round of the wind, the blistering gale from\nthe south-west, the dragging anchor, the lee shore, and the last battle\nin the creaming breakers. The wise mariner stands far out from that\nevil place.\n\nOn the land side our surroundings were as sombre as on the sea. 
It was\na country of rolling moors, lonely and dun-colored, with an occasional\nchurch tower to mark the site of some old-world village. In every\ndirection upon these moors there were traces of some vanished race\nwhich had passed utterly away, and left as its sole record strange\nmonuments of stone, irregular mounds which contained the burned ashes\nof the dead, and curious earthworks which hinted at prehistoric strife.\nThe glamour and mystery of the place, with its sinister atmosphere of\nforgotten nations, appealed to the imagination of my friend, and he\nspent much of his time in long walks and solitary meditations upon the\nmoor. The ancient Cornish language had also arrested his attention, and\nhe had, I remember, conceived the idea that it was akin to the\nChaldean, and had been largely derived from the Phoenician traders in\ntin. He had received a consignment of books upon philology and was\nsettling down to develop this thesis when suddenly, to my sorrow and to\nhis unfeigned delight, we found ourselves, even in that land of dreams,\nplunged into a problem at our very doors which was more intense, more\nengrossing, and infinitely more mysterious than any of those which had\ndriven us from London. Our simple life and peaceful, healthy routine\nwere violently interrupted, and we were precipitated into the midst of\na series of events which caused the utmost excitement not only in\nCornwall but throughout the whole west of England. Many of my readers\nmay retain some recollection of what was called at the time \"The\nCornish Horror,\" though a most imperfect account of the matter reached\nthe London press. Now, after thirteen years, I will give the true\ndetails of this inconceivable affair to the public.\n\nI have said that scattered towers marked the villages which dotted this\npart of Cornwall. The nearest of these was the hamlet of Tredannick\nWollas, where the cottages of a couple of hundred inhabitants clustered\nround an ancient, moss-grown church. The vicar of the parish, Mr.\nRoundhay, was something of an archaeologist, and as such Holmes had\nmade his acquaintance. He was a middle-aged man, portly and affable,\nwith a considerable fund of local lore. At his invitation we had taken\ntea at the vicarage and had come to know, also, Mr. Mortimer Tregennis,\nan independent gentleman, who increased the clergyman's scanty\nresources by taking rooms in his large, straggling house. The vicar,\nbeing a bachelor, was glad to come to such an arrangement, though he\nhad little in common with his lodger, who was a thin, dark, spectacled\nman, with a stoop which gave the impression of actual, physical\ndeformity. I remember that during our short visit we found the vicar\ngarrulous, but his lodger strangely reticent, a sad-faced,\nintrospective man, sitting with averted eyes, brooding apparently upon\nhis own affairs.\n\nThese were the two men who entered abruptly into our little\nsitting-room on Tuesday, March the 16th, shortly after our breakfast\nhour, as we were smoking together, preparatory to our daily excursion\nupon the moors.\n\n\"Mr. Holmes,\" said the vicar in an agitated voice, \"the most\nextraordinary and tragic affair has occurred during the night. It is\nthe most unheard-of business. 
We can only regard it as a special\nProvidence that you should chance to be here at the time, for in all\nEngland you are the one man we need.\"\n\nI glared at the intrusive vicar with no very friendly eyes; but Holmes\ntook his pipe from his lips and sat up in his chair like an old hound\nwho hears the view-halloa. He waved his hand to the sofa, and our\npalpitating visitor with his agitated companion sat side by side upon\nit. Mr. Mortimer Tregennis was more self-contained than the clergyman,\nbut the twitching of his thin hands and the brightness of his dark eyes\nshowed that they shared a common emotion.\n\n\"Shall I speak or you?\" he asked of the vicar.\n\n\"Well, as you seem to have made the discovery, whatever it may be, and\nthe vicar to have had it second-hand, perhaps you had better do the\nspeaking,\" said Holmes.\n\nI glanced at the hastily clad clergyman, with the formally dressed\nlodger seated beside him, and was amused at the surprise which Holmes's\nsimple deduction had brought to their faces.\n\n\"Perhaps I had best say a few words first,\" said the vicar, \"and then\nyou can judge if you will listen to the details from Mr. Tregennis, or\nwhether we should not hasten at once to the scene of this mysterious\naffair. I may explain, then, that our friend here spent last evening\nin the company of his two brothers, Owen and George, and of his sister\nBrenda, at their house of Tredannick Wartha, which is near the old\nstone cross upon the moor. He left them shortly after ten o'clock,\nplaying cards round the dining-room table, in excellent health and\nspirits. This morning, being an early riser, he walked in that\ndirection before breakfast and was overtaken by the carriage of Dr.\nRichards, who explained that he had just been sent for on a most urgent\ncall to Tredannick Wartha. Mr. Mortimer Tregennis naturally went with\nhim. When he arrived at Tredannick Wartha he found an extraordinary\nstate of things. His two brothers and his sister were seated round the\ntable exactly as he had left them, the cards still spread in front of\nthem and the candles burned down to their sockets. The sister lay back\nstone-dead in her chair, while the two brothers sat on each side of her\nlaughing, shouting, and singing, the senses stricken clean out of them.\nAll three of them, the dead woman and the two demented men, retained\nupon their faces an expression of the utmost horror--a convulsion of\nterror which was dreadful to look upon. There was no sign of the\npresence of anyone in the house, except Mrs. Porter, the old cook and\nhousekeeper, who declared that she had slept deeply and heard no sound\nduring the night. Nothing had been stolen or disarranged, and there is\nabsolutely no explanation of what the horror can be which has\nfrightened a woman to death and two strong men out of their senses.\nThere is the situation, Mr. Holmes, in a nutshell, and if you can help\nus to clear it up you will have done a great work.\"\n\nI had hoped that in some way I could coax my companion back into the\nquiet which had been the object of our journey; but one glance at his\nintense face and contracted eyebrows told me how vain was now the\nexpectation. He sat for some little time in silence, absorbed in the\nstrange drama which had broken in upon our peace.\n\n\"I will look into this matter,\" he said at last. \"On the face of it,\nit would appear to be a case of a very exceptional nature. Have you\nbeen there yourself, Mr. Roundhay?\"\n\n\"No, Mr. Holmes. Mr. 
Tregennis brought back the account to the\nvicarage, and I at once hurried over with him to consult you.\"\n\n\"How far is it to the house where this singular tragedy occurred?\"\n\n\"About a mile inland.\"\n\n\"Then we shall walk over together. But before we start I must ask you\na few questions, Mr. Mortimer Tregennis.\"\n\nThe other had been silent all this time, but I had observed that his\nmore controlled excitement was even greater than the obtrusive emotion\nof the clergyman. He sat with a pale, drawn face, his anxious gaze\nfixed upon Holmes, and his thin hands clasped convulsively together.\nHis pale lips quivered as he listened to the dreadful experience which\nhad befallen his family, and his dark eyes seemed to reflect something\nof the horror of the scene.\n\n\"Ask what you like, Mr. Holmes,\" said he eagerly. \"It is a bad thing\nto speak of, but I will answer you the truth.\"\n\n\"Tell me about last night.\"\n\n\"Well, Mr. Holmes, I supped there, as the vicar has said, and my elder\nbrother George proposed a game of whist afterwards. We sat down about\nnine o'clock. It was a quarter-past ten when I moved to go. I left\nthem all round the table, as merry as could be.\"\n\n\"Who let you out?\"\n\n\"Mrs. Porter had gone to bed, so I let myself out. I shut the hall\ndoor behind me. The window of the room in which they sat was closed,\nbut the blind was not drawn down. There was no change in door or\nwindow this morning, or any reason to think that any stranger had been\nto the house. Yet there they sat, driven clean mad with terror, and\nBrenda lying dead of fright, with her head hanging over the arm of the\nchair. I'll never get the sight of that room out of my mind so long as\nI live.\"\n\n\"The facts, as you state them, are certainly most remarkable,\" said\nHolmes. \"I take it that you have no theory yourself which can in any\nway account for them?\"\n\n\"It's devilish, Mr. Holmes, devilish!\" cried Mortimer Tregennis. \"It is\nnot of this world. Something has come into that room which has dashed\nthe light of reason from their minds. What human contrivance could do\nthat?\"\n\n\"I fear,\" said Holmes, \"that if the matter is beyond humanity it is\ncertainly beyond me. Yet we must exhaust all natural explanations\nbefore we fall back upon such a theory as this. As to yourself, Mr.\nTregennis, I take it you were divided in some way from your family,\nsince they lived together and you had rooms apart?\"\n\n\"That is so, Mr. Holmes, though the matter is past and done with. We\nwere a family of tin-miners at Redruth, but we sold our venture to a\ncompany, and so retired with enough to keep us. I won't deny that\nthere was some feeling about the division of the money and it stood\nbetween us for a time, but it was all forgiven and forgotten, and we\nwere the best of friends together.\"\n\n\"Looking back at the evening which you spent together, does anything\nstand out in your memory as throwing any possible light upon the\ntragedy? Think carefully, Mr. Tregennis, for any clue which can help\nme.\"\n\n\"There is nothing at all, sir.\"\n\n\"Your people were in their usual spirits?\"\n\n\"Never better.\"\n\n\"Were they nervous people? Did they ever show any apprehension of\ncoming danger?\"\n\n\"Nothing of the kind.\"\n\n\"You have nothing to add then, which could assist me?\"\n\nMortimer Tregennis considered earnestly for a moment.\n\n\"There is one thing occurs to me,\" said he at last. 
\"As we sat at the\ntable my back was to the window, and my brother George, he being my\npartner at cards, was facing it. I saw him once look hard over my\nshoulder, so I turned round and looked also. The blind was up and the\nwindow shut, but I could just make out the bushes on the lawn, and it\nseemed to me for a moment that I saw something moving among them. I\ncouldn't even say if it was man or animal, but I just thought there was\nsomething there. When I asked him what he was looking at, he told me\nthat he had the same feeling. That is all that I can say.\"\n\n\"Did you not investigate?\"\n\n\"No; the matter passed as unimportant.\"\n\n\"You left them, then, without any premonition of evil?\"\n\n\"None at all.\"\n\n\"I am not clear how you came to hear the news so early this morning.\"\n\n\"I am an early riser and generally take a walk before breakfast. This\nmorning I had hardly started when the doctor in his carriage overtook\nme. He told me that old Mrs. Porter had sent a boy down with an urgent\nmessage. I sprang in beside him and we drove on. When we got there we\nlooked into that dreadful room. The candles and the fire must have\nburned out hours before, and they had been sitting there in the dark\nuntil dawn had broken. The doctor said Brenda must have been dead at\nleast six hours. There were no signs of violence. She just lay across\nthe arm of the chair with that look on her face. George and Owen were\nsinging snatches of songs and gibbering like two great apes. Oh, it\nwas awful to see! I couldn't stand it, and the doctor was as white as\na sheet. Indeed, he fell into a chair in a sort of faint, and we\nnearly had him on our hands as well.\"\n\n\"Remarkable--most remarkable!\" said Holmes, rising and taking his hat.\n\"I think, perhaps, we had better go down to Tredannick Wartha without\nfurther delay. I confess that I have seldom known a case which at\nfirst sight presented a more singular problem.\"\n\n\nOur proceedings of that first morning did little to advance the\ninvestigation. It was marked, however, at the outset by an incident\nwhich left the most sinister impression upon my mind. The approach to\nthe spot at which the tragedy occurred is down a narrow, winding,\ncountry lane. While we made our way along it we heard the rattle of a\ncarriage coming towards us and stood aside to let it pass. As it drove\nby us I caught a glimpse through the closed window of a horribly\ncontorted, grinning face glaring out at us. Those staring eyes and\ngnashing teeth flashed past us like a dreadful vision.\n\n\"My brothers!\" cried Mortimer Tregennis, white to his lips. \"They are\ntaking them to Helston.\"\n\nWe looked with horror after the black carriage, lumbering upon its way.\nThen we turned our steps towards this ill-omened house in which they\nhad met their strange fate.\n\nIt was a large and bright dwelling, rather a villa than a cottage, with\na considerable garden which was already, in that Cornish air, well\nfilled with spring flowers. Towards this garden the window of the\nsitting-room fronted, and from it, according to Mortimer Tregennis,\nmust have come that thing of evil which had by sheer horror in a single\ninstant blasted their minds. Holmes walked slowly and thoughtfully\namong the flower-plots and along the path before we entered the porch.\nSo absorbed was he in his thoughts, I remember, that he stumbled over\nthe watering-pot, upset its contents, and deluged both our feet and the\ngarden path. 
Inside the house we were met by the elderly Cornish\nhousekeeper, Mrs. Porter, who, with the aid of a young girl, looked\nafter the wants of the family. She readily answered all Holmes's\nquestions. She had heard nothing in the night. Her employers had all\nbeen in excellent spirits lately, and she had never known them more\ncheerful and prosperous. She had fainted with horror upon entering the\nroom in the morning and seeing that dreadful company round the table.\nShe had, when she recovered, thrown open the window to let the morning\nair in, and had run down to the lane, whence she sent a farm-lad for\nthe doctor. The lady was on her bed upstairs if we cared to see her.\nIt took four strong men to get the brothers into the asylum carriage.\nShe would not herself stay in the house another day and was starting\nthat very afternoon to rejoin her family at St. Ives.\n\nWe ascended the stairs and viewed the body. Miss Brenda Tregennis had\nbeen a very beautiful girl, though now verging upon middle age. Her\ndark, clear-cut face was handsome, even in death, but there still\nlingered upon it something of that convulsion of horror which had been\nher last human emotion. From her bedroom we descended to the\nsitting-room, where this strange tragedy had actually occurred. The\ncharred ashes of the overnight fire lay in the grate. On the table\nwere the four guttered and burned-out candles, with the cards scattered\nover its surface. The chairs had been moved back against the walls,\nbut all else was as it had been the night before. Holmes paced with\nlight, swift steps about the room; he sat in the various chairs,\ndrawing them up and reconstructing their positions. He tested how much\nof the garden was visible; he examined the floor, the ceiling, and the\nfireplace; but never once did I see that sudden brightening of his eyes\nand tightening of his lips which would have told me that he saw some\ngleam of light in this utter darkness.\n\n\"Why a fire?\" he asked once. \"Had they always a fire in this small\nroom on a spring evening?\"\n\nMortimer Tregennis explained that the night was cold and damp. For that\nreason, after his arrival, the fire was lit. \"What are you going to do\nnow, Mr. Holmes?\" he asked.\n\nMy friend smiled and laid his hand upon my arm. \"I think, Watson, that\nI shall resume that course of tobacco-poisoning which you have so often\nand so justly condemned,\" said he. \"With your permission, gentlemen,\nwe will now return to our cottage, for I am not aware that any new\nfactor is likely to come to our notice here. I will turn the facts\nover in my mind, Mr. Tregennis, and should anything occur to me I will\ncertainly communicate with you and the vicar. In the meantime I wish\nyou both good-morning.\"\n\nIt was not until long after we were back in Poldhu Cottage that Holmes\nbroke his complete and absorbed silence. He sat coiled in his\narmchair, his haggard and ascetic face hardly visible amid the blue\nswirl of his tobacco smoke, his black brows drawn down, his forehead\ncontracted, his eyes vacant and far away. Finally he laid down his\npipe and sprang to his feet.\n\n\"It won't do, Watson!\" said he with a laugh. \"Let us walk along the\ncliffs together and search for flint arrows. We are more likely to\nfind them than clues to this problem. To let the brain work without\nsufficient material is like racing an engine. It racks itself to\npieces. 
The sea air, sunshine, and patience, Watson--all else will\ncome.\n\n\"Now, let us calmly define our position, Watson,\" he continued as we\nskirted the cliffs together. \"Let us get a firm grip of the very\nlittle which we DO know, so that when fresh facts arise we may be ready\nto fit them into their places. I take it, in the first place, that\nneither of us is prepared to admit diabolical intrusions into the\naffairs of men. Let us begin by ruling that entirely out of our minds.\nVery good. There remain three persons who have been grievously\nstricken by some conscious or unconscious human agency. That is firm\nground. Now, when did this occur? Evidently, assuming his narrative\nto be true, it was immediately after Mr. Mortimer Tregennis had left\nthe room. That is a very important point. The presumption is that it\nwas within a few minutes afterwards. The cards still lay upon the\ntable. It was already past their usual hour for bed. Yet they had not\nchanged their position or pushed back their chairs. I repeat, then,\nthat the occurrence was immediately after his departure, and not later\nthan eleven o'clock last night.\n\n\"Our next obvious step is to check, so far as we can, the movements of\nMortimer Tregennis after he left the room. In this there is no\ndifficulty, and they seem to be above suspicion. Knowing my methods as\nyou do, you were, of course, conscious of the somewhat clumsy water-pot\nexpedient by which I obtained a clearer impress of his foot than might\notherwise have been possible. The wet, sandy path took it admirably.\nLast night was also wet, you will remember, and it was not\ndifficult--having obtained a sample print--to pick out his track among\nothers and to follow his movements. He appears to have walked away\nswiftly in the direction of the vicarage.\n\n\"If, then, Mortimer Tregennis disappeared from the scene, and yet some\noutside person affected the card-players, how can we reconstruct that\nperson, and how was such an impression of horror conveyed? Mrs. Porter\nmay be eliminated. She is evidently harmless. Is there any evidence\nthat someone crept up to the garden window and in some manner produced\nso terrific an effect that he drove those who saw it out of their\nsenses? The only suggestion in this direction comes from Mortimer\nTregennis himself, who says that his brother spoke about some movement\nin the garden. That is certainly remarkable, as the night was rainy,\ncloudy, and dark. Anyone who had the design to alarm these people\nwould be compelled to place his very face against the glass before he\ncould be seen. There is a three-foot flower-border outside this\nwindow, but no indication of a footmark. It is difficult to imagine,\nthen, how an outsider could have made so terrible an impression upon\nthe company, nor have we found any possible motive for so strange and\nelaborate an attempt. You perceive our difficulties, Watson?\"\n\n\"They are only too clear,\" I answered with conviction.\n\n\"And yet, with a little more material, we may prove that they are not\ninsurmountable,\" said Holmes. 
\"I fancy that among your extensive\narchives, Watson, you may find some which were nearly as obscure.\nMeanwhile, we shall put the case aside until more accurate data are\navailable, and devote the rest of our morning to the pursuit of\nneolithic man.\"\n\nI may have commented upon my friend's power of mental detachment, but\nnever have I wondered at it more than upon that spring morning in\nCornwall when for two hours he discoursed upon celts, arrowheads, and\nshards, as lightly as if no sinister mystery were waiting for his\nsolution. It was not until we had returned in the afternoon to our\ncottage that we found a visitor awaiting us, who soon brought our minds\nback to the matter in hand. Neither of us needed to be told who that\nvisitor was. The huge body, the craggy and deeply seamed face with the\nfierce eyes and hawk-like nose, the grizzled hair which nearly brushed\nour cottage ceiling, the beard--golden at the fringes and white near\nthe lips, save for the nicotine stain from his perpetual cigar--all\nthese were as well known in London as in Africa, and could only be\nassociated with the tremendous personality of Dr. Leon Sterndale, the\ngreat lion-hunter and explorer.\n\nWe had heard of his presence in the district and had once or twice\ncaught sight of his tall figure upon the moorland paths. He made no\nadvances to us, however, nor would we have dreamed of doing so to him,\nas it was well known that it was his love of seclusion which caused him\nto spend the greater part of the intervals between his journeys in a\nsmall bungalow buried in the lonely wood of Beauchamp Arriance. Here,\namid his books and his maps, he lived an absolutely lonely life,\nattending to his own simple wants and paying little apparent heed to\nthe affairs of his neighbours. It was a surprise to me, therefore, to\nhear him asking Holmes in an eager voice whether he had made any\nadvance in his reconstruction of this mysterious episode. \"The county\npolice are utterly at fault,\" said he, \"but perhaps your wider\nexperience has suggested some conceivable explanation. My only claim\nto being taken into your confidence is that during my many residences\nhere I have come to know this family of Tregennis very well--indeed,\nupon my Cornish mother's side I could call them cousins--and their\nstrange fate has naturally been a great shock to me. I may tell you\nthat I had got as far as Plymouth upon my way to Africa, but the news\nreached me this morning, and I came straight back again to help in the\ninquiry.\"\n\nHolmes raised his eyebrows.\n\n\"Did you lose your boat through it?\"\n\n\"I will take the next.\"\n\n\"Dear me! that is friendship indeed.\"\n\n\"I tell you they were relatives.\"\n\n\"Quite so--cousins of your mother. Was your baggage aboard the ship?\"\n\n\"Some of it, but the main part at the hotel.\"\n\n\"I see. But surely this event could not have found its way into the\nPlymouth morning papers.\"\n\n\"No, sir; I had a telegram.\"\n\n\"Might I ask from whom?\"\n\nA shadow passed over the gaunt face of the explorer.\n\n\"You are very inquisitive, Mr. Holmes.\"\n\n\"It is my business.\"\n\nWith an effort Dr. Sterndale recovered his ruffled composure.\n\n\"I have no objection to telling you,\" he said. \"It was Mr. Roundhay,\nthe vicar, who sent me the telegram which recalled me.\"\n\n\"Thank you,\" said Holmes. \"I may say in answer to your original\nquestion that I have not cleared my mind entirely on the subject of\nthis case, but that I have every hope of reaching some conclusion. 
It\nwould be premature to say more.\"\n\n\"Perhaps you would not mind telling me if your suspicions point in any\nparticular direction?\"\n\n\"No, I can hardly answer that.\"\n\n\"Then I have wasted my time and need not prolong my visit.\" The famous\ndoctor strode out of our cottage in considerable ill-humour, and within\nfive minutes Holmes had followed him. I saw him no more until the\nevening, when he returned with a slow step and haggard face which\nassured me that he had made no great progress with his investigation.\nHe glanced at a telegram which awaited him and threw it into the grate.\n\n\"From the Plymouth hotel, Watson,\" he said. \"I learned the name of it\nfrom the vicar, and I wired to make certain that Dr. Leon Sterndale's\naccount was true. It appears that he did indeed spend last night\nthere, and that he has actually allowed some of his baggage to go on to\nAfrica, while he returned to be present at this investigation. What do\nyou make of that, Watson?\"\n\n\"He is deeply interested.\"\n\n\"Deeply interested--yes. There is a thread here which we had not yet\ngrasped and which might lead us through the tangle. Cheer up, Watson,\nfor I am very sure that our material has not yet all come to hand.\nWhen it does we may soon leave our difficulties behind us.\"\n\nLittle did I think how soon the words of Holmes would be realized, or\nhow strange and sinister would be that new development which opened up\nan entirely fresh line of investigation. I was shaving at my window in\nthe morning when I heard the rattle of hoofs and, looking up, saw a\ndog-cart coming at a gallop down the road. It pulled up at our door,\nand our friend, the vicar, sprang from it and rushed up our garden\npath. Holmes was already dressed, and we hastened down to meet him.\n\nOur visitor was so excited that he could hardly articulate, but at last\nin gasps and bursts his tragic story came out of him.\n\n\"We are devil-ridden, Mr. Holmes! My poor parish is devil-ridden!\" he\ncried. \"Satan himself is loose in it! We are given over into his\nhands!\" He danced about in his agitation, a ludicrous object if it\nwere not for his ashy face and startled eyes. Finally he shot out his\nterrible news.\n\n\"Mr. Mortimer Tregennis died during the night, and with exactly the\nsame symptoms as the rest of his family.\"\n\nHolmes sprang to his feet, all energy in an instant.\n\n\"Can you fit us both into your dog-cart?\"\n\n\"Yes, I can.\"\n\n\"Then, Watson, we will postpone our breakfast. Mr. Roundhay, we are\nentirely at your disposal. Hurry--hurry, before things get\ndisarranged.\"\n\nThe lodger occupied two rooms at the vicarage, which were in an angle\nby themselves, the one above the other. Below was a large\nsitting-room; above, his bedroom. They looked out upon a croquet lawn\nwhich came up to the windows. We had arrived before the doctor or the\npolice, so that everything was absolutely undisturbed. Let me describe\nexactly the scene as we saw it upon that misty March morning. It has\nleft an impression which can never be effaced from my mind.\n\nThe atmosphere of the room was of a horrible and depressing stuffiness.\nThe servant who had first entered had thrown up the window, or it would\nhave been even more intolerable. This might partly be due to the fact\nthat a lamp stood flaring and smoking on the centre table. 
Beside it\nsat the dead man, leaning back in his chair, his thin beard projecting,\nhis spectacles pushed up on to his forehead, and his lean dark face\nturned towards the window and twisted into the same distortion of\nterror which had marked the features of his dead sister. His limbs\nwere convulsed and his fingers contorted as though he had died in a\nvery paroxysm of fear. He was fully clothed, though there were signs\nthat his dressing had been done in a hurry. We had already learned\nthat his bed had been slept in, and that the tragic end had come to him\nin the early morning.\n\nOne realized the red-hot energy which underlay Holmes's phlegmatic\nexterior when one saw the sudden change which came over him from the\nmoment that he entered the fatal apartment. In an instant he was tense\nand alert, his eyes shining, his face set, his limbs quivering with\neager activity. He was out on the lawn, in through the window, round\nthe room, and up into the bedroom, for all the world like a dashing\nfoxhound drawing a cover. In the bedroom he made a rapid cast around\nand ended by throwing open the window, which appeared to give him some\nfresh cause for excitement, for he leaned out of it with loud\nejaculations of interest and delight. Then he rushed down the stair,\nout through the open window, threw himself upon his face on the lawn,\nsprang up and into the room once more, all with the energy of the\nhunter who is at the very heels of his quarry. The lamp, which was an\nordinary standard, he examined with minute care, making certain\nmeasurements upon its bowl. He carefully scrutinized with his lens the\ntalc shield which covered the top of the chimney and scraped off some\nashes which adhered to its upper surface, putting some of them into an\nenvelope, which he placed in his pocketbook. Finally, just as the\ndoctor and the official police put in an appearance, he beckoned to the\nvicar and we all three went out upon the lawn.\n\n\"I am glad to say that my investigation has not been entirely barren,\"\nhe remarked. \"I cannot remain to discuss the matter with the police,\nbut I should be exceedingly obliged, Mr. Roundhay, if you would give\nthe inspector my compliments and direct his attention to the bedroom\nwindow and to the sitting-room lamp. Each is suggestive, and together\nthey are almost conclusive. If the police would desire further\ninformation I shall be happy to see any of them at the cottage. And\nnow, Watson, I think that, perhaps, we shall be better employed\nelsewhere.\"\n\nIt may be that the police resented the intrusion of an amateur, or that\nthey imagined themselves to be upon some hopeful line of investigation;\nbut it is certain that we heard nothing from them for the next two\ndays. During this time Holmes spent some of his time smoking and\ndreaming in the cottage; but a greater portion in country walks which\nhe undertook alone, returning after many hours without remark as to\nwhere he had been. One experiment served to show me the line of his\ninvestigation. He had bought a lamp which was the duplicate of the one\nwhich had burned in the room of Mortimer Tregennis on the morning of\nthe tragedy. This he filled with the same oil as that used at the\nvicarage, and he carefully timed the period which it would take to be\nexhausted. 
Another experiment which he made was of a more unpleasant\nnature, and one which I am not likely ever to forget.\n\n\"You will remember, Watson,\" he remarked one afternoon, \"that there is\na single common point of resemblance in the varying reports which have\nreached us. This concerns the effect of the atmosphere of the room in\neach case upon those who had first entered it. You will recollect that\nMortimer Tregennis, in describing the episode of his last visit to his\nbrother's house, remarked that the doctor on entering the room fell\ninto a chair? You had forgotten? Well I can answer for it that it was\nso. Now, you will remember also that Mrs. Porter, the housekeeper, told\nus that she herself fainted upon entering the room and had afterwards\nopened the window. In the second case--that of Mortimer Tregennis\nhimself--you cannot have forgotten the horrible stuffiness of the room\nwhen we arrived, though the servant had thrown open the window. That\nservant, I found upon inquiry, was so ill that she had gone to her bed.\nYou will admit, Watson, that these facts are very suggestive. In each\ncase there is evidence of a poisonous atmosphere. In each case, also,\nthere is combustion going on in the room--in the one case a fire, in\nthe other a lamp. The fire was needed, but the lamp was lit--as a\ncomparison of the oil consumed will show--long after it was broad\ndaylight. Why? Surely because there is some connection between three\nthings--the burning, the stuffy atmosphere, and, finally, the madness\nor death of those unfortunate people. That is clear, is it not?\"\n\n\"It would appear so.\"\n\n\"At least we may accept it as a working hypothesis. We will suppose,\nthen, that something was burned in each case which produced an\natmosphere causing strange toxic effects. Very good. In the first\ninstance--that of the Tregennis family--this substance was placed in\nthe fire. Now the window was shut, but the fire would naturally carry\nfumes to some extent up the chimney. Hence one would expect the\neffects of the poison to be less than in the second case, where there\nwas less escape for the vapour. The result seems to indicate that it\nwas so, since in the first case only the woman, who had presumably the\nmore sensitive organism, was killed, the others exhibiting that\ntemporary or permanent lunacy which is evidently the first effect of\nthe drug. In the second case the result was complete. The facts,\ntherefore, seem to bear out the theory of a poison which worked by\ncombustion.\n\n\"With this train of reasoning in my head I naturally looked about in\nMortimer Tregennis's room to find some remains of this substance. The\nobvious place to look was the talc shelf or smoke-guard of the lamp.\nThere, sure enough, I perceived a number of flaky ashes, and round the\nedges a fringe of brownish powder, which had not yet been consumed.\nHalf of this I took, as you saw, and I placed it in an envelope.\"\n\n\"Why half, Holmes?\"\n\n\"It is not for me, my dear Watson, to stand in the way of the official\npolice force. I leave them all the evidence which I found. The poison\nstill remained upon the talc had they the wit to find it. Now, Watson,\nwe will light our lamp; we will, however, take the precaution to open\nour window to avoid the premature decease of two deserving members of\nsociety, and you will seat yourself near that open window in an\narmchair unless, like a sensible man, you determine to have nothing to\ndo with the affair. Oh, you will see it out, will you? 
I thought I\nknew my Watson. This chair I will place opposite yours, so that we may\nbe the same distance from the poison and face to face. The door we\nwill leave ajar. Each is now in a position to watch the other and to\nbring the experiment to an end should the symptoms seem alarming. Is\nthat all clear? Well, then, I take our powder--or what remains of\nit--from the envelope, and I lay it above the burning lamp. So! Now,\nWatson, let us sit down and await developments.\"\n\nThey were not long in coming. I had hardly settled in my chair before\nI was conscious of a thick, musky odour, subtle and nauseous. At the\nvery first whiff of it my brain and my imagination were beyond all\ncontrol. A thick, black cloud swirled before my eyes, and my mind told\nme that in this cloud, unseen as yet, but about to spring out upon my\nappalled senses, lurked all that was vaguely horrible, all that was\nmonstrous and inconceivably wicked in the universe. Vague shapes\nswirled and swam amid the dark cloud-bank, each a menace and a warning\nof something coming, the advent of some unspeakable dweller upon the\nthreshold, whose very shadow would blast my soul. A freezing horror\ntook possession of me. I felt that my hair was rising, that my eyes\nwere protruding, that my mouth was opened, and my tongue like leather.\nThe turmoil within my brain was such that something must surely snap.\nI tried to scream and was vaguely aware of some hoarse croak which was\nmy own voice, but distant and detached from myself. At the same moment,\nin some effort of escape, I broke through that cloud of despair and had\na glimpse of Holmes's face, white, rigid, and drawn with horror--the\nvery look which I had seen upon the features of the dead. It was that\nvision which gave me an instant of sanity and of strength. I dashed\nfrom my chair, threw my arms round Holmes, and together we lurched\nthrough the door, and an instant afterwards had thrown ourselves down\nupon the grass plot and were lying side by side, conscious only of the\nglorious sunshine which was bursting its way through the hellish cloud\nof terror which had girt us in. Slowly it rose from our souls like the\nmists from a landscape until peace and reason had returned, and we were\nsitting upon the grass, wiping our clammy foreheads, and looking with\napprehension at each other to mark the last traces of that terrific\nexperience which we had undergone.\n\n\"Upon my word, Watson!\" said Holmes at last with an unsteady voice, \"I\nowe you both my thanks and an apology. It was an unjustifiable\nexperiment even for one's self, and doubly so for a friend. I am\nreally very sorry.\"\n\n\"You know,\" I answered with some emotion, for I have never seen so much\nof Holmes's heart before, \"that it is my greatest joy and privilege to\nhelp you.\"\n\nHe relapsed at once into the half-humorous, half-cynical vein which was\nhis habitual attitude to those about him. \"It would be superfluous to\ndrive us mad, my dear Watson,\" said he. \"A candid observer would\ncertainly declare that we were so already before we embarked upon so\nwild an experiment. I confess that I never imagined that the effect\ncould be so sudden and so severe.\" He dashed into the cottage, and,\nreappearing with the burning lamp held at full arm's length, he threw\nit among a bank of brambles. \"We must give the room a little time to\nclear. 
I take it, Watson, that you have no longer a shadow of a doubt\nas to how these tragedies were produced?\"\n\n\"None whatever.\"\n\n\"But the cause remains as obscure as before. Come into the arbour here\nand let us discuss it together. That villainous stuff seems still to\nlinger round my throat. I think we must admit that all the evidence\npoints to this man, Mortimer Tregennis, having been the criminal in the\nfirst tragedy, though he was the victim in the second one. We must\nremember, in the first place, that there is some story of a family\nquarrel, followed by a reconciliation. How bitter that quarrel may\nhave been, or how hollow the reconciliation we cannot tell. When I\nthink of Mortimer Tregennis, with the foxy face and the small shrewd,\nbeady eyes behind the spectacles, he is not a man whom I should judge\nto be of a particularly forgiving disposition. Well, in the next place,\nyou will remember that this idea of someone moving in the garden, which\ntook our attention for a moment from the real cause of the tragedy,\nemanated from him. He had a motive in misleading us. Finally, if he\ndid not throw the substance into the fire at the moment of leaving the\nroom, who did do so? The affair happened immediately after his\ndeparture. Had anyone else come in, the family would certainly have\nrisen from the table. Besides, in peaceful Cornwall, visitors did not\narrive after ten o'clock at night. We may take it, then, that all the\nevidence points to Mortimer Tregennis as the culprit.\"\n\n\"Then his own death was suicide!\"\n\n\"Well, Watson, it is on the face of it a not impossible supposition.\nThe man who had the guilt upon his soul of having brought such a fate\nupon his own family might well be driven by remorse to inflict it upon\nhimself. There are, however, some cogent reasons against it.\nFortunately, there is one man in England who knows all about it, and I\nhave made arrangements by which we shall hear the facts this afternoon\nfrom his own lips. Ah! he is a little before his time. Perhaps you\nwould kindly step this way, Dr. Leon Sterndale. We have been conducing\na chemical experiment indoors which has left our little room hardly fit\nfor the reception of so distinguished a visitor.\"\n\nI had heard the click of the garden gate, and now the majestic figure\nof the great African explorer appeared upon the path. He turned in\nsome surprise towards the rustic arbour in which we sat.\n\n\"You sent for me, Mr. Holmes. I had your note about an hour ago, and I\nhave come, though I really do not know why I should obey your summons.\"\n\n\"Perhaps we can clear the point up before we separate,\" said Holmes.\n\"Meanwhile, I am much obliged to you for your courteous acquiescence.\nYou will excuse this informal reception in the open air, but my friend\nWatson and I have nearly furnished an additional chapter to what the\npapers call the Cornish Horror, and we prefer a clear atmosphere for\nthe present. Perhaps, since the matters which we have to discuss will\naffect you personally in a very intimate fashion, it is as well that we\nshould talk where there can be no eavesdropping.\"\n\nThe explorer took his cigar from his lips and gazed sternly at my\ncompanion.\n\n\"I am at a loss to know, sir,\" he said, \"what you can have to speak\nabout which affects me personally in a very intimate fashion.\"\n\n\"The killing of Mortimer Tregennis,\" said Holmes.\n\nFor a moment I wished that I were armed. 
Sterndale's fierce face\nturned to a dusky red, his eyes glared, and the knotted, passionate\nveins started out in his forehead, while he sprang forward with\nclenched hands towards my companion. Then he stopped, and with a\nviolent effort he resumed a cold, rigid calmness, which was, perhaps,\nmore suggestive of danger than his hot-headed outburst.\n\n\"I have lived so long among savages and beyond the law,\" said he, \"that\nI have got into the way of being a law to myself. You would do well,\nMr. Holmes, not to forget it, for I have no desire to do you an injury.\"\n\n\"Nor have I any desire to do you an injury, Dr. Sterndale. Surely the\nclearest proof of it is that, knowing what I know, I have sent for you\nand not for the police.\"\n\nSterndale sat down with a gasp, overawed for, perhaps, the first time\nin his adventurous life. There was a calm assurance of power in\nHolmes's manner which could not be withstood. Our visitor stammered\nfor a moment, his great hands opening and shutting in his agitation.\n\n\"What do you mean?\" he asked at last. \"If this is bluff upon your\npart, Mr. Holmes, you have chosen a bad man for your experiment. Let us\nhave no more beating about the bush. What DO you mean?\"\n\n\"I will tell you,\" said Holmes, \"and the reason why I tell you is that\nI hope frankness may beget frankness. What my next step may be will\ndepend entirely upon the nature of your own defence.\"\n\n\"My defence?\"\n\n\"Yes, sir.\"\n\n\"My defence against what?\"\n\n\"Against the charge of killing Mortimer Tregennis.\"\n\nSterndale mopped his forehead with his handkerchief. \"Upon my word,\nyou are getting on,\" said he. \"Do all your successes depend upon this\nprodigious power of bluff?\"\n\n\"The bluff,\" said Holmes sternly, \"is upon your side, Dr. Leon\nSterndale, and not upon mine. As a proof I will tell you some of the\nfacts upon which my conclusions are based. Of your return from\nPlymouth, allowing much of your property to go on to Africa, I will say\nnothing save that it first informed me that you were one of the factors\nwhich had to be taken into account in reconstructing this drama--\"\n\n\"I came back--\"\n\n\"I have heard your reasons and regard them as unconvincing and\ninadequate. We will pass that. You came down here to ask me whom I\nsuspected. I refused to answer you. You then went to the vicarage,\nwaited outside it for some time, and finally returned to your cottage.\"\n\n\"How do you know that?\"\n\n\"I followed you.\"\n\n\"I saw no one.\"\n\n\"That is what you may expect to see when I follow you. You spent a\nrestless night at your cottage, and you formed certain plans, which in\nthe early morning you proceeded to put into execution. Leaving your\ndoor just as day was breaking, you filled your pocket with some reddish\ngravel that was lying heaped beside your gate.\"\n\nSterndale gave a violent start and looked at Holmes in amazement.\n\n\"You then walked swiftly for the mile which separated you from the\nvicarage. You were wearing, I may remark, the same pair of ribbed\ntennis shoes which are at the present moment upon your feet. At the\nvicarage you passed through the orchard and the side hedge, coming out\nunder the window of the lodger Tregennis. It was now daylight, but the\nhousehold was not yet stirring. You drew some of the gravel from your\npocket, and you threw it up at the window above you.\"\n\nSterndale sprang to his feet.\n\n\"I believe that you are the devil himself!\" he cried.\n\nHolmes smiled at the compliment. 
\"It took two, or possibly three,\nhandfuls before the lodger came to the window. You beckoned him to\ncome down. He dressed hurriedly and descended to his sitting-room.\nYou entered by the window. There was an interview--a short one--during\nwhich you walked up and down the room. Then you passed out and closed\nthe window, standing on the lawn outside smoking a cigar and watching\nwhat occurred. Finally, after the death of Tregennis, you withdrew as\nyou had come. Now, Dr. Sterndale, how do you justify such conduct, and\nwhat were the motives for your actions? If you prevaricate or trifle\nwith me, I give you my assurance that the matter will pass out of my\nhands forever.\"\n\nOur visitor's face had turned ashen gray as he listened to the words of\nhis accuser. Now he sat for some time in thought with his face sunk in\nhis hands. Then with a sudden impulsive gesture he plucked a\nphotograph from his breast-pocket and threw it on the rustic table\nbefore us.\n\n\"That is why I have done it,\" said he.\n\nIt showed the bust and face of a very beautiful woman. Holmes stooped\nover it.\n\n\"Brenda Tregennis,\" said he.\n\n\"Yes, Brenda Tregennis,\" repeated our visitor. \"For years I have loved\nher. For years she has loved me. There is the secret of that Cornish\nseclusion which people have marvelled at. It has brought me close to\nthe one thing on earth that was dear to me. I could not marry her, for\nI have a wife who has left me for years and yet whom, by the deplorable\nlaws of England, I could not divorce. For years Brenda waited. For\nyears I waited. And this is what we have waited for.\" A terrible sob\nshook his great frame, and he clutched his throat under his brindled\nbeard. Then with an effort he mastered himself and spoke on:\n\n\"The vicar knew. He was in our confidence. He would tell you that she\nwas an angel upon earth. That was why he telegraphed to me and I\nreturned. What was my baggage or Africa to me when I learned that such\na fate had come upon my darling? There you have the missing clue to my\naction, Mr. Holmes.\"\n\n\"Proceed,\" said my friend.\n\nDr. Sterndale drew from his pocket a paper packet and laid it upon the\ntable. On the outside was written \"Radix pedis diaboli\" with a red\npoison label beneath it. He pushed it towards me. \"I understand that\nyou are a doctor, sir. Have you ever heard of this preparation?\"\n\n\"Devil's-foot root! No, I have never heard of it.\"\n\n\"It is no reflection upon your professional knowledge,\" said he, \"for I\nbelieve that, save for one sample in a laboratory at Buda, there is no\nother specimen in Europe. It has not yet found its way either into the\npharmacopoeia or into the literature of toxicology. The root is shaped\nlike a foot, half human, half goatlike; hence the fanciful name given\nby a botanical missionary. It is used as an ordeal poison by the\nmedicine-men in certain districts of West Africa and is kept as a\nsecret among them. This particular specimen I obtained under very\nextraordinary circumstances in the Ubangi country.\" He opened the\npaper as he spoke and disclosed a heap of reddish-brown, snuff-like\npowder.\n\n\"Well, sir?\" asked Holmes sternly.\n\n\"I am about to tell you, Mr. Holmes, all that actually occurred, for\nyou already know so much that it is clearly to my interest that you\nshould know all. I have already explained the relationship in which I\nstood to the Tregennis family. For the sake of the sister I was\nfriendly with the brothers. 
There was a family quarrel about money\nwhich estranged this man Mortimer, but it was supposed to be made up,\nand I afterwards met him as I did the others. He was a sly, subtle,\nscheming man, and several things arose which gave me a suspicion of\nhim, but I had no cause for any positive quarrel.\n\n\"One day, only a couple of weeks ago, he came down to my cottage and I\nshowed him some of my African curiosities. Among other things I\nexhibited this powder, and I told him of its strange properties, how it\nstimulates those brain centres which control the emotion of fear, and\nhow either madness or death is the fate of the unhappy native who is\nsubjected to the ordeal by the priest of his tribe. I told him also\nhow powerless European science would be to detect it. How he took it I\ncannot say, for I never left the room, but there is no doubt that it\nwas then, while I was opening cabinets and stooping to boxes, that he\nmanaged to abstract some of the devil's-foot root. I well remember how\nhe plied me with questions as to the amount and the time that was\nneeded for its effect, but I little dreamed that he could have a\npersonal reason for asking.\n\n\"I thought no more of the matter until the vicar's telegram reached me\nat Plymouth. This villain had thought that I would be at sea before\nthe news could reach me, and that I should be lost for years in Africa.\nBut I returned at once. Of course, I could not listen to the details\nwithout feeling assured that my poison had been used. I came round to\nsee you on the chance that some other explanation had suggested itself\nto you. But there could be none. I was convinced that Mortimer\nTregennis was the murderer; that for the sake of money, and with the\nidea, perhaps, that if the other members of his family were all insane\nhe would be the sole guardian of their joint property, he had used the\ndevil's-foot powder upon them, driven two of them out of their senses,\nand killed his sister Brenda, the one human being whom I have ever\nloved or who has ever loved me. There was his crime; what was to be\nhis punishment?\n\n\"Should I appeal to the law? Where were my proofs? I knew that the\nfacts were true, but could I help to make a jury of countrymen believe\nso fantastic a story? I might or I might not. But I could not afford\nto fail. My soul cried out for revenge. I have said to you once\nbefore, Mr. Holmes, that I have spent much of my life outside the law,\nand that I have come at last to be a law to myself. So it was even\nnow. I determined that the fate which he had given to others should be\nshared by himself. Either that or I would do justice upon him with my\nown hand. In all England there can be no man who sets less value upon\nhis own life than I do at the present moment.\n\n\"Now I have told you all. You have yourself supplied the rest. I did,\nas you say, after a restless night, set off early from my cottage. I\nforesaw the difficulty of arousing him, so I gathered some gravel from\nthe pile which you have mentioned, and I used it to throw up to his\nwindow. He came down and admitted me through the window of the\nsitting-room. I laid his offence before him. I told him that I had\ncome both as judge and executioner. The wretch sank into a chair,\nparalyzed at the sight of my revolver. I lit the lamp, put the powder\nabove it, and stood outside the window, ready to carry out my threat to\nshoot him should he try to leave the room. In five minutes he died.\nMy God! how he died! 
But my heart was flint, for he endured nothing\nwhich my innocent darling had not felt before him. There is my story,\nMr. Holmes. Perhaps, if you loved a woman, you would have done as much\nyourself. At any rate, I am in your hands. You can take what steps\nyou like. As I have already said, there is no man living who can fear\ndeath less than I do.\"\n\nHolmes sat for some little time in silence.\n\n\"What were your plans?\" he asked at last.\n\n\"I had intended to bury myself in central Africa. My work there is but\nhalf finished.\"\n\n\"Go and do the other half,\" said Holmes. \"I, at least, am not prepared\nto prevent you.\"\n\nDr. Sterndale raised his giant figure, bowed gravely, and walked from\nthe arbour. Holmes lit his pipe and handed me his pouch.\n\n\"Some fumes which are not poisonous would be a welcome change,\" said\nhe. \"I think you must agree, Watson, that it is not a case in which we\nare called upon to interfere. Our investigation has been independent,\nand our action shall be so also. You would not denounce the man?\"\n\n\"Certainly not,\" I answered.\n\n\"I have never loved, Watson, but if I did and if the woman I loved had\nmet such an end, I might act even as our lawless lion-hunter has done.\nWho knows? Well, Watson, I will not offend your intelligence by\nexplaining what is obvious. The gravel upon the window-sill was, of\ncourse, the starting-point of my research. It was unlike anything in\nthe vicarage garden. Only when my attention had been drawn to Dr.\nSterndale and his cottage did I find its counterpart. The lamp shining\nin broad daylight and the remains of powder upon the shield were\nsuccessive links in a fairly obvious chain. And now, my dear Watson, I\nthink we may dismiss the matter from our mind and go back with a clear\nconscience to the study of those Chaldean roots which are surely to be\ntraced in the Cornish branch of the great Celtic speech.\"\n\n\n\n\n\n\n\n\n\nEnd of the Project Gutenberg EBook of The Adventure of the Devil's Foot, by \nArthur Conan Doyle", "answers": ["Six hours."]}
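The long passage above is the raw content of a data file bundled with this upload: a JSON record whose string value carries the full Doyle story with escaped newlines, and whose visible tail ends in an "answers" field ("Six hours."), i.e. a long-context question-answering example. Below is a minimal sketch of how such a record might be inspected; the file path and the JSON Lines layout are assumptions for illustration, and only the "answers" key is taken from the visible content.

# Minimal sketch: inspecting a long-context QA record like the one above.
# Assumptions (not shown in this diff): the file is JSON Lines and lives at a
# hypothetical path; every field name except "answers" is treated generically.
import json

path = "data/longcontext_qa.jsonl"  # hypothetical path

with open(path, encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        answers = record.get("answers", [])            # e.g. ["Six hours."]
        other_fields = [k for k in record if k != "answers"]
        print(f"reference answers: {answers}; other fields: {other_fields}")
        break  # only look at the first record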
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
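The special_tokens_map.json above declares the standard Llama-2 special tokens: "<s>" as BOS, "</s>" as EOS, "<unk>" as UNK, and "</s>" reused as the padding token. A minimal sketch of checking these through the tokenizer follows; the repo id is an assumption inferred from the "_name_or_path" field in config.json.

# Sketch: confirm the special tokens declared in special_tokens_map.json.
# The repo id is an assumption; substitute a local path or the actual repo id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("namespace-Pt/ultragist-llama2-7b-chat")

print(tok.bos_token)  # "<s>"
print(tok.eos_token)  # "</s>"
print(tok.unk_token)  # "<unk>"
print(tok.pad_token)  # "</s>" -- padding reuses the EOS token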
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "bos_token": "<s>",
31
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": false,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "</s>",
37
+ "padding_side": "left",
38
+ "sp_model_kwargs": {},
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }
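The tokenizer_config.json above configures a standard LlamaTokenizer with left-side padding, the EOS token doubling as the pad token, and a Llama-2 chat_template that wraps an optional system message in <<SYS>> ... <</SYS>> and each user turn in [INST] ... [/INST]. A minimal sketch of rendering a conversation through that template follows; the repo id is the same assumption as above.

# Sketch: render a conversation with the Llama-2 chat template declared above.
# The repo id is an assumption; adjust to a local copy if needed.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("namespace-Pt/ultragist-llama2-7b-chat")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the case in one sentence."},
]

# tokenize=False returns the formatted prompt string, roughly:
# "<s>[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\nSummarize ... [/INST]"
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)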