jwcho committed on
Commit 208ab35 · 1 Parent(s): cbeaae9

first commit

app.py ADDED
@@ -0,0 +1,224 @@
+ from pathlib import Path
+
+ import gradio as gr
+ import lightning as L
+ import torch
+
+ from lit_llama import LLaMA, Tokenizer
+ from lit_llama.utils import EmptyInitOnDevice
+
+
+ class ChatDoctor:
+     def __init__(self, model, tokenizer, fabric):
+         self.model = model
+         self.tokenizer = tokenizer
+         self.fabric = fabric
+
+     def generate_prompt(self, example):
+         if example["input"]:
+             return (
+                 "아래는 작업을 설명하는 명령어와 추가적 맥락을 제공하는 입력이 짝을 이루는 예제입니다.\n\n"
+                 "요청을 적절히 완료하는 응답을 작성하세요.\n\n"
+                 f"### 명령어:\n{example['instruction']}\n\n### 입력:\n{example['input']}\n\n### 응답:"
+             )
+         return (
+             "환자가 의사에게 아픈 곳에 대해 문의합니다.\n\n"
+             "환자의 문의 내용에 대해 답변하세요. 환자의 질병을 진단하고, 가능하면 처방을 하세요. \n\n"
+             f"### 문의:\n{example['instruction']}\n\n### 응답:"
+         )
+
+     # This method generates the chatbot's responses.
+     @torch.no_grad()
+     def generate(
+         self,
+         idx,
+         max_new_tokens,
+         max_seq_length=None,
+         temperature=0.8,
+         top_k=None,
+         eos_id=None
+     ):
+         T = idx.size(0)
+         T_new = T + max_new_tokens
+         if max_seq_length is None:
+             max_seq_length = min(T_new, self.model.config.block_size)
+
+         device, dtype = idx.device, idx.dtype
+         # create an empty tensor of the expected final shape and fill in the current tokens
+         empty = torch.empty(T_new, dtype=dtype, device=device)
+         empty[:T] = idx
+         idx = empty
+         input_pos = torch.arange(0, T, device=device)
+
+         if idx.device.type == "xla":
+             import torch_xla.core.xla_model as xm
+
+             xm.mark_step()
+
+         # generate max_new_tokens tokens
+         for _ in range(max_new_tokens):
+             x = idx.index_select(0, input_pos).view(1, -1)
+
+             # forward
+             logits = self.model(x, max_seq_length, input_pos)
+             logits = logits[0, -1] / temperature
+
+             # optionally crop the logits to only the top k options
+             if top_k is not None:
+                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                 logits = torch.where(logits < v[[-1]], -float("Inf"), logits)
+
+             probs = torch.nn.functional.softmax(logits, dim=-1)
+             idx_next = torch.multinomial(probs, num_samples=1).to(dtype=dtype)
+
+             # advance
+             input_pos = input_pos[-1:] + 1
+
+             if idx.device.type == "xla":
+                 xm.mark_step()
+
+             # concatenate the new generation
+             idx = idx.index_copy(0, input_pos, idx_next)
+
+             # if <eos> token is triggered, return the output (stop generation)
+             if idx_next == eos_id:
+                 return idx[:input_pos]  # include the EOS token
+
+         return idx
+
+
+     # This method handles user's messages and updates the conversation history.
+     def user(self, user_message, history):
+         # The user's message is added to the history with None as the bot's response.
+         return "", history + [[user_message, None]]
+
+     # This method generates and handles bot's responses.
+     def bot(self, history, max_new_tokens, top_k, temperature):
+         instruction = history[-1][0].strip()
+         sample = {"instruction": instruction, "input": None}
+         prompt = self.generate_prompt(sample)
+         encoded_prompt = self.tokenizer.encode(prompt, bos=True, eos=False, device=self.fabric.device)
+
+         y = self.generate(
+             idx=encoded_prompt,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             top_k=top_k,
+             eos_id=self.tokenizer.eos_id
+         )
+
+         self.model.reset_cache()
+
+         response = self.tokenizer.decode(y)
+         response = response.split('응답:')[1].strip()
+
+         # The history is updated with the bot's response.
+         history[-1][1] = response
+
+         return history
+
+
+ def load_model():
+     # Settings for inference
+     # Precision setting for float32 matmul operations. It's important for some CUDA devices.
+     torch.set_float32_matmul_precision("high")
+
+     checkpoint_path = Path("checkpoints/lit-llama/7B/lit-llama.pth")
+     tokenizer_path = Path("checkpoints/lit-llama/tokenizer.model")
+     quantize = None  # "gptq.int4" or "llm.int8"
+
+     fabric = L.Fabric(devices=1)
+     dtype = torch.bfloat16 if fabric.device.type == "cuda" and torch.cuda.is_bf16_supported() else torch.float32
+
+     with EmptyInitOnDevice(device=fabric.device, dtype=dtype, quantization_mode=quantize):
+         model = LLaMA.from_name("7B")
+
+     checkpoint = torch.load(checkpoint_path)
+     model.load_state_dict(checkpoint)
+
+     model.eval()
+     model = fabric.setup_module(model)
+
+     tokenizer = Tokenizer(tokenizer_path)
+
+     return model, tokenizer, fabric
+
+
+ def setup_gradio_ui(chat_doctor):
+     with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
+         gr.Markdown(
+             """
+             # ChatDoctor-KR Demo
+
+             last modified : 23.05.18
+             """)
+
+         chatbot = gr.Chatbot(label="ChatDoctor-KR")
+         msg = gr.Textbox(lines=1, placeholder="질문 입력 후 엔터를 누르세요.", label="질문")
+         clear = gr.Button("클리어")
+
+         gr.Markdown(
+             """
+             ## Parameters
+             """)
+
+         max_new_tokens = gr.Slider(
+             minimum=1,
+             maximum=512,
+             step=1,
+             value=512,
+             label="max_new_tokens",
+             info="The number of new tokens to generate",
+             interactive=True
+         )
+
+         top_k = gr.Slider(
+             minimum=1,
+             maximum=300,
+             step=1,
+             value=200,
+             label="top_k",
+             info="If specified, only sample among the tokens with the k highest probabilities",
+             interactive=True
+         )
+
+         temperature = gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             step=0.1,
+             value=0.8,
+             label="temperature",
+             info="Scales the predicted logits by 1 / temperature",
+             interactive=True
+         )
+
+         with gr.Accordion(label="Open for More!", open=False):
+             gr.Markdown("Blah Blah ...")
+
+         submit_result = msg.submit(
+             chat_doctor.user, [msg, chatbot], [msg, chatbot], queue=False
+         )
+         submit_result.then(
+             chat_doctor.bot, [chatbot, max_new_tokens, top_k, temperature], chatbot
+         )
+
+         # This part clears the chatbot history when the clear button is clicked.
+         clear.click(lambda: None, None, chatbot, queue=False)
+
+     demo.queue()
+
+     demo.launch(share=True, server_name="0.0.0.0")
+
+
+ def main():
+     # Load model and tokenizer
+     model, tokenizer, fabric = load_model()
+
+     # ChatDoctor instance
+     chat_doctor = ChatDoctor(model, tokenizer, fabric)
+
+     # Gradio UI setup and launch
+     setup_gradio_ui(chat_doctor)
+
+ if __name__ == "__main__":
+     main()
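
The sampling step inside `ChatDoctor.generate` above reduces to temperature scaling, optional top-k filtering, and a multinomial draw. A minimal sketch of just that step (the function name and the 1-D `logits` shape are illustrative assumptions, not part of app.py):

import torch

def sample_next_token(logits, temperature=0.8, top_k=None):
    # scale the predicted logits by 1 / temperature
    logits = logits / temperature
    if top_k is not None:
        # keep only the k highest logits; everything else becomes -inf
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        logits = torch.where(logits < v[-1], -float("Inf"), logits)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)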
checkpoints/lit-llama/7B/lit-llama.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ea5abe49d33b50c000c1107907db19ef293dd61fceab8b451fe883f5fd8a919
+ size 13476954436
checkpoints/lit-llama/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
lit_llama/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from lit_llama.model import LLaMAConfig, LLaMA, RMSNorm, build_rope_cache, apply_rope
+ from lit_llama.tokenizer import Tokenizer
lit_llama/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (415 Bytes).
 
lit_llama/__pycache__/model.cpython-311.pyc ADDED
Binary file (20 kB).
 
lit_llama/__pycache__/tokenizer.cpython-311.pyc ADDED
Binary file (3.37 kB).
 
lit_llama/__pycache__/utils.cpython-311.pyc ADDED
Binary file (25.3 kB).
 
lit_llama/adapter.py ADDED
@@ -0,0 +1,313 @@
1
+ """Implementation of the paper:
2
+
3
+ LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention
4
+ https://arxiv.org/abs/2303.16199
5
+
6
+ | Prefix cross-attention
7
+ |
8
+ ┌─────────────────┐ | ┌──────────────────┐
9
+ ┆ x ┆ | ┆ prefix ┆
10
+ └─────────────────┘ | └──────────────────┘
11
+ | | |
12
+ ▼ | ▼
13
+ ┌──────────────────┐ | ┌─────────────────────┐
14
+ ┆ self-attention ┆ --------------------------------------------------------------┐ ┆ linear projection ┆
15
+ └──────────────────┘ | ┆ └─────────────────────┘
16
+ | | ┆ | \
17
+ ▼ | ▼ ▼ ▼
18
+ ╭───╮ ┌────────────────┐ ╭───╮ ┌──────────────────────────┐ | ┌─────────┐ ┌──────────────┐ ┌────────────────┐
19
+ ┆ + ┆ ◀── ┆ gating factor ┆-┆ x ┆-┆ prefix cross-attention ┆ | ┆ query ┆ ┆ prefix key ┆ ┆ prefix value ┆
20
+ ╰───╯ └────────────────┘ ╰───╯ └──────────────────────────┘ | └─────────┘ └──────────────┘ └────────────────┘
21
+ | | \ | /
22
+ ▼ | ▼ ▼ ▼
23
+ | ┌────────────────────────────────┐
24
+ | ┆ scaled dot-product attention ┆
25
+ | └────────────────────────────────┘
26
+
27
+
28
+ In order to inject learnable information from the prefix to pretrained weights we need to sum outputs from
29
+ self-attention and prefix cross-attention (times gating factor). For prefix cross-attention we need `query` (from
30
+ self-attention as a result of linear projection), `prefix key` and `prefix value` (from cross-attention as a result of
31
+ linear projection).
32
+ The output of prefix cross-attention is multiplied by gating factor, which is a learnable parameter that is needed to
33
+ avoid potential disruption of pretrained weights caused by incorporating randomly initialized tensors. This factor is
34
+ initialized with zeros to avoid noise from the adaption prompts at the early training stage.
35
+ More about it: https://lightning.ai/pages/community/article/understanding-llama-adapters/
36
+
37
+ Notes about implementation: as per paper adapter's prefix is concatenated with the input, while here outputs of
38
+ self-attention and prefix cross-attention are summed. Both variants are mathematically equivalent:
39
+ https://github.com/ZrrSkywalker/LLaMA-Adapter/issues/47
40
+ """
41
+ # mypy: ignore-errors
42
+ from dataclasses import dataclass
43
+ from typing import Optional, Tuple, List, Union
44
+
45
+ import torch
46
+ import torch.nn as nn
47
+ from torch.nn import functional as F
48
+
49
+ import lit_llama.model as llama
50
+ from lit_llama.model import build_rope_cache, apply_rope, RMSNorm, MLP, KVCache, RoPECache
51
+
52
+
53
+ @dataclass
54
+ class LLaMAConfig(llama.LLaMAConfig):
55
+ adapter_prompt_length: int = 10
56
+ adapter_start_layer: int = 2
57
+
58
+
59
+ class CausalSelfAttention(nn.Module):
60
+ """A modification of `lit_llama.model.CausalSelfAttention` that adds the attention
61
+ over the adaption prompt."""
62
+
63
+ def __init__(self, config: LLaMAConfig, block_idx: int) -> None:
64
+ super().__init__()
65
+ assert config.n_embd % config.n_head == 0
66
+
67
+ # key, query, value projections for all heads, but in a batch
68
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
69
+ # output projection
70
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
71
+
72
+ if block_idx >= config.adapter_start_layer:
73
+ # adapter embedding layer
74
+ self.adapter_wte = nn.Embedding(config.adapter_prompt_length, config.n_embd)
75
+ # a learnable gating factor (to avoid potential disruption of pretrained weights) initialized with zeros (to
76
+ # avoid noise from adaption prompts at the early training stage)
77
+ self.gating_factor = torch.nn.Parameter(torch.zeros(1, config.n_head, 1, 1))
78
+
79
+ self.n_head = config.n_head
80
+ self.n_embd = config.n_embd
81
+ self.block_size = config.block_size
82
+ self.block_idx = block_idx
83
+ self.adapter_prompt_length = config.adapter_prompt_length
84
+ self.adapter_start_layer = config.adapter_start_layer
85
+
86
+ def forward(
87
+ self,
88
+ x: torch.Tensor,
89
+ rope: RoPECache,
90
+ mask: torch.Tensor,
91
+ max_seq_length: int,
92
+ input_pos: Optional[torch.Tensor] = None,
93
+ kv_cache: Optional[KVCache] = None,
94
+ adapter_kv_cache: Optional[KVCache] = None,
95
+ ) -> Tuple[torch.Tensor, Optional[KVCache], Optional[KVCache]]:
96
+ # notation:
97
+ # - B | batch
98
+ # - T | time-step (sequence length)
99
+ # - C | embeddings size (n_embd) = head size * num heads
100
+ # - hs | head size
101
+ # - nh | number of heads
102
+
103
+ B, T, C = x.size()
104
+
105
+ # instead of calculating `query`, `key` and `value` by separately multiplying input `x` with corresponding
106
+ # weight matrices do it (for all heads) in a single multiplication with a matrix of 3x size (concatenated
107
+ # weights for q, k, v) and then split the result along `embedding size` dimension
108
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2) # (B, T, 3 * C) --> 3 * (B, T, C)
109
+
110
+ # in order to move head_size (hs) dimension right after batch (B) dimension, we need to first split
111
+ # embedding size (C) dimension into num_heads (nh) and head_size (hs)
112
+ head_size = C // self.n_head
113
+ k = k.view(B, T, self.n_head, head_size)
114
+ q = q.view(B, T, self.n_head, head_size)
115
+ v = v.view(B, T, self.n_head, head_size)
116
+
117
+ # "Unlike standard positional embeddings rotary embeddings must be applied at every layer"
118
+ q = apply_rope(q, rope) # (B, T, nh, hs)
119
+ k = apply_rope(k, rope) # (B, T, nh, hs)
120
+
121
+ # now `key`, 'query` and `value` tensors are correctly represented: for each element in a batch (B)
122
+ # there is a number of heads (nh) and for each head there is a sequence of elements (T), each of them is
123
+ # represented by a vector of size `hs`
124
+ k = k.transpose(1, 2) # (B, nh, T, hs)
125
+ q = q.transpose(1, 2) # (B, nh, T, hs)
126
+ v = v.transpose(1, 2) # (B, nh, T, hs)
127
+
128
+ if kv_cache is not None:
129
+ cache_k, cache_v = kv_cache # 2 * (B, nh, max_seq_length, hs)
130
+ # check if reached token limit
131
+ if input_pos[-1] >= max_seq_length:
132
+ # if we reached token limit and thus there is no space to put newly calculated `key` and `value`
133
+ # right next to cached ones, we need to rotate cache tensor along `max_seq_length` dimension by one
134
+ # element to the left: this will free up space for new `key` and `value`
135
+ input_pos = torch.tensor(max_seq_length - 1, device=input_pos.device)
136
+ # shift 1 position to the left
137
+ cache_k = torch.roll(cache_k, -1, dims=2)
138
+ cache_v = torch.roll(cache_v, -1, dims=2)
139
+ k = cache_k.index_copy(2, input_pos, k) # (B, nh, max_seq_length, hs)
140
+ v = cache_v.index_copy(2, input_pos, v) # (B, nh, max_seq_length, hs)
141
+ kv_cache = k, v
142
+
143
+ # efficient attention using Flash Attention CUDA kernels
144
+ # ↓ (B, nh, T, hs) @ (B, nh, T, hs).mT --> (B, nh, T, T) @ (B, nh, T, hs) --> (B, nh, T, hs)
145
+ y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0) # (B, nh, T, hs)
146
+
147
+ # "Adapters are applied to the topmost layers to better tune the language
148
+ # representations with higher-level semantics".
149
+ if self.block_idx >= self.adapter_start_layer:
150
+ if adapter_kv_cache is not None:
151
+ ak, av = adapter_kv_cache # 2 * (B, nh, aT, hs)
152
+ else:
153
+ prefix = self.adapter_wte.weight.reshape(1, self.adapter_prompt_length, self.n_embd)
154
+ aT = prefix.size(1)
155
+ _, ak, av = self.c_attn(prefix).split(self.n_embd, dim=2) # (1, aT, 3 * C) --> 3 * (1, aT, C)
156
+ ak = ak.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2) # (B, nh, aT, hs)
157
+ av = av.view(1, aT, self.n_head, head_size).repeat(B, 1, 1, 1).transpose(1, 2) # (B, nh, aT, hs)
158
+ adapter_kv_cache = (ak, av)
159
+
160
+ # Apply cross-attention with `query`, `adapter_key`, `adapter_value` and sum the output with the output
161
+ # obtained from self-attention step. This is mathematically equivalent to concatenation of prefix and input as per paper.
162
+ amask = torch.ones(q.shape[-2], ak.shape[-2], dtype=torch.bool, device=x.device) # (T, aT)
163
+ # ↓ (B, nh, T, hs) @ (B, nh, aT, hs).mT --> (B, nh, T, aT) @ (B, nh, aT, hs) --> (B, nh, T, hs)
164
+ ay = F.scaled_dot_product_attention(q, ak, av, attn_mask=amask, dropout_p=0.0, is_causal=False) # (B, nh, T, hs)
165
+ y = y + self.gating_factor * ay
166
+
167
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
168
+
169
+ # output projection
170
+ y = self.c_proj(y) # (B, T, C)
171
+
172
+ return y, kv_cache, adapter_kv_cache
173
+
174
+ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
175
+ """For backward compatibility with old checkpoints that have a single gating value for all heads."""
176
+ name = prefix + "gating_factor"
177
+ if name in state_dict:
178
+ tensor = state_dict[name]
179
+ # in case we are loading with `utils.lazy_load()`
180
+ tensor = tensor._load_tensor() if hasattr(tensor, "_load_tensor") else tensor
181
+
182
+ if len(tensor.shape) < 4:
183
+ # For old checkpoints with unified gating value
184
+ state_dict[name] = tensor.reshape(1, 1, 1, 1).repeat(1, self.n_head, 1, 1)
185
+ else:
186
+ state_dict[name] = tensor
187
+
188
+ return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
189
+
190
+
191
+ class Block(nn.Module):
192
+ """The implementation is identical to `lit_llama.model.Block` with the exception that
193
+ we replace the attention layer where adaption is implemented."""
194
+
195
+ def __init__(self, config: LLaMAConfig, block_idx: int) -> None:
196
+ super().__init__()
197
+ self.rms_1 = RMSNorm(config.n_embd)
198
+ self.attn = CausalSelfAttention(config, block_idx)
199
+ self.rms_2 = RMSNorm(config.n_embd)
200
+ self.mlp = MLP(config)
201
+
202
+ def forward(
203
+ self,
204
+ x: torch.Tensor,
205
+ rope: RoPECache,
206
+ mask: torch.Tensor,
207
+ max_seq_length: int,
208
+ input_pos: Optional[torch.Tensor] = None,
209
+ kv_cache: Optional[KVCache] = None,
210
+ adapter_kv_cache: Optional[KVCache] = None,
211
+ ) -> Tuple[torch.Tensor, Optional[KVCache], Optional[KVCache]]:
212
+ h, new_kv_cache, new_adapter_kv_cache = self.attn(
213
+ self.rms_1(x), rope, mask, max_seq_length, input_pos, kv_cache, adapter_kv_cache
214
+ )
215
+ x = x + h
216
+ x = x + self.mlp(self.rms_2(x))
217
+ return x, new_kv_cache, new_adapter_kv_cache
218
+
219
+
220
+ class LLaMA(llama.LLaMA):
221
+ """The implementation is identical to `lit_llama.model.LLaMA` with the exception that
222
+ the `Block` saves the layer index and passes it down to the attention layer."""
223
+
224
+ def __init__(self, config: LLaMAConfig) -> None:
225
+ nn.Module.__init__(self)
226
+ assert config.vocab_size is not None
227
+ assert config.block_size is not None
228
+ self.config = config
229
+
230
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
231
+ self.transformer = nn.ModuleDict(
232
+ dict(
233
+ wte=nn.Embedding(config.vocab_size, config.n_embd),
234
+ h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)),
235
+ ln_f=RMSNorm(config.n_embd),
236
+ )
237
+ )
238
+
239
+ self.rope_cache: Optional[RoPECache] = None
240
+ self.mask_cache: Optional[torch.Tensor] = None
241
+ self.kv_caches: List[KVCache] = []
242
+ self.adapter_kv_caches: List[KVCache] = []
243
+
244
+ @classmethod
245
+ def from_name(cls, name: str):
246
+ return cls(LLaMAConfig.from_name(name))
247
+
248
+ def reset_cache(self) -> None:
249
+ super().reset_cache()
250
+ self.adapter_kv_caches.clear()
251
+
252
+ def forward(
253
+ self, idx: torch.Tensor, max_seq_length: Optional[int] = None, input_pos: Optional[torch.Tensor] = None
254
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[KVCache]]]:
255
+ B, T = idx.size()
256
+
257
+ block_size = self.config.block_size
258
+ if max_seq_length is None:
259
+ max_seq_length = block_size
260
+ assert T <= max_seq_length, f"Cannot forward sequence of length {T}, max seq length is only {max_seq_length}"
261
+ assert max_seq_length <= block_size, f"Cannot attend to {max_seq_length}, block size is only {block_size}"
262
+ assert T <= block_size, f"Cannot forward sequence of length {T}, block size is only {block_size}"
263
+
264
+ if self.rope_cache is None:
265
+ self.rope_cache = self.build_rope_cache(idx) # (block_size, head_size / 2, 2)
266
+ if self.mask_cache is None:
267
+ self.mask_cache = self.build_mask_cache(idx) # (1, 1, block_size, block_size)
268
+
269
+ if input_pos is not None:
270
+ rope = self.rope_cache.index_select(0, input_pos)
271
+ mask = self.mask_cache.index_select(2, input_pos)
272
+ mask = mask[:, :, :, :max_seq_length]
273
+ else:
274
+ rope = self.rope_cache[:T]
275
+ mask = self.mask_cache[:, :, :T, :T]
276
+
277
+ # forward the model itself
278
+ x = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
279
+
280
+ if input_pos is None: # proxy for use_cache=False
281
+ for block in self.transformer.h:
282
+ x, *_ = block(x, rope, mask, max_seq_length)
283
+ else:
284
+ if not self.kv_caches:
285
+ head_size = self.config.n_embd // self.config.n_head
286
+ cache_shape = (B, self.config.n_head, max_seq_length, head_size)
287
+ self.kv_caches = [
288
+ (torch.zeros(cache_shape, device=x.device, dtype=x.dtype), torch.zeros(cache_shape, device=x.device, dtype=x.dtype))
289
+ for _ in range(self.config.n_layer)
290
+ ]
291
+ if not self.adapter_kv_caches:
292
+ self.adapter_kv_caches = [None for _ in range(self.config.n_layer)]
293
+ for i, block in enumerate(self.transformer.h):
294
+ x, self.kv_caches[i], self.adapter_kv_caches[i] = block(
295
+ x, rope, mask, max_seq_length, input_pos, self.kv_caches[i], self.adapter_kv_caches[i]
296
+ )
297
+
298
+ x = self.transformer.ln_f(x) # (B, T, n_embd)
299
+
300
+ logits = self.lm_head(x) # (B, T, vocab_size)
301
+
302
+ return logits
303
+
304
+
305
+ def mark_only_adapter_as_trainable(model: LLaMA) -> None:
306
+ """Sets `requires_grad=False` for all non-adapter weights."""
307
+ for name, param in model.named_parameters():
308
+ param.requires_grad = "adapter_wte" in name or "gating_factor" in name
309
+
310
+
311
+ def adapter_state_from_state_dict(state_dict: dict) -> dict:
312
+ """Returns the model state dict with only the adapter weights for saving."""
313
+ return {name: param for name, param in state_dict.items() if "adapter_wte" in name or "gating_factor" in name}
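
The key idea in adapter.py above is that the prefix cross-attention branch is blended in through a zero-initialized, learnable gate, so at the start of training the block behaves exactly like the pretrained self-attention. A minimal sketch of that blend (the tensor shapes are illustrative only, not taken from this commit):

import torch

B, nh, T, hs = 1, 32, 8, 128            # batch, heads, tokens, head size (illustrative)
y_self = torch.randn(B, nh, T, hs)      # output of the regular self-attention
y_prefix = torch.randn(B, nh, T, hs)    # output of the prefix cross-attention
gating_factor = torch.zeros(1, nh, 1, 1, requires_grad=True)  # learnable, zero-init

y = y_self + gating_factor * y_prefix   # at initialization this equals y_self exactly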
lit_llama/adapter_v2.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ from torch import Tensor
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+ from lit_llama.adapter import LLaMA
+
+
+ def get_adapter_substrings():
+     substrings = ["adapter_wte", "gating_factor"]  # regular adapter v1 parameters
+     substrings.extend(["adapter_scale", "adapter_bias"])  # adapter v2: new bias and scale used in Linear
+     substrings.extend(["rms_1", "rms_2", "ln_f"])  # adapter v2: RMSNorm parameters are now trainable
+     return substrings
+
+
+ def mark_only_adapter_v2_as_trainable(model: LLaMA) -> None:
+     """Sets `requires_grad=False` for all non-adapter weights."""
+     for name, param in model.named_parameters():
+         param.requires_grad = any(s in name for s in get_adapter_substrings())
+
+
+ def adapter_v2_state_from_state_dict(state_dict: dict) -> dict:
+     """Returns the model state dict with only the adapter weights for saving."""
+     return {name: param for name, param in state_dict.items()
+             if any(s in name for s in get_adapter_substrings())}
+
+
+ def adapter_v2_new_forward(self, input: Tensor) -> Tensor:
+     return self.adapter_scale * (
+         F.linear(input, self.weight, self.bias) + self.adapter_bias
+     )
+
+
+ def adapter_v2_linear_with_bias_and_scale(layer):
+     layer.adapter_bias = torch.nn.Parameter(torch.zeros(layer.weight.shape[0]), requires_grad=True)
+     layer.adapter_scale = torch.nn.Parameter(torch.ones(layer.weight.shape[0]), requires_grad=True)
+     bound_method = adapter_v2_new_forward.__get__(layer, layer.__class__)
+     setattr(layer, 'forward', bound_method)
+     return layer
+
+
+ def add_adapter_v2_parameters_to_linear_layers(model):
+     for module in model.modules():
+         if isinstance(module, nn.Linear):
+             adapter_v2_linear_with_bias_and_scale(module)
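
A hedged usage sketch of the adapter v2 helpers above; the call order is an assumption based on the function names, not a recipe documented in this commit:

from lit_llama.adapter import LLaMA
from lit_llama.adapter_v2 import (
    add_adapter_v2_parameters_to_linear_layers,
    mark_only_adapter_v2_as_trainable,
)

model = LLaMA.from_name("7B")
add_adapter_v2_parameters_to_linear_layers(model)  # patch every nn.Linear with adapter_bias / adapter_scale
mark_only_adapter_v2_as_trainable(model)           # freeze everything except the adapter parameters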
lit_llama/lora.py ADDED
@@ -0,0 +1,476 @@
1
+ # Derived from https://github.com/microsoft/LoRA
2
+ # ------------------------------------------------------------------------------------------
3
+ # Copyright (c) Microsoft Corporation. All rights reserved.
4
+ # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
5
+ # ------------------------------------------------------------------------------------------
6
+
7
+ r"""
8
+ Low Ranking Adaptation for LLMs scheme.
9
+
10
+ ┌───────────────────┐
11
+ ┆ h ┆
12
+ └───────────────────┘
13
+
14
+ |
15
+ +
16
+ / \
17
+ ┌─────────────────┐ ╭───────────────╮ Matrix initialization:
18
+ ┆ ┆ \ B / B = 0
19
+ ┆ pretrained ┆ \ r*d / A = N(0, sigma^2)
20
+ ┆ weights ┆ ╰─────────╯
21
+ ┆ ┆ | r | r - rank
22
+ ┆ W e R^(d*d) ┆ | ◀─────▶ |
23
+ ┆ ┆ ╭─────────╮
24
+ └─────────────────┘ / A \
25
+ ▲ / d*r \
26
+ \ ╰───────────────╯
27
+ \ ▲
28
+ \ /
29
+ \ /
30
+ ┌───────────────────┐
31
+ ┆ x ┆
32
+ └───────────────────┘
33
+
34
+ With LoRA (Low Ranking Adaptation: https://arxiv.org/abs/2106.09685) instead of learning weights of size d*d,
35
+ we can freeze the pretrained weights and instead learn two matrices of size d*r and r*d (they will store weight updates
36
+ for the pretrained weights): the number of parameters in this case will be reduced drastically (depending on the rank of
37
+ course) yet after multiplication of matrices d*r and r*d we will get a matrix d*d which we can sum with frozen
38
+ pretrained weights and thus fine-tune the model.
39
+
40
+ The goal of this approach is to move weight updates into a separate matrix which is decomposed with
41
+ two matrices of a lower rank.
42
+ """
43
+
44
+ import torch
45
+ import torch.nn as nn
46
+ import torch.nn.functional as F
47
+
48
+ import math
49
+ from typing import Dict, List
50
+
51
+ import lit_llama.model as llama
52
+
53
+ from contextlib import contextmanager
54
+ from dataclasses import dataclass
55
+
56
+
57
+ class LoRALayer():
58
+ def __init__(
59
+ self,
60
+ r: int,
61
+ lora_alpha: int,
62
+ lora_dropout: float,
63
+ merge_weights: bool,
64
+ ):
65
+ """Store LoRA specific attributes in a class.
66
+
67
+ Args:
68
+ r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of
69
+ the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2)
70
+ lora_alpha: alpha is needed for scaling updates as alpha/r
71
+ "This scaling helps to reduce the need to retune hyperparameters when we vary r"
72
+ https://arxiv.org/pdf/2106.09685.pdf (section 4.1)
73
+ lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A)
74
+ merge_weights: whether we want to merge pretrained weights and LoRA weight updates. This is useful if one wants to use
75
+ fine-tuned model as a standalone one (without storing LoRA weights separately) plus it helps to reduce
76
+ overhead during inference.
77
+ """
78
+ self.r = r
79
+ self.lora_alpha = lora_alpha
80
+ # Optional dropout
81
+ if lora_dropout > 0.:
82
+ self.lora_dropout = nn.Dropout(p=lora_dropout)
83
+ else:
84
+ self.lora_dropout = lambda x: x
85
+ # Mark the weight as unmerged
86
+ self.merged = False
87
+ self.merge_weights = merge_weights
88
+
89
+
90
+ class MergedLinear(nn.Linear, LoRALayer):
91
+ # LoRA implemented in a dense layer
92
+ def __init__(
93
+ self,
94
+ # ↓ this part is for pretrained weights
95
+ in_features: int,
96
+ out_features: int,
97
+ # ↓ the remaining part is for LoRA
98
+ r: int = 0,
99
+ lora_alpha: int = 1,
100
+ lora_dropout: float = 0.,
101
+ enable_lora: List[bool] = [False],
102
+ fan_in_fan_out: bool = False,
103
+ merge_weights: bool = True,
104
+ **kwargs
105
+ ):
106
+ """LoRA wrapper around linear class that is used for calculation of q, k and v matrices.
107
+
108
+ This class has three weight matrices:
109
+ 1. Pretrained weights are stored as `self.weight` (because of the nn.Linear inheritance)
110
+ 2. LoRA A matrix as `self.lora_A`
111
+ 3. LoRA B matrix as `self.lora_B`
112
+ Only LoRA's A and B matrices are updated, pretrained weights stay frozen.
113
+
114
+ Args:
115
+ in_features: number of input features of the pretrained weights
116
+ out_features: number of output features of the pretrained weights
117
+ r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of
118
+ the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2)
119
+ lora_alpha: alpha is needed for scaling updates as alpha/r
120
+ "This scaling helps to reduce the need to retune hyperparameters when we vary r"
121
+ https://arxiv.org/pdf/2106.09685.pdf (section 4.1)
122
+ lora_dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A)
123
+ enable_lora: MergeLinear class is for attention mechanism where qkv are calculated with a single weight matrix. If we
124
+ don't want to apply LoRA for all three (query, key and value) we can set it as False. For example if we want
125
+ to apply LoRA only to `query` and `value` but keep `key` without weight updates we should pass `[True,
126
+ False, True]`
127
+ fan_in_fan_out: set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
128
+ `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`
129
+ https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora.py#LL53C9-L53C112
130
+ merge_weights: whether we want to merge pretrained weights and LoRA weight updates. This is useful if one wants to use
131
+ fine-tuned model as a standalone one (without storing LoRA weight separately) plus it helps to reduce
132
+ overhead during inference.
133
+ """
134
+ nn.Linear.__init__(self, in_features, out_features, **kwargs)
135
+ LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
136
+ merge_weights=merge_weights)
137
+ assert out_features % len(enable_lora) == 0, \
138
+ 'The length of enable_lora must divide out_features'
139
+ self.enable_lora = enable_lora
140
+ self.fan_in_fan_out = fan_in_fan_out
141
+
142
+ # Actual trainable parameters
143
+ # To better understand initialization let's imagine that we have such parameters:
144
+ # ⚬ in_features: 128 (embeddings_size)
145
+ # ⚬ out_features: 384 (3 * embedding_size)
146
+ # ⚬ r: 2
147
+ # ⚬ enable_lora: [True, False, True]
148
+ if r > 0 and any(enable_lora):
149
+ self.lora_A = nn.Parameter(
150
+ self.weight.new_zeros((r * sum(enable_lora), in_features))) # (4, 128)
151
+ self.lora_B = nn.Parameter(
152
+ self.weight.new_zeros((out_features // len(enable_lora) * sum(enable_lora), r)) # (256, 2)
153
+ ) # weights for Conv1D with groups=sum(enable_lora)
154
+ # Notes about shapes above
155
+ # - self.lora_A has shape (4, 128): 4 because rank is 2 and LoRA is applied only to two matrices;
156
+ # 128 is the input size of the x (embedding size). (4, 128) and not (128, 4) because later on in
157
+ # F.linear function weights are automatically transposed. In addition conv1d requires channels to
158
+ # be before seq length
159
+ # - self.lora_B has shape (256, 2): 256 because LoRA is applied only to two matrices, so the output is
160
+ # 128*2; 2 tells to have two channels per group for group convolution
161
+
162
+ # Scaling:
163
+ # This balances the pretrained model`s knowledge and the new task-specific adaptation
164
+ # https://lightning.ai/pages/community/tutorial/lora-llm/
165
+ # So, set alpha to 1.0 to fully add LoRA. If the LoRA seems to have too much effect (i.e., overfitted), set
166
+ # alpha to lower value. If the LoRA seems to have too little effect, set alpha to higher than 1.0. You can
167
+ # tune these values to your needs. This value can be even slightly greater than 1.0!
168
+ # https://github.com/cloneofsimo/lora
169
+ self.scaling = self.lora_alpha / self.r
170
+
171
+ # Freezing the pre-trained weight matrix
172
+ self.weight.requires_grad = False # (384, 128)
173
+
174
+ # Compute the indices
175
+ # Indices are needed to properly pad weight updates with zeros. If we want to fine-tune queries and values,
176
+ # but not keys, then the weights update should be:
177
+ #
178
+ # [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,],
179
+ # [....................................],
180
+ # [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]]
181
+ # ↑ ↑ ↑
182
+ # ________________________________________
183
+ # | query | key | value |
184
+ # ----------------------------------------
185
+ self.lora_ind = self.weight.new_zeros(
186
+ (out_features, ), dtype=torch.bool
187
+ ).view(len(enable_lora), -1) # (3, 128)
188
+ self.lora_ind[enable_lora, :] = True # (3, 128)
189
+ self.lora_ind = self.lora_ind.view(-1) # (384,)
190
+ self.reset_parameters()
191
+ if fan_in_fan_out:
192
+ self.weight.data = self.weight.data.T
193
+
194
+ def reset_parameters(self):
195
+ """Reset all the weights, even including pretrained ones."""
196
+ nn.Linear.reset_parameters(self)
197
+ if hasattr(self, 'lora_A'):
198
+ # initialize A the same way as the default for nn.Linear and B to zero
199
+ # Wondering why 'a' is equal to math.sqrt(5)?: https://github.com/pytorch/pytorch/issues/15314
200
+ nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
201
+ nn.init.zeros_(self.lora_B)
202
+
203
+ def zero_pad(self, x: torch.Tensor) -> torch.Tensor:
204
+ """Properly pad weight updates with zeros.
205
+
206
+ If, based on `self.enable_lora`, we want to fine-tune queries and values, but not keys,
207
+ then the weights update should be:
208
+
209
+ [[ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,],
210
+ [....................................],
211
+ [ΔW,ΔW,ΔW, ..., 0,0,0, ..., ΔW,ΔW,ΔW,]]
212
+ ↑ ↑ ↑
213
+ ________________________________________
214
+ | query | key | value |
215
+ ----------------------------------------
216
+
217
+ Args:
218
+ x: tensor with weights update that will be padded with zeros if necessary
219
+
220
+ Returns:
221
+ A tensor with weight updates and zeros for deselected q, k or v
222
+ """
223
+ # Let's image that:
224
+ # ⚬ input x has shape (64, 64, 256): (batch_size, sequence_length, embeddings_size)
225
+ # ⚬ embeddings_size: 128
226
+ # ⚬ self.out_features: 384 (3 * embeddings_size)
227
+ # ⚬ enable_lora: [True, False, True]
228
+ # Then x has embeddings_size of 256 (2 * 128 as enable_lora only for query and value, not keys) and expected
229
+ # embeddings_size is 384 (self.out_features), so that means that we need to pad from 256 to 384 with zeros, but
230
+ # only for key updates (this is where self.lora_ind comes in handy)
231
+ # Note: double transpose (in the beginning and in the end) is basically a guard for two-dimensional tensors
232
+ # for example when we want to merge/unmerge LoRA weights and pretrained weights
233
+ x = x.transpose(0, 1)
234
+ result = x.new_zeros((*x.shape[:-1], self.out_features)) # (64, 64, 384)
235
+ result = result.view(-1, self.out_features) # (4096, 384)
236
+ result[:, self.lora_ind] = x.reshape(
237
+ -1, self.out_features // len(self.enable_lora) * sum(self.enable_lora)
238
+ ) # (4096, 256)
239
+ return result.view((*x.shape[:-1], self.out_features)).transpose(0, 1) # (64, 64, 384)
240
+
241
+ def train(self, mode: bool = True):
242
+ """Set the module into train or eval mode if `mode` is True of False respectively.
243
+
244
+ For train mode (train(True)) if weights are merged we need to subtract weights updates (LoRA_A @ LoRA_B) from
245
+ pretrained weights so we can continue training LoRA's matrices A and B and keep pretrained weights frozen.
246
+
247
+ For eval mode (train(False)) if weights are not merged we need to add weight updates to pretrained weights in
248
+ order to reduce computational overhead during inference.
249
+
250
+ Args:
251
+ mode: if True the module will be set into train mode (affects Dropout and BatchNorm), if False - eval mode.
252
+
253
+ """
254
+ def T(w):
255
+ return w.T if self.fan_in_fan_out else w
256
+ # despite being called from nn.Linear this method will put all layers into train mode, including nn.Dropout
257
+ # of course except parameters (such as self.lora_A, self.lora_B)
258
+ nn.Linear.train(self, mode)
259
+
260
+ # if train(True) -> unmerge unless we already have them unmerged
261
+ # if train(False) -> merge unless we already have them merged
262
+ should = self.merged if mode else not self.merged
263
+
264
+ # Let's assume that:
265
+ # ⚬ self.weight.data: (384, 128) or (3 * embedding_size, embedding_size)
266
+ # ⚬ self.lora_A.data: (4, 128)
267
+ # ⚬ self.lora_B.data: (256, 2)
268
+ if self.merge_weights and should:
269
+ if self.r > 0 and any(self.enable_lora):
270
+ delta_w = F.conv1d(
271
+ self.lora_A.data.unsqueeze(0), # (4, 128) -> (1, 4, 128)
272
+ self.lora_B.data.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
273
+ groups=sum(self.enable_lora)
274
+ ).squeeze(0) # (1, 4, 128) @ (256, 2, 1) -> (1, 256, 128) -> (256, 128)
275
+ # -1: W = W - delta_W (unmerge), +1: W = W + delta_W (merge)
276
+ sign = -1 if mode else 1
277
+ self.weight.data += sign * self.zero_pad(T(delta_w * self.scaling)) # (256, 128) after zero_pad (384, 128)
278
+ self.merged = not mode
279
+
280
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
281
+ """Do the forward pass.
282
+
283
+ If LoRA's weights are merged with pretrained ones then it's a simple matrix multiplication.
284
+ If not, then multiply pretrained weights with input, apply LoRA on input and do summation.
285
+
286
+ Args:
287
+ x: input tensor of shape (batch_size, context_length, embedding_size)
288
+
289
+ Returns:
290
+ Output tensor of shape (batch_size, context_length, 3 * embedding_size)
291
+ """
292
+ def T(w):
293
+ return w.T if self.fan_in_fan_out else w
294
+
295
+ # Let's assume that:
296
+ # ⚬ x: (64, 64, 128) or (batch_size, context_length, embedding_size)
297
+ # ⚬ self.weight: (384, 128) or (3 * embedding_size, embedding_size)
298
+ # ⚬ self.lora_A.data: (4, 128)
299
+ # ⚬ self.lora_B.data: (256, 2)
300
+
301
+ # the logic here is that the weights are merged only during inference
302
+ # so if they are merged we don't need to do anything with LoRA's A and B matrices
303
+ # but if the weights are not merged that means that the forward method is called during
304
+ # training and we need to forward pass input through pretrained weights, LoRA A and B matrices
305
+ # and do the summation (as per scheme at the top of the file)
306
+ if self.merged:
307
+ return F.linear(x, T(self.weight), bias=self.bias)
308
+ else:
309
+ # `F.linear` automatically transposes the second argument (T(self.weight) in our case)
310
+ result = F.linear(x, T(self.weight), bias=self.bias) # (64, 64, 128) @ (384, 128) -> (64, 64, 384)
311
+ if self.r > 0:
312
+ after_A = F.linear(self.lora_dropout(x), self.lora_A) # (64, 64, 128) @ (4, 128) -> (64, 64, 4)
313
+ # For F.conv1d:
314
+ # ⚬ input: input tensor of shape (mini-batch, in_channels, iW)
315
+ # ⚬ weight: filters of shape (out_channels, in_channels/groups, kW)
316
+ # ⚬ groups: split input into groups, in_channels should be divisible by the number of groups. Default: 1
317
+ # presumably iW - sequence width/length, kW - kernel width
318
+ after_B = F.conv1d(
319
+ after_A.transpose(-2, -1), # (64, 64, 4) -> (64, 4, 64)
320
+ self.lora_B.unsqueeze(-1), # (256, 2) -> (256, 2, 1)
321
+ groups=sum(self.enable_lora)
322
+ ).transpose(-2, -1) # (64, 4, 64) @ (256, 2, 1) -> (64, 256, 64) -> (64, 64, 256)
323
+ result += self.zero_pad(after_B) * self.scaling # (64, 64, 256) after zero_pad (64, 64, 384)
324
+ return result
325
+
326
+
327
+ def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None:
328
+ """Freeze all modules except LoRA's and depending on 'bias' value unfreezes bias weights.
329
+
330
+ Args:
331
+ model: model with LoRA layers
332
+ bias:
333
+ ``"none"``: all bias weights will be frozen,
334
+ ``"lora_only"``: only bias weight for LoRA layers will be unfrozen,
335
+ ``"all"``: all bias weights will be unfrozen.
336
+
337
+ Raises:
338
+ NotImplementedError: if `bias` not in ["none", "lora_only", "all"]
339
+ """
340
+ # freeze all layers except LoRA's
341
+ for n, p in model.named_parameters():
342
+ if 'lora_' not in n:
343
+ p.requires_grad = False
344
+
345
+ # depending on the `bias` value unfreeze bias weights
346
+ if bias == 'none':
347
+ return
348
+ elif bias == 'all':
349
+ for n, p in model.named_parameters():
350
+ if 'bias' in n:
351
+ p.requires_grad = True
352
+ elif bias == 'lora_only':
353
+ for m in model.modules():
354
+ if isinstance(m, LoRALayer) and \
355
+ hasattr(m, 'bias') and \
356
+ m.bias is not None:
357
+ m.bias.requires_grad = True
358
+ else:
359
+ raise NotImplementedError
360
+
361
+
362
+ def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]:
363
+ """Return state_dict with weights of LoRA's A and B matrices and with biases depending on the `bias` value.
364
+
365
+ Args:
366
+ model: model with LoRA layers
367
+ bias:
368
+ ``"none"``: state dict will not store bias weights,
369
+ ``"lora_only"``: state dict will store bias weights only from LoRA layers,
370
+ ``"all"``: state dict will store all bias weights.
371
+
372
+ Returns:
373
+ Weights and biases of LoRA layers
374
+
375
+ Raises:
376
+ NotImplementedError: if `bias` not in ["none", "lora_only", "all"]
377
+ """
378
+ my_state_dict = model.state_dict()
379
+ if bias == 'none':
380
+ return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k}
381
+ elif bias == 'all':
382
+ return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k}
383
+ elif bias == 'lora_only':
384
+ to_return = {}
385
+ for k in my_state_dict:
386
+ if 'lora_' in k:
387
+ to_return[k] = my_state_dict[k]
388
+ bias_name = k.split('lora_')[0]+'bias'
389
+ if bias_name in my_state_dict:
390
+ to_return[bias_name] = my_state_dict[bias_name]
391
+ return to_return
392
+ else:
393
+ raise NotImplementedError
394
+
395
+
396
+ @dataclass
397
+ class LoRAConfig:
398
+ r: float = 0.0
399
+ alpha: float = 1.0
400
+ dropout: float = 0.0
401
+
402
+
403
+ class CausalSelfAttention(llama.CausalSelfAttention):
404
+ lora_config = None
405
+
406
+ def __init__(self, config: llama.LLaMAConfig) -> None:
407
+ """Causal self-attention with calculating qkv matrices with a single matrix* and Low Ranking Adaptation for
408
+ parameter-efficient fine-tuning.
409
+
410
+ *Instead of creating multiple heads and concatenating the result (in addition to creating separate matrices for
411
+ query, key and value for each head) we can do this in a single pass with a single weight matrix.
412
+
413
+ Args:
414
+ config:
415
+ ``"block_size"``: size of the context of the model,
416
+ ``"vocab_size"``: number of unique tokens,
417
+ ``"padded_vocab_size"``: padded size of the vocabulary to the nearest multiple of 64 (leads to a greater performance),
418
+ ``"n_layer"``: number of transformer blocks (self-attention + MLP),
419
+ ``"n_head"``: number of heads in multi-head attention mechanism,
420
+ ``"n_embd"``: size of the embedding: vector representation of each token.
421
+ """
422
+ # Skip the parent class __init__ altogether and replace it to avoid
423
+ # useless allocations
424
+ nn.Module.__init__(self)
425
+ assert config.n_embd % config.n_head == 0
426
+
427
+ # key, query, value projections for all heads, but in a batch
428
+ self.c_attn = MergedLinear(
429
+ in_features=config.n_embd,
430
+ out_features=3 * config.n_embd,
431
+ r=self.lora_config.r,
432
+ lora_alpha=self.lora_config.alpha,
433
+ lora_dropout=self.lora_config.dropout,
434
+ enable_lora=[True, False, True],
435
+ fan_in_fan_out = False,
436
+ merge_weights=True,
437
+ bias=False)
438
+ # output projection
439
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
440
+ # regularization
441
+ self.n_head = config.n_head
442
+ self.n_embd = config.n_embd
443
+ self.block_size = config.block_size
444
+ self.rope_cache = None
445
+
446
+
447
+ @contextmanager
448
+ def lora(r, alpha, dropout, enabled: bool = True):
449
+ """Apply context manager under which you can instantiate the model with LoRA.
450
+
451
+ In a nutshell the code inside this function forces to use LoRA variant of causal self-attention
452
+ instead of the original one (without LoRA).
453
+
454
+ Args:
455
+ r: rank of the weight update matrices. To make sense of using LoRA the rank should be smaller than the rank of
456
+ the weights of the model. The rank can be as low as 1: https://arxiv.org/pdf/2106.09685.pdf (section 7.2)
457
+ alpha: alpha is needed for scaling updates as alpha/r
458
+ "This scaling helps to reduce the need to retune hyperparameters when we vary r"
459
+ https://arxiv.org/pdf/2106.09685.pdf (section 4.1)
460
+ dropout: dropout that is applied on the input in the LoRA branch (before multiplying by matrix A)
461
+ enabled: enables/disables LoRA
462
+ """
463
+ if not enabled:
464
+ yield
465
+ return
466
+
467
+ CausalSelfAttention.lora_config = LoRAConfig(r=r, alpha=alpha, dropout=dropout)
468
+ # when entering context manager replace link to causal self-attention class from original
469
+ # to a variant with LoRA
470
+ causal_self_attention = llama.CausalSelfAttention
471
+ llama.CausalSelfAttention = CausalSelfAttention
472
+ yield
473
+ # when exiting context manager - restore link to original causal self-attention class
474
+ llama.CausalSelfAttention = causal_self_attention
475
+
476
+ CausalSelfAttention.lora_config = None
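
The weight-update math that `MergedLinear` above implements reduces to a low-rank product scaled by alpha/r and added to the frozen pretrained weight. A minimal sketch with plain tensors (the sizes are illustrative only):

import torch

d_in, d_out, r, alpha = 128, 128, 2, 1.0    # illustrative sizes
W = torch.randn(d_out, d_in)                # frozen pretrained weight
lora_A = torch.randn(r, d_in) * 0.01        # trainable, small random init
lora_B = torch.zeros(d_out, r)              # trainable, zero init -> no change at start

scaling = alpha / r
W_merged = W + scaling * (lora_B @ lora_A)  # what "merging" the LoRA update means
# Because lora_B starts at zero, W_merged == W at initialization, so training
# begins from the unmodified pretrained behavior.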
lit_llama/model.py ADDED
@@ -0,0 +1,321 @@
1
+ """Full definition of a LLaMA Language Model, all of it in this single file.
2
+
3
+ Based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
4
+ """
5
+ # mypy: ignore-errors
6
+ import math
7
+ from dataclasses import dataclass
8
+ from typing import List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.nn import functional as F
13
+ from typing_extensions import Self
14
+
15
+ from lit_llama.utils import find_multiple
16
+
17
+
18
+ MaskCache = torch.Tensor
19
+ RoPECache = torch.Tensor
20
+ KVCache = Tuple[torch.Tensor, torch.Tensor]
21
+
22
+
23
+ @dataclass
24
+ class LLaMAConfig:
25
+ block_size: int = 2048
26
+ vocab_size: int = 32000
27
+ padded_vocab_size: Optional[int] = None
28
+ n_layer: int = 32
29
+ n_head: int = 32
30
+ n_embd: int = 4096
31
+
32
+ def __post_init__(self):
33
+ if self.padded_vocab_size is None:
34
+ self.padded_vocab_size = find_multiple(self.vocab_size, 64)
35
+
36
+ @classmethod
37
+ def from_name(cls, name: str) -> Self:
38
+ return cls(**llama_configs[name])
39
+
40
+
41
+ llama_configs = {
42
+ "7B": dict(n_layer=32, n_head=32, n_embd=4096),
43
+ "13B": dict(n_layer=40, n_head=40, n_embd=5120),
44
+ "30B": dict(n_layer=60, n_head=52, n_embd=6656),
45
+ "65B": dict(n_layer=80, n_head=64, n_embd=8192),
46
+ }
47
+
48
+
49
+ class LLaMA(nn.Module):
50
+ def __init__(self, config: LLaMAConfig) -> None:
51
+ super().__init__()
52
+ assert config.padded_vocab_size is not None
53
+ self.config = config
54
+
55
+ self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=False)
56
+ self.transformer = nn.ModuleDict(
57
+ dict(
58
+ wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
59
+ h=nn.ModuleList(Block(config) for _ in range(config.n_layer)),
60
+ ln_f=RMSNorm(config.n_embd),
61
+ )
62
+ )
63
+
64
+ self.rope_cache: Optional[RoPECache] = None
65
+ self.mask_cache: Optional[MaskCache] = None
66
+ self.kv_caches: List[KVCache] = []
67
+
68
+ def _init_weights(self, module: nn.Module) -> None:
69
+ if isinstance(module, nn.Linear):
70
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer))
71
+ elif isinstance(module, nn.Embedding):
72
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer))
73
+
74
+ def forward(
75
+ self, idx: torch.Tensor, max_seq_length: Optional[int] = None, input_pos: Optional[torch.Tensor] = None
76
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[KVCache]]]:
77
+ B, T = idx.size()
78
+
79
+ block_size = self.config.block_size
80
+ if max_seq_length is None:
81
+ max_seq_length = block_size
82
+ assert T <= max_seq_length, f"Cannot forward sequence of length {T}, max seq length is only {max_seq_length}"
83
+ assert max_seq_length <= block_size, f"Cannot attend to {max_seq_length}, block size is only {block_size}"
84
+ assert T <= block_size, f"Cannot forward sequence of length {T}, block size is only {block_size}"
85
+
86
+ if self.rope_cache is None:
87
+ self.rope_cache = self.build_rope_cache(idx)
88
+ if self.mask_cache is None:
89
+ self.mask_cache = self.build_mask_cache(idx)
90
+
91
+ if input_pos is not None:
92
+ rope = self.rope_cache.index_select(0, input_pos)
93
+ mask = self.mask_cache.index_select(2, input_pos)
94
+ mask = mask[:, :, :, :max_seq_length]
95
+ else:
96
+ rope = self.rope_cache[:T]
97
+ mask = self.mask_cache[:, :, :T, :T]
98
+
99
+ # forward the model itself
100
+ x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
101
+
102
+ if input_pos is None: # proxy for use_cache=False
103
+ for block in self.transformer.h:
104
+ x, _ = block(x, rope, mask, max_seq_length)
105
+ else:
106
+ if not self.kv_caches:
107
+ head_size = self.config.n_embd // self.config.n_head
108
+ cache_shape = (B, self.config.n_head, max_seq_length, head_size)
109
+ self.kv_caches = [
110
+ (torch.zeros(cache_shape, device=x.device, dtype=x.dtype), torch.zeros(cache_shape, device=x.device, dtype=x.dtype))
111
+ for _ in range(self.config.n_layer)
112
+ ]
113
+ for i, block in enumerate(self.transformer.h):
114
+ x, self.kv_caches[i] = block(x, rope, mask, max_seq_length, input_pos, self.kv_caches[i])
115
+
116
+ x = self.transformer.ln_f(x)
117
+
118
+ logits = self.lm_head(x) # (b, t, vocab_size)
119
+
120
+ return logits
121
+
122
+ @classmethod
123
+ def from_name(cls, name: str) -> Self:
124
+ return cls(LLaMAConfig.from_name(name))
125
+
126
+ def build_rope_cache(self, idx: torch.Tensor) -> RoPECache:
127
+ return build_rope_cache(
128
+ seq_len=self.config.block_size,
129
+ n_elem=self.config.n_embd // self.config.n_head,
130
+ dtype=idx.dtype,
131
+ device=idx.device,
132
+ )
133
+
134
+ def build_mask_cache(self, idx: torch.Tensor) -> MaskCache:
135
+ ones = torch.ones((self.config.block_size, self.config.block_size), device=idx.device, dtype=torch.bool)
136
+ return torch.tril(ones).unsqueeze(0).unsqueeze(0)
137
+
138
+ def reset_cache(self) -> None:
139
+ self.kv_caches.clear()
140
+ if self.mask_cache.device.type == "xla":
141
+ # https://github.com/Lightning-AI/lit-parrot/pull/83#issuecomment-1558150179
142
+ self.rope_cache = None
143
+ self.mask_cache = None
144
+
145
+
146
+ class Block(nn.Module):
147
+ def __init__(self, config: LLaMAConfig) -> None:
148
+ super().__init__()
149
+ self.rms_1 = RMSNorm(config.n_embd)
150
+ self.attn = CausalSelfAttention(config)
151
+ self.rms_2 = RMSNorm(config.n_embd)
152
+ self.mlp = MLP(config)
153
+
154
+ def forward(
155
+ self,
156
+ x: torch.Tensor,
157
+ rope: RoPECache,
158
+ mask: MaskCache,
159
+ max_seq_length: int,
160
+ input_pos: Optional[torch.Tensor] = None,
161
+ kv_cache: Optional[KVCache] = None,
162
+ ) -> Tuple[torch.Tensor, Optional[KVCache]]:
163
+ h, new_kv_cache = self.attn(self.rms_1(x), rope, mask, max_seq_length, input_pos, kv_cache)
164
+ x = x + h
165
+ x = x + self.mlp(self.rms_2(x))
166
+ return x, new_kv_cache
167
+
168
+
169
+ class CausalSelfAttention(nn.Module):
170
+ def __init__(self, config: LLaMAConfig) -> None:
171
+ super().__init__()
172
+ assert config.n_embd % config.n_head == 0
173
+
174
+ # key, query, value projections for all heads, but in a batch
175
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
176
+ # output projection
177
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
178
+
179
+ self.n_head = config.n_head
180
+ self.n_embd = config.n_embd
181
+ self.block_size = config.block_size
182
+
183
+ def forward(
184
+ self,
185
+ x: torch.Tensor,
186
+ rope: RoPECache,
187
+ mask: MaskCache,
188
+ max_seq_length: int,
189
+ input_pos: Optional[torch.Tensor] = None,
190
+ kv_cache: Optional[KVCache] = None,
191
+ ) -> Tuple[torch.Tensor, Optional[KVCache]]:
192
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
193
+
194
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
195
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
196
+
197
+ head_size = C // self.n_head
198
+ k = k.view(B, T, self.n_head, head_size)
199
+ q = q.view(B, T, self.n_head, head_size)
200
+ v = v.view(B, T, self.n_head, head_size)
201
+
202
+ q = apply_rope(q, rope)
203
+ k = apply_rope(k, rope)
204
+
205
+ k = k.transpose(1, 2) # (B, nh, T, hs)
206
+ q = q.transpose(1, 2) # (B, nh, T, hs)
207
+ v = v.transpose(1, 2) # (B, nh, T, hs)
208
+
209
+ if kv_cache is not None:
210
+ cache_k, cache_v = kv_cache
211
+ # check if reached token limit
212
+ if input_pos[-1] >= max_seq_length:
213
+ input_pos = torch.tensor(max_seq_length - 1, device=input_pos.device)
214
+ # shift 1 position to the left
215
+ cache_k = torch.roll(cache_k, -1, dims=2)
216
+ cache_v = torch.roll(cache_v, -1, dims=2)
217
+ k = cache_k.index_copy(2, input_pos, k)
218
+ v = cache_v.index_copy(2, input_pos, v)
219
+ kv_cache = k, v
220
+
221
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
222
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
223
+ # att = att.masked_fill(mask[:,:,:T,:T] == 0, float('-inf'))
224
+ # att = F.softmax(att, dim=-1)
225
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
226
+
227
+ # efficient attention using Flash Attention CUDA kernels
228
+ y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
229
+
230
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
231
+
232
+ # output projection
233
+ y = self.c_proj(y)
234
+
235
+ return y, kv_cache
236
+
237
+
238
+ class MLP(nn.Module):
239
+ def __init__(self, config: LLaMAConfig) -> None:
240
+ super().__init__()
241
+ hidden_dim = 4 * config.n_embd
242
+ n_hidden = int(2 * hidden_dim / 3)
243
+ n_hidden = find_multiple(n_hidden, 256)
244
+
245
+ self.c_fc1 = nn.Linear(config.n_embd, n_hidden, bias=False)
246
+ self.c_fc2 = nn.Linear(config.n_embd, n_hidden, bias=False)
247
+ self.c_proj = nn.Linear(n_hidden, config.n_embd, bias=False)
248
+
249
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
250
+ x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
251
+ x = self.c_proj(x)
252
+ return x
253
+
254
+
255
+ class RMSNorm(nn.Module):
256
+ """Root Mean Square Layer Normalization.
257
+
258
+ Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License:
259
+ https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE.
260
+ """
261
+
262
+ def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None:
263
+ super().__init__()
264
+ self.scale = nn.Parameter(torch.ones(size))
265
+ self.eps = eps
266
+ self.dim = dim
267
+
268
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
269
+ # NOTE: the original RMSNorm paper implementation is not equivalent
270
+ # norm_x = x.norm(2, dim=self.dim, keepdim=True)
271
+ # rms_x = norm_x * d_x ** (-1. / 2)
272
+ # x_normed = x / (rms_x + self.eps)
273
+ norm_x = torch.mean(x * x, dim=self.dim, keepdim=True)
274
+ x_normed = x * torch.rsqrt(norm_x + self.eps)
275
+ return self.scale * x_normed
276
+
277
+
278
+ def build_rope_cache(
279
+ seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
280
+ ) -> RoPECache:
281
+ """Enhanced Transformer with Rotary Position Embedding.
282
+
283
+ Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
284
+ transformers/rope/__init__.py. MIT License:
285
+ https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
286
+ """
287
+ # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
288
+ theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem))
289
+
290
+ # Create position indexes `[0, 1, ..., seq_len - 1]`
291
+ seq_idx = torch.arange(seq_len, dtype=dtype, device=device)
292
+
293
+ # Calculate the product of position index and $\theta_i$
294
+ idx_theta = torch.outer(seq_idx, theta).float()
295
+
296
+ cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
297
+
298
+ # this is to mimic the behaviour of complex32, else we will get different results
299
+ if dtype in (torch.float16, torch.bfloat16, torch.int8):
300
+ cache = cache.half()
301
+ return cache
302
+
303
+
304
+ def apply_rope(x: torch.Tensor, rope_cache: RoPECache) -> torch.Tensor:
305
+ # truncate to support variable sizes
306
+ T = x.size(1)
307
+ rope_cache = rope_cache[:T]
308
+
309
+ # cast because the reference does
310
+ xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
311
+ rope_cache = rope_cache.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
312
+ x_out2 = torch.stack(
313
+ [
314
+ xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
315
+ xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
316
+ ],
317
+ -1,
318
+ )
319
+
320
+ x_out2 = x_out2.flatten(3)
321
+ return x_out2.type_as(x)
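A minimal standalone sketch of the rotary-embedding helpers above (not part of the commit; it assumes they live in lit_llama/model.py and uses small hypothetical shapes where n_elem equals the per-head size):
# Sketch only: exercise build_rope_cache/apply_rope on a tiny (B, T, n_head, head_size) tensor.
import torch
from lit_llama.model import build_rope_cache, apply_rope  # assumed module path

B, T, n_head, head_size = 1, 4, 2, 8
q = torch.randn(B, T, n_head, head_size)
rope = build_rope_cache(seq_len=T, n_elem=head_size, dtype=q.dtype, device=q.device)
q_rot = apply_rope(q, rope)   # rotates each (cos, sin) pair; shape is preserved
assert q_rot.shape == q.shape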
lit_llama/packed_dataset.py ADDED
@@ -0,0 +1,260 @@
1
+ # Very loosely inspired by indexed_dataset in Fairseq, Megatron
2
+ # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py
3
+
4
+
5
+ import os
6
+ import struct
7
+ import random
8
+
9
+ import numpy as np
10
+ import torch
11
+ from torch.utils.data import IterableDataset, get_worker_info
12
+
13
+
14
+ dtypes = {
15
+ 1: np.uint8,
16
+ 2: np.int8,
17
+ 3: np.int16,
18
+ 4: np.int32,
19
+ 5: np.int64,
20
+ 6: np.float32,
21
+ 7: np.float64,
22
+ 8: np.uint16,
23
+ }
24
+
25
+
26
+ def code(dtype):
27
+ for k in dtypes.keys():
28
+ if dtypes[k] == dtype:
29
+ return k
30
+ raise ValueError(dtype)
31
+
32
+
33
+ HDR_MAGIC = b"LITPKDS"
34
+ HDR_SIZE = 24 # bytes
35
+
36
+
37
+ class PackedDataset(IterableDataset):
38
+ def __init__(self, filenames, n_chunks, block_size, seed=12345, shuffle=True, wrap=False, num_processes=1, process_rank=0):
39
+ self._filenames = filenames
40
+ self._n_chunks = n_chunks
41
+ self._block_size = block_size
42
+ self._seed = seed
43
+ self._shuffle = shuffle
44
+ self._wrap = wrap
45
+ self._num_processes = num_processes
46
+ self._process_rank = process_rank
47
+
48
+ def __iter__(self):
49
+ worker_info = get_worker_info()
50
+ num_workers = worker_info.num_workers if worker_info is not None else 1
51
+ worker_id = worker_info.id if worker_info is not None else 0
52
+ num_shards = num_workers * self._num_processes
53
+ shard_id = self._process_rank * num_workers + worker_id
54
+
55
+ max_num_files = len(self._filenames) // num_shards * num_shards
56
+ filenames = self._filenames[shard_id : max_num_files : num_shards]
57
+
58
+ return PackedDatasetIterator(
59
+ filenames=filenames,
60
+ n_chunks=self._n_chunks,
61
+ block_size=self._block_size,
62
+ seed=self._seed,
63
+ shuffle=self._shuffle,
64
+ wrap=self._wrap,
65
+ )
66
+
67
+
68
+ class PackedDatasetBuilder(object):
69
+ def __init__(
70
+ self,
71
+ outdir,
72
+ prefix,
73
+ chunk_size,
74
+ sep_token,
75
+ dtype="auto",
76
+ vocab_size=None,
77
+ ):
78
+ if dtype == "auto":
79
+ if vocab_size is None:
80
+ raise ValueError("vocab_size cannot be None when dtype='auto'")
81
+ if vocab_size is not None and vocab_size < 65500:
82
+ self._dtype = np.uint16
83
+ else:
84
+ self._dtype = np.int32
85
+ else:
86
+ self._dtype = dtype
87
+ self._counter = 0
88
+ self._chunk_size = chunk_size
89
+ self._outdir = outdir
90
+ self._prefix = prefix
91
+ self._sep_token = sep_token
92
+ self._arr = np.zeros(self._chunk_size, dtype=self._dtype)
93
+ self._arr.fill(self._sep_token)
94
+ self._idx = 0
95
+ self._version = 1
96
+ self._filenames = []
97
+
98
+ def _write_chunk(self):
99
+ filename = f"{self._prefix}_{self._counter:010d}.bin"
100
+ filename = os.path.join(self._outdir, filename)
101
+
102
+ with open(filename, "wb") as f:
103
+ f.write(HDR_MAGIC)
104
+ f.write(struct.pack("<Q", self._version))
105
+ f.write(struct.pack("<B", code(self._dtype)))
106
+ f.write(struct.pack("<Q", self._chunk_size))
107
+ f.write(self._arr.tobytes(order="C"))
108
+
109
+ self._filenames.append(filename)
110
+ self._counter += 1
111
+ self._arr.fill(self._sep_token)
112
+ self._idx = 0
113
+
114
+ @property
115
+ def dtype(self):
116
+ return self._dtype
117
+
118
+ @property
119
+ def filenames(self):
120
+ return self._filenames.copy()
121
+
122
+ def add_array(self, arr):
123
+ while self._idx + arr.shape[0] > self._chunk_size:
124
+ part_len = self._chunk_size - self._idx
125
+ self._arr[self._idx : self._idx + part_len] = arr[:part_len]
126
+ self._write_chunk()
127
+ arr = arr[part_len:]
128
+
129
+ arr_len = arr.shape[0]
130
+ self._arr[self._idx : self._idx + arr_len] = arr
131
+ self._idx += arr_len
132
+
133
+ def write_reminder(self):
134
+ self._write_chunk()
135
+
136
+
137
+ class PackedDatasetIterator:
138
+ def __init__(self, filenames, n_chunks, block_size, seed, shuffle, wrap):
139
+ self._seed = seed
140
+ self._shuffle = shuffle
141
+ self._rng = np.random.default_rng(seed) if shuffle else None
142
+ self._block_idxs = None
143
+
144
+ self._wrap = wrap
145
+
146
+ # TODO: instead of filenames, we could have a single text stream
147
+ # (or text file) with the sequence of all files to be
148
+ # fetched/loaded.
149
+ self._filenames = filenames
150
+ self._file_idx = 0
151
+
152
+ self._n_chunks = n_chunks
153
+
154
+ self._dtype = None
155
+ self._block_size = block_size
156
+ self._n_blocks = None
157
+
158
+ self._mmaps = []
159
+ self._buffers = []
160
+
161
+ self._block_idxs = []
162
+ self._curr_idx = 0
163
+
164
+ self._load_n_chunks()
165
+
166
+ def _read_header(self, path):
167
+ with open(path, "rb") as f:
168
+ magic = f.read(len(HDR_MAGIC))
169
+ assert magic == HDR_MAGIC, "File doesn't match expected format."
170
+ version = struct.unpack("<Q", f.read(8))
171
+ assert (1,) == version
172
+ (dtype_code,) = struct.unpack("<B", f.read(1))
173
+ dtype = dtypes[dtype_code]
174
+ (chunk_size,) = struct.unpack("<Q", f.read(8))
175
+ return dtype, chunk_size
176
+
177
+ def _close_mmaps(self):
178
+ for mmap in self._mmaps:
179
+ mmap._mmap.close()
180
+
181
+ def _load_n_chunks(self):
182
+ self._close_mmaps()
183
+ self._mmaps = []
184
+ self._buffers = []
185
+
186
+ if self._n_chunks > len(self._filenames[self._file_idx:]):
187
+ if not self._wrap:
188
+ raise StopIteration
189
+ else:
190
+ self._file_idx = 0
191
+
192
+ for i in range(self._n_chunks):
193
+ filename = self._filenames[self._file_idx + i]
194
+ if self._dtype is None:
195
+ self._dtype, self._chunk_size = self._read_header(
196
+ filename
197
+ )
198
+ self._n_blocks = self._chunk_size // self._block_size
199
+ # TODO: check header matches with previous files
200
+ mmap = np.memmap(filename, mode="r", order="C", offset=HDR_SIZE)
201
+ self._mmaps.append(mmap)
202
+ self._buffers.append(memoryview(mmap))
203
+
204
+ self._file_idx += self._n_chunks
205
+ n_all_blocks = self._n_chunks * self._n_blocks
206
+
207
+ self._block_idxs = (
208
+ self._rng.permutation(n_all_blocks)
209
+ if self._shuffle
210
+ else range(n_all_blocks)
211
+ )
212
+
213
+ self._curr_idx = 0
214
+
215
+ def __del__(self):
216
+ self._close_mmaps()
217
+ del self._mmaps
218
+ del self._buffers
219
+
220
+ def __iter__(self):
221
+ return self
222
+
223
+ def __next__(self):
224
+ if self._curr_idx >= len(self._block_idxs):
225
+ self._load_n_chunks()
226
+ # TODO: trigger fetching next next n_chunks if remote
227
+ block_idx = self._block_idxs[self._curr_idx]
228
+ chunk_id = block_idx // self._n_blocks
229
+ buffer = self._buffers[chunk_id]
230
+ elem_id = (block_idx % self._n_blocks) * self._block_size
231
+ offset = np.dtype(self._dtype).itemsize * elem_id
232
+ arr = np.frombuffer(
233
+ buffer, dtype=self._dtype, count=self._block_size, offset=offset
234
+ )
235
+ self._curr_idx += 1
236
+ return torch.from_numpy(arr.astype(np.int64))
237
+
238
+
239
+ class CombinedDataset(IterableDataset):
240
+ def __init__(self, datasets, seed, weights=None):
241
+ self._seed = seed
242
+ self._datasets = datasets
243
+ self._weights = weights
244
+ n_datasets = len(datasets)
245
+ if weights is None:
246
+ self._weights = [1 / n_datasets] * n_datasets
247
+
248
+ def __iter__(self):
249
+ return CombinedDatasetIterator(self._datasets, self._seed, self._weights)
250
+
251
+
252
+ class CombinedDatasetIterator:
253
+ def __init__(self, datasets, seed, weights):
254
+ self._datasets = [iter(el) for el in datasets]
255
+ self._weights = weights
256
+ self._rng = random.Random(seed)
257
+
258
+ def __next__(self):
259
+ dataset, = self._rng.choices(self._datasets, weights=self._weights, k=1)
260
+ return next(dataset)
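A minimal sketch of the packed-dataset round trip (not part of the commit; the file prefix, chunk_size and block_size are hypothetical values chosen only for illustration):
# Sketch only: write one packed chunk to disk, then stream fixed-size blocks back.
import numpy as np
from lit_llama.packed_dataset import PackedDatasetBuilder, PackedDataset

builder = PackedDatasetBuilder(outdir=".", prefix="demo", chunk_size=16, sep_token=0, dtype=np.uint16)
builder.add_array(np.arange(10, dtype=np.uint16))  # shorter than chunk_size, padded with sep_token
builder.write_reminder()                           # flush the partially filled chunk

dataset = PackedDataset(builder.filenames, n_chunks=1, block_size=4, shuffle=False)
for block in dataset:                              # each block is a torch.int64 tensor of length 4
    print(block)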
lit_llama/quantization.py ADDED
@@ -0,0 +1,614 @@
1
+ import os
2
+ from contextlib import contextmanager
3
+ import warnings
4
+ import math
5
+
6
+ import torch
7
+
8
+ # configuration for bitsandbytes before import
9
+ os.environ["BITSANDBYTES_NOWELCOME"] = "1"
10
+ warnings.filterwarnings(
11
+ "ignore",
12
+ message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization",
13
+ )
14
+ warnings.filterwarnings(
15
+ "ignore",
16
+ message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization",
17
+ )
18
+ warnings.filterwarnings(
19
+ "ignore",
20
+ message="The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.",
21
+ )
22
+
23
+ try:
24
+ import bitsandbytes as bnb # noqa: E402
25
+ except ImportError:
26
+ bnb = None
27
+
28
+ try:
29
+ import triton # noqa: E402
30
+ import triton.language as tl # noqa: E402
31
+ except ImportError:
32
+ triton = None
33
+
34
+ if bnb is not None:
35
+
36
+ class Linear8bitLt(bnb.nn.Linear8bitLt):
37
+ """Wraps `bnb.nn.Linear8bitLt` and enables instantiation directly on the device and
38
+ re-quantization when loading the state dict.
39
+
40
+
41
+ This should only be used for inference. For training, use `bnb.nn.Linear8bitLt` directly.
42
+ """
43
+
44
+ def __init__(self, *args, **kwargs):
45
+ super().__init__(*args, **kwargs, has_fp16_weights=False, threshold=6.0)
46
+ # We quantize the initial weight here so we don't end up filling the device
47
+ # memory with float32 weights which could lead to OOM.
48
+ self._quantize_weight(self.weight.data)
49
+
50
+ def _load_from_state_dict(self, local_state_dict, *args, **kwargs):
51
+ # There is only one key that ends with `*.weight`, the other one is the bias
52
+ weight_key = next(
53
+ (name for name in local_state_dict.keys() if name.endswith("weight")),
54
+ None,
55
+ )
56
+ if weight_key is None:
57
+ return
58
+
59
+ # Load the weight from the state dict and re-quantize it
60
+ weight = local_state_dict.pop(weight_key)
61
+ self._quantize_weight(weight)
62
+
63
+ # If there is a bias, let nn.Module load it
64
+ if local_state_dict:
65
+ super()._load_from_state_dict(local_state_dict, *args, **kwargs)
66
+
67
+ def _quantize_weight(self, weight: torch.Tensor) -> None:
68
+ # This code is taken and adapted from `bnb.nn.Int8Params.cuda()`
69
+ B = weight.contiguous().half().cuda()
70
+ CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
71
+ del CBt
72
+ del SCBt
73
+ self.weight.data = CB
74
+ setattr(self.weight, "CB", CB)
75
+ setattr(self.weight, "SCB", SCB)
76
+
77
+
78
+ if triton is not None:
79
+ # This is adapted from the OpenAI Triton matmul example.
80
+ @triton.autotune(
81
+ configs=[
82
+ triton.Config(
83
+ {
84
+ "BLOCK_SIZE_M": 128,
85
+ "BLOCK_SIZE_N": 256,
86
+ "BLOCK_SIZE_K": 32,
87
+ "GROUP_SIZE_M": 8,
88
+ },
89
+ num_stages=3,
90
+ num_warps=8,
91
+ ),
92
+ triton.Config(
93
+ {
94
+ "BLOCK_SIZE_M": 256,
95
+ "BLOCK_SIZE_N": 128,
96
+ "BLOCK_SIZE_K": 32,
97
+ "GROUP_SIZE_M": 8,
98
+ },
99
+ num_stages=3,
100
+ num_warps=8,
101
+ ),
102
+ triton.Config(
103
+ {
104
+ "BLOCK_SIZE_M": 256,
105
+ "BLOCK_SIZE_N": 64,
106
+ "BLOCK_SIZE_K": 32,
107
+ "GROUP_SIZE_M": 8,
108
+ },
109
+ num_stages=4,
110
+ num_warps=4,
111
+ ),
112
+ triton.Config(
113
+ {
114
+ "BLOCK_SIZE_M": 64,
115
+ "BLOCK_SIZE_N": 256,
116
+ "BLOCK_SIZE_K": 32,
117
+ "GROUP_SIZE_M": 8,
118
+ },
119
+ num_stages=4,
120
+ num_warps=4,
121
+ ),
122
+ triton.Config(
123
+ {
124
+ "BLOCK_SIZE_M": 128,
125
+ "BLOCK_SIZE_N": 128,
126
+ "BLOCK_SIZE_K": 32,
127
+ "GROUP_SIZE_M": 8,
128
+ },
129
+ num_stages=4,
130
+ num_warps=4,
131
+ ),
132
+ triton.Config(
133
+ {
134
+ "BLOCK_SIZE_M": 128,
135
+ "BLOCK_SIZE_N": 64,
136
+ "BLOCK_SIZE_K": 32,
137
+ "GROUP_SIZE_M": 8,
138
+ },
139
+ num_stages=4,
140
+ num_warps=4,
141
+ ),
142
+ triton.Config(
143
+ {
144
+ "BLOCK_SIZE_M": 64,
145
+ "BLOCK_SIZE_N": 128,
146
+ "BLOCK_SIZE_K": 32,
147
+ "GROUP_SIZE_M": 8,
148
+ },
149
+ num_stages=4,
150
+ num_warps=4,
151
+ ),
152
+ triton.Config(
153
+ {
154
+ "BLOCK_SIZE_M": 128,
155
+ "BLOCK_SIZE_N": 32,
156
+ "BLOCK_SIZE_K": 32,
157
+ "GROUP_SIZE_M": 8,
158
+ },
159
+ num_stages=4,
160
+ num_warps=4,
161
+ ),
162
+ triton.Config(
163
+ {
164
+ "BLOCK_SIZE_M": 64,
165
+ "BLOCK_SIZE_N": 32,
166
+ "BLOCK_SIZE_K": 32,
167
+ "GROUP_SIZE_M": 8,
168
+ },
169
+ num_stages=5,
170
+ num_warps=2,
171
+ ),
172
+ triton.Config(
173
+ {
174
+ "BLOCK_SIZE_M": 32,
175
+ "BLOCK_SIZE_N": 64,
176
+ "BLOCK_SIZE_K": 32,
177
+ "GROUP_SIZE_M": 8,
178
+ },
179
+ num_stages=5,
180
+ num_warps=2,
181
+ ),
182
+ ],
183
+ key=["M", "N", "K"],
184
+ )
185
+ @triton.jit
186
+ def linear_kernel_4bit_weight(
187
+ # Pointers to matrices
188
+ a_ptr,
189
+ b_ptr,
190
+ c_ptr,
191
+ bscales_ptr,
192
+ bzeros_ptr,
193
+ # bdequant,
194
+ # Matrix dimensions
195
+ M,
196
+ N,
197
+ K,
198
+ # The stride variables represent how much to increase the ptr by when moving by 1
199
+ # element in a particular dimension. E.g. stride_am is how much to increase a_ptr
200
+ # by to get the element one row down (A has M rows)
201
+ stride_am,
202
+ stride_ak,
203
+ stride_bk,
204
+ stride_bn,
205
+ stride_cm,
206
+ stride_cn,
207
+ # Meta-parameters
208
+ BLOCK_SIZE_M: tl.constexpr,
209
+ BLOCK_SIZE_N: tl.constexpr,
210
+ BLOCK_SIZE_K: tl.constexpr,
211
+ GROUP_SIZE_M: tl.constexpr,
212
+ ):
213
+ """Kernel for computing the matmul C = A x B.T.
214
+ A has shape (M, K), B has shape (N, K) and C has shape (M, N)
215
+ """
216
+ # -----------------------------------------------------------
217
+ # Map program ids `pid` to the block of C it should compute.
218
+ # This is done in a grouped ordering to promote L2 data reuse
219
+ # See above `L2 Cache Optimizations` section for details
220
+ pid = tl.program_id(axis=0)
221
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
222
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
223
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
224
+ group_id = pid // num_pid_in_group
225
+ first_pid_m = group_id * GROUP_SIZE_M
226
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
227
+ pid_m = first_pid_m + (pid % group_size_m)
228
+ pid_n = (pid % num_pid_in_group) // group_size_m
229
+
230
+ # ----------------------------------------------------------
231
+ # Create pointers for the first blocks of A and B.
232
+ # We will advance this pointer as we move in the K direction
233
+ # and accumulate
234
+ # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
235
+ # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
236
+ # see above `Pointer Arithmetics` section for details
237
+ offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
238
+ offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
239
+ a_mask = offs_am[:, None] < M
240
+ b_mask = offs_bn[None, :] < N
241
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
242
+ a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
243
+ b_ptrs = b_ptr + (
244
+ (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn
245
+ )
246
+
247
+ bscales_ptrs = bscales_ptr + offs_bn[None, :]
248
+ bzeros_ptrs = bzeros_ptr + offs_bn[None, :]
249
+
250
+ scale = tl.load(bscales_ptrs)
251
+ zero = tl.load(bzeros_ptrs)
252
+ # -----------------------------------------------------------
253
+ # Iterate to compute a block of the C matrix
254
+ # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
255
+ # of fp32 values for higher accuracy.
256
+ # `accumulator` will be converted back to fp16 after the loop
257
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
258
+ for k in range(0, K, BLOCK_SIZE_K):
259
+ # wasteful as it is to load everything twice, my attempts at avoiding it led to slower code
260
+ b12 = tl.load(b_ptrs, mask=b_mask)
261
+ # Note that for simplicity, we don't apply a mask in K here.
262
+ a = tl.load(a_ptrs, mask=a_mask).to(tl.float32)
263
+ b = (
264
+ ((b12.to(tl.uint8) >> ((offs_k[:, None] % 2) * 4)) & 0xF).to(tl.float32)
265
+ - zero
266
+ ) * scale
267
+ accumulator += tl.dot(a, b)
268
+
269
+ # Advance the ptrs to the next K block
270
+ a_ptrs += BLOCK_SIZE_K * stride_ak
271
+ b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
272
+ c = accumulator
273
+
274
+ # -----------------------------------------------------------
275
+ # Write back the block of the output matrix C
276
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
277
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
278
+ c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
279
+ c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
280
+ tl.store(c_ptrs, c, mask=c_mask)
281
+
282
+ def qlinear_4bit_weight(inp, weight, scales, zeros):
283
+ weight = weight.t().contiguous()
284
+ c_shape = inp.shape[:-1] + weight.shape[-1:]
285
+ inp = inp.reshape(-1, inp.shape[-1]).contiguous()
286
+ # we pad the input to amortize triton compilation cost better
287
+ PAD_TO = 256
288
+ if inp.shape[0] % PAD_TO != 0:
289
+ c_crop = inp.shape[0]
290
+ new_inp_shape0 = inp.shape[0] + PAD_TO - inp.shape[0] % PAD_TO
291
+ inp2 = inp.new_empty((new_inp_shape0, inp.shape[1]))
292
+ inp2[: inp.shape[0]] = inp
293
+ inp2[inp.shape[0] :].zero_()
294
+ inp = inp2
295
+ else:
296
+ c_crop = None
297
+
298
+ assert inp.shape[1] == weight.shape[0] * 2, "incompatible dimensions"
299
+
300
+ assert scales.shape == (weight.shape[1], 1)
301
+ assert zeros.shape == (weight.shape[1], 1)
302
+ scales = scales.contiguous()
303
+ zeros = zeros.contiguous()
304
+ K, N = weight.shape
305
+ M, K = inp.shape
306
+ assert (
307
+ K % 32 == 0
308
+ ), "We don't check memory-out-of-bounds with K so K must be divisible by BLOCK_SIZE_K"
309
+ # allocates output
310
+ c = torch.empty((M, N), device=inp.device, dtype=inp.dtype)
311
+ # 1D launch kernel where each block gets its own program.
312
+ grid = lambda META: (
313
+ triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
314
+ )
315
+ linear_kernel_4bit_weight[grid](
316
+ inp,
317
+ weight,
318
+ c,
319
+ scales,
320
+ zeros,
321
+ M,
322
+ N,
323
+ K,
324
+ inp.stride(0),
325
+ inp.stride(1),
326
+ weight.stride(0),
327
+ weight.stride(1),
328
+ c.stride(0),
329
+ c.stride(1),
330
+ )
331
+ return c[:c_crop].reshape(c_shape)
332
+
333
+ else:
334
+ qlinear_4bit_weight = None
335
+
336
+
337
+ # for correctness but with terrible perf
338
+ class ColBlockQuantizedLinear(torch.nn.Module):
339
+ def __init__(self, in_features, out_features, bias: bool, *, bits, tile_cols):
340
+ super().__init__()
341
+ self.in_features = in_features
342
+ self.out_features = out_features
343
+ self.tile_cols = tile_cols if tile_cols != -1 else self.in_features
344
+ self.bits = bits
345
+ self.entries_per_byte = 8 // bits
346
+ assert self.entries_per_byte > 0 and self.entries_per_byte * self.bits == 8
347
+ assert in_features % self.entries_per_byte == 0
348
+ self.register_buffer(
349
+ "quant_weight",
350
+ torch.empty(
351
+ (self.out_features, self.in_features // self.entries_per_byte),
352
+ dtype=torch.uint8,
353
+ )
354
+ .t()
355
+ .contiguous()
356
+ .t(),
357
+ )
358
+ self.register_buffer(
359
+ "scales",
360
+ torch.empty(
361
+ (
362
+ self.out_features,
363
+ (self.in_features + self.tile_cols - 1) // self.tile_cols,
364
+ )
365
+ ),
366
+ )
367
+ self.register_buffer("zeros", torch.empty_like(self.scales))
368
+ assert isinstance(bias, bool)
369
+ if bias:
370
+ self.register_buffer("bias", torch.empty((self.out_features,)))
371
+ else:
372
+ self.register_buffer("bias", None)
373
+
374
+ def pack_weight(self, weight):
375
+ weight = weight.to(device=self.quant_weight.device, copy=True)
376
+ for j in range(self.scales.size(1)):
377
+ weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] /= self.scales[
378
+ :, j : j + 1
379
+ ]
380
+ weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] += self.zeros[
381
+ :, j : j + 1
382
+ ]
383
+ weight = weight.clamp_(min=0, max=2**self.bits - 1).to(dtype=torch.uint8)
384
+ self.quant_weight.zero_()
385
+ for nr in range(self.entries_per_byte):
386
+ self.quant_weight += weight[:, nr :: self.entries_per_byte] << (
387
+ nr * self.bits
388
+ )
389
+
390
+ def get_weight(self, dtype=torch.float):
391
+ weight = torch.empty(
392
+ (self.out_features, self.in_features),
393
+ device=self.quant_weight.device,
394
+ dtype=dtype,
395
+ )
396
+ mask = (1 << self.bits) - 1
397
+ for nr in range(self.entries_per_byte):
398
+ weight[:, nr :: self.entries_per_byte] = (
399
+ (self.quant_weight >> (nr * self.bits)) & mask
400
+ ).float()
401
+ self.quant_weight.to(dtype)
402
+ for j in range(self.scales.size(1)):
403
+ weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] -= self.zeros[
404
+ :, j : j + 1
405
+ ]
406
+ weight[:, j * self.tile_cols : (j + 1) * self.tile_cols] *= self.scales[
407
+ :, j : j + 1
408
+ ]
409
+ return weight
410
+
411
+ def forward(self, inp):
412
+ if (
413
+ triton is not None
414
+ and self.bits == 4
415
+ and self.quant_weight.device.type == "cuda"
416
+ and self.zeros.shape[1] == 1
417
+ and self.quant_weight.shape[1] % 32 == 0
418
+ ):
419
+ return qlinear_4bit_weight(inp, self.quant_weight, self.scales, self.zeros)
420
+ weight = self.get_weight(dtype=inp.dtype)
421
+ return torch.nn.functional.linear(inp, weight, self.bias)
422
+
423
+
424
+ class GPTQQuantizer:
425
+ # The algorithm and code has been taken from https://github.com/IST-DASLab/gptq/
426
+ # E. Frantar et al GPTQ: Accurate Post-training Compression for GPT, arXiv:2210.17323
427
+ # portions copyright by the authors licensed under the Apache License 2.0
428
+ # All errors are our own.
429
+
430
+ def __init__(
431
+ self,
432
+ linear_module,
433
+ *,
434
+ bits,
435
+ perchannel=True,
436
+ sym=False,
437
+ blocksize=128,
438
+ percdamp=0.01,
439
+ groupsize=-1,
440
+ actorder=False
441
+ ):
442
+ assert isinstance(linear_module, torch.nn.Linear)
443
+
444
+ self.linear_module = linear_module
445
+ self.dev = self.linear_module.weight.device
446
+ self.rows = linear_module.weight.shape[0]
447
+ self.columns = linear_module.weight.shape[1]
448
+ self.H = torch.zeros((self.columns, self.columns), device=self.dev)
449
+ self.nsamples = 0
450
+ self.bits = bits
451
+ self.maxq = 2**bits - 1
452
+ self.perchannel = perchannel
453
+ self.sym = sym
454
+ self.blocksize = blocksize
455
+ self.percdamp = percdamp
456
+ self.groupsize = groupsize
457
+ self.actorder = actorder
458
+ self.tile_cols = self.columns if groupsize == -1 else groupsize
459
+ self.scales = torch.zeros(
460
+ (self.rows, (self.columns + self.tile_cols - 1) // self.tile_cols),
461
+ dtype=self.linear_module.weight.dtype,
462
+ device=self.dev,
463
+ )
464
+ self.zeros = torch.zeros_like(self.scales)
465
+ assert not (
466
+ self.actorder and self.groupsize != -1
467
+ ), "The permutation trick does not work for grouped quantization"
468
+
469
+ @staticmethod
470
+ def quantize_weight(x, scale, zero, maxq):
471
+ q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
472
+ x_rec = scale * (q - zero)
473
+ return x_rec
474
+
475
+ def find_params_weight(self, x):
476
+ dev = x.device
477
+
478
+ shape = x.shape
479
+ if self.perchannel:
480
+ x = x.flatten(1)
481
+ else:
482
+ x = x.flatten().unsqueeze(0)
483
+
484
+ tmp = torch.zeros(x.shape[0], device=dev)
485
+ xmin = torch.minimum(x.min(1)[0], tmp)
486
+ xmax = torch.maximum(x.max(1)[0], tmp)
487
+
488
+ if self.sym:
489
+ xmax = torch.maximum(torch.abs(xmin), xmax)
490
+ tmp = xmin < 0
491
+ if torch.any(tmp):
492
+ xmin[tmp] = -xmax[tmp]
493
+ tmp = (xmin == 0) & (xmax == 0)
494
+ xmin[tmp] = -1
495
+ xmax[tmp] = +1
496
+
497
+ scale = (xmax - xmin) / self.maxq
498
+ if self.sym:
499
+ zero = torch.full_like(scale, (self.maxq + 1) / 2)
500
+ else:
501
+ zero = torch.round(-xmin / scale)
502
+
503
+ if not self.perchannel:
504
+ tmp = shape[0]
505
+ scale = scale.repeat(tmp)
506
+ zero = zero.repeat(tmp)
507
+
508
+ shape = [-1] + [1] * (len(shape) - 1)
509
+ scale = scale.reshape(shape)
510
+ zero = zero.reshape(shape)
511
+ return scale, zero
512
+
513
+ def collect_input_stats(self, _1, inp, _2):
514
+ inp = inp[0].detach()
515
+ self.last_inp = inp
516
+ if len(inp.shape) == 2:
517
+ inp = inp.unsqueeze(0)
518
+ tmp = inp.shape[0]
519
+ if len(inp.shape) == 3:
520
+ inp = inp.reshape((-1, inp.shape[-1]))
521
+ inp = inp.t()
522
+ self.H *= self.nsamples / (self.nsamples + tmp)
523
+ self.nsamples += tmp
524
+ # inp = inp.float()
525
+ inp = math.sqrt(2 / self.nsamples) * inp.float()
526
+ # self.H += 2 / self.nsamples * inp.matmul(inp.t())
527
+ self.H += inp.matmul(inp.t())
528
+
529
+ def quantize(self):
530
+ W = self.linear_module.weight.detach().to(dtype=torch.float, copy=True)
531
+
532
+ scale, zero = self.find_params_weight(W)
533
+ self.scales[:] = scale
534
+ self.zeros[:] = zero
535
+
536
+ H = self.H
537
+ del self.H
538
+ dead = torch.diag(H) == 0
539
+ H[dead, dead] = 1
540
+ W[:, dead] = 0
541
+ if self.actorder:
542
+ perm = torch.argsort(torch.diag(H), descending=True)
543
+ W = W[:, perm]
544
+ H = H[perm][:, perm]
545
+
546
+ Losses = torch.zeros_like(W)
547
+ Q = torch.zeros_like(W)
548
+
549
+ damp = self.percdamp * torch.mean(torch.diag(H))
550
+ diag = torch.arange(self.columns, device=self.dev)
551
+ H[diag, diag] += damp
552
+ H = torch.linalg.cholesky(H)
553
+ H = torch.cholesky_inverse(H)
554
+ H = torch.linalg.cholesky(H, upper=True)
555
+ Hinv = H
556
+
557
+ for i1 in range(0, self.columns, self.blocksize):
558
+ i2 = min(i1 + self.blocksize, self.columns)
559
+ count = i2 - i1
560
+
561
+ W1 = W[:, i1:i2].clone()
562
+ Q1 = torch.zeros_like(W1)
563
+ Err1 = torch.zeros_like(W1)
564
+ Losses1 = torch.zeros_like(W1)
565
+ Hinv1 = Hinv[i1:i2, i1:i2]
566
+
567
+ for i in range(count):
568
+ w = W1[:, i]
569
+ d = Hinv1[i, i]
570
+
571
+ if self.groupsize != -1:
572
+ if (i1 + i) % self.groupsize == 0:
573
+ scale, zero = self.find_params_weight(
574
+ W[:, (i1 + i) : (i1 + i + self.groupsize)]
575
+ )
576
+ self.scales[:, (i1 + i) // self.groupsize] = scale
577
+ self.zeros[:, (i1 + i) // self.groupsize] = zero
578
+
579
+ q = self.quantize_weight(w.unsqueeze(1), scale, zero, self.maxq)
580
+ q = q.squeeze(1)
581
+ assert q.dim() == 1
582
+ Q1[:, i] = q
583
+ Losses1[:, i] = (w - q) ** 2 / d**2
584
+
585
+ err1 = (w - q) / d
586
+ W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
587
+ Err1[:, i] = err1
588
+
589
+ Q[:, i1:i2] = Q1
590
+ Losses[:, i1:i2] = Losses1 / 2
591
+
592
+ W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
593
+
594
+ if self.actorder:
595
+ invperm = torch.argsort(perm)
596
+ Q = Q[:, invperm]
597
+
598
+ weight = Q.reshape(self.linear_module.weight.shape).to(
599
+ self.linear_module.weight.data.dtype
600
+ )
601
+ error = torch.sum(Losses).item()
602
+
603
+ q_module = ColBlockQuantizedLinear(
604
+ self.linear_module.in_features,
605
+ self.linear_module.out_features,
606
+ self.linear_module.bias is not None,
607
+ bits=self.bits,
608
+ tile_cols=self.groupsize,
609
+ ).to(self.dev)
610
+ q_module.scales = self.scales
611
+ q_module.zeros = self.zeros
612
+ q_module.pack_weight(weight)
613
+ q_module.bias = self.linear_module.bias
614
+ return q_module, error
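A minimal sketch of quantizing a single linear layer with GPTQQuantizer (not part of the commit; the layer sizes and the number of calibration batches are hypothetical):
# Sketch only: collect input statistics through a forward hook, then run GPTQ.
import torch
from lit_llama.quantization import GPTQQuantizer

linear = torch.nn.Linear(128, 64, bias=False)
quantizer = GPTQQuantizer(linear, bits=4, groupsize=-1)

handle = linear.register_forward_hook(quantizer.collect_input_stats)
with torch.no_grad():
    for _ in range(4):                  # a few random calibration batches
        linear(torch.randn(8, 128))
handle.remove()

q_linear, error = quantizer.quantize()  # returns a ColBlockQuantizedLinear and the total loss
print(type(q_linear).__name__, error)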
lit_llama/tokenizer.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from sentencepiece import SentencePieceProcessor, SentencePieceTrainer
7
+
8
+
9
+ class Tokenizer:
10
+ """Tokenizer for LLaMA."""
11
+
12
+ def __init__(self, model_path: Path) -> None:
13
+ self.processor = SentencePieceProcessor(model_file=str(model_path))
14
+ self.bos_id = self.processor.bos_id()
15
+ self.eos_id = self.processor.eos_id()
16
+ self.pad_id = self.processor.pad_id()
17
+
18
+ @property
19
+ def vocab_size(self) -> int:
20
+ return self.processor.vocab_size()
21
+
22
+ def encode(
23
+ self,
24
+ string: str,
25
+ bos: bool = True,
26
+ eos: bool = False,
27
+ max_length: int = -1,
28
+ pad: bool = False,
29
+ device: Optional[torch.device] = None
30
+ ) -> torch.Tensor:
31
+ tokens = self.processor.encode(string)
32
+ if bos:
33
+ tokens = [self.bos_id] + tokens
34
+ if eos:
35
+ tokens = tokens + [self.eos_id]
36
+ if max_length > 0:
37
+ tokens = tokens[:max_length]
38
+ if pad and len(tokens) < max_length:
39
+ tokens += [self.pad_id] * (max_length - len(tokens))
40
+
41
+ return torch.tensor(tokens, dtype=torch.int, device=device)
42
+
43
+ def decode(self, tokens: torch.Tensor) -> str:
44
+ return self.processor.decode(tokens.tolist())
45
+
46
+ @staticmethod
47
+ def train(input: str, destination: str, vocab_size=32000) -> None:
48
+ model_prefix = os.path.join(destination, "tokenizer")
49
+ SentencePieceTrainer.Train(input=input, model_prefix=model_prefix, vocab_size=vocab_size)
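A minimal sketch of a tokenizer round trip (not part of the commit; the tokenizer.model path is a placeholder for a real SentencePiece checkpoint):
# Sketch only: encode a string to token ids and decode it back.
from pathlib import Path
from lit_llama import Tokenizer

tokenizer = Tokenizer(Path("checkpoints/lit-llama/tokenizer.model"))  # placeholder path
ids = tokenizer.encode("Hello, doctor", bos=True, eos=True)
print(ids.shape, tokenizer.decode(ids))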
lit_llama/utils.py ADDED
@@ -0,0 +1,471 @@
1
+ """Utility functions for training and inference."""
2
+
3
+ import functools
4
+ import pickle
5
+ import warnings
6
+ from io import BytesIO
7
+ from pathlib import Path
8
+
9
+ import torch
10
+ import torch.utils._device
11
+ from lightning.fabric.strategies import DeepSpeedStrategy, FSDPStrategy
12
+ from torch.distributed.fsdp import FullStateDictConfig
13
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
14
+ from torch.distributed.fsdp import StateDictType
15
+ from torch.serialization import normalize_storage_type
16
+
17
+ llama_model_sizes = {
18
+ 4096: "7B", # 7B n_embd=4096
19
+ 5120: "13B", # 13B n_embd=5120
20
+ 6656: "30B", # 30B n_embd=6656
21
+ 8192: "65B", # 65B n_embd=8192
22
+ }
23
+
24
+
25
+ def llama_model_lookup(checkpoint: dict) -> str:
26
+ """Returns the LLaMA model name from the checkpoint.
27
+
28
+ Checks the width of the transformer.wte.weight embedding matrix, as these uniquely identify the model.
29
+ """
30
+ embedding_size = checkpoint['transformer.wte.weight'].shape[1]
31
+ return llama_model_sizes[embedding_size]
32
+
33
+
34
+ def find_multiple(n: int, k: int) -> int:
35
+ if n % k == 0:
36
+ return n
37
+ return n + k - (n % k)
38
+
39
+
40
+ def save_model_checkpoint(fabric, model, file_path):
41
+ """Handles boilerplate logic for retrieving and saving the state_dict.
42
+
43
+ This will be upstreamed to Fabric soon.
44
+ """
45
+ file_path = Path(file_path)
46
+
47
+ if isinstance(fabric.strategy, DeepSpeedStrategy):
48
+ from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
49
+
50
+ fabric.save(file_path, {"model": model})
51
+ fabric.barrier()
52
+ if fabric.global_rank == 0:
53
+ # Create a consolidated checkpoint with the same name next to the deepspeed checkpoint
54
+ convert_zero_checkpoint_to_fp32_state_dict(file_path, file_path.with_suffix(".pth"))
55
+ return
56
+
57
+ if isinstance(fabric.strategy, FSDPStrategy):
58
+ save_policy = FullStateDictConfig(offload_to_cpu=(fabric.world_size > 1), rank0_only=True)
59
+ with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):
60
+ state_dict = model._forward_module.state_dict()
61
+ else:
62
+ state_dict = model.state_dict()
63
+
64
+ if fabric.global_rank == 0:
65
+ torch.save(state_dict, file_path)
66
+ fabric.barrier()
67
+
68
+
69
+ class EmptyInitOnDevice(torch.overrides.TorchFunctionMode):
70
+ def __init__(self, device=None, dtype=None, quantization_mode=None):
71
+ """
72
+ Create tensors with given device and dtype and don't run initialization
73
+ (but instead use "empty tensors", i.e. uninitialized memory).
74
+
75
+ device: `torch.device` to work with
76
+ dtype: `torch.dtype` to work with
77
+ quantization_mode: optional string, quantization mode to work with, default `None`.
78
+ Available modes: `llm.int8` bitsandbytes LLM.int8 quantization (only on GPU)
79
+ `gptq.int4`, `gptq.int8`: GPTQ pre-quantized models
80
+
81
+ Example::
82
+ with EmptyInitOnDevice("cuda", dtype=torch.bfloat16):
83
+ model = LLaMA.from_name('7B')
84
+ model.load_state_dict(torch.load('llama-lit/7B/lit-llama.pth'))"""
85
+
86
+ self.quantization_mode = quantization_mode
87
+ self.quantized_linear_cls = None
88
+ if self.quantization_mode == 'llm.int8':
89
+ if device.type != "cuda":
90
+ raise ValueError("Quantization is only supported on the GPU.")
91
+ from .quantization import Linear8bitLt
92
+ self.quantized_linear_cls = Linear8bitLt
93
+ elif self.quantization_mode == 'gptq.int4':
94
+ from .quantization import ColBlockQuantizedLinear
95
+ self.quantized_linear_cls = functools.partial(ColBlockQuantizedLinear, bits=4, tile_cols=-1)
96
+ elif self.quantization_mode == 'gptq.int8':
97
+ from .quantization import ColBlockQuantizedLinear
98
+ self.quantized_linear_cls = functools.partial(ColBlockQuantizedLinear, bits=8, tile_cols=-1)
99
+ elif self.quantization_mode is not None:
100
+ raise RuntimeError(f"unknown quantization mode {self.quantization_mode}")
101
+ self.device = device
102
+ self.dtype = dtype
103
+
104
+ def __enter__(self):
105
+ if self.quantized_linear_cls is not None:
106
+ self.torch_linear_cls = torch.nn.Linear
107
+ torch.nn.Linear = self.quantized_linear_cls
108
+ return super().__enter__()
109
+
110
+ def __exit__(self, exc_type, exc_val, exc_tb):
111
+ if self.quantized_linear_cls is not None:
112
+ torch.nn.Linear = self.torch_linear_cls
113
+ return super().__exit__(exc_type, exc_val, exc_tb)
114
+
115
+ def __torch_function__(self, func, types, args=(), kwargs=None):
116
+ kwargs = kwargs or {}
117
+ if getattr(func, "__module__", None) == "torch.nn.init":
118
+ if "tensor" in kwargs:
119
+ return kwargs["tensor"]
120
+ else:
121
+ return args[0]
122
+ if (
123
+ self.device is not None
124
+ and func in torch.utils._device._device_constructors()
125
+ and kwargs.get("device") is None
126
+ ):
127
+ kwargs["device"] = self.device
128
+ if (
129
+ self.dtype is not None
130
+ and func in torch.utils._device._device_constructors()
131
+ and kwargs.get("dtype") is None
132
+ ):
133
+ kwargs["dtype"] = self.dtype
134
+ return func(*args, **kwargs)
135
+
136
+
137
+ # this is taken from torchhacks https://github.com/lernapparat/torchhacks
138
+
139
+
140
+ class NotYetLoadedTensor:
141
+ def __init__(self, metatensor, archiveinfo, storageinfo, rebuild_args):
142
+ self.metatensor = metatensor
143
+ self.archiveinfo = archiveinfo
144
+ self.storageinfo = storageinfo
145
+ self.rebuild_args = rebuild_args
146
+
147
+ @classmethod
148
+ def rebuild_from_type_v2(cls, func, new_type, args, state, *, archiveinfo=None):
149
+ ret = func(*args)
150
+ if isinstance(ret, NotYetLoadedTensor):
151
+ old_lt = ret._load_tensor
152
+
153
+ def _load_tensor():
154
+ t = old_lt()
155
+ return torch._tensor._rebuild_from_type_v2(
156
+ lambda: t, new_type, (), state
157
+ )
158
+
159
+ ret._load_tensor = _load_tensor
160
+ return ret
161
+ return torch._tensor._rebuild_from_type_v2(func, new_type, args, state)
162
+
163
+ @classmethod
164
+ def rebuild_parameter(
165
+ cls, data, requires_grad, backward_hooks, *, archiveinfo=None
166
+ ):
167
+ if isinstance(data, NotYetLoadedTensor):
168
+ old_lt = data._load_tensor
169
+
170
+ def _load_tensor():
171
+ t = old_lt()
172
+ return torch._utils._rebuild_parameter(t, requires_grad, backward_hooks)
173
+
174
+ data._load_tensor = _load_tensor
175
+ return data
176
+ return torch._utils._rebuild_parameter(data, requires_grad, backward_hooks)
177
+
178
+ @classmethod
179
+ def rebuild_tensor_v2(
180
+ cls,
181
+ storage,
182
+ storage_offset,
183
+ size,
184
+ stride,
185
+ requires_grad,
186
+ backward_hooks,
187
+ metadata=None,
188
+ *,
189
+ archiveinfo=None,
190
+ ):
191
+ rebuild_args = (
192
+ storage_offset,
193
+ size,
194
+ stride,
195
+ requires_grad,
196
+ backward_hooks,
197
+ metadata,
198
+ )
199
+ metatensor = torch._utils._rebuild_tensor_v2(
200
+ storage,
201
+ storage_offset,
202
+ size,
203
+ stride,
204
+ requires_grad,
205
+ backward_hooks,
206
+ metadata,
207
+ )
208
+ storageinfo = storage.archiveinfo
209
+ return NotYetLoadedTensor(metatensor, archiveinfo, storageinfo, rebuild_args)
210
+
211
+ def _load_tensor(self):
212
+ name, storage_cls, fn, device, size = self.storageinfo
213
+ dtype = self.metatensor.dtype
214
+
215
+ uts = (
216
+ self.archiveinfo.zipfile_context.zf.get_storage_from_record(
217
+ f"data/{fn}",
218
+ size * torch._utils._element_size(dtype),
219
+ torch.UntypedStorage,
220
+ )
221
+ ._typed_storage()
222
+ ._untyped_storage
223
+ )
224
+ with warnings.catch_warnings():
225
+ warnings.simplefilter("ignore")
226
+ storage = torch.storage.TypedStorage(
227
+ wrap_storage=uts, dtype=self.metatensor.dtype, _internal=True
228
+ )
229
+ tensor = torch._utils._rebuild_tensor_v2(storage, *self.rebuild_args)
230
+ return tensor
231
+
232
+ @classmethod
233
+ def __torch_function__(cls, func, types, args=(), kwargs=None):
234
+ if kwargs is None:
235
+ kwargs = {}
236
+ loaded_args = [
237
+ (a._load_tensor() if isinstance(a, NotYetLoadedTensor) else a) for a in args
238
+ ]
239
+ res = func(*loaded_args, **kwargs)
240
+ # gc.collect would be costly here, maybe do it optionally
241
+ return res
242
+
243
+ def __getattr__(self, name):
244
+ # properties
245
+ ## TODO: device, is_...??
246
+ ## TODO: mH, mT, H, T, data, imag, real
247
+ ## name ???
248
+ if name in {
249
+ "dtype",
250
+ "grad",
251
+ "grad_fn",
252
+ "layout",
253
+ "names",
254
+ "ndim",
255
+ "output_nr",
256
+ "requires_grad",
257
+ "retains_grad",
258
+ "shape",
259
+ "volatile",
260
+ }:
261
+ return getattr(self.metatensor, name)
262
+ if name in {"size"}:
263
+ return getattr(self.metatensor, name)
264
+ # materializing with contiguous is needed for quantization
265
+ if name in {"contiguous"}:
266
+ return getattr(self._load_tensor(), name)
267
+
268
+ raise AttributeError(f"{type(self)} does not have {name}")
269
+
270
+ def __repr__(self):
271
+ return f"NotYetLoadedTensor({repr(self.metatensor)})"
272
+
273
+
274
+ class LazyLoadingUnpickler(pickle.Unpickler):
275
+ def __init__(self, file, zipfile_context):
276
+ super().__init__(file)
277
+ self.zipfile_context = zipfile_context
278
+
279
+ def find_class(self, module, name):
280
+ res = super().find_class(module, name)
281
+ if module == "torch._utils" and name == "_rebuild_tensor_v2":
282
+ return functools.partial(
283
+ NotYetLoadedTensor.rebuild_tensor_v2, archiveinfo=self
284
+ )
285
+ elif module == "torch._tensor" and name == "_rebuild_from_type_v2":
286
+ return functools.partial(
287
+ NotYetLoadedTensor.rebuild_from_type_v2, archiveinfo=self
288
+ )
289
+ elif module == "torch._utils" and name == "_rebuild_parameter":
290
+ return functools.partial(
291
+ NotYetLoadedTensor.rebuild_parameter, archiveinfo=self
292
+ )
293
+ return res
294
+
295
+ def persistent_load(self, pid):
296
+ name, cls, fn, device, size = pid
297
+ with warnings.catch_warnings():
298
+ warnings.simplefilter("ignore")
299
+ s = torch.storage.TypedStorage(dtype=cls().dtype, device="meta")
300
+ s.archiveinfo = pid
301
+ return s
302
+
303
+
304
+ class lazy_load:
305
+ def __init__(self, fn):
306
+ self.zf = torch._C.PyTorchFileReader(str(fn))
307
+ with BytesIO(self.zf.get_record("data.pkl")) as pkl:
308
+ mup = LazyLoadingUnpickler(pkl, self)
309
+ self.sd = mup.load()
310
+
311
+ def __enter__(self):
312
+ return self.sd
313
+
314
+ def __exit__(self, exc_type, exc_val, exc_tb):
315
+ del self.zf # I don't think there is a way to force closing...
316
+ self.zf = None
317
+
318
+
319
+ class SavingProxyForStorage:
320
+ def __init__(self, obj, saver, protocol_version=5):
321
+ self.protocol_version = protocol_version
322
+ self.saver = saver
323
+ if not (isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj)):
324
+ raise TypeError(f"expected storage, not {type(obj)}")
325
+
326
+ # this logic is taken from PyTorch 2.0+ torch/serialization.py
327
+ if isinstance(obj, torch.storage.TypedStorage):
328
+ # PT upstream wants to deprecate this eventually...
329
+ storage = obj._untyped_storage
330
+ storage_type_str = obj._pickle_storage_type()
331
+ storage_type = getattr(torch, storage_type_str)
332
+ storage_numel = obj._size()
333
+ else:
334
+ storage = obj
335
+ storage_type = normalize_storage_type(type(obj))
336
+ storage_numel = storage.nbytes()
337
+
338
+ storage_key = saver._write_storage_and_return_key(storage)
339
+ location = torch.serialization.location_tag(storage)
340
+
341
+ self.storage_info = (
342
+ "storage",
343
+ storage_type,
344
+ storage_key,
345
+ location,
346
+ storage_numel,
347
+ )
348
+
349
+ def __reduce_ex__(self, protocol_version):
350
+ assert False, "this should be handled out of band"
351
+
352
+
353
+ class SavingProxyForTensor:
354
+ def __init__(self, tensor, saver, protocol_version=5):
355
+ self.protocol_version = protocol_version
356
+ self.reduce_ret_fn, (storage, *other_reduce_args) = tensor.__reduce_ex__(
357
+ protocol_version
358
+ )
359
+ assert isinstance(
360
+ storage, torch.storage.TypedStorage
361
+ ), "Please check for updates"
362
+ storage_proxy = SavingProxyForStorage(
363
+ storage, saver, protocol_version=protocol_version
364
+ )
365
+ self.reduce_args = (storage_proxy, *other_reduce_args)
366
+
367
+ def __reduce_ex__(self, protocol_version):
368
+ if protocol_version != self.protocol_version:
369
+ raise RuntimeError(
370
+ f"Unexpected protocol version: expected {self.protocol_version}, got {protocol_version}"
371
+ )
372
+ return self.reduce_ret_fn, self.reduce_args
373
+
374
+
375
+ class IncrementalPyTorchPickler(pickle.Pickler):
376
+ def __init__(self, saver, *args, **kwargs):
377
+ super().__init__(*args, **kwargs)
378
+ self.storage_dtypes = {}
379
+ self.saver = saver
380
+ self.id_map = {}
381
+
382
+ # this logic is taken from PyTorch 2.0+ torch/serialization.py
383
+ def persistent_id(self, obj):
384
+ # FIXME: the docs say that persistent_id should only return a string
385
+ # but torch store returns tuples. This works only in the binary protocol
386
+ # see
387
+ # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects
388
+ # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537
389
+ if isinstance(obj, SavingProxyForStorage):
390
+ return obj.storage_info
391
+
392
+ if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj):
393
+ if isinstance(obj, torch.storage.TypedStorage):
394
+ # TODO: Once we decide to break serialization FC, this case
395
+ # can be deleted
396
+ storage = obj._untyped_storage
397
+ storage_dtype = obj.dtype
398
+ storage_type_str = obj._pickle_storage_type()
399
+ storage_type = getattr(torch, storage_type_str)
400
+ storage_numel = obj._size()
401
+
402
+ else:
403
+ storage = obj
404
+ storage_dtype = torch.uint8
405
+ storage_type = normalize_storage_type(type(obj))
406
+ storage_numel = storage.nbytes()
407
+
408
+ # If storage is allocated, ensure that any other saved storages
409
+ # pointing to the same data all have the same dtype. If storage is
410
+ # not allocated, don't perform this check
411
+ if storage.data_ptr() != 0:
412
+ if storage.data_ptr() in self.storage_dtypes:
413
+ if storage_dtype != self.storage_dtypes[storage.data_ptr()]:
414
+ raise RuntimeError(
415
+ "Cannot save multiple tensors or storages that "
416
+ "view the same data as different types"
417
+ )
418
+ else:
419
+ self.storage_dtypes[storage.data_ptr()] = storage_dtype
420
+
421
+ storage_key = self.id_map.get(storage._cdata)
422
+ if storage_key is None:
423
+ storage_key = self.saver._write_storage_and_return_key(storage)
424
+ self.id_map[storage._cdata] = storage_key
425
+ location = torch.serialization.location_tag(storage)
426
+
427
+ return ("storage", storage_type, storage_key, location, storage_numel)
428
+
429
+ return None
430
+
431
+
432
+ class incremental_save:
433
+ def __init__(self, name):
434
+ self.name = name
435
+ self.zipfile = torch._C.PyTorchFileWriter(str(name))
436
+ self.has_saved = False
437
+ self.next_key = 0
438
+
439
+ def __enter__(self):
440
+ return self
441
+
442
+ def store_early(self, tensor):
443
+ if isinstance(tensor, torch.Tensor):
444
+ return SavingProxyForTensor(tensor, self)
445
+ raise TypeError(f"can only store tensors early, not {type(tensor)}")
446
+
447
+ def save(self, obj):
448
+ if self.has_saved:
449
+ raise RuntimeError("have already saved")
450
+ # Write the pickle data for `obj`
451
+ data_buf = BytesIO()
452
+ pickler = IncrementalPyTorchPickler(self, data_buf, protocol=5)
453
+ pickler.dump(obj)
454
+ data_value = data_buf.getvalue()
455
+ self.zipfile.write_record("data.pkl", data_value, len(data_value))
456
+ self.has_saved = True
457
+
458
+ def _write_storage_and_return_key(self, storage):
459
+ if self.has_saved:
460
+ raise RuntimeError("have already saved")
461
+ key = self.next_key
462
+ self.next_key += 1
463
+ name = f"data/{key}"
464
+ if storage.device.type != "cpu":
465
+ storage = storage.cpu()
466
+ num_bytes = storage.nbytes()
467
+ self.zipfile.write_record(name, storage.data_ptr(), num_bytes)
468
+ return key
469
+
470
+ def __exit__(self, type, value, traceback):
471
+ self.zipfile.write_end_of_file()
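A minimal sketch of lazily inspecting a checkpoint with the utilities above (not part of the commit; the checkpoint path is a placeholder):
# Sketch only: lazy_load defers reading tensor storage, so the model size can be
# looked up from tensor metadata without materializing the weights.
from lit_llama.utils import lazy_load, llama_model_lookup

with lazy_load("checkpoints/lit-llama/7B/lit-llama.pth") as checkpoint:  # placeholder path
    print(llama_model_lookup(checkpoint))  # "7B", "13B", "30B" or "65B"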
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch>=2.0.0
2
+ lightning @ git+https://github.com/Lightning-AI/lightning@master
3
+ sentencepiece
4
+ tqdm # convert_checkpoint.py
5
+ numpy # train.py dataset memmap
6
+ jsonargparse[signatures] # generate.py, convert_checkpoint.py CLI
7
+ bitsandbytes # quantization.py
8
+ datasets # evaluate.py
9
+ zstandard # prepare_redpajama.py
10
+ gradio # app.py