austinsilveria committed on
Commit
75fa479
1 Parent(s): b2a1f5e

Add tricksy

Files changed (4)
  1. app.py +47 -2
  2. configuration_tricksy.py +18 -0
  3. modeling_tricksy.py +618 -0
  4. util.py +83 -0
app.py CHANGED
@@ -1,4 +1,49 @@
+ from threading import Thread
+
  import streamlit as st
 
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ import torch
+ from transformers import AutoTokenizer, TextIteratorStreamer, set_seed
+ from modeling_tricksy import TricksyOPTForCausalLM, OPTDiskWeights
+ from configuration_tricksy import TricksyConfig
+
+ def generate():
+     set_seed(42)
+
+     # 13.4 GB (16 bit)
+     model_name = 'facebook/opt-6.7b'
+     disk_weights = OPTDiskWeights(model_name)
+     tricksy_model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config, full_offload=(not use_tricksy)), disk_weights)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+     inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
+
+     print()
+     generation_kwargs = dict(inputs=inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p)
+     thread = Thread(target=tricksy_model.generate, kwargs=generation_kwargs)
+     thread.start()
+     generated_text = ''
+     with st.chat_message("user"):
+         t = st.empty()
+         for new_text in streamer:
+             generated_text += new_text.replace('\n', ' \n')
+             t.write(generated_text)
+
+     stats_text = f'Decoding tok/s: {1 / (sum(tricksy_model.tricksy_context.forward_times[1:]) / (len(tricksy_model.tricksy_context.forward_times) - 1))}'
+     stats_text += f' \nCurrent GPU mem usage: {torch.cuda.memory_allocated("cuda") / 1024 ** 3} GB'
+     stats_text += f' \nMax GPU mem usage: {torch.cuda.max_memory_allocated("cuda") / 1024 ** 3} GB'
+     st.write(stats_text)
+
+ prompt = st.text_area('Prompt', 'Making pesto from scratch can be done with these ingredients in 4 simple steps:\nStep 1')
+
+ col1, col2 = st.columns(2)
+ with col1:
+     submit = st.button('Submit', on_click=generate)
+ with col2:
+     use_tricksy = st.toggle('Use Tricksy', True, help='If true, only send sparse MLP weight diffs to GPU. If false, send all weights to GPU.')
+
+ with st.expander('Additional options'):
+     max_new_tokens = st.slider('Max new tokens', 1, 500, 100)
+     top_k = st.slider('Top-k sampling', 1, 500, 50)
+     top_p = st.slider('Top-p (nucleus sampling)', 0.0, 1.0, .9)
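For anyone trying the demo outside the hosted Space, an app like this is normally launched with the Streamlit CLI; a minimal sketch, assuming a CUDA GPU and that the dependencies imported above are installed:

    pip install streamlit torch transformers accelerate huggingface_hub
    streamlit run app.py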
configuration_tricksy.py ADDED
@@ -0,0 +1,18 @@
+ import dataclasses
+ import torch
+ from transformers.models.opt.configuration_opt import OPTConfig
+
+ @dataclasses.dataclass(frozen=True)
+ class TricksyConfig:
+     opt_config: OPTConfig
+
+     # Percentage of weights to keep on each device
+     # e.g. 30% of each MLP layer on GPU
+     min_mlp_sparsity_gpu: float = .3
+     # e.g. 100% of each MLP layer on CPU
+     min_mlp_sparsity_cpu: float = 1
+
+     # If true, cleans up layer's weights after computing forward pass
+     full_offload: bool = False
+
+     dtype: torch.dtype = torch.float16
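For context, this config is just a frozen wrapper around an OPTConfig; a minimal construction sketch (not part of the commit, assuming transformers is installed):

    from transformers.models.opt.configuration_opt import OPTConfig
    from configuration_tricksy import TricksyConfig

    # Keep 30% of each MLP layer resident on the GPU; set full_offload=True to emulate
    # the baseline that streams full weights instead of sparse diffs.
    config = TricksyConfig(OPTConfig.from_pretrained('facebook/opt-6.7b'), min_mlp_sparsity_gpu=0.3, full_offload=False)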
modeling_tricksy.py ADDED
@@ -0,0 +1,618 @@
+ from typing import Any, Optional, Callable, List, Tuple
+ import os
+ import time
+
+ import numpy as np
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from accelerate import init_empty_weights
+ from transformers.activations import ACT2FN
+ from transformers.generation import GenerationConfig
+ from transformers.models.opt.modeling_opt import (
+     OPTAttention,
+     OPTDecoder,
+     OPTDecoderLayer,
+     OPTForCausalLM,
+     OPTModel,
+ )
+ from transformers.models.opt.configuration_opt import OPTConfig
+ from huggingface_hub import snapshot_download
+
+ from configuration_tricksy import TricksyConfig
+ from util import batch_copy, compute_index_diffs, load_mlp_sparsity_predictor, mmap_to_tensor, topk_and_threshold
+
+ TRICKSY_WEIGHTS_PATH = 'tricksy-weights/'
+
+ class SparseMLPCache:
+     def __init__(
+         self,
+         indexed_fc1_weight: Optional[torch.Tensor] = None,
+         indexed_fc1_bias: Optional[torch.Tensor] = None,
+         indexed_fc2_weight: Optional[torch.Tensor] = None,
+         gpu_cached_mlp_indices: Optional[torch.Tensor] = None,
+     ):
+         # [ffn_embed_dim * min_mlp_sparsity, hidden_size]
+         self.indexed_fc1_weight = indexed_fc1_weight
+         # [ffn_embed_dim * min_mlp_sparsity]
+         self.indexed_fc1_bias = indexed_fc1_bias
+         # [ffn_embed_dim * min_mlp_sparsity, hidden_size] (stored in transpose for efficient indexing)
+         self.indexed_fc2_weight = indexed_fc2_weight
+
+         # Indices that are already on GPU (this tensor is stored on the CPU)
+         # [ffn_embed_dim * min_mlp_sparsity]
+         self.gpu_cached_mlp_indices = gpu_cached_mlp_indices
+
+ class SparseIndices:
+     def __init__(self, tricksy_config: TricksyConfig, opt_config: OPTConfig):
+         self.mlp_indices_buffer_gpu = torch.empty(
+             (int(opt_config.ffn_dim * tricksy_config.min_mlp_sparsity_gpu),),
+             dtype=torch.int32,
+             device='cuda'
+         )
+         self.mlp_indices_buffer_cpu = torch.empty(
+             (int(opt_config.ffn_dim * tricksy_config.min_mlp_sparsity_gpu),),
+             dtype=torch.int32,
+             device='cpu',
+             pin_memory=True,
+         )
+
+         # Default stream blocks until indices are copied to CPU
+         self.index_copy_stream = torch.cuda.default_stream()
+
+     def copy_mlp_indices_to_cpu(self):
+         self.mlp_indices_buffer_cpu = batch_copy([self.mlp_indices_buffer_gpu], self.index_copy_stream, device='cpu')[0]
+
+ class OPTDiskWeights:
+     def __init__(self, model_name: str):
+         self.model_name = model_name
+         self.model_suffix = model_name.split('/')[-1]
+         self.config = OPTConfig.from_pretrained(model_name)
+
+         try:
+             print(f'downloading from austinsilveria/tricksy-{self.model_suffix}')
+             self.weight_path = snapshot_download(repo_id=f'austinsilveria/tricksy-{self.model_suffix}') + '/'
+         except:
+             print(f'failed to download from austinsilveria/tricksy-{self.model_suffix}')
+             self.weight_path = f'{TRICKSY_WEIGHTS_PATH}{self.model_suffix}/'
+
+         with init_empty_weights():
+             model = OPTModel(self.config)
+         self.state_dict = model.state_dict()
+
+         if not os.path.exists(f'{self.weight_path}decoder.embed_tokens.weight'):
+             # Download original weights and write memmap files
+             print(f'downloading and preprocessing original weights')
+             self.cache_weights()
+
+         head_dim = self.config.hidden_size // self.config.num_attention_heads
+         for i in range(self.config.num_hidden_layers):
+             layer_prefix = f'decoder.layers.{i}.'
+             self.delete_weights([
+                 f'{layer_prefix}self_attn.q_proj.weight',
+                 f'{layer_prefix}self_attn.k_proj.weight',
+                 f'{layer_prefix}self_attn.v_proj.weight',
+                 f'{layer_prefix}self_attn.out_proj.weight',
+                 f'{layer_prefix}self_attn.q_proj.bias',
+                 f'{layer_prefix}self_attn.k_proj.bias',
+                 f'{layer_prefix}self_attn.v_proj.bias'
+             ])
+             self.add_weights([
+                 (f'{layer_prefix}fc2.weight', (self.config.ffn_dim, self.config.hidden_size)),
+                 (f'{layer_prefix}self_attn.catted_head_weights', (self.config.num_attention_heads, head_dim * 4, self.config.hidden_size)),
+                 (f'{layer_prefix}self_attn.catted_head_biases', (self.config.num_attention_heads, 3, head_dim)),
+             ])
+
+         self.memmap_weights = { key: self.load_memmap_weight(key) for key in self.state_dict.keys() }
+
+     def load_memmap_weight(self, key: str):
+         return torch.from_numpy(np.memmap(f'{self.weight_path}{key}', dtype='float16', mode='r', shape=(self.state_dict[key].shape)))
+
+     def add_weights(self, weights: List[Tuple[str, torch.Size]]):
+         for key, shape in weights:
+             self.state_dict[key] = torch.empty(shape, dtype=torch.float16, device='meta')
+
+     def delete_weights(self, keys: List[str]):
+         for key in keys:
+             if key in self.state_dict:
+                 del self.state_dict[key]
+             path = f'{self.weight_path}{key}'
+             if os.path.exists(path):
+                 os.remove(path)
+
+     def cache_weights(self):
+         os.makedirs(self.weight_path, exist_ok=True)
+         weights_location = snapshot_download(repo_id=self.model_name, ignore_patterns=['flax*', 'tf*'])
+         shards = [file for file in os.listdir(weights_location) if file.startswith("pytorch_model") and file.endswith(".bin")]
+         for shard in shards:
+             print(f'caching {shard}')
+             shard_path = os.path.join(weights_location, shard)
+             shard_state_dict = torch.load(shard_path)
+             for key in shard_state_dict.keys():
+                 path = f'{self.weight_path}{key.replace("model.", "")}'
+                 memmap = np.memmap(path, dtype='float16', mode='w+', shape=(shard_state_dict[key].shape))
+                 memmap[:] = shard_state_dict[key].cpu().numpy()
+
+         # Store weights in shape for efficient indexing
+         for i in range(self.config.num_hidden_layers):
+             layer_prefix = f'decoder.layers.{i}.'
+             # FC2 in transpose
+             fc2t = torch.from_numpy(np.array(self.load_memmap_weight(f'{layer_prefix}fc2.weight')[:])).t().contiguous().clone()
+             np.memmap(f'{self.weight_path}decoder.layers.{i}.fc2.weight', dtype='float16', mode='w+', shape=fc2t.shape)[:] = fc2t.numpy()
+
+             # Attention weights by head
+             head_dim = self.config.hidden_size // self.config.num_attention_heads
+             qw = mmap_to_tensor(self.load_memmap_weight(f'{layer_prefix}self_attn.q_proj.weight')[:])
+             kw = mmap_to_tensor(self.load_memmap_weight(f'{layer_prefix}self_attn.k_proj.weight')[:])
+             vw = mmap_to_tensor(self.load_memmap_weight(f'{layer_prefix}self_attn.v_proj.weight')[:])
+             ow = mmap_to_tensor(self.load_memmap_weight(f'{layer_prefix}self_attn.out_proj.weight')[:])
+             pre_cat_shape = (self.config.num_attention_heads, head_dim, self.config.hidden_size)
+             # [head, head_dim * 4, hidden_size]
+             catted_head_weights = torch.cat(
+                 [qw.view(pre_cat_shape).clone(), kw.view(pre_cat_shape).clone(), vw.view(pre_cat_shape).clone(), ow.T.view(pre_cat_shape).clone(),],
+                 dim=1,
+             ).contiguous().clone()
+             np.memmap(f'{self.weight_path}{layer_prefix}self_attn.catted_head_weights', dtype='float16', mode='w+', shape=catted_head_weights.shape)[:] =\
+                 catted_head_weights.numpy()
+
+             # Attention biases by head
+             qb = mmap_to_tensor(self.load_memmap_weight(f'{layer_prefix}self_attn.q_proj.bias')[:])
+             kb = mmap_to_tensor(self.load_memmap_weight(f'{layer_prefix}self_attn.k_proj.bias')[:])
+             vb = mmap_to_tensor(self.load_memmap_weight(f'{layer_prefix}self_attn.v_proj.bias')[:])
+             pre_cat_shape = (self.config.num_attention_heads, 1, head_dim)
+             # [head, 3, head_dim]
+             catted_head_biases = torch.cat(
+                 # Don't index out bias since we need all dims after projecting back up to hidden size
+                 [qb.view(pre_cat_shape).clone(), kb.view(pre_cat_shape).clone(), vb.view(pre_cat_shape).clone()],
+                 dim=1,
+             ).contiguous().clone()
+             np.memmap(f'{self.weight_path}{layer_prefix}self_attn.catted_head_biases', dtype='float16', mode='w+', shape=catted_head_biases.shape)[:] =\
+                 catted_head_biases.numpy()
+
+             self.delete_weights([
+                 f'{layer_prefix}self_attn.q_proj.weight',
+                 f'{layer_prefix}self_attn.k_proj.weight',
+                 f'{layer_prefix}self_attn.v_proj.weight',
+                 f'{layer_prefix}self_attn.out_proj.weight',
+                 f'{layer_prefix}self_attn.q_proj.bias',
+                 f'{layer_prefix}self_attn.k_proj.bias',
+                 f'{layer_prefix}self_attn.v_proj.bias'
+             ])
+             self.add_weights([
+                 (f'{layer_prefix}self_attn.catted_head_weights', catted_head_weights.shape),
+                 (f'{layer_prefix}self_attn.catted_head_biases', catted_head_biases.shape),
+             ])
+
+ class TricksyContext:
+     def __init__(self, tricksy_config: TricksyConfig, opt_config: OPTConfig):
+         self.indices = SparseIndices(tricksy_config, opt_config)
+         self.load_weight_stream = torch.cuda.Stream()
+         self.layer = 0
+         self.is_prompt_phase = True
+         self.forward_times = []
+
+ class TricksyLayer:
+     def __call__(self, *args: Any, **kwds: Any) -> Any:
+         return self.forward(*args, **kwds)
+
+     def load_weights(self, tricksy_context: TricksyContext):
+         pass
+
+ class TricksyLayerInputs:
+     def __init__(
+         self,
+         disk_weights: OPTDiskWeights,
+         layer_key_prefix: str = None,
+         next_layer: TricksyLayer = None,
+         sparsity_predictors: List[Callable[[torch.Tensor], torch.Tensor]] = None,
+     ) -> None:
+         self.disk_weights = disk_weights
+         # self.get_weight = lambda key: self.disk_weights.load_memmap_weight(f'{layer_key_prefix}{key}')
+         self.get_weight = lambda key: self.disk_weights.memmap_weights[(f'{layer_key_prefix}{key}')]
+         self.layer_key_prefix = layer_key_prefix
+         self.next_layer = next_layer
+         self.sparsity_predictors = sparsity_predictors
+
+ class TricksyOPTLearnedPositionalEmbedding(TricksyLayer):
+     """
+     This module learns positional embeddings up to a fixed maximum size.
+     """
+
+     def __init__(self, tricksy_context):
+         # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
+         # and adjust num_embeddings appropriately. Other models don't have this hack
+         self.offset = 2
+         self.tricksy_context = tricksy_context
+         self.weight = None
+
+     def __call__(self, *args: Any, **kwds: Any) -> Any:
+         return self.forward(*args, **kwds)
+
+     def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
+         """`input_ids_shape` is expected to be [bsz x seqlen]."""
+         attention_mask = attention_mask.long()
+         # create positions depending on attention_mask
+         positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
+         # cut positions if `past_key_values_length` is > 0
+         positions = positions[:, past_key_values_length:]
+
+         out = F.embedding(positions + self.offset, self.weight)
+         return out
+
+ class TricksyOPTAttention(OPTAttention, TricksyLayer):
+     def __init__(self, tricksy_config: TricksyConfig, inputs: TricksyLayerInputs, tricksy_context: TricksyContext, is_decoder: bool = False, **kwargs):
+         nn.Module.__init__(self)
+         self.tricksy_config = tricksy_config
+         self.config = tricksy_config.opt_config
+
+         def _handle_deprecated_argument(config_arg_name, config, fn_arg_name, kwargs):
+             """
+             If the deprecated argument `fn_arg_name` is passed, raise a deprecation
+             warning and return that value, otherwise take the equivalent config.config_arg_name
+             """
+             val = None
+             if fn_arg_name in kwargs:
+                 print(
+                     f"Passing in {fn_arg_name} to {self.__class__.__name__} is deprecated and won't be supported from v4.38."
+                     " Please set it in the config instead"
+                 )
+                 val = kwargs.pop(fn_arg_name)
+             else:
+                 val = getattr(config, config_arg_name)
+             return val
+
+         self.embed_dim = _handle_deprecated_argument("hidden_size", self.config, "embed_dim", kwargs)
+         self.num_heads = _handle_deprecated_argument("num_attention_heads", self.config, "num_heads", kwargs)
+         self.dropout = _handle_deprecated_argument("attention_dropout", self.config, "dropout", kwargs)
+         self.enable_bias = _handle_deprecated_argument("enable_bias", self.config, "bias", kwargs)
+
+         self.head_dim = self.embed_dim // self.num_heads
+         self.is_causal = True
+
+         if (self.head_dim * self.num_heads) != self.embed_dim:
+             raise ValueError(
+                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                 f" and `num_heads`: {self.num_heads})."
+             )
+         self.scaling = self.head_dim**-0.5
+         self.is_decoder = is_decoder
+
+         # [Tricksy]
+         self.tricksy_context = tricksy_context
+         self.inputs = inputs
+         self.head_dim = self.config.hidden_size // self.config.num_attention_heads
+
+         self.qw = self.kw = self.vw = self.ow = self.qb = self.kb = self.vb = self.out_proj_bias = self.layer_norm_weight = self.layer_norm_bias = None
+         self.q_proj = lambda x: F.linear(x, self.qw, self.qb)
+         self.k_proj = lambda x: F.linear(x, self.kw, self.kb)
+         self.v_proj = lambda x: F.linear(x, self.vw, self.vb)
+         self.out_proj = lambda x: F.linear(x, self.ow, self.out_proj_bias)
+         self.layer_norm = lambda x: F.layer_norm(x, (self.config.hidden_size,), self.layer_norm_weight, self.layer_norm_bias)
+
+     def load_weights(self, tricksy_context: TricksyContext):
+         if self.tricksy_context.is_prompt_phase:
+             # Full weights for prompt phase
+             self.catted_weights, self.catted_biases, self.out_proj_bias, self.layer_norm_weight, self.layer_norm_bias = batch_copy(
+                 [
+                     mmap_to_tensor(self.inputs.get_weight('self_attn.catted_head_weights')[:], pin_memory=True),
+                     mmap_to_tensor(self.inputs.get_weight('self_attn.catted_head_biases')[:], pin_memory=True),
+                     mmap_to_tensor(self.inputs.get_weight('self_attn.out_proj.bias')[:], pin_memory=True),
+                     mmap_to_tensor(self.inputs.get_weight('self_attn_layer_norm.weight')[:], pin_memory=True),
+                     mmap_to_tensor(self.inputs.get_weight('self_attn_layer_norm.bias')[:], pin_memory=True),
+                 ],
+                 tricksy_context.load_weight_stream,
+             )
+             torch.cuda.synchronize()
+             # Weights stored in shape for efficient indexing to support offloading attention heads (not currently being done)
+             self.qw = self.catted_weights[:, :self.head_dim, :].reshape(self.config.hidden_size, self.config.hidden_size).contiguous()
+             self.kw = self.catted_weights[:, self.head_dim:self.head_dim * 2, :].reshape(self.config.hidden_size, self.config.hidden_size).contiguous()
+             self.vw = self.catted_weights[:, self.head_dim * 2:self.head_dim * 3, :].reshape(self.config.hidden_size, self.config.hidden_size).contiguous()
+             self.ow = self.catted_weights[:, self.head_dim * 3:, :].reshape(self.config.hidden_size, self.config.hidden_size).t().contiguous()
+             self.catted_weights = None
+
+             self.qb = self.catted_biases[:, 0, :].reshape(self.config.hidden_size).contiguous()
+             self.kb = self.catted_biases[:, 1, :].reshape(self.config.hidden_size).contiguous()
+             self.vb = self.catted_biases[:, 2, :].reshape(self.config.hidden_size).contiguous()
+             self.catted_biases = None
+
+     def forward(self, hidden_states, **kwargs):
+         # Wait for attention weights to get to GPU
+         torch.cuda.synchronize()
+
+         # Predict MLP sparsity based on attention input
+         self.tricksy_context.indices.mlp_indices_buffer_gpu = topk_and_threshold(
+             self.inputs.sparsity_predictors[0](hidden_states)[0, -1, :],
+             int(self.config.ffn_dim * self.tricksy_config.min_mlp_sparsity_gpu),
+         )
+         self.tricksy_context.indices.copy_mlp_indices_to_cpu()
+         torch.cuda.synchronize()
+
+         # Load MLP weights while computing attention
+         self.inputs.next_layer.load_weights(self.tricksy_context)
+
+         out = super().forward(self.layer_norm(hidden_states), **kwargs)
+
+         # Wait for MLP weights to get to GPU
+         torch.cuda.synchronize()
+
+         return out
+
+ class TricksyOPTDecoderLayer(OPTDecoderLayer):
+     def __init__(self, tricksy_config: TricksyConfig, inputs: TricksyLayerInputs, tricksy_context: TricksyContext):
+         nn.Module.__init__(self)
+         self.tricksy_config = tricksy_config
+         self.config = tricksy_config.opt_config
+         self.embed_dim = self.config.hidden_size
+
+         self.tricksy_context = tricksy_context
+         self.self_attn_layer_inputs = TricksyLayerInputs(
+             disk_weights=inputs.disk_weights,
+             layer_key_prefix=inputs.layer_key_prefix,
+             # While computing attention, load MLP
+             next_layer=self,
+             sparsity_predictors=inputs.sparsity_predictors,
+         )
+         self.self_attn = TricksyOPTAttention(tricksy_config, self.self_attn_layer_inputs, tricksy_context, is_decoder=True)
+
+         self.do_layer_norm_before = self.config.do_layer_norm_before
+         self.dropout = self.config.dropout
+         self.activation_fn = ACT2FN[self.config.activation_function]
+
+         self.inputs = inputs
+         random_mlp_indices_gpu =\
+             torch.randperm(self.config.ffn_dim, device='cpu', dtype=torch.int32)[:int(self.config.ffn_dim * self.tricksy_config.min_mlp_sparsity_gpu)]
+         self.index_cache = SparseMLPCache(gpu_cached_mlp_indices=random_mlp_indices_gpu)
+
+         # identity since we move this to attention layer
+         # extreme tricksy
+         self.self_attn_layer_norm = lambda x: x
+
+         self.fc1_weight = self.fc2_weight = self.final_layer_norm_weight = self.fc1_bias = self.fc2_bias = self.final_layer_norm_bias = None
+         self.ring_idx = 0
+         self.fc1_weight_diff = self.fc2_weight_diff = self.fc1_bias_diff = None
+         self.fc1 = lambda x: F.linear(x, torch.cat([self.fc1_weight, self.fc1_weight_diff]), torch.cat([self.fc1_bias, self.fc1_bias_diff]))
+         self.fc2 = lambda x: F.linear(x, torch.cat([self.fc2_weight, self.fc2_weight_diff]).T, self.fc2_bias)
+         self.final_layer_norm = lambda x: F.layer_norm(x, (self.embed_dim,), self.final_layer_norm_weight, self.final_layer_norm_bias)
+
+     def load_weights(self, tricksy_context: TricksyContext):
+         if self.tricksy_context.is_prompt_phase:
+             # Full weights for prompt phase
+             fc1w = mmap_to_tensor(self.inputs.get_weight('fc1.weight')[:], pin_memory=True)
+             fc1b = mmap_to_tensor(self.inputs.get_weight('fc1.bias')[:], pin_memory=True)
+             fc2w = mmap_to_tensor(self.inputs.get_weight('fc2.weight')[:], pin_memory=True)
+             fc2b = mmap_to_tensor(self.inputs.get_weight('fc2.bias')[:], pin_memory=True)
+             lnw = mmap_to_tensor(self.inputs.get_weight('final_layer_norm.weight')[:], pin_memory=True)
+             lnb = mmap_to_tensor(self.inputs.get_weight('final_layer_norm.bias')[:], pin_memory=True)
+
+             self.fc1_weight, self.fc1_bias, self.fc2_weight, self.fc2_bias, self.final_layer_norm_weight, self.final_layer_norm_bias =\
+                 batch_copy([fc1w, fc1b, fc2w, fc2b, lnw, lnb], tricksy_context.load_weight_stream)
+             self.fc1_weight_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+             self.fc1_bias_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+             self.fc2_weight_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+
+             index_diffs = compute_index_diffs(tricksy_context.indices.mlp_indices_buffer_cpu, [self.index_cache.gpu_cached_mlp_indices])
+             if len(index_diffs) > 0:
+                 gpu_index_diff = index_diffs[0]
+                 self.index_cache.gpu_cached_mlp_indices[gpu_index_diff.off_positions] = gpu_index_diff.off_elements
+
+             self.index_cache.indexed_fc1_weight = fc1w.contiguous().pin_memory()
+             self.index_cache.indexed_fc1_bias = fc1b.contiguous().pin_memory()
+             self.index_cache.indexed_fc2_weight = fc2w.contiguous().pin_memory()
+             return
+         elif self.fc1_weight is None:
+             # Full weights if full offload
+             self.fc1_weight, self.fc1_bias, self.fc2_weight = batch_copy(
+                 [self.index_cache.indexed_fc1_weight, self.index_cache.indexed_fc1_bias, self.index_cache.indexed_fc2_weight],
+                 tricksy_context.load_weight_stream
+             )
+             self.fc1_weight_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+             self.fc1_bias_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+             self.fc2_weight_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+
+         off_elements = torch.tensor(
+             list(set(tricksy_context.indices.mlp_indices_buffer_cpu.tolist()).difference(set(self.index_cache.gpu_cached_mlp_indices.tolist()))),
+             device='cpu',
+             dtype=torch.int32,
+             pin_memory=True
+         )
+         if off_elements.size(0) == 0:
+             self.fc1_weight_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+             self.fc1_bias_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+             self.fc2_weight_diff = torch.tensor([], dtype=self.tricksy_config.dtype, device='cuda')
+             return
+
+         new_ring_idx = (self.ring_idx + off_elements.size(0)) % self.index_cache.gpu_cached_mlp_indices.size(0)
+         if new_ring_idx > self.ring_idx:
+             # single contiguous update
+             self.index_cache.gpu_cached_mlp_indices[self.ring_idx:new_ring_idx] = off_elements
+         elif off_elements.size(0) > 0:
+             split = self.index_cache.gpu_cached_mlp_indices.size(0) - self.ring_idx
+             # end of ring
+             self.index_cache.gpu_cached_mlp_indices[self.ring_idx:] = off_elements[:split]
+             # beginning of ring
+             self.index_cache.gpu_cached_mlp_indices[:new_ring_idx] = off_elements[split:]
+
+         # Allocate
+         self.fc1_weight_diff = torch.empty((off_elements.size(0), self.config.hidden_size), dtype=self.tricksy_config.dtype, device='cuda')
+         self.fc1_bias_diff = torch.empty((off_elements.size(0)), dtype=self.tricksy_config.dtype, device='cuda')
+         self.fc2_weight_diff = torch.empty((off_elements.size(0), self.config.hidden_size), dtype=self.tricksy_config.dtype, device='cuda')
+         # Index
+         fc1wd = self.index_cache.indexed_fc1_weight[off_elements].pin_memory()
+         fc1bd = self.index_cache.indexed_fc1_bias[off_elements].pin_memory()
+         fc2wd = self.index_cache.indexed_fc2_weight[off_elements].pin_memory()
+         # Copy
+         self.fc1_weight_diff, self.fc1_bias_diff, self.fc2_weight_diff = batch_copy([fc1wd, fc1bd, fc2wd], tricksy_context.load_weight_stream)
+
+     def forward(self, *args, **kwargs):
+         # Wait for attention weights to get to GPU
+         torch.cuda.synchronize()
+
+         # Load next layer's attention weights
+         self.inputs.next_layer.load_weights(self.tricksy_context)
+
+         out = super().forward(*args, **kwargs)
+
+         if self.tricksy_config.full_offload:
+             self.fc1_weight = self.fc1_bias = self.fc2_weight = None
+         elif self.tricksy_context.is_prompt_phase:
+             # Only keep sparse MLP weights on GPU after prompt phase
+             self.fc1_weight = self.fc1_weight[self.index_cache.gpu_cached_mlp_indices.to('cuda')]
+             self.fc1_bias = self.fc1_bias[self.index_cache.gpu_cached_mlp_indices.to('cuda')]
+             self.fc2_weight = self.fc2_weight[self.index_cache.gpu_cached_mlp_indices.to('cuda')]
+
+         # Update ring buffers
+         if not self.tricksy_config.full_offload:
+             prev_ring_idx = self.ring_idx
+             self.ring_idx = (self.ring_idx + self.fc1_weight_diff.size(0)) % self.fc1_weight.size(0)
+             if self.ring_idx > prev_ring_idx:
+                 # does not wrap around ring
+                 self.fc1_weight[prev_ring_idx:self.ring_idx] = self.fc1_weight_diff
+                 self.fc1_bias[prev_ring_idx:self.ring_idx] = self.fc1_bias_diff
+                 self.fc2_weight[prev_ring_idx:self.ring_idx] = self.fc2_weight_diff
+             elif self.fc1_weight_diff.size(0) > 0:
+                 # wraps around ring
+                 split = self.fc1_weight_diff.size(0) - self.ring_idx
+                 self.fc1_weight[prev_ring_idx:] = self.fc1_weight_diff[:split]
+                 self.fc1_weight[:self.ring_idx] = self.fc1_weight_diff[split:]
+                 self.fc1_bias[prev_ring_idx:] = self.fc1_bias_diff[:split]
+                 self.fc1_bias[:self.ring_idx] = self.fc1_bias_diff[split:]
+                 self.fc2_weight[prev_ring_idx:] = self.fc2_weight_diff[:split]
+                 self.fc2_weight[:self.ring_idx] = self.fc2_weight_diff[split:]
+             self.fc1_weight_diff = self.fc2_weight_diff = self.fc1_bias_diff = None
+
+         self.tricksy_context.layer += 1
+         return out
+
+ class TricksyOPTDecoder(OPTDecoder, TricksyLayer):
+     def __init__(self, tricksy_config: TricksyConfig, disk_weights: OPTDiskWeights, tricksy_opt_for_causal_lm, tricksy_context: TricksyContext):
+         nn.Module.__init__(self)
+         self.config = tricksy_config.opt_config
+         self.dropout = self.config.dropout
+         self.layerdrop = self.config.layerdrop
+         self.padding_idx = self.config.pad_token_id
+         self.max_target_positions = self.config.max_position_embeddings
+         self.vocab_size = self.config.vocab_size
+         self._use_flash_attention_2 = False
+         self.gradient_checkpointing = False
+         self.project_out = None
+         self.project_in = None
+
+         self.embed_tokens_weight = None
+         self.embed_positions = TricksyOPTLearnedPositionalEmbedding(tricksy_context)
+
+         self.tricksy_context = tricksy_context
+         self.layers: List[TricksyOPTDecoderLayer] = []
+         for i in range(self.config.num_hidden_layers):
+             pretrained_layer_num = self.config.num_hidden_layers - i - 1
+             sparsity_predictors = [load_mlp_sparsity_predictor(disk_weights.weight_path, pretrained_layer_num, tricksy_config.dtype)]
+             if sparsity_predictors[0] is None:
+                 sparsity_predictors[0] = lambda x: F.linear(x, torch.rand((self.config.ffn_dim, self.config.hidden_size), device='cuda', dtype=tricksy_config.dtype))
+             self.layers.append(TricksyOPTDecoderLayer(
+                 tricksy_config,
+                 TricksyLayerInputs(
+                     disk_weights=disk_weights,
+                     layer_key_prefix=f'decoder.layers.{pretrained_layer_num}.',
+                     # While computing MLP, load next attention
+                     # While computing last MLP, load output embeddings (stored in TricksyOPTForCausalLM)
+                     next_layer=self.layers[i - 1].self_attn if i > 0 else tricksy_opt_for_causal_lm,
+                     sparsity_predictors=sparsity_predictors,
+                 ),
+                 tricksy_context,
+             ))
+         self.layers.reverse()
+
+         self.final_layer_norm = lambda x: x
+         self.inputs = TricksyLayerInputs(disk_weights=disk_weights, layer_key_prefix='decoder.')
+
+     def embed_tokens(self, x):
+         return F.embedding(x, self.embed_tokens_weight, self.padding_idx)
+
+     def load_weights(self, tricksy_context: TricksyContext):
+         if self.embed_tokens_weight is None:
+             self.embed_tokens_weight, self.embed_positions.weight = batch_copy(
+                 [
+                     mmap_to_tensor(self.inputs.get_weight('embed_tokens.weight')[:], pin_memory=True),
+                     mmap_to_tensor(self.inputs.get_weight('embed_positions.weight')[:], pin_memory=True),
+                 ],
+                 tricksy_context.load_weight_stream,
+             )
+
+     def forward(self, *args, **kwargs):
+         # Wait for input embedding weights to get to GPU
+         torch.cuda.synchronize()
+
+         # While computing input embeddings, load first attention
+         self.layers[0].self_attn.load_weights(self.tricksy_context)
+
+         out = super().forward(*args, **kwargs)
+
+         # Wait for output embedding weights to get to GPU
+         torch.cuda.synchronize()
+
+         # No longer prompt phase after first full pass
+         self.tricksy_context.is_prompt_phase = False
+         # Load input embeddings while computing output
+         self.load_weights(self.tricksy_context)
+
+         return out
+
+ class TricksyOPTModel(OPTModel):
+     def __init__(self, tricksy_config: TricksyConfig, disk_weights: OPTDiskWeights, tricksy_opt_for_causal_lm, tricksy_context: TricksyContext):
+         nn.Module.__init__(self)
+         self.config = tricksy_config.opt_config
+         self.tricksy_context = tricksy_context
+         self.decoder = TricksyOPTDecoder(tricksy_config, disk_weights, tricksy_opt_for_causal_lm, tricksy_context)
+
+     def forward(self, *args, **kwargs):
+         out = super().forward(*args, **kwargs)
+         return out
+
+ # who's got the weights?
+ # [InputEmbedding, Attention.0, MLP.0, Attention.1, MLP.1, ..., OutputEmbedding]
+ # [TricksyOPTDecoder, TricksyOPTAttention.0, TricksyOPTDecoderLayer.0, TricksyOPTAttention.1, TricksyOPTDecoderLayer.1, ..., TricksyOPTForCausalLM]
+ #
+ # 1. Prompt pass: Before computing layer, send full dense weights to GPU. After computing layer, only keep sparse weights on GPU.
+ # 2. Generation passes: Before computing layer, compute and send sparse weight diff to GPU.
+ class TricksyOPTForCausalLM(OPTForCausalLM, TricksyLayer):
+     def __init__(self, tricksy_config: TricksyConfig, disk_weights: OPTDiskWeights):
+         nn.Module.__init__(self)
+         self.config = disk_weights.config
+         self.generation_config = GenerationConfig.from_model_config(self.config) if self.can_generate() else None
+
+         self.tricksy_context = TricksyContext(tricksy_config, self.config)
+         self.model = TricksyOPTModel(tricksy_config, disk_weights, self, self.tricksy_context)
+
+         self.final_layer_norm_weight = self.lm_head_weight = self.final_layer_norm_bias = None
+         # double stacking tricksy!
+         self.final_layer_norm = lambda x: F.layer_norm(x, (self.config.hidden_size,), self.final_layer_norm_weight, self.final_layer_norm_bias)
+         self.lm_head = lambda x: F.linear(self.final_layer_norm(x), self.lm_head_weight)
+
+         self.inputs = TricksyLayerInputs(disk_weights=disk_weights, layer_key_prefix='decoder.', next_layer=self.model.decoder)
+
+     def load_weights(self, tricksy_context: TricksyContext):
+         if self.final_layer_norm_weight is None:
+             self.final_layer_norm_weight, self.lm_head_weight, self.final_layer_norm_bias = batch_copy(
+                 [
+                     mmap_to_tensor(self.inputs.get_weight('final_layer_norm.weight')[:], pin_memory=True),
+                     mmap_to_tensor(self.inputs.get_weight('embed_tokens.weight')[:], pin_memory=True),
+                     mmap_to_tensor(self.inputs.get_weight('final_layer_norm.bias')[:], pin_memory=True),
+                 ],
+                 tricksy_context.load_weight_stream,
+             )
+
+     def forward(self, *args, **kwargs):
+         torch.cuda.synchronize()
+         start = time.time()
+         out = super().forward(*args, **kwargs)
+         torch.cuda.synchronize()
+         self.tricksy_context.forward_times.append(time.time() - start)
+         self.tricksy_context.layer = 0
+         return out
+
+     def generate(self, *args, **kwargs):
+         # Load input embeddings for first token
+         self.model.decoder.load_weights(self.tricksy_context)
+         torch.cuda.synchronize()
+         out = super().generate(*args, **kwargs)
+         return out
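For orientation, the intended end-to-end flow of these classes outside the Streamlit app looks roughly like the sketch below; it assumes a CUDA device, the dependencies listed earlier, and enough disk space for the preprocessed memmap weights:

    from transformers import AutoTokenizer, set_seed
    from configuration_tricksy import TricksyConfig
    from modeling_tricksy import TricksyOPTForCausalLM, OPTDiskWeights

    set_seed(42)
    model_name = 'facebook/opt-6.7b'
    disk_weights = OPTDiskWeights(model_name)  # downloads or builds the memmap weight files on disk
    model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config), disk_weights)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    input_ids = tokenizer('Making pesto from scratch', return_tensors='pt').input_ids.to('cuda')
    output_ids = model.generate(input_ids, max_new_tokens=50, do_sample=True, top_k=50, top_p=0.9)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))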
util.py ADDED
@@ -0,0 +1,83 @@
+ import os
+ from typing import List, Callable
+
+ import numpy as np
+ import torch
+ from torch.nn import functional as F
+
+ np_dtype_to_torch_dtype = {
+     np.float16: torch.float16,
+     np.float32: torch.float32,
+     np.uint8: torch.uint8,
+     np.int8: torch.int8,
+     np.int32: torch.int32,
+     np.int64: torch.int64,
+     bool: torch.bool,
+ }
+
+ class IndexDiff:
+     def __init__(self, off_elements: torch.Tensor=None, off_positions: torch.Tensor=None, on_positions: torch.Tensor=None):
+         self.off_elements = off_elements
+         self.off_positions = off_positions
+         self.on_positions = on_positions
+
+ def batch_copy(sources: List[torch.Tensor], copy_stream, indices=None, device='cuda'):
+     with torch.cuda.stream(copy_stream):
+         out = ()
+         for src in sources:
+             indexed = src[indices] if indices is not None else src
+             dst = torch.empty(indexed.shape, device=device, dtype=src.dtype)
+             dst.copy_(indexed, non_blocking=True)
+             out += (dst,)
+     return out
+
+ def mmap_to_tensor(torch_wrapped_mmap, pin_memory=False) -> torch.Tensor:
+     out = torch.empty(torch_wrapped_mmap.shape, dtype=torch_wrapped_mmap.dtype, device='cpu', pin_memory=pin_memory)
+     out.copy_(torch_wrapped_mmap)
+     return out
+
+ # Assuming that each entry of cached_indices is a step down the memory hierarchy,
+ # compute the diff at each level of the hierarchy.
+ # e.g. the first loop computes the indices that the GPU does not have,
+ # and the second loop computes the indices *of that diff* that the CPU does not have.
+ def compute_index_diffs(new_indices: torch.Tensor, cached_indices_list: List[torch.Tensor], pin_memory=True):
+     diffs = []
+     current_diff = new_indices
+     for cached_indices in cached_indices_list:
+         if current_diff.size(0) == 0:
+             # No need to go further down the hierarchy
+             break
+
+         # Compute elements of new indices not contained in current indices
+         off_elements = torch.tensor(
+             list(set(current_diff.tolist()).difference(set(cached_indices.tolist()))),
+             device='cpu',
+             dtype=torch.int32,
+             pin_memory=pin_memory
+         )
+         # Mask of cached positions whose element is still wanted by the new indices; the complement gives evictable positions
+         on_position_mask = torch.isin(cached_indices, current_diff, assume_unique=True)
+         on_positions = torch.nonzero(on_position_mask).flatten()
+         off_positions = torch.nonzero(~on_position_mask).flatten()[:off_elements.size(0)]
+
+         diffs.append(IndexDiff(off_elements, off_positions, on_positions))
+         current_diff = off_elements
+     return diffs
+
+ def topk_and_threshold(x, k, threshold=1):
+     vals, indices = torch.topk(x, k, sorted=True)
+     return indices[vals > threshold].int()
+
+ def load_mlp_sparsity_predictor(weight_path_prefix: str, layer_num: int, dtype: torch.dtype, device: str = 'cuda') -> Callable:
+     path_prefix = f'{weight_path_prefix}decoder.layers.{layer_num}.attn.mlp-sparsity-predictor.'
+     return load_predictor(path_prefix, dtype, device=device)
+
+ def load_predictor(path_prefix: str, dtype: torch.dtype, device: str='cuda') -> Callable:
+     path = lambda i: os.path.expanduser(f'{path_prefix}{i}.weight')
+     if os.path.exists(path(1)):
+         l1 = torch.load(path(1)).to(device).to(dtype)
+         l2 = torch.load(path(2)).to(device).to(dtype)
+         return lambda x: F.linear(F.linear(x, l1), l2)
+     else:
+         print(f'could not find predictor at {path(1)}')
+         return None
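To make the index bookkeeping concrete, here is a small CPU-only sketch (illustrative values only, not part of the commit) of how topk_and_threshold and compute_index_diffs work together:

    import torch
    from util import topk_and_threshold, compute_index_diffs

    # Pretend sparsity-predictor scores for an 8-neuron MLP; keep the top 4 above the default threshold.
    scores = torch.tensor([0.1, 5.0, 2.0, 0.2, 7.0, 3.0, 0.0, 4.0])
    new_indices = topk_and_threshold(scores, 4)    # e.g. tensor([4, 1, 7, 5], dtype=torch.int32)

    # Indices currently cached on the GPU (the cache itself lives as a CPU tensor).
    cached = torch.tensor([1, 2, 3, 4], dtype=torch.int32)

    diffs = compute_index_diffs(new_indices, [cached], pin_memory=False)
    print(diffs[0].off_elements)   # neurons the GPU is missing, e.g. tensor([5, 7], dtype=torch.int32)
    print(diffs[0].off_positions)  # cache slots that can be overwritten with those rows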