austinsilveria committed
Commit 2f9c62d
1 Parent(s): 33ad5e9

switch to gradio for concurrency management

Files changed (3)
  1. app.py +109 -56
  2. modeling_tricksy.py +18 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,65 +1,118 @@
- from threading import Thread
  import gc
  import time
+ from threading import Thread

- import streamlit as st
+ import gradio as gr

  import torch
  from transformers import AutoTokenizer, TextIteratorStreamer, set_seed
  from modeling_tricksy import TricksyOPTForCausalLM, OPTDiskWeights
  from configuration_tricksy import TricksyConfig

- if 'submit' in st.session_state and st.session_state.submit == True:
-     st.session_state.generating = True
- else:
-     st.session_state.generating = False
-
- prompt = st.text_area('Prompt', 'Making pesto from scratch can be done with these ingredients in 4 simple steps:\nStep 1')
-
- col1, col2 = st.columns(2)
-
- with st.expander('Additional options'):
-     max_new_tokens = st.slider('Max new tokens', 1, 500, 50)
-     top_k = st.slider('Top-k sampling', 1, 500, 50)
-     top_p = st.slider('Top-p (nucleus sampling)', 0.0, 1.0, .9)
-
- out = st.chat_message('user')
- stats = st.empty()
-
- with col1:
-     use_tricksy = st.toggle('Use Tricksy', True, help='If true, only send sparse MLP weight diffs to GPU. If false, send all weights to GPU.')
- with col2:
-     if st.button('Submit', disabled=st.session_state.generating, key='submit'):
-         set_seed(42)
-         # 13.4 GB (16 bit)
-         model_name = 'facebook/opt-6.7b'
-         disk_weights = OPTDiskWeights(model_name)
-         tricksy_model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config, full_offload=(not use_tricksy)), disk_weights)
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-         inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
-
-         print()
-         generation_kwargs = dict(inputs=inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p)
-         thread = Thread(target=tricksy_model.generate, kwargs=generation_kwargs)
-         thread.start()
-         generated_text = ''
-         with out:
-             t = st.empty()
-             for new_text in streamer:
-                 generated_text += new_text.replace('\n', ' \n')
-                 t.write(generated_text)
-
-         stats_text = f'Decoding tok/s: {1 / (sum(tricksy_model.tricksy_context.forward_times[1:]) / (len(tricksy_model.tricksy_context.forward_times) - 1))}'
-         stats_text += f' \nCurrent GPU mem usage: {torch.cuda.memory_allocated("cuda") / 1024 ** 3} GB'
-         stats_text += f' \nMax GPU mem usage: {torch.cuda.max_memory_allocated("cuda") / 1024 ** 3} GB'
-         stats.write(stats_text)
-
-         disk_weights = None
-         tricksy_model = None
-         time.sleep(.2)
-         # st.write(f'num open files: {len(psutil.Process().open_files())}')
-         torch.cuda.empty_cache()
-         gc.collect()
-         torch.cuda.reset_peak_memory_stats()
+ def generate_text(prompt, max_new_tokens, top_k, top_p, use_tricksy):
+     set_seed(42)
+     model_name = 'facebook/opt-6.7b'
+     disk_weights = OPTDiskWeights(model_name)
+     tricksy_model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config, full_offload=(not use_tricksy)), disk_weights)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+     inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
+
+     generation_kwargs = dict(inputs=inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p)
+     thread = Thread(target=tricksy_model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     generated_text = ''
+     for new_text in streamer:
+         generated_text += new_text
+         yield generated_text, ''
+
+     stats_text = f'Decoding tok/s: {1 / (sum(tricksy_model.tricksy_context.forward_times[1:]) / (len(tricksy_model.tricksy_context.forward_times) - 1))}'
+     stats_text += f' \nCurrent GPU mem usage: {torch.cuda.memory_allocated("cuda") / 1024 ** 3} GB'
+     stats_text += f' \nMax GPU mem usage: {torch.cuda.max_memory_allocated("cuda") / 1024 ** 3} GB'
+
+     disk_weights = None
+     tricksy_model.clear()
+     tricksy_model = None
+     time.sleep(.2)
+     torch.cuda.empty_cache()
+     gc.collect()
+     torch.cuda.reset_peak_memory_stats()
+
+     yield generated_text, stats_text
+
+ css = """
+ h1 {
+     text-align: center;
+     display:block;
+ }
+ """
+
+ with gr.Blocks(css=css) as iface:
+     gr.Markdown('# Tricksy-OPT 6.7b')
+     with gr.Row():
+         with gr.Column():
+             prompt = gr.Text(label="Prompt", value='Making pesto from scratch can be done with these ingredients in 4 simple steps:\nStep 1')
+             with gr.Accordion("Additional inputs"):
+                 use_tricksy = gr.Checkbox(value=True, label="Use Tricksy", info="If true, only send the sparse MLP weight diff to the GPU. If false, send all weights to the GPU.")
+                 max_new_tokens = gr.Slider(minimum=1, maximum=500, value=100 if use_tricksy else 30, label="Max new tokens")
+                 top_k = gr.Slider(minimum=1, maximum=500, value=50, label="Top-k sampling")
+                 top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top-p (nucleus sampling)")
+             use_tricksy.change(
+                 lambda x: 100 if x else 20,
+                 inputs=[use_tricksy],
+                 outputs=[max_new_tokens]
+             )
+         with gr.Column():
+             out = gr.Textbox(label="Generated Text")
+             stats = gr.Textbox(label="Statistics")
+     btn = gr.Button("Generate")
+     btn.click(
+         generate_text,
+         inputs=[
+             prompt,
+             max_new_tokens,
+             top_k,
+             top_p,
+             use_tricksy,
+         ],
+         outputs=[out, stats]
+     )
+     with gr.Accordion("Description", open=False):
+         gr.Markdown('''
+ MLP layers of large language models are naturally sparse--e.g. > 99% of layer 3's and > 90% of layer 20's neurons in OPT-1.3b have no effect (due to relu) for most inputs. Adjacent tokens also share a significant number of active neurons--e.g. for layers 1-7 of OPT-1.3b, > 90% of neurons active for token k are also active for token k + 1 (and 60-65% for layers 20-23).
+
+ We exploit this natural sparsity to minimize CPU-GPU data transfer.
+
+ ### At initialization, we:
+ 1. Store a subset of each MLP layer (e.g. 30%) and full attention layers on the GPU (similar to [LLM in a flash](https://arxiv.org/abs/2312.11514))
+ 2. Store full MLP layers in CPU RAM
+ 3. Store a cache of which neuron indices we currently have on the GPU
+
+ ### Before each decoder layer's forward pass, we:
+ 1. Predict active MLP neurons based on the attention layer input (following [Deja Vu](https://proceedings.mlr.press/v202/liu23am/liu23am.pdf))
+
+ ### During each decoder layer's attention computation, we, asynchronously on the CPU:
+ 1. Compute the difference between the set of predicted active neuron indices and the set of neuron indices we currently have on the GPU
+ 2. Index those neurons from CPU RAM
+ 3. Copy them to the GPU
+ 4. Update the layer's neuron indices cache
+
+ ### And finally, during each decoder layer's MLP computation, we:
+ 1. Concatenate the newly received neuron diff with our existing neurons
+ 2. Compute the MLP (**Note**: As long as fully-connected layer 1 and fully-connected layer 2 share the same neuron ordering, the full two-layer computation is invariant with respect to neuron ordering.)
+ 3. Overwrite a subset of our neuron buffer with the diff (FIFO order)
+ 4. Delete the diff
+
+ ## Limitations
+ 1. This is approximate inference. The active neuron predictors do not have perfect recall, leading to slight accuracy degradation. See the [Deja Vu paper](https://proceedings.mlr.press/v202/liu23am/liu23am.pdf) for an in-depth evaluation.
+
+ ## Potential Improvements
+ 1. Evaluations--to push the sparsity levels, we need evaluations to measure accuracy degradation.
+ 2. Indexing the non-contiguous neuron diff from CPU RAM comes nowhere near saturating CPU-RAM memory bandwidth. We may be able to improve this with a custom C++ indexer.
+ 3. Early layers are extremely sparse while later layers are less sparse--perhaps we can allocate smaller GPU neuron buffers to early layers to free up space for larger buffers for later layers.
+ 4. Applying an advanced index to a pinned tensor in PyTorch will return an unpinned copy of the indexed data, which means it needs to be recopied to pinned memory before it can be sent to the GPU. If we can override this default PyTorch behavior to allow direct CPU-GPU copying from a specified advanced index without intermediate copies, we should get a nice speedup.
+ ''')
+
+ iface.queue().launch(show_api=False)
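
The commit message cites concurrency management as the reason for the switch: generate_text is now a plain Python generator, and iface.queue() has Gradio buffer incoming requests so partial output still streams to each client while the shared GPU runs one generation at a time (a single queue worker by default). A minimal, self-contained sketch of that generator-plus-queue pattern, with a dummy slow_echo handler standing in for the model (illustrative only, not part of this repo):

```python
import time

import gradio as gr

def slow_echo(prompt):
    # Yielding partial results streams them to the browser, mirroring how
    # generate_text above yields (generated_text, stats) as tokens arrive.
    text = ''
    for word in prompt.split():
        text += word + ' '
        time.sleep(0.1)  # stand-in for one decoding step
        yield text, ''
    yield text, f'{len(prompt.split())} words'

with gr.Blocks() as demo:
    inp = gr.Text(label='Prompt')
    out = gr.Textbox(label='Generated Text')
    stats = gr.Textbox(label='Statistics')
    gr.Button('Generate').click(slow_echo, inputs=[inp], outputs=[out, stats])

# queue() makes overlapping requests wait their turn instead of running the
# handler concurrently, which keeps one shared GPU from being oversubscribed.
demo.queue().launch(show_api=False)
```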
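
The accordion description above explains the neuron-diff transfer in prose. Below is a rough conceptual sketch of the set difference, CPU-RAM indexing, and FIFO buffer update for a single layer's fc1 weights; every name and shape is hypothetical and chosen only to illustrate the idea (see modeling_tricksy.py for the real implementation):

```python
import torch

hidden, ffn = 4096, 16384          # OPT-6.7b-like MLP dimensions (assumed)
buffer_size = int(0.3 * ffn)       # ~30% of neurons cached on the GPU

# Full fc1 weights stay in CPU RAM (pinned when CUDA is available).
cpu_fc1 = torch.randn(ffn, hidden, pin_memory=torch.cuda.is_available())

# Cache of which neuron indices currently live in the GPU buffer.
gpu_neuron_ids = torch.randperm(ffn)[:buffer_size]

def fetch_neuron_diff(predicted_active: torch.Tensor) -> torch.Tensor:
    """Copy only the predicted-active neurons not already on the GPU."""
    global gpu_neuron_ids
    already_on_gpu = torch.isin(predicted_active, gpu_neuron_ids)
    diff_ids = predicted_active[~already_on_gpu]

    # Advanced indexing into CPU RAM; note the result is an unpinned copy,
    # which is the limitation called out in "Potential Improvements" above.
    diff_weights = cpu_fc1[diff_ids]
    if torch.cuda.is_available():
        diff_weights = diff_weights.to('cuda', non_blocking=True)

    # FIFO update: evict the oldest cached neuron ids, append the new ones.
    gpu_neuron_ids = torch.cat([gpu_neuron_ids[diff_ids.numel():], diff_ids])
    return diff_weights

# Pretend the predictor flagged 4,000 neurons as active for the next token.
diff = fetch_neuron_diff(torch.randperm(ffn)[:4000])
print(diff.shape)  # (number of newly needed neurons, hidden)
```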
modeling_tricksy.py CHANGED
@@ -289,6 +289,9 @@ class TricksyOPTAttention(OPTAttention, TricksyLayer):
          self.v_proj = lambda x: F.linear(x, self.vw, self.vb)
          self.out_proj = lambda x: F.linear(x, self.ow, self.out_proj_bias)
          self.layer_norm = lambda x: F.layer_norm(x, (self.config.hidden_size,), self.layer_norm_weight, self.layer_norm_bias)
+
+     def clear(self):
+         self.qw = self.kw = self.vw = self.ow = self.qb = self.kb = self.vb = self.out_proj_bias = self.layer_norm_weight = self.layer_norm_bias = None

      def load_weights(self, tricksy_context: TricksyContext):
          if self.tricksy_context.is_prompt_phase:
@@ -375,6 +378,10 @@ class TricksyOPTDecoderLayer(OPTDecoderLayer):
          self.fc2 = lambda x: F.linear(x, torch.cat([self.fc2_weight, self.fc2_weight_diff]).T, self.fc2_bias)
          self.final_layer_norm = lambda x: F.layer_norm(x, (self.embed_dim,), self.final_layer_norm_weight, self.final_layer_norm_bias)

+     def clear(self):
+         self.fc1_weight = self.fc2_weight = self.final_layer_norm_weight = self.fc1_bias = self.fc2_bias = self.final_layer_norm_bias = None
+         self.fc1_weight_diff = self.fc2_weight_diff = self.fc1_bias_diff = None
+
      def load_weights(self, tricksy_context: TricksyContext):
          if self.tricksy_context.is_prompt_phase:
              # Full weights for prompt phase
@@ -525,6 +532,11 @@ class TricksyOPTDecoder(OPTDecoder, TricksyLayer):
          self.final_layer_norm = lambda x: x
          self.inputs = TricksyLayerInputs(disk_weights=disk_weights, layer_key_prefix='decoder.')

+     def clear(self):
+         self.embed_tokens_weight = self.embed_positions.weight = None
+         for layer in self.layers:
+             layer.clear()
+
      def embed_tokens(self, x):
          return F.embedding(x, self.embed_tokens_weight, self.padding_idx)

@@ -563,6 +575,9 @@ class TricksyOPTModel(OPTModel):
          self.config = tricksy_config.opt_config
          self.tricksy_context = tricksy_context
          self.decoder = TricksyOPTDecoder(tricksy_config, disk_weights, tricksy_opt_for_causal_lm, tricksy_context)
+
+     def clear(self):
+         self.decoder.clear()

      def forward(self, *args, **kwargs):
          out = super().forward(*args, **kwargs)
@@ -589,6 +604,9 @@ class TricksyOPTForCausalLM(OPTForCausalLM, TricksyLayer):
          self.lm_head = lambda x: F.linear(self.final_layer_norm(x), self.lm_head_weight)

          self.inputs = TricksyLayerInputs(disk_weights=disk_weights, layer_key_prefix='decoder.', next_layer=self.model.decoder)
+
+     def clear(self):
+         self.model.clear()

      def load_weights(self, tricksy_context: TricksyContext):
          if self.final_layer_norm_weight is None:
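
The new clear() methods walk the module tree and drop every reference the Tricksy layers hold to their weight tensors. That is what lets the gc.collect() and torch.cuda.empty_cache() calls at the end of generate_text actually return GPU memory between requests: CUDA memory is only reclaimed once the last Python reference to a tensor is gone. A minimal illustration with a hypothetical tensor (not repo code):

```python
import gc

import torch

if torch.cuda.is_available():
    w = torch.randn(4096, 4096, device='cuda')         # ~64 MB held by `w`
    torch.cuda.empty_cache()
    print(torch.cuda.memory_allocated() / 1024 ** 2)   # still ~64 MB: `w` is alive

    w = None                                           # analogous to clear()
    gc.collect()
    torch.cuda.empty_cache()
    print(torch.cuda.memory_allocated() / 1024 ** 2)   # ~0 MB: memory released
```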
requirements.txt CHANGED
@@ -1,4 +1,5 @@
  torch>=2.0
  transformers>=4.35
  accelerate>=0.24
- numpy
+ numpy
+ gradio