austinsilveria committed
Commit 2f9c62d
1 Parent(s): 33ad5e9

switch to gradio for concurrency management

Files changed (3)
  1. app.py +109 -56
  2. modeling_tricksy.py +18 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,65 +1,118 @@
- from threading import Thread
  import gc
  import time
+ from threading import Thread

- import streamlit as st
+ import gradio as gr

  import torch
  from transformers import AutoTokenizer, TextIteratorStreamer, set_seed
  from modeling_tricksy import TricksyOPTForCausalLM, OPTDiskWeights
  from configuration_tricksy import TricksyConfig

- if 'submit' in st.session_state and st.session_state.submit == True:
-     st.session_state.generating = True
- else:
-     st.session_state.generating = False
-
- prompt = st.text_area('Prompt', 'Making pesto from scratch can be done with these ingredients in 4 simple steps:\nStep 1')
-
- col1, col2 = st.columns(2)
-
- with st.expander('Additional options'):
-     max_new_tokens = st.slider('Max new tokens', 1, 500, 50)
-     top_k = st.slider('Top-k sampling', 1, 500, 50)
-     top_p = st.slider('Top-p (nucleus sampling)', 0.0, 1.0, .9)
-
- out = st.chat_message('user')
- stats = st.empty()
-
- with col1:
-     use_tricksy = st.toggle('Use Tricksy', True, help='If true, only send sparse MLP weight diffs to GPU. If false, send all weights to GPU.')
- with col2:
-     if st.button('Submit', disabled=st.session_state.generating, key='submit'):
-         set_seed(42)
-         # 13.4 GB (16 bit)
-         model_name = 'facebook/opt-6.7b'
-         disk_weights = OPTDiskWeights(model_name)
-         tricksy_model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config, full_offload=(not use_tricksy)), disk_weights)
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-         inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
-
-         print()
-         generation_kwargs = dict(inputs=inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p)
-         thread = Thread(target=tricksy_model.generate, kwargs=generation_kwargs)
-         thread.start()
-         generated_text = ''
-         with out:
-             t = st.empty()
-             for new_text in streamer:
-                 generated_text += new_text.replace('\n', ' \n')
-                 t.write(generated_text)
-
-         stats_text = f'Decoding tok/s: {1 / (sum(tricksy_model.tricksy_context.forward_times[1:]) / (len(tricksy_model.tricksy_context.forward_times) - 1))}'
-         stats_text += f' \nCurrent GPU mem usage: {torch.cuda.memory_allocated("cuda") / 1024 ** 3} GB'
-         stats_text += f' \nMax GPU mem usage: {torch.cuda.max_memory_allocated("cuda") / 1024 ** 3} GB'
-         stats.write(stats_text)
-
-         disk_weights = None
-         tricksy_model = None
-         time.sleep(.2)
-         # st.write(f'num open files: {len(psutil.Process().open_files())}')
-         torch.cuda.empty_cache()
-         gc.collect()
-         torch.cuda.reset_peak_memory_stats()
+ def generate_text(prompt, max_new_tokens, top_k, top_p, use_tricksy):
+     set_seed(42)
+     model_name = 'facebook/opt-6.7b'
+     disk_weights = OPTDiskWeights(model_name)
+     tricksy_model = TricksyOPTForCausalLM(TricksyConfig(disk_weights.config, full_offload=(not use_tricksy)), disk_weights)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+     inputs = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
+
+     generation_kwargs = dict(inputs=inputs, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p)
+     thread = Thread(target=tricksy_model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     generated_text = ''
+     for new_text in streamer:
+         generated_text += new_text
+         yield generated_text, ''
+
+     stats_text = f'Decoding tok/s: {1 / (sum(tricksy_model.tricksy_context.forward_times[1:]) / (len(tricksy_model.tricksy_context.forward_times) - 1))}'
+     stats_text += f' \nCurrent GPU mem usage: {torch.cuda.memory_allocated("cuda") / 1024 ** 3} GB'
+     stats_text += f' \nMax GPU mem usage: {torch.cuda.max_memory_allocated("cuda") / 1024 ** 3} GB'
+
+     disk_weights = None
+     tricksy_model.clear()
+     tricksy_model = None
+     time.sleep(.2)
+     torch.cuda.empty_cache()
+     gc.collect()
+     torch.cuda.reset_peak_memory_stats()
+
+     yield generated_text, stats_text
+
+ css = """
+ h1 {
+     text-align: center;
+     display:block;
+ }
+ """
+
+ with gr.Blocks(css=css) as iface:
+     gr.Markdown('# Tricksy-OPT 6.7b')
+     with gr.Row():
+         with gr.Column():
+             prompt = gr.Text(label="Prompt", value='Making pesto from scratch can be done with these ingredients in 4 simple steps:\nStep 1')
+             with gr.Accordion("Additional inputs"):
+                 use_tricksy = gr.Checkbox(value=True, label="Use Tricksy", info="If true, only send the sparse MLP weight diff to the GPU. If false, send all weights to the GPU.")
+                 max_new_tokens = gr.Slider(minimum=1, maximum=500, value=100 if use_tricksy else 30, label="Max new tokens")
+                 top_k = gr.Slider(minimum=1, maximum=500, value=50, label="Top-k sampling")
+                 top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top-p (nucleus sampling)")
+             use_tricksy.change(
+                 lambda x: 100 if x else 20,
+                 inputs=[use_tricksy],
+                 outputs=[max_new_tokens]
+             )
+         with gr.Column():
+             out = gr.Textbox(label="Generated Text")
+             stats = gr.Textbox(label="Statistics")
+     btn = gr.Button("Generate")
+     btn.click(
+         generate_text,
+         inputs=[
+             prompt,
+             max_new_tokens,
+             top_k,
+             top_p,
+             use_tricksy,
+         ],
+         outputs=[out, stats]
+     )
+     with gr.Accordion("Description", open=False):
+         gr.Markdown('''
+ MLP layers of large language models are naturally sparse--e.g. > 99% of layer 3's and > 90% of layer 20's neurons in OPT-1.3b have no effect (due to relu) for most inputs. Adjacent tokens also share a significant number of active neurons--e.g. for layers 1-7 of OPT-1.3b, > 90% of neurons active for token k are also active for token k + 1 (and 60-65% for layers 20-23).
+
+ We exploit this natural sparsity to minimize CPU-GPU data transfer.
+
+ ### At initialization, we:
+ 1. Store a subset of each MLP layer (e.g. 30%) and full attention layers on the GPU (similar to [LLM in a flash](https://arxiv.org/abs/2312.11514))
+ 2. Store full MLP layers in CPU RAM
+ 3. Store a cache of which neuron indices we currently have on the GPU
+
+ ### Before each decoder layer's forward pass, we:
+ 1. Predict active MLP neurons based on the attention layer input (following [Deja Vu](https://proceedings.mlr.press/v202/liu23am/liu23am.pdf))
+
+ ### During each decoder layer's attention computation, we, asynchronously on the CPU:
+ 1. Compute the difference between the set of predicted active neuron indices and the set of neuron indices we currently have on the GPU
+ 2. Index those neurons from CPU RAM
+ 3. Copy them to the GPU
+ 4. Update the layer's neuron indices cache
+
+ ### And finally, during each decoder layer's MLP computation, we:
+ 1. Concatenate the newly received neuron diff with our existing neurons
+ 2. Compute the MLP (**Note**: As long as fully-connected layer 1 and fully-connected layer 2 share the same neuron ordering, the full two-layer computation is invariant with respect to neuron ordering.)
+ 3. Overwrite a subset of our neuron buffer with the diff (FIFO order)
+ 4. Delete the diff
+
+ ## Limitations
+ 1. This is approximate inference. The active neuron predictors do not have perfect recall, leading to slight accuracy degradation. See the [Deja Vu paper](https://proceedings.mlr.press/v202/liu23am/liu23am.pdf) for an in-depth evaluation.
+
+ ## Potential Improvements
+ 1. Evaluations--to push the sparsity levels, we need evaluations to measure accuracy degradation.
+ 2. Indexing the non-contiguous neuron diff from CPU RAM comes nowhere near saturating CPU-RAM memory bandwidth. We may be able to improve this with a custom C++ indexer.
+ 3. Early layers are extremely sparse while later layers are less sparse--perhaps we can allocate smaller GPU neuron buffers to early layers to free up space for larger buffers for later layers.
+ 4. Applying an advanced index to a pinned tensor in PyTorch will return an unpinned copy of the indexed data, which means it needs to be recopied to pinned memory before it can be sent to the GPU. If we can override this default PyTorch behavior to allow direct CPU-GPU copying from a specified advanced index without intermediate copies, we should get a nice speedup.
+ ''')
+
+ iface.queue().launch(show_api=False)
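
The commit message cites concurrency management as the reason for the switch: generate_text is now a plain Python generator, and iface.queue() has Gradio buffer incoming requests so partial output still streams to each client while the shared GPU runs one generation at a time (a single queue worker by default). A minimal, self-contained sketch of that generator-plus-queue pattern, with a dummy slow_echo handler standing in for the model (illustrative only, not part of this repo):

```python
import time

import gradio as gr

def slow_echo(prompt):
    # Yielding partial results streams them to the browser, mirroring how
    # generate_text above yields (generated_text, stats) as tokens arrive.
    text = ''
    for word in prompt.split():
        text += word + ' '
        time.sleep(0.1)  # stand-in for one decoding step
        yield text, ''
    yield text, f'{len(prompt.split())} words'

with gr.Blocks() as demo:
    inp = gr.Text(label='Prompt')
    out = gr.Textbox(label='Generated Text')
    stats = gr.Textbox(label='Statistics')
    gr.Button('Generate').click(slow_echo, inputs=[inp], outputs=[out, stats])

# queue() makes overlapping requests wait their turn instead of running the
# handler concurrently, which keeps one shared GPU from being oversubscribed.
demo.queue().launch(show_api=False)
```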
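
The accordion description above explains the neuron-diff transfer in prose. Below is a rough conceptual sketch of the set difference, CPU-RAM indexing, and FIFO buffer update for a single layer's fc1 weights; every name and shape is hypothetical and chosen only to illustrate the idea (see modeling_tricksy.py for the real implementation):

```python
import torch

hidden, ffn = 4096, 16384          # OPT-6.7b-like MLP dimensions (assumed)
buffer_size = int(0.3 * ffn)       # ~30% of neurons cached on the GPU

# Full fc1 weights stay in CPU RAM (pinned when CUDA is available).
cpu_fc1 = torch.randn(ffn, hidden, pin_memory=torch.cuda.is_available())

# Cache of which neuron indices currently live in the GPU buffer.
gpu_neuron_ids = torch.randperm(ffn)[:buffer_size]

def fetch_neuron_diff(predicted_active: torch.Tensor) -> torch.Tensor:
    """Copy only the predicted-active neurons not already on the GPU."""
    global gpu_neuron_ids
    already_on_gpu = torch.isin(predicted_active, gpu_neuron_ids)
    diff_ids = predicted_active[~already_on_gpu]

    # Advanced indexing into CPU RAM; note the result is an unpinned copy,
    # which is the limitation called out in "Potential Improvements" above.
    diff_weights = cpu_fc1[diff_ids]
    if torch.cuda.is_available():
        diff_weights = diff_weights.to('cuda', non_blocking=True)

    # FIFO update: evict the oldest cached neuron ids, append the new ones.
    gpu_neuron_ids = torch.cat([gpu_neuron_ids[diff_ids.numel():], diff_ids])
    return diff_weights

# Pretend the predictor flagged 4,000 neurons as active for the next token.
diff = fetch_neuron_diff(torch.randperm(ffn)[:4000])
print(diff.shape)  # (number of newly needed neurons, hidden)
```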
modeling_tricksy.py CHANGED
@@ -289,6 +289,9 @@ class TricksyOPTAttention(OPTAttention, TricksyLayer):
          self.v_proj = lambda x: F.linear(x, self.vw, self.vb)
          self.out_proj = lambda x: F.linear(x, self.ow, self.out_proj_bias)
          self.layer_norm = lambda x: F.layer_norm(x, (self.config.hidden_size,), self.layer_norm_weight, self.layer_norm_bias)
+
+     def clear(self):
+         self.qw = self.kw = self.vw = self.ow = self.qb = self.kb = self.vb = self.out_proj_bias = self.layer_norm_weight = self.layer_norm_bias = None

      def load_weights(self, tricksy_context: TricksyContext):
          if self.tricksy_context.is_prompt_phase:
@@ -375,6 +378,10 @@ class TricksyOPTDecoderLayer(OPTDecoderLayer):
          self.fc2 = lambda x: F.linear(x, torch.cat([self.fc2_weight, self.fc2_weight_diff]).T, self.fc2_bias)
          self.final_layer_norm = lambda x: F.layer_norm(x, (self.embed_dim,), self.final_layer_norm_weight, self.final_layer_norm_bias)

+     def clear(self):
+         self.fc1_weight = self.fc2_weight = self.final_layer_norm_weight = self.fc1_bias = self.fc2_bias = self.final_layer_norm_bias = None
+         self.fc1_weight_diff = self.fc2_weight_diff = self.fc1_bias_diff = None
+
      def load_weights(self, tricksy_context: TricksyContext):
          if self.tricksy_context.is_prompt_phase:
              # Full weights for prompt phase
@@ -525,6 +532,11 @@ class TricksyOPTDecoder(OPTDecoder, TricksyLayer):
          self.final_layer_norm = lambda x: x
          self.inputs = TricksyLayerInputs(disk_weights=disk_weights, layer_key_prefix='decoder.')

+     def clear(self):
+         self.embed_tokens_weight = self.embed_positions.weight = None
+         for layer in self.layers:
+             layer.clear()
+
      def embed_tokens(self, x):
          return F.embedding(x, self.embed_tokens_weight, self.padding_idx)

@@ -563,6 +575,9 @@ class TricksyOPTModel(OPTModel):
          self.config = tricksy_config.opt_config
          self.tricksy_context = tricksy_context
          self.decoder = TricksyOPTDecoder(tricksy_config, disk_weights, tricksy_opt_for_causal_lm, tricksy_context)
+
+     def clear(self):
+         self.decoder.clear()

      def forward(self, *args, **kwargs):
          out = super().forward(*args, **kwargs)
@@ -589,6 +604,9 @@ class TricksyOPTForCausalLM(OPTForCausalLM, TricksyLayer):
          self.lm_head = lambda x: F.linear(self.final_layer_norm(x), self.lm_head_weight)

          self.inputs = TricksyLayerInputs(disk_weights=disk_weights, layer_key_prefix='decoder.', next_layer=self.model.decoder)
+
+     def clear(self):
+         self.model.clear()

      def load_weights(self, tricksy_context: TricksyContext):
          if self.final_layer_norm_weight is None:
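
The new clear() methods walk the module tree and drop every reference the Tricksy layers hold to their weight tensors. That is what lets the gc.collect() and torch.cuda.empty_cache() calls at the end of generate_text actually return GPU memory between requests: CUDA memory is only reclaimed once the last Python reference to a tensor is gone. A minimal illustration with a hypothetical tensor (not repo code):

```python
import gc

import torch

if torch.cuda.is_available():
    w = torch.randn(4096, 4096, device='cuda')         # ~64 MB held by `w`
    torch.cuda.empty_cache()
    print(torch.cuda.memory_allocated() / 1024 ** 2)   # still ~64 MB: `w` is alive

    w = None                                           # analogous to clear()
    gc.collect()
    torch.cuda.empty_cache()
    print(torch.cuda.memory_allocated() / 1024 ** 2)   # ~0 MB: memory released
```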
requirements.txt CHANGED
@@ -1,4 +1,5 @@
  torch>=2.0
  transformers>=4.35
  accelerate>=0.24
- numpy
+ numpy
+ gradio