JRosenkranz committed
Commit f31ad8d (parents: 54015ba, 5c84892)

Merge branch 'main' of https://huggingface.co/ibm-fms/llama3-8b-accelerator

Files changed (1): README.md (+76 -1)
README.md CHANGED
@@ -67,7 +67,7 @@ docker run --rm \
  ibm-fms/llama3-8b-accelerator \
  --token $HF_HUB_TOKEN

- # note: if the weights were downloaded separately (not with the above commands), please place them in the HF_HUB_CACHE directoy and refer to them with /models/<model_name>
+ # note: if the weights were downloaded separately (not with the above commands), please place them in the HF_HUB_CACHE directory and refer to them with /models/<model_name>
  docker run -d --rm --gpus all \
  --name my-tgis-server \
  -p 8033:8033 \
@@ -92,3 +92,78 @@ cd text-generation-inference/integration_tests
  make gen-client
  pip install . --no-cache-dir
  ```
+
+ #### Run Sample
+
+ ```bash
+ python sample_client.py
+ ```
+
+ _Note: the first prompt may be slower, as there is a slight warmup time_
+
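For orientation, here is a rough sketch of what a client such as `sample_client.py` typically reduces to: open a gRPC channel to the port published by `docker run` above and call the generation service through the client produced by `make gen-client`. The commented stub and message names are hypothetical placeholders, not the actual generated API; substitute the names from your generated client package.

```python
# Sketch of a gRPC client for the TGIS server started above; assumes the
# server is listening on the published port 8033.
import grpc

channel = grpc.insecure_channel("localhost:8033")

# The names below are HYPOTHETICAL placeholders for the modules produced by
# `make gen-client`; import and use whatever that step actually generates.
# from <generated_package> import generation_pb2, generation_pb2_grpc
# stub = generation_pb2_grpc.GenerationServiceStub(channel)
# print(stub.Generate(generation_pb2.BatchedGenerationRequest(...)))
```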
+ ### Minimal Sample
+
+ #### Install
+
+ ```bash
+ git clone --branch llama_3_variants --single-branch https://github.com/JRosenkranz/fms-extras
+ (cd fms-extras && pip install -e .)
+ pip install transformers==4.35.0 sentencepiece numpy
+ ```
+
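To confirm the editable install resolved correctly (assuming the installed package is named `fms_extras`, matching the repository layout), a quick check:

```python
# Import check for the editable install; the package name fms_extras is an
# assumption based on the repository layout.
import fms_extras
print(fms_extras.__file__)  # should point into the fms-extras checkout
```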
+ #### Run Sample
+
+ ##### batch_size=1 (compile + cudagraphs)
+
+ ```bash
+ MODEL_PATH=/path/to/llama3/hf/Meta-Llama-3-8B-Instruct
+ python fms-extras/scripts/paged_speculative_inference.py \
+ --architecture=llama3 \
+ --variant=8b \
+ --model_path=$MODEL_PATH \
+ --model_source=hf \
+ --tokenizer=$MODEL_PATH \
+ --speculator_path=ibm-fms/llama3-8b-accelerator \
+ --speculator_source=hf \
+ --speculator_variant=3_2b \
+ --top_k_tokens_per_head=4,3,2,2 \
+ --compile \
+ --compile_mode=reduce-overhead
+ ```
+
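A note on `--top_k_tokens_per_head`: assuming each comma-separated value is the number of candidate tokens kept per speculator head (four heads here, so four-token drafts), the number of speculative continuations the base model verifies per decode step is the product of those values:

```python
import math

# One entry per speculator head, taken from --top_k_tokens_per_head above.
top_k_per_head = [4, 3, 2, 2]

# Each head expands every candidate prefix, so the verification step scores
# the product of the per-head widths: 4 * 3 * 2 * 2 candidate drafts.
print(math.prod(top_k_per_head))  # 48
```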
+ ##### batch_size=1 (compile)
+
+ ```bash
+ MODEL_PATH=/path/to/llama3/hf/Meta-Llama-3-8B-Instruct
+ python fms-extras/scripts/paged_speculative_inference.py \
+ --architecture=llama3 \
+ --variant=8b \
+ --model_path=$MODEL_PATH \
+ --model_source=hf \
+ --tokenizer=$MODEL_PATH \
+ --speculator_path=ibm-fms/llama3-8b-accelerator \
+ --speculator_source=hf \
+ --speculator_variant=3_2b \
+ --top_k_tokens_per_head=4,3,2,2 \
+ --compile
+ ```
+
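The two batch_size=1 variants differ only in `--compile_mode`. Assuming the script forwards this value to `torch.compile`, the `reduce-overhead` mode additionally captures CUDA graphs, replaying a recorded step to cut per-call kernel-launch overhead. A standalone illustration of the two modes:

```python
import torch

def step(x):
    # Stand-in for one decode step of the model.
    return torch.nn.functional.relu(x) * 2.0

# Default mode: kernels are compiled, but each call still issues launches.
compiled = torch.compile(step)

# "reduce-overhead": also uses CUDA graphs so a whole step can be replayed,
# which helps small-batch decode loops dominated by launch overhead.
compiled_cg = torch.compile(step, mode="reduce-overhead")

x = torch.randn(8, device="cuda" if torch.cuda.is_available() else "cpu")
print(compiled(x).sum().item(), compiled_cg(x).sum().item())
```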
+ ##### batch_size=4 (compile)
+
+ ```bash
+ MODEL_PATH=/path/to/llama3/hf/Meta-Llama-3-8B-Instruct
+ python fms-extras/scripts/paged_speculative_inference.py \
+ --architecture=llama3 \
+ --variant=8b \
+ --model_path=$MODEL_PATH \
+ --model_source=hf \
+ --tokenizer=$MODEL_PATH \
+ --speculator_path=ibm-fms/llama3-8b-accelerator \
+ --speculator_source=hf \
+ --speculator_variant=3_2b \
+ --top_k_tokens_per_head=4,3,2,2 \
+ --batch_input \
+ --compile
+ ```
+
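As background for all three runs: in speculative decoding, the speculator drafts a few tokens ahead and the base model verifies the whole draft in a single forward pass, keeping the longest agreeing prefix plus one corrected token. A toy illustration of that greedy acceptance rule (not the fms-extras implementation):

```python
def accept_tokens(drafted, base_greedy):
    """Greedy speculative-decoding acceptance (toy sketch).

    drafted:     k tokens proposed by the speculator.
    base_greedy: k+1 tokens the base model would choose at each position,
                 obtained from one verification forward pass over the draft.
    """
    accepted = []
    for spec_tok, base_tok in zip(drafted, base_greedy):
        if spec_tok != base_tok:
            accepted.append(base_tok)   # first disagreement: take the base
            return accepted             # model's token and stop
        accepted.append(spec_tok)       # speculator was right: keep it
    accepted.append(base_greedy[-1])    # full draft accepted: bonus token
    return accepted

# The speculator drafts [5, 9, 2]; the base model agrees on 5 and 9 but
# would have produced 7 instead of 2, so three tokens are emitted this step.
print(accept_tokens([5, 9, 2], [5, 9, 7, 4]))  # -> [5, 9, 7]
```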
+ Sample code can be found [here](https://github.com/foundation-model-stack/fms-extras/pull/24).