JRosenkranz committed

Commit 4474d9e
1 Parent(s): d95c54e

Update README.md

Files changed (1):
  1. README.md +13 -13
README.md CHANGED
@@ -13,7 +13,7 @@ pip install -e .
 
 ## Description
 
-This model is intended to be used as an accelerator for [granite 7B (instruct lab)](https://huggingface.co/instructlab/granite-7b-lab) and takes inspiration
+This model is intended to be used as an accelerator for [granite-7b-instruct](https://huggingface.co/ibm-granite/granite-7b-instruct) and takes inspiration
 from the Medusa speculative decoding architecture. This accelerator modifies the MLP into a multi-stage MLP, where each stage predicts
 a single token in the draft based on both a state vector and sampled token
 from the prior stage (the base model can be considered stage 0).
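
For readers new to this architecture, here is a minimal sketch of what a multi-stage MLP speculator of the kind described above might look like. This is an illustration only, not the ibm-granite implementation; all class names, dimensions, and the greedy sampling are assumptions chosen for clarity.

```python
# Illustrative sketch of a multi-stage MLP speculator -- NOT the actual
# ibm-granite implementation. Names and dimensions are assumptions.
import torch
import torch.nn as nn

class SpeculatorStage(nn.Module):
    """One draft stage: combines the running state vector with the embedding
    of the token sampled at the previous stage, then predicts the next token."""
    def __init__(self, hidden_dim: int, vocab_size: int):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, hidden_dim)
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.GELU(),
        )
        self.lm_head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, state: torch.Tensor, prev_token: torch.Tensor):
        # Condition on both the state vector and the previously sampled token.
        x = torch.cat([state, self.token_emb(prev_token)], dim=-1)
        new_state = self.mlp(x)
        return new_state, self.lm_head(new_state)

def draft_tokens(stages, base_state, base_token):
    """Run the stages sequentially; the base model acts as stage 0,
    supplying the initial state vector and the last sampled token."""
    state, token, draft = base_state, base_token, []
    for stage in stages:
        state, logits = stage(state, token)
        token = logits.argmax(dim=-1)  # greedy here for simplicity
        draft.append(token)
    return draft
```

In the real accelerator each stage retains several candidate tokens rather than a single greedy one; see the `--top_k_tokens_per_head` flag further down.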
@@ -47,14 +47,14 @@ TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.ddc56ee
 
 docker pull $TGIS_IMAGE
 
-# optionally download granite-7b-lab if the weights do not already exist
+# optionally download granite-7b-instruct if the weights do not already exist
 docker run --rm \
     -v $HF_HUB_CACHE:/models \
     -e HF_HUB_CACHE=/models \
     -e TRANSFORMERS_CACHE=/models \
     $TGIS_IMAGE \
     text-generation-server download-weights \
-    ibm-granite/granite-7b-lab \
+    ibm-granite/granite-7b-instruct \
     --token $HF_HUB_TOKEN
 
 # optionally download the speculator model if the weights do not already exist
@@ -64,7 +64,7 @@ docker run --rm \
     -e TRANSFORMERS_CACHE=/models \
     $TGIS_IMAGE \
     text-generation-server download-weights \
-    ibm-granite/granite-7b-lab-accelerator \
+    ibm-granite/granite-7b-instruct-accelerator \
     --token $HF_HUB_TOKEN
 
 # note: if the weights were downloaded separately (not with the above commands), please place them in the HF_HUB_CACHE directory and refer to them with /models/<model_name>
@@ -74,8 +74,8 @@ docker run -d --rm --gpus all \
    -v $HF_HUB_CACHE:/models \
    -e HF_HUB_CACHE=/models \
    -e TRANSFORMERS_CACHE=/models \
-   -e MODEL_NAME=ibm-granite/granite-7b-lab \
-   -e SPECULATOR_NAME=ibm-granite/granite-7b-lab-accelerator \
+   -e MODEL_NAME=ibm-granite/granite-7b-instruct \
+   -e SPECULATOR_NAME=ibm-granite/granite-7b-instruct-accelerator \
    -e FLASH_ATTENTION=true \
    -e PAGED_ATTENTION=true \
    -e DTYPE=float16 \
@@ -106,7 +106,7 @@ _Note: first prompt may be slower as there is a slight warmup time_
 #### start the server
 
 ```bash
-model=ibm-granite/granite-7b-lab-accelerator
+model=ibm-granite/granite-7b-instruct-accelerator
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
 ```
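
Once the TGI container above is up, the endpoint can be smoke-tested with a short client call. A sketch using TGI's standard `/generate` route; the prompt and generation parameters are arbitrary examples.

```python
import requests

# Query the text-generation-inference server started above.
# Port 8080 matches the -p 8080:80 mapping in the docker run command.
resp = requests.post(
    "http://localhost:8080/generate",
    json={
        "inputs": "Write a haiku about speculative decoding.",  # example prompt
        "parameters": {"max_new_tokens": 64},
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```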
@@ -139,13 +139,13 @@ pip install transformers==4.35.0 sentencepiece numpy
 ##### batch_size=1 (compile + cudagraphs)
 
 ```bash
-MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
+MODEL_PATH=/path/to/ibm-granite/granite-7b-instruct
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
     --model_source=hf \
     --tokenizer=$MODEL_PATH \
-    --speculator_path=ibm-granite/granite-7b-lab-accelerator \
+    --speculator_path=ibm-granite/granite-7b-instruct-accelerator \
     --speculator_source=hf \
     --speculator_variant=1_4b \
     --top_k_tokens_per_head=4,3,2,2,2 \
@@ -156,13 +156,13 @@ python fms-extras/scripts/paged_speculative_inference.py \
 ##### batch_size=1 (compile)
 
 ```bash
-MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
+MODEL_PATH=/path/to/ibm-granite/granite-7b-instruct
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
     --model_source=hf \
     --tokenizer=$MODEL_PATH \
-    --speculator_path=ibm-granite/granite-7b-lab-accelerator \
+    --speculator_path=ibm-granite/granite-7b-instruct-accelerator \
     --speculator_source=hf \
     --speculator_variant=1_4b \
     --top_k_tokens_per_head=4,3,2,2,2 \
@@ -172,13 +172,13 @@ python fms-extras/scripts/paged_speculative_inference.py \
 ##### batch_size=4 (compile)
 
 ```bash
-MODEL_PATH=/path/to/ibm-granite/granite-7b-lab
+MODEL_PATH=/path/to/ibm-granite/granite-7b-instruct
 python fms-extras/scripts/paged_speculative_inference.py \
     --variant=7b.ibm_instruct_lab \
     --model_path=$MODEL_PATH \
     --model_source=hf \
     --tokenizer=$MODEL_PATH \
-    --speculator_path=ibm-granite/granite-7b-lab-accelerator \
+    --speculator_path=ibm-granite/granite-7b-instruct-accelerator \
     --speculator_source=hf \
     --speculator_variant=1_4b \
     --top_k_tokens_per_head=4,3,2,2,2 \
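
A note on `--top_k_tokens_per_head=4,3,2,2,2`: the list gives one top-k value per speculator stage. If, as in Medusa-style tree attention, each retained token branches independently, the number of candidate drafts the base model verifies per step is the product of the k values. That reading of the flag's semantics in fms-extras is an assumption, illustrated below.

```python
from math import prod

# One top-k value per speculator stage, as passed on the command line above.
top_k_tokens_per_head = [4, 3, 2, 2, 2]

# Assuming candidates form a tree (each kept token branches independently),
# this many 5-token draft continuations are verified per step:
print(prod(top_k_tokens_per_head))  # -> 96
```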