Commit d923e32 by JRosenkranz
Parent(s): 3cb2f95

Update README.md

Files changed (1)
  1. README.md +34 -6
README.md CHANGED
@@ -40,17 +40,45 @@ _Note: For all samples, your environment must have access to cuda_
 #### Setup
 
 ```bash
-docker pull quay.io/wxpe/text-gen-server:main.ee927a4
+HF_HUB_CACHE=/hf_hub_cache
+HF_HUB_TOKEN="your huggingface hub token"
+TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.ee927a4
+
+docker pull $TGIS_IMAGE
+
+# optionally download llama-2-13b-chat if the weights do not already exist
+docker run --rm \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    $TGIS_IMAGE \
+    text-generation-server download-weights \
+    meta-llama/Llama-2-13b-chat-hf \
+    --token $HF_HUB_TOKEN
+
+# optionally download the speculator model if the weights do not already exist
+docker run --rm \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    $TGIS_IMAGE \
+    text-generation-server download-weights \
+    ibm-fms/llama-13b-accelerator \
+    --token $HF_HUB_TOKEN
+
+# note: if the weights were downloaded separately (not with the above commands), please place them in the HF_HUB_CACHE directory and refer to them with /models/<model_name>
 docker run -d --rm --gpus all \
     --name my-tgis-server \
     -p 8033:8033 \
-    -v /path/to/all/models:/models \
-    -e MODEL_NAME=/models/model_weights/llama/13B-F \
-    -e SPECULATOR_NAME=/models/speculator_weights/llama/llama-13b-accelerator \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    -e MODEL_NAME=meta-llama/Llama-2-13b-chat-hf \
+    -e SPECULATOR_NAME=ibm-fms/llama-13b-accelerator \
     -e FLASH_ATTENTION=true \
     -e PAGED_ATTENTION=true \
-    -e DTYPE_STR=float16 \
-    quay.io/wxpe/text-gen-server:main.ee927a4
+    -e DTYPE=float16 \
+    $TGIS_IMAGE
 
 # check logs and wait for "gRPC server started on port 8033" and "HTTP server started on port 3000"
 docker logs my-tgis-server -f
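
If the weights were fetched outside the container, the note in the new setup block says to place them under the HF_HUB_CACHE directory and refer to them with /models/<model_name>. A minimal sketch of that path-based variant, reusing the variables from the setup block; the subdirectory names `llama-2-13b-chat-hf` and `llama-13b-accelerator` are hypothetical placeholders for wherever the weights were actually placed:

```bash
# hypothetical local layout:
#   $HF_HUB_CACHE/llama-2-13b-chat-hf      <- base model weights
#   $HF_HUB_CACHE/llama-13b-accelerator    <- speculator weights
docker run -d --rm --gpus all \
    --name my-tgis-server \
    -p 8033:8033 \
    -v $HF_HUB_CACHE:/models \
    -e HF_HUB_CACHE=/models \
    -e TRANSFORMERS_CACHE=/models \
    -e MODEL_NAME=/models/llama-2-13b-chat-hf \
    -e SPECULATOR_NAME=/models/llama-13b-accelerator \
    -e FLASH_ATTENTION=true \
    -e PAGED_ATTENTION=true \
    -e DTYPE=float16 \
    $TGIS_IMAGE
```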
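
For scripted setups, instead of watching the logs by hand, a small sketch that blocks until the startup message quoted above appears in the container logs:

```bash
# wait for the gRPC startup line quoted in the README, then continue
docker logs -f my-tgis-server 2>&1 | grep -m 1 "gRPC server started on port 8033"
```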