Commit d923e32 by JRosenkranz
Parent(s): 3cb2f95

Update README.md

Files changed (1)
  1. README.md +34 -6
README.md CHANGED
@@ -40,17 +40,45 @@ _Note: For all samples, your environment must have access to cuda_
 #### Setup
 
 ```bash
-docker pull quay.io/wxpe/text-gen-server:main.ee927a4
+HF_HUB_CACHE=/hf_hub_cache
+HF_HUB_TOKEN="your huggingface hub token"
+TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.ee927a4
+
+docker pull $TGIS_IMAGE
+
+# optionally download llama-2-13b-chat if the weights do not already exist
+docker run --rm \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    $TGIS_IMAGE \
+    text-generation-server download-weights \
+    meta-llama/Llama-2-13b-chat-hf \
+    --token $HF_HUB_TOKEN
+
+# optionally download the speculator model if the weights do not already exist
+docker run --rm \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    $TGIS_IMAGE \
+    text-generation-server download-weights \
+    ibm-fms/llama-13b-accelerator \
+    --token $HF_HUB_TOKEN
+
+# note: if the weights were downloaded separately (not with the above commands), please place them in the HF_HUB_CACHE directory and refer to them with /models/<model_name>
 docker run -d --rm --gpus all \
     --name my-tgis-server \
     -p 8033:8033 \
-    -v /path/to/all/models:/models \
-    -e MODEL_NAME=/models/model_weights/llama/13B-F \
-    -e SPECULATOR_NAME=/models/speculator_weights/llama/llama-13b-accelerator \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    -e MODEL_NAME=meta-llama/Llama-2-13b-chat-hf \
+    -e SPECULATOR_NAME=ibm-fms/llama-13b-accelerator \
     -e FLASH_ATTENTION=true \
     -e PAGED_ATTENTION=true \
-    -e DTYPE_STR=float16 \
-    quay.io/wxpe/text-gen-server:main.ee927a4
+    -e DTYPE=float16 \
+    $TGIS_IMAGE
 
 # check logs and wait for "gRPC server started on port 8033" and "HTTP server started on port 3000"
 docker logs my-tgis-server -f
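
If the weights were fetched outside the container, the note in the new setup block says to place them under the HF_HUB_CACHE directory and refer to them with /models/<model_name>. A minimal sketch of that path-based variant, reusing the variables from the setup block; the subdirectory names `llama-2-13b-chat-hf` and `llama-13b-accelerator` are hypothetical placeholders for wherever the weights were actually placed:

```bash
# hypothetical local layout:
#   $HF_HUB_CACHE/llama-2-13b-chat-hf      <- base model weights
#   $HF_HUB_CACHE/llama-13b-accelerator    <- speculator weights
docker run -d --rm --gpus all \
    --name my-tgis-server \
    -p 8033:8033 \
    -v $HF_HUB_CACHE:/models \
    -e HF_HUB_CACHE=/models \
    -e TRANSFORMERS_CACHE=/models \
    -e MODEL_NAME=/models/llama-2-13b-chat-hf \
    -e SPECULATOR_NAME=/models/llama-13b-accelerator \
    -e FLASH_ATTENTION=true \
    -e PAGED_ATTENTION=true \
    -e DTYPE=float16 \
    $TGIS_IMAGE
```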
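
For scripted setups, instead of watching the logs by hand, a small sketch that blocks until the startup message quoted above appears in the container logs:

```bash
# wait for the gRPC startup line quoted in the README, then continue
docker logs -f my-tgis-server 2>&1 | grep -m 1 "gRPC server started on port 8033"
```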