yusufs committed

Commit fc30f26 · 1 Parent(s): d90e4d6

feat(download-model): add download model at runtime

Files changed (4):
  1. Dockerfile +8 -0
  2. README.md +2 -1
  3. download_model.py +13 -0
  4. run.sh +0 -2
Dockerfile CHANGED

```diff
@@ -1,5 +1,8 @@
 FROM python:3.12
 
+# Declare your environment variables with the ARG directive
+ARG HF_TOKEN
+
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
@@ -11,6 +14,11 @@ RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://dow
 
 COPY --chown=user . /app
 
+
+# Download at build time,
+# to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
+RUN python /app/download_model.py
+
 EXPOSE 7860
 
 #CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
```
README.md CHANGED

````diff
@@ -15,6 +15,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 poetry export -f requirements.txt --output requirements.txt --without-hashes
 ```
 
+* The `HUGGING_FACE_HUB_TOKEN` and `HF_TOKEN` must exist during runtime (use the same value, it must have read permission to the model.)
 
 ## VLLM OpenAI Compatible API Server
 
@@ -27,7 +28,7 @@ Fixes:
 
 This `api_server.py` file is exact copy version from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/entrypoints/openai/api_server.py
 
-* The `HUGGING_FACE_HUB_TOKEN` must exist during runtime.
+
 
 ## Documentation about config
 
````
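The relocated README bullet requires both `HUGGING_FACE_HUB_TOKEN` and `HF_TOKEN` to carry the same read-scoped token at runtime. A small guard at process start, sketched here and not part of the commit, makes a missing variable fail loudly instead of surfacing later as a 401 from the Hub:

```python
# Hypothetical startup guard: fail fast if either token variable is
# unset or blank, mirroring the README's runtime requirement.
import os

for name in ("HUGGING_FACE_HUB_TOKEN", "HF_TOKEN"):
    if not os.getenv(name, "").strip():
        raise RuntimeError(f"{name} must be set to a token with read access to the model")
```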
download_model.py ADDED

```diff
@@ -0,0 +1,13 @@
+import os
+from huggingface_hub import snapshot_download
+
+hf_token: str = os.getenv("HF_TOKEN")
+hf_token = hf_token.strip()
+if hf_token == "":
+    raise ValueError("HF_TOKEN is empty")
+
+snapshot_download(
+    repo_id="sail/Sailor-4B-Chat",
+    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
+    token=hf_token,
+)
```
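One caveat in the added script: `os.getenv("HF_TOKEN")` returns `None` when the variable is unset, so `hf_token.strip()` raises `AttributeError` before the intended `ValueError` guard is reached. A defensive variant, assuming unset and empty should be treated the same way:

```python
# Sketch of a more defensive download_model.py: defaulting to "" routes
# an unset HF_TOKEN into the ValueError guard instead of crashing with
# AttributeError on .strip().
import os

from huggingface_hub import snapshot_download

hf_token: str = os.getenv("HF_TOKEN", "").strip()
if not hf_token:
    raise ValueError("HF_TOKEN is empty")

# Pin the exact snapshot so image builds stay reproducible.
snapshot_download(
    repo_id="sail/Sailor-4B-Chat",
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    token=hf_token,
)
```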
run.sh CHANGED

```diff
@@ -20,8 +20,6 @@ python -u /app/openai_compatible_api_server.py \
   --revision 89a866a7041e6ec023dd462adeca8e28dd53c83e \
   --host 0.0.0.0 \
   --port 7860 \
-  --max-num-batched-tokens 32768 \
-  --max-model-len 32768 \
   --dtype half \
   --enforce-eager \
   --gpu-memory-utilization 0.85
```
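Dropping `--max-num-batched-tokens` and `--max-model-len` lets vLLM derive both limits from the model's own config instead of forcing 32768 tokens. For illustration only (the Space actually launches `openai_compatible_api_server.py`), the surviving flags map onto vLLM's Python API roughly like this:

```python
# Illustrative mapping of the remaining run.sh flags onto vLLM's offline
# API; with max_model_len / max_num_batched_tokens omitted, vLLM falls
# back to values derived from the model config. Not part of the commit.
from vllm import LLM

llm = LLM(
    model="sail/Sailor-4B-Chat",
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    dtype="half",
    enforce_eager=True,            # skip CUDA graph capture
    gpu_memory_utilization=0.85,   # leave some GPU memory headroom
)
```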