feat(download-model): add model download at build time
- Dockerfile +8 -0
- README.md +2 -1
- download_model.py +13 -0
- run.sh +0 -2
Dockerfile
CHANGED
@@ -1,5 +1,8 @@
 FROM python:3.12
 
+# Declare your environment variables with the ARG directive
+ARG HF_TOKEN
+
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
@@ -11,6 +14,11 @@ RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://dow
 
 COPY --chown=user . /app
 
+
+# Download at build time,
+# to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
+RUN python /app/download_model.py
+
 EXPOSE 7860
 
 #CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
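Because the snapshot is baked in during `docker build`, a restarted container should find the files already in the local Hugging Face cache. A minimal sketch of a startup check (hypothetical helper, not part of this commit; assumes the default cache location inside the image):

# verify_cache.py - hypothetical startup check, not part of this commit
from huggingface_hub import snapshot_download

# local_files_only=True resolves the snapshot from the cache populated at
# build time and raises instead of touching the network if it is missing.
path = snapshot_download(
    repo_id="sail/Sailor-4B-Chat",
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    local_files_only=True,
)
print(f"model already cached at {path}")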
README.md
CHANGED
@@ -15,6 +15,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 poetry export -f requirements.txt --output requirements.txt --without-hashes
 ```
 
+* The `HUGGING_FACE_HUB_TOKEN` and `HF_TOKEN` environment variables must exist at runtime (use the same value for both; the token must have read permission on the model).
 
 ## VLLM OpenAI Compatible API Server
 
@@ -27,7 +28,7 @@ Fixes:
 
 This `api_server.py` file is an exact copy of https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/entrypoints/openai/api_server.py
 
-
+
 
 ## Documentation about config
 
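To make the README's runtime requirement explicit, a startup script could fail fast when either variable is missing. A small sketch (hypothetical, not part of this commit):

import os

# The README requires both variables at runtime, set to the same token value.
for var in ("HUGGING_FACE_HUB_TOKEN", "HF_TOKEN"):
    if not (os.getenv(var) or "").strip():
        raise RuntimeError(f"{var} must be set at runtime")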
download_model.py
ADDED
@@ -0,0 +1,13 @@
+import os
+from huggingface_hub import snapshot_download
+
+# os.getenv returns None when HF_TOKEN is unset, so default to "" before stripping.
+hf_token: str = (os.getenv("HF_TOKEN") or "").strip()
+if hf_token == "":
+    raise ValueError("HF_TOKEN is empty")
+
+snapshot_download(
+    repo_id="sail/Sailor-4B-Chat",
+    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
+    token=hf_token,
+)
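The pinned `revision` here matches the `--revision` flag in run.sh, so the files fetched at build time are exactly the ones the server loads. `snapshot_download` also returns the local snapshot directory, which could be logged for debugging (a sketch, not in the commit):

# Sketch: capture and log the directory the snapshot was materialized into.
local_dir = snapshot_download(
    repo_id="sail/Sailor-4B-Chat",
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    token=hf_token,
)
print(f"downloaded snapshot to {local_dir}")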
run.sh
CHANGED
@@ -20,8 +20,6 @@ python -u /app/openai_compatible_api_server.py \
     --revision 89a866a7041e6ec023dd462adeca8e28dd53c83e \
     --host 0.0.0.0 \
     --port 7860 \
-    --max-num-batched-tokens 32768 \
-    --max-model-len 32768 \
     --dtype half \
     --enforce-eager \
     --gpu-memory-utilization 0.85
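The surviving flags correspond to same-named engine arguments in vLLM; dropping `--max-num-batched-tokens` and `--max-model-len` lets vLLM fall back to its defaults (the maximum model length is read from the model config). A rough offline equivalent using vLLM's Python API (a sketch of the flag mapping, not how this Space actually launches the server):

from vllm import LLM

# Same knobs as run.sh; max_model_len and max_num_batched_tokens are omitted
# so vLLM falls back to defaults derived from the model config.
llm = LLM(
    model="sail/Sailor-4B-Chat",
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    dtype="half",
    enforce_eager=True,
    gpu_memory_utilization=0.85,
)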