Spaces:
Paused
Paused
fix(runner.sh): explicitly disable enforce_eager
Browse files
- run-llama.sh +1 -0
- run-sailor.sh +1 -0
- runner.sh +1 -0
run-llama.sh (CHANGED)
@@ -27,4 +27,5 @@ python -u /app/openai_compatible_api_server.py \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
     --dtype float16 \
+    --enforce-eager false \
     --gpu-memory-utilization 0.85
run-sailor.sh (CHANGED)
@@ -29,4 +29,5 @@ python -u /app/openai_compatible_api_server.py \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
     --dtype float16 \
+    --enforce-eager false \
     --gpu-memory-utilization 0.85
runner.sh (CHANGED)
@@ -51,4 +51,5 @@ python -u /app/openai_compatible_api_server.py \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
     --dtype float16 \
+    --enforce-eager false \
     --gpu-memory-utilization 0.9