Spaces:
Paused
Paused
fix(runner.sh): explicitly disable enforce_eager
Browse files
- run-llama.sh +1 -0
- run-sailor.sh +1 -0
- runner.sh +1 -0
run-llama.sh (CHANGED)
@@ -27,4 +27,5 @@ python -u /app/openai_compatible_api_server.py \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
     --dtype float16 \
+    --enforce-eager false \
     --gpu-memory-utilization 0.85
run-sailor.sh (CHANGED)
@@ -29,4 +29,5 @@ python -u /app/openai_compatible_api_server.py \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
     --dtype float16 \
+    --enforce-eager false \
     --gpu-memory-utilization 0.85
runner.sh (CHANGED)
@@ -51,4 +51,5 @@ python -u /app/openai_compatible_api_server.py \
     --max-num-batched-tokens 32768 \
     --max-model-len 32768 \
     --dtype float16 \
+    --enforce-eager false \
     --gpu-memory-utilization 0.9