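# Pull the Docker image that bundles vLLM with LMCache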
docker pull apostacyh/vllm:lmcache-0.1.0
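# The first vLLM instance runs on GPU 0 and listens at port 8000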
model=mistralai/Mistral-7B-Instruct-v0.2 # Replace with your model name
# Note: with --network=host, Docker ignores -p port mappings, so the server
# is exposed directly on the host at the port given by --port.
sudo docker run --runtime nvidia --gpus '"device=0"' \
    -v <Huggingface cache dir on your local machine>:/root/.cache/huggingface \
    --env "HF_TOKEN=<Your huggingface access token>" \
    --ipc=host \
    --network=host \
    apostacyh/vllm:lmcache-0.1.0 \
    --model $model --gpu-memory-utilization 0.6 --port 8000 \
    --lmcache-config-file /lmcache/LMCache/examples/example-local.yaml
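Once the model finishes loading, you can sanity-check the instance through vLLM's OpenAI-compatible completions endpoint. This is a minimal example; the prompt and max_tokens are arbitrary, and the model name must match the $model you launched with:

curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "mistralai/Mistral-7B-Instruct-v0.2",
        "prompt": "San Francisco is a",
        "max_tokens": 16
    }'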
# The second vLLM instance runs on GPU 1 and listens at port 8001
model=mistralai/Mistral-7B-Instruct-v0.2 # Replace with your model name
sudo docker run --runtime nvidia --gpus '"device=1"' \
    -v <Huggingface cache dir on your local machine>:/root/.cache/huggingface \
    --env "HF_TOKEN=<Your huggingface access token>" \
    --ipc=host \
    --network=host \
    apostacyh/vllm:lmcache-0.1.0 \
    --model $model --gpu-memory-utilization 0.7 --port 8001 \
    --lmcache-config-file /lmcache/LMCache/examples/example.yaml
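Beyond the GPU device, port, and --gpu-memory-utilization value, the two launches differ in their LMCache config file: in the LMCache examples, example-local.yaml typically configures a local cache backend, while example.yaml points the instance at a shared LMCache server; check the files under /lmcache/LMCache/examples/ inside the image for the exact settings. The second instance can be verified the same way, substituting port 8001:

curl http://localhost:8001/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "mistralai/Mistral-7B-Instruct-v0.2",
        "prompt": "San Francisco is a",
        "max_tokens": 16
    }'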