---
# Docker Compose definition for two model-serving workers that run the
# mlenergy/tgi:v1.0.0 image. Each worker is started with "--num-shard 2",
# reserves two NVIDIA GPUs by device id, and is given http://jaeger:4317 as
# its "--otlp-endpoint". Both workers mount the same host directory at /data
# and join the pre-existing "leaderboard" network.

services:
  # worker4: serves meta-llama/Llama-2-70b-chat-hf with
  # "--quantize bitsandbytes" on GPU device ids 0 and 1.
  Llama2-70B-INT8:
    container_name: worker4
    image: mlenergy/tgi:v1.0.0
    command:
      - "--model-id"
      - "meta-llama/Llama-2-70b-chat-hf"
      - "--num-shard"
      - "2"
      - "--otlp-endpoint"
      - "http://jaeger:4317"
      - "--quantize"
      - "bitsandbytes"
    shm_size: 1g
    environment:
      # Interpolated by Compose from the HF_TOKEN variable when the stack is
      # brought up; the token itself is never stored in this file.
      HUGGING_FACE_HUB_TOKEN: "${HF_TOKEN}"
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
    deploy:
      restart_policy:
        condition: any
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Device ids are strings, not integers.
              device_ids:
                - "0"
                - "1"
              capabilities:
                - gpu

  # worker5: serves mosaicml/mpt-30b-chat (no --quantize flag) on GPU device
  # ids 2 and 3. No Hugging Face token is passed to this service.
  MPT-30B:
    container_name: worker5
    image: mlenergy/tgi:v1.0.0
    command:
      - "--model-id"
      - "mosaicml/mpt-30b-chat"
      - "--num-shard"
      - "2"
      - "--otlp-endpoint"
      - "http://jaeger:4317"
    shm_size: 1g
    networks:
      - leaderboard
    volumes:
      - /data/leaderboard/tgi-data:/data
    deploy:
      restart_policy:
        condition: any
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids:
                - "2"
                - "3"
              capabilities:
                - gpu

networks:
  # Declared external: Compose attaches to an already-existing network named
  # "leaderboard" instead of creating one, so it must exist beforehand.
  leaderboard:
    name: leaderboard
    external: true