# File size: 1,419 Bytes
# c3d82b0
# Two-stage pipeline sharing the same build context and host directories:
#   train - runs `python -m src.train`, then writes a completion flag.
#   eval  - polls for that flag, then runs `python -m src.test`.
services:
  train:
    build:
      context: .
    # Compose does not run `command` through a shell, so the `&&` chain must
    # be wrapped in an explicit `sh -c` (the eval service does the same).
    # The flag is removed first so that a flag left over from an earlier run
    # cannot release eval before this training run has finished. It is only
    # re-created when training exits successfully.
    command:
      - sh
      - '-c'
      - >-
        rm -f /app/checkpoints/train_done.flag &&
        python -m src.train &&
        touch /app/checkpoints/train_done.flag
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./logs:/app/logs
    environment:
      PYTHONUNBUFFERED: '1'
      PYTHONPATH: /app
      # Set the number of workers
      NUM_WORKERS: '4'
    shm_size: '4g'
    deploy:
      resources:
        limits:
          memory: 8g  # Limit to 8GB RAM
          cpus: '4.0'  # Use up to 4 CPU cores
        reservations:
          memory: 6g  # Reserve 6GB RAM
          cpus: '4.0'  # Reserve 4 CPU cores
    networks:
      - default
    env_file:
      - .env

  eval:
    build:
      context: .
    # Poll every 10 seconds for the flag written by the train service, then
    # run the test module. There is no timeout: if training fails, the flag
    # is never created and this loop keeps waiting.
    command:
      - sh
      - '-c'
      - >-
        while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done
        && python -m src.test
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./logs:/app/logs
    environment:
      PYTHONUNBUFFERED: '1'
      PYTHONPATH: /app
      # Set the number of workers
      NUM_WORKERS: '2'
    shm_size: '4g'
    deploy:
      resources:
        limits:
          memory: 4g  # Limit to 4GB RAM
          cpus: '4.0'  # Use up to 4 CPU cores
        reservations:
          memory: 2g  # Reserve 2GB RAM
          cpus: '2'  # Reserve 2 CPU cores
    networks:
      - default
    env_file:
      - .env
# Named volumes with default settings. The train and eval services in this
# file bind-mount the host directories ./data, ./checkpoints and ./logs
# instead of referencing these names.
volumes:
  data: {}
  checkpoints: {}
  logs: {}
# Explicit declaration of the project's default network, with no custom
# settings; both services attach to it.
networks:
  default: {}