File size: 1,419 Bytes
c3d82b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
services:
  # Training service: runs `python -m src.train` and, only if it succeeds,
  # creates a marker file in the shared checkpoints directory. The eval
  # service polls for that marker before it starts.
  train:
    build:
      context: .
    # Compose does not run `command` through a shell, so `&&` chaining only
    # works inside an explicit `sh -c`. Without it, `&&`, `touch` and the flag
    # path are handed to python as extra arguments.
    #
    # The marker is addressed by its absolute in-container path so it matches
    # the path eval waits on regardless of the image's working directory.
    # Because ./checkpoints is a host bind mount, a marker left over from an
    # earlier run would still be there; remove it before training starts.
    command:
      - sh
      - '-c'
      - |
        rm -f /app/checkpoints/train_done.flag &&
        python -m src.train &&
        touch /app/checkpoints/train_done.flag
    # Host directories bind-mounted into the container under /app.
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./logs:/app/logs
    environment:
      - PYTHONUNBUFFERED=1
      - PYTHONPATH=/app
      - NUM_WORKERS=4  # Set the number of workers
    shm_size: '4g'
    deploy:
      resources:
        limits:
          memory: 8g  # Limit to 8GB RAM
          cpus: '4.0'  # Use up to 4 CPU cores
        reservations:
          memory: 6g  # Reserve 6GB RAM
          cpus: '4.0'  # Reserve 4 CPU cores
    networks:
      - default
    env_file:
      - .env

  # Evaluation service: blocks until the training marker file appears in the
  # shared checkpoints directory, then runs `python -m src.test`.
  eval:
    build:
      context: .
    # Exec-form command: argv is exactly `sh -c <script>`. The script checks
    # for the marker every 10 seconds and launches the test module once the
    # marker exists.
    command:
      - sh
      - '-c'
      - >-
        while [ ! -f /app/checkpoints/train_done.flag ]; do sleep 10; done
        && python -m src.test
    env_file:
      - .env
    environment:
      PYTHONUNBUFFERED: '1'
      PYTHONPATH: /app
      # Number of workers
      NUM_WORKERS: '2'
    # Host directories bind-mounted into the container under /app.
    volumes:
      - ./data:/app/data
      - ./checkpoints:/app/checkpoints
      - ./logs:/app/logs
    shm_size: '4g'
    deploy:
      resources:
        reservations:
          # Reserve 2 CPU cores and 2GB RAM
          cpus: '2'
          memory: 2g
        limits:
          # Use up to 4 CPU cores and 4GB RAM
          cpus: '4.0'
          memory: 4g
    networks:
      - default

# Named volumes with default settings.
# NOTE(review): the services visible in this file mount ./data, ./checkpoints
# and ./logs as host bind mounts and do not reference these names; confirm
# whether another Compose file relies on them.
volumes:
  data: {}
  checkpoints: {}
  logs: {}

# The project's default network, declared with no custom settings.
networks:
  default: {}