owaiskha9654 committed on
Commit
1ddbf73
·
1 Parent(s): c28cc64
utils/aws/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ #init
utils/aws/mime.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# AWS EC2 instance startup 'MIME' script https://aws.amazon.com/premiumsupport/knowledge-center/execute-user-data-ec2/
# This script will run on every instance restart, not only on first start
# Layout: part 1 is a cloud-config that sets scripts-user to 'always'; part 2 is the shell script itself.
# The final line must be the closing delimiter '--//--' (boundary followed by two hyphens).
# --- DO NOT COPY ABOVE COMMENTS WHEN PASTING INTO USERDATA ---

Content-Type: multipart/mixed; boundary="//"
MIME-Version: 1.0

--//
Content-Type: text/cloud-config; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="cloud-config.txt"

#cloud-config
cloud_final_modules:
- [scripts-user, always]

--//
Content-Type: text/x-shellscript; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="userdata.txt"

#!/bin/bash
# --- paste contents of userdata.sh here ---
--//--
utils/aws/resume.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Resume all interrupted trainings in yolor/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
#
# Every 'last.pt' found below the current directory is inspected; checkpoints that
# still carry optimizer state are relaunched in the background via train.py --resume.

import os
import shlex
import sys
from pathlib import Path

import torch
import yaml

sys.path.append('./')  # to run '$ python *.py' files in subdirectories

port = 0  # --master_port, incremented so that each DDP job gets its own port
path = Path('').resolve()
for last in path.rglob('*/**/last.pt'):
    # Only the 'optimizer' entry is inspected, so keep all tensors on the CPU instead of
    # restoring them onto the CUDA devices they were saved from.
    # NOTE: torch.load unpickles the file; only run this on checkpoints you trust.
    ckpt = torch.load(last, map_location='cpu')
    if ckpt['optimizer'] is None:  # no optimizer state -> nothing to resume (presumably a finished run)
        continue

    # Load opt.yaml
    with open(last.parent.parent / 'opt.yaml') as f:
        opt = yaml.load(f, Loader=yaml.SafeLoader)

    # Get device count
    # ''.split(',') returns [''], so empty entries are dropped to make "no device given" count as 0
    d = [x for x in str(opt['device']).split(',') if x.strip()]  # devices
    nd = len(d) or torch.cuda.device_count()  # number of devices, all visible GPUs if none were given
    ddp = nd > 1  # distributed data parallel

    target = shlex.quote(str(last))  # checkpoint path may contain spaces or shell metacharacters
    if ddp:  # multi-GPU
        port += 1
        cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {target}'
    else:  # single-GPU
        cmd = f'python train.py --resume {target}'

    cmd += ' > /dev/null 2>&1 &'  # redirect output to dev/null and run in the background
    print(cmd)
    os.system(cmd)
utils/aws/userdata.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html
# This script will run only once on first instance start (for a re-start script see mime.sh)
# /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir
# Use >300 GB SSD
#
# First start (no yolor/ dir yet): clone the repo, then fetch data, Docker image and requirements in parallel.
# Later starts (yolor/ exists): restart every Docker container and resume its interrupted trainings.

# Use absolute paths so the script does not depend on the directory it is launched from;
# fall back to the amazon-linux home, and keep going (as before) if neither exists.
cd /home/ubuntu 2>/dev/null || cd /home/ec2-user 2>/dev/null ||
  echo "WARNING: neither /home/ubuntu nor /home/ec2-user found, continuing in $(pwd)" >&2

if [ ! -d yolor ]; then
  echo "Running first-time script." # install dependencies, download COCO, pull Docker
  git clone -b paper https://github.com/WongKinYiu/yolor && sudo chmod -R 777 yolor
  cd yolor
  # The three setup tasks are independent, so run them as background jobs
  bash data/scripts/get_coco.sh && echo "Data done." &
  sudo docker pull nvcr.io/nvidia/pytorch:21.08-py3 && echo "Docker done." &
  python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." &
  wait && echo "All tasks done." # finish background tasks
else
  echo "Running re-start script." # resume interrupted runs
  i=0
  list=$(sudo docker ps -qa) # container list i.e. $'one\ntwo\nthree\nfour'
  while IFS= read -r id; do
    # The here-string always supplies at least one (possibly empty) line, so skip blanks
    # rather than calling 'docker start' with an empty id when no containers exist.
    [ -z "$id" ] && continue
    i=$((i + 1))
    echo "restarting container $i: $id"
    sudo docker start "$id"
    # sudo docker exec -it "$id" python train.py --resume # single-GPU
    sudo docker exec -d "$id" python utils/aws/resume.py # multi-scenario
  done <<<"$list"
fi