hank1996 committed on
Commit d385f77 · 1 Parent(s): 2670a47

Create new file

Files changed (1)
  1. utils/aws/resume.py +36 -0
utils/aws/resume.py ADDED
@@ -0,0 +1,36 @@
+ # Resume all interrupted trainings in this directory (and subdirectories), including DDP trainings
+ import os
+ import sys
+ from pathlib import Path
+ 
+ import torch
+ import yaml
+ 
+ sys.path.append('./')  # to run '$ python *.py' files in subdirectories
+ 
+ port = 0  # --master_port
+ path = Path('').resolve()
+ for last in path.rglob('*/**/last.pt'):
+     ckpt = torch.load(last)
+     if ckpt['optimizer'] is None:  # training finished, optimizer stripped from checkpoint
+         continue
+ 
+     # Load opt.yaml
+     with open(last.parent.parent / 'opt.yaml') as f:
+         opt = yaml.load(f, Loader=yaml.SafeLoader)
+ 
+     # Get device count
+     d = opt['device'].split(',')  # devices
+     nd = len(d)  # number of devices
+     ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1)  # distributed data parallel
+ 
+     if ddp:  # multi-GPU
+         port += 1
+         cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
+     else:  # single-GPU
+         cmd = f'python train.py --resume {last}'
+ 
+     cmd += ' > /dev/null 2>&1 &'  # redirect output to /dev/null and run in background
+     print(cmd)
+     os.system(cmd)
+ 
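
For reference, a minimal standalone sketch (hypothetical checkpoint paths, no files or GPUs required) of the resume commands the loop above constructs for a single-GPU and a 2-GPU opt.yaml:

    # sketch only: mirrors the command-building logic above with hard-coded inputs
    for last, device in [('runs/exp0/weights/last.pt', ''), ('runs/exp1/weights/last.pt', '0,1')]:
        d = device.split(',')   # device field from opt.yaml, e.g. '' or '0,1'
        nd = len(d)             # number of devices ('' splits to [''], so nd == 1)
        if nd > 1:              # multi-GPU -> torch.distributed.launch
            cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port 1 train.py --resume {last}'
        else:                   # single-GPU
            cmd = f'python train.py --resume {last}'
        print(cmd)

Running this prints a plain 'python train.py --resume …' command for the single-GPU checkpoint and a torch.distributed.launch command with --nproc_per_node 2 for the 2-GPU one, which is exactly what the script passes to os.system.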