naxalpha committed on
Commit
e14438a
1 Parent(s): eec55b1

sharded training

Files changed (2)
  1. app.py +5 -4
  2. default_config.yaml +28 -0
app.py CHANGED
@@ -45,18 +45,19 @@ def main():
     model.load_state_dict(torch.load('model.pt'))
     optim = AdamW(model.parameters(), 2e-5)
 
-    bs = 24
-    kk = 128
+    bs = 1
+    kk = 2048
     dsx = C4X(kk+1)
     dlx = DataLoader(
         dsx,
         batch_size=bs,
-        num_workers=8,
+        num_workers=4,
     )
 
     prog = tqdm(dlx, disable=not accelerator.is_main_process)
 
-    model, optim, dlx = accelerator.prepare(model, optim, dlx)
+    model = accelerator.prepare(model)
+    optim, dlx = accelerator.prepare(optim, dlx)
 
     optim.zero_grad()
     for i, batch in enumerate(prog):
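
The split prepare call is the core of the sharding change: the model is handed to accelerator.prepare on its own so FSDP can wrap and shard it, and the optimizer and dataloader are prepared in a second call. Below is a minimal, self-contained sketch of that pattern; the Linear model and TensorDataset are placeholders standing in for the repo's actual model and C4X dataset, not code from this commit.

    # Sketch of the two-step prepare pattern used above
    # (placeholder model and data, not the repo's model or C4X dataset).
    import torch
    from torch.optim import AdamW
    from torch.utils.data import DataLoader, TensorDataset
    from accelerate import Accelerator

    accelerator = Accelerator()

    model = torch.nn.Linear(2048, 2048)          # placeholder model
    optim = AdamW(model.parameters(), lr=2e-5)

    ds = TensorDataset(torch.randn(64, 2048))    # placeholder for C4X(kk + 1)
    dlx = DataLoader(ds, batch_size=1, num_workers=4)

    # Prepare the model first so FSDP wraps and shards its parameters...
    model = accelerator.prepare(model)
    # ...then prepare the optimizer and dataloader against the sharded model.
    optim, dlx = accelerator.prepare(optim, dlx)
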
default_config.yaml ADDED
@@ -0,0 +1,28 @@
+command_file: null
+commands: null
+compute_environment: LOCAL_MACHINE
+deepspeed_config: {}
+distributed_type: FSDP
+downcast_bf16: 'no'
+dynamo_backend: 'NO'
+fsdp_config:
+  fsdp_auto_wrap_policy: SIZE_BASED_WRAP
+  fsdp_backward_prefetch_policy: BACKWARD_PRE
+  fsdp_min_num_params: 2000
+  fsdp_offload_params: false
+  fsdp_sharding_strategy: 1
+  fsdp_state_dict_type: FULL_STATE_DICT
+gpu_ids: null
+machine_rank: 0
+main_process_ip: null
+main_process_port: null
+main_training_function: main
+megatron_lm_config: {}
+mixed_precision: 'no'
+num_machines: 1
+num_processes: 2
+rdzv_backend: static
+same_network: true
+tpu_name: null
+tpu_zone: null
+use_cpu: false
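
default_config.yaml is a standard Accelerate config that selects FSDP with size-based auto-wrapping across 2 processes on a single machine. A run would typically be launched with it via accelerate launch, for example (command shown as an illustration, using the script name from this repo):

    accelerate launch --config_file default_config.yaml app.py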