winglian committed on
Commit
31f3e71
1 Parent(s): 56c4a94

fix checkpoints on multigpu (#481)

Browse files
Files changed (1) hide show
  1. src/axolotl/monkeypatch/relora.py +1 -1
src/axolotl/monkeypatch/relora.py CHANGED
@@ -131,7 +131,7 @@ class ReLoRACallback(TrainerCallback):
131
  and state.global_step % self.relora_steps != 0
132
  ):
133
  if self.quantized:
134
- if self.last_full_model != checkpoint_folder:
135
  # ensure the latest full parameter save is in the latest checkpoint
136
  # folder, so that automatic pruning of checkpoints does not remove it
137
  LOG.info(f"moving last full parameter save to {checkpoint_folder}")
 
131
  and state.global_step % self.relora_steps != 0
132
  ):
133
  if self.quantized:
134
+ if is_main_process() and self.last_full_model != checkpoint_folder:
135
  # ensure the latest full parameter save is in the latest checkpoint
136
  # folder, so that automatic pruning of checkpoints does not remove it
137
  LOG.info(f"moving last full parameter save to {checkpoint_folder}")