fix checkpoints on multi-GPU (#481)
src/axolotl/monkeypatch/relora.py
```diff
@@ -131,7 +131,7 @@ class ReLoRACallback(TrainerCallback):
             and state.global_step % self.relora_steps != 0
         ):
             if self.quantized:
-                if self.last_full_model != checkpoint_folder:
+                if is_main_process() and self.last_full_model != checkpoint_folder:
                     # ensure the latest full parameter save is in the latest checkpoint
                     # folder, so that automatic pruning of checkpoints does not remove it
                     LOG.info(f"moving last full parameter save to {checkpoint_folder}")
```
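For context, the guard matters because on a multi-GPU run every rank executes this callback, so without it each process would race to move the same full-parameter save into the checkpoint folder. Below is a minimal sketch of what an `is_main_process()` helper typically looks like, assuming the usual `torch.distributed` idiom; axolotl imports its own helper, which may differ in detail:

```python
import torch.distributed as dist


def is_main_process() -> bool:
    """Return True on rank 0, or when not running distributed.

    Sketch only: this illustrates the standard torch.distributed
    pattern behind a rank-0 guard, not axolotl's exact helper.
    """
    # If the process group was never initialized, this is a
    # single-process run, which is trivially the main process.
    if not dist.is_available() or not dist.is_initialized():
        return True
    return dist.get_rank() == 0
```

With the guard in place, only rank 0 performs the filesystem move and the other ranks skip it, so the latest full-parameter save lands in the checkpoint folder exactly once instead of once per process.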