HaileyStorm
/

chess-mamba-vs-xformer

HaileyStorm committed on Mar 12

Commit

62e35f0

•

1 Parent(s): ec6ad99

Update chess-mamba-vs-xformer/train_bygame.py

Files changed (1) hide show

chess-mamba-vs-xformer/train_bygame.py CHANGED Viewed

@@ -394,7 +394,10 @@ while True:
     if iter_num % eval_interval == 0 and master_process and local_iter_num > 0:
         torch.cuda.empty_cache()
         losses = estimate_loss()
-        print(f"\ngame {games_seen} ({iter_num}, {(iter_num / max_iters)*100.0:.3f}%): 'val' loss {losses['val']:.4f}")
         if auto_clip and len(grad_norm_history) >= grad_clip_start_size:
             grad_clip_prev = grad_clip
             grad_clip = np.percentile(grad_norm_history, grad_clip_percentile)
@@ -481,7 +484,10 @@ while True:
         # get loss as float. note: this is a CPU-GPU sync point
         # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
         lossf = loss.item() * gradient_accumulation_steps
-        print(f"game {games_seen} ({iter_num}, {(iter_num / max_iters)*100.0:.3f}%): loss {lossf:.4f}, time {dt*1000:.2f}ms")
         if wandb_log:
             wandb.log({
                 "etc/iter": iter_num,

     if iter_num % eval_interval == 0 and master_process and local_iter_num > 0:
         torch.cuda.empty_cache()
         losses = estimate_loss()
+        if init_from == 'anneal':
+            print(f"\ngame {games_seen} ({iter_num}, {(iter_num-anneal_start_iters) / anneal_decay_iters:.3%}): 'val' loss {losses['val']:.4f}")
+        else:
+            print(f"\ngame {games_seen} ({iter_num}, {iter_num / max_iters:.3%}): 'val' loss {losses['val']:.4f}")
         if auto_clip and len(grad_norm_history) >= grad_clip_start_size:
             grad_clip_prev = grad_clip
             grad_clip = np.percentile(grad_norm_history, grad_clip_percentile)
         # get loss as float. note: this is a CPU-GPU sync point
         # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
         lossf = loss.item() * gradient_accumulation_steps
+        if init_from == 'anneal':
+            print(f"game {games_seen} ({iter_num}, {(iter_num-anneal_start_iters) / anneal_decay_iters:.3%}): loss {lossf:.4f}, time {dt*1000:.2f}ms")
+        else:
+            print(f"game {games_seen} ({iter_num}, {iter_num / max_iters:.3%}): loss {lossf:.4f}, time {dt*1000:.2f}ms")
         if wandb_log:
             wandb.log({
                 "etc/iter": iter_num,