HaileyStorm committed
Commit 070a898
1 Parent(s): b32cef0

Upload chess-mamba-vs-xformer/config/Mamba/50M.py with huggingface_hub

chess-mamba-vs-xformer/config/Mamba/50M.py CHANGED
@@ -17,14 +17,14 @@ max_seq_len = 1536
 base_batch_size = 256
 
 batch_size = 50
-gradient_accumulation_steps = 2
+gradient_accumulation_steps = 2 #25
 effective_batch_size = batch_size * gradient_accumulation_steps
 
 always_save_checkpoint = True
-eval_interval = 250
-eval_iters = 33
-log_interval = 50
-train_file_update_interval = 10 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
+eval_interval = 60
+eval_iters = 1.5
+log_interval = 0.01
+train_file_update_interval = 1 # 23 was original ... 7 definitely crashes (maybe try 10 on Lambda)
 
 warmup_iters = 500 # not super necessary potentially
 learning_rate = 1.5e-3 # tested 1.5e-3 from 112k-156k, before that 3.5e-3 #8e-3
@@ -64,7 +64,7 @@ d_state = 32
 dt_rank = 56
 move_num_in_gamestate = False
 
-init_from = 'scratch'
+init_from = 'resume'
 
 device = 'cuda' # run on cpu only
 compile = False # do not torch compile the model
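
The one piece of arithmetic in this config is the effective-batch computation: with batch_size = 50 and gradient_accumulation_steps = 2, each optimizer step averages gradients over 100 sequences. The config keys mirror nanoGPT conventions, so a minimal sketch of how a trainer of that style typically consumes these values is shown below; this is an illustration, not the repo's actual loop, and model, optimizer, and get_batch are hypothetical stand-ins.

# Values from this config; the loop itself is a hypothetical nanoGPT-style sketch.
batch_size = 50
gradient_accumulation_steps = 2
effective_batch_size = batch_size * gradient_accumulation_steps  # 100 sequences per step

def train_step(model, optimizer, get_batch):
    """One optimizer step built from `gradient_accumulation_steps` micro-batches."""
    optimizer.zero_grad(set_to_none=True)
    for _ in range(gradient_accumulation_steps):
        x, y = get_batch('train')  # each micro-batch holds `batch_size` sequences
        _, loss = model(x, y)
        # Scale so the accumulated gradient equals the gradient of the mean loss
        # over the effective batch; the learning rate then needs no rescaling.
        (loss / gradient_accumulation_steps).backward()
    optimizer.step()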
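
The substantive change is init_from flipping from 'scratch' to 'resume', which in nanoGPT-style trainers selects continuing from the latest saved checkpoint rather than reinitializing weights. A hedged sketch of what that branch usually looks like; the out_dir path, checkpoint filename, key names, and build_model constructor are assumptions from that convention and are not shown in this diff.

import os
import torch

init_from = 'resume'  # value set by this commit
device = 'cuda'
out_dir = 'out'       # assumed checkpoint directory; not part of this diff

if init_from == 'scratch':
    model = build_model()  # hypothetical constructor for the 50M Mamba
    iter_num = 0
elif init_from == 'resume':
    # Load the saved checkpoint and continue counting iterations from it.
    ckpt = torch.load(os.path.join(out_dir, 'ckpt.pt'), map_location=device)
    model = build_model(**ckpt.get('model_args', {}))
    model.load_state_dict(ckpt['model'])
    iter_num = ckpt['iter_num']

Because always_save_checkpoint = True, a checkpoint is written at every eval interval regardless of validation loss, so a 'resume' run of this kind picks up from the most recent eval point.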