HaileyStorm
committed on
Update chess-gpt-eval-contrastive/mamba_module.py
chess-gpt-eval-contrastive/mamba_module.py
CHANGED
@@ -126,7 +126,7 @@ class MambaPlayer:
             tensor_output = output
             seq_len = tensor_output.shape[1]
             bucket = next(b for b in self.move_buckets if self.move_num <= b)
-            self.activations_sum[layer_idx][bucket]["current"][:, :
+            self.activations_sum[layer_idx][bucket]["current"][:, :8, :] += tensor_output.detach().cpu().numpy()[:self.seq_len][-8:]
             self.activations_count[layer_idx][bucket]["current"] += 1
 
         self.hooks.append(layer.register_forward_hook(hook))
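This hunk changes the forward hook so the per-layer, per-bucket running sum keeps only the last 8 sequence positions of the layer output. Below is a minimal, self-contained sketch of that hook pattern, assuming a (batch, seq_len, hidden) output and hypothetical buffer names; it is not the repository's MambaPlayer code.

import numpy as np
import torch
import torch.nn as nn

WINDOW = 8    # trailing positions kept, mirroring the [:, :8, :] / [-8:] slices
HIDDEN = 16   # hypothetical hidden size

layer = nn.Linear(HIDDEN, HIDDEN)
activations_sum = np.zeros((1, WINDOW, HIDDEN), dtype=np.float32)
activations_count = 0

def hook(module, inputs, output):
    global activations_count
    acts = output.detach().cpu().numpy()   # shape (batch, seq_len, hidden)
    window = acts[:, -WINDOW:, :]          # last WINDOW positions of the sequence
    activations_sum[:, :window.shape[1], :] += window
    activations_count += 1

handle = layer.register_forward_hook(hook)
layer(torch.randn(1, 12, HIDDEN))          # batch=1, seq_len=12
handle.remove()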
@@ -324,8 +324,8 @@ class MambaPlayer:
         def get_lr(it):
             warmup_iters = 150 * 43
             lr_decay_iters = 5000 * 43
-            learning_rate = 0.
-            min_lr = 0.
+            learning_rate = 0.003
+            min_lr = 0.0001
             # 1) linear warmup for warmup_iters steps
             if it < warmup_iters:
                 return learning_rate * it / warmup_iters
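The second hunk sets concrete values for the probe schedule: a peak learning rate of 0.003 and a floor of 0.0001, with 150 * 43 warmup iterations and a 5000 * 43 decay horizon. Only the linear-warmup branch is visible in the diff; the sketch below assumes the remainder is a nanoGPT-style cosine decay to min_lr, which may differ from what mamba_module.py actually does.

import math

warmup_iters = 150 * 43
lr_decay_iters = 5000 * 43
learning_rate = 0.003
min_lr = 0.0001

def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) past the decay horizon, hold at the floor
    if it > lr_decay_iters:
        return min_lr
    # 3) cosine decay from learning_rate down to min_lr (assumed)
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)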
@@ -345,7 +345,7 @@ class MambaPlayer:
         for layer_idx in self.linear_probes:
             for bucket in self.move_buckets:
                 if self.activations_count[layer_idx][bucket]['current'] > 0:
-                    X = torch.from_numpy(self.activations_sum[layer_idx][bucket]['current']).float().flatten(1)
+                    X = torch.from_numpy(self.activations_sum[layer_idx][bucket]['current']).float().flatten(1) #/ self.activations_count[layer_idx][bucket]['current']).float()
                     for probe_type in ['q_value', 'q_value_delta', 'material_balance']:
                         y = torch.tensor(self.linear_probe_targets[layer_idx][bucket][probe_type]).float().unsqueeze(1)
                         if len(y) > 0:
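The last hunk builds the probe input X from the summed activations, flattened past the batch dimension; the commented-out division by the count suggests a switch from averaged to summed activations. As a rough illustration of fitting one such linear probe, here is a hedged sketch with stand-in tensors; the probe head, optimizer, and epoch count are assumptions, not the repository's training loop.

import torch
import torch.nn as nn

X = torch.randn(64, 8 * 16)   # stand-in for the flattened activation features
y = torch.randn(64, 1)        # stand-in for one probe target, e.g. material_balance

probe = nn.Linear(X.shape[1], 1)
optimizer = torch.optim.AdamW(probe.parameters(), lr=0.003)
loss_fn = nn.MSELoss()

for _ in range(100):
    optimizer.zero_grad()
    loss = loss_fn(probe(X), y)
    loss.backward()
    optimizer.step()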