HaileyStorm
committed
Update chess-gpt-eval-contrastive/mamba_module.py
chess-gpt-eval-contrastive/mamba_module.py
CHANGED
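Summary of the change (from the diff below): the body of the contrastive-steering forward hook in MambaPlayer is replaced. The new version computes the steering tensor for the current move bucket (mean "won" activations minus mean "lost" activations), zeroes any non-finite entries, and stores the result in a new self.contrastive_activations_cache dictionary keyed by layer and bucket, so subsequent forward passes reuse it instead of recomputing.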
@@ -227,17 +227,40 @@ class MambaPlayer:
             with open(path, "rb") as f:
                 activations_sum, activations_count = pickle.load(f)

-            [12 lines removed: the previous hook definition and registration `for` loop; content not recoverable from this view]
+            self.contrastive_activations_cache = {}
+
+            def hook(module, input, output, layer_idx):
+                if isinstance(output, tuple):
+                    tensor_output = output[0]
+                else:
+                    tensor_output = output
+                seq_len = tensor_output.shape[1]
+                bucket = next(b for b in self.move_buckets if self.move_num <= b)
+
+                # Check cache first
+                if layer_idx in self.contrastive_activations_cache and bucket in self.contrastive_activations_cache[layer_idx]:
+                    safe_contrastive_activations = self.contrastive_activations_cache[layer_idx][bucket]
+                else:
+                    won_activations = activations_sum[layer_idx][bucket]["won"] / activations_count[layer_idx][bucket]["won"]
+                    lost_activations = activations_sum[layer_idx][bucket]["lost"] / activations_count[layer_idx][bucket]["lost"]
+                    contrastive_activations = won_activations - lost_activations
+                    contrastive_activations_tensor = torch.from_numpy(contrastive_activations).to(tensor_output.device)
+                    valid_activations = torch.isfinite(contrastive_activations_tensor)
+                    safe_contrastive_activations = torch.zeros_like(contrastive_activations_tensor)
+                    safe_contrastive_activations[valid_activations] = contrastive_activations_tensor[valid_activations]

+                    # Cache the safe activations
+                    if layer_idx not in self.contrastive_activations_cache:
+                        self.contrastive_activations_cache[layer_idx] = {}
+                    self.contrastive_activations_cache[layer_idx][bucket] = safe_contrastive_activations

+                tensor_output += safe_contrastive_activations[:, :seq_len, :] * weight
+                if isinstance(output, tuple):
+                    return tensor_output, output[1]
+                else:
+                    return tensor_output
+
+            for layer_idx in activations_sum:
+                self.hooks.append(self.model.backbone.layers[layer_idx].register_forward_hook(
+                    lambda module, input, output, layer_idx=layer_idx: hook(module, input, output, layer_idx)
+                ))
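For readers unfamiliar with the technique, the new hook implements contrastive activation steering: the mean activation over won games minus the mean over lost games (per layer, per move-number bucket) gives a direction that is added to the layer's output at inference time, scaled by `weight`. Non-finite entries are zeroed because a bucket with a zero count makes the division produce inf/NaN. Below is a minimal, self-contained sketch of the same mechanics against a toy `nn.Linear` stack; all names here (`make_hook`, the toy statistics, `d_model`, etc.) are illustrative assumptions rather than code from this repo, and the per-move bucketing is omitted for brevity.

import numpy as np
import torch
import torch.nn as nn

# Toy stand-in for the Mamba backbone: any modules whose outputs we can hook.
model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8))

d_model, max_seq_len = 8, 16
rng = np.random.default_rng(0)

# Hypothetical precomputed statistics, shaped like the pickled
# activations_sum / activations_count above: per layer, summed activations
# and game counts for "won" and "lost" outcomes.
activations_sum = {
    0: {"won": rng.normal(size=(1, max_seq_len, d_model)),
        "lost": rng.normal(size=(1, max_seq_len, d_model))}
}
activations_count = {0: {"won": 10, "lost": 10}}

weight = 0.5  # steering strength, analogous to `weight` in the commit

def make_hook(layer_idx):
    # Contrastive direction: mean "won" activation minus mean "lost" activation.
    won = activations_sum[layer_idx]["won"] / activations_count[layer_idx]["won"]
    lost = activations_sum[layer_idx]["lost"] / activations_count[layer_idx]["lost"]
    steer = torch.from_numpy(won - lost).float()
    # Zero non-finite entries, mirroring the isfinite mask in the commit.
    steer = torch.where(torch.isfinite(steer), steer, torch.zeros_like(steer))

    def hook(module, inputs, output):
        seq_len = output.shape[1]
        # Add the steering tensor, truncated to the current sequence length.
        return output + steer[:, :seq_len, :].to(output.device) * weight

    return hook

handles = [model[i].register_forward_hook(make_hook(i)) for i in activations_sum]

x = torch.randn(1, max_seq_len, d_model)
steered = model(x)  # outputs now include the steering term at layer 0

for h in handles:
    h.remove()  # detach the hooks once steering is no longer wanted

Precomputing the steering tensor inside `make_hook`'s closure plays the same role as the new `contrastive_activations_cache`: the division and masking happen once per layer instead of on every forward pass. The commit's `lambda ... layer_idx=layer_idx` uses the default-argument trick for the same reason the factory function is used here: to bind the loop variable at registration time rather than at call time.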