diff --git a/102/edges.pkl b/102/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/102/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/102/ll_model.pth b/102/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..9fd62460eaea5b2fe4687165a2c53b48af6cbb36 --- /dev/null +++ b/102/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feb81e3d0658fbd63f8e19b5ec8b036dd18f5e58ef62d8e0a57048e6ca450c7d +size 15082 diff --git a/102/ll_model_cfg.pkl b/102/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d672d99d82ae67a30e8cd5ff8c09075f32934ad2 --- /dev/null +++ b/102/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb69a2f8949b8c7b19adc4c33aed4e99913687a72b169b64a317d2e7878dc97 +size 1093 diff --git a/102/meta.json b/102/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..2a64b9ad15db892f794f6353c546e7821b5869cb --- /dev/null +++ b/102/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.4, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-82-siit-weigth-0.4", "wandb_name": "case-102-seed-82-s-0.4-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 82, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/104/edges.pkl b/104/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/104/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/104/ll_model.pth b/104/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/104/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/104/ll_model_cfg.pkl b/104/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/104/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/104/meta.json b/104/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..e9fe3a76f7042095a23c20779b8e9ab3328b0e0b --- /dev/null +++ b/104/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-104-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/105/edges.pkl b/105/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/105/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/105/ll_model.pth b/105/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..503f970e58297932a0f93df298f7cefe1c459d91 --- /dev/null +++ b/105/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c32e3e97b224cc8d115b70d7661b8b8433bfcd41a9f58d1d7409ce9367ea27a +size 15018 diff --git a/105/ll_model_cfg.pkl b/105/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..2405e41cd39cba0b81cceb959306e573eff0e267 --- /dev/null +++ b/105/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57dece33450716de87f38b12c0955174849706456cc698dd4fcce152feba3cbf +size 1093 diff --git a/105/meta.json b/105/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..fed30b4cbe3e3c32f9314f9abc1729e4984ff4ec --- /dev/null +++ b/105/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-105-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/123/edges.pkl b/123/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/123/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/123/ll_model.pth b/123/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..43c68763ffa23c373508111a5f6af8ef124abded --- /dev/null +++ b/123/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b640bb628be87f8ebf41e6bb8b5351a6d69bfa11699bf2e1cea140f6f6f9f95 +size 15082 diff --git a/123/ll_model_cfg.pkl b/123/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0c277598f6fa5a15b80026b95d17e17c98d63226 --- /dev/null +++ b/123/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f027ad4de6828c6f5bb7a3f3c8aabd9658526e33b2284eba366977823b89c0a +size 1093 diff --git a/123/meta.json b/123/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..f7c062b0dfc6206a3c64dd48c5d41d2affbb4e05 --- /dev/null +++ b/123/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-123-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/46/edges.pkl b/46/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/46/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/46/ll_model.pth b/46/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/46/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/46/ll_model_cfg.pkl b/46/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/46/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/46/meta.json b/46/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..c88afa06b76d37bdc13f7ca0d9fe3b4f54e0ab83 --- /dev/null +++ b/46/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-46-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/50/edges.pkl b/50/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/50/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/50/ll_model.pth b/50/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/50/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/50/ll_model_cfg.pkl b/50/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/50/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/50/meta.json b/50/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..ba9313288c7527f1052a4204310f6d02f3723687 --- /dev/null +++ b/50/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-50-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/52/edges.pkl b/52/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/52/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/52/ll_model.pth b/52/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/52/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/52/ll_model_cfg.pkl b/52/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/52/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/52/meta.json b/52/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..32f37ef14bdef255ac0594ead679dc523c48c177 --- /dev/null +++ b/52/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-52-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/53/edges.pkl b/53/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..145ad9019256687adb7de5136c70accdaedd74a3 --- /dev/null +++ b/53/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:994f3bdfbe5b148e1da38018a24a1567c1d86e5de1c18e9b4d62af358812c709 +size 189 diff --git a/53/ll_model.pth b/53/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..114b2f59baa956e78999af060d060f1bea5f8863 --- /dev/null +++ b/53/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c16ef496b74897cf166795daa3a019dd6adde99efe3edd12d540676047ff695b +size 14762 diff --git a/53/ll_model_cfg.pkl b/53/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a390e3aaa7e1cbd1c647f03e6f121426c8d0b7a6 --- /dev/null +++ b/53/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:067caa3354f7d42fd2eb39aeacec1719e1e8aa60f9c707b5be1dff2a7a5eac4c +size 1093 diff --git a/53/meta.json b/53/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..9d32547a0535e4d14e87af2db85a300ad022f38f --- /dev/null +++ b/53/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-53-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/54/edges.pkl b/54/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/54/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/54/ll_model.pth b/54/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/54/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/54/ll_model_cfg.pkl b/54/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/54/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/54/meta.json b/54/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..2eb8ad88eb31ddcf7c649513130bfc2b08874b8c --- /dev/null +++ b/54/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-54-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/55/edges.pkl b/55/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/55/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/55/ll_model.pth b/55/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/55/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/55/ll_model_cfg.pkl b/55/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/55/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/55/meta.json b/55/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..512b3f2f72c28bda037302a0201c8dce880a7d95 --- /dev/null +++ b/55/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-55-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/60/edges.pkl b/60/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/60/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/60/ll_model.pth b/60/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/60/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/60/ll_model_cfg.pkl b/60/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/60/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/60/meta.json b/60/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..64e52a2f613f889e507368c96f77838ef2996eca --- /dev/null +++ b/60/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-60-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/62/edges.pkl b/62/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/62/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/62/ll_model.pth b/62/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e88019c946626401aace44160ac7cf225c1f2885 --- /dev/null +++ b/62/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b75705cc0241f73d43d60bd6dba9014f5643c80a0da7d58b49ea7bdc1526fe +size 14698 diff --git a/62/ll_model_cfg.pkl b/62/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..166515f115742cdc958af8ac628de1edcb760d20 --- /dev/null +++ b/62/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a2d11862991a5791d7bd91005f7c0f928d522a579988c7000192e53ff05de81 +size 1093 diff --git a/62/meta.json b/62/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..1b80b0dd0d38f6671f932f457a87d3671f0311f0 --- /dev/null +++ b/62/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-62-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/64/edges.pkl b/64/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/64/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/64/ll_model.pth b/64/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/64/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/64/ll_model_cfg.pkl b/64/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/64/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/64/meta.json b/64/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..c2a964af1068e56c19a015e7de899d525ad2f412 --- /dev/null +++ b/64/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-64-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/65/edges.pkl b/65/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/65/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/65/ll_model.pth b/65/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/65/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/65/ll_model_cfg.pkl b/65/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/65/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/65/meta.json b/65/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..8148a084b92e971a747e741b05ccb81e18088f12 --- /dev/null +++ b/65/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-65-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/70/edges.pkl b/70/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/70/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/70/ll_model.pth b/70/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..78db9a8280c82cc0928da467a1144f40608d0737 --- /dev/null +++ b/70/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a624a7f2389feabcc0d2cacd91a8b0703b57140d7517115583f951858ac7247f +size 14698 diff --git a/70/ll_model_cfg.pkl b/70/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/70/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/70/meta.json b/70/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..390ffc8997225a2e205b5eff21094bc228c65c14 --- /dev/null +++ b/70/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-70-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/72/edges.pkl b/72/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/72/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/72/ll_model.pth b/72/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..43c68763ffa23c373508111a5f6af8ef124abded --- /dev/null +++ b/72/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b640bb628be87f8ebf41e6bb8b5351a6d69bfa11699bf2e1cea140f6f6f9f95 +size 15082 diff --git a/72/ll_model_cfg.pkl b/72/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0c277598f6fa5a15b80026b95d17e17c98d63226 --- /dev/null +++ b/72/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f027ad4de6828c6f5bb7a3f3c8aabd9658526e33b2284eba366977823b89c0a +size 1093 diff --git a/72/meta.json b/72/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..1f12974a4ea4f99748e56298bae5c6a781658f7c --- /dev/null +++ b/72/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-72-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/75/edges.pkl b/75/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/75/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/75/ll_model.pth b/75/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/75/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/75/ll_model_cfg.pkl b/75/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/75/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/75/meta.json b/75/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..c4866525617060faec2c720e861325869009d0a2 --- /dev/null +++ b/75/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-75-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/77/edges.pkl b/77/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/77/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/77/ll_model.pth b/77/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..4dcbb972ae5fdaf9f0d657a5d41cf9b763260675 --- /dev/null +++ b/77/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea8f3e7e90367aac2a071c5d27bcbd12bdec85d0ab29ab6191a94e1d1d27fe48 +size 14698 diff --git a/77/ll_model_cfg.pkl b/77/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/77/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/77/meta.json b/77/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..743781c71cb55315b0971dc28413e69c2aafd681 --- /dev/null +++ b/77/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-77-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/8/edges.pkl b/8/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/8/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/8/ll_model.pth b/8/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/8/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/8/ll_model_cfg.pkl b/8/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/8/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/8/meta.json b/8/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..84a781d6b4796097264f00fcbfc1b9de043df2e4 --- /dev/null +++ b/8/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-8-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/80/edges.pkl b/80/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/80/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/80/ll_model.pth b/80/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/80/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/80/ll_model_cfg.pkl b/80/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/80/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/80/meta.json b/80/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..53fee361eefad099ad2bb7d81c71d3dc73239de1 --- /dev/null +++ b/80/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-80-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/83/edges.pkl b/83/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/83/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/83/ll_model.pth b/83/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/83/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/83/ll_model_cfg.pkl b/83/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/83/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/83/meta.json b/83/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..1c48cc00cb84a535e4d01cf863476725b8405128 --- /dev/null +++ b/83/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-83-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/84/edges.pkl b/84/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/84/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/84/ll_model.pth b/84/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/84/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/84/ll_model_cfg.pkl b/84/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/84/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/84/meta.json b/84/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..7800f5efd6d1977ef4c0a9bc6677889bb4e653bc --- /dev/null +++ b/84/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-84-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/85/edges.pkl b/85/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/85/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/85/ll_model.pth b/85/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e030c064b3fbe5c5718c40664b86d7e3d566afc1 --- /dev/null +++ b/85/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb4db579c13e1bf510ead6a01c343582eedff2f1d799f4d733d72f5f9da50ab +size 14698 diff --git a/85/ll_model_cfg.pkl b/85/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..971a904427237bd2171a1b8a7b4deb269de7c5ca --- /dev/null +++ b/85/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76917a80935813594508b1d79b7a0836a20daf67e263161432430b095fd111 +size 1093 diff --git a/85/meta.json b/85/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..38479f25ea2453b4749d6730677cb6cf5edbfc49 --- /dev/null +++ b/85/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-85-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/90/edges.pkl b/90/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/90/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/90/ll_model.pth b/90/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..891d715075c9e3e221836f9cc8da974f43d552fb --- /dev/null +++ b/90/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f83b14a664e9da5bd04aa0a7d77f24cfb7c628f9635efc938700bc99f6e1c46 +size 14698 diff --git a/90/ll_model_cfg.pkl b/90/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7449c91db5a4af72fc3e64e13d5a17a1c5fe112b --- /dev/null +++ b/90/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c85e1f4714f9fca8b018f160458611bf212599e6f9edc83de523721c6027e60f +size 1093 diff --git a/90/meta.json b/90/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..bd5ac8282f72df1beab184e97acbb314d741180d --- /dev/null +++ b/90/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.4, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-82-siit-weigth-0.4", "wandb_name": "case-90-seed-82-s-0.4-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 82, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/95/edges.pkl b/95/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e5ca91be0d15dfebb7961bc819259c28cd200595 --- /dev/null +++ b/95/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2cb02e0518a50c2b53be59ea3c3788ec80c0d3c6c1e70070bd7115abdfae77 +size 113 diff --git a/95/ll_model.pth b/95/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..1e6cd2f53fb849836ee0bf356500feff31084acb --- /dev/null +++ b/95/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5665811a8574843411df9c47ec06cb70d1374744dcfbe19a0728d7a54389c68a +size 14890 diff --git a/95/ll_model_cfg.pkl b/95/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ecc680f7c23f0e49423b45c5c55f59d1596737be --- /dev/null +++ b/95/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2be93b2822d25fdc67eb73f9767ddaa4c0f37adbb43f8442962b907c72b24335 +size 1093 diff --git a/95/meta.json b/95/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..cbaceebffdd3ef6b7dc172a9ea69904f8db085cd --- /dev/null +++ b/95/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-67-siit-weigth-0.7", "wandb_name": "case-95-seed-67-s-0.7-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 67, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/97/edges.pkl b/97/edges.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b927cb5a367468165b1ed0fc593752c9816e5746 --- /dev/null +++ b/97/edges.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afdfd64f4c8be661a44ae212318dcb22d1cd61cd023a9f510b0ed3cf9ceceac6 +size 1690 diff --git a/97/ll_model.pth b/97/ll_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..de54a00622412d60dd4b3fe9696dd6a45dd4e3bb --- /dev/null +++ b/97/ll_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7edb6ff1572d3b7b43e97ff512fe1bd0c78f47b00724906c72ef175338173028 +size 5869062 diff --git a/97/ll_model_cfg.pkl b/97/ll_model_cfg.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9060fab742a5cfe522c32511aa7e5cef7bd308dc --- /dev/null +++ b/97/ll_model_cfg.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb2f82ed87ef19e802ed3400f3b508631496f474aae19e07304666c6d1a06d5 +size 1103 diff --git a/97/meta.json b/97/meta.json new file mode 100644 index 0000000000000000000000000000000000000000..16bf425a99ba09e8c6af55158712b9aaf4f09e4d --- /dev/null +++ b/97/meta.json @@ -0,0 +1 @@ +{"output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, "strict_weight": 0.4, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", "use_wandb": true, "wandb_project": "iit-train-seed-82-siit-weigth-0.4", "wandb_name": "case-97-seed-82-s-0.4-b-0.4-iit-1", "save_model_to_wandb": true, "device": "cuda", "clip_grad_norm": 0.1, "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, "seed": 82, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, "scheduler_val_metric": ["val/accuracy", "val/IIA", "val/strict_accuracy"], "siit_sampling": "sample_all", "val_iia_sampling": "all"} \ No newline at end of file diff --git a/benchmark_cases_metadata.csv b/benchmark_cases_metadata.csv index c5c3d65ca4e26262fb3c49d8e4959f6d4730a043..8a5f499df6f53bbf8c6ab082d630b79c1c917981 100644 --- a/benchmark_cases_metadata.csv +++ b/benchmark_cases_metadata.csv @@ -1,48 +1,74 @@ case_id,url,task_description,max_seq_len,min_seq_len,transformer_cfg_file_url,training_args_file_url,weights_file_url,circuit_file_url,transformer_cfg.n_layers,transformer_cfg.d_model,transformer_cfg.n_ctx,transformer_cfg.d_head,transformer_cfg.model_name,transformer_cfg.n_heads,transformer_cfg.d_mlp,transformer_cfg.act_fn,transformer_cfg.d_vocab,transformer_cfg.eps,transformer_cfg.use_attn_result,transformer_cfg.use_attn_scale,transformer_cfg.use_split_qkv_input,transformer_cfg.use_hook_mlp_in,transformer_cfg.use_attn_in,transformer_cfg.use_local_attn,transformer_cfg.original_architecture,transformer_cfg.from_checkpoint,transformer_cfg.tokenizer_name,transformer_cfg.init_mode,transformer_cfg.normalization_type,transformer_cfg.n_devices,transformer_cfg.attention_dir,transformer_cfg.attn_only,transformer_cfg.seed,transformer_cfg.initializer_range,transformer_cfg.init_weights,transformer_cfg.scale_attn_by_inverse_layer_idx,transformer_cfg.positional_embedding_type,transformer_cfg.final_rms,transformer_cfg.d_vocab_out,transformer_cfg.parallel_attn_mlp,transformer_cfg.n_params,transformer_cfg.use_hook_tokens,transformer_cfg.gated_mlp,transformer_cfg.default_prepend_bos,transformer_cfg.dtype,transformer_cfg.tokenizer_prepends_bos,transformer_cfg.post_embedding_ln,transformer_cfg.rotary_base,transformer_cfg.trust_remote_code,transformer_cfg.rotary_adjacent_pairs,transformer_cfg.load_in_4bit,training_args.output_dir,training_args.atol,training_args.lr,training_args.use_single_loss,training_args.iit_weight,training_args.behavior_weight,training_args.strict_weight,training_args.epochs,training_args.early_stop_accuracy_threshold,training_args.act_fn,training_args.use_wandb,training_args.save_model_to_wandb,training_args.clip_grad_norm,training_args.lr_scheduler,training_args.model_pair,training_args.same_size,training_args.seed,training_args.batch_size,training_args.include_mlp,training_args.detach_while_caching,training_args.scheduler_val_metric,training_args.siit_sampling,training_args.val_iia_sampling,training_args.next_token,training_args.non_ioi_thresh,training_args.use_per_token_check,training_args.num_workers,training_args.early_stop,training_args.scheduler_mode,training_args.val_IIA_sampling,training_args.use_all_tokens_for_behavior,training_args.optimizer_kwargs.betas -101,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/101,Check if each element is a square of an integer.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12199885626608374,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -103,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/103,Swap consecutive numbers in a list,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10787197799411874,True,False,standard,False,11,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -11,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/11,Counts the number of words in a sequence based on their length.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/edges.pkl,2,12,10,3,custom,4,48,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1460593486680443,True,False,standard,False,5,False,3456,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,500.0,,gelu,True,True,1.0,,,True,,,True,True,,,,True,,True,,True,,,True, +52,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/52,Takes the square root of each element.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +7,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/7,Returns the number of times each token occurs in the input.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/edges.pkl,2,17,10,4,custom,4,68,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.15689290811054724,True,False,standard,False,10,False,6800,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.5,2000.0,,gelu,True,True,0.1,,strict,False,1234.0,256.0,False,True,,,,False,,True,,True,,,True, +13,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/13,"Analyzes the trend (increasing, decreasing, constant) of numeric tokens.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/edges.pkl,2,20,10,5,custom,4,80,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.1460593486680443,True,False,standard,False,3,False,9600,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,500.0,,gelu,True,True,1.0,,,True,,,True,True,,,,True,,True,,True,,,True, +63,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/63,Replaces each element with the number of elements less than it in the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/edges.pkl,2,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.13719886811400708,True,False,standard,False,10,False,13824,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +60,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/60,Increment each element in the sequence by 1.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +29,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/29,Creates abbreviations for each token in the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/edges.pkl,2,13,10,3,custom,4,52,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1539600717839002,True,False,standard,False,8,False,3952,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, +79,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/79,Check if each number in a sequence is prime,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +24,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/24,Identifies the first occurrence of each token in a sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/edges.pkl,2,36,10,9,custom,4,144,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1885618083164127,True,False,standard,False,3,False,31104,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +82,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/82,Halve the elements in the second half of the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/edges.pkl,4,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.1059625885652035,True,False,standard,False,16,False,27648,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +31,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/31,Identify if tokens in the sequence are anagrams of the word 'listen'.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/edges.pkl,2,4,10,1,custom,4,16,gelu,11,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.17056057308448835,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +72,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/72,Negate each element in the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/edges.pkl,2,4,10,1,custom,4,16,gelu,22,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.11202240672224079,True,False,standard,False,20,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +104,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/104,Apply exponential function to all elements of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +18,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/18,"Classify each token based on its frequency as 'rare', 'common', or 'frequent'.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/edges.pkl,2,26,10,6,custom,4,104,gelu,7,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.12344267996967354,True,False,standard,False,3,False,15808,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, 110,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/110,"Inserts zeros between each element, removing the latter half of the list.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/edges.pkl,2,20,10,5,custom,4,80,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.11925695879998878,True,False,standard,False,11,False,9600,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -111,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/111,Returns the last element of the sequence and pads the rest with zeros.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.09847319278346618,True,False,standard,False,11,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -113,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/113,"Inverts the sequence if it is sorted in ascending order, otherwise leaves it unchanged.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/edges.pkl,7,88,10,22,custom,4,352,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.0512147519731584,True,False,standard,False,30,False,650496,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +69,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/69,"Assign -1, 0, or 1 to each element of the input sequence based on its sign.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +87,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/87,Binarize a sequence of integers using a threshold.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +80,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/80,Subtract a constant from each element of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +83,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/83,Triple each element in the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +45,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/45,Doubles the first half of the sequence,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.11094003924504584,True,False,standard,False,16,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, 114,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/114,Apply a logarithm base 10 to each element of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/edges.pkl,2,4,10,1,custom,4,16,gelu,12,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.14368424162141993,True,False,standard,False,10,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -122,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/122,Check if each number is divisible by 3.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12199885626608374,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -124,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/124,Check if all elements in a list are equal.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.11547005383792516,True,False,standard,False,2,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -129,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/129,Checks if all elements are a multiple of n (set the default at 2).,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/edges.pkl,3,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10504514628777804,True,False,standard,False,2,False,576,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -13,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/13,"Analyzes the trend (increasing, decreasing, constant) of numeric tokens.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/edges.pkl,2,20,10,5,custom,4,80,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.1460593486680443,True,False,standard,False,3,False,9600,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,500.0,,gelu,True,True,1.0,,,True,,,True,True,,,,True,,True,,True,,,True, +33,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/33,Checks if each token's length is odd or even.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/edges.pkl,2,4,10,1,custom,4,16,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.17457431218879393,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,,True,,,True,True,,,,True,,True,,True,,,True, +ioi,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi,Indirect Object Identification (IOI) task.,16,16,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/edges.pkl,6,64,1024,16,gpt2,4,3072,gelu_new,50257,1e-05,False,True,False,False,False,False,GPT2LMHeadModel,False,gpt2,gpt2,LNPre,1,causal,False,,0.02886751345948129,False,False,standard,False,50257,False,84934656,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,,,,True,True,1.0,,,True,0.0,512.0,True,True,"val/accuracy,val/IIA",individual,,False,0.65,False,0.0,True,max,random,False,"0.9,0.9" +55,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/55,Applies the hyperbolic sine to each element.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +21,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/21,Extract unique tokens from a string,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/edges.pkl,4,50,10,12,custom,4,200,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.09847319278346618,True,False,standard,False,3,False,118400,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.0005,False,1.0,1.0,0.5,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, +4,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/4,Return fraction of previous open tokens minus the fraction of close tokens.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/edges.pkl,2,20,10,5,custom,4,80,gelu,7,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.17056057308448835,True,False,standard,False,1,False,9600,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,,True,,,True,True,,,,True,,True,,True,,,True, +30,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/30,Tags numeric tokens in a sequence based on whether they fall within a given range.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12199885626608374,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +90,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/90,Replaces a specific token with another one.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/edges.pkl,2,4,10,1,custom,4,16,gelu,12,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1460593486680443,True,False,standard,False,9,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,82.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +39,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/39,Returns the fraction of 'x' in the input up to the i-th position for all i.,60,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/edges.pkl,2,120,60,30,custom,4,480,gelu,28,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.08432740427115679,True,False,standard,False,1,False,345600,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +75,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/75,Double each element of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +101,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/101,Check if each element is a square of an integer.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12199885626608374,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +ioi_next_token,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi_next_token,"Indirect Object Identification (IOI) task, trained using next token prediction.",16,16,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/edges.pkl,6,64,1024,16,gpt2,4,3072,gelu_new,50257,1e-05,False,True,False,False,False,False,GPT2LMHeadModel,False,gpt2,gpt2,LNPre,1,causal,False,,0.02886751345948129,True,False,standard,False,50257,False,2457600,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,,,,True,True,1.0,,,True,,256.0,True,True,"val/accuracy,val/IIA",,,True,0.65,False,0.0,True,max,,True, +65,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/65,Calculate the cube root of each element in the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, 14,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/14,Returns the count of 'a' in the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/edges.pkl,2,8,10,2,custom,4,32,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.15689290811054724,True,False,standard,False,10,False,1536,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -18,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/18,"Classify each token based on its frequency as 'rare', 'common', or 'frequent'.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/edges.pkl,2,26,10,6,custom,4,104,gelu,7,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.12344267996967354,True,False,standard,False,3,False,15808,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, +95,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/95,Counts the distinct prime factors of each number in the input list.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12060453783110546,True,False,standard,False,3,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +84,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/84,Apply the arctangent function to each element of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +93,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/93,Swaps the nth with the n+1th element if n%2==1.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/edges.pkl,3,20,10,5,custom,4,80,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10886621079036347,True,False,standard,False,11,False,14400,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +37,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/37,Reverses each word in the sequence except for specified exclusions.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/edges.pkl,2,12,10,3,custom,4,48,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1539600717839002,True,False,standard,False,8,False,3456,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, +34,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/34,Calculate the ratio of vowels to consonants in each word.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/edges.pkl,2,16,10,4,custom,4,64,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,5,False,6144,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, +26,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/26,Creates a cascading effect by repeating each token in sequence incrementally.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/edges.pkl,2,21,10,5,custom,4,84,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12344267996967354,True,False,standard,False,27,False,10416,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, 19,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/19,Removes consecutive duplicate tokens from a sequence.,15,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/edges.pkl,2,32,15,8,custom,4,128,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.15689290811054724,True,False,standard,False,3,False,24576,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,,True,,,True,True,,,,True,,True,,True,,,True, +53,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/53,Increment elements at odd indices by 1,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.13151918984428584,True,False,standard,False,12,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +40,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/40,Sum the last and previous to last digits of a number,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/edges.pkl,2,4,10,1,custom,4,16,gelu,31,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.09847319278346618,True,False,standard,False,12,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +51,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/51,Checks if each element is a Fibonacci number,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/edges.pkl,2,4,10,1,custom,4,16,gelu,102,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.07525766947068778,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +122,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/122,Check if each number is divisible by 3.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12199885626608374,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +85,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/85,Square each element of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, 2,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/2,Reverse the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/edges.pkl,4,56,10,14,custom,4,224,gelu,28,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.07593263966019993,True,False,standard,False,26,False,150528,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +44,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/44,Replaces each element with the number of elements greater than it in the sequence,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/edges.pkl,2,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.13719886811400708,True,False,standard,False,10,False,13824,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +113,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/113,"Inverts the sequence if it is sorted in ascending order, otherwise leaves it unchanged.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/edges.pkl,7,88,10,22,custom,4,352,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.0512147519731584,True,False,standard,False,30,False,650496,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +77,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/77,Apply the tangent function to each element of the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, 20,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/20,Detect spam messages based on appearance of spam keywords.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/edges.pkl,2,13,10,3,custom,4,52,gelu,14,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16,True,False,standard,False,2,False,3952,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, -21,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/21,Extract unique tokens from a string,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/edges.pkl,4,50,10,12,custom,4,200,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.09847319278346618,True,False,standard,False,3,False,118400,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.0005,False,1.0,1.0,0.5,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, -24,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/24,Identifies the first occurrence of each token in a sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/edges.pkl,2,36,10,9,custom,4,144,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1885618083164127,True,False,standard,False,3,False,31104,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, 25,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/25,Normalizes token frequencies in a sequence to a range between 0 and 1.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/edges.pkl,2,62,10,15,custom,4,248,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.08295613557843402,True,False,standard,False,56,False,91264,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -26,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/26,Creates a cascading effect by repeating each token in sequence incrementally.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/edges.pkl,2,21,10,5,custom,4,84,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12344267996967354,True,False,standard,False,27,False,10416,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, -29,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/29,Creates abbreviations for each token in the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/edges.pkl,2,13,10,3,custom,4,52,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1539600717839002,True,False,standard,False,8,False,3952,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, -3,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/3,Returns the fraction of 'x' in the input up to the i-th position for all i.,5,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/edges.pkl,2,12,5,3,custom,4,48,gelu,6,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.22188007849009167,True,False,standard,False,1,False,3456,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,10.0,2000.0,,gelu,True,True,0.1,,strict,False,,,True,True,,,,True,,True,,True,,,True, -30,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/30,Tags numeric tokens in a sequence based on whether they fall within a given range.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.12199885626608374,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -31,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/31,Identify if tokens in the sequence are anagrams of the word 'listen'.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/edges.pkl,2,4,10,1,custom,4,16,gelu,11,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.17056057308448835,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -33,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/33,Checks if each token's length is odd or even.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/edges.pkl,2,4,10,1,custom,4,16,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.17457431218879393,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,,True,,,True,True,,,,True,,True,,True,,,True, -34,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/34,Calculate the ratio of vowels to consonants in each word.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/edges.pkl,2,16,10,4,custom,4,64,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,5,False,6144,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, +11,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/11,Counts the number of words in a sequence based on their length.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/edges.pkl,2,12,10,3,custom,4,48,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1460593486680443,True,False,standard,False,5,False,3456,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.4,500.0,,gelu,True,True,1.0,,,True,,,True,True,,,,True,,True,,True,,,True, +54,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/54,Applies the hyperbolic tangent to each element.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +124,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/124,Check if all elements in a list are equal.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.11547005383792516,True,False,standard,False,2,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +102,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/102,"Reflects each element within a range (default is [2, 7]).",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/edges.pkl,2,4,10,1,custom,4,16,gelu,42,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1059625885652035,True,False,standard,False,6,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,82.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +129,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/129,Checks if all elements are a multiple of n (set the default at 2).,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/edges.pkl,3,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10504514628777804,True,False,standard,False,2,False,576,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +50,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/50,Applies the hyperbolic cosine to each element,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +123,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/123,Apply arccosine to each element of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/edges.pkl,2,4,10,1,custom,4,16,gelu,22,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.11202240672224079,True,False,standard,False,20,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +86,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/86,"Check if each element is a power of 2. Return 1 if true, otherwise 0.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +105,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/105,Replaces each number with the next prime after that number.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/edges.pkl,2,4,10,1,custom,4,16,gelu,32,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.11202240672224079,True,False,standard,False,10,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +58,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/58,Mirrors the first half of the sequence to the second half.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/edges.pkl,3,32,10,8,custom,4,128,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10415112878465911,True,False,standard,False,11,False,36864,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +70,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/70,Apply the cosine function to each element of the input sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +62,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/62,Replaces each element with its factorial.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1414213562373095,True,False,standard,False,10,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +111,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/111,Returns the last element of the sequence and pads the rest with zeros.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.09847319278346618,True,False,standard,False,11,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, 35,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/35,Alternates capitalization of each character in words.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/edges.pkl,2,9,10,2,custom,4,36,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1539600717839002,True,False,standard,False,8,False,1872,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, +3,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/3,Returns the fraction of 'x' in the input up to the i-th position for all i.,5,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/edges.pkl,2,12,5,3,custom,4,48,gelu,6,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.22188007849009167,True,False,standard,False,1,False,3456,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,10.0,2000.0,,gelu,True,True,0.1,,strict,False,,,True,True,,,,True,,True,,True,,,True, 36,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/36,"Classifies each token as 'positive', 'negative', or 'neutral' based on emojis.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/edges.pkl,2,6,10,1,custom,4,24,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.19402850002906638,True,False,standard,False,3,False,768,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, -37,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/37,Reverses each word in the sequence except for specified exclusions.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/edges.pkl,2,12,10,3,custom,4,48,gelu,10,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1539600717839002,True,False,standard,False,8,False,3456,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,1.0,2000.0,,gelu,True,True,0.1,,strict,True,,,True,True,,,,True,,True,,True,,,True, -39,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/39,Returns the fraction of 'x' in the input up to the i-th position for all i.,60,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/edges.pkl,2,120,60,30,custom,4,480,gelu,28,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.08432740427115679,True,False,standard,False,1,False,345600,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -4,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/4,Return fraction of previous open tokens minus the fraction of close tokens.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/edges.pkl,2,20,10,5,custom,4,80,gelu,7,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.17056057308448835,True,False,standard,False,1,False,9600,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,2000.0,,gelu,True,True,0.1,,,True,,,True,True,,,,True,,True,,True,,,True, -40,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/40,Sum the last and previous to last digits of a number,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/edges.pkl,2,4,10,1,custom,4,16,gelu,31,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.09847319278346618,True,False,standard,False,12,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -44,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/44,Replaces each element with the number of elements greater than it in the sequence,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/edges.pkl,2,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.13719886811400708,True,False,standard,False,10,False,13824,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -45,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/45,Doubles the first half of the sequence,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.11094003924504584,True,False,standard,False,16,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -51,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/51,Checks if each element is a Fibonacci number,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/edges.pkl,2,4,10,1,custom,4,16,gelu,102,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.07525766947068778,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +64,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/64,Cubes each element in the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +8,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/8,Identity,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +103,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/103,Swap consecutive numbers in a list,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/edges.pkl,3,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10787197799411874,True,False,standard,False,11,False,20736,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, 56,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/56,Sets every third element to zero.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.13333333333333333,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -58,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/58,Mirrors the first half of the sequence to the second half.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/edges.pkl,3,32,10,8,custom,4,128,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10415112878465911,True,False,standard,False,11,False,36864,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -63,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/63,Replaces each element with the number of elements less than it in the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/edges.pkl,2,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.13719886811400708,True,False,standard,False,10,False,13824,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -69,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/69,"Assign -1, 0, or 1 to each element of the input sequence based on its sign.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -7,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/7,Returns the number of times each token occurs in the input.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/edges.pkl,2,17,10,4,custom,4,68,gelu,5,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.15689290811054724,True,False,standard,False,10,False,6800,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.01,False,1.0,1.0,0.5,2000.0,,gelu,True,True,0.1,,strict,False,1234.0,256.0,False,True,,,,False,,True,,True,,,True, -79,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/79,Check if each number in a sequence is prime,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -82,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/82,Halve the elements in the second half of the sequence.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/edges.pkl,4,24,10,6,custom,4,96,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.1059625885652035,True,False,standard,False,16,False,27648,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -86,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/86,"Check if each element is a power of 2. Return 1 if true, otherwise 0.",10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -87,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/87,Binarize a sequence of integers using a threshold.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.16329931618554522,True,False,standard,False,2,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -93,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/93,Swaps the nth with the n+1th element if n%2==1.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/edges.pkl,3,20,10,5,custom,4,80,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.10886621079036347,True,False,standard,False,11,False,14400,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, -ioi,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi,Indirect Object Identification (IOI) task.,16,16,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/edges.pkl,6,64,1024,16,gpt2,4,3072,gelu_new,50257,1e-05,False,True,False,False,False,False,GPT2LMHeadModel,False,gpt2,gpt2,LNPre,1,causal,False,,0.02886751345948129,False,False,standard,False,50257,False,84934656,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,,,,True,True,1.0,,,True,0.0,512.0,True,True,"val/accuracy,val/IIA",individual,,False,0.65,False,0.0,True,max,random,False,"0.9,0.9" -ioi_next_token,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi_next_token,"Indirect Object Identification (IOI) task, trained using next token prediction.",16,16,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/edges.pkl,6,64,1024,16,gpt2,4,3072,gelu_new,50257,1e-05,False,True,False,False,False,False,GPT2LMHeadModel,False,gpt2,gpt2,LNPre,1,causal,False,,0.02886751345948129,True,False,standard,False,50257,False,2457600,False,False,True,torch.float32,False,False,10000,False,False,True,,0.05,0.001,False,1.0,1.0,0.4,,,,True,True,1.0,,,True,,256.0,True,True,"val/accuracy,val/IIA",,,True,0.65,False,0.0,True,max,,True, +97,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/97,Scale a sequence by its maximum element.,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/edges.pkl,3,200,10,50,custom,4,800,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,bidirectional,False,0.0,0.05405899027195888,True,False,standard,False,64,False,1440000,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.4,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,82.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, +46,https://huggingface.co/cybershiptrooper/InterpBench/tree/main/46,Decrements each element in the sequence by 1,10,4,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/ll_model_cfg.pkl,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/meta.json,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/ll_model.pth,https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/edges.pkl,2,4,10,1,custom,4,16,gelu,13,1e-05,True,True,True,True,False,False,,False,,gpt2,,1,causal,False,0.0,0.1392621247645583,True,False,standard,False,11,False,384,False,False,True,torch.float32,False,False,10000,False,False,False,/circuits-benchmark/results,0.05,0.001,True,1.0,0.4,0.7,1000.0,99.9,gelu,True,True,0.1,linear,strict,False,67.0,256.0,False,True,"val/accuracy,val/IIA,val/strict_accuracy",sample_all,all,True,,True,,True,,,True, diff --git a/benchmark_cases_metadata.parquet b/benchmark_cases_metadata.parquet index d06a0be30f4a35956d142442f5cc6d5bd49a829f..7f48c2aec6c8e1217dbf66604bb4f7f8b2ef09f5 100644 --- a/benchmark_cases_metadata.parquet +++ b/benchmark_cases_metadata.parquet @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e070cc1d81773e400a5c30ce2e9b328d370ef2e2f67083ac90fc0638e4d2da8 -size 72322 +oid sha256:729585ec39204363367ad83efc6bf10185f4e191fe490714be2ab026d1c9864c +size 75187 diff --git a/benchmark_metadata.json b/benchmark_metadata.json index fd5b25af8111724f95f53daf802b0bbe3b4ac7c2..a21bb7d64a15ec20752b0924a4fd9ab02823894a 100644 --- a/benchmark_metadata.json +++ b/benchmark_metadata.json @@ -6,9 +6,9 @@ "url": "https://huggingface.co/cybershiptrooper/InterpBench", "cases": [ { - "case_id": "101", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/101", - "task_description": "Check if each element is a square of an integer.", + "case_id": "52", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/52", + "task_description": "Takes the square root of each element.", "vocab": [ 0, 1, @@ -20,45 +20,26 @@ 7, 8, 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29 + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/meta.json" } ], "transformer_cfg": { @@ -70,7 +51,7 @@ "n_heads": 4, "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 32, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -92,12 +73,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.12199885626608374, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -115,7 +96,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -123,7 +104,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -145,57 +126,49 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/52/edges.pkl" }, { - "case_id": "103", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/103", - "task_description": "Swap consecutive numbers in a list", + "case_id": "7", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/7", + "task_description": "Returns the number of times each token occurs in the input.", "vocab": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 + "a", + "b", + "c" ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/meta.json" } ], "transformer_cfg": { - "n_layers": 3, - "d_model": 24, + "n_layers": 2, + "d_model": 17, "n_ctx": 10, - "d_head": 6, + "d_head": 4, "model_name": "custom", "n_heads": 4, - "d_mlp": 96, + "d_mlp": 68, "act_fn": "gelu", - "d_vocab": 13, + "d_vocab": 5, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -217,15 +190,15 @@ "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.10787197799411874, + "initializer_range": 0.15689290811054724, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 11, + "d_vocab_out": 10, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 20736, + "n_params": 6800, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -235,89 +208,71 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false, - "load_in_4bit": false, - "num_experts": null, - "experts_per_token": null + "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model_cfg.pkl", "training_args": { - "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.001, - "use_single_loss": true, + "lr": 0.01, + "use_single_loss": false, "iit_weight": 1.0, - "behavior_weight": 0.4, - "strict_weight": 0.4, - "epochs": 1000, - "early_stop_accuracy_threshold": 99.9, + "behavior_weight": 1.0, + "strict_weight": 0.5, + "epochs": 2000, "act_fn": "gelu", - "use_wandb": true, - "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "linear", + "lr_scheduler": "", "model_pair": "strict", "same_size": false, - "seed": 67, + "seed": 1234, "batch_size": 256, "include_mlp": false, - "detach_while_caching": true, - "scheduler_val_metric": [ - "val/accuracy", - "val/IIA", - "val/strict_accuracy" - ], - "siit_sampling": "sample_all", - "val_iia_sampling": "all" + "next_token": false, + "detach_while_caching": true }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/edges.pkl" }, { - "case_id": "11", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/11", - "task_description": "Counts the number of words in a sequence based on their length.", + "case_id": "13", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/13", + "task_description": "Analyzes the trend (increasing, decreasing, constant) of numeric tokens.", "vocab": [ - "J", - "LB", - "TPSI", - "V", - "b", - "no", - "oCLrZaW", - "poiVg" + 0, + 1, + 2 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 12, + "d_model": 20, "n_ctx": 10, - "d_head": 3, + "d_head": 5, "model_name": "custom", "n_heads": 4, - "d_mlp": 48, + "d_mlp": 80, "act_fn": "gelu", - "d_vocab": 10, + "d_vocab": 5, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -336,7 +291,7 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "causal", + "attention_dir": "bidirectional", "attn_only": false, "seed": 0, "initializer_range": 0.1460593486680443, @@ -344,10 +299,10 @@ "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 5, + "d_vocab_out": 3, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 3456, + "n_params": 9600, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -359,7 +314,7 @@ "trust_remote_code": false, "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model_cfg.pkl", "training_args": { "atol": 0.05, "lr": 0.01, @@ -372,14 +327,14 @@ "clip_grad_norm": 1.0, "lr_scheduler": "" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/edges.pkl" }, { - "case_id": "110", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/110", - "task_description": "Inserts zeros between each element, removing the latter half of the list.", + "case_id": "63", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/63", + "task_description": "Replaces each element with the number of elements less than it in the sequence.", "vocab": [ 0, 1, @@ -398,29 +353,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 20, + "d_model": 24, "n_ctx": 10, - "d_head": 5, + "d_head": 6, "model_name": "custom", "n_heads": 4, - "d_mlp": 80, + "d_mlp": 96, "act_fn": "gelu", "d_vocab": 13, "eps": 1e-05, @@ -444,15 +399,15 @@ "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.11925695879998878, + "initializer_range": 0.13719886811400708, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 11, + "d_vocab_out": 10, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 9600, + "n_params": 13824, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -467,7 +422,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -497,14 +452,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/edges.pkl" }, { - "case_id": "111", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/111", - "task_description": "Returns the last element of the sequence and pads the rest with zeros.", + "case_id": "60", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/60", + "task_description": "Increment each element in the sequence by 1.", "vocab": [ 0, 1, @@ -523,29 +478,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/meta.json" } ], "transformer_cfg": { - "n_layers": 3, - "d_model": 24, + "n_layers": 2, + "d_model": 4, "n_ctx": 10, - "d_head": 6, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 96, + "d_mlp": 16, "act_fn": "gelu", "d_vocab": 13, "eps": 1e-05, @@ -566,10 +521,10 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.09847319278346618, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", @@ -577,7 +532,7 @@ "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 20736, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -592,7 +547,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -600,7 +555,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -622,76 +577,54 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/60/edges.pkl" }, { - "case_id": "113", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/113", - "task_description": "Inverts the sequence if it is sorted in ascending order, otherwise leaves it unchanged.", + "case_id": "29", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/29", + "task_description": "Creates abbreviations for each token in the sequence.", "vocab": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29 + "J", + "LB", + "TPSI", + "V", + "b", + "no", + "oCLrZaW", + "poiVg" ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/meta.json" } ], "transformer_cfg": { - "n_layers": 7, - "d_model": 88, + "n_layers": 2, + "d_model": 13, "n_ctx": 10, - "d_head": 22, + "d_head": 3, "model_name": "custom", "n_heads": 4, - "d_mlp": 352, + "d_mlp": 52, "act_fn": "gelu", - "d_vocab": 32, + "d_vocab": 10, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -710,18 +643,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.0512147519731584, + "initializer_range": 0.1539600717839002, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 30, + "d_vocab_out": 8, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 650496, + "n_params": 3952, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -731,50 +664,32 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false, - "load_in_4bit": false, - "num_experts": null, - "experts_per_token": null + "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model_cfg.pkl", "training_args": { - "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.001, - "use_single_loss": true, + "lr": 0.01, + "use_single_loss": false, "iit_weight": 1.0, - "behavior_weight": 0.4, + "behavior_weight": 1.0, "strict_weight": 0.4, - "epochs": 1000, - "early_stop_accuracy_threshold": 99.9, + "epochs": 2000, "act_fn": "gelu", - "use_wandb": true, - "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "linear", - "model_pair": "strict", - "same_size": false, - "seed": 67, - "batch_size": 256, - "include_mlp": false, - "detach_while_caching": true, - "scheduler_val_metric": [ - "val/accuracy", - "val/IIA", - "val/strict_accuracy" - ], - "siit_sampling": "sample_all", - "val_iia_sampling": "all" + "lr_scheduler": "", + "model_pair": "strict" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/edges.pkl" }, { - "case_id": "114", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/114", - "task_description": "Apply a logarithm base 10 to each element of the input sequence.", + "case_id": "79", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/79", + "task_description": "Check if each number in a sequence is prime", "vocab": [ + 0, 1, 2, 3, @@ -791,19 +706,19 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/meta.json" } ], "transformer_cfg": { @@ -815,7 +730,7 @@ "n_heads": 4, "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 12, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -837,12 +752,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.14368424162141993, + "initializer_range": 0.16329931618554522, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 10, + "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -860,7 +775,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -890,76 +805,49 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/edges.pkl" }, { - "case_id": "122", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/122", - "task_description": "Check if each number is divisible by 3.", + "case_id": "24", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/24", + "task_description": "Identifies the first occurrence of each token in a sequence.", "vocab": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29 + "a", + "b", + "c" ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 4, + "d_model": 36, "n_ctx": 10, - "d_head": 1, + "d_head": 9, "model_name": "custom", "n_heads": 4, - "d_mlp": 16, + "d_mlp": 144, "act_fn": "gelu", - "d_vocab": 32, + "d_vocab": 5, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -981,15 +869,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.12199885626608374, + "initializer_range": 0.1885618083164127, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 3, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 384, + "n_params": 31104, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1004,7 +892,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -1034,14 +922,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/edges.pkl" }, { - "case_id": "124", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/124", - "task_description": "Check if all elements in a list are equal.", + "case_id": "82", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/82", + "task_description": "Halve the elements in the second half of the sequence.", "vocab": [ 0, 1, @@ -1060,23 +948,23 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/meta.json" } ], "transformer_cfg": { - "n_layers": 3, + "n_layers": 4, "d_model": 24, "n_ctx": 10, "d_head": 6, @@ -1106,15 +994,15 @@ "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.11547005383792516, + "initializer_range": 0.1059625885652035, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 16, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 20736, + "n_params": 27648, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1129,7 +1017,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -1159,68 +1047,47 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/edges.pkl" }, { - "case_id": "129", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/129", - "task_description": "Checks if all elements are a multiple of n (set the default at 2).", + "case_id": "31", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/31", + "task_description": "Identify if tokens in the sequence are anagrams of the word 'listen'.", "vocab": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29 - ], - "max_seq_len": 10, - "min_seq_len": 4, - "files": [ - { - "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/edges.pkl" - }, + "J", + "LB", + "TPSI", + "V", + "b", + "listen", + "no", + "oCLrZaW", + "poiVg" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/edges.pkl" + }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/meta.json" } ], "transformer_cfg": { - "n_layers": 3, + "n_layers": 2, "d_model": 4, "n_ctx": 10, "d_head": 1, @@ -1228,7 +1095,7 @@ "n_heads": 4, "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 32, + "d_vocab": 11, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -1247,10 +1114,10 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.10504514628777804, + "initializer_range": 0.17056057308448835, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", @@ -1258,7 +1125,7 @@ "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 576, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1273,7 +1140,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -1303,49 +1170,66 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/edges.pkl" }, { - "case_id": "13", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/13", - "task_description": "Analyzes the trend (increasing, decreasing, constant) of numeric tokens.", + "case_id": "72", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/72", + "task_description": "Negate each element in the input sequence.", "vocab": [ + -10, + -9, + -8, + -7, + -6, + -5, + -4, + -3, + -2, + -1, 0, 1, - 2 + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 20, + "d_model": 4, "n_ctx": 10, - "d_head": 5, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 80, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 22, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -1364,18 +1248,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.1460593486680443, + "initializer_range": 0.11202240672224079, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 3, + "d_vocab_out": 20, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 9600, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1385,64 +1269,92 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.01, - "use_single_loss": false, + "lr": 0.001, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 0.4, - "epochs": 500, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", - "clip_grad_norm": 1.0, - "lr_scheduler": "" + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/13/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/72/edges.pkl" }, { - "case_id": "14", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/14", - "task_description": "Returns the count of 'a' in the input sequence.", + "case_id": "104", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/104", + "task_description": "Apply exponential function to all elements of the input sequence.", "vocab": [ - "a", - "b", - "c" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 8, + "d_model": 4, "n_ctx": 10, - "d_head": 2, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 32, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -1461,18 +1373,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.15689290811054724, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 10, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 1536, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1487,7 +1399,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -1495,7 +1407,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -1517,9 +1429,9 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/104/edges.pkl" }, { "case_id": "18", @@ -1622,44 +1534,52 @@ "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/18/edges.pkl" }, { - "case_id": "19", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/19", - "task_description": "Removes consecutive duplicate tokens from a sequence.", + "case_id": "110", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/110", + "task_description": "Inserts zeros between each element, removing the latter half of the list.", "vocab": [ - "a", - "b", - "c" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], - "max_seq_len": 15, + "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 32, - "n_ctx": 15, - "d_head": 8, + "d_model": 20, + "n_ctx": 10, + "d_head": 5, "model_name": "custom", "n_heads": 4, - "d_mlp": 128, + "d_mlp": 80, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -1678,18 +1598,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "causal", + "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.15689290811054724, + "initializer_range": 0.11925695879998878, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 3, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 24576, + "n_params": 9600, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1699,87 +1619,92 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, - "use_single_loss": false, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, + "behavior_weight": 0.4, "strict_weight": 0.4, - "epochs": 2000, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/110/edges.pkl" }, { - "case_id": "2", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/2", - "task_description": "Reverse the input sequence.", + "case_id": "69", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/69", + "task_description": "Assign -1, 0, or 1 to each element of the input sequence based on its sign.", "vocab": [ - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/meta.json" } ], "transformer_cfg": { - "n_layers": 4, - "d_model": 56, + "n_layers": 2, + "d_model": 4, "n_ctx": 10, - "d_head": 14, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 224, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 28, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -1798,18 +1723,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.07593263966019993, + "initializer_range": 0.16329931618554522, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 26, + "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 150528, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1824,7 +1749,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -1854,58 +1779,57 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/edges.pkl" }, { - "case_id": "20", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/20", - "task_description": "Detect spam messages based on appearance of spam keywords.", + "case_id": "87", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/87", + "task_description": "Binarize a sequence of integers using a threshold.", "vocab": [ - "J", - "LB", - "TPSI", - "V", - "b", - "click", - "no", - "now", - "oCLrZaW", - "offer", - "poiVg", - "spam" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 13, + "d_model": 4, "n_ctx": 10, - "d_head": 3, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 52, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 14, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -1927,7 +1851,7 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.16, + "initializer_range": 0.16329931618554522, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", @@ -1935,7 +1859,7 @@ "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 3952, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -1945,65 +1869,92 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.01, - "use_single_loss": false, + "lr": 0.001, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, + "behavior_weight": 0.4, "strict_weight": 0.4, - "epochs": 2000, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/edges.pkl" }, { - "case_id": "21", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/21", - "task_description": "Extract unique tokens from a string", + "case_id": "80", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/80", + "task_description": "Subtract a constant from each element of the input sequence.", "vocab": [ - "a", - "b", - "c" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/meta.json" } ], "transformer_cfg": { - "n_layers": 4, - "d_model": 50, + "n_layers": 2, + "d_model": 4, "n_ctx": 10, - "d_head": 12, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 200, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -2025,15 +1976,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.09847319278346618, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 3, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 118400, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -2043,65 +1994,92 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.0005, - "use_single_loss": false, + "lr": 0.001, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 0.5, - "epochs": 2000, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/80/edges.pkl" }, { - "case_id": "24", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/24", - "task_description": "Identifies the first occurrence of each token in a sequence.", + "case_id": "83", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/83", + "task_description": "Triple each element in the sequence.", "vocab": [ - "a", - "b", - "c" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 36, + "d_model": 4, "n_ctx": 10, - "d_head": 9, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 144, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -2123,15 +2101,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.1885618083164127, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 3, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 31104, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -2146,7 +2124,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -2154,7 +2132,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -2176,49 +2154,57 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/24/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/83/edges.pkl" }, { - "case_id": "25", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/25", - "task_description": "Normalizes token frequencies in a sequence to a range between 0 and 1.", + "case_id": "45", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/45", + "task_description": "Doubles the first half of the sequence", "vocab": [ - "a", - "b", - "c" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/meta.json" } ], "transformer_cfg": { - "n_layers": 2, - "d_model": 62, + "n_layers": 3, + "d_model": 24, "n_ctx": 10, - "d_head": 15, + "d_head": 6, "model_name": "custom", "n_heads": 4, - "d_mlp": 248, + "d_mlp": 96, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -2240,15 +2226,15 @@ "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.08295613557843402, + "initializer_range": 0.11094003924504584, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 56, + "d_vocab_out": 16, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 91264, + "n_params": 20736, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -2263,7 +2249,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -2293,49 +2279,56 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/edges.pkl" }, { - "case_id": "26", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/26", - "task_description": "Creates a cascading effect by repeating each token in sequence incrementally.", + "case_id": "114", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/114", + "task_description": "Apply a logarithm base 10 to each element of the input sequence.", "vocab": [ - "a", - "b", - "c" + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 21, + "d_model": 4, "n_ctx": 10, - "d_head": 5, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 84, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 12, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -2357,15 +2350,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.12344267996967354, + "initializer_range": 0.14368424162141993, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 27, + "d_vocab_out": 10, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 10416, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -2375,30 +2368,49 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.01, - "use_single_loss": false, + "lr": 0.001, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, + "behavior_weight": 0.4, "strict_weight": 0.4, - "epochs": 2000, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/114/edges.pkl" }, { - "case_id": "29", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/29", - "task_description": "Creates abbreviations for each token in the sequence.", + "case_id": "33", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/33", + "task_description": "Checks if each token's length is odd or even.", "vocab": [ "J", "LB", @@ -2414,29 +2426,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 13, + "d_model": 4, "n_ctx": 10, - "d_head": 3, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 52, + "d_mlp": 16, "act_fn": "gelu", "d_vocab": 10, "eps": 1e-05, @@ -2460,15 +2472,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.1539600717839002, + "initializer_range": 0.17457431218879393, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 8, + "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 3952, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -2480,10 +2492,10 @@ "trust_remote_code": false, "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model_cfg.pkl", "training_args": { "atol": 0.05, - "lr": 0.01, + "lr": 0.001, "use_single_loss": false, "iit_weight": 1.0, "behavior_weight": 1.0, @@ -2491,167 +2503,160 @@ "epochs": 2000, "act_fn": "gelu", "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/29/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/edges.pkl" }, { - "case_id": "3", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/3", - "task_description": "Returns the fraction of 'x' in the input up to the i-th position for all i.", - "vocab": [ - "a", - "b", - "c", - "x" - ], - "max_seq_len": 5, - "min_seq_len": 4, + "case_id": "ioi", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi", + "task_description": "Indirect Object Identification (IOI) task.", + "max_seq_len": 16, + "min_seq_len": 16, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/meta.json" } ], "transformer_cfg": { - "n_layers": 2, - "d_model": 12, - "n_ctx": 5, - "d_head": 3, - "model_name": "custom", + "n_layers": 6, + "d_model": 64, + "n_ctx": 1024, + "d_head": 16, + "model_name": "gpt2", "n_heads": 4, - "d_mlp": 48, - "act_fn": "gelu", - "d_vocab": 6, + "d_mlp": 3072, + "act_fn": "gelu_new", + "d_vocab": 50257, "eps": 1e-05, - "use_attn_result": true, + "use_attn_result": false, "use_attn_scale": true, - "use_split_qkv_input": true, - "use_hook_mlp_in": true, + "use_split_qkv_input": false, + "use_hook_mlp_in": false, "use_attn_in": false, "use_local_attn": false, - "original_architecture": null, + "original_architecture": "GPT2LMHeadModel", "from_checkpoint": false, "checkpoint_index": null, "checkpoint_label_type": null, "checkpoint_value": null, - "tokenizer_name": null, + "tokenizer_name": "gpt2", "window_size": null, "attn_types": null, "init_mode": "gpt2", - "normalization_type": null, + "normalization_type": "LNPre", "n_devices": 1, "attention_dir": "causal", "attn_only": false, - "seed": 0, - "initializer_range": 0.22188007849009167, - "init_weights": true, + "seed": null, + "initializer_range": 0.02886751345948129, + "init_weights": false, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 1, + "d_vocab_out": 50257, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 3456, + "n_params": 84934656, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, "dtype": "torch.float32", - "tokenizer_prepends_bos": null, + "tokenizer_prepends_bos": false, "n_key_value_heads": null, "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model_cfg.pkl", "training_args": { - "atol": 0.05, - "lr": 0.01, - "use_single_loss": false, - "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 10.0, - "epochs": 2000, - "act_fn": "gelu", - "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict", - "same_size": false - }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/edges.pkl" + "next_token": false, + "non_ioi_thresh": 0.65, + "use_per_token_check": false, + "batch_size": 512, + "num_workers": 0, + "early_stop": true, + "lr_scheduler": null, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA" + ], + "scheduler_mode": "max", + "scheduler_kwargs": {}, + "clip_grad_norm": 1.0, + "seed": 0, + "lr": 0.001, + "detach_while_caching": true, + "optimizer_kwargs": { + "betas": [ + 0.9, + 0.9 + ] + }, + "atol": 0.05, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "val_IIA_sampling": "random", + "use_all_tokens_for_behavior": false, + "strict_weight": 0.4, + "siit_sampling": "individual" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/edges.pkl" }, { - "case_id": "30", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/30", - "task_description": "Tags numeric tokens in a sequence based on whether they fall within a given range.", + "case_id": "55", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/55", + "task_description": "Applies the hyperbolic sine to each element.", "vocab": [ - "0", - "1", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "2", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "3", - "4", - "5", - "6", - "7", - "8", - "9" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/meta.json" } ], "transformer_cfg": { @@ -2663,7 +2668,7 @@ "n_heads": 4, "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 32, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -2685,12 +2690,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.12199885626608374, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -2708,7 +2713,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -2716,7 +2721,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -2738,55 +2743,49 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/55/edges.pkl" }, { - "case_id": "31", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/31", - "task_description": "Identify if tokens in the sequence are anagrams of the word 'listen'.", + "case_id": "21", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/21", + "task_description": "Extract unique tokens from a string", "vocab": [ - "J", - "LB", - "TPSI", - "V", + "a", "b", - "listen", - "no", - "oCLrZaW", - "poiVg" + "c" ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/meta.json" } ], "transformer_cfg": { - "n_layers": 2, - "d_model": 4, + "n_layers": 4, + "d_model": 50, "n_ctx": 10, - "d_head": 1, + "d_head": 12, "model_name": "custom", "n_heads": 4, - "d_mlp": 16, + "d_mlp": 200, "act_fn": "gelu", - "d_vocab": 11, + "d_vocab": 5, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -2808,15 +2807,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.17056057308448835, + "initializer_range": 0.09847319278346618, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 3, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 384, + "n_params": 118400, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -2826,89 +2825,67 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false, - "load_in_4bit": false, - "num_experts": null, - "experts_per_token": null + "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model_cfg.pkl", "training_args": { - "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.001, - "use_single_loss": true, + "lr": 0.0005, + "use_single_loss": false, "iit_weight": 1.0, - "behavior_weight": 0.4, - "strict_weight": 0.4, - "epochs": 1000, - "early_stop_accuracy_threshold": 99.9, + "behavior_weight": 1.0, + "strict_weight": 0.5, + "epochs": 2000, "act_fn": "gelu", - "use_wandb": true, - "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "linear", - "model_pair": "strict", - "same_size": false, - "seed": 67, - "batch_size": 256, - "include_mlp": false, - "detach_while_caching": true, - "scheduler_val_metric": [ - "val/accuracy", - "val/IIA", - "val/strict_accuracy" - ], - "siit_sampling": "sample_all", - "val_iia_sampling": "all" + "lr_scheduler": "", + "model_pair": "strict" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/31/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/21/edges.pkl" }, { - "case_id": "33", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/33", - "task_description": "Checks if each token's length is odd or even.", + "case_id": "4", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/4", + "task_description": "Return fraction of previous open tokens minus the fraction of close tokens.", "vocab": [ - "J", - "LB", - "TPSI", - "V", + "(", + ")", + "a", "b", - "no", - "oCLrZaW", - "poiVg" + "c" ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 4, + "d_model": 20, "n_ctx": 10, - "d_head": 1, + "d_head": 5, "model_name": "custom", "n_heads": 4, - "d_mlp": 16, + "d_mlp": 80, "act_fn": "gelu", - "d_vocab": 10, + "d_vocab": 7, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -2930,15 +2907,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.17457431218879393, + "initializer_range": 0.17056057308448835, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 1, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 384, + "n_params": 9600, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -2950,7 +2927,7 @@ "trust_remote_code": false, "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model_cfg.pkl", "training_args": { "atol": 0.05, "lr": 0.001, @@ -2963,52 +2940,3376 @@ "clip_grad_norm": 0.1, "lr_scheduler": "" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/33/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/edges.pkl" }, { - "case_id": "34", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/34", - "task_description": "Calculate the ratio of vowels to consonants in each word.", + "case_id": "30", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/30", + "task_description": "Tags numeric tokens in a sequence based on whether they fall within a given range.", "vocab": [ - "J", - "LB", - "TPSI", - "V", - "b", - "no", - "oCLrZaW", - "poiVg" - ], - "max_seq_len": 10, - "min_seq_len": 4, - "files": [ - { - "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/edges.pkl" - }, - { - "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model.pth" - }, - { - "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model_cfg.pkl" - }, - { + "0", + "1", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "2", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 32, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.12199885626608374, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 2, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/30/edges.pkl" + }, + { + "case_id": "90", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/90", + "task_description": "Replaces a specific token with another one.", + "vocab": [ + "-", + "J", + "LB", + "TPSI", + "V", + "b", + "findme", + "no", + "oCLrZaW", + "poiVg" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 12, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1460593486680443, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 9, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 82, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/90/edges.pkl" + }, + { + "case_id": "39", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/39", + "task_description": "Returns the fraction of 'x' in the input up to the i-th position for all i.", + "vocab": [ + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z" + ], + "max_seq_len": 60, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 120, + "n_ctx": 60, + "d_head": 30, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 480, + "act_fn": "gelu", + "d_vocab": 28, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.08432740427115679, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 1, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 345600, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/edges.pkl" + }, + { + "case_id": "75", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/75", + "task_description": "Double each element of the input sequence.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1392621247645583, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 11, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/75/edges.pkl" + }, + { + "case_id": "101", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/101", + "task_description": "Check if each element is a square of an integer.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 32, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.12199885626608374, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 2, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/101/edges.pkl" + }, + { + "case_id": "ioi_next_token", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi_next_token", + "task_description": "Indirect Object Identification (IOI) task, trained using next token prediction.", + "max_seq_len": 16, + "min_seq_len": 16, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 6, + "d_model": 64, + "n_ctx": 1024, + "d_head": 16, + "model_name": "gpt2", + "n_heads": 4, + "d_mlp": 3072, + "act_fn": "gelu_new", + "d_vocab": 50257, + "eps": 1e-05, + "use_attn_result": false, + "use_attn_scale": true, + "use_split_qkv_input": false, + "use_hook_mlp_in": false, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": "GPT2LMHeadModel", + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": "gpt2", + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": "LNPre", + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": null, + "initializer_range": 0.02886751345948129, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 50257, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 2457600, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": false, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model_cfg.pkl", + "training_args": { + "next_token": true, + "non_ioi_thresh": 0.65, + "use_per_token_check": false, + "batch_size": 256, + "lr": 0.001, + "num_workers": 0, + "early_stop": true, + "lr_scheduler": null, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA" + ], + "scheduler_mode": "max", + "clip_grad_norm": 1.0, + "atol": 0.05, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "strict_weight": 0.4 + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/edges.pkl" + }, + { + "case_id": "65", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/65", + "task_description": "Calculate the cube root of each element in the input sequence.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1392621247645583, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 11, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/65/edges.pkl" + }, + { + "case_id": "14", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/14", + "task_description": "Returns the count of 'a' in the input sequence.", + "vocab": [ + "a", + "b", + "c" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 8, + "n_ctx": 10, + "d_head": 2, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 32, + "act_fn": "gelu", + "d_vocab": 5, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "bidirectional", + "attn_only": false, + "seed": 0, + "initializer_range": 0.15689290811054724, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 10, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 1536, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/14/edges.pkl" + }, + { + "case_id": "95", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/95", + "task_description": "Counts the distinct prime factors of each number in the input list.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 32, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.12060453783110546, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 3, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/95/edges.pkl" + }, + { + "case_id": "84", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/84", + "task_description": "Apply the arctangent function to each element of the input sequence.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1392621247645583, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 11, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/84/edges.pkl" + }, + { + "case_id": "93", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/93", + "task_description": "Swaps the nth with the n+1th element if n%2==1.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 3, + "d_model": 20, + "n_ctx": 10, + "d_head": 5, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 80, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "bidirectional", + "attn_only": false, + "seed": 0, + "initializer_range": 0.10886621079036347, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 11, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 14400, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/edges.pkl" + }, + { + "case_id": "37", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/37", + "task_description": "Reverses each word in the sequence except for specified exclusions.", + "vocab": [ + "J", + "LB", + "TPSI", + "V", + "b", + "no", + "oCLrZaW", + "poiVg" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 12, + "n_ctx": 10, + "d_head": 3, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 48, + "act_fn": "gelu", + "d_vocab": 10, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1539600717839002, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 8, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 3456, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model_cfg.pkl", + "training_args": { + "atol": 0.05, + "lr": 0.01, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "strict_weight": 1.0, + "epochs": 2000, + "act_fn": "gelu", + "clip_grad_norm": 0.1, + "lr_scheduler": "", + "model_pair": "strict" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/edges.pkl" + }, + { + "case_id": "34", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/34", + "task_description": "Calculate the ratio of vowels to consonants in each word.", + "vocab": [ + "J", + "LB", + "TPSI", + "V", + "b", + "no", + "oCLrZaW", + "poiVg" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 16, + "n_ctx": 10, + "d_head": 4, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 64, + "act_fn": "gelu", + "d_vocab": 10, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.16329931618554522, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 5, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 6144, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model_cfg.pkl", + "training_args": { + "atol": 0.05, + "lr": 0.01, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "strict_weight": 1.0, + "epochs": 2000, + "act_fn": "gelu", + "clip_grad_norm": 0.1, + "lr_scheduler": "", + "model_pair": "strict" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/edges.pkl" + }, + { + "case_id": "26", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/26", + "task_description": "Creates a cascading effect by repeating each token in sequence incrementally.", + "vocab": [ + "a", + "b", + "c" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 21, + "n_ctx": 10, + "d_head": 5, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 84, + "act_fn": "gelu", + "d_vocab": 5, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.12344267996967354, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 27, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 10416, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model_cfg.pkl", + "training_args": { + "atol": 0.05, + "lr": 0.01, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "strict_weight": 0.4, + "epochs": 2000, + "act_fn": "gelu", + "clip_grad_norm": 0.1, + "lr_scheduler": "", + "model_pair": "strict" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/26/edges.pkl" + }, + { + "case_id": "19", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/19", + "task_description": "Removes consecutive duplicate tokens from a sequence.", + "vocab": [ + "a", + "b", + "c" + ], + "max_seq_len": 15, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 32, + "n_ctx": 15, + "d_head": 8, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 128, + "act_fn": "gelu", + "d_vocab": 5, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.15689290811054724, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 3, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 24576, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model_cfg.pkl", + "training_args": { + "atol": 0.05, + "lr": 0.001, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "strict_weight": 0.4, + "epochs": 2000, + "act_fn": "gelu", + "clip_grad_norm": 0.1, + "lr_scheduler": "" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/19/edges.pkl" + }, + { + "case_id": "53", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/53", + "task_description": "Increment elements at odd indices by 1", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.13151918984428584, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 12, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/53/edges.pkl" + }, + { + "case_id": "40", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/40", + "task_description": "Sum the last and previous to last digits of a number", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 31, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.09847319278346618, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 12, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/edges.pkl" + }, + { + "case_id": "51", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/51", + "task_description": "Checks if each element is a Fibonacci number", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30, + 31, + 32, + 33, + 34, + 35, + 36, + 37, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 58, + 59, + 60, + 61, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 91, + 92, + 93, + 94, + 95, + 96, + 97, + 98, + 99 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 102, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.07525766947068778, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 2, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/edges.pkl" + }, + { + "case_id": "122", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/122", + "task_description": "Check if each number is divisible by 3.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 32, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.12199885626608374, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 2, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/122/edges.pkl" + }, + { + "case_id": "85", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/85", + "task_description": "Square each element of the input sequence.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1392621247645583, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 11, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/85/edges.pkl" + }, + { + "case_id": "2", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/2", + "task_description": "Reverse the input sequence.", + "vocab": [ + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 4, + "d_model": 56, + "n_ctx": 10, + "d_head": 14, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 224, + "act_fn": "gelu", + "d_vocab": 28, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "bidirectional", + "attn_only": false, + "seed": 0, + "initializer_range": 0.07593263966019993, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 26, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 150528, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/2/edges.pkl" + }, + { + "case_id": "44", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/44", + "task_description": "Replaces each element with the number of elements greater than it in the sequence", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 24, + "n_ctx": 10, + "d_head": 6, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 96, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "bidirectional", + "attn_only": false, + "seed": 0, + "initializer_range": 0.13719886811400708, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 10, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 13824, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/edges.pkl" + }, + { + "case_id": "113", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/113", + "task_description": "Inverts the sequence if it is sorted in ascending order, otherwise leaves it unchanged.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 7, + "d_model": 88, + "n_ctx": 10, + "d_head": 22, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 352, + "act_fn": "gelu", + "d_vocab": 32, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "bidirectional", + "attn_only": false, + "seed": 0, + "initializer_range": 0.0512147519731584, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 30, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 650496, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/113/edges.pkl" + }, + { + "case_id": "77", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/77", + "task_description": "Apply the tangent function to each element of the sequence.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1392621247645583, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 11, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/77/edges.pkl" + }, + { + "case_id": "20", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/20", + "task_description": "Detect spam messages based on appearance of spam keywords.", + "vocab": [ + "J", + "LB", + "TPSI", + "V", + "b", + "click", + "no", + "now", + "oCLrZaW", + "offer", + "poiVg", + "spam" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 13, + "n_ctx": 10, + "d_head": 3, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 52, + "act_fn": "gelu", + "d_vocab": 14, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.16, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 2, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 3952, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model_cfg.pkl", + "training_args": { + "atol": 0.05, + "lr": 0.01, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "strict_weight": 0.4, + "epochs": 2000, + "act_fn": "gelu", + "clip_grad_norm": 0.1, + "lr_scheduler": "", + "model_pair": "strict" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/20/edges.pkl" + }, + { + "case_id": "25", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/25", + "task_description": "Normalizes token frequencies in a sequence to a range between 0 and 1.", + "vocab": [ + "a", + "b", + "c" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model_cfg.pkl" + }, + { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 16, + "d_model": 62, "n_ctx": 10, - "d_head": 4, + "d_head": 15, "model_name": "custom", "n_heads": 4, - "d_mlp": 64, + "d_mlp": 248, + "act_fn": "gelu", + "d_vocab": 5, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "bidirectional", + "attn_only": false, + "seed": 0, + "initializer_range": 0.08295613557843402, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 56, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 91264, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/25/edges.pkl" + }, + { + "case_id": "11", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/11", + "task_description": "Counts the number of words in a sequence based on their length.", + "vocab": [ + "J", + "LB", + "TPSI", + "V", + "b", + "no", + "oCLrZaW", + "poiVg" + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 12, + "n_ctx": 10, + "d_head": 3, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 48, "act_fn": "gelu", "d_vocab": 10, "eps": 1e-05, @@ -3032,7 +6333,7 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.16329931618554522, + "initializer_range": 0.1460593486680443, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", @@ -3040,7 +6341,7 @@ "d_vocab_out": 5, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 6144, + "n_params": 3456, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3052,68 +6353,195 @@ "trust_remote_code": false, "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model_cfg.pkl", "training_args": { "atol": 0.05, "lr": 0.01, "use_single_loss": false, "iit_weight": 1.0, "behavior_weight": 1.0, - "strict_weight": 1.0, - "epochs": 2000, + "strict_weight": 0.4, + "epochs": 500, + "act_fn": "gelu", + "clip_grad_norm": 1.0, + "lr_scheduler": "" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/11/edges.pkl" + }, + { + "case_id": "54", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/54", + "task_description": "Applies the hyperbolic tangent to each element.", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 + ], + "max_seq_len": 10, + "min_seq_len": 4, + "files": [ + { + "file_name": "edges.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/edges.pkl" + }, + { + "file_name": "ll_model.pth", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/ll_model.pth" + }, + { + "file_name": "ll_model_cfg.pkl", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/ll_model_cfg.pkl" + }, + { + "file_name": "meta.json", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/meta.json" + } + ], + "transformer_cfg": { + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", + "n_heads": 4, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, + "eps": 1e-05, + "use_attn_result": true, + "use_attn_scale": true, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, + "use_attn_in": false, + "use_local_attn": false, + "original_architecture": null, + "from_checkpoint": false, + "checkpoint_index": null, + "checkpoint_label_type": null, + "checkpoint_value": null, + "tokenizer_name": null, + "window_size": null, + "attn_types": null, + "init_mode": "gpt2", + "normalization_type": null, + "n_devices": 1, + "attention_dir": "causal", + "attn_only": false, + "seed": 0, + "initializer_range": 0.1392621247645583, + "init_weights": true, + "scale_attn_by_inverse_layer_idx": false, + "positional_embedding_type": "standard", + "final_rms": false, + "d_vocab_out": 11, + "parallel_attn_mlp": false, + "rotary_dim": null, + "n_params": 384, + "use_hook_tokens": false, + "gated_mlp": false, + "default_prepend_bos": true, + "dtype": "torch.float32", + "tokenizer_prepends_bos": null, + "n_key_value_heads": null, + "post_embedding_ln": false, + "rotary_base": 10000, + "trust_remote_code": false, + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null + }, + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/ll_model_cfg.pkl", + "training_args": { + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, + "lr": 0.001, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/34/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/54/edges.pkl" }, { - "case_id": "35", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/35", - "task_description": "Alternates capitalization of each character in words.", + "case_id": "124", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/124", + "task_description": "Check if all elements in a list are equal.", "vocab": [ - "J", - "LB", - "TPSI", - "V", - "b", - "no", - "oCLrZaW", - "poiVg" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/meta.json" } ], "transformer_cfg": { - "n_layers": 2, - "d_model": 9, + "n_layers": 3, + "d_model": 24, "n_ctx": 10, - "d_head": 2, + "d_head": 6, "model_name": "custom", "n_heads": 4, - "d_mlp": 36, + "d_mlp": 96, "act_fn": "gelu", - "d_vocab": 10, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -3132,18 +6560,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "causal", + "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.1539600717839002, + "initializer_range": 0.11547005383792516, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 8, + "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 1872, + "n_params": 20736, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3153,65 +6581,121 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.01, - "use_single_loss": false, + "lr": 0.001, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 1.0, - "epochs": 2000, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/124/edges.pkl" }, { - "case_id": "36", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/36", - "task_description": "Classifies each token as 'positive', 'negative', or 'neutral' based on emojis.", + "case_id": "102", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/102", + "task_description": "Reflects each element within a range (default is [2, 7]).", "vocab": [ - "\ud83d\udcd8", - "\ud83d\ude0a", - "\ud83d\ude22" + -20, + -19, + -18, + -17, + -16, + -15, + -14, + -13, + -12, + -11, + -10, + -9, + -8, + -7, + -6, + -5, + -4, + -3, + -2, + -1, + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 6, + "d_model": 4, "n_ctx": 10, "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 24, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 5, + "d_vocab": 42, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -3233,15 +6717,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.19402850002906638, + "initializer_range": 0.1059625885652035, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 3, + "d_vocab_out": 6, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 768, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3251,70 +6735,111 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.01, - "use_single_loss": false, + "lr": 0.001, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 1.0, - "epochs": 2000, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 82, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/102/edges.pkl" }, { - "case_id": "37", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/37", - "task_description": "Reverses each word in the sequence except for specified exclusions.", + "case_id": "129", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/129", + "task_description": "Checks if all elements are a multiple of n (set the default at 2).", "vocab": [ - "J", - "LB", - "TPSI", - "V", - "b", - "no", - "oCLrZaW", - "poiVg" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/meta.json" } ], "transformer_cfg": { - "n_layers": 2, - "d_model": 12, + "n_layers": 3, + "d_model": 4, "n_ctx": 10, - "d_head": 3, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 48, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 10, + "d_vocab": 32, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -3333,18 +6858,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "causal", + "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.1539600717839002, + "initializer_range": 0.10504514628777804, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 8, + "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 3456, + "n_params": 576, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3354,88 +6879,92 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.01, - "use_single_loss": false, + "lr": 0.001, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 1.0, - "epochs": 2000, + "behavior_weight": 0.4, + "strict_weight": 0.4, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "", - "model_pair": "strict" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/37/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/129/edges.pkl" }, { - "case_id": "39", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/39", - "task_description": "Returns the fraction of 'x' in the input up to the i-th position for all i.", + "case_id": "50", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/50", + "task_description": "Applies the hyperbolic cosine to each element", "vocab": [ - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z" + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], - "max_seq_len": 60, + "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 120, - "n_ctx": 60, - "d_head": 30, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 480, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 28, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -3457,15 +6986,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.08432740427115679, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 1, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 345600, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3480,7 +7009,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -3488,7 +7017,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -3510,51 +7039,66 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/39/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/50/edges.pkl" }, { - "case_id": "4", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/4", - "task_description": "Return fraction of previous open tokens minus the fraction of close tokens.", + "case_id": "123", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/123", + "task_description": "Apply arccosine to each element of the input sequence.", "vocab": [ - "(", - ")", - "a", - "b", - "c" + -1.0, + -0.9, + -0.8, + -0.7, + -0.6, + -0.5, + -0.4, + -0.30000000000000004, + -0.19999999999999996, + -0.09999999999999998, + 0.0, + 0.10000000000000009, + 0.19999999999999996, + 0.30000000000000004, + 0.3999999999999999, + 0.5, + 0.6000000000000001, + 0.7, + 0.8, + 0.8999999999999999 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 20, + "d_model": 4, "n_ctx": 10, - "d_head": 5, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 80, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 7, + "d_vocab": 22, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -3576,15 +7120,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.17056057308448835, + "initializer_range": 0.11202240672224079, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 1, + "d_vocab_out": 20, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 9600, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3594,29 +7138,49 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/ll_model_cfg.pkl", "training_args": { + "output_dir": "/circuits-benchmark/results", "atol": 0.05, "lr": 0.001, - "use_single_loss": false, + "use_single_loss": true, "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 0.4, - "epochs": 2000, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "" + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, + "scheduler_val_metric": [ + "val/accuracy", + "val/IIA", + "val/strict_accuracy" + ], + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/4/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/123/edges.pkl" }, { - "case_id": "40", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/40", - "task_description": "Sum the last and previous to last digits of a number", + "case_id": "86", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/86", + "task_description": "Check if each element is a power of 2. Return 1 if true, otherwise 0.", "vocab": [ 0, 1, @@ -3628,44 +7192,26 @@ 7, 8, 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28 + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/meta.json" } ], "transformer_cfg": { @@ -3677,7 +7223,7 @@ "n_heads": 4, "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 31, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -3699,12 +7245,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.09847319278346618, + "initializer_range": 0.16329931618554522, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 12, + "d_vocab_out": 2, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -3722,7 +7268,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -3752,14 +7298,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/40/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/edges.pkl" }, { - "case_id": "44", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/44", - "task_description": "Replaces each element with the number of elements greater than it in the sequence", + "case_id": "105", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/105", + "task_description": "Replaces each number with the next prime after that number.", "vocab": [ 0, 1, @@ -3771,38 +7317,57 @@ 7, 8, 9, - 10 + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 24, + "d_model": 4, "n_ctx": 10, - "d_head": 6, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 96, + "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 13, + "d_vocab": 32, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -3821,10 +7386,10 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.13719886811400708, + "initializer_range": 0.11202240672224079, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", @@ -3832,7 +7397,7 @@ "d_vocab_out": 10, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 13824, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3847,7 +7412,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -3855,7 +7420,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -3875,16 +7440,16 @@ "val/strict_accuracy" ], "siit_sampling": "sample_all", - "val_iia_sampling": "all" - }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/44/edges.pkl" + "val_iia_sampling": "all" + }, + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/105/edges.pkl" }, { - "case_id": "45", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/45", - "task_description": "Doubles the first half of the sequence", + "case_id": "58", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/58", + "task_description": "Mirrors the first half of the sequence to the second half.", "vocab": [ 0, 1, @@ -3903,29 +7468,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/meta.json" } ], "transformer_cfg": { "n_layers": 3, - "d_model": 24, + "d_model": 32, "n_ctx": 10, - "d_head": 6, + "d_head": 8, "model_name": "custom", "n_heads": 4, - "d_mlp": 96, + "d_mlp": 128, "act_fn": "gelu", "d_vocab": 13, "eps": 1e-05, @@ -3949,15 +7514,15 @@ "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.11094003924504584, + "initializer_range": 0.10415112878465911, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 16, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 20736, + "n_params": 36864, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -3972,7 +7537,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -4002,14 +7567,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/45/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/edges.pkl" }, { - "case_id": "51", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/51", - "task_description": "Checks if each element is a Fibonacci number", + "case_id": "70", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/70", + "task_description": "Apply the cosine function to each element of the input sequence.", "vocab": [ 0, 1, @@ -4021,115 +7586,26 @@ 7, 8, 9, - 10, - 11, - 12, - 13, - 14, - 15, - 16, - 17, - 18, - 19, - 20, - 21, - 22, - 23, - 24, - 25, - 26, - 27, - 28, - 29, - 30, - 31, - 32, - 33, - 34, - 35, - 36, - 37, - 38, - 39, - 40, - 41, - 42, - 43, - 44, - 45, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 53, - 54, - 55, - 56, - 57, - 58, - 59, - 60, - 61, - 62, - 63, - 64, - 65, - 66, - 67, - 68, - 69, - 70, - 71, - 72, - 73, - 74, - 75, - 76, - 77, - 78, - 79, - 80, - 81, - 82, - 83, - 84, - 85, - 86, - 87, - 88, - 89, - 90, - 91, - 92, - 93, - 94, - 95, - 96, - 97, - 98, - 99 + 10 ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/meta.json" } ], "transformer_cfg": { @@ -4141,7 +7617,7 @@ "n_heads": 4, "d_mlp": 16, "act_fn": "gelu", - "d_vocab": 102, + "d_vocab": 13, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -4163,12 +7639,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.07525766947068778, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -4186,7 +7662,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -4194,7 +7670,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -4216,14 +7692,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/51/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/70/edges.pkl" }, { - "case_id": "56", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/56", - "task_description": "Sets every third element to zero.", + "case_id": "62", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/62", + "task_description": "Replaces each element with its factorial.", "vocab": [ 0, 1, @@ -4242,19 +7718,19 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/meta.json" } ], "transformer_cfg": { @@ -4288,12 +7764,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.13333333333333333, + "initializer_range": 0.1414213562373095, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 11, + "d_vocab_out": 10, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -4311,7 +7787,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -4319,7 +7795,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -4341,14 +7817,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/62/edges.pkl" }, { - "case_id": "58", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/58", - "task_description": "Mirrors the first half of the sequence to the second half.", + "case_id": "111", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/111", + "task_description": "Returns the last element of the sequence and pads the rest with zeros.", "vocab": [ 0, 1, @@ -4367,29 +7843,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/meta.json" } ], "transformer_cfg": { "n_layers": 3, - "d_model": 32, + "d_model": 24, "n_ctx": 10, - "d_head": 8, + "d_head": 6, "model_name": "custom", "n_heads": 4, - "d_mlp": 128, + "d_mlp": 96, "act_fn": "gelu", "d_vocab": 13, "eps": 1e-05, @@ -4413,7 +7889,7 @@ "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.10415112878465911, + "initializer_range": 0.09847319278346618, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", @@ -4421,7 +7897,7 @@ "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 36864, + "n_params": 20736, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -4436,7 +7912,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -4466,57 +7942,54 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/58/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/111/edges.pkl" }, { - "case_id": "63", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/63", - "task_description": "Replaces each element with the number of elements less than it in the sequence.", + "case_id": "35", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/35", + "task_description": "Alternates capitalization of each character in words.", "vocab": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 + "J", + "LB", + "TPSI", + "V", + "b", + "no", + "oCLrZaW", + "poiVg" ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 24, + "d_model": 9, "n_ctx": 10, - "d_head": 6, + "d_head": 2, "model_name": "custom", "n_heads": 4, - "d_mlp": 96, + "d_mlp": 36, "act_fn": "gelu", - "d_vocab": 13, + "d_vocab": 10, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -4535,18 +8008,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.13719886811400708, + "initializer_range": 0.1539600717839002, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 10, + "d_vocab_out": 8, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 13824, + "n_params": 1872, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -4556,92 +8029,66 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false, - "load_in_4bit": false, - "num_experts": null, - "experts_per_token": null + "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model_cfg.pkl", "training_args": { - "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.001, - "use_single_loss": true, - "iit_weight": 1.0, - "behavior_weight": 0.4, - "strict_weight": 0.4, - "epochs": 1000, - "early_stop_accuracy_threshold": 99.9, + "lr": 0.01, + "use_single_loss": false, + "iit_weight": 1.0, + "behavior_weight": 1.0, + "strict_weight": 1.0, + "epochs": 2000, "act_fn": "gelu", - "use_wandb": true, - "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "linear", - "model_pair": "strict", - "same_size": false, - "seed": 67, - "batch_size": 256, - "include_mlp": false, - "detach_while_caching": true, - "scheduler_val_metric": [ - "val/accuracy", - "val/IIA", - "val/strict_accuracy" - ], - "siit_sampling": "sample_all", - "val_iia_sampling": "all" + "lr_scheduler": "", + "model_pair": "strict" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/63/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/35/edges.pkl" }, { - "case_id": "69", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/69", - "task_description": "Assign -1, 0, or 1 to each element of the input sequence based on its sign.", + "case_id": "3", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/3", + "task_description": "Returns the fraction of 'x' in the input up to the i-th position for all i.", "vocab": [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10 + "a", + "b", + "c", + "x" ], - "max_seq_len": 10, + "max_seq_len": 5, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 4, - "n_ctx": 10, - "d_head": 1, + "d_model": 12, + "n_ctx": 5, + "d_head": 3, "model_name": "custom", "n_heads": 4, - "d_mlp": 16, + "d_mlp": 48, "act_fn": "gelu", - "d_vocab": 13, + "d_vocab": 6, "eps": 1e-05, "use_attn_result": true, "use_attn_scale": true, @@ -4663,15 +8110,15 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.16329931618554522, + "initializer_range": 0.22188007849009167, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 1, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 384, + "n_params": 3456, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -4681,82 +8128,64 @@ "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false, - "load_in_4bit": false, - "num_experts": null, - "experts_per_token": null + "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model_cfg.pkl", "training_args": { - "output_dir": "/circuits-benchmark/results", "atol": 0.05, - "lr": 0.001, - "use_single_loss": true, + "lr": 0.01, + "use_single_loss": false, "iit_weight": 1.0, - "behavior_weight": 0.4, - "strict_weight": 0.4, - "epochs": 1000, - "early_stop_accuracy_threshold": 99.9, + "behavior_weight": 1.0, + "strict_weight": 10.0, + "epochs": 2000, "act_fn": "gelu", - "use_wandb": true, - "save_model_to_wandb": true, "clip_grad_norm": 0.1, - "lr_scheduler": "linear", + "lr_scheduler": "", "model_pair": "strict", - "same_size": false, - "seed": 67, - "batch_size": 256, - "include_mlp": false, - "detach_while_caching": true, - "scheduler_val_metric": [ - "val/accuracy", - "val/IIA", - "val/strict_accuracy" - ], - "siit_sampling": "sample_all", - "val_iia_sampling": "all" + "same_size": false }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/69/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/3/edges.pkl" }, { - "case_id": "7", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/7", - "task_description": "Returns the number of times each token occurs in the input.", + "case_id": "36", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/36", + "task_description": "Classifies each token as 'positive', 'negative', or 'neutral' based on emojis.", "vocab": [ - "a", - "b", - "c" + "\ud83d\udcd8", + "\ud83d\ude0a", + "\ud83d\ude22" ], "max_seq_len": 10, "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/meta.json" } ], "transformer_cfg": { "n_layers": 2, - "d_model": 17, + "d_model": 6, "n_ctx": 10, - "d_head": 4, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 68, + "d_mlp": 24, "act_fn": "gelu", "d_vocab": 5, "eps": 1e-05, @@ -4777,18 +8206,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.15689290811054724, + "initializer_range": 0.19402850002906638, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 10, + "d_vocab_out": 3, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 6800, + "n_params": 768, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -4800,34 +8229,28 @@ "trust_remote_code": false, "rotary_adjacent_pairs": false }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model_cfg.pkl", "training_args": { "atol": 0.05, "lr": 0.01, "use_single_loss": false, "iit_weight": 1.0, "behavior_weight": 1.0, - "strict_weight": 0.5, + "strict_weight": 1.0, "epochs": 2000, "act_fn": "gelu", "clip_grad_norm": 0.1, "lr_scheduler": "", - "model_pair": "strict", - "same_size": false, - "seed": 1234, - "batch_size": 256, - "include_mlp": false, - "next_token": false, - "detach_while_caching": true + "model_pair": "strict" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/7/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/36/edges.pkl" }, { - "case_id": "79", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/79", - "task_description": "Check if each number in a sequence is prime", + "case_id": "64", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/64", + "task_description": "Cubes each element in the sequence.", "vocab": [ 0, 1, @@ -4846,19 +8269,19 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/meta.json" } ], "transformer_cfg": { @@ -4892,12 +8315,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.16329931618554522, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -4915,7 +8338,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -4923,7 +8346,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -4945,14 +8368,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/79/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/64/edges.pkl" }, { - "case_id": "82", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/82", - "task_description": "Halve the elements in the second half of the sequence.", + "case_id": "8", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/8", + "task_description": "Identity", "vocab": [ 0, 1, @@ -4971,29 +8394,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/meta.json" } ], "transformer_cfg": { - "n_layers": 4, - "d_model": 24, + "n_layers": 2, + "d_model": 4, "n_ctx": 10, - "d_head": 6, + "d_head": 1, "model_name": "custom", "n_heads": 4, - "d_mlp": 96, + "d_mlp": 16, "act_fn": "gelu", "d_vocab": 13, "eps": 1e-05, @@ -5014,18 +8437,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "bidirectional", + "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.1059625885652035, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 16, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 27648, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -5040,7 +8463,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -5048,7 +8471,7 @@ "use_single_loss": true, "iit_weight": 1.0, "behavior_weight": 0.4, - "strict_weight": 0.4, + "strict_weight": 0.7, "epochs": 1000, "early_stop_accuracy_threshold": 99.9, "act_fn": "gelu", @@ -5070,14 +8493,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/82/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/8/edges.pkl" }, { - "case_id": "86", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/86", - "task_description": "Check if each element is a power of 2. Return 1 if true, otherwise 0.", + "case_id": "103", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/103", + "task_description": "Swap consecutive numbers in a list", "vocab": [ 0, 1, @@ -5096,29 +8519,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/meta.json" } ], "transformer_cfg": { - "n_layers": 2, - "d_model": 4, + "n_layers": 3, + "d_model": 24, "n_ctx": 10, - "d_head": 1, + "d_head": 6, "model_name": "custom", "n_heads": 4, - "d_mlp": 16, + "d_mlp": 96, "act_fn": "gelu", "d_vocab": 13, "eps": 1e-05, @@ -5139,18 +8562,18 @@ "init_mode": "gpt2", "normalization_type": null, "n_devices": 1, - "attention_dir": "causal", + "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.16329931618554522, + "initializer_range": 0.10787197799411874, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 384, + "n_params": 20736, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -5165,7 +8588,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -5195,14 +8618,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/86/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/103/edges.pkl" }, { - "case_id": "87", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/87", - "task_description": "Binarize a sequence of integers using a threshold.", + "case_id": "56", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/56", + "task_description": "Sets every third element to zero.", "vocab": [ 0, 1, @@ -5221,19 +8644,19 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/meta.json" } ], "transformer_cfg": { @@ -5267,12 +8690,12 @@ "attention_dir": "causal", "attn_only": false, "seed": 0, - "initializer_range": 0.16329931618554522, + "initializer_range": 0.13333333333333333, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 2, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, "n_params": 384, @@ -5290,7 +8713,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -5320,14 +8743,14 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/87/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/56/edges.pkl" }, { - "case_id": "93", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/93", - "task_description": "Swaps the nth with the n+1th element if n%2==1.", + "case_id": "97", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/97", + "task_description": "Scale a sequence by its maximum element.", "vocab": [ 0, 1, @@ -5346,29 +8769,29 @@ "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/meta.json" } ], "transformer_cfg": { "n_layers": 3, - "d_model": 20, + "d_model": 200, "n_ctx": 10, - "d_head": 5, + "d_head": 50, "model_name": "custom", "n_heads": 4, - "d_mlp": 80, + "d_mlp": 800, "act_fn": "gelu", "d_vocab": 13, "eps": 1e-05, @@ -5392,15 +8815,15 @@ "attention_dir": "bidirectional", "attn_only": false, "seed": 0, - "initializer_range": 0.10886621079036347, + "initializer_range": 0.05405899027195888, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 11, + "d_vocab_out": 64, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 14400, + "n_params": 1440000, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, @@ -5415,7 +8838,7 @@ "num_experts": null, "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/ll_model_cfg.pkl", "training_args": { "output_dir": "/circuits-benchmark/results", "atol": 0.05, @@ -5433,7 +8856,7 @@ "lr_scheduler": "linear", "model_pair": "strict", "same_size": false, - "seed": 67, + "seed": 82, "batch_size": 256, "include_mlp": false, "detach_while_caching": true, @@ -5445,223 +8868,134 @@ "siit_sampling": "sample_all", "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/93/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/97/edges.pkl" }, { - "case_id": "ioi", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi", - "task_description": "Indirect Object Identification (IOI) task.", - "max_seq_len": 16, - "min_seq_len": 16, - "files": [ - { - "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/edges.pkl" - }, - { - "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model.pth" - }, - { - "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model_cfg.pkl" - }, - { - "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/meta.json" - } + "case_id": "46", + "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/46", + "task_description": "Decrements each element in the sequence by 1", + "vocab": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10 ], - "transformer_cfg": { - "n_layers": 6, - "d_model": 64, - "n_ctx": 1024, - "d_head": 16, - "model_name": "gpt2", - "n_heads": 4, - "d_mlp": 3072, - "act_fn": "gelu_new", - "d_vocab": 50257, - "eps": 1e-05, - "use_attn_result": false, - "use_attn_scale": true, - "use_split_qkv_input": false, - "use_hook_mlp_in": false, - "use_attn_in": false, - "use_local_attn": false, - "original_architecture": "GPT2LMHeadModel", - "from_checkpoint": false, - "checkpoint_index": null, - "checkpoint_label_type": null, - "checkpoint_value": null, - "tokenizer_name": "gpt2", - "window_size": null, - "attn_types": null, - "init_mode": "gpt2", - "normalization_type": "LNPre", - "n_devices": 1, - "attention_dir": "causal", - "attn_only": false, - "seed": null, - "initializer_range": 0.02886751345948129, - "init_weights": false, - "scale_attn_by_inverse_layer_idx": false, - "positional_embedding_type": "standard", - "final_rms": false, - "d_vocab_out": 50257, - "parallel_attn_mlp": false, - "rotary_dim": null, - "n_params": 84934656, - "use_hook_tokens": false, - "gated_mlp": false, - "default_prepend_bos": true, - "dtype": "torch.float32", - "tokenizer_prepends_bos": false, - "n_key_value_heads": null, - "post_embedding_ln": false, - "rotary_base": 10000, - "trust_remote_code": false, - "rotary_adjacent_pairs": false - }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model_cfg.pkl", - "training_args": { - "next_token": false, - "non_ioi_thresh": 0.65, - "use_per_token_check": false, - "batch_size": 512, - "num_workers": 0, - "early_stop": true, - "lr_scheduler": null, - "scheduler_val_metric": [ - "val/accuracy", - "val/IIA" - ], - "scheduler_mode": "max", - "scheduler_kwargs": {}, - "clip_grad_norm": 1.0, - "seed": 0, - "lr": 0.001, - "detach_while_caching": true, - "optimizer_kwargs": { - "betas": [ - 0.9, - 0.9 - ] - }, - "atol": 0.05, - "use_single_loss": false, - "iit_weight": 1.0, - "behavior_weight": 1.0, - "val_IIA_sampling": "random", - "use_all_tokens_for_behavior": false, - "strict_weight": 0.4, - "siit_sampling": "individual" - }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi/edges.pkl" - }, - { - "case_id": "ioi_next_token", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/tree/main/ioi_next_token", - "task_description": "Indirect Object Identification (IOI) task, trained using next token prediction.", - "max_seq_len": 16, - "min_seq_len": 16, + "max_seq_len": 10, + "min_seq_len": 4, "files": [ { "file_name": "edges.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/edges.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/edges.pkl" }, { "file_name": "ll_model.pth", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model.pth" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/ll_model.pth" }, { "file_name": "ll_model_cfg.pkl", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model_cfg.pkl" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/ll_model_cfg.pkl" }, { "file_name": "meta.json", - "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/meta.json" + "url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/meta.json" } ], "transformer_cfg": { - "n_layers": 6, - "d_model": 64, - "n_ctx": 1024, - "d_head": 16, - "model_name": "gpt2", + "n_layers": 2, + "d_model": 4, + "n_ctx": 10, + "d_head": 1, + "model_name": "custom", "n_heads": 4, - "d_mlp": 3072, - "act_fn": "gelu_new", - "d_vocab": 50257, + "d_mlp": 16, + "act_fn": "gelu", + "d_vocab": 13, "eps": 1e-05, - "use_attn_result": false, + "use_attn_result": true, "use_attn_scale": true, - "use_split_qkv_input": false, - "use_hook_mlp_in": false, + "use_split_qkv_input": true, + "use_hook_mlp_in": true, "use_attn_in": false, "use_local_attn": false, - "original_architecture": "GPT2LMHeadModel", + "original_architecture": null, "from_checkpoint": false, "checkpoint_index": null, "checkpoint_label_type": null, "checkpoint_value": null, - "tokenizer_name": "gpt2", + "tokenizer_name": null, "window_size": null, "attn_types": null, "init_mode": "gpt2", - "normalization_type": "LNPre", + "normalization_type": null, "n_devices": 1, "attention_dir": "causal", "attn_only": false, - "seed": null, - "initializer_range": 0.02886751345948129, + "seed": 0, + "initializer_range": 0.1392621247645583, "init_weights": true, "scale_attn_by_inverse_layer_idx": false, "positional_embedding_type": "standard", "final_rms": false, - "d_vocab_out": 50257, + "d_vocab_out": 11, "parallel_attn_mlp": false, "rotary_dim": null, - "n_params": 2457600, + "n_params": 384, "use_hook_tokens": false, "gated_mlp": false, "default_prepend_bos": true, "dtype": "torch.float32", - "tokenizer_prepends_bos": false, + "tokenizer_prepends_bos": null, "n_key_value_heads": null, "post_embedding_ln": false, "rotary_base": 10000, "trust_remote_code": false, - "rotary_adjacent_pairs": false + "rotary_adjacent_pairs": false, + "load_in_4bit": false, + "num_experts": null, + "experts_per_token": null }, - "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model_cfg.pkl", + "transformer_cfg_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/ll_model_cfg.pkl", "training_args": { - "next_token": true, - "non_ioi_thresh": 0.65, - "use_per_token_check": false, - "batch_size": 256, + "output_dir": "/circuits-benchmark/results", + "atol": 0.05, "lr": 0.001, - "num_workers": 0, - "early_stop": true, - "lr_scheduler": null, + "use_single_loss": true, + "iit_weight": 1.0, + "behavior_weight": 0.4, + "strict_weight": 0.7, + "epochs": 1000, + "early_stop_accuracy_threshold": 99.9, + "act_fn": "gelu", + "use_wandb": true, + "save_model_to_wandb": true, + "clip_grad_norm": 0.1, + "lr_scheduler": "linear", + "model_pair": "strict", + "same_size": false, + "seed": 67, + "batch_size": 256, + "include_mlp": false, + "detach_while_caching": true, "scheduler_val_metric": [ "val/accuracy", - "val/IIA" + "val/IIA", + "val/strict_accuracy" ], - "scheduler_mode": "max", - "clip_grad_norm": 1.0, - "atol": 0.05, - "use_single_loss": false, - "iit_weight": 1.0, - "behavior_weight": 1.0, - "strict_weight": 0.4 + "siit_sampling": "sample_all", + "val_iia_sampling": "all" }, - "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/meta.json", - "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/ll_model.pth", - "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/ioi_next_token/edges.pkl" + "training_args_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/meta.json", + "weights_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/ll_model.pth", + "circuit_file_url": "https://huggingface.co/cybershiptrooper/InterpBench/blob/main/46/edges.pkl" } ] } \ No newline at end of file