taufeeque committed on
Commit
d2a7b67
1 Parent(s): 9ef6e9e
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
37
+ optimizer.pt filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,128 @@
1
  ---
2
- license: mit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - generated_from_trainer
4
+ datasets:
5
+ - toy_graph
6
+ metrics:
7
+ - accuracy
8
+ model-index:
9
+ - name: output_toy
10
+ results:
11
+ - task:
12
+ name: Causal Language Modeling
13
+ type: text-generation
14
+ dataset:
15
+ name: toy_graph
16
+ type: toy_graph
17
+ metrics:
18
+ - name: Accuracy
19
+ type: accuracy
20
+ value: 0.4525254617525837
21
  ---
22
+
23
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
24
+ should probably proofread and complete it, then remove this comment. -->
25
+
26
+ # output_toy
27
+
28
+ This model is a fine-tuned version of [toy/model](https://huggingface.co/toy/model) on the toy_graph dataset.
29
+ It achieves the following results on the evaluation set:
30
+ - Loss: 1.2691
31
+ - Accuracy: 0.4525
32
+ - Transition Accuracy: 0.5634
33
+ - First Transition Accuracy: 0.88
34
+ - Multicode K: 1
35
+ - Dead Code Fraction/layer0: 0.9969
36
+ - Mse/layer0: 220380.4595
37
+ - Input Norm/layer0: 333.7717
38
+ - Output Norm/layer0: 12.9360
39
+ - Dead Code Fraction/layer1: 0.9535
40
+ - Mse/layer1: 132.7843
41
+ - Input Norm/layer1: 6.5450
42
+ - Output Norm/layer1: 13.1449
43
+ - Dead Code Fraction/layer2: 0.9349
44
+ - Mse/layer2: 365.9396
45
+ - Input Norm/layer2: 6.1370
46
+ - Output Norm/layer2: 18.3248
47
+ - Dead Code Fraction/layer3: 0.9819
48
+ - Mse/layer3: 415.9804
49
+ - Input Norm/layer3: 7.4097
50
+ - Output Norm/layer3: 18.4665
51
+
52
+ ## Model description
53
+
54
+ More information needed
55
+
56
+ ## Intended uses & limitations
57
+
58
+ More information needed
59
+
60
+ ## Training and evaluation data
61
+
62
+ More information needed
63
+
64
+ ## Training procedure
65
+
66
+ ### Training hyperparameters
67
+
68
+ The following hyperparameters were used during training:
69
+ - learning_rate: 0.001
70
+ - train_batch_size: 1024
71
+ - eval_batch_size: 512
72
+ - seed: 42
73
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
74
+ - lr_scheduler_type: constant
75
+ - training_steps: 20000
76
+
77
+ ### Training results
78
+
79
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | Transition Accuracy | First Transition Accuracy | Multicode K | Dead Code Fraction/layer0 | Mse/layer0 | Input Norm/layer0 | Output Norm/layer0 | Dead Code Fraction/layer1 | Mse/layer1 | Input Norm/layer1 | Output Norm/layer1 | Dead Code Fraction/layer2 | Mse/layer2 | Input Norm/layer2 | Output Norm/layer2 | Dead Code Fraction/layer3 | Mse/layer3 | Input Norm/layer3 | Output Norm/layer3 |
80
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|:-------------------:|:-------------------------:|:-----------:|:-------------------------:|:----------:|:-----------------:|:------------------:|:-------------------------:|:----------:|:-----------------:|:------------------:|:-------------------------:|:----------:|:-----------------:|:------------------:|:-------------------------:|:----------:|:-----------------:|:------------------:|
81
+ | 2.2465 | 0.03 | 500 | 1.8386 | 0.3565 | 0.3555 | 0.31 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
82
+ | 1.5981 | 0.05 | 1000 | 1.4652 | 0.4204 | 0.5015 | 0.58 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
83
+ | 1.3928 | 0.07 | 1500 | 1.3541 | 0.4378 | 0.555 | 0.79 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
84
+ | 1.3405 | 0.1 | 2000 | 1.3264 | 0.4427 | 0.5756 | 0.82 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
85
+ | 1.3189 | 0.12 | 2500 | 1.3187 | 0.4446 | 0.5576 | 0.86 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
86
+ | 1.308 | 0.15 | 3000 | 1.3064 | 0.4468 | 0.5573 | 0.82 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
87
+ | 1.3009 | 0.17 | 3500 | 1.2963 | 0.4493 | 0.5763 | 0.87 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
88
+ | 1.2965 | 0.2 | 4000 | 1.2922 | 0.4494 | 0.5677 | 0.9 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
89
+ | 1.2919 | 0.23 | 4500 | 1.2880 | 0.4499 | 0.5821 | 0.91 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
90
+ | 1.2889 | 0.25 | 5000 | 1.2856 | 0.4501 | 0.56 | 0.9 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
91
+ | 1.2855 | 0.28 | 5500 | 1.2816 | 0.4503 | 0.6016 | 0.9 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
92
+ | 1.2828 | 0.3 | 6000 | 1.2844 | 0.4502 | 0.5734 | 0.87 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
93
+ | 1.2805 | 0.33 | 6500 | 1.2777 | 0.4516 | 0.6084 | 0.95 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
94
+ | 1.2793 | 0.35 | 7000 | 1.2796 | 0.4511 | 0.5681 | 0.93 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
95
+ | 1.2785 | 0.38 | 7500 | 1.2748 | 0.4519 | 0.5919 | 0.95 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
96
+ | 1.2764 | 0.4 | 8000 | 1.2767 | 0.4518 | 0.5760 | 0.9 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
97
+ | 1.2763 | 0.42 | 8500 | 1.2801 | 0.4507 | 0.5827 | 0.94 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
98
+ | 1.2755 | 0.45 | 9000 | 1.2755 | 0.4516 | 0.5765 | 0.9 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
99
+ | 1.2746 | 0.47 | 9500 | 1.2736 | 0.4523 | 0.5865 | 0.9 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
100
+ | 1.2734 | 0.5 | 10000 | 1.2740 | 0.4519 | 0.5779 | 0.91 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
101
+ | 1.2732 | 0.53 | 10500 | 1.2744 | 0.4516 | 0.5879 | 0.89 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
102
+ | 1.2723 | 0.55 | 11000 | 1.2690 | 0.4525 | 0.5811 | 0.89 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
103
+ | 1.2712 | 0.57 | 11500 | 1.2705 | 0.4526 | 0.5779 | 0.93 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
104
+ | 1.2716 | 0.6 | 12000 | 1.2701 | 0.4527 | 0.5760 | 0.89 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
105
+ | 1.2708 | 0.62 | 12500 | 1.2716 | 0.4522 | 0.5485 | 0.95 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
106
+ | 1.2705 | 0.65 | 13000 | 1.2676 | 0.4529 | 0.5734 | 0.93 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
107
+ | 1.2696 | 0.68 | 13500 | 1.2717 | 0.4519 | 0.5994 | 0.91 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
108
+ | 1.2687 | 0.7 | 14000 | 1.2687 | 0.4524 | 0.5756 | 0.9 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
109
+ | 1.2685 | 0.72 | 14500 | 1.2709 | 0.4521 | 0.6127 | 0.89 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
110
+ | 1.2685 | 0.75 | 15000 | 1.2706 | 0.4519 | 0.5873 | 0.91 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
111
+ | 1.2675 | 0.78 | 15500 | 1.2691 | 0.4527 | 0.6365 | 0.96 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
112
+ | 1.2677 | 0.8 | 16000 | 1.2686 | 0.4526 | 0.5589 | 0.93 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
113
+ | 1.2676 | 0.82 | 16500 | 1.2639 | 0.4529 | 0.5940 | 0.89 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
114
+ | 1.2662 | 0.85 | 17000 | 1.2655 | 0.4530 | 0.5955 | 0.94 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
115
+ | 1.2666 | 0.88 | 17500 | 1.2636 | 0.4526 | 0.6013 | 0.96 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
116
+ | 1.2664 | 0.9 | 18000 | 1.2681 | 0.4526 | 0.6034 | 0.96 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
117
+ | 1.266 | 0.93 | 18500 | 1.2624 | 0.4527 | 0.5839 | 0.88 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
118
+ | 1.2653 | 0.95 | 19000 | 1.2688 | 0.4519 | 0.5837 | 0.92 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
119
+ | 1.2654 | 0.97 | 19500 | 1.2619 | 0.4534 | 0.5973 | 0.92 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
120
+ | 1.2649 | 1.0 | 20000 | 1.2647 | 0.4525 | 0.59 | 0.93 | 1 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
121
+
122
+
123
+ ### Framework versions
124
+
125
+ - Transformers 4.28.1
126
+ - PyTorch 2.0.1+cu117
127
+ - Datasets 2.12.0
128
+ - Tokenizers 0.13.3
all_results.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "MSE": 0.0,
3
+ "MSE/layer0": 0.0,
4
+ "MSE/layer1": 0.0,
5
+ "MSE/layer2": 0.0,
6
+ "MSE/layer3": 0.0,
7
+ "dead_code_fraction": 1.0,
8
+ "dead_code_fraction/layer0": 1.0,
9
+ "dead_code_fraction/layer1": 1.0,
10
+ "dead_code_fraction/layer2": 1.0,
11
+ "dead_code_fraction/layer3": 1.0,
12
+ "epoch": 1.0,
13
+ "eval_MSE/layer0": 220380.4595384912,
14
+ "eval_MSE/layer1": 132.78432877894963,
15
+ "eval_MSE/layer2": 365.9396393365076,
16
+ "eval_MSE/layer3": 415.98040078389045,
17
+ "eval_accuracy": 0.4525254617525837,
18
+ "eval_dead_code_fraction/layer0": 0.99694,
19
+ "eval_dead_code_fraction/layer1": 0.9535,
20
+ "eval_dead_code_fraction/layer2": 0.93486,
21
+ "eval_dead_code_fraction/layer3": 0.98186,
22
+ "eval_first_transition_accuracy": 0.88,
23
+ "eval_input_norm/layer0": 333.77172351868165,
24
+ "eval_input_norm/layer1": 6.54500140022604,
25
+ "eval_input_norm/layer2": 6.137018968109251,
26
+ "eval_input_norm/layer3": 7.40972196774554,
27
+ "eval_loss": 1.2691402435302734,
28
+ "eval_multicode_k": 1,
29
+ "eval_output_norm/layer0": 12.936006403074337,
30
+ "eval_output_norm/layer1": 13.144865618203756,
31
+ "eval_output_norm/layer2": 18.324818944643734,
32
+ "eval_output_norm/layer3": 18.466466705456643,
33
+ "eval_runtime": 40.0743,
34
+ "eval_samples_per_second": 817.682,
35
+ "eval_steps_per_second": 1.597,
36
+ "eval_transition_accuracy": 0.5633870967741935,
37
+ "input_norm": 0.0,
38
+ "input_norm/layer0": 0.0,
39
+ "input_norm/layer1": 0.0,
40
+ "input_norm/layer2": 0.0,
41
+ "input_norm/layer3": 0.0,
42
+ "loss": 1.313426919734478,
43
+ "max_norm": 46.33829879760742,
44
+ "max_norm/layer0": 17.856664657592773,
45
+ "max_norm/layer1": 20.084186553955078,
46
+ "max_norm/layer2": 33.940242767333984,
47
+ "max_norm/layer3": 46.33829879760742,
48
+ "mean_norm": 8.291451185941696,
49
+ "mean_norm/layer0": 8.462452054023743,
50
+ "mean_norm/layer1": 8.18280303478241,
51
+ "mean_norm/layer2": 8.143204748630524,
52
+ "mean_norm/layer3": 8.377344906330109,
53
+ "multicode_k": 1,
54
+ "output_norm": 0.0,
55
+ "output_norm/layer0": 0.0,
56
+ "output_norm/layer1": 0.0,
57
+ "output_norm/layer2": 0.0,
58
+ "output_norm/layer3": 0.0,
59
+ "perplexity": 3.5577924120078235,
60
+ "runtime": 19944.2896,
61
+ "samples_per_second": 1026.86,
62
+ "steps_per_second": 1.003
63
+ }
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GPTNeoXCodebookModel"
4
+ ],
5
+ "codebook_at": [
6
+ "preproj_attention",
7
+ "mlp"
8
+ ],
9
+ "codebook_kwargs": {},
10
+ "codebook_type": [
11
+ "group",
12
+ "vanilla"
13
+ ],
14
+ "k_codebook": [
15
+ 1,
16
+ 1
17
+ ],
18
+ "kmeans_init": false,
19
+ "kmeans_init_examples": 1000,
20
+ "kmeans_kwargs": null,
21
+ "kmeans_path": null,
22
+ "layers_to_snap": [
23
+ 0,
24
+ 1,
25
+ 2,
26
+ 3
27
+ ],
28
+ "loss": "aeloss",
29
+ "model_type": "codebook",
30
+ "num_codebooks": [
31
+ 4,
32
+ 1
33
+ ],
34
+ "num_codes": 10000,
35
+ "replace_codes": false,
36
+ "similarity_metric": "inner_product",
37
+ "torch_dtype": "float32",
38
+ "transformers_version": "4.28.1"
39
+ }
eval_results.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "eval_MSE/layer0": 220380.4595384912,
4
+ "eval_MSE/layer1": 132.78432877894963,
5
+ "eval_MSE/layer2": 365.9396393365076,
6
+ "eval_MSE/layer3": 415.98040078389045,
7
+ "eval_accuracy": 0.4525254617525837,
8
+ "eval_dead_code_fraction/layer0": 0.99694,
9
+ "eval_dead_code_fraction/layer1": 0.9535,
10
+ "eval_dead_code_fraction/layer2": 0.93486,
11
+ "eval_dead_code_fraction/layer3": 0.98186,
12
+ "eval_first_transition_accuracy": 0.88,
13
+ "eval_input_norm/layer0": 333.77172351868165,
14
+ "eval_input_norm/layer1": 6.54500140022604,
15
+ "eval_input_norm/layer2": 6.137018968109251,
16
+ "eval_input_norm/layer3": 7.40972196774554,
17
+ "eval_loss": 1.2691402435302734,
18
+ "eval_multicode_k": 1,
19
+ "eval_output_norm/layer0": 12.936006403074337,
20
+ "eval_output_norm/layer1": 13.144865618203756,
21
+ "eval_output_norm/layer2": 18.324818944643734,
22
+ "eval_output_norm/layer3": 18.466466705456643,
23
+ "eval_runtime": 40.0743,
24
+ "eval_samples_per_second": 817.682,
25
+ "eval_steps_per_second": 1.597,
26
+ "eval_transition_accuracy": 0.5633870967741935,
27
+ "perplexity": 3.5577924120078235
28
+ }
fsm.npy ADDED
Binary file (80.1 kB). View file
 
merges.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ #version: 0.2
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7adc1d0512a86c5335e044308688de193ede8d653be289942c36f3dc75c829f
3
+ size 86257029
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db3d252c367d55a926e6d46c93c5fab8d2fbaf44d3ef3dd70bcc71675c3c600c
3
+ size 44261693
rng_state.pth ADDED
Binary file (14.6 kB). View file
 
scheduler.pt ADDED
Binary file (627 Bytes). View file
 
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [],
6
+ "normalizer": null,
7
+ "pre_tokenizer": {
8
+ "type": "ByteLevel",
9
+ "add_prefix_space": false,
10
+ "trim_offsets": true,
11
+ "use_regex": true
12
+ },
13
+ "post_processor": {
14
+ "type": "ByteLevel",
15
+ "add_prefix_space": true,
16
+ "trim_offsets": false,
17
+ "use_regex": true
18
+ },
19
+ "decoder": {
20
+ "type": "ByteLevel",
21
+ "add_prefix_space": true,
22
+ "trim_offsets": true,
23
+ "use_regex": true
24
+ },
25
+ "model": {
26
+ "type": "BPE",
27
+ "dropout": null,
28
+ "unk_token": null,
29
+ "continuing_subword_prefix": "",
30
+ "end_of_word_suffix": "",
31
+ "fuse_unk": false,
32
+ "byte_fallback": false,
33
+ "vocab": {
34
+ "0": 0,
35
+ "1": 1,
36
+ "2": 2,
37
+ "3": 3,
38
+ "4": 4,
39
+ "5": 5,
40
+ "6": 6,
41
+ "7": 7,
42
+ "8": 8,
43
+ "9": 9,
44
+ "<|endoftext|>": 10
45
+ },
46
+ "merges": []
47
+ }
48
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": true,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "<|endoftext|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "errors": "replace",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "pad_token": {
24
+ "__type": "AddedToken",
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "tokenizer_class": "GPT2Tokenizer",
32
+ "unk_token": {
33
+ "__type": "AddedToken",
34
+ "content": "<|endoftext|>",
35
+ "lstrip": false,
36
+ "normalized": true,
37
+ "rstrip": false,
38
+ "single_word": false
39
+ }
40
+ }
train_results.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "MSE": 0.0,
3
+ "MSE/layer0": 0.0,
4
+ "MSE/layer1": 0.0,
5
+ "MSE/layer2": 0.0,
6
+ "MSE/layer3": 0.0,
7
+ "dead_code_fraction": 1.0,
8
+ "dead_code_fraction/layer0": 1.0,
9
+ "dead_code_fraction/layer1": 1.0,
10
+ "dead_code_fraction/layer2": 1.0,
11
+ "dead_code_fraction/layer3": 1.0,
12
+ "epoch": 1.0,
13
+ "input_norm": 0.0,
14
+ "input_norm/layer0": 0.0,
15
+ "input_norm/layer1": 0.0,
16
+ "input_norm/layer2": 0.0,
17
+ "input_norm/layer3": 0.0,
18
+ "loss": 1.313426919734478,
19
+ "max_norm": 46.33829879760742,
20
+ "max_norm/layer0": 17.856664657592773,
21
+ "max_norm/layer1": 20.084186553955078,
22
+ "max_norm/layer2": 33.940242767333984,
23
+ "max_norm/layer3": 46.33829879760742,
24
+ "mean_norm": 8.291451185941696,
25
+ "mean_norm/layer0": 8.462452054023743,
26
+ "mean_norm/layer1": 8.18280303478241,
27
+ "mean_norm/layer2": 8.143204748630524,
28
+ "mean_norm/layer3": 8.377344906330109,
29
+ "multicode_k": 1,
30
+ "output_norm": 0.0,
31
+ "output_norm/layer0": 0.0,
32
+ "output_norm/layer1": 0.0,
33
+ "output_norm/layer2": 0.0,
34
+ "output_norm/layer3": 0.0,
35
+ "runtime": 19944.2896,
36
+ "samples_per_second": 1026.86,
37
+ "steps_per_second": 1.003
38
+ }
trainer_state.json ADDED
@@ -0,0 +1,2068 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6364516129032258,
3
+ "best_model_checkpoint": "output_toy/checkpoint-15500",
4
+ "epoch": 0.775,
5
+ "global_step": 15500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "MSE": 0.0,
12
+ "MSE/layer0": 0.0,
13
+ "MSE/layer1": 0.0,
14
+ "MSE/layer2": 0.0,
15
+ "MSE/layer3": 0.0,
16
+ "dead_code_fraction": 1.0,
17
+ "dead_code_fraction/layer0": 1.0,
18
+ "dead_code_fraction/layer1": 1.0,
19
+ "dead_code_fraction/layer2": 1.0,
20
+ "dead_code_fraction/layer3": 1.0,
21
+ "epoch": 0.0,
22
+ "input_norm": 0.0,
23
+ "input_norm/layer0": 0.0,
24
+ "input_norm/layer1": 0.0,
25
+ "input_norm/layer2": 0.0,
26
+ "input_norm/layer3": 0.0,
27
+ "learning_rate": 0.001,
28
+ "loss": 3.5788,
29
+ "max_norm": 14.111713409423828,
30
+ "max_norm/layer0": 14.111713409423828,
31
+ "max_norm/layer1": 13.754842758178711,
32
+ "max_norm/layer2": 14.107428550720215,
33
+ "max_norm/layer3": 14.075852394104004,
34
+ "mean_norm": 8.452343925833702,
35
+ "mean_norm/layer0": 8.452010810375214,
36
+ "mean_norm/layer1": 8.451448321342468,
37
+ "mean_norm/layer2": 8.451122224330902,
38
+ "mean_norm/layer3": 8.454794347286224,
39
+ "multicode_k": 1,
40
+ "output_norm": 0.0,
41
+ "output_norm/layer0": 0.0,
42
+ "output_norm/layer1": 0.0,
43
+ "output_norm/layer2": 0.0,
44
+ "output_norm/layer3": 0.0,
45
+ "step": 1
46
+ },
47
+ {
48
+ "MSE": 0.0,
49
+ "MSE/layer0": 0.0,
50
+ "MSE/layer1": 0.0,
51
+ "MSE/layer2": 0.0,
52
+ "MSE/layer3": 0.0,
53
+ "dead_code_fraction": 1.0,
54
+ "dead_code_fraction/layer0": 1.0,
55
+ "dead_code_fraction/layer1": 1.0,
56
+ "dead_code_fraction/layer2": 1.0,
57
+ "dead_code_fraction/layer3": 1.0,
58
+ "epoch": 0.03,
59
+ "input_norm": 0.0,
60
+ "input_norm/layer0": 0.0,
61
+ "input_norm/layer1": 0.0,
62
+ "input_norm/layer2": 0.0,
63
+ "input_norm/layer3": 0.0,
64
+ "learning_rate": 0.001,
65
+ "loss": 2.2465,
66
+ "max_norm": 14.111713409423828,
67
+ "max_norm/layer0": 14.111713409423828,
68
+ "max_norm/layer1": 13.313672065734863,
69
+ "max_norm/layer2": 13.402200698852539,
70
+ "max_norm/layer3": 13.972790718078613,
71
+ "mean_norm": 8.24767641723156,
72
+ "mean_norm/layer0": 8.446629762649536,
73
+ "mean_norm/layer1": 8.26882529258728,
74
+ "mean_norm/layer2": 8.018843710422516,
75
+ "mean_norm/layer3": 8.256406903266907,
76
+ "multicode_k": 1,
77
+ "output_norm": 0.0,
78
+ "output_norm/layer0": 0.0,
79
+ "output_norm/layer1": 0.0,
80
+ "output_norm/layer2": 0.0,
81
+ "output_norm/layer3": 0.0,
82
+ "step": 500
83
+ },
84
+ {
85
+ "epoch": 0.03,
86
+ "eval_MSE/layer0": 0.0,
87
+ "eval_MSE/layer1": 0.0,
88
+ "eval_MSE/layer2": 0.0,
89
+ "eval_MSE/layer3": 0.0,
90
+ "eval_accuracy": 0.3564852016178642,
91
+ "eval_dead_code_fraction/layer0": 1.0,
92
+ "eval_dead_code_fraction/layer1": 1.0,
93
+ "eval_dead_code_fraction/layer2": 1.0,
94
+ "eval_dead_code_fraction/layer3": 1.0,
95
+ "eval_first_transition_accuracy": 0.31,
96
+ "eval_input_norm/layer0": 0.0,
97
+ "eval_input_norm/layer1": 0.0,
98
+ "eval_input_norm/layer2": 0.0,
99
+ "eval_input_norm/layer3": 0.0,
100
+ "eval_loss": 1.8386362791061401,
101
+ "eval_multicode_k": 1,
102
+ "eval_output_norm/layer0": 0.0,
103
+ "eval_output_norm/layer1": 0.0,
104
+ "eval_output_norm/layer2": 0.0,
105
+ "eval_output_norm/layer3": 0.0,
106
+ "eval_runtime": 40.7699,
107
+ "eval_samples_per_second": 803.731,
108
+ "eval_steps_per_second": 1.57,
109
+ "eval_transition_accuracy": 0.3554838709677419,
110
+ "step": 500
111
+ },
112
+ {
113
+ "MSE": 0.0,
114
+ "MSE/layer0": 0.0,
115
+ "MSE/layer1": 0.0,
116
+ "MSE/layer2": 0.0,
117
+ "MSE/layer3": 0.0,
118
+ "dead_code_fraction": 1.0,
119
+ "dead_code_fraction/layer0": 1.0,
120
+ "dead_code_fraction/layer1": 1.0,
121
+ "dead_code_fraction/layer2": 1.0,
122
+ "dead_code_fraction/layer3": 1.0,
123
+ "epoch": 0.05,
124
+ "input_norm": 0.0,
125
+ "input_norm/layer0": 0.0,
126
+ "input_norm/layer1": 0.0,
127
+ "input_norm/layer2": 0.0,
128
+ "input_norm/layer3": 0.0,
129
+ "learning_rate": 0.001,
130
+ "loss": 1.5981,
131
+ "max_norm": 14.111713409423828,
132
+ "max_norm/layer0": 14.111713409423828,
133
+ "max_norm/layer1": 13.26521110534668,
134
+ "max_norm/layer2": 13.196029663085938,
135
+ "max_norm/layer3": 13.539477348327637,
136
+ "mean_norm": 8.097165614366531,
137
+ "mean_norm/layer0": 8.447738409042358,
138
+ "mean_norm/layer1": 8.100942492485046,
139
+ "mean_norm/layer2": 7.6819451451301575,
140
+ "mean_norm/layer3": 8.158036410808563,
141
+ "multicode_k": 1,
142
+ "output_norm": 0.0,
143
+ "output_norm/layer0": 0.0,
144
+ "output_norm/layer1": 0.0,
145
+ "output_norm/layer2": 0.0,
146
+ "output_norm/layer3": 0.0,
147
+ "step": 1000
148
+ },
149
+ {
150
+ "epoch": 0.05,
151
+ "eval_MSE/layer0": 0.0,
152
+ "eval_MSE/layer1": 0.0,
153
+ "eval_MSE/layer2": 0.0,
154
+ "eval_MSE/layer3": 0.0,
155
+ "eval_accuracy": 0.4204176054226132,
156
+ "eval_dead_code_fraction/layer0": 1.0,
157
+ "eval_dead_code_fraction/layer1": 1.0,
158
+ "eval_dead_code_fraction/layer2": 1.0,
159
+ "eval_dead_code_fraction/layer3": 1.0,
160
+ "eval_first_transition_accuracy": 0.58,
161
+ "eval_input_norm/layer0": 0.0,
162
+ "eval_input_norm/layer1": 0.0,
163
+ "eval_input_norm/layer2": 0.0,
164
+ "eval_input_norm/layer3": 0.0,
165
+ "eval_loss": 1.4651859998703003,
166
+ "eval_multicode_k": 1,
167
+ "eval_output_norm/layer0": 0.0,
168
+ "eval_output_norm/layer1": 0.0,
169
+ "eval_output_norm/layer2": 0.0,
170
+ "eval_output_norm/layer3": 0.0,
171
+ "eval_runtime": 40.3952,
172
+ "eval_samples_per_second": 811.185,
173
+ "eval_steps_per_second": 1.584,
174
+ "eval_transition_accuracy": 0.5014516129032258,
175
+ "step": 1000
176
+ },
177
+ {
178
+ "MSE": 0.0,
179
+ "MSE/layer0": 0.0,
180
+ "MSE/layer1": 0.0,
181
+ "MSE/layer2": 0.0,
182
+ "MSE/layer3": 0.0,
183
+ "dead_code_fraction": 1.0,
184
+ "dead_code_fraction/layer0": 1.0,
185
+ "dead_code_fraction/layer1": 1.0,
186
+ "dead_code_fraction/layer2": 1.0,
187
+ "dead_code_fraction/layer3": 1.0,
188
+ "epoch": 0.07,
189
+ "input_norm": 0.0,
190
+ "input_norm/layer0": 0.0,
191
+ "input_norm/layer1": 0.0,
192
+ "input_norm/layer2": 0.0,
193
+ "input_norm/layer3": 0.0,
194
+ "learning_rate": 0.001,
195
+ "loss": 1.3928,
196
+ "max_norm": 14.111713409423828,
197
+ "max_norm/layer0": 14.111713409423828,
198
+ "max_norm/layer1": 13.196317672729492,
199
+ "max_norm/layer2": 12.821307182312012,
200
+ "max_norm/layer3": 13.539477348327637,
201
+ "mean_norm": 8.05762867629528,
202
+ "mean_norm/layer0": 8.449561476707458,
203
+ "mean_norm/layer1": 8.045644223690033,
204
+ "mean_norm/layer2": 7.585634410381317,
205
+ "mean_norm/layer3": 8.149674594402313,
206
+ "multicode_k": 1,
207
+ "output_norm": 0.0,
208
+ "output_norm/layer0": 0.0,
209
+ "output_norm/layer1": 0.0,
210
+ "output_norm/layer2": 0.0,
211
+ "output_norm/layer3": 0.0,
212
+ "step": 1500
213
+ },
214
+ {
215
+ "epoch": 0.07,
216
+ "eval_MSE/layer0": 0.0,
217
+ "eval_MSE/layer1": 0.0,
218
+ "eval_MSE/layer2": 0.0,
219
+ "eval_MSE/layer3": 0.0,
220
+ "eval_accuracy": 0.4378030131182333,
221
+ "eval_dead_code_fraction/layer0": 1.0,
222
+ "eval_dead_code_fraction/layer1": 1.0,
223
+ "eval_dead_code_fraction/layer2": 1.0,
224
+ "eval_dead_code_fraction/layer3": 1.0,
225
+ "eval_first_transition_accuracy": 0.79,
226
+ "eval_input_norm/layer0": 0.0,
227
+ "eval_input_norm/layer1": 0.0,
228
+ "eval_input_norm/layer2": 0.0,
229
+ "eval_input_norm/layer3": 0.0,
230
+ "eval_loss": 1.354053258895874,
231
+ "eval_multicode_k": 1,
232
+ "eval_output_norm/layer0": 0.0,
233
+ "eval_output_norm/layer1": 0.0,
234
+ "eval_output_norm/layer2": 0.0,
235
+ "eval_output_norm/layer3": 0.0,
236
+ "eval_runtime": 40.3967,
237
+ "eval_samples_per_second": 811.156,
238
+ "eval_steps_per_second": 1.584,
239
+ "eval_transition_accuracy": 0.555,
240
+ "step": 1500
241
+ },
242
+ {
243
+ "MSE": 0.0,
244
+ "MSE/layer0": 0.0,
245
+ "MSE/layer1": 0.0,
246
+ "MSE/layer2": 0.0,
247
+ "MSE/layer3": 0.0,
248
+ "dead_code_fraction": 1.0,
249
+ "dead_code_fraction/layer0": 1.0,
250
+ "dead_code_fraction/layer1": 1.0,
251
+ "dead_code_fraction/layer2": 1.0,
252
+ "dead_code_fraction/layer3": 1.0,
253
+ "epoch": 0.1,
254
+ "input_norm": 0.0,
255
+ "input_norm/layer0": 0.0,
256
+ "input_norm/layer1": 0.0,
257
+ "input_norm/layer2": 0.0,
258
+ "input_norm/layer3": 0.0,
259
+ "learning_rate": 0.001,
260
+ "loss": 1.3405,
261
+ "max_norm": 14.111713409423828,
262
+ "max_norm/layer0": 14.111713409423828,
263
+ "max_norm/layer1": 12.928963661193848,
264
+ "max_norm/layer2": 12.629460334777832,
265
+ "max_norm/layer3": 13.539477348327637,
266
+ "mean_norm": 8.056178480386734,
267
+ "mean_norm/layer0": 8.451180398464203,
268
+ "mean_norm/layer1": 8.046804785728455,
269
+ "mean_norm/layer2": 7.5705525279045105,
270
+ "mean_norm/layer3": 8.156176209449768,
271
+ "multicode_k": 1,
272
+ "output_norm": 0.0,
273
+ "output_norm/layer0": 0.0,
274
+ "output_norm/layer1": 0.0,
275
+ "output_norm/layer2": 0.0,
276
+ "output_norm/layer3": 0.0,
277
+ "step": 2000
278
+ },
279
+ {
280
+ "epoch": 0.1,
281
+ "eval_MSE/layer0": 0.0,
282
+ "eval_MSE/layer1": 0.0,
283
+ "eval_MSE/layer2": 0.0,
284
+ "eval_MSE/layer3": 0.0,
285
+ "eval_accuracy": 0.4426980807086614,
286
+ "eval_dead_code_fraction/layer0": 1.0,
287
+ "eval_dead_code_fraction/layer1": 1.0,
288
+ "eval_dead_code_fraction/layer2": 1.0,
289
+ "eval_dead_code_fraction/layer3": 1.0,
290
+ "eval_first_transition_accuracy": 0.82,
291
+ "eval_input_norm/layer0": 0.0,
292
+ "eval_input_norm/layer1": 0.0,
293
+ "eval_input_norm/layer2": 0.0,
294
+ "eval_input_norm/layer3": 0.0,
295
+ "eval_loss": 1.3263764381408691,
296
+ "eval_multicode_k": 1,
297
+ "eval_output_norm/layer0": 0.0,
298
+ "eval_output_norm/layer1": 0.0,
299
+ "eval_output_norm/layer2": 0.0,
300
+ "eval_output_norm/layer3": 0.0,
301
+ "eval_runtime": 40.4538,
302
+ "eval_samples_per_second": 810.011,
303
+ "eval_steps_per_second": 1.582,
304
+ "eval_transition_accuracy": 0.5756451612903226,
305
+ "step": 2000
306
+ },
307
+ {
308
+ "MSE": 0.0,
309
+ "MSE/layer0": 0.0,
310
+ "MSE/layer1": 0.0,
311
+ "MSE/layer2": 0.0,
312
+ "MSE/layer3": 0.0,
313
+ "dead_code_fraction": 1.0,
314
+ "dead_code_fraction/layer0": 1.0,
315
+ "dead_code_fraction/layer1": 1.0,
316
+ "dead_code_fraction/layer2": 1.0,
317
+ "dead_code_fraction/layer3": 1.0,
318
+ "epoch": 0.12,
319
+ "input_norm": 0.0,
320
+ "input_norm/layer0": 0.0,
321
+ "input_norm/layer1": 0.0,
322
+ "input_norm/layer2": 0.0,
323
+ "input_norm/layer3": 0.0,
324
+ "learning_rate": 0.001,
325
+ "loss": 1.3189,
326
+ "max_norm": 14.111713409423828,
327
+ "max_norm/layer0": 14.111713409423828,
328
+ "max_norm/layer1": 12.928963661193848,
329
+ "max_norm/layer2": 12.610198974609375,
330
+ "max_norm/layer3": 13.539477348327637,
331
+ "mean_norm": 8.065410792827606,
332
+ "mean_norm/layer0": 8.452252388000488,
333
+ "mean_norm/layer1": 8.05913120508194,
334
+ "mean_norm/layer2": 7.583977818489075,
335
+ "mean_norm/layer3": 8.166281759738922,
336
+ "multicode_k": 1,
337
+ "output_norm": 0.0,
338
+ "output_norm/layer0": 0.0,
339
+ "output_norm/layer1": 0.0,
340
+ "output_norm/layer2": 0.0,
341
+ "output_norm/layer3": 0.0,
342
+ "step": 2500
343
+ },
344
+ {
345
+ "epoch": 0.12,
346
+ "eval_MSE/layer0": 0.0,
347
+ "eval_MSE/layer1": 0.0,
348
+ "eval_MSE/layer2": 0.0,
349
+ "eval_MSE/layer3": 0.0,
350
+ "eval_accuracy": 0.4445690245140256,
351
+ "eval_dead_code_fraction/layer0": 1.0,
352
+ "eval_dead_code_fraction/layer1": 1.0,
353
+ "eval_dead_code_fraction/layer2": 1.0,
354
+ "eval_dead_code_fraction/layer3": 1.0,
355
+ "eval_first_transition_accuracy": 0.86,
356
+ "eval_input_norm/layer0": 0.0,
357
+ "eval_input_norm/layer1": 0.0,
358
+ "eval_input_norm/layer2": 0.0,
359
+ "eval_input_norm/layer3": 0.0,
360
+ "eval_loss": 1.3187371492385864,
361
+ "eval_multicode_k": 1,
362
+ "eval_output_norm/layer0": 0.0,
363
+ "eval_output_norm/layer1": 0.0,
364
+ "eval_output_norm/layer2": 0.0,
365
+ "eval_output_norm/layer3": 0.0,
366
+ "eval_runtime": 40.0204,
367
+ "eval_samples_per_second": 818.782,
368
+ "eval_steps_per_second": 1.599,
369
+ "eval_transition_accuracy": 0.5575806451612904,
370
+ "step": 2500
371
+ },
372
+ {
373
+ "MSE": 0.0,
374
+ "MSE/layer0": 0.0,
375
+ "MSE/layer1": 0.0,
376
+ "MSE/layer2": 0.0,
377
+ "MSE/layer3": 0.0,
378
+ "dead_code_fraction": 1.0,
379
+ "dead_code_fraction/layer0": 1.0,
380
+ "dead_code_fraction/layer1": 1.0,
381
+ "dead_code_fraction/layer2": 1.0,
382
+ "dead_code_fraction/layer3": 1.0,
383
+ "epoch": 0.15,
384
+ "input_norm": 0.0,
385
+ "input_norm/layer0": 0.0,
386
+ "input_norm/layer1": 0.0,
387
+ "input_norm/layer2": 0.0,
388
+ "input_norm/layer3": 0.0,
389
+ "learning_rate": 0.001,
390
+ "loss": 1.308,
391
+ "max_norm": 14.111713409423828,
392
+ "max_norm/layer0": 14.111713409423828,
393
+ "max_norm/layer1": 12.928963661193848,
394
+ "max_norm/layer2": 12.610198974609375,
395
+ "max_norm/layer3": 13.539477348327637,
396
+ "mean_norm": 8.076771080493927,
397
+ "mean_norm/layer0": 8.453002035617828,
398
+ "mean_norm/layer1": 8.070474624633789,
399
+ "mean_norm/layer2": 7.606890320777893,
400
+ "mean_norm/layer3": 8.176717340946198,
401
+ "multicode_k": 1,
402
+ "output_norm": 0.0,
403
+ "output_norm/layer0": 0.0,
404
+ "output_norm/layer1": 0.0,
405
+ "output_norm/layer2": 0.0,
406
+ "output_norm/layer3": 0.0,
407
+ "step": 3000
408
+ },
409
+ {
410
+ "epoch": 0.15,
411
+ "eval_MSE/layer0": 0.0,
412
+ "eval_MSE/layer1": 0.0,
413
+ "eval_MSE/layer2": 0.0,
414
+ "eval_MSE/layer3": 0.0,
415
+ "eval_accuracy": 0.44684558778297245,
416
+ "eval_dead_code_fraction/layer0": 1.0,
417
+ "eval_dead_code_fraction/layer1": 1.0,
418
+ "eval_dead_code_fraction/layer2": 1.0,
419
+ "eval_dead_code_fraction/layer3": 1.0,
420
+ "eval_first_transition_accuracy": 0.82,
421
+ "eval_input_norm/layer0": 0.0,
422
+ "eval_input_norm/layer1": 0.0,
423
+ "eval_input_norm/layer2": 0.0,
424
+ "eval_input_norm/layer3": 0.0,
425
+ "eval_loss": 1.3064292669296265,
426
+ "eval_multicode_k": 1,
427
+ "eval_output_norm/layer0": 0.0,
428
+ "eval_output_norm/layer1": 0.0,
429
+ "eval_output_norm/layer2": 0.0,
430
+ "eval_output_norm/layer3": 0.0,
431
+ "eval_runtime": 40.4873,
432
+ "eval_samples_per_second": 809.341,
433
+ "eval_steps_per_second": 1.581,
434
+ "eval_transition_accuracy": 0.557258064516129,
435
+ "step": 3000
436
+ },
437
+ {
438
+ "MSE": 0.0,
439
+ "MSE/layer0": 0.0,
440
+ "MSE/layer1": 0.0,
441
+ "MSE/layer2": 0.0,
442
+ "MSE/layer3": 0.0,
443
+ "dead_code_fraction": 1.0,
444
+ "dead_code_fraction/layer0": 1.0,
445
+ "dead_code_fraction/layer1": 1.0,
446
+ "dead_code_fraction/layer2": 1.0,
447
+ "dead_code_fraction/layer3": 1.0,
448
+ "epoch": 0.17,
449
+ "input_norm": 0.0,
450
+ "input_norm/layer0": 0.0,
451
+ "input_norm/layer1": 0.0,
452
+ "input_norm/layer2": 0.0,
453
+ "input_norm/layer3": 0.0,
454
+ "learning_rate": 0.001,
455
+ "loss": 1.3009,
456
+ "max_norm": 14.111713409423828,
457
+ "max_norm/layer0": 14.111713409423828,
458
+ "max_norm/layer1": 12.928963661193848,
459
+ "max_norm/layer2": 12.480420112609863,
460
+ "max_norm/layer3": 13.539477348327637,
461
+ "mean_norm": 8.087428167462349,
462
+ "mean_norm/layer0": 8.45363199710846,
463
+ "mean_norm/layer1": 8.081151723861694,
464
+ "mean_norm/layer2": 7.628681242465973,
465
+ "mean_norm/layer3": 8.186247706413269,
466
+ "multicode_k": 1,
467
+ "output_norm": 0.0,
468
+ "output_norm/layer0": 0.0,
469
+ "output_norm/layer1": 0.0,
470
+ "output_norm/layer2": 0.0,
471
+ "output_norm/layer3": 0.0,
472
+ "step": 3500
473
+ },
474
+ {
475
+ "epoch": 0.17,
476
+ "eval_MSE/layer0": 0.0,
477
+ "eval_MSE/layer1": 0.0,
478
+ "eval_MSE/layer2": 0.0,
479
+ "eval_MSE/layer3": 0.0,
480
+ "eval_accuracy": 0.44931342658095474,
481
+ "eval_dead_code_fraction/layer0": 1.0,
482
+ "eval_dead_code_fraction/layer1": 1.0,
483
+ "eval_dead_code_fraction/layer2": 1.0,
484
+ "eval_dead_code_fraction/layer3": 1.0,
485
+ "eval_first_transition_accuracy": 0.87,
486
+ "eval_input_norm/layer0": 0.0,
487
+ "eval_input_norm/layer1": 0.0,
488
+ "eval_input_norm/layer2": 0.0,
489
+ "eval_input_norm/layer3": 0.0,
490
+ "eval_loss": 1.2963054180145264,
491
+ "eval_multicode_k": 1,
492
+ "eval_output_norm/layer0": 0.0,
493
+ "eval_output_norm/layer1": 0.0,
494
+ "eval_output_norm/layer2": 0.0,
495
+ "eval_output_norm/layer3": 0.0,
496
+ "eval_runtime": 40.5536,
497
+ "eval_samples_per_second": 808.018,
498
+ "eval_steps_per_second": 1.578,
499
+ "eval_transition_accuracy": 0.5762903225806452,
500
+ "step": 3500
501
+ },
502
+ {
503
+ "MSE": 0.0,
504
+ "MSE/layer0": 0.0,
505
+ "MSE/layer1": 0.0,
506
+ "MSE/layer2": 0.0,
507
+ "MSE/layer3": 0.0,
508
+ "dead_code_fraction": 1.0,
509
+ "dead_code_fraction/layer0": 1.0,
510
+ "dead_code_fraction/layer1": 1.0,
511
+ "dead_code_fraction/layer2": 1.0,
512
+ "dead_code_fraction/layer3": 1.0,
513
+ "epoch": 0.2,
514
+ "input_norm": 0.0,
515
+ "input_norm/layer0": 0.0,
516
+ "input_norm/layer1": 0.0,
517
+ "input_norm/layer2": 0.0,
518
+ "input_norm/layer3": 0.0,
519
+ "learning_rate": 0.001,
520
+ "loss": 1.2965,
521
+ "max_norm": 14.111713409423828,
522
+ "max_norm/layer0": 14.111713409423828,
523
+ "max_norm/layer1": 12.945528030395508,
524
+ "max_norm/layer2": 13.130059242248535,
525
+ "max_norm/layer3": 13.539477348327637,
526
+ "mean_norm": 8.097855687141418,
527
+ "mean_norm/layer0": 8.454240918159485,
528
+ "mean_norm/layer1": 8.09092777967453,
529
+ "mean_norm/layer2": 7.649804890155792,
530
+ "mean_norm/layer3": 8.196449160575867,
531
+ "multicode_k": 1,
532
+ "output_norm": 0.0,
533
+ "output_norm/layer0": 0.0,
534
+ "output_norm/layer1": 0.0,
535
+ "output_norm/layer2": 0.0,
536
+ "output_norm/layer3": 0.0,
537
+ "step": 4000
538
+ },
539
+ {
540
+ "epoch": 0.2,
541
+ "eval_MSE/layer0": 0.0,
542
+ "eval_MSE/layer1": 0.0,
543
+ "eval_MSE/layer2": 0.0,
544
+ "eval_MSE/layer3": 0.0,
545
+ "eval_accuracy": 0.4493698961152805,
546
+ "eval_dead_code_fraction/layer0": 1.0,
547
+ "eval_dead_code_fraction/layer1": 1.0,
548
+ "eval_dead_code_fraction/layer2": 1.0,
549
+ "eval_dead_code_fraction/layer3": 1.0,
550
+ "eval_first_transition_accuracy": 0.9,
551
+ "eval_input_norm/layer0": 0.0,
552
+ "eval_input_norm/layer1": 0.0,
553
+ "eval_input_norm/layer2": 0.0,
554
+ "eval_input_norm/layer3": 0.0,
555
+ "eval_loss": 1.2922283411026,
556
+ "eval_multicode_k": 1,
557
+ "eval_output_norm/layer0": 0.0,
558
+ "eval_output_norm/layer1": 0.0,
559
+ "eval_output_norm/layer2": 0.0,
560
+ "eval_output_norm/layer3": 0.0,
561
+ "eval_runtime": 40.3081,
562
+ "eval_samples_per_second": 812.939,
563
+ "eval_steps_per_second": 1.588,
564
+ "eval_transition_accuracy": 0.567741935483871,
565
+ "step": 4000
566
+ },
567
+ {
568
+ "MSE": 0.0,
569
+ "MSE/layer0": 0.0,
570
+ "MSE/layer1": 0.0,
571
+ "MSE/layer2": 0.0,
572
+ "MSE/layer3": 0.0,
573
+ "dead_code_fraction": 1.0,
574
+ "dead_code_fraction/layer0": 1.0,
575
+ "dead_code_fraction/layer1": 1.0,
576
+ "dead_code_fraction/layer2": 1.0,
577
+ "dead_code_fraction/layer3": 1.0,
578
+ "epoch": 0.23,
579
+ "input_norm": 0.0,
580
+ "input_norm/layer0": 0.0,
581
+ "input_norm/layer1": 0.0,
582
+ "input_norm/layer2": 0.0,
583
+ "input_norm/layer3": 0.0,
584
+ "learning_rate": 0.001,
585
+ "loss": 1.2919,
586
+ "max_norm": 14.111713409423828,
587
+ "max_norm/layer0": 14.111713409423828,
588
+ "max_norm/layer1": 13.611448287963867,
589
+ "max_norm/layer2": 13.752634048461914,
590
+ "max_norm/layer3": 13.539477348327637,
591
+ "mean_norm": 8.10820010304451,
592
+ "mean_norm/layer0": 8.454896569252014,
593
+ "mean_norm/layer1": 8.099043369293213,
594
+ "mean_norm/layer2": 7.671496093273163,
595
+ "mean_norm/layer3": 8.20736438035965,
596
+ "multicode_k": 1,
597
+ "output_norm": 0.0,
598
+ "output_norm/layer0": 0.0,
599
+ "output_norm/layer1": 0.0,
600
+ "output_norm/layer2": 0.0,
601
+ "output_norm/layer3": 0.0,
602
+ "step": 4500
603
+ },
604
+ {
605
+ "epoch": 0.23,
606
+ "eval_MSE/layer0": 0.0,
607
+ "eval_MSE/layer1": 0.0,
608
+ "eval_MSE/layer2": 0.0,
609
+ "eval_MSE/layer3": 0.0,
610
+ "eval_accuracy": 0.449889175535187,
611
+ "eval_dead_code_fraction/layer0": 1.0,
612
+ "eval_dead_code_fraction/layer1": 1.0,
613
+ "eval_dead_code_fraction/layer2": 1.0,
614
+ "eval_dead_code_fraction/layer3": 1.0,
615
+ "eval_first_transition_accuracy": 0.91,
616
+ "eval_input_norm/layer0": 0.0,
617
+ "eval_input_norm/layer1": 0.0,
618
+ "eval_input_norm/layer2": 0.0,
619
+ "eval_input_norm/layer3": 0.0,
620
+ "eval_loss": 1.2880299091339111,
621
+ "eval_multicode_k": 1,
622
+ "eval_output_norm/layer0": 0.0,
623
+ "eval_output_norm/layer1": 0.0,
624
+ "eval_output_norm/layer2": 0.0,
625
+ "eval_output_norm/layer3": 0.0,
626
+ "eval_runtime": 40.5303,
627
+ "eval_samples_per_second": 808.482,
628
+ "eval_steps_per_second": 1.579,
629
+ "eval_transition_accuracy": 0.5820967741935484,
630
+ "step": 4500
631
+ },
632
+ {
633
+ "MSE": 0.0,
634
+ "MSE/layer0": 0.0,
635
+ "MSE/layer1": 0.0,
636
+ "MSE/layer2": 0.0,
637
+ "MSE/layer3": 0.0,
638
+ "dead_code_fraction": 1.0,
639
+ "dead_code_fraction/layer0": 1.0,
640
+ "dead_code_fraction/layer1": 1.0,
641
+ "dead_code_fraction/layer2": 1.0,
642
+ "dead_code_fraction/layer3": 1.0,
643
+ "epoch": 0.25,
644
+ "input_norm": 0.0,
645
+ "input_norm/layer0": 0.0,
646
+ "input_norm/layer1": 0.0,
647
+ "input_norm/layer2": 0.0,
648
+ "input_norm/layer3": 0.0,
649
+ "learning_rate": 0.001,
650
+ "loss": 1.2889,
651
+ "max_norm": 14.629088401794434,
652
+ "max_norm/layer0": 14.111713409423828,
653
+ "max_norm/layer1": 14.212156295776367,
654
+ "max_norm/layer2": 14.629088401794434,
655
+ "max_norm/layer3": 13.539477348327637,
656
+ "mean_norm": 8.117734983563423,
657
+ "mean_norm/layer0": 8.455368101596832,
658
+ "mean_norm/layer1": 8.104798018932343,
659
+ "mean_norm/layer2": 7.693721830844879,
660
+ "mean_norm/layer3": 8.217051982879639,
661
+ "multicode_k": 1,
662
+ "output_norm": 0.0,
663
+ "output_norm/layer0": 0.0,
664
+ "output_norm/layer1": 0.0,
665
+ "output_norm/layer2": 0.0,
666
+ "output_norm/layer3": 0.0,
667
+ "step": 5000
668
+ },
669
+ {
670
+ "epoch": 0.25,
671
+ "eval_MSE/layer0": 0.0,
672
+ "eval_MSE/layer1": 0.0,
673
+ "eval_MSE/layer2": 0.0,
674
+ "eval_MSE/layer3": 0.0,
675
+ "eval_accuracy": 0.45008597786970966,
676
+ "eval_dead_code_fraction/layer0": 1.0,
677
+ "eval_dead_code_fraction/layer1": 1.0,
678
+ "eval_dead_code_fraction/layer2": 1.0,
679
+ "eval_dead_code_fraction/layer3": 1.0,
680
+ "eval_first_transition_accuracy": 0.9,
681
+ "eval_input_norm/layer0": 0.0,
682
+ "eval_input_norm/layer1": 0.0,
683
+ "eval_input_norm/layer2": 0.0,
684
+ "eval_input_norm/layer3": 0.0,
685
+ "eval_loss": 1.2855565547943115,
686
+ "eval_multicode_k": 1,
687
+ "eval_output_norm/layer0": 0.0,
688
+ "eval_output_norm/layer1": 0.0,
689
+ "eval_output_norm/layer2": 0.0,
690
+ "eval_output_norm/layer3": 0.0,
691
+ "eval_runtime": 40.1444,
692
+ "eval_samples_per_second": 816.252,
693
+ "eval_steps_per_second": 1.594,
694
+ "eval_transition_accuracy": 0.56,
695
+ "step": 5000
696
+ },
697
+ {
698
+ "MSE": 0.0,
699
+ "MSE/layer0": 0.0,
700
+ "MSE/layer1": 0.0,
701
+ "MSE/layer2": 0.0,
702
+ "MSE/layer3": 0.0,
703
+ "dead_code_fraction": 1.0,
704
+ "dead_code_fraction/layer0": 1.0,
705
+ "dead_code_fraction/layer1": 1.0,
706
+ "dead_code_fraction/layer2": 1.0,
707
+ "dead_code_fraction/layer3": 1.0,
708
+ "epoch": 0.28,
709
+ "input_norm": 0.0,
710
+ "input_norm/layer0": 0.0,
711
+ "input_norm/layer1": 0.0,
712
+ "input_norm/layer2": 0.0,
713
+ "input_norm/layer3": 0.0,
714
+ "learning_rate": 0.001,
715
+ "loss": 1.2855,
716
+ "max_norm": 15.594667434692383,
717
+ "max_norm/layer0": 14.111713409423828,
718
+ "max_norm/layer1": 14.633105278015137,
719
+ "max_norm/layer2": 15.594667434692383,
720
+ "max_norm/layer3": 13.539477348327637,
721
+ "mean_norm": 8.12691231071949,
722
+ "mean_norm/layer0": 8.45590054988861,
723
+ "mean_norm/layer1": 8.110510230064392,
724
+ "mean_norm/layer2": 7.71501624584198,
725
+ "mean_norm/layer3": 8.226222217082977,
726
+ "multicode_k": 1,
727
+ "output_norm": 0.0,
728
+ "output_norm/layer0": 0.0,
729
+ "output_norm/layer1": 0.0,
730
+ "output_norm/layer2": 0.0,
731
+ "output_norm/layer3": 0.0,
732
+ "step": 5500
733
+ },
734
+ {
735
+ "epoch": 0.28,
736
+ "eval_MSE/layer0": 0.0,
737
+ "eval_MSE/layer1": 0.0,
738
+ "eval_MSE/layer2": 0.0,
739
+ "eval_MSE/layer3": 0.0,
740
+ "eval_accuracy": 0.4503207469549705,
741
+ "eval_dead_code_fraction/layer0": 1.0,
742
+ "eval_dead_code_fraction/layer1": 1.0,
743
+ "eval_dead_code_fraction/layer2": 1.0,
744
+ "eval_dead_code_fraction/layer3": 1.0,
745
+ "eval_first_transition_accuracy": 0.9,
746
+ "eval_input_norm/layer0": 0.0,
747
+ "eval_input_norm/layer1": 0.0,
748
+ "eval_input_norm/layer2": 0.0,
749
+ "eval_input_norm/layer3": 0.0,
750
+ "eval_loss": 1.2815940380096436,
751
+ "eval_multicode_k": 1,
752
+ "eval_output_norm/layer0": 0.0,
753
+ "eval_output_norm/layer1": 0.0,
754
+ "eval_output_norm/layer2": 0.0,
755
+ "eval_output_norm/layer3": 0.0,
756
+ "eval_runtime": 40.4723,
757
+ "eval_samples_per_second": 809.64,
758
+ "eval_steps_per_second": 1.581,
759
+ "eval_transition_accuracy": 0.6016129032258064,
760
+ "step": 5500
761
+ },
762
+ {
763
+ "MSE": 0.0,
764
+ "MSE/layer0": 0.0,
765
+ "MSE/layer1": 0.0,
766
+ "MSE/layer2": 0.0,
767
+ "MSE/layer3": 0.0,
768
+ "dead_code_fraction": 1.0,
769
+ "dead_code_fraction/layer0": 1.0,
770
+ "dead_code_fraction/layer1": 1.0,
771
+ "dead_code_fraction/layer2": 1.0,
772
+ "dead_code_fraction/layer3": 1.0,
773
+ "epoch": 0.3,
774
+ "input_norm": 0.0,
775
+ "input_norm/layer0": 0.0,
776
+ "input_norm/layer1": 0.0,
777
+ "input_norm/layer2": 0.0,
778
+ "input_norm/layer3": 0.0,
779
+ "learning_rate": 0.001,
780
+ "loss": 1.2828,
781
+ "max_norm": 16.523221969604492,
782
+ "max_norm/layer0": 14.111713409423828,
783
+ "max_norm/layer1": 15.140296936035156,
784
+ "max_norm/layer2": 16.523221969604492,
785
+ "max_norm/layer3": 15.213438987731934,
786
+ "mean_norm": 8.136623591184616,
787
+ "mean_norm/layer0": 8.456406712532043,
788
+ "mean_norm/layer1": 8.11625623703003,
789
+ "mean_norm/layer2": 7.738001704216003,
790
+ "mean_norm/layer3": 8.235829710960388,
791
+ "multicode_k": 1,
792
+ "output_norm": 0.0,
793
+ "output_norm/layer0": 0.0,
794
+ "output_norm/layer1": 0.0,
795
+ "output_norm/layer2": 0.0,
796
+ "output_norm/layer3": 0.0,
797
+ "step": 6000
798
+ },
799
+ {
800
+ "epoch": 0.3,
801
+ "eval_MSE/layer0": 0.0,
802
+ "eval_MSE/layer1": 0.0,
803
+ "eval_MSE/layer2": 0.0,
804
+ "eval_MSE/layer3": 0.0,
805
+ "eval_accuracy": 0.45019699457123524,
806
+ "eval_dead_code_fraction/layer0": 1.0,
807
+ "eval_dead_code_fraction/layer1": 1.0,
808
+ "eval_dead_code_fraction/layer2": 1.0,
809
+ "eval_dead_code_fraction/layer3": 1.0,
810
+ "eval_first_transition_accuracy": 0.87,
811
+ "eval_input_norm/layer0": 0.0,
812
+ "eval_input_norm/layer1": 0.0,
813
+ "eval_input_norm/layer2": 0.0,
814
+ "eval_input_norm/layer3": 0.0,
815
+ "eval_loss": 1.2844185829162598,
816
+ "eval_multicode_k": 1,
817
+ "eval_output_norm/layer0": 0.0,
818
+ "eval_output_norm/layer1": 0.0,
819
+ "eval_output_norm/layer2": 0.0,
820
+ "eval_output_norm/layer3": 0.0,
821
+ "eval_runtime": 40.4554,
822
+ "eval_samples_per_second": 809.979,
823
+ "eval_steps_per_second": 1.582,
824
+ "eval_transition_accuracy": 0.5733870967741935,
825
+ "step": 6000
826
+ },
827
+ {
828
+ "MSE": 0.0,
829
+ "MSE/layer0": 0.0,
830
+ "MSE/layer1": 0.0,
831
+ "MSE/layer2": 0.0,
832
+ "MSE/layer3": 0.0,
833
+ "dead_code_fraction": 1.0,
834
+ "dead_code_fraction/layer0": 1.0,
835
+ "dead_code_fraction/layer1": 1.0,
836
+ "dead_code_fraction/layer2": 1.0,
837
+ "dead_code_fraction/layer3": 1.0,
838
+ "epoch": 0.33,
839
+ "input_norm": 0.0,
840
+ "input_norm/layer0": 0.0,
841
+ "input_norm/layer1": 0.0,
842
+ "input_norm/layer2": 0.0,
843
+ "input_norm/layer3": 0.0,
844
+ "learning_rate": 0.001,
845
+ "loss": 1.2805,
846
+ "max_norm": 17.3682861328125,
847
+ "max_norm/layer0": 14.111713409423828,
848
+ "max_norm/layer1": 15.587692260742188,
849
+ "max_norm/layer2": 17.3682861328125,
850
+ "max_norm/layer3": 16.75887107849121,
851
+ "mean_norm": 8.145677655935287,
852
+ "mean_norm/layer0": 8.456890940666199,
853
+ "mean_norm/layer1": 8.120486974716187,
854
+ "mean_norm/layer2": 7.760386824607849,
855
+ "mean_norm/layer3": 8.244945883750916,
856
+ "multicode_k": 1,
857
+ "output_norm": 0.0,
858
+ "output_norm/layer0": 0.0,
859
+ "output_norm/layer1": 0.0,
860
+ "output_norm/layer2": 0.0,
861
+ "output_norm/layer3": 0.0,
862
+ "step": 6500
863
+ },
864
+ {
865
+ "epoch": 0.33,
866
+ "eval_MSE/layer0": 0.0,
867
+ "eval_MSE/layer1": 0.0,
868
+ "eval_MSE/layer2": 0.0,
869
+ "eval_MSE/layer3": 0.0,
870
+ "eval_accuracy": 0.45155730960875984,
871
+ "eval_dead_code_fraction/layer0": 1.0,
872
+ "eval_dead_code_fraction/layer1": 1.0,
873
+ "eval_dead_code_fraction/layer2": 1.0,
874
+ "eval_dead_code_fraction/layer3": 1.0,
875
+ "eval_first_transition_accuracy": 0.95,
876
+ "eval_input_norm/layer0": 0.0,
877
+ "eval_input_norm/layer1": 0.0,
878
+ "eval_input_norm/layer2": 0.0,
879
+ "eval_input_norm/layer3": 0.0,
880
+ "eval_loss": 1.277733325958252,
881
+ "eval_multicode_k": 1,
882
+ "eval_output_norm/layer0": 0.0,
883
+ "eval_output_norm/layer1": 0.0,
884
+ "eval_output_norm/layer2": 0.0,
885
+ "eval_output_norm/layer3": 0.0,
886
+ "eval_runtime": 40.4778,
887
+ "eval_samples_per_second": 809.529,
888
+ "eval_steps_per_second": 1.581,
889
+ "eval_transition_accuracy": 0.6083870967741936,
890
+ "step": 6500
891
+ },
892
+ {
893
+ "MSE": 0.0,
894
+ "MSE/layer0": 0.0,
895
+ "MSE/layer1": 0.0,
896
+ "MSE/layer2": 0.0,
897
+ "MSE/layer3": 0.0,
898
+ "dead_code_fraction": 1.0,
899
+ "dead_code_fraction/layer0": 1.0,
900
+ "dead_code_fraction/layer1": 1.0,
901
+ "dead_code_fraction/layer2": 1.0,
902
+ "dead_code_fraction/layer3": 1.0,
903
+ "epoch": 0.35,
904
+ "input_norm": 0.0,
905
+ "input_norm/layer0": 0.0,
906
+ "input_norm/layer1": 0.0,
907
+ "input_norm/layer2": 0.0,
908
+ "input_norm/layer3": 0.0,
909
+ "learning_rate": 0.001,
910
+ "loss": 1.2793,
911
+ "max_norm": 18.367448806762695,
912
+ "max_norm/layer0": 14.111713409423828,
913
+ "max_norm/layer1": 16.05299949645996,
914
+ "max_norm/layer2": 18.142847061157227,
915
+ "max_norm/layer3": 18.367448806762695,
916
+ "mean_norm": 8.15514886379242,
917
+ "mean_norm/layer0": 8.45737361907959,
918
+ "mean_norm/layer1": 8.127346932888031,
919
+ "mean_norm/layer2": 7.781527459621429,
920
+ "mean_norm/layer3": 8.254347443580627,
921
+ "multicode_k": 1,
922
+ "output_norm": 0.0,
923
+ "output_norm/layer0": 0.0,
924
+ "output_norm/layer1": 0.0,
925
+ "output_norm/layer2": 0.0,
926
+ "output_norm/layer3": 0.0,
927
+ "step": 7000
928
+ },
929
+ {
930
+ "epoch": 0.35,
931
+ "eval_MSE/layer0": 0.0,
932
+ "eval_MSE/layer1": 0.0,
933
+ "eval_MSE/layer2": 0.0,
934
+ "eval_MSE/layer3": 0.0,
935
+ "eval_accuracy": 0.45114255890132876,
936
+ "eval_dead_code_fraction/layer0": 1.0,
937
+ "eval_dead_code_fraction/layer1": 1.0,
938
+ "eval_dead_code_fraction/layer2": 1.0,
939
+ "eval_dead_code_fraction/layer3": 1.0,
940
+ "eval_first_transition_accuracy": 0.93,
941
+ "eval_input_norm/layer0": 0.0,
942
+ "eval_input_norm/layer1": 0.0,
943
+ "eval_input_norm/layer2": 0.0,
944
+ "eval_input_norm/layer3": 0.0,
945
+ "eval_loss": 1.2795634269714355,
946
+ "eval_multicode_k": 1,
947
+ "eval_output_norm/layer0": 0.0,
948
+ "eval_output_norm/layer1": 0.0,
949
+ "eval_output_norm/layer2": 0.0,
950
+ "eval_output_norm/layer3": 0.0,
951
+ "eval_runtime": 40.6598,
952
+ "eval_samples_per_second": 805.906,
953
+ "eval_steps_per_second": 1.574,
954
+ "eval_transition_accuracy": 0.5680645161290323,
955
+ "step": 7000
956
+ },
957
+ {
958
+ "MSE": 0.0,
959
+ "MSE/layer0": 0.0,
960
+ "MSE/layer1": 0.0,
961
+ "MSE/layer2": 0.0,
962
+ "MSE/layer3": 0.0,
963
+ "dead_code_fraction": 1.0,
964
+ "dead_code_fraction/layer0": 1.0,
965
+ "dead_code_fraction/layer1": 1.0,
966
+ "dead_code_fraction/layer2": 1.0,
967
+ "dead_code_fraction/layer3": 1.0,
968
+ "epoch": 0.38,
969
+ "input_norm": 0.0,
970
+ "input_norm/layer0": 0.0,
971
+ "input_norm/layer1": 0.0,
972
+ "input_norm/layer2": 0.0,
973
+ "input_norm/layer3": 0.0,
974
+ "learning_rate": 0.001,
975
+ "loss": 1.2785,
976
+ "max_norm": 20.019760131835938,
977
+ "max_norm/layer0": 14.111713409423828,
978
+ "max_norm/layer1": 16.42165756225586,
979
+ "max_norm/layer2": 19.017709732055664,
980
+ "max_norm/layer3": 20.019760131835938,
981
+ "mean_norm": 8.164311796426773,
982
+ "mean_norm/layer0": 8.457763373851776,
983
+ "mean_norm/layer1": 8.13219028711319,
984
+ "mean_norm/layer2": 7.803518235683441,
985
+ "mean_norm/layer3": 8.263775289058685,
986
+ "multicode_k": 1,
987
+ "output_norm": 0.0,
988
+ "output_norm/layer0": 0.0,
989
+ "output_norm/layer1": 0.0,
990
+ "output_norm/layer2": 0.0,
991
+ "output_norm/layer3": 0.0,
992
+ "step": 7500
993
+ },
994
+ {
995
+ "epoch": 0.38,
996
+ "eval_MSE/layer0": 0.0,
997
+ "eval_MSE/layer1": 0.0,
998
+ "eval_MSE/layer2": 0.0,
999
+ "eval_MSE/layer3": 0.0,
1000
+ "eval_accuracy": 0.45188651497908466,
1001
+ "eval_dead_code_fraction/layer0": 1.0,
1002
+ "eval_dead_code_fraction/layer1": 1.0,
1003
+ "eval_dead_code_fraction/layer2": 1.0,
1004
+ "eval_dead_code_fraction/layer3": 1.0,
1005
+ "eval_first_transition_accuracy": 0.95,
1006
+ "eval_input_norm/layer0": 0.0,
1007
+ "eval_input_norm/layer1": 0.0,
1008
+ "eval_input_norm/layer2": 0.0,
1009
+ "eval_input_norm/layer3": 0.0,
1010
+ "eval_loss": 1.2748253345489502,
1011
+ "eval_multicode_k": 1,
1012
+ "eval_output_norm/layer0": 0.0,
1013
+ "eval_output_norm/layer1": 0.0,
1014
+ "eval_output_norm/layer2": 0.0,
1015
+ "eval_output_norm/layer3": 0.0,
1016
+ "eval_runtime": 40.4299,
1017
+ "eval_samples_per_second": 810.489,
1018
+ "eval_steps_per_second": 1.583,
1019
+ "eval_transition_accuracy": 0.5919354838709677,
1020
+ "step": 7500
1021
+ },
1022
+ {
1023
+ "MSE": 0.0,
1024
+ "MSE/layer0": 0.0,
1025
+ "MSE/layer1": 0.0,
1026
+ "MSE/layer2": 0.0,
1027
+ "MSE/layer3": 0.0,
1028
+ "dead_code_fraction": 1.0,
1029
+ "dead_code_fraction/layer0": 1.0,
1030
+ "dead_code_fraction/layer1": 1.0,
1031
+ "dead_code_fraction/layer2": 1.0,
1032
+ "dead_code_fraction/layer3": 1.0,
1033
+ "epoch": 0.4,
1034
+ "input_norm": 0.0,
1035
+ "input_norm/layer0": 0.0,
1036
+ "input_norm/layer1": 0.0,
1037
+ "input_norm/layer2": 0.0,
1038
+ "input_norm/layer3": 0.0,
1039
+ "learning_rate": 0.001,
1040
+ "loss": 1.2764,
1041
+ "max_norm": 21.821395874023438,
1042
+ "max_norm/layer0": 14.158592224121094,
1043
+ "max_norm/layer1": 16.7973690032959,
1044
+ "max_norm/layer2": 19.965808868408203,
1045
+ "max_norm/layer3": 21.821395874023438,
1046
+ "mean_norm": 8.173083677887917,
1047
+ "mean_norm/layer0": 8.458155512809753,
1048
+ "mean_norm/layer1": 8.137401163578033,
1049
+ "mean_norm/layer2": 7.8241875767707825,
1050
+ "mean_norm/layer3": 8.272590458393097,
1051
+ "multicode_k": 1,
1052
+ "output_norm": 0.0,
1053
+ "output_norm/layer0": 0.0,
1054
+ "output_norm/layer1": 0.0,
1055
+ "output_norm/layer2": 0.0,
1056
+ "output_norm/layer3": 0.0,
1057
+ "step": 8000
1058
+ },
1059
+ {
1060
+ "epoch": 0.4,
1061
+ "eval_MSE/layer0": 0.0,
1062
+ "eval_MSE/layer1": 0.0,
1063
+ "eval_MSE/layer2": 0.0,
1064
+ "eval_MSE/layer3": 0.0,
1065
+ "eval_accuracy": 0.451844222902313,
1066
+ "eval_dead_code_fraction/layer0": 1.0,
1067
+ "eval_dead_code_fraction/layer1": 1.0,
1068
+ "eval_dead_code_fraction/layer2": 1.0,
1069
+ "eval_dead_code_fraction/layer3": 1.0,
1070
+ "eval_first_transition_accuracy": 0.9,
1071
+ "eval_input_norm/layer0": 0.0,
1072
+ "eval_input_norm/layer1": 0.0,
1073
+ "eval_input_norm/layer2": 0.0,
1074
+ "eval_input_norm/layer3": 0.0,
1075
+ "eval_loss": 1.276716709136963,
1076
+ "eval_multicode_k": 1,
1077
+ "eval_output_norm/layer0": 0.0,
1078
+ "eval_output_norm/layer1": 0.0,
1079
+ "eval_output_norm/layer2": 0.0,
1080
+ "eval_output_norm/layer3": 0.0,
1081
+ "eval_runtime": 40.3344,
1082
+ "eval_samples_per_second": 812.407,
1083
+ "eval_steps_per_second": 1.587,
1084
+ "eval_transition_accuracy": 0.5759677419354838,
1085
+ "step": 8000
1086
+ },
1087
+ {
1088
+ "MSE": 0.0,
1089
+ "MSE/layer0": 0.0,
1090
+ "MSE/layer1": 0.0,
1091
+ "MSE/layer2": 0.0,
1092
+ "MSE/layer3": 0.0,
1093
+ "dead_code_fraction": 1.0,
1094
+ "dead_code_fraction/layer0": 1.0,
1095
+ "dead_code_fraction/layer1": 1.0,
1096
+ "dead_code_fraction/layer2": 1.0,
1097
+ "dead_code_fraction/layer3": 1.0,
1098
+ "epoch": 0.42,
1099
+ "input_norm": 0.0,
1100
+ "input_norm/layer0": 0.0,
1101
+ "input_norm/layer1": 0.0,
1102
+ "input_norm/layer2": 0.0,
1103
+ "input_norm/layer3": 0.0,
1104
+ "learning_rate": 0.001,
1105
+ "loss": 1.2763,
1106
+ "max_norm": 23.506425857543945,
1107
+ "max_norm/layer0": 14.451132774353027,
1108
+ "max_norm/layer1": 17.121503829956055,
1109
+ "max_norm/layer2": 20.906761169433594,
1110
+ "max_norm/layer3": 23.506425857543945,
1111
+ "mean_norm": 8.181086376309395,
1112
+ "mean_norm/layer0": 8.458472549915314,
1113
+ "mean_norm/layer1": 8.140588343143463,
1114
+ "mean_norm/layer2": 7.84406965970993,
1115
+ "mean_norm/layer3": 8.281214952468872,
1116
+ "multicode_k": 1,
1117
+ "output_norm": 0.0,
1118
+ "output_norm/layer0": 0.0,
1119
+ "output_norm/layer1": 0.0,
1120
+ "output_norm/layer2": 0.0,
1121
+ "output_norm/layer3": 0.0,
1122
+ "step": 8500
1123
+ },
1124
+ {
1125
+ "epoch": 0.42,
1126
+ "eval_MSE/layer0": 0.0,
1127
+ "eval_MSE/layer1": 0.0,
1128
+ "eval_MSE/layer2": 0.0,
1129
+ "eval_MSE/layer3": 0.0,
1130
+ "eval_accuracy": 0.45069200410617616,
1131
+ "eval_dead_code_fraction/layer0": 1.0,
1132
+ "eval_dead_code_fraction/layer1": 1.0,
1133
+ "eval_dead_code_fraction/layer2": 1.0,
1134
+ "eval_dead_code_fraction/layer3": 1.0,
1135
+ "eval_first_transition_accuracy": 0.94,
1136
+ "eval_input_norm/layer0": 0.0,
1137
+ "eval_input_norm/layer1": 0.0,
1138
+ "eval_input_norm/layer2": 0.0,
1139
+ "eval_input_norm/layer3": 0.0,
1140
+ "eval_loss": 1.2800538539886475,
1141
+ "eval_multicode_k": 1,
1142
+ "eval_output_norm/layer0": 0.0,
1143
+ "eval_output_norm/layer1": 0.0,
1144
+ "eval_output_norm/layer2": 0.0,
1145
+ "eval_output_norm/layer3": 0.0,
1146
+ "eval_runtime": 40.8942,
1147
+ "eval_samples_per_second": 801.287,
1148
+ "eval_steps_per_second": 1.565,
1149
+ "eval_transition_accuracy": 0.582741935483871,
1150
+ "step": 8500
1151
+ },
1152
+ {
1153
+ "MSE": 0.0,
1154
+ "MSE/layer0": 0.0,
1155
+ "MSE/layer1": 0.0,
1156
+ "MSE/layer2": 0.0,
1157
+ "MSE/layer3": 0.0,
1158
+ "dead_code_fraction": 1.0,
1159
+ "dead_code_fraction/layer0": 1.0,
1160
+ "dead_code_fraction/layer1": 1.0,
1161
+ "dead_code_fraction/layer2": 1.0,
1162
+ "dead_code_fraction/layer3": 1.0,
1163
+ "epoch": 0.45,
1164
+ "input_norm": 0.0,
1165
+ "input_norm/layer0": 0.0,
1166
+ "input_norm/layer1": 0.0,
1167
+ "input_norm/layer2": 0.0,
1168
+ "input_norm/layer3": 0.0,
1169
+ "learning_rate": 0.001,
1170
+ "loss": 1.2755,
1171
+ "max_norm": 25.07321548461914,
1172
+ "max_norm/layer0": 14.874269485473633,
1173
+ "max_norm/layer1": 17.39398956298828,
1174
+ "max_norm/layer2": 21.67055892944336,
1175
+ "max_norm/layer3": 25.07321548461914,
1176
+ "mean_norm": 8.188907638192177,
1177
+ "mean_norm/layer0": 8.458899140357971,
1178
+ "mean_norm/layer1": 8.143711388111115,
1179
+ "mean_norm/layer2": 7.863752484321594,
1180
+ "mean_norm/layer3": 8.289267539978027,
1181
+ "multicode_k": 1,
1182
+ "output_norm": 0.0,
1183
+ "output_norm/layer0": 0.0,
1184
+ "output_norm/layer1": 0.0,
1185
+ "output_norm/layer2": 0.0,
1186
+ "output_norm/layer3": 0.0,
1187
+ "step": 9000
1188
+ },
1189
+ {
1190
+ "epoch": 0.45,
1191
+ "eval_MSE/layer0": 0.0,
1192
+ "eval_MSE/layer1": 0.0,
1193
+ "eval_MSE/layer2": 0.0,
1194
+ "eval_MSE/layer3": 0.0,
1195
+ "eval_accuracy": 0.4516255536417323,
1196
+ "eval_dead_code_fraction/layer0": 1.0,
1197
+ "eval_dead_code_fraction/layer1": 1.0,
1198
+ "eval_dead_code_fraction/layer2": 1.0,
1199
+ "eval_dead_code_fraction/layer3": 1.0,
1200
+ "eval_first_transition_accuracy": 0.9,
1201
+ "eval_input_norm/layer0": 0.0,
1202
+ "eval_input_norm/layer1": 0.0,
1203
+ "eval_input_norm/layer2": 0.0,
1204
+ "eval_input_norm/layer3": 0.0,
1205
+ "eval_loss": 1.2754778861999512,
1206
+ "eval_multicode_k": 1,
1207
+ "eval_output_norm/layer0": 0.0,
1208
+ "eval_output_norm/layer1": 0.0,
1209
+ "eval_output_norm/layer2": 0.0,
1210
+ "eval_output_norm/layer3": 0.0,
1211
+ "eval_runtime": 40.3576,
1212
+ "eval_samples_per_second": 811.942,
1213
+ "eval_steps_per_second": 1.586,
1214
+ "eval_transition_accuracy": 0.5764516129032258,
1215
+ "step": 9000
1216
+ },
1217
+ {
1218
+ "MSE": 0.0,
1219
+ "MSE/layer0": 0.0,
1220
+ "MSE/layer1": 0.0,
1221
+ "MSE/layer2": 0.0,
1222
+ "MSE/layer3": 0.0,
1223
+ "dead_code_fraction": 1.0,
1224
+ "dead_code_fraction/layer0": 1.0,
1225
+ "dead_code_fraction/layer1": 1.0,
1226
+ "dead_code_fraction/layer2": 1.0,
1227
+ "dead_code_fraction/layer3": 1.0,
1228
+ "epoch": 0.47,
1229
+ "input_norm": 0.0,
1230
+ "input_norm/layer0": 0.0,
1231
+ "input_norm/layer1": 0.0,
1232
+ "input_norm/layer2": 0.0,
1233
+ "input_norm/layer3": 0.0,
1234
+ "learning_rate": 0.001,
1235
+ "loss": 1.2746,
1236
+ "max_norm": 26.98938751220703,
1237
+ "max_norm/layer0": 15.152251243591309,
1238
+ "max_norm/layer1": 17.66726303100586,
1239
+ "max_norm/layer2": 22.432802200317383,
1240
+ "max_norm/layer3": 26.98938751220703,
1241
+ "mean_norm": 8.197539746761322,
1242
+ "mean_norm/layer0": 8.4593066573143,
1243
+ "mean_norm/layer1": 8.148526132106781,
1244
+ "mean_norm/layer2": 7.884801626205444,
1245
+ "mean_norm/layer3": 8.297524571418762,
1246
+ "multicode_k": 1,
1247
+ "output_norm": 0.0,
1248
+ "output_norm/layer0": 0.0,
1249
+ "output_norm/layer1": 0.0,
1250
+ "output_norm/layer2": 0.0,
1251
+ "output_norm/layer3": 0.0,
1252
+ "step": 9500
1253
+ },
1254
+ {
1255
+ "epoch": 0.47,
1256
+ "eval_MSE/layer0": 0.0,
1257
+ "eval_MSE/layer1": 0.0,
1258
+ "eval_MSE/layer2": 0.0,
1259
+ "eval_MSE/layer3": 0.0,
1260
+ "eval_accuracy": 0.45229261503444884,
1261
+ "eval_dead_code_fraction/layer0": 1.0,
1262
+ "eval_dead_code_fraction/layer1": 1.0,
1263
+ "eval_dead_code_fraction/layer2": 1.0,
1264
+ "eval_dead_code_fraction/layer3": 1.0,
1265
+ "eval_first_transition_accuracy": 0.9,
1266
+ "eval_input_norm/layer0": 0.0,
1267
+ "eval_input_norm/layer1": 0.0,
1268
+ "eval_input_norm/layer2": 0.0,
1269
+ "eval_input_norm/layer3": 0.0,
1270
+ "eval_loss": 1.2736179828643799,
1271
+ "eval_multicode_k": 1,
1272
+ "eval_output_norm/layer0": 0.0,
1273
+ "eval_output_norm/layer1": 0.0,
1274
+ "eval_output_norm/layer2": 0.0,
1275
+ "eval_output_norm/layer3": 0.0,
1276
+ "eval_runtime": 40.4635,
1277
+ "eval_samples_per_second": 809.817,
1278
+ "eval_steps_per_second": 1.582,
1279
+ "eval_transition_accuracy": 0.5864516129032258,
1280
+ "step": 9500
1281
+ },
1282
+ {
1283
+ "MSE": 0.0,
1284
+ "MSE/layer0": 0.0,
1285
+ "MSE/layer1": 0.0,
1286
+ "MSE/layer2": 0.0,
1287
+ "MSE/layer3": 0.0,
1288
+ "dead_code_fraction": 1.0,
1289
+ "dead_code_fraction/layer0": 1.0,
1290
+ "dead_code_fraction/layer1": 1.0,
1291
+ "dead_code_fraction/layer2": 1.0,
1292
+ "dead_code_fraction/layer3": 1.0,
1293
+ "epoch": 0.5,
1294
+ "input_norm": 0.0,
1295
+ "input_norm/layer0": 0.0,
1296
+ "input_norm/layer1": 0.0,
1297
+ "input_norm/layer2": 0.0,
1298
+ "input_norm/layer3": 0.0,
1299
+ "learning_rate": 0.001,
1300
+ "loss": 1.2734,
1301
+ "max_norm": 28.549026489257812,
1302
+ "max_norm/layer0": 15.429606437683105,
1303
+ "max_norm/layer1": 17.9191837310791,
1304
+ "max_norm/layer2": 23.421247482299805,
1305
+ "max_norm/layer3": 28.549026489257812,
1306
+ "mean_norm": 8.206039026379585,
1307
+ "mean_norm/layer0": 8.459603905677795,
1308
+ "mean_norm/layer1": 8.153472065925598,
1309
+ "mean_norm/layer2": 7.905435502529144,
1310
+ "mean_norm/layer3": 8.305644631385803,
1311
+ "multicode_k": 1,
1312
+ "output_norm": 0.0,
1313
+ "output_norm/layer0": 0.0,
1314
+ "output_norm/layer1": 0.0,
1315
+ "output_norm/layer2": 0.0,
1316
+ "output_norm/layer3": 0.0,
1317
+ "step": 10000
1318
+ },
1319
+ {
1320
+ "epoch": 0.5,
1321
+ "eval_MSE/layer0": 0.0,
1322
+ "eval_MSE/layer1": 0.0,
1323
+ "eval_MSE/layer2": 0.0,
1324
+ "eval_MSE/layer3": 0.0,
1325
+ "eval_accuracy": 0.4518793061023622,
1326
+ "eval_dead_code_fraction/layer0": 1.0,
1327
+ "eval_dead_code_fraction/layer1": 1.0,
1328
+ "eval_dead_code_fraction/layer2": 1.0,
1329
+ "eval_dead_code_fraction/layer3": 1.0,
1330
+ "eval_first_transition_accuracy": 0.91,
1331
+ "eval_input_norm/layer0": 0.0,
1332
+ "eval_input_norm/layer1": 0.0,
1333
+ "eval_input_norm/layer2": 0.0,
1334
+ "eval_input_norm/layer3": 0.0,
1335
+ "eval_loss": 1.2739558219909668,
1336
+ "eval_multicode_k": 1,
1337
+ "eval_output_norm/layer0": 0.0,
1338
+ "eval_output_norm/layer1": 0.0,
1339
+ "eval_output_norm/layer2": 0.0,
1340
+ "eval_output_norm/layer3": 0.0,
1341
+ "eval_runtime": 40.3957,
1342
+ "eval_samples_per_second": 811.175,
1343
+ "eval_steps_per_second": 1.584,
1344
+ "eval_transition_accuracy": 0.5779032258064516,
1345
+ "step": 10000
1346
+ },
1347
+ {
1348
+ "MSE": 0.0,
1349
+ "MSE/layer0": 0.0,
1350
+ "MSE/layer1": 0.0,
1351
+ "MSE/layer2": 0.0,
1352
+ "MSE/layer3": 0.0,
1353
+ "dead_code_fraction": 1.0,
1354
+ "dead_code_fraction/layer0": 1.0,
1355
+ "dead_code_fraction/layer1": 1.0,
1356
+ "dead_code_fraction/layer2": 1.0,
1357
+ "dead_code_fraction/layer3": 1.0,
1358
+ "epoch": 0.53,
1359
+ "input_norm": 0.0,
1360
+ "input_norm/layer0": 0.0,
1361
+ "input_norm/layer1": 0.0,
1362
+ "input_norm/layer2": 0.0,
1363
+ "input_norm/layer3": 0.0,
1364
+ "learning_rate": 0.001,
1365
+ "loss": 1.2732,
1366
+ "max_norm": 30.3062801361084,
1367
+ "max_norm/layer0": 15.69118595123291,
1368
+ "max_norm/layer1": 18.259578704833984,
1369
+ "max_norm/layer2": 24.377281188964844,
1370
+ "max_norm/layer3": 30.3062801361084,
1371
+ "mean_norm": 8.214424923062325,
1372
+ "mean_norm/layer0": 8.459942817687988,
1373
+ "mean_norm/layer1": 8.156991243362427,
1374
+ "mean_norm/layer2": 7.927173614501953,
1375
+ "mean_norm/layer3": 8.31359201669693,
1376
+ "multicode_k": 1,
1377
+ "output_norm": 0.0,
1378
+ "output_norm/layer0": 0.0,
1379
+ "output_norm/layer1": 0.0,
1380
+ "output_norm/layer2": 0.0,
1381
+ "output_norm/layer3": 0.0,
1382
+ "step": 10500
1383
+ },
1384
+ {
1385
+ "epoch": 0.53,
1386
+ "eval_MSE/layer0": 0.0,
1387
+ "eval_MSE/layer1": 0.0,
1388
+ "eval_MSE/layer2": 0.0,
1389
+ "eval_MSE/layer3": 0.0,
1390
+ "eval_accuracy": 0.45159791961429624,
1391
+ "eval_dead_code_fraction/layer0": 1.0,
1392
+ "eval_dead_code_fraction/layer1": 1.0,
1393
+ "eval_dead_code_fraction/layer2": 1.0,
1394
+ "eval_dead_code_fraction/layer3": 1.0,
1395
+ "eval_first_transition_accuracy": 0.89,
1396
+ "eval_input_norm/layer0": 0.0,
1397
+ "eval_input_norm/layer1": 0.0,
1398
+ "eval_input_norm/layer2": 0.0,
1399
+ "eval_input_norm/layer3": 0.0,
1400
+ "eval_loss": 1.2743829488754272,
1401
+ "eval_multicode_k": 1,
1402
+ "eval_output_norm/layer0": 0.0,
1403
+ "eval_output_norm/layer1": 0.0,
1404
+ "eval_output_norm/layer2": 0.0,
1405
+ "eval_output_norm/layer3": 0.0,
1406
+ "eval_runtime": 40.5803,
1407
+ "eval_samples_per_second": 807.485,
1408
+ "eval_steps_per_second": 1.577,
1409
+ "eval_transition_accuracy": 0.5879032258064516,
1410
+ "step": 10500
1411
+ },
1412
+ {
1413
+ "MSE": 0.0,
1414
+ "MSE/layer0": 0.0,
1415
+ "MSE/layer1": 0.0,
1416
+ "MSE/layer2": 0.0,
1417
+ "MSE/layer3": 0.0,
1418
+ "dead_code_fraction": 1.0,
1419
+ "dead_code_fraction/layer0": 1.0,
1420
+ "dead_code_fraction/layer1": 1.0,
1421
+ "dead_code_fraction/layer2": 1.0,
1422
+ "dead_code_fraction/layer3": 1.0,
1423
+ "epoch": 0.55,
1424
+ "input_norm": 0.0,
1425
+ "input_norm/layer0": 0.0,
1426
+ "input_norm/layer1": 0.0,
1427
+ "input_norm/layer2": 0.0,
1428
+ "input_norm/layer3": 0.0,
1429
+ "learning_rate": 0.001,
1430
+ "loss": 1.2723,
1431
+ "max_norm": 31.752639770507812,
1432
+ "max_norm/layer0": 16.03121566772461,
1433
+ "max_norm/layer1": 18.51296043395996,
1434
+ "max_norm/layer2": 25.478057861328125,
1435
+ "max_norm/layer3": 31.752639770507812,
1436
+ "mean_norm": 8.222758993506432,
1437
+ "mean_norm/layer0": 8.460269570350647,
1438
+ "mean_norm/layer1": 8.1603884100914,
1439
+ "mean_norm/layer2": 7.949049711227417,
1440
+ "mean_norm/layer3": 8.321328282356262,
1441
+ "multicode_k": 1,
1442
+ "output_norm": 0.0,
1443
+ "output_norm/layer0": 0.0,
1444
+ "output_norm/layer1": 0.0,
1445
+ "output_norm/layer2": 0.0,
1446
+ "output_norm/layer3": 0.0,
1447
+ "step": 11000
1448
+ },
1449
+ {
1450
+ "epoch": 0.55,
1451
+ "eval_MSE/layer0": 0.0,
1452
+ "eval_MSE/layer1": 0.0,
1453
+ "eval_MSE/layer2": 0.0,
1454
+ "eval_MSE/layer3": 0.0,
1455
+ "eval_accuracy": 0.45252594234436516,
1456
+ "eval_dead_code_fraction/layer0": 1.0,
1457
+ "eval_dead_code_fraction/layer1": 1.0,
1458
+ "eval_dead_code_fraction/layer2": 1.0,
1459
+ "eval_dead_code_fraction/layer3": 1.0,
1460
+ "eval_first_transition_accuracy": 0.89,
1461
+ "eval_input_norm/layer0": 0.0,
1462
+ "eval_input_norm/layer1": 0.0,
1463
+ "eval_input_norm/layer2": 0.0,
1464
+ "eval_input_norm/layer3": 0.0,
1465
+ "eval_loss": 1.269043207168579,
1466
+ "eval_multicode_k": 1,
1467
+ "eval_output_norm/layer0": 0.0,
1468
+ "eval_output_norm/layer1": 0.0,
1469
+ "eval_output_norm/layer2": 0.0,
1470
+ "eval_output_norm/layer3": 0.0,
1471
+ "eval_runtime": 40.1138,
1472
+ "eval_samples_per_second": 816.875,
1473
+ "eval_steps_per_second": 1.595,
1474
+ "eval_transition_accuracy": 0.5811290322580646,
1475
+ "step": 11000
1476
+ },
1477
+ {
1478
+ "MSE": 0.0,
1479
+ "MSE/layer0": 0.0,
1480
+ "MSE/layer1": 0.0,
1481
+ "MSE/layer2": 0.0,
1482
+ "MSE/layer3": 0.0,
1483
+ "dead_code_fraction": 1.0,
1484
+ "dead_code_fraction/layer0": 1.0,
1485
+ "dead_code_fraction/layer1": 1.0,
1486
+ "dead_code_fraction/layer2": 1.0,
1487
+ "dead_code_fraction/layer3": 1.0,
1488
+ "epoch": 0.57,
1489
+ "input_norm": 0.0,
1490
+ "input_norm/layer0": 0.0,
1491
+ "input_norm/layer1": 0.0,
1492
+ "input_norm/layer2": 0.0,
1493
+ "input_norm/layer3": 0.0,
1494
+ "learning_rate": 0.001,
1495
+ "loss": 1.2712,
1496
+ "max_norm": 33.44036102294922,
1497
+ "max_norm/layer0": 16.178028106689453,
1498
+ "max_norm/layer1": 18.79450798034668,
1499
+ "max_norm/layer2": 26.372129440307617,
1500
+ "max_norm/layer3": 33.44036102294922,
1501
+ "mean_norm": 8.230609133839607,
1502
+ "mean_norm/layer0": 8.460545778274536,
1503
+ "mean_norm/layer1": 8.163512825965881,
1504
+ "mean_norm/layer2": 7.969985008239746,
1505
+ "mean_norm/layer3": 8.328392922878265,
1506
+ "multicode_k": 1,
1507
+ "output_norm": 0.0,
1508
+ "output_norm/layer0": 0.0,
1509
+ "output_norm/layer1": 0.0,
1510
+ "output_norm/layer2": 0.0,
1511
+ "output_norm/layer3": 0.0,
1512
+ "step": 11500
1513
+ },
1514
+ {
1515
+ "epoch": 0.57,
1516
+ "eval_MSE/layer0": 0.0,
1517
+ "eval_MSE/layer1": 0.0,
1518
+ "eval_MSE/layer2": 0.0,
1519
+ "eval_MSE/layer3": 0.0,
1520
+ "eval_accuracy": 0.45260740265132876,
1521
+ "eval_dead_code_fraction/layer0": 1.0,
1522
+ "eval_dead_code_fraction/layer1": 1.0,
1523
+ "eval_dead_code_fraction/layer2": 1.0,
1524
+ "eval_dead_code_fraction/layer3": 1.0,
1525
+ "eval_first_transition_accuracy": 0.93,
1526
+ "eval_input_norm/layer0": 0.0,
1527
+ "eval_input_norm/layer1": 0.0,
1528
+ "eval_input_norm/layer2": 0.0,
1529
+ "eval_input_norm/layer3": 0.0,
1530
+ "eval_loss": 1.2705051898956299,
1531
+ "eval_multicode_k": 1,
1532
+ "eval_output_norm/layer0": 0.0,
1533
+ "eval_output_norm/layer1": 0.0,
1534
+ "eval_output_norm/layer2": 0.0,
1535
+ "eval_output_norm/layer3": 0.0,
1536
+ "eval_runtime": 40.2988,
1537
+ "eval_samples_per_second": 813.126,
1538
+ "eval_steps_per_second": 1.588,
1539
+ "eval_transition_accuracy": 0.5779032258064516,
1540
+ "step": 11500
1541
+ },
1542
+ {
1543
+ "MSE": 0.0,
1544
+ "MSE/layer0": 0.0,
1545
+ "MSE/layer1": 0.0,
1546
+ "MSE/layer2": 0.0,
1547
+ "MSE/layer3": 0.0,
1548
+ "dead_code_fraction": 1.0,
1549
+ "dead_code_fraction/layer0": 1.0,
1550
+ "dead_code_fraction/layer1": 1.0,
1551
+ "dead_code_fraction/layer2": 1.0,
1552
+ "dead_code_fraction/layer3": 1.0,
1553
+ "epoch": 0.6,
1554
+ "input_norm": 0.0,
1555
+ "input_norm/layer0": 0.0,
1556
+ "input_norm/layer1": 0.0,
1557
+ "input_norm/layer2": 0.0,
1558
+ "input_norm/layer3": 0.0,
1559
+ "learning_rate": 0.001,
1560
+ "loss": 1.2716,
1561
+ "max_norm": 35.007076263427734,
1562
+ "max_norm/layer0": 16.497554779052734,
1563
+ "max_norm/layer1": 18.986597061157227,
1564
+ "max_norm/layer2": 27.318687438964844,
1565
+ "max_norm/layer3": 35.007076263427734,
1566
+ "mean_norm": 8.238363325595856,
1567
+ "mean_norm/layer0": 8.460874915122986,
1568
+ "mean_norm/layer1": 8.166603803634644,
1569
+ "mean_norm/layer2": 7.990897297859192,
1570
+ "mean_norm/layer3": 8.335077285766602,
1571
+ "multicode_k": 1,
1572
+ "output_norm": 0.0,
1573
+ "output_norm/layer0": 0.0,
1574
+ "output_norm/layer1": 0.0,
1575
+ "output_norm/layer2": 0.0,
1576
+ "output_norm/layer3": 0.0,
1577
+ "step": 12000
1578
+ },
1579
+ {
1580
+ "epoch": 0.6,
1581
+ "eval_MSE/layer0": 0.0,
1582
+ "eval_MSE/layer1": 0.0,
1583
+ "eval_MSE/layer2": 0.0,
1584
+ "eval_MSE/layer3": 0.0,
1585
+ "eval_accuracy": 0.4526636318897638,
1586
+ "eval_dead_code_fraction/layer0": 1.0,
1587
+ "eval_dead_code_fraction/layer1": 1.0,
1588
+ "eval_dead_code_fraction/layer2": 1.0,
1589
+ "eval_dead_code_fraction/layer3": 1.0,
1590
+ "eval_first_transition_accuracy": 0.89,
1591
+ "eval_input_norm/layer0": 0.0,
1592
+ "eval_input_norm/layer1": 0.0,
1593
+ "eval_input_norm/layer2": 0.0,
1594
+ "eval_input_norm/layer3": 0.0,
1595
+ "eval_loss": 1.2700704336166382,
1596
+ "eval_multicode_k": 1,
1597
+ "eval_output_norm/layer0": 0.0,
1598
+ "eval_output_norm/layer1": 0.0,
1599
+ "eval_output_norm/layer2": 0.0,
1600
+ "eval_output_norm/layer3": 0.0,
1601
+ "eval_runtime": 40.2954,
1602
+ "eval_samples_per_second": 813.195,
1603
+ "eval_steps_per_second": 1.588,
1604
+ "eval_transition_accuracy": 0.5759677419354838,
1605
+ "step": 12000
1606
+ },
1607
+ {
1608
+ "MSE": 0.0,
1609
+ "MSE/layer0": 0.0,
1610
+ "MSE/layer1": 0.0,
1611
+ "MSE/layer2": 0.0,
1612
+ "MSE/layer3": 0.0,
1613
+ "dead_code_fraction": 1.0,
1614
+ "dead_code_fraction/layer0": 1.0,
1615
+ "dead_code_fraction/layer1": 1.0,
1616
+ "dead_code_fraction/layer2": 1.0,
1617
+ "dead_code_fraction/layer3": 1.0,
1618
+ "epoch": 0.62,
1619
+ "input_norm": 0.0,
1620
+ "input_norm/layer0": 0.0,
1621
+ "input_norm/layer1": 0.0,
1622
+ "input_norm/layer2": 0.0,
1623
+ "input_norm/layer3": 0.0,
1624
+ "learning_rate": 0.001,
1625
+ "loss": 1.2708,
1626
+ "max_norm": 36.530174255371094,
1627
+ "max_norm/layer0": 16.774707794189453,
1628
+ "max_norm/layer1": 19.286115646362305,
1629
+ "max_norm/layer2": 28.3411808013916,
1630
+ "max_norm/layer3": 36.530174255371094,
1631
+ "mean_norm": 8.246006086468697,
1632
+ "mean_norm/layer0": 8.461170196533203,
1633
+ "mean_norm/layer1": 8.169004082679749,
1634
+ "mean_norm/layer2": 8.012158274650574,
1635
+ "mean_norm/layer3": 8.341691792011261,
1636
+ "multicode_k": 1,
1637
+ "output_norm": 0.0,
1638
+ "output_norm/layer0": 0.0,
1639
+ "output_norm/layer1": 0.0,
1640
+ "output_norm/layer2": 0.0,
1641
+ "output_norm/layer3": 0.0,
1642
+ "step": 12500
1643
+ },
1644
+ {
1645
+ "epoch": 0.62,
1646
+ "eval_MSE/layer0": 0.0,
1647
+ "eval_MSE/layer1": 0.0,
1648
+ "eval_MSE/layer2": 0.0,
1649
+ "eval_MSE/layer3": 0.0,
1650
+ "eval_accuracy": 0.45218808632197344,
1651
+ "eval_dead_code_fraction/layer0": 1.0,
1652
+ "eval_dead_code_fraction/layer1": 1.0,
1653
+ "eval_dead_code_fraction/layer2": 1.0,
1654
+ "eval_dead_code_fraction/layer3": 1.0,
1655
+ "eval_first_transition_accuracy": 0.95,
1656
+ "eval_input_norm/layer0": 0.0,
1657
+ "eval_input_norm/layer1": 0.0,
1658
+ "eval_input_norm/layer2": 0.0,
1659
+ "eval_input_norm/layer3": 0.0,
1660
+ "eval_loss": 1.2715603113174438,
1661
+ "eval_multicode_k": 1,
1662
+ "eval_output_norm/layer0": 0.0,
1663
+ "eval_output_norm/layer1": 0.0,
1664
+ "eval_output_norm/layer2": 0.0,
1665
+ "eval_output_norm/layer3": 0.0,
1666
+ "eval_runtime": 40.6419,
1667
+ "eval_samples_per_second": 806.261,
1668
+ "eval_steps_per_second": 1.575,
1669
+ "eval_transition_accuracy": 0.5485483870967742,
1670
+ "step": 12500
1671
+ },
1672
+ {
1673
+ "MSE": 0.0,
1674
+ "MSE/layer0": 0.0,
1675
+ "MSE/layer1": 0.0,
1676
+ "MSE/layer2": 0.0,
1677
+ "MSE/layer3": 0.0,
1678
+ "dead_code_fraction": 1.0,
1679
+ "dead_code_fraction/layer0": 1.0,
1680
+ "dead_code_fraction/layer1": 1.0,
1681
+ "dead_code_fraction/layer2": 1.0,
1682
+ "dead_code_fraction/layer3": 1.0,
1683
+ "epoch": 0.65,
1684
+ "input_norm": 0.0,
1685
+ "input_norm/layer0": 0.0,
1686
+ "input_norm/layer1": 0.0,
1687
+ "input_norm/layer2": 0.0,
1688
+ "input_norm/layer3": 0.0,
1689
+ "learning_rate": 0.001,
1690
+ "loss": 1.2705,
1691
+ "max_norm": 38.14279556274414,
1692
+ "max_norm/layer0": 16.91411590576172,
1693
+ "max_norm/layer1": 19.68568229675293,
1694
+ "max_norm/layer2": 29.167861938476562,
1695
+ "max_norm/layer3": 38.14279556274414,
1696
+ "mean_norm": 8.253673061728477,
1697
+ "mean_norm/layer0": 8.461421430110931,
1698
+ "mean_norm/layer1": 8.171639680862427,
1699
+ "mean_norm/layer2": 8.033495247364044,
1700
+ "mean_norm/layer3": 8.348135888576508,
1701
+ "multicode_k": 1,
1702
+ "output_norm": 0.0,
1703
+ "output_norm/layer0": 0.0,
1704
+ "output_norm/layer1": 0.0,
1705
+ "output_norm/layer2": 0.0,
1706
+ "output_norm/layer3": 0.0,
1707
+ "step": 13000
1708
+ },
1709
+ {
1710
+ "epoch": 0.65,
1711
+ "eval_MSE/layer0": 0.0,
1712
+ "eval_MSE/layer1": 0.0,
1713
+ "eval_MSE/layer2": 0.0,
1714
+ "eval_MSE/layer3": 0.0,
1715
+ "eval_accuracy": 0.4528599536325049,
1716
+ "eval_dead_code_fraction/layer0": 1.0,
1717
+ "eval_dead_code_fraction/layer1": 1.0,
1718
+ "eval_dead_code_fraction/layer2": 1.0,
1719
+ "eval_dead_code_fraction/layer3": 1.0,
1720
+ "eval_first_transition_accuracy": 0.93,
1721
+ "eval_input_norm/layer0": 0.0,
1722
+ "eval_input_norm/layer1": 0.0,
1723
+ "eval_input_norm/layer2": 0.0,
1724
+ "eval_input_norm/layer3": 0.0,
1725
+ "eval_loss": 1.2675950527191162,
1726
+ "eval_multicode_k": 1,
1727
+ "eval_output_norm/layer0": 0.0,
1728
+ "eval_output_norm/layer1": 0.0,
1729
+ "eval_output_norm/layer2": 0.0,
1730
+ "eval_output_norm/layer3": 0.0,
1731
+ "eval_runtime": 40.2422,
1732
+ "eval_samples_per_second": 814.27,
1733
+ "eval_steps_per_second": 1.59,
1734
+ "eval_transition_accuracy": 0.5733870967741935,
1735
+ "step": 13000
1736
+ },
1737
+ {
1738
+ "MSE": 0.0,
1739
+ "MSE/layer0": 0.0,
1740
+ "MSE/layer1": 0.0,
1741
+ "MSE/layer2": 0.0,
1742
+ "MSE/layer3": 0.0,
1743
+ "dead_code_fraction": 1.0,
1744
+ "dead_code_fraction/layer0": 1.0,
1745
+ "dead_code_fraction/layer1": 1.0,
1746
+ "dead_code_fraction/layer2": 1.0,
1747
+ "dead_code_fraction/layer3": 1.0,
1748
+ "epoch": 0.68,
1749
+ "input_norm": 0.0,
1750
+ "input_norm/layer0": 0.0,
1751
+ "input_norm/layer1": 0.0,
1752
+ "input_norm/layer2": 0.0,
1753
+ "input_norm/layer3": 0.0,
1754
+ "learning_rate": 0.001,
1755
+ "loss": 1.2696,
1756
+ "max_norm": 39.6273307800293,
1757
+ "max_norm/layer0": 17.14823341369629,
1758
+ "max_norm/layer1": 19.711994171142578,
1759
+ "max_norm/layer2": 30.05324363708496,
1760
+ "max_norm/layer3": 39.6273307800293,
1761
+ "mean_norm": 8.261498123407364,
1762
+ "mean_norm/layer0": 8.46165120601654,
1763
+ "mean_norm/layer1": 8.17492812871933,
1764
+ "mean_norm/layer2": 8.055188477039337,
1765
+ "mean_norm/layer3": 8.354224681854248,
1766
+ "multicode_k": 1,
1767
+ "output_norm": 0.0,
1768
+ "output_norm/layer0": 0.0,
1769
+ "output_norm/layer1": 0.0,
1770
+ "output_norm/layer2": 0.0,
1771
+ "output_norm/layer3": 0.0,
1772
+ "step": 13500
1773
+ },
1774
+ {
1775
+ "epoch": 0.68,
1776
+ "eval_MSE/layer0": 0.0,
1777
+ "eval_MSE/layer1": 0.0,
1778
+ "eval_MSE/layer2": 0.0,
1779
+ "eval_MSE/layer3": 0.0,
1780
+ "eval_accuracy": 0.4518833911325049,
1781
+ "eval_dead_code_fraction/layer0": 1.0,
1782
+ "eval_dead_code_fraction/layer1": 1.0,
1783
+ "eval_dead_code_fraction/layer2": 1.0,
1784
+ "eval_dead_code_fraction/layer3": 1.0,
1785
+ "eval_first_transition_accuracy": 0.91,
1786
+ "eval_input_norm/layer0": 0.0,
1787
+ "eval_input_norm/layer1": 0.0,
1788
+ "eval_input_norm/layer2": 0.0,
1789
+ "eval_input_norm/layer3": 0.0,
1790
+ "eval_loss": 1.2716896533966064,
1791
+ "eval_multicode_k": 1,
1792
+ "eval_output_norm/layer0": 0.0,
1793
+ "eval_output_norm/layer1": 0.0,
1794
+ "eval_output_norm/layer2": 0.0,
1795
+ "eval_output_norm/layer3": 0.0,
1796
+ "eval_runtime": 40.327,
1797
+ "eval_samples_per_second": 812.557,
1798
+ "eval_steps_per_second": 1.587,
1799
+ "eval_transition_accuracy": 0.5993548387096774,
1800
+ "step": 13500
1801
+ },
1802
+ {
1803
+ "MSE": 0.0,
1804
+ "MSE/layer0": 0.0,
1805
+ "MSE/layer1": 0.0,
1806
+ "MSE/layer2": 0.0,
1807
+ "MSE/layer3": 0.0,
1808
+ "dead_code_fraction": 1.0,
1809
+ "dead_code_fraction/layer0": 1.0,
1810
+ "dead_code_fraction/layer1": 1.0,
1811
+ "dead_code_fraction/layer2": 1.0,
1812
+ "dead_code_fraction/layer3": 1.0,
1813
+ "epoch": 0.7,
1814
+ "input_norm": 0.0,
1815
+ "input_norm/layer0": 0.0,
1816
+ "input_norm/layer1": 0.0,
1817
+ "input_norm/layer2": 0.0,
1818
+ "input_norm/layer3": 0.0,
1819
+ "learning_rate": 0.001,
1820
+ "loss": 1.2687,
1821
+ "max_norm": 41.1092643737793,
1822
+ "max_norm/layer0": 17.324541091918945,
1823
+ "max_norm/layer1": 19.974578857421875,
1824
+ "max_norm/layer2": 30.87788200378418,
1825
+ "max_norm/layer3": 41.1092643737793,
1826
+ "mean_norm": 8.26871033012867,
1827
+ "mean_norm/layer0": 8.46183955669403,
1828
+ "mean_norm/layer1": 8.177218735218048,
1829
+ "mean_norm/layer2": 8.076003432273865,
1830
+ "mean_norm/layer3": 8.359779596328735,
1831
+ "multicode_k": 1,
1832
+ "output_norm": 0.0,
1833
+ "output_norm/layer0": 0.0,
1834
+ "output_norm/layer1": 0.0,
1835
+ "output_norm/layer2": 0.0,
1836
+ "output_norm/layer3": 0.0,
1837
+ "step": 14000
1838
+ },
1839
+ {
1840
+ "epoch": 0.7,
1841
+ "eval_MSE/layer0": 0.0,
1842
+ "eval_MSE/layer1": 0.0,
1843
+ "eval_MSE/layer2": 0.0,
1844
+ "eval_MSE/layer3": 0.0,
1845
+ "eval_accuracy": 0.45239882581815943,
1846
+ "eval_dead_code_fraction/layer0": 1.0,
1847
+ "eval_dead_code_fraction/layer1": 1.0,
1848
+ "eval_dead_code_fraction/layer2": 1.0,
1849
+ "eval_dead_code_fraction/layer3": 1.0,
1850
+ "eval_first_transition_accuracy": 0.9,
1851
+ "eval_input_norm/layer0": 0.0,
1852
+ "eval_input_norm/layer1": 0.0,
1853
+ "eval_input_norm/layer2": 0.0,
1854
+ "eval_input_norm/layer3": 0.0,
1855
+ "eval_loss": 1.2687005996704102,
1856
+ "eval_multicode_k": 1,
1857
+ "eval_output_norm/layer0": 0.0,
1858
+ "eval_output_norm/layer1": 0.0,
1859
+ "eval_output_norm/layer2": 0.0,
1860
+ "eval_output_norm/layer3": 0.0,
1861
+ "eval_runtime": 40.5627,
1862
+ "eval_samples_per_second": 807.837,
1863
+ "eval_steps_per_second": 1.578,
1864
+ "eval_transition_accuracy": 0.5756451612903226,
1865
+ "step": 14000
1866
+ },
1867
+ {
1868
+ "MSE": 0.0,
1869
+ "MSE/layer0": 0.0,
1870
+ "MSE/layer1": 0.0,
1871
+ "MSE/layer2": 0.0,
1872
+ "MSE/layer3": 0.0,
1873
+ "dead_code_fraction": 1.0,
1874
+ "dead_code_fraction/layer0": 1.0,
1875
+ "dead_code_fraction/layer1": 1.0,
1876
+ "dead_code_fraction/layer2": 1.0,
1877
+ "dead_code_fraction/layer3": 1.0,
1878
+ "epoch": 0.72,
1879
+ "input_norm": 0.0,
1880
+ "input_norm/layer0": 0.0,
1881
+ "input_norm/layer1": 0.0,
1882
+ "input_norm/layer2": 0.0,
1883
+ "input_norm/layer3": 0.0,
1884
+ "learning_rate": 0.001,
1885
+ "loss": 1.2685,
1886
+ "max_norm": 42.766014099121094,
1887
+ "max_norm/layer0": 17.42141342163086,
1888
+ "max_norm/layer1": 20.071725845336914,
1889
+ "max_norm/layer2": 31.913164138793945,
1890
+ "max_norm/layer3": 42.766014099121094,
1891
+ "mean_norm": 8.276251748204231,
1892
+ "mean_norm/layer0": 8.46206510066986,
1893
+ "mean_norm/layer1": 8.179498374462128,
1894
+ "mean_norm/layer2": 8.097876787185669,
1895
+ "mean_norm/layer3": 8.365566730499268,
1896
+ "multicode_k": 1,
1897
+ "output_norm": 0.0,
1898
+ "output_norm/layer0": 0.0,
1899
+ "output_norm/layer1": 0.0,
1900
+ "output_norm/layer2": 0.0,
1901
+ "output_norm/layer3": 0.0,
1902
+ "step": 14500
1903
+ },
1904
+ {
1905
+ "epoch": 0.72,
1906
+ "eval_MSE/layer0": 0.0,
1907
+ "eval_MSE/layer1": 0.0,
1908
+ "eval_MSE/layer2": 0.0,
1909
+ "eval_MSE/layer3": 0.0,
1910
+ "eval_accuracy": 0.45213714359313484,
1911
+ "eval_dead_code_fraction/layer0": 1.0,
1912
+ "eval_dead_code_fraction/layer1": 1.0,
1913
+ "eval_dead_code_fraction/layer2": 1.0,
1914
+ "eval_dead_code_fraction/layer3": 1.0,
1915
+ "eval_first_transition_accuracy": 0.89,
1916
+ "eval_input_norm/layer0": 0.0,
1917
+ "eval_input_norm/layer1": 0.0,
1918
+ "eval_input_norm/layer2": 0.0,
1919
+ "eval_input_norm/layer3": 0.0,
1920
+ "eval_loss": 1.2709327936172485,
1921
+ "eval_multicode_k": 1,
1922
+ "eval_output_norm/layer0": 0.0,
1923
+ "eval_output_norm/layer1": 0.0,
1924
+ "eval_output_norm/layer2": 0.0,
1925
+ "eval_output_norm/layer3": 0.0,
1926
+ "eval_runtime": 40.2398,
1927
+ "eval_samples_per_second": 814.319,
1928
+ "eval_steps_per_second": 1.59,
1929
+ "eval_transition_accuracy": 0.612741935483871,
1930
+ "step": 14500
1931
+ },
1932
+ {
1933
+ "MSE": 0.0,
1934
+ "MSE/layer0": 0.0,
1935
+ "MSE/layer1": 0.0,
1936
+ "MSE/layer2": 0.0,
1937
+ "MSE/layer3": 0.0,
1938
+ "dead_code_fraction": 1.0,
1939
+ "dead_code_fraction/layer0": 1.0,
1940
+ "dead_code_fraction/layer1": 1.0,
1941
+ "dead_code_fraction/layer2": 1.0,
1942
+ "dead_code_fraction/layer3": 1.0,
1943
+ "epoch": 0.75,
1944
+ "input_norm": 0.0,
1945
+ "input_norm/layer0": 0.0,
1946
+ "input_norm/layer1": 0.0,
1947
+ "input_norm/layer2": 0.0,
1948
+ "input_norm/layer3": 0.0,
1949
+ "learning_rate": 0.001,
1950
+ "loss": 1.2685,
1951
+ "max_norm": 44.61077117919922,
1952
+ "max_norm/layer0": 17.665727615356445,
1953
+ "max_norm/layer1": 20.15743637084961,
1954
+ "max_norm/layer2": 32.966209411621094,
1955
+ "max_norm/layer3": 44.61077117919922,
1956
+ "mean_norm": 8.283790707588196,
1957
+ "mean_norm/layer0": 8.462257981300354,
1958
+ "mean_norm/layer1": 8.181303024291992,
1959
+ "mean_norm/layer2": 8.120182931423187,
1960
+ "mean_norm/layer3": 8.37141889333725,
1961
+ "multicode_k": 1,
1962
+ "output_norm": 0.0,
1963
+ "output_norm/layer0": 0.0,
1964
+ "output_norm/layer1": 0.0,
1965
+ "output_norm/layer2": 0.0,
1966
+ "output_norm/layer3": 0.0,
1967
+ "step": 15000
1968
+ },
1969
+ {
1970
+ "epoch": 0.75,
1971
+ "eval_MSE/layer0": 0.0,
1972
+ "eval_MSE/layer1": 0.0,
1973
+ "eval_MSE/layer2": 0.0,
1974
+ "eval_MSE/layer3": 0.0,
1975
+ "eval_accuracy": 0.4519187146284449,
1976
+ "eval_dead_code_fraction/layer0": 1.0,
1977
+ "eval_dead_code_fraction/layer1": 1.0,
1978
+ "eval_dead_code_fraction/layer2": 1.0,
1979
+ "eval_dead_code_fraction/layer3": 1.0,
1980
+ "eval_first_transition_accuracy": 0.91,
1981
+ "eval_input_norm/layer0": 0.0,
1982
+ "eval_input_norm/layer1": 0.0,
1983
+ "eval_input_norm/layer2": 0.0,
1984
+ "eval_input_norm/layer3": 0.0,
1985
+ "eval_loss": 1.2706036567687988,
1986
+ "eval_multicode_k": 1,
1987
+ "eval_output_norm/layer0": 0.0,
1988
+ "eval_output_norm/layer1": 0.0,
1989
+ "eval_output_norm/layer2": 0.0,
1990
+ "eval_output_norm/layer3": 0.0,
1991
+ "eval_runtime": 40.0952,
1992
+ "eval_samples_per_second": 817.256,
1993
+ "eval_steps_per_second": 1.596,
1994
+ "eval_transition_accuracy": 0.587258064516129,
1995
+ "step": 15000
1996
+ },
1997
+ {
1998
+ "MSE": 0.0,
1999
+ "MSE/layer0": 0.0,
2000
+ "MSE/layer1": 0.0,
2001
+ "MSE/layer2": 0.0,
2002
+ "MSE/layer3": 0.0,
2003
+ "dead_code_fraction": 1.0,
2004
+ "dead_code_fraction/layer0": 1.0,
2005
+ "dead_code_fraction/layer1": 1.0,
2006
+ "dead_code_fraction/layer2": 1.0,
2007
+ "dead_code_fraction/layer3": 1.0,
2008
+ "epoch": 0.78,
2009
+ "input_norm": 0.0,
2010
+ "input_norm/layer0": 0.0,
2011
+ "input_norm/layer1": 0.0,
2012
+ "input_norm/layer2": 0.0,
2013
+ "input_norm/layer3": 0.0,
2014
+ "learning_rate": 0.001,
2015
+ "loss": 1.2675,
2016
+ "max_norm": 46.33829879760742,
2017
+ "max_norm/layer0": 17.856664657592773,
2018
+ "max_norm/layer1": 20.084186553955078,
2019
+ "max_norm/layer2": 33.940242767333984,
2020
+ "max_norm/layer3": 46.33829879760742,
2021
+ "mean_norm": 8.291451185941696,
2022
+ "mean_norm/layer0": 8.462452054023743,
2023
+ "mean_norm/layer1": 8.18280303478241,
2024
+ "mean_norm/layer2": 8.143204748630524,
2025
+ "mean_norm/layer3": 8.377344906330109,
2026
+ "multicode_k": 1,
2027
+ "output_norm": 0.0,
2028
+ "output_norm/layer0": 0.0,
2029
+ "output_norm/layer1": 0.0,
2030
+ "output_norm/layer2": 0.0,
2031
+ "output_norm/layer3": 0.0,
2032
+ "step": 15500
2033
+ },
2034
+ {
2035
+ "epoch": 0.78,
2036
+ "eval_MSE/layer0": 0.0,
2037
+ "eval_MSE/layer1": 0.0,
2038
+ "eval_MSE/layer2": 0.0,
2039
+ "eval_MSE/layer3": 0.0,
2040
+ "eval_accuracy": 0.45268790177472934,
2041
+ "eval_dead_code_fraction/layer0": 1.0,
2042
+ "eval_dead_code_fraction/layer1": 1.0,
2043
+ "eval_dead_code_fraction/layer2": 1.0,
2044
+ "eval_dead_code_fraction/layer3": 1.0,
2045
+ "eval_first_transition_accuracy": 0.96,
2046
+ "eval_input_norm/layer0": 0.0,
2047
+ "eval_input_norm/layer1": 0.0,
2048
+ "eval_input_norm/layer2": 0.0,
2049
+ "eval_input_norm/layer3": 0.0,
2050
+ "eval_loss": 1.2690919637680054,
2051
+ "eval_multicode_k": 1,
2052
+ "eval_output_norm/layer0": 0.0,
2053
+ "eval_output_norm/layer1": 0.0,
2054
+ "eval_output_norm/layer2": 0.0,
2055
+ "eval_output_norm/layer3": 0.0,
2056
+ "eval_runtime": 39.8501,
2057
+ "eval_samples_per_second": 822.281,
2058
+ "eval_steps_per_second": 1.606,
2059
+ "eval_transition_accuracy": 0.6364516129032258,
2060
+ "step": 15500
2061
+ }
2062
+ ],
2063
+ "max_steps": 20000,
2064
+ "num_train_epochs": 9223372036854775807,
2065
+ "total_flos": 9712749772800000.0,
2066
+ "trial_name": null,
2067
+ "trial_params": null
2068
+ }
training_args.bin ADDED
Binary file (3.77 kB). View file
 
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0":0,"1":1,"2":2,"3":3,"4":4,"5":5,"6":6,"7":7,"8":8,"9":9,"<|endoftext|>":10}