taufeeque commited on
Commit
0b846e5
1 Parent(s): cf154c9
README.md CHANGED
The diff for this file is too large to render. See raw diff
 
all_results.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "MSE": 0.0,
3
+ "MSE/layer0": 0.0,
4
+ "MSE/layer1": 0.0,
5
+ "MSE/layer10": 0.0,
6
+ "MSE/layer11": 0.0,
7
+ "MSE/layer12": 0.0,
8
+ "MSE/layer13": 0.0,
9
+ "MSE/layer14": 0.0,
10
+ "MSE/layer15": 0.0,
11
+ "MSE/layer16": 0.0,
12
+ "MSE/layer17": 0.0,
13
+ "MSE/layer18": 0.0,
14
+ "MSE/layer19": 0.0,
15
+ "MSE/layer2": 0.0,
16
+ "MSE/layer20": 0.0,
17
+ "MSE/layer21": 0.0,
18
+ "MSE/layer22": 0.0,
19
+ "MSE/layer23": 0.0,
20
+ "MSE/layer3": 0.0,
21
+ "MSE/layer4": 0.0,
22
+ "MSE/layer5": 0.0,
23
+ "MSE/layer6": 0.0,
24
+ "MSE/layer7": 0.0,
25
+ "MSE/layer8": 0.0,
26
+ "MSE/layer9": 0.0,
27
+ "dead_code_fraction": 1.0,
28
+ "dead_code_fraction/layer0": 1.0,
29
+ "dead_code_fraction/layer1": 1.0,
30
+ "dead_code_fraction/layer10": 1.0,
31
+ "dead_code_fraction/layer11": 1.0,
32
+ "dead_code_fraction/layer12": 1.0,
33
+ "dead_code_fraction/layer13": 1.0,
34
+ "dead_code_fraction/layer14": 1.0,
35
+ "dead_code_fraction/layer15": 1.0,
36
+ "dead_code_fraction/layer16": 1.0,
37
+ "dead_code_fraction/layer17": 1.0,
38
+ "dead_code_fraction/layer18": 1.0,
39
+ "dead_code_fraction/layer19": 1.0,
40
+ "dead_code_fraction/layer2": 1.0,
41
+ "dead_code_fraction/layer20": 1.0,
42
+ "dead_code_fraction/layer21": 1.0,
43
+ "dead_code_fraction/layer22": 1.0,
44
+ "dead_code_fraction/layer23": 1.0,
45
+ "dead_code_fraction/layer3": 1.0,
46
+ "dead_code_fraction/layer4": 1.0,
47
+ "dead_code_fraction/layer5": 1.0,
48
+ "dead_code_fraction/layer6": 1.0,
49
+ "dead_code_fraction/layer7": 1.0,
50
+ "dead_code_fraction/layer8": 1.0,
51
+ "dead_code_fraction/layer9": 1.0,
52
+ "epoch": 6.26,
53
+ "input_norm": 0.0,
54
+ "input_norm/layer0": 0.0,
55
+ "input_norm/layer1": 0.0,
56
+ "input_norm/layer10": 0.0,
57
+ "input_norm/layer11": 0.0,
58
+ "input_norm/layer12": 0.0,
59
+ "input_norm/layer13": 0.0,
60
+ "input_norm/layer14": 0.0,
61
+ "input_norm/layer15": 0.0,
62
+ "input_norm/layer16": 0.0,
63
+ "input_norm/layer17": 0.0,
64
+ "input_norm/layer18": 0.0,
65
+ "input_norm/layer19": 0.0,
66
+ "input_norm/layer2": 0.0,
67
+ "input_norm/layer20": 0.0,
68
+ "input_norm/layer21": 0.0,
69
+ "input_norm/layer22": 0.0,
70
+ "input_norm/layer23": 0.0,
71
+ "input_norm/layer3": 0.0,
72
+ "input_norm/layer4": 0.0,
73
+ "input_norm/layer5": 0.0,
74
+ "input_norm/layer6": 0.0,
75
+ "input_norm/layer7": 0.0,
76
+ "input_norm/layer8": 0.0,
77
+ "input_norm/layer9": 0.0,
78
+ "max_norm": 45.539119720458984,
79
+ "max_norm/layer0": 34.44173049926758,
80
+ "max_norm/layer1": 36.61558151245117,
81
+ "max_norm/layer10": 38.54380416870117,
82
+ "max_norm/layer11": 34.865203857421875,
83
+ "max_norm/layer12": 40.908504486083984,
84
+ "max_norm/layer13": 35.78108215332031,
85
+ "max_norm/layer14": 36.67228317260742,
86
+ "max_norm/layer15": 45.083438873291016,
87
+ "max_norm/layer16": 36.927913665771484,
88
+ "max_norm/layer17": 45.539119720458984,
89
+ "max_norm/layer18": 39.2352409362793,
90
+ "max_norm/layer19": 38.779598236083984,
91
+ "max_norm/layer2": 26.836795806884766,
92
+ "max_norm/layer20": 38.50577163696289,
93
+ "max_norm/layer21": 38.87571334838867,
94
+ "max_norm/layer22": 39.42427062988281,
95
+ "max_norm/layer23": 37.21847915649414,
96
+ "max_norm/layer3": 34.34575271606445,
97
+ "max_norm/layer4": 34.4432258605957,
98
+ "max_norm/layer5": 44.077754974365234,
99
+ "max_norm/layer6": 28.6057071685791,
100
+ "max_norm/layer7": 37.91745376586914,
101
+ "max_norm/layer8": 36.69032287597656,
102
+ "max_norm/layer9": 37.08796691894531,
103
+ "mean_norm": 11.799732064207396,
104
+ "mean_norm/layer0": 11.755437850952148,
105
+ "mean_norm/layer1": 11.22901839017868,
106
+ "mean_norm/layer10": 11.532833635807037,
107
+ "mean_norm/layer11": 11.962444841861725,
108
+ "mean_norm/layer12": 12.79077160358429,
109
+ "mean_norm/layer13": 11.57960969209671,
110
+ "mean_norm/layer14": 12.059264957904816,
111
+ "mean_norm/layer15": 12.540440499782562,
112
+ "mean_norm/layer16": 11.641206741333008,
113
+ "mean_norm/layer17": 12.231300234794617,
114
+ "mean_norm/layer18": 11.600049555301666,
115
+ "mean_norm/layer19": 11.686796128749847,
116
+ "mean_norm/layer2": 9.256644666194916,
117
+ "mean_norm/layer20": 11.78922188282013,
118
+ "mean_norm/layer21": 11.759462356567383,
119
+ "mean_norm/layer22": 13.063357532024384,
120
+ "mean_norm/layer23": 13.022553265094757,
121
+ "mean_norm/layer3": 12.574194192886353,
122
+ "mean_norm/layer4": 10.863756775856018,
123
+ "mean_norm/layer5": 14.197384178638458,
124
+ "mean_norm/layer6": 10.185243308544159,
125
+ "mean_norm/layer7": 10.893572747707367,
126
+ "mean_norm/layer8": 11.53871750831604,
127
+ "mean_norm/layer9": 11.440286993980408,
128
+ "multicode_k": 8,
129
+ "output_norm": 0.0,
130
+ "output_norm/layer0": 0.0,
131
+ "output_norm/layer1": 0.0,
132
+ "output_norm/layer10": 0.0,
133
+ "output_norm/layer11": 0.0,
134
+ "output_norm/layer12": 0.0,
135
+ "output_norm/layer13": 0.0,
136
+ "output_norm/layer14": 0.0,
137
+ "output_norm/layer15": 0.0,
138
+ "output_norm/layer16": 0.0,
139
+ "output_norm/layer17": 0.0,
140
+ "output_norm/layer18": 0.0,
141
+ "output_norm/layer19": 0.0,
142
+ "output_norm/layer2": 0.0,
143
+ "output_norm/layer20": 0.0,
144
+ "output_norm/layer21": 0.0,
145
+ "output_norm/layer22": 0.0,
146
+ "output_norm/layer23": 0.0,
147
+ "output_norm/layer3": 0.0,
148
+ "output_norm/layer4": 0.0,
149
+ "output_norm/layer5": 0.0,
150
+ "output_norm/layer6": 0.0,
151
+ "output_norm/layer7": 0.0,
152
+ "output_norm/layer8": 0.0,
153
+ "output_norm/layer9": 0.0,
154
+ "train_loss": 2.685329116312663,
155
+ "train_runtime": 43939.9354,
156
+ "train_samples": 114937,
157
+ "train_samples_per_second": 16.386,
158
+ "train_steps_per_second": 0.341
159
+ }
config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "GPTNeoXCodebookModel"
4
+ ],
5
+ "codebook_at": [
6
+ "attn_preproj"
7
+ ],
8
+ "codebook_type": "group",
9
+ "k_codebook": 8,
10
+ "kmeans_init": false,
11
+ "kmeans_init_examples": 1000,
12
+ "kmeans_kwargs": {
13
+ "batch_size": 24576,
14
+ "n_init": "auto"
15
+ },
16
+ "kmeans_path": "/.cache/cb_volume/huggingface/kmeans_embeddings.pt",
17
+ "layers_to_snap": [
18
+ 0,
19
+ 1,
20
+ 2,
21
+ 3,
22
+ 4,
23
+ 5,
24
+ 6,
25
+ 7,
26
+ 8,
27
+ 9,
28
+ 10,
29
+ 11,
30
+ 12,
31
+ 13,
32
+ 14,
33
+ 15,
34
+ 16,
35
+ 17,
36
+ 18,
37
+ 19,
38
+ 20,
39
+ 21,
40
+ 22,
41
+ 23
42
+ ],
43
+ "loss": "aeloss",
44
+ "model_type": "codebook",
45
+ "num_codebooks": 16,
46
+ "num_codes": 10000,
47
+ "similarity_metric": "inner_product",
48
+ "torch_dtype": "float32",
49
+ "transformers_version": "4.27.3"
50
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:047349fb750aba188ba5b471a0c8e518b74984623d17524d02475169d29d4a3d
3
+ size 2705783745
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<|endoftext|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json",
7
+ "tokenizer_class": "GPTNeoXTokenizer",
8
+ "unk_token": "<|endoftext|>"
9
+ }
train_results.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "MSE": 0.0,
3
+ "MSE/layer0": 0.0,
4
+ "MSE/layer1": 0.0,
5
+ "MSE/layer10": 0.0,
6
+ "MSE/layer11": 0.0,
7
+ "MSE/layer12": 0.0,
8
+ "MSE/layer13": 0.0,
9
+ "MSE/layer14": 0.0,
10
+ "MSE/layer15": 0.0,
11
+ "MSE/layer16": 0.0,
12
+ "MSE/layer17": 0.0,
13
+ "MSE/layer18": 0.0,
14
+ "MSE/layer19": 0.0,
15
+ "MSE/layer2": 0.0,
16
+ "MSE/layer20": 0.0,
17
+ "MSE/layer21": 0.0,
18
+ "MSE/layer22": 0.0,
19
+ "MSE/layer23": 0.0,
20
+ "MSE/layer3": 0.0,
21
+ "MSE/layer4": 0.0,
22
+ "MSE/layer5": 0.0,
23
+ "MSE/layer6": 0.0,
24
+ "MSE/layer7": 0.0,
25
+ "MSE/layer8": 0.0,
26
+ "MSE/layer9": 0.0,
27
+ "dead_code_fraction": 1.0,
28
+ "dead_code_fraction/layer0": 1.0,
29
+ "dead_code_fraction/layer1": 1.0,
30
+ "dead_code_fraction/layer10": 1.0,
31
+ "dead_code_fraction/layer11": 1.0,
32
+ "dead_code_fraction/layer12": 1.0,
33
+ "dead_code_fraction/layer13": 1.0,
34
+ "dead_code_fraction/layer14": 1.0,
35
+ "dead_code_fraction/layer15": 1.0,
36
+ "dead_code_fraction/layer16": 1.0,
37
+ "dead_code_fraction/layer17": 1.0,
38
+ "dead_code_fraction/layer18": 1.0,
39
+ "dead_code_fraction/layer19": 1.0,
40
+ "dead_code_fraction/layer2": 1.0,
41
+ "dead_code_fraction/layer20": 1.0,
42
+ "dead_code_fraction/layer21": 1.0,
43
+ "dead_code_fraction/layer22": 1.0,
44
+ "dead_code_fraction/layer23": 1.0,
45
+ "dead_code_fraction/layer3": 1.0,
46
+ "dead_code_fraction/layer4": 1.0,
47
+ "dead_code_fraction/layer5": 1.0,
48
+ "dead_code_fraction/layer6": 1.0,
49
+ "dead_code_fraction/layer7": 1.0,
50
+ "dead_code_fraction/layer8": 1.0,
51
+ "dead_code_fraction/layer9": 1.0,
52
+ "epoch": 6.26,
53
+ "input_norm": 0.0,
54
+ "input_norm/layer0": 0.0,
55
+ "input_norm/layer1": 0.0,
56
+ "input_norm/layer10": 0.0,
57
+ "input_norm/layer11": 0.0,
58
+ "input_norm/layer12": 0.0,
59
+ "input_norm/layer13": 0.0,
60
+ "input_norm/layer14": 0.0,
61
+ "input_norm/layer15": 0.0,
62
+ "input_norm/layer16": 0.0,
63
+ "input_norm/layer17": 0.0,
64
+ "input_norm/layer18": 0.0,
65
+ "input_norm/layer19": 0.0,
66
+ "input_norm/layer2": 0.0,
67
+ "input_norm/layer20": 0.0,
68
+ "input_norm/layer21": 0.0,
69
+ "input_norm/layer22": 0.0,
70
+ "input_norm/layer23": 0.0,
71
+ "input_norm/layer3": 0.0,
72
+ "input_norm/layer4": 0.0,
73
+ "input_norm/layer5": 0.0,
74
+ "input_norm/layer6": 0.0,
75
+ "input_norm/layer7": 0.0,
76
+ "input_norm/layer8": 0.0,
77
+ "input_norm/layer9": 0.0,
78
+ "max_norm": 45.539119720458984,
79
+ "max_norm/layer0": 34.44173049926758,
80
+ "max_norm/layer1": 36.61558151245117,
81
+ "max_norm/layer10": 38.54380416870117,
82
+ "max_norm/layer11": 34.865203857421875,
83
+ "max_norm/layer12": 40.908504486083984,
84
+ "max_norm/layer13": 35.78108215332031,
85
+ "max_norm/layer14": 36.67228317260742,
86
+ "max_norm/layer15": 45.083438873291016,
87
+ "max_norm/layer16": 36.927913665771484,
88
+ "max_norm/layer17": 45.539119720458984,
89
+ "max_norm/layer18": 39.2352409362793,
90
+ "max_norm/layer19": 38.779598236083984,
91
+ "max_norm/layer2": 26.836795806884766,
92
+ "max_norm/layer20": 38.50577163696289,
93
+ "max_norm/layer21": 38.87571334838867,
94
+ "max_norm/layer22": 39.42427062988281,
95
+ "max_norm/layer23": 37.21847915649414,
96
+ "max_norm/layer3": 34.34575271606445,
97
+ "max_norm/layer4": 34.4432258605957,
98
+ "max_norm/layer5": 44.077754974365234,
99
+ "max_norm/layer6": 28.6057071685791,
100
+ "max_norm/layer7": 37.91745376586914,
101
+ "max_norm/layer8": 36.69032287597656,
102
+ "max_norm/layer9": 37.08796691894531,
103
+ "mean_norm": 11.799732064207396,
104
+ "mean_norm/layer0": 11.755437850952148,
105
+ "mean_norm/layer1": 11.22901839017868,
106
+ "mean_norm/layer10": 11.532833635807037,
107
+ "mean_norm/layer11": 11.962444841861725,
108
+ "mean_norm/layer12": 12.79077160358429,
109
+ "mean_norm/layer13": 11.57960969209671,
110
+ "mean_norm/layer14": 12.059264957904816,
111
+ "mean_norm/layer15": 12.540440499782562,
112
+ "mean_norm/layer16": 11.641206741333008,
113
+ "mean_norm/layer17": 12.231300234794617,
114
+ "mean_norm/layer18": 11.600049555301666,
115
+ "mean_norm/layer19": 11.686796128749847,
116
+ "mean_norm/layer2": 9.256644666194916,
117
+ "mean_norm/layer20": 11.78922188282013,
118
+ "mean_norm/layer21": 11.759462356567383,
119
+ "mean_norm/layer22": 13.063357532024384,
120
+ "mean_norm/layer23": 13.022553265094757,
121
+ "mean_norm/layer3": 12.574194192886353,
122
+ "mean_norm/layer4": 10.863756775856018,
123
+ "mean_norm/layer5": 14.197384178638458,
124
+ "mean_norm/layer6": 10.185243308544159,
125
+ "mean_norm/layer7": 10.893572747707367,
126
+ "mean_norm/layer8": 11.53871750831604,
127
+ "mean_norm/layer9": 11.440286993980408,
128
+ "multicode_k": 8,
129
+ "output_norm": 0.0,
130
+ "output_norm/layer0": 0.0,
131
+ "output_norm/layer1": 0.0,
132
+ "output_norm/layer10": 0.0,
133
+ "output_norm/layer11": 0.0,
134
+ "output_norm/layer12": 0.0,
135
+ "output_norm/layer13": 0.0,
136
+ "output_norm/layer14": 0.0,
137
+ "output_norm/layer15": 0.0,
138
+ "output_norm/layer16": 0.0,
139
+ "output_norm/layer17": 0.0,
140
+ "output_norm/layer18": 0.0,
141
+ "output_norm/layer19": 0.0,
142
+ "output_norm/layer2": 0.0,
143
+ "output_norm/layer20": 0.0,
144
+ "output_norm/layer21": 0.0,
145
+ "output_norm/layer22": 0.0,
146
+ "output_norm/layer23": 0.0,
147
+ "output_norm/layer3": 0.0,
148
+ "output_norm/layer4": 0.0,
149
+ "output_norm/layer5": 0.0,
150
+ "output_norm/layer6": 0.0,
151
+ "output_norm/layer7": 0.0,
152
+ "output_norm/layer8": 0.0,
153
+ "output_norm/layer9": 0.0,
154
+ "train_loss": 2.685329116312663,
155
+ "train_runtime": 43939.9354,
156
+ "train_samples": 114937,
157
+ "train_samples_per_second": 16.386,
158
+ "train_steps_per_second": 0.341
159
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:974e67b66201b847274c72f8bccd37bc28a91bd779b977a46504b96111e57b61
3
+ size 3771