reaperdoesntknow commited on
Commit
66e9d41
·
verified ·
1 Parent(s): 1bb77f7

Upload 2 files

Browse files
MoA-150M_results.json ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "entropy": 2.184988856315613,
4
+ "epoch": 0.0625,
5
+ "grad_norm": 6.174712657928467,
6
+ "learning_rate": 0.00029531249999999995,
7
+ "loss": 0.513,
8
+ "mean_token_accuracy": 0.9921568632125854,
9
+ "num_tokens": 4096.0,
10
+ "step": 2
11
+ },
12
+ {
13
+ "entropy": 1.7302125096321106,
14
+ "epoch": 0.125,
15
+ "grad_norm": 0.9681358933448792,
16
+ "learning_rate": 0.00028593749999999995,
17
+ "loss": 0.441,
18
+ "mean_token_accuracy": 0.9870097935199738,
19
+ "num_tokens": 8192.0,
20
+ "step": 4
21
+ },
22
+ {
23
+ "entropy": 2.0538085103034973,
24
+ "epoch": 0.1875,
25
+ "grad_norm": 0.5153102278709412,
26
+ "learning_rate": 0.00027656249999999995,
27
+ "loss": 0.4929,
28
+ "mean_token_accuracy": 0.9980392158031464,
29
+ "num_tokens": 12288.0,
30
+ "step": 6
31
+ },
32
+ {
33
+ "entropy": 1.7263505458831787,
34
+ "epoch": 0.25,
35
+ "grad_norm": 0.6404310464859009,
36
+ "learning_rate": 0.00026718749999999996,
37
+ "loss": 0.4014,
38
+ "mean_token_accuracy": 0.9987744987010956,
39
+ "num_tokens": 16384.0,
40
+ "step": 8
41
+ },
42
+ {
43
+ "entropy": 1.8347786664962769,
44
+ "epoch": 0.3125,
45
+ "grad_norm": 0.6209350228309631,
46
+ "learning_rate": 0.00025781249999999996,
47
+ "loss": 0.4365,
48
+ "mean_token_accuracy": 1.0,
49
+ "num_tokens": 20480.0,
50
+ "step": 10
51
+ },
52
+ {
53
+ "entropy": 1.4254534244537354,
54
+ "epoch": 0.375,
55
+ "grad_norm": 0.44427844882011414,
56
+ "learning_rate": 0.00024843749999999996,
57
+ "loss": 0.2791,
58
+ "mean_token_accuracy": 1.0,
59
+ "num_tokens": 24576.0,
60
+ "step": 12
61
+ },
62
+ {
63
+ "entropy": 1.8001930117607117,
64
+ "epoch": 0.4375,
65
+ "grad_norm": 0.3619579076766968,
66
+ "learning_rate": 0.0002390625,
67
+ "loss": 0.3962,
68
+ "mean_token_accuracy": 0.9987744987010956,
69
+ "num_tokens": 28672.0,
70
+ "step": 14
71
+ },
72
+ {
73
+ "entropy": 1.6178001761436462,
74
+ "epoch": 0.5,
75
+ "grad_norm": 0.38358834385871887,
76
+ "learning_rate": 0.0002296875,
77
+ "loss": 0.3492,
78
+ "mean_token_accuracy": 1.0,
79
+ "num_tokens": 32768.0,
80
+ "step": 16
81
+ },
82
+ {
83
+ "entropy": 1.661442220211029,
84
+ "epoch": 0.5625,
85
+ "grad_norm": 0.3749903440475464,
86
+ "learning_rate": 0.00022031249999999997,
87
+ "loss": 0.3582,
88
+ "mean_token_accuracy": 1.0,
89
+ "num_tokens": 36864.0,
90
+ "step": 18
91
+ },
92
+ {
93
+ "entropy": 1.5717861652374268,
94
+ "epoch": 0.625,
95
+ "grad_norm": 0.36388659477233887,
96
+ "learning_rate": 0.00021093749999999997,
97
+ "loss": 0.3126,
98
+ "mean_token_accuracy": 1.0,
99
+ "num_tokens": 40960.0,
100
+ "step": 20
101
+ },
102
+ {
103
+ "entropy": 1.5534449219703674,
104
+ "epoch": 0.6875,
105
+ "grad_norm": 0.40969353914260864,
106
+ "learning_rate": 0.00020156249999999997,
107
+ "loss": 0.3478,
108
+ "mean_token_accuracy": 0.9997549057006836,
109
+ "num_tokens": 45056.0,
110
+ "step": 22
111
+ },
112
+ {
113
+ "entropy": 1.5265448689460754,
114
+ "epoch": 0.75,
115
+ "grad_norm": 0.41839736700057983,
116
+ "learning_rate": 0.00019218749999999998,
117
+ "loss": 0.3601,
118
+ "mean_token_accuracy": 0.9995098114013672,
119
+ "num_tokens": 49152.0,
120
+ "step": 24
121
+ },
122
+ {
123
+ "entropy": 2.074858069419861,
124
+ "epoch": 0.8125,
125
+ "grad_norm": 0.44888272881507874,
126
+ "learning_rate": 0.00018281249999999998,
127
+ "loss": 0.4777,
128
+ "mean_token_accuracy": 0.9997549057006836,
129
+ "num_tokens": 53248.0,
130
+ "step": 26
131
+ },
132
+ {
133
+ "entropy": 1.6095194816589355,
134
+ "epoch": 0.875,
135
+ "grad_norm": 0.5080280900001526,
136
+ "learning_rate": 0.00017343749999999998,
137
+ "loss": 0.3859,
138
+ "mean_token_accuracy": 0.9987744987010956,
139
+ "num_tokens": 57344.0,
140
+ "step": 28
141
+ },
142
+ {
143
+ "entropy": 1.7032344341278076,
144
+ "epoch": 0.9375,
145
+ "grad_norm": 0.3624984622001648,
146
+ "learning_rate": 0.00016406249999999998,
147
+ "loss": 0.3576,
148
+ "mean_token_accuracy": 1.0,
149
+ "num_tokens": 61440.0,
150
+ "step": 30
151
+ },
152
+ {
153
+ "entropy": 1.8873920440673828,
154
+ "epoch": 1.0,
155
+ "grad_norm": 0.618506133556366,
156
+ "learning_rate": 0.00015468749999999999,
157
+ "loss": 0.446,
158
+ "mean_token_accuracy": 0.9997549057006836,
159
+ "num_tokens": 64256.0,
160
+ "step": 32
161
+ },
162
+ {
163
+ "entropy": 1.4586840271949768,
164
+ "epoch": 1.0625,
165
+ "grad_norm": 0.3723963797092438,
166
+ "learning_rate": 0.0001453125,
167
+ "loss": 0.2824,
168
+ "mean_token_accuracy": 1.0,
169
+ "num_tokens": 68352.0,
170
+ "step": 34
171
+ },
172
+ {
173
+ "entropy": 1.2832568883895874,
174
+ "epoch": 1.125,
175
+ "grad_norm": 0.3108985424041748,
176
+ "learning_rate": 0.0001359375,
177
+ "loss": 0.2191,
178
+ "mean_token_accuracy": 1.0,
179
+ "num_tokens": 72448.0,
180
+ "step": 36
181
+ },
182
+ {
183
+ "entropy": 1.523368000984192,
184
+ "epoch": 1.1875,
185
+ "grad_norm": 0.3509906232357025,
186
+ "learning_rate": 0.0001265625,
187
+ "loss": 0.3042,
188
+ "mean_token_accuracy": 0.9995098114013672,
189
+ "num_tokens": 76544.0,
190
+ "step": 38
191
+ },
192
+ {
193
+ "entropy": 1.2994396686553955,
194
+ "epoch": 1.25,
195
+ "grad_norm": 0.3014850616455078,
196
+ "learning_rate": 0.0001171875,
197
+ "loss": 0.2456,
198
+ "mean_token_accuracy": 1.0,
199
+ "num_tokens": 80640.0,
200
+ "step": 40
201
+ },
202
+ {
203
+ "entropy": 1.3811439871788025,
204
+ "epoch": 1.3125,
205
+ "grad_norm": 0.32755109667778015,
206
+ "learning_rate": 0.00010781249999999998,
207
+ "loss": 0.2521,
208
+ "mean_token_accuracy": 1.0,
209
+ "num_tokens": 84736.0,
210
+ "step": 42
211
+ },
212
+ {
213
+ "entropy": 1.1994215250015259,
214
+ "epoch": 1.375,
215
+ "grad_norm": 0.24541084468364716,
216
+ "learning_rate": 9.843749999999999e-05,
217
+ "loss": 0.2118,
218
+ "mean_token_accuracy": 1.0,
219
+ "num_tokens": 88832.0,
220
+ "step": 44
221
+ },
222
+ {
223
+ "entropy": 1.0519097447395325,
224
+ "epoch": 1.4375,
225
+ "grad_norm": 0.2063349187374115,
226
+ "learning_rate": 8.906249999999999e-05,
227
+ "loss": 0.1943,
228
+ "mean_token_accuracy": 1.0,
229
+ "num_tokens": 92928.0,
230
+ "step": 46
231
+ },
232
+ {
233
+ "entropy": 1.1152112483978271,
234
+ "epoch": 1.5,
235
+ "grad_norm": 0.31837204098701477,
236
+ "learning_rate": 7.968749999999999e-05,
237
+ "loss": 0.2132,
238
+ "mean_token_accuracy": 1.0,
239
+ "num_tokens": 97024.0,
240
+ "step": 48
241
+ },
242
+ {
243
+ "entropy": 1.2649919390678406,
244
+ "epoch": 1.5625,
245
+ "grad_norm": 0.289153516292572,
246
+ "learning_rate": 7.03125e-05,
247
+ "loss": 0.2158,
248
+ "mean_token_accuracy": 1.0,
249
+ "num_tokens": 101120.0,
250
+ "step": 50
251
+ },
252
+ {
253
+ "entropy": 0.9955946207046509,
254
+ "epoch": 1.625,
255
+ "grad_norm": 0.2607753276824951,
256
+ "learning_rate": 6.09375e-05,
257
+ "loss": 0.1913,
258
+ "mean_token_accuracy": 0.9997549057006836,
259
+ "num_tokens": 105216.0,
260
+ "step": 52
261
+ },
262
+ {
263
+ "entropy": 1.3506205081939697,
264
+ "epoch": 1.6875,
265
+ "grad_norm": 0.2850724458694458,
266
+ "learning_rate": 5.156249999999999e-05,
267
+ "loss": 0.2234,
268
+ "mean_token_accuracy": 1.0,
269
+ "num_tokens": 109312.0,
270
+ "step": 54
271
+ },
272
+ {
273
+ "entropy": 1.3460099697113037,
274
+ "epoch": 1.75,
275
+ "grad_norm": 0.23587484657764435,
276
+ "learning_rate": 4.2187499999999995e-05,
277
+ "loss": 0.2544,
278
+ "mean_token_accuracy": 1.0,
279
+ "num_tokens": 113408.0,
280
+ "step": 56
281
+ },
282
+ {
283
+ "entropy": 1.4365423321723938,
284
+ "epoch": 1.8125,
285
+ "grad_norm": 0.3239842653274536,
286
+ "learning_rate": 3.28125e-05,
287
+ "loss": 0.2958,
288
+ "mean_token_accuracy": 1.0,
289
+ "num_tokens": 117504.0,
290
+ "step": 58
291
+ },
292
+ {
293
+ "entropy": 1.567048728466034,
294
+ "epoch": 1.875,
295
+ "grad_norm": 0.34480002522468567,
296
+ "learning_rate": 2.3437499999999997e-05,
297
+ "loss": 0.3122,
298
+ "mean_token_accuracy": 1.0,
299
+ "num_tokens": 121600.0,
300
+ "step": 60
301
+ },
302
+ {
303
+ "entropy": 1.344693124294281,
304
+ "epoch": 1.9375,
305
+ "grad_norm": 0.25280237197875977,
306
+ "learning_rate": 1.40625e-05,
307
+ "loss": 0.2472,
308
+ "mean_token_accuracy": 1.0,
309
+ "num_tokens": 125696.0,
310
+ "step": 62
311
+ },
312
+ {
313
+ "entropy": 1.2893942594528198,
314
+ "epoch": 2.0,
315
+ "grad_norm": 0.5825140476226807,
316
+ "learning_rate": 4.6875e-06,
317
+ "loss": 0.2222,
318
+ "mean_token_accuracy": 1.0,
319
+ "num_tokens": 128512.0,
320
+ "step": 64
321
+ },
322
+ {
323
+ "epoch": 2.0,
324
+ "step": 64,
325
+ "total_flos": 59560143504384.0,
326
+ "train_loss": 0.32000101869925857,
327
+ "train_runtime": 720.302,
328
+ "train_samples_per_second": 0.711,
329
+ "train_steps_per_second": 0.089
330
+ }
331
+ ]
events.out.tfevents.1758523788.ed35ea831684.8365.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21a0268fbab3fb572425abfb4366a2e110c12add72177ce0d97fc045a4db0586
3
+ size 18423