alexgrigore committed on
Commit
c8195bc
1 Parent(s): c97ef5e

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +4 -4
  2. test_results.json +4 -4
  3. trainer_state.json +102 -102
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
- "eval_loss": 0.8817277550697327,
5
- "eval_runtime": 14.8211,
6
- "eval_samples_per_second": 11.403,
7
- "eval_steps_per_second": 1.484
8
  }
 
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
+ "eval_loss": 0.8992812633514404,
5
+ "eval_runtime": 12.8676,
6
+ "eval_samples_per_second": 13.134,
7
+ "eval_steps_per_second": 1.71
8
  }
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
- "eval_loss": 0.8817277550697327,
5
- "eval_runtime": 14.8211,
6
- "eval_samples_per_second": 11.403,
7
- "eval_steps_per_second": 1.484
8
  }
 
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
+ "eval_loss": 0.8992812633514404,
5
+ "eval_runtime": 12.8676,
6
+ "eval_samples_per_second": 13.134,
7
+ "eval_steps_per_second": 1.71
8
  }
trainer_state.json CHANGED
@@ -10,324 +10,324 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.026595744680851064,
13
- "grad_norm": 12.65776252746582,
14
  "learning_rate": 1.3157894736842106e-05,
15
- "loss": 1.7008,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05319148936170213,
20
- "grad_norm": 6.046972274780273,
21
  "learning_rate": 2.6315789473684212e-05,
22
- "loss": 0.9981,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0797872340425532,
27
- "grad_norm": 9.531058311462402,
28
  "learning_rate": 3.9473684210526316e-05,
29
- "loss": 0.9371,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10638297872340426,
34
- "grad_norm": 6.416304111480713,
35
  "learning_rate": 4.970414201183432e-05,
36
- "loss": 0.8702,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13297872340425532,
41
- "grad_norm": 4.882875442504883,
42
  "learning_rate": 4.822485207100592e-05,
43
- "loss": 0.5812,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.1595744680851064,
48
- "grad_norm": 3.9771392345428467,
49
  "learning_rate": 4.674556213017752e-05,
50
- "loss": 1.1064,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.18617021276595744,
55
- "grad_norm": 2.8228275775909424,
56
  "learning_rate": 4.5266272189349114e-05,
57
- "loss": 0.6824,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2127659574468085,
62
- "grad_norm": 2.11715030670166,
63
  "learning_rate": 4.378698224852072e-05,
64
- "loss": 0.5582,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2393617021276596,
69
- "grad_norm": 1.6437604427337646,
70
  "learning_rate": 4.230769230769231e-05,
71
- "loss": 0.7968,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2526595744680851,
76
  "eval_accuracy": 0.7875,
77
- "eval_loss": 0.7984516620635986,
78
- "eval_runtime": 15.2987,
79
- "eval_samples_per_second": 10.458,
80
- "eval_steps_per_second": 1.307,
81
  "step": 95
82
  },
83
  {
84
  "epoch": 1.0132978723404256,
85
- "grad_norm": 1.7461698055267334,
86
  "learning_rate": 4.0828402366863904e-05,
87
- "loss": 0.6967,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.0398936170212767,
92
- "grad_norm": 2.8989648818969727,
93
  "learning_rate": 3.934911242603551e-05,
94
- "loss": 0.8106,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.0664893617021276,
99
- "grad_norm": 2.848541021347046,
100
  "learning_rate": 3.7869822485207104e-05,
101
- "loss": 0.741,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 1.0930851063829787,
106
- "grad_norm": 2.6322543621063232,
107
  "learning_rate": 3.63905325443787e-05,
108
- "loss": 0.7027,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.1196808510638299,
113
- "grad_norm": 7.683568954467773,
114
  "learning_rate": 3.49112426035503e-05,
115
- "loss": 0.6579,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1462765957446808,
120
- "grad_norm": 1.7948863506317139,
121
  "learning_rate": 3.3431952662721895e-05,
122
- "loss": 0.8164,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.172872340425532,
127
- "grad_norm": 1.682706594467163,
128
  "learning_rate": 3.195266272189349e-05,
129
- "loss": 0.6967,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.199468085106383,
134
- "grad_norm": 3.870861530303955,
135
  "learning_rate": 3.047337278106509e-05,
136
- "loss": 0.7218,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.226063829787234,
141
- "grad_norm": 1.6848540306091309,
142
  "learning_rate": 2.8994082840236685e-05,
143
- "loss": 0.7805,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.252659574468085,
148
- "grad_norm": 5.371254920959473,
149
  "learning_rate": 2.751479289940829e-05,
150
- "loss": 0.8744,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.252659574468085,
155
  "eval_accuracy": 0.7875,
156
- "eval_loss": 0.7836956977844238,
157
- "eval_runtime": 11.0004,
158
- "eval_samples_per_second": 14.545,
159
- "eval_steps_per_second": 1.818,
160
  "step": 190
161
  },
162
  {
163
  "epoch": 2.026595744680851,
164
- "grad_norm": 1.314697027206421,
165
  "learning_rate": 2.6035502958579882e-05,
166
- "loss": 0.7122,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 2.0531914893617023,
171
- "grad_norm": 1.7407877445220947,
172
  "learning_rate": 2.4556213017751482e-05,
173
- "loss": 0.7189,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 2.0797872340425534,
178
- "grad_norm": 2.266850471496582,
179
  "learning_rate": 2.307692307692308e-05,
180
- "loss": 0.8913,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 2.106382978723404,
185
- "grad_norm": 2.1507058143615723,
186
  "learning_rate": 2.1597633136094676e-05,
187
- "loss": 0.8554,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 2.132978723404255,
192
- "grad_norm": 4.038496971130371,
193
  "learning_rate": 2.0118343195266273e-05,
194
- "loss": 0.6919,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 2.1595744680851063,
199
- "grad_norm": 4.005491256713867,
200
  "learning_rate": 1.8639053254437873e-05,
201
- "loss": 0.7086,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 2.1861702127659575,
206
- "grad_norm": 3.721208095550537,
207
  "learning_rate": 1.7159763313609466e-05,
208
- "loss": 0.7478,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.2127659574468086,
213
- "grad_norm": 4.95754337310791,
214
  "learning_rate": 1.5680473372781066e-05,
215
- "loss": 0.5457,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.2393617021276597,
220
- "grad_norm": 2.7324411869049072,
221
  "learning_rate": 1.4201183431952663e-05,
222
- "loss": 0.9479,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.252659574468085,
227
  "eval_accuracy": 0.7875,
228
- "eval_loss": 0.7812246084213257,
229
- "eval_runtime": 11.2394,
230
- "eval_samples_per_second": 14.236,
231
- "eval_steps_per_second": 1.779,
232
  "step": 285
233
  },
234
  {
235
  "epoch": 3.0132978723404253,
236
- "grad_norm": 2.182974338531494,
237
  "learning_rate": 1.2721893491124262e-05,
238
- "loss": 0.6173,
239
  "step": 290
240
  },
241
  {
242
  "epoch": 3.0398936170212765,
243
- "grad_norm": 3.512352228164673,
244
  "learning_rate": 1.1242603550295859e-05,
245
- "loss": 0.6313,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 3.0664893617021276,
250
- "grad_norm": 2.436189651489258,
251
  "learning_rate": 9.763313609467455e-06,
252
- "loss": 0.6465,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 3.0930851063829787,
257
- "grad_norm": 4.008547782897949,
258
  "learning_rate": 8.284023668639054e-06,
259
- "loss": 0.8407,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 3.11968085106383,
264
- "grad_norm": 4.314539909362793,
265
  "learning_rate": 6.8047337278106515e-06,
266
- "loss": 0.7826,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 3.146276595744681,
271
- "grad_norm": 2.1060965061187744,
272
  "learning_rate": 5.325443786982249e-06,
273
- "loss": 0.8714,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 3.172872340425532,
278
- "grad_norm": 3.250807046890259,
279
  "learning_rate": 3.846153846153847e-06,
280
- "loss": 0.6124,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 3.199468085106383,
285
- "grad_norm": 4.283337593078613,
286
  "learning_rate": 2.366863905325444e-06,
287
- "loss": 0.8395,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 3.226063829787234,
292
- "grad_norm": 4.142538070678711,
293
  "learning_rate": 8.875739644970415e-07,
294
- "loss": 0.8282,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 3.242021276595745,
299
  "eval_accuracy": 0.7875,
300
- "eval_loss": 0.7884188294410706,
301
- "eval_runtime": 11.8341,
302
- "eval_samples_per_second": 13.52,
303
- "eval_steps_per_second": 1.69,
304
  "step": 376
305
  },
306
  {
307
  "epoch": 3.242021276595745,
308
  "step": 376,
309
  "total_flos": 3.7220613152994755e+18,
310
- "train_loss": 0.785655234088289,
311
- "train_runtime": 616.1212,
312
- "train_samples_per_second": 4.882,
313
- "train_steps_per_second": 0.61
314
  },
315
  {
316
  "epoch": 3.242021276595745,
317
  "eval_accuracy": 0.7633136094674556,
318
- "eval_loss": 0.8817278742790222,
319
- "eval_runtime": 19.9254,
320
- "eval_samples_per_second": 8.482,
321
- "eval_steps_per_second": 1.104,
322
  "step": 376
323
  },
324
  {
325
  "epoch": 3.242021276595745,
326
  "eval_accuracy": 0.7633136094674556,
327
- "eval_loss": 0.8817277550697327,
328
- "eval_runtime": 14.8211,
329
- "eval_samples_per_second": 11.403,
330
- "eval_steps_per_second": 1.484,
331
  "step": 376
332
  }
333
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.026595744680851064,
13
+ "grad_norm": 11.770811080932617,
14
  "learning_rate": 1.3157894736842106e-05,
15
+ "loss": 1.5077,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05319148936170213,
20
+ "grad_norm": 6.303102493286133,
21
  "learning_rate": 2.6315789473684212e-05,
22
+ "loss": 0.9267,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0797872340425532,
27
+ "grad_norm": 5.121222019195557,
28
  "learning_rate": 3.9473684210526316e-05,
29
+ "loss": 0.9311,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10638297872340426,
34
+ "grad_norm": 5.0847344398498535,
35
  "learning_rate": 4.970414201183432e-05,
36
+ "loss": 0.8635,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13297872340425532,
41
+ "grad_norm": 4.5861735343933105,
42
  "learning_rate": 4.822485207100592e-05,
43
+ "loss": 0.5788,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.1595744680851064,
48
+ "grad_norm": 3.4006526470184326,
49
  "learning_rate": 4.674556213017752e-05,
50
+ "loss": 1.0821,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.18617021276595744,
55
+ "grad_norm": 2.8479793071746826,
56
  "learning_rate": 4.5266272189349114e-05,
57
+ "loss": 0.6913,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2127659574468085,
62
+ "grad_norm": 2.3751566410064697,
63
  "learning_rate": 4.378698224852072e-05,
64
+ "loss": 0.5482,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2393617021276596,
69
+ "grad_norm": 1.6777262687683105,
70
  "learning_rate": 4.230769230769231e-05,
71
+ "loss": 0.8086,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2526595744680851,
76
  "eval_accuracy": 0.7875,
77
+ "eval_loss": 0.8058681488037109,
78
+ "eval_runtime": 13.8118,
79
+ "eval_samples_per_second": 11.584,
80
+ "eval_steps_per_second": 1.448,
81
  "step": 95
82
  },
83
  {
84
  "epoch": 1.0132978723404256,
85
+ "grad_norm": 2.0221545696258545,
86
  "learning_rate": 4.0828402366863904e-05,
87
+ "loss": 0.6867,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.0398936170212767,
92
+ "grad_norm": 2.89918851852417,
93
  "learning_rate": 3.934911242603551e-05,
94
+ "loss": 0.7971,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.0664893617021276,
99
+ "grad_norm": 2.958357810974121,
100
  "learning_rate": 3.7869822485207104e-05,
101
+ "loss": 0.7237,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 1.0930851063829787,
106
+ "grad_norm": 2.737426996231079,
107
  "learning_rate": 3.63905325443787e-05,
108
+ "loss": 0.7045,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.1196808510638299,
113
+ "grad_norm": 8.279816627502441,
114
  "learning_rate": 3.49112426035503e-05,
115
+ "loss": 0.6584,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1462765957446808,
120
+ "grad_norm": 1.7154802083969116,
121
  "learning_rate": 3.3431952662721895e-05,
122
+ "loss": 0.8076,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.172872340425532,
127
+ "grad_norm": 1.5087482929229736,
128
  "learning_rate": 3.195266272189349e-05,
129
+ "loss": 0.7007,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.199468085106383,
134
+ "grad_norm": 4.256617069244385,
135
  "learning_rate": 3.047337278106509e-05,
136
+ "loss": 0.7228,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.226063829787234,
141
+ "grad_norm": 1.8001201152801514,
142
  "learning_rate": 2.8994082840236685e-05,
143
+ "loss": 0.7797,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.252659574468085,
148
+ "grad_norm": 5.422004222869873,
149
  "learning_rate": 2.751479289940829e-05,
150
+ "loss": 0.8755,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.252659574468085,
155
  "eval_accuracy": 0.7875,
156
+ "eval_loss": 0.7764860987663269,
157
+ "eval_runtime": 10.4189,
158
+ "eval_samples_per_second": 15.357,
159
+ "eval_steps_per_second": 1.92,
160
  "step": 190
161
  },
162
  {
163
  "epoch": 2.026595744680851,
164
+ "grad_norm": 1.4132887125015259,
165
  "learning_rate": 2.6035502958579882e-05,
166
+ "loss": 0.7022,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 2.0531914893617023,
171
+ "grad_norm": 1.8760045766830444,
172
  "learning_rate": 2.4556213017751482e-05,
173
+ "loss": 0.7098,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 2.0797872340425534,
178
+ "grad_norm": 2.369459629058838,
179
  "learning_rate": 2.307692307692308e-05,
180
+ "loss": 0.8856,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 2.106382978723404,
185
+ "grad_norm": 2.2688181400299072,
186
  "learning_rate": 2.1597633136094676e-05,
187
+ "loss": 0.8472,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 2.132978723404255,
192
+ "grad_norm": 4.264322280883789,
193
  "learning_rate": 2.0118343195266273e-05,
194
+ "loss": 0.6886,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 2.1595744680851063,
199
+ "grad_norm": 3.999258518218994,
200
  "learning_rate": 1.8639053254437873e-05,
201
+ "loss": 0.7047,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 2.1861702127659575,
206
+ "grad_norm": 3.8878672122955322,
207
  "learning_rate": 1.7159763313609466e-05,
208
+ "loss": 0.7449,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.2127659574468086,
213
+ "grad_norm": 4.918434143066406,
214
  "learning_rate": 1.5680473372781066e-05,
215
+ "loss": 0.5438,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.2393617021276597,
220
+ "grad_norm": 2.8296828269958496,
221
  "learning_rate": 1.4201183431952663e-05,
222
+ "loss": 0.9334,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.252659574468085,
227
  "eval_accuracy": 0.7875,
228
+ "eval_loss": 0.7846399545669556,
229
+ "eval_runtime": 10.9493,
230
+ "eval_samples_per_second": 14.613,
231
+ "eval_steps_per_second": 1.827,
232
  "step": 285
233
  },
234
  {
235
  "epoch": 3.0132978723404253,
236
+ "grad_norm": 2.3935749530792236,
237
  "learning_rate": 1.2721893491124262e-05,
238
+ "loss": 0.6163,
239
  "step": 290
240
  },
241
  {
242
  "epoch": 3.0398936170212765,
243
+ "grad_norm": 3.57572340965271,
244
  "learning_rate": 1.1242603550295859e-05,
245
+ "loss": 0.6265,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 3.0664893617021276,
250
+ "grad_norm": 2.4465370178222656,
251
  "learning_rate": 9.763313609467455e-06,
252
+ "loss": 0.64,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 3.0930851063829787,
257
+ "grad_norm": 4.222550868988037,
258
  "learning_rate": 8.284023668639054e-06,
259
+ "loss": 0.8452,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 3.11968085106383,
264
+ "grad_norm": 3.980487823486328,
265
  "learning_rate": 6.8047337278106515e-06,
266
+ "loss": 0.7681,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 3.146276595744681,
271
+ "grad_norm": 1.943975806236267,
272
  "learning_rate": 5.325443786982249e-06,
273
+ "loss": 0.8814,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 3.172872340425532,
278
+ "grad_norm": 3.3926122188568115,
279
  "learning_rate": 3.846153846153847e-06,
280
+ "loss": 0.5995,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 3.199468085106383,
285
+ "grad_norm": 4.575331211090088,
286
  "learning_rate": 2.366863905325444e-06,
287
+ "loss": 0.8319,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 3.226063829787234,
292
+ "grad_norm": 4.1369309425354,
293
  "learning_rate": 8.875739644970415e-07,
294
+ "loss": 0.8263,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 3.242021276595745,
299
  "eval_accuracy": 0.7875,
300
+ "eval_loss": 0.7844525575637817,
301
+ "eval_runtime": 11.1167,
302
+ "eval_samples_per_second": 14.393,
303
+ "eval_steps_per_second": 1.799,
304
  "step": 376
305
  },
306
  {
307
  "epoch": 3.242021276595745,
308
  "step": 376,
309
  "total_flos": 3.7220613152994755e+18,
310
+ "train_loss": 0.7742417086946204,
311
+ "train_runtime": 578.534,
312
+ "train_samples_per_second": 5.199,
313
+ "train_steps_per_second": 0.65
314
  },
315
  {
316
  "epoch": 3.242021276595745,
317
  "eval_accuracy": 0.7633136094674556,
318
+ "eval_loss": 0.8992813229560852,
319
+ "eval_runtime": 20.2942,
320
+ "eval_samples_per_second": 8.327,
321
+ "eval_steps_per_second": 1.084,
322
  "step": 376
323
  },
324
  {
325
  "epoch": 3.242021276595745,
326
  "eval_accuracy": 0.7633136094674556,
327
+ "eval_loss": 0.8992812633514404,
328
+ "eval_runtime": 12.8676,
329
+ "eval_samples_per_second": 13.134,
330
+ "eval_steps_per_second": 1.71,
331
  "step": 376
332
  }
333
  ],