alexgrigore commited on
Commit
e762635
1 Parent(s): 9efb50c

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +4 -4
  2. test_results.json +4 -4
  3. trainer_state.json +102 -102
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
- "eval_loss": 0.893736720085144,
5
- "eval_runtime": 11.4267,
6
- "eval_samples_per_second": 14.79,
7
- "eval_steps_per_second": 1.925
8
  }
 
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
+ "eval_loss": 0.8817277550697327,
5
+ "eval_runtime": 14.8211,
6
+ "eval_samples_per_second": 11.403,
7
+ "eval_steps_per_second": 1.484
8
  }
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
- "eval_loss": 0.893736720085144,
5
- "eval_runtime": 11.4267,
6
- "eval_samples_per_second": 14.79,
7
- "eval_steps_per_second": 1.925
8
  }
 
1
  {
2
  "epoch": 3.242021276595745,
3
  "eval_accuracy": 0.7633136094674556,
4
+ "eval_loss": 0.8817277550697327,
5
+ "eval_runtime": 14.8211,
6
+ "eval_samples_per_second": 11.403,
7
+ "eval_steps_per_second": 1.484
8
  }
trainer_state.json CHANGED
@@ -10,324 +10,324 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.026595744680851064,
13
- "grad_norm": 12.626161575317383,
14
  "learning_rate": 1.3157894736842106e-05,
15
- "loss": 1.6196,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05319148936170213,
20
- "grad_norm": 5.767611980438232,
21
  "learning_rate": 2.6315789473684212e-05,
22
- "loss": 0.9827,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0797872340425532,
27
- "grad_norm": 4.556581497192383,
28
  "learning_rate": 3.9473684210526316e-05,
29
- "loss": 0.9375,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10638297872340426,
34
- "grad_norm": 5.054460525512695,
35
  "learning_rate": 4.970414201183432e-05,
36
- "loss": 0.8402,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13297872340425532,
41
- "grad_norm": 4.505584239959717,
42
  "learning_rate": 4.822485207100592e-05,
43
- "loss": 0.582,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.1595744680851064,
48
- "grad_norm": 3.063162088394165,
49
  "learning_rate": 4.674556213017752e-05,
50
- "loss": 1.1131,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.18617021276595744,
55
- "grad_norm": 2.564850330352783,
56
  "learning_rate": 4.5266272189349114e-05,
57
- "loss": 0.6761,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2127659574468085,
62
- "grad_norm": 2.2109172344207764,
63
  "learning_rate": 4.378698224852072e-05,
64
- "loss": 0.5614,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2393617021276596,
69
- "grad_norm": 1.6165765523910522,
70
  "learning_rate": 4.230769230769231e-05,
71
- "loss": 0.8026,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2526595744680851,
76
  "eval_accuracy": 0.7875,
77
- "eval_loss": 0.8083112835884094,
78
- "eval_runtime": 16.3601,
79
- "eval_samples_per_second": 9.78,
80
- "eval_steps_per_second": 1.222,
81
  "step": 95
82
  },
83
  {
84
  "epoch": 1.0132978723404256,
85
- "grad_norm": 1.8220001459121704,
86
  "learning_rate": 4.0828402366863904e-05,
87
- "loss": 0.6855,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.0398936170212767,
92
- "grad_norm": 2.690967321395874,
93
  "learning_rate": 3.934911242603551e-05,
94
- "loss": 0.8036,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.0664893617021276,
99
- "grad_norm": 2.7666921615600586,
100
  "learning_rate": 3.7869822485207104e-05,
101
- "loss": 0.7408,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 1.0930851063829787,
106
- "grad_norm": 2.5995383262634277,
107
  "learning_rate": 3.63905325443787e-05,
108
- "loss": 0.7017,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.1196808510638299,
113
- "grad_norm": 7.389265537261963,
114
  "learning_rate": 3.49112426035503e-05,
115
- "loss": 0.6631,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1462765957446808,
120
- "grad_norm": 1.8834972381591797,
121
  "learning_rate": 3.3431952662721895e-05,
122
- "loss": 0.8175,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.172872340425532,
127
- "grad_norm": 1.4516174793243408,
128
  "learning_rate": 3.195266272189349e-05,
129
- "loss": 0.6888,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.199468085106383,
134
- "grad_norm": 3.942476272583008,
135
  "learning_rate": 3.047337278106509e-05,
136
- "loss": 0.7102,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.226063829787234,
141
- "grad_norm": 1.646263837814331,
142
  "learning_rate": 2.8994082840236685e-05,
143
- "loss": 0.7908,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.252659574468085,
148
- "grad_norm": 5.147498607635498,
149
  "learning_rate": 2.751479289940829e-05,
150
- "loss": 0.8728,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.252659574468085,
155
  "eval_accuracy": 0.7875,
156
- "eval_loss": 0.7828324437141418,
157
- "eval_runtime": 16.1891,
158
- "eval_samples_per_second": 9.883,
159
- "eval_steps_per_second": 1.235,
160
  "step": 190
161
  },
162
  {
163
  "epoch": 2.026595744680851,
164
- "grad_norm": 1.3614023923873901,
165
  "learning_rate": 2.6035502958579882e-05,
166
- "loss": 0.7153,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 2.0531914893617023,
171
- "grad_norm": 1.6865636110305786,
172
  "learning_rate": 2.4556213017751482e-05,
173
- "loss": 0.7114,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 2.0797872340425534,
178
- "grad_norm": 2.2771339416503906,
179
  "learning_rate": 2.307692307692308e-05,
180
- "loss": 0.8898,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 2.106382978723404,
185
- "grad_norm": 2.0663363933563232,
186
  "learning_rate": 2.1597633136094676e-05,
187
- "loss": 0.8524,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 2.132978723404255,
192
- "grad_norm": 4.288294792175293,
193
  "learning_rate": 2.0118343195266273e-05,
194
- "loss": 0.6944,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 2.1595744680851063,
199
- "grad_norm": 3.9807941913604736,
200
  "learning_rate": 1.8639053254437873e-05,
201
- "loss": 0.7094,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 2.1861702127659575,
206
- "grad_norm": 3.6395652294158936,
207
  "learning_rate": 1.7159763313609466e-05,
208
- "loss": 0.7492,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.2127659574468086,
213
- "grad_norm": 4.759531021118164,
214
  "learning_rate": 1.5680473372781066e-05,
215
- "loss": 0.5475,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.2393617021276597,
220
- "grad_norm": 2.811213970184326,
221
  "learning_rate": 1.4201183431952663e-05,
222
- "loss": 0.952,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.252659574468085,
227
  "eval_accuracy": 0.7875,
228
- "eval_loss": 0.7974634170532227,
229
- "eval_runtime": 12.6508,
230
- "eval_samples_per_second": 12.647,
231
- "eval_steps_per_second": 1.581,
232
  "step": 285
233
  },
234
  {
235
  "epoch": 3.0132978723404253,
236
- "grad_norm": 2.360816240310669,
237
  "learning_rate": 1.2721893491124262e-05,
238
- "loss": 0.6142,
239
  "step": 290
240
  },
241
  {
242
  "epoch": 3.0398936170212765,
243
- "grad_norm": 3.2934021949768066,
244
  "learning_rate": 1.1242603550295859e-05,
245
- "loss": 0.6263,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 3.0664893617021276,
250
- "grad_norm": 2.3915889263153076,
251
  "learning_rate": 9.763313609467455e-06,
252
- "loss": 0.6514,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 3.0930851063829787,
257
- "grad_norm": 3.9150514602661133,
258
  "learning_rate": 8.284023668639054e-06,
259
- "loss": 0.8365,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 3.11968085106383,
264
- "grad_norm": 3.9853973388671875,
265
  "learning_rate": 6.8047337278106515e-06,
266
- "loss": 0.7772,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 3.146276595744681,
271
- "grad_norm": 1.9402360916137695,
272
  "learning_rate": 5.325443786982249e-06,
273
- "loss": 0.8747,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 3.172872340425532,
278
- "grad_norm": 3.0497424602508545,
279
  "learning_rate": 3.846153846153847e-06,
280
- "loss": 0.5998,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 3.199468085106383,
285
- "grad_norm": 4.310546875,
286
  "learning_rate": 2.366863905325444e-06,
287
- "loss": 0.8328,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 3.226063829787234,
292
- "grad_norm": 3.943880796432495,
293
  "learning_rate": 8.875739644970415e-07,
294
- "loss": 0.8287,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 3.242021276595745,
299
  "eval_accuracy": 0.7875,
300
- "eval_loss": 0.7972937822341919,
301
- "eval_runtime": 11.2004,
302
- "eval_samples_per_second": 14.285,
303
- "eval_steps_per_second": 1.786,
304
  "step": 376
305
  },
306
  {
307
  "epoch": 3.242021276595745,
308
  "step": 376,
309
  "total_flos": 3.7220613152994755e+18,
310
- "train_loss": 0.7813464815312243,
311
- "train_runtime": 603.3994,
312
- "train_samples_per_second": 4.985,
313
- "train_steps_per_second": 0.623
314
  },
315
  {
316
  "epoch": 3.242021276595745,
317
  "eval_accuracy": 0.7633136094674556,
318
- "eval_loss": 0.8937366008758545,
319
- "eval_runtime": 12.7948,
320
- "eval_samples_per_second": 13.209,
321
- "eval_steps_per_second": 1.719,
322
  "step": 376
323
  },
324
  {
325
  "epoch": 3.242021276595745,
326
  "eval_accuracy": 0.7633136094674556,
327
- "eval_loss": 0.893736720085144,
328
- "eval_runtime": 11.4267,
329
- "eval_samples_per_second": 14.79,
330
- "eval_steps_per_second": 1.925,
331
  "step": 376
332
  }
333
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.026595744680851064,
13
+ "grad_norm": 12.65776252746582,
14
  "learning_rate": 1.3157894736842106e-05,
15
+ "loss": 1.7008,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05319148936170213,
20
+ "grad_norm": 6.046972274780273,
21
  "learning_rate": 2.6315789473684212e-05,
22
+ "loss": 0.9981,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0797872340425532,
27
+ "grad_norm": 9.531058311462402,
28
  "learning_rate": 3.9473684210526316e-05,
29
+ "loss": 0.9371,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10638297872340426,
34
+ "grad_norm": 6.416304111480713,
35
  "learning_rate": 4.970414201183432e-05,
36
+ "loss": 0.8702,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13297872340425532,
41
+ "grad_norm": 4.882875442504883,
42
  "learning_rate": 4.822485207100592e-05,
43
+ "loss": 0.5812,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.1595744680851064,
48
+ "grad_norm": 3.9771392345428467,
49
  "learning_rate": 4.674556213017752e-05,
50
+ "loss": 1.1064,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.18617021276595744,
55
+ "grad_norm": 2.8228275775909424,
56
  "learning_rate": 4.5266272189349114e-05,
57
+ "loss": 0.6824,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.2127659574468085,
62
+ "grad_norm": 2.11715030670166,
63
  "learning_rate": 4.378698224852072e-05,
64
+ "loss": 0.5582,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.2393617021276596,
69
+ "grad_norm": 1.6437604427337646,
70
  "learning_rate": 4.230769230769231e-05,
71
+ "loss": 0.7968,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.2526595744680851,
76
  "eval_accuracy": 0.7875,
77
+ "eval_loss": 0.7984516620635986,
78
+ "eval_runtime": 15.2987,
79
+ "eval_samples_per_second": 10.458,
80
+ "eval_steps_per_second": 1.307,
81
  "step": 95
82
  },
83
  {
84
  "epoch": 1.0132978723404256,
85
+ "grad_norm": 1.7461698055267334,
86
  "learning_rate": 4.0828402366863904e-05,
87
+ "loss": 0.6967,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 1.0398936170212767,
92
+ "grad_norm": 2.8989648818969727,
93
  "learning_rate": 3.934911242603551e-05,
94
+ "loss": 0.8106,
95
  "step": 110
96
  },
97
  {
98
  "epoch": 1.0664893617021276,
99
+ "grad_norm": 2.848541021347046,
100
  "learning_rate": 3.7869822485207104e-05,
101
+ "loss": 0.741,
102
  "step": 120
103
  },
104
  {
105
  "epoch": 1.0930851063829787,
106
+ "grad_norm": 2.6322543621063232,
107
  "learning_rate": 3.63905325443787e-05,
108
+ "loss": 0.7027,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 1.1196808510638299,
113
+ "grad_norm": 7.683568954467773,
114
  "learning_rate": 3.49112426035503e-05,
115
+ "loss": 0.6579,
116
  "step": 140
117
  },
118
  {
119
  "epoch": 1.1462765957446808,
120
+ "grad_norm": 1.7948863506317139,
121
  "learning_rate": 3.3431952662721895e-05,
122
+ "loss": 0.8164,
123
  "step": 150
124
  },
125
  {
126
  "epoch": 1.172872340425532,
127
+ "grad_norm": 1.682706594467163,
128
  "learning_rate": 3.195266272189349e-05,
129
+ "loss": 0.6967,
130
  "step": 160
131
  },
132
  {
133
  "epoch": 1.199468085106383,
134
+ "grad_norm": 3.870861530303955,
135
  "learning_rate": 3.047337278106509e-05,
136
+ "loss": 0.7218,
137
  "step": 170
138
  },
139
  {
140
  "epoch": 1.226063829787234,
141
+ "grad_norm": 1.6848540306091309,
142
  "learning_rate": 2.8994082840236685e-05,
143
+ "loss": 0.7805,
144
  "step": 180
145
  },
146
  {
147
  "epoch": 1.252659574468085,
148
+ "grad_norm": 5.371254920959473,
149
  "learning_rate": 2.751479289940829e-05,
150
+ "loss": 0.8744,
151
  "step": 190
152
  },
153
  {
154
  "epoch": 1.252659574468085,
155
  "eval_accuracy": 0.7875,
156
+ "eval_loss": 0.7836956977844238,
157
+ "eval_runtime": 11.0004,
158
+ "eval_samples_per_second": 14.545,
159
+ "eval_steps_per_second": 1.818,
160
  "step": 190
161
  },
162
  {
163
  "epoch": 2.026595744680851,
164
+ "grad_norm": 1.314697027206421,
165
  "learning_rate": 2.6035502958579882e-05,
166
+ "loss": 0.7122,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 2.0531914893617023,
171
+ "grad_norm": 1.7407877445220947,
172
  "learning_rate": 2.4556213017751482e-05,
173
+ "loss": 0.7189,
174
  "step": 210
175
  },
176
  {
177
  "epoch": 2.0797872340425534,
178
+ "grad_norm": 2.266850471496582,
179
  "learning_rate": 2.307692307692308e-05,
180
+ "loss": 0.8913,
181
  "step": 220
182
  },
183
  {
184
  "epoch": 2.106382978723404,
185
+ "grad_norm": 2.1507058143615723,
186
  "learning_rate": 2.1597633136094676e-05,
187
+ "loss": 0.8554,
188
  "step": 230
189
  },
190
  {
191
  "epoch": 2.132978723404255,
192
+ "grad_norm": 4.038496971130371,
193
  "learning_rate": 2.0118343195266273e-05,
194
+ "loss": 0.6919,
195
  "step": 240
196
  },
197
  {
198
  "epoch": 2.1595744680851063,
199
+ "grad_norm": 4.005491256713867,
200
  "learning_rate": 1.8639053254437873e-05,
201
+ "loss": 0.7086,
202
  "step": 250
203
  },
204
  {
205
  "epoch": 2.1861702127659575,
206
+ "grad_norm": 3.721208095550537,
207
  "learning_rate": 1.7159763313609466e-05,
208
+ "loss": 0.7478,
209
  "step": 260
210
  },
211
  {
212
  "epoch": 2.2127659574468086,
213
+ "grad_norm": 4.95754337310791,
214
  "learning_rate": 1.5680473372781066e-05,
215
+ "loss": 0.5457,
216
  "step": 270
217
  },
218
  {
219
  "epoch": 2.2393617021276597,
220
+ "grad_norm": 2.7324411869049072,
221
  "learning_rate": 1.4201183431952663e-05,
222
+ "loss": 0.9479,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 2.252659574468085,
227
  "eval_accuracy": 0.7875,
228
+ "eval_loss": 0.7812246084213257,
229
+ "eval_runtime": 11.2394,
230
+ "eval_samples_per_second": 14.236,
231
+ "eval_steps_per_second": 1.779,
232
  "step": 285
233
  },
234
  {
235
  "epoch": 3.0132978723404253,
236
+ "grad_norm": 2.182974338531494,
237
  "learning_rate": 1.2721893491124262e-05,
238
+ "loss": 0.6173,
239
  "step": 290
240
  },
241
  {
242
  "epoch": 3.0398936170212765,
243
+ "grad_norm": 3.512352228164673,
244
  "learning_rate": 1.1242603550295859e-05,
245
+ "loss": 0.6313,
246
  "step": 300
247
  },
248
  {
249
  "epoch": 3.0664893617021276,
250
+ "grad_norm": 2.436189651489258,
251
  "learning_rate": 9.763313609467455e-06,
252
+ "loss": 0.6465,
253
  "step": 310
254
  },
255
  {
256
  "epoch": 3.0930851063829787,
257
+ "grad_norm": 4.008547782897949,
258
  "learning_rate": 8.284023668639054e-06,
259
+ "loss": 0.8407,
260
  "step": 320
261
  },
262
  {
263
  "epoch": 3.11968085106383,
264
+ "grad_norm": 4.314539909362793,
265
  "learning_rate": 6.8047337278106515e-06,
266
+ "loss": 0.7826,
267
  "step": 330
268
  },
269
  {
270
  "epoch": 3.146276595744681,
271
+ "grad_norm": 2.1060965061187744,
272
  "learning_rate": 5.325443786982249e-06,
273
+ "loss": 0.8714,
274
  "step": 340
275
  },
276
  {
277
  "epoch": 3.172872340425532,
278
+ "grad_norm": 3.250807046890259,
279
  "learning_rate": 3.846153846153847e-06,
280
+ "loss": 0.6124,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 3.199468085106383,
285
+ "grad_norm": 4.283337593078613,
286
  "learning_rate": 2.366863905325444e-06,
287
+ "loss": 0.8395,
288
  "step": 360
289
  },
290
  {
291
  "epoch": 3.226063829787234,
292
+ "grad_norm": 4.142538070678711,
293
  "learning_rate": 8.875739644970415e-07,
294
+ "loss": 0.8282,
295
  "step": 370
296
  },
297
  {
298
  "epoch": 3.242021276595745,
299
  "eval_accuracy": 0.7875,
300
+ "eval_loss": 0.7884188294410706,
301
+ "eval_runtime": 11.8341,
302
+ "eval_samples_per_second": 13.52,
303
+ "eval_steps_per_second": 1.69,
304
  "step": 376
305
  },
306
  {
307
  "epoch": 3.242021276595745,
308
  "step": 376,
309
  "total_flos": 3.7220613152994755e+18,
310
+ "train_loss": 0.785655234088289,
311
+ "train_runtime": 616.1212,
312
+ "train_samples_per_second": 4.882,
313
+ "train_steps_per_second": 0.61
314
  },
315
  {
316
  "epoch": 3.242021276595745,
317
  "eval_accuracy": 0.7633136094674556,
318
+ "eval_loss": 0.8817278742790222,
319
+ "eval_runtime": 19.9254,
320
+ "eval_samples_per_second": 8.482,
321
+ "eval_steps_per_second": 1.104,
322
  "step": 376
323
  },
324
  {
325
  "epoch": 3.242021276595745,
326
  "eval_accuracy": 0.7633136094674556,
327
+ "eval_loss": 0.8817277550697327,
328
+ "eval_runtime": 14.8211,
329
+ "eval_samples_per_second": 11.403,
330
+ "eval_steps_per_second": 1.484,
331
  "step": 376
332
  }
333
  ],