RikkiXu commited on
Commit
691e859
1 Parent(s): 45ce72f

Model save

Browse files
Files changed (3) hide show
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +51 -419
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "train_loss": 0.03106498374777325,
4
- "train_runtime": 2739.2502,
5
  "train_samples": 6599,
6
- "train_samples_per_second": 13.728,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.908859632783017,
4
+ "train_runtime": 1626.0912,
5
  "train_samples": 6599,
6
+ "train_samples_per_second": 4.625,
7
+ "train_steps_per_second": 0.036
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 5.0,
3
- "train_loss": 0.03106498374777325,
4
- "train_runtime": 2739.2502,
5
  "train_samples": 6599,
6
- "train_samples_per_second": 13.728,
7
- "train_steps_per_second": 0.108
8
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "train_loss": 0.908859632783017,
4
+ "train_runtime": 1626.0912,
5
  "train_samples": 6599,
6
+ "train_samples_per_second": 4.625,
7
+ "train_steps_per_second": 0.036
8
  }
trainer_state.json CHANGED
@@ -1,489 +1,121 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 295,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
- "grad_norm": 4.7237044668329,
14
- "learning_rate": 6.666666666666667e-07,
15
  "loss": 0.9545,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.08,
20
- "grad_norm": 3.1773052856369324,
21
- "learning_rate": 3.3333333333333333e-06,
22
- "loss": 0.9326,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.17,
27
- "grad_norm": 3.033793297452865,
28
- "learning_rate": 6.666666666666667e-06,
29
- "loss": 0.8956,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.25,
34
- "grad_norm": 2.0855419235541013,
35
- "learning_rate": 1e-05,
36
- "loss": 0.8574,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.34,
41
- "grad_norm": 2.0199202255230047,
42
- "learning_rate": 1.3333333333333333e-05,
43
- "loss": 0.8222,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.42,
48
- "grad_norm": 2.3471928271965106,
49
- "learning_rate": 1.6666666666666667e-05,
50
- "loss": 0.8138,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.51,
55
- "grad_norm": 2.600668692972917,
56
- "learning_rate": 2e-05,
57
- "loss": 0.7721,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.59,
62
- "grad_norm": 2.4226542415206485,
63
- "learning_rate": 1.9982437317643218e-05,
64
- "loss": 0.7634,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.68,
69
- "grad_norm": 2.850819690314508,
70
- "learning_rate": 1.992981096013517e-05,
71
- "loss": 0.7521,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.76,
76
- "grad_norm": 2.5549789377169114,
77
- "learning_rate": 1.984230577947597e-05,
78
- "loss": 0.697,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.85,
83
- "grad_norm": 2.4774565913803563,
84
- "learning_rate": 1.972022914080411e-05,
85
- "loss": 0.6735,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.93,
90
- "grad_norm": 2.146208038921012,
91
- "learning_rate": 1.9564009842765225e-05,
92
- "loss": 0.6546,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 1.0,
97
- "eval_loss": 0.4448932409286499,
98
- "eval_runtime": 334.8516,
99
- "eval_samples_per_second": 22.461,
100
- "eval_steps_per_second": 0.352,
101
  "step": 59
102
  },
103
  {
104
- "epoch": 1.02,
105
- "grad_norm": 3.923304638941212,
106
- "learning_rate": 1.9374196611341212e-05,
107
- "loss": 0.6271,
108
- "step": 60
109
- },
110
- {
111
- "epoch": 1.1,
112
- "grad_norm": 3.1379021048274596,
113
- "learning_rate": 1.9151456172430186e-05,
114
- "loss": 0.4188,
115
- "step": 65
116
- },
117
- {
118
- "epoch": 1.19,
119
- "grad_norm": 2.478413663736477,
120
- "learning_rate": 1.8896570909947477e-05,
121
- "loss": 0.4197,
122
- "step": 70
123
- },
124
- {
125
- "epoch": 1.27,
126
- "grad_norm": 3.1965773364967114,
127
- "learning_rate": 1.8610436117673557e-05,
128
- "loss": 0.3996,
129
- "step": 75
130
- },
131
- {
132
- "epoch": 1.36,
133
- "grad_norm": 1.8635605447479031,
134
- "learning_rate": 1.829405685450202e-05,
135
- "loss": 0.4087,
136
- "step": 80
137
- },
138
- {
139
- "epoch": 1.44,
140
- "grad_norm": 4.049894316503265,
141
- "learning_rate": 1.7948544414133534e-05,
142
- "loss": 0.4113,
143
- "step": 85
144
- },
145
- {
146
- "epoch": 1.53,
147
- "grad_norm": 175.82031352565957,
148
- "learning_rate": 1.7575112421616203e-05,
149
- "loss": 0.7264,
150
- "step": 90
151
- },
152
- {
153
- "epoch": 1.61,
154
- "grad_norm": 97.72323752808755,
155
- "learning_rate": 1.717507257044331e-05,
156
- "loss": 0.7019,
157
- "step": 95
158
- },
159
- {
160
- "epoch": 1.69,
161
- "grad_norm": 10.439473368823062,
162
- "learning_rate": 1.6749830015182106e-05,
163
- "loss": 0.8288,
164
- "step": 100
165
- },
166
- {
167
- "epoch": 1.78,
168
- "grad_norm": 14.65204562703183,
169
- "learning_rate": 1.6300878435817115e-05,
170
- "loss": 0.622,
171
- "step": 105
172
- },
173
- {
174
- "epoch": 1.86,
175
- "grad_norm": 4.935916944271238,
176
- "learning_rate": 1.5829794791144723e-05,
177
- "loss": 0.5399,
178
- "step": 110
179
- },
180
- {
181
- "epoch": 1.95,
182
- "grad_norm": 3.1671885548837544,
183
- "learning_rate": 1.533823377964791e-05,
184
- "loss": 0.4747,
185
- "step": 115
186
- },
187
- {
188
- "epoch": 2.0,
189
- "eval_loss": 0.31807941198349,
190
- "eval_runtime": 335.1235,
191
- "eval_samples_per_second": 22.442,
192
- "eval_steps_per_second": 0.352,
193
- "step": 118
194
- },
195
- {
196
- "epoch": 2.03,
197
- "grad_norm": 2.7978128026277864,
198
- "learning_rate": 1.482792202730745e-05,
199
- "loss": 0.3959,
200
- "step": 120
201
- },
202
- {
203
- "epoch": 2.12,
204
- "grad_norm": 2.1385098508541693,
205
- "learning_rate": 1.4300652022765207e-05,
206
- "loss": 0.2884,
207
- "step": 125
208
- },
209
- {
210
- "epoch": 2.2,
211
- "grad_norm": 1.9703828411806736,
212
- "learning_rate": 1.3758275821142382e-05,
213
- "loss": 0.2745,
214
- "step": 130
215
- },
216
- {
217
- "epoch": 2.29,
218
- "grad_norm": 1.870032223898805,
219
- "learning_rate": 1.3202698538628376e-05,
220
- "loss": 0.2539,
221
- "step": 135
222
- },
223
- {
224
- "epoch": 2.37,
225
- "grad_norm": 7.255082480038806,
226
- "learning_rate": 1.2635871660690677e-05,
227
- "loss": 0.2588,
228
- "step": 140
229
- },
230
- {
231
- "epoch": 2.46,
232
- "grad_norm": 1.7733158231347383,
233
- "learning_rate": 1.2059786187410984e-05,
234
- "loss": 0.2564,
235
- "step": 145
236
- },
237
- {
238
- "epoch": 2.54,
239
- "grad_norm": 1.9182852956652028,
240
- "learning_rate": 1.1476465640024814e-05,
241
- "loss": 0.2282,
242
- "step": 150
243
- },
244
- {
245
- "epoch": 2.63,
246
- "grad_norm": 1.6601552566584281,
247
- "learning_rate": 1.0887958953229349e-05,
248
- "loss": 0.2323,
249
- "step": 155
250
- },
251
- {
252
- "epoch": 2.71,
253
- "grad_norm": 1.4919318406362796,
254
- "learning_rate": 1.0296333278225599e-05,
255
- "loss": 0.2354,
256
- "step": 160
257
- },
258
- {
259
- "epoch": 2.8,
260
- "grad_norm": 1.5076718836100451,
261
- "learning_rate": 9.703666721774403e-06,
262
- "loss": 0.23,
263
- "step": 165
264
- },
265
- {
266
- "epoch": 2.88,
267
- "grad_norm": 1.703882565208849,
268
- "learning_rate": 9.112041046770653e-06,
269
- "loss": 0.2312,
270
- "step": 170
271
- },
272
- {
273
- "epoch": 2.97,
274
- "grad_norm": 1.3815318382548532,
275
- "learning_rate": 8.52353435997519e-06,
276
- "loss": 0.2246,
277
- "step": 175
278
- },
279
- {
280
- "epoch": 3.0,
281
- "eval_loss": 0.12802040576934814,
282
- "eval_runtime": 335.0335,
283
- "eval_samples_per_second": 22.449,
284
- "eval_steps_per_second": 0.352,
285
- "step": 177
286
- },
287
- {
288
- "epoch": 3.05,
289
- "grad_norm": 1.340084619420041,
290
- "learning_rate": 7.940213812589018e-06,
291
- "loss": 0.1568,
292
- "step": 180
293
- },
294
- {
295
- "epoch": 3.14,
296
- "grad_norm": 1.4363530293138602,
297
- "learning_rate": 7.364128339309326e-06,
298
- "loss": 0.1171,
299
- "step": 185
300
- },
301
- {
302
- "epoch": 3.22,
303
- "grad_norm": 7.055735555930357,
304
- "learning_rate": 6.797301461371626e-06,
305
- "loss": 0.129,
306
- "step": 190
307
- },
308
- {
309
- "epoch": 3.31,
310
- "grad_norm": 3.3324655424159606,
311
- "learning_rate": 6.241724178857621e-06,
312
- "loss": 0.15,
313
- "step": 195
314
- },
315
- {
316
- "epoch": 3.39,
317
- "grad_norm": 2.4413759201237917,
318
- "learning_rate": 5.699347977234799e-06,
319
- "loss": 0.137,
320
- "step": 200
321
- },
322
- {
323
- "epoch": 3.47,
324
- "grad_norm": 1.6325745641721867,
325
- "learning_rate": 5.172077972692553e-06,
326
- "loss": 0.1218,
327
- "step": 205
328
- },
329
- {
330
- "epoch": 3.56,
331
- "grad_norm": 1.5192997389176583,
332
- "learning_rate": 4.661766220352098e-06,
333
- "loss": 0.1241,
334
- "step": 210
335
- },
336
- {
337
- "epoch": 3.64,
338
- "grad_norm": 1.416532576646694,
339
- "learning_rate": 4.170205208855281e-06,
340
- "loss": 0.1163,
341
- "step": 215
342
- },
343
- {
344
- "epoch": 3.73,
345
- "grad_norm": 1.1859416249764543,
346
- "learning_rate": 3.6991215641828903e-06,
347
- "loss": 0.1212,
348
- "step": 220
349
- },
350
- {
351
- "epoch": 3.81,
352
- "grad_norm": 1.2425425377133077,
353
- "learning_rate": 3.250169984817897e-06,
354
- "loss": 0.1261,
355
- "step": 225
356
- },
357
- {
358
- "epoch": 3.9,
359
- "grad_norm": 1.197573551306019,
360
- "learning_rate": 2.8249274295566863e-06,
361
- "loss": 0.1206,
362
- "step": 230
363
- },
364
- {
365
- "epoch": 3.98,
366
- "grad_norm": 1.2203991841318311,
367
- "learning_rate": 2.424887578383799e-06,
368
- "loss": 0.1179,
369
- "step": 235
370
- },
371
- {
372
- "epoch": 4.0,
373
- "eval_loss": 0.08678202331066132,
374
- "eval_runtime": 334.8445,
375
- "eval_samples_per_second": 22.461,
376
- "eval_steps_per_second": 0.352,
377
- "step": 236
378
- },
379
- {
380
- "epoch": 4.07,
381
- "grad_norm": 1.3225710226725387,
382
- "learning_rate": 2.0514555858664663e-06,
383
- "loss": 0.0919,
384
- "step": 240
385
- },
386
- {
387
- "epoch": 4.15,
388
- "grad_norm": 1.2828578734194267,
389
- "learning_rate": 1.7059431454979825e-06,
390
- "loss": 0.0833,
391
- "step": 245
392
- },
393
- {
394
- "epoch": 4.24,
395
- "grad_norm": 1.1833858250329508,
396
- "learning_rate": 1.3895638823264447e-06,
397
- "loss": 0.0861,
398
- "step": 250
399
- },
400
- {
401
- "epoch": 4.32,
402
- "grad_norm": 0.9459675579164174,
403
- "learning_rate": 1.1034290900525279e-06,
404
- "loss": 0.0749,
405
- "step": 255
406
- },
407
- {
408
- "epoch": 4.41,
409
- "grad_norm": 0.968509379642604,
410
- "learning_rate": 8.485438275698154e-07,
411
- "loss": 0.0847,
412
- "step": 260
413
- },
414
- {
415
- "epoch": 4.49,
416
- "grad_norm": 1.021637819526785,
417
- "learning_rate": 6.258033886587911e-07,
418
- "loss": 0.0832,
419
- "step": 265
420
- },
421
- {
422
- "epoch": 4.58,
423
- "grad_norm": 0.9348358669170516,
424
- "learning_rate": 4.359901572347758e-07,
425
- "loss": 0.0819,
426
- "step": 270
427
- },
428
- {
429
- "epoch": 4.66,
430
- "grad_norm": 0.9018557213311438,
431
- "learning_rate": 2.7977085919589253e-07,
432
- "loss": 0.081,
433
- "step": 275
434
- },
435
- {
436
- "epoch": 4.75,
437
- "grad_norm": 0.9143406331670494,
438
- "learning_rate": 1.5769422052403172e-07,
439
- "loss": 0.0777,
440
- "step": 280
441
- },
442
- {
443
- "epoch": 4.83,
444
- "grad_norm": 0.965550232388536,
445
- "learning_rate": 7.018903986483083e-08,
446
- "loss": 0.0804,
447
- "step": 285
448
- },
449
- {
450
- "epoch": 4.92,
451
- "grad_norm": 0.982878932025226,
452
- "learning_rate": 1.7562682356786488e-08,
453
- "loss": 0.0842,
454
- "step": 290
455
- },
456
- {
457
- "epoch": 5.0,
458
- "grad_norm": 0.8492670680881892,
459
- "learning_rate": 0.0,
460
- "loss": 0.0755,
461
- "step": 295
462
- },
463
- {
464
- "epoch": 5.0,
465
- "eval_loss": 0.07571765035390854,
466
- "eval_runtime": 334.5414,
467
- "eval_samples_per_second": 22.482,
468
- "eval_steps_per_second": 0.353,
469
- "step": 295
470
- },
471
- {
472
- "epoch": 5.0,
473
- "step": 295,
474
- "total_flos": 247067993702400.0,
475
- "train_loss": 0.03106498374777325,
476
- "train_runtime": 2739.2502,
477
- "train_samples_per_second": 13.728,
478
- "train_steps_per_second": 0.108
479
  }
480
  ],
481
  "logging_steps": 5,
482
- "max_steps": 295,
483
  "num_input_tokens_seen": 0,
484
- "num_train_epochs": 5,
485
  "save_steps": 100,
486
- "total_flos": 247067993702400.0,
487
  "train_batch_size": 16,
488
  "trial_name": null,
489
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 59,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.02,
13
+ "grad_norm": 4.72473337945939,
14
+ "learning_rate": 3.3333333333333333e-06,
15
  "loss": 0.9545,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.08,
20
+ "grad_norm": 15.606619949426117,
21
+ "learning_rate": 1.6666666666666667e-05,
22
+ "loss": 0.9459,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.17,
27
+ "grad_norm": 97.5546177933576,
28
+ "learning_rate": 1.972022914080411e-05,
29
+ "loss": 1.309,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.25,
34
+ "grad_norm": 7.451729536881689,
35
+ "learning_rate": 1.8610436117673557e-05,
36
+ "loss": 1.1022,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.34,
41
+ "grad_norm": 4.013413307542302,
42
+ "learning_rate": 1.6749830015182106e-05,
43
+ "loss": 0.9536,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.42,
48
+ "grad_norm": 2.5281389594304153,
49
+ "learning_rate": 1.4300652022765207e-05,
50
+ "loss": 0.9122,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.51,
55
+ "grad_norm": 2.4087072108196184,
56
+ "learning_rate": 1.1476465640024814e-05,
57
+ "loss": 0.8548,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.59,
62
+ "grad_norm": 2.127180353057553,
63
+ "learning_rate": 8.52353435997519e-06,
64
+ "loss": 0.8386,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.68,
69
+ "grad_norm": 1.9083395084428687,
70
+ "learning_rate": 5.699347977234799e-06,
71
+ "loss": 0.8324,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.76,
76
+ "grad_norm": 1.7712149272954687,
77
+ "learning_rate": 3.250169984817897e-06,
78
+ "loss": 0.7792,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.85,
83
+ "grad_norm": 1.6965019256204015,
84
+ "learning_rate": 1.3895638823264447e-06,
85
+ "loss": 0.77,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.93,
90
+ "grad_norm": 1.6261522519462899,
91
+ "learning_rate": 2.7977085919589253e-07,
92
+ "loss": 0.7609,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 1.0,
97
+ "eval_loss": 0.7076106071472168,
98
+ "eval_runtime": 338.673,
99
+ "eval_samples_per_second": 22.207,
100
+ "eval_steps_per_second": 0.348,
101
  "step": 59
102
  },
103
  {
104
+ "epoch": 1.0,
105
+ "step": 59,
106
+ "total_flos": 49413598740480.0,
107
+ "train_loss": 0.908859632783017,
108
+ "train_runtime": 1626.0912,
109
+ "train_samples_per_second": 4.625,
110
+ "train_steps_per_second": 0.036
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  }
112
  ],
113
  "logging_steps": 5,
114
+ "max_steps": 59,
115
  "num_input_tokens_seen": 0,
116
+ "num_train_epochs": 1,
117
  "save_steps": 100,
118
+ "total_flos": 49413598740480.0,
119
  "train_batch_size": 16,
120
  "trial_name": null,
121
  "trial_params": null