{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 924,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.032467532467532464,
      "grad_norm": 6.265174130511246,
      "learning_rate": 5e-06,
      "loss": 0.9365,
      "step": 10
    },
    {
      "epoch": 0.06493506493506493,
      "grad_norm": 1.3770366162111063,
      "learning_rate": 5e-06,
      "loss": 0.8189,
      "step": 20
    },
    {
      "epoch": 0.09740259740259741,
      "grad_norm": 1.0978187926506935,
      "learning_rate": 5e-06,
      "loss": 0.7833,
      "step": 30
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 0.981210956561077,
      "learning_rate": 5e-06,
      "loss": 0.7556,
      "step": 40
    },
    {
      "epoch": 0.16233766233766234,
      "grad_norm": 0.9693356495883646,
      "learning_rate": 5e-06,
      "loss": 0.7414,
      "step": 50
    },
    {
      "epoch": 0.19480519480519481,
      "grad_norm": 1.0030466632996962,
      "learning_rate": 5e-06,
      "loss": 0.7236,
      "step": 60
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 1.1260479581636729,
      "learning_rate": 5e-06,
      "loss": 0.7169,
      "step": 70
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 0.7460895709589158,
      "learning_rate": 5e-06,
      "loss": 0.7168,
      "step": 80
    },
    {
      "epoch": 0.2922077922077922,
      "grad_norm": 0.9486986195334304,
      "learning_rate": 5e-06,
      "loss": 0.7031,
      "step": 90
    },
    {
      "epoch": 0.3246753246753247,
      "grad_norm": 0.7051606863668234,
      "learning_rate": 5e-06,
      "loss": 0.6974,
      "step": 100
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.6756560441314118,
      "learning_rate": 5e-06,
      "loss": 0.692,
      "step": 110
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 0.5240739369726283,
      "learning_rate": 5e-06,
      "loss": 0.693,
      "step": 120
    },
    {
      "epoch": 0.42207792207792205,
      "grad_norm": 0.5785376996044719,
      "learning_rate": 5e-06,
      "loss": 0.689,
      "step": 130
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.5933112722295861,
      "learning_rate": 5e-06,
      "loss": 0.691,
      "step": 140
    },
    {
      "epoch": 0.487012987012987,
      "grad_norm": 0.6077588775577111,
      "learning_rate": 5e-06,
      "loss": 0.6806,
      "step": 150
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 0.6238054182160374,
      "learning_rate": 5e-06,
      "loss": 0.6817,
      "step": 160
    },
    {
      "epoch": 0.551948051948052,
      "grad_norm": 0.6081002667408969,
      "learning_rate": 5e-06,
      "loss": 0.6807,
      "step": 170
    },
    {
      "epoch": 0.5844155844155844,
      "grad_norm": 0.6219764792866612,
      "learning_rate": 5e-06,
      "loss": 0.6773,
      "step": 180
    },
    {
      "epoch": 0.6168831168831169,
      "grad_norm": 0.739529394087955,
      "learning_rate": 5e-06,
      "loss": 0.6795,
      "step": 190
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 0.7524681424985254,
      "learning_rate": 5e-06,
      "loss": 0.6723,
      "step": 200
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 0.6010281827966147,
      "learning_rate": 5e-06,
      "loss": 0.6709,
      "step": 210
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.6611165599870378,
      "learning_rate": 5e-06,
      "loss": 0.6692,
      "step": 220
    },
    {
      "epoch": 0.7467532467532467,
      "grad_norm": 0.8344801352021102,
      "learning_rate": 5e-06,
      "loss": 0.6738,
      "step": 230
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 0.7851764850319622,
      "learning_rate": 5e-06,
      "loss": 0.6692,
      "step": 240
    },
    {
      "epoch": 0.8116883116883117,
      "grad_norm": 0.6068138322416587,
      "learning_rate": 5e-06,
      "loss": 0.6693,
      "step": 250
    },
    {
      "epoch": 0.8441558441558441,
      "grad_norm": 0.5781959225993195,
      "learning_rate": 5e-06,
      "loss": 0.6698,
      "step": 260
    },
    {
      "epoch": 0.8766233766233766,
      "grad_norm": 0.7049586430934481,
      "learning_rate": 5e-06,
      "loss": 0.672,
      "step": 270
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.6323170370591866,
      "learning_rate": 5e-06,
      "loss": 0.6668,
      "step": 280
    },
    {
      "epoch": 0.9415584415584416,
      "grad_norm": 0.881618301887001,
      "learning_rate": 5e-06,
      "loss": 0.6706,
      "step": 290
    },
    {
      "epoch": 0.974025974025974,
      "grad_norm": 0.5219254149696031,
      "learning_rate": 5e-06,
      "loss": 0.6679,
      "step": 300
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6691647171974182,
      "eval_runtime": 30.5714,
      "eval_samples_per_second": 270.972,
      "eval_steps_per_second": 1.079,
      "step": 308
    },
    {
      "epoch": 1.0064935064935066,
      "grad_norm": 0.7028333992981749,
      "learning_rate": 5e-06,
      "loss": 0.6597,
      "step": 310
    },
    {
      "epoch": 1.0389610389610389,
      "grad_norm": 0.620216384870711,
      "learning_rate": 5e-06,
      "loss": 0.6178,
      "step": 320
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.6279544966110486,
      "learning_rate": 5e-06,
      "loss": 0.6196,
      "step": 330
    },
    {
      "epoch": 1.103896103896104,
      "grad_norm": 0.47691022078448675,
      "learning_rate": 5e-06,
      "loss": 0.6213,
      "step": 340
    },
    {
      "epoch": 1.1363636363636362,
      "grad_norm": 0.6169659732755709,
      "learning_rate": 5e-06,
      "loss": 0.6189,
      "step": 350
    },
    {
      "epoch": 1.1688311688311688,
      "grad_norm": 0.6930896730291389,
      "learning_rate": 5e-06,
      "loss": 0.6179,
      "step": 360
    },
    {
      "epoch": 1.2012987012987013,
      "grad_norm": 0.5888468229519391,
      "learning_rate": 5e-06,
      "loss": 0.6193,
      "step": 370
    },
    {
      "epoch": 1.2337662337662338,
      "grad_norm": 0.5114807666495347,
      "learning_rate": 5e-06,
      "loss": 0.6205,
      "step": 380
    },
    {
      "epoch": 1.2662337662337662,
      "grad_norm": 0.576480885597218,
      "learning_rate": 5e-06,
      "loss": 0.6143,
      "step": 390
    },
    {
      "epoch": 1.2987012987012987,
      "grad_norm": 0.9781557440302872,
      "learning_rate": 5e-06,
      "loss": 0.616,
      "step": 400
    },
    {
      "epoch": 1.3311688311688312,
      "grad_norm": 0.5493968761484528,
      "learning_rate": 5e-06,
      "loss": 0.6181,
      "step": 410
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 0.8450188883114491,
      "learning_rate": 5e-06,
      "loss": 0.6186,
      "step": 420
    },
    {
      "epoch": 1.396103896103896,
      "grad_norm": 0.6672141224772778,
      "learning_rate": 5e-06,
      "loss": 0.6182,
      "step": 430
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.5436445484738832,
      "learning_rate": 5e-06,
      "loss": 0.6147,
      "step": 440
    },
    {
      "epoch": 1.4610389610389611,
      "grad_norm": 0.5830504026660146,
      "learning_rate": 5e-06,
      "loss": 0.6179,
      "step": 450
    },
    {
      "epoch": 1.4935064935064934,
      "grad_norm": 0.6473257236943104,
      "learning_rate": 5e-06,
      "loss": 0.6199,
      "step": 460
    },
    {
      "epoch": 1.525974025974026,
      "grad_norm": 0.5427880278607804,
      "learning_rate": 5e-06,
      "loss": 0.6168,
      "step": 470
    },
    {
      "epoch": 1.5584415584415585,
      "grad_norm": 0.5689580314401272,
      "learning_rate": 5e-06,
      "loss": 0.6159,
      "step": 480
    },
    {
      "epoch": 1.5909090909090908,
      "grad_norm": 0.597927845953086,
      "learning_rate": 5e-06,
      "loss": 0.6175,
      "step": 490
    },
    {
      "epoch": 1.6233766233766234,
      "grad_norm": 0.6128642707216239,
      "learning_rate": 5e-06,
      "loss": 0.6163,
      "step": 500
    },
    {
      "epoch": 1.655844155844156,
      "grad_norm": 0.5455974938431143,
      "learning_rate": 5e-06,
      "loss": 0.6168,
      "step": 510
    },
    {
      "epoch": 1.6883116883116882,
      "grad_norm": 0.5153120159264221,
      "learning_rate": 5e-06,
      "loss": 0.6204,
      "step": 520
    },
    {
      "epoch": 1.7207792207792207,
      "grad_norm": 0.5767601324955324,
      "learning_rate": 5e-06,
      "loss": 0.619,
      "step": 530
    },
    {
      "epoch": 1.7532467532467533,
      "grad_norm": 0.5856685996311523,
      "learning_rate": 5e-06,
      "loss": 0.6195,
      "step": 540
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.5318505472371191,
      "learning_rate": 5e-06,
      "loss": 0.6142,
      "step": 550
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.5867934386348821,
      "learning_rate": 5e-06,
      "loss": 0.6163,
      "step": 560
    },
    {
      "epoch": 1.8506493506493507,
      "grad_norm": 0.5460100351131841,
      "learning_rate": 5e-06,
      "loss": 0.6209,
      "step": 570
    },
    {
      "epoch": 1.883116883116883,
      "grad_norm": 0.5930045996717794,
      "learning_rate": 5e-06,
      "loss": 0.6173,
      "step": 580
    },
    {
      "epoch": 1.9155844155844157,
      "grad_norm": 0.5210440244932204,
      "learning_rate": 5e-06,
      "loss": 0.6172,
      "step": 590
    },
    {
      "epoch": 1.948051948051948,
      "grad_norm": 0.678263024145128,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 600
    },
    {
      "epoch": 1.9805194805194806,
      "grad_norm": 0.5193570456262979,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 610
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6595985293388367,
      "eval_runtime": 30.7002,
      "eval_samples_per_second": 269.835,
      "eval_steps_per_second": 1.075,
      "step": 616
    },
    {
      "epoch": 2.012987012987013,
      "grad_norm": 0.997715568070811,
      "learning_rate": 5e-06,
      "loss": 0.5894,
      "step": 620
    },
    {
      "epoch": 2.0454545454545454,
      "grad_norm": 0.7592737794541236,
      "learning_rate": 5e-06,
      "loss": 0.5677,
      "step": 630
    },
    {
      "epoch": 2.0779220779220777,
      "grad_norm": 0.6056584838864539,
      "learning_rate": 5e-06,
      "loss": 0.5669,
      "step": 640
    },
    {
      "epoch": 2.1103896103896105,
      "grad_norm": 0.6211039916928865,
      "learning_rate": 5e-06,
      "loss": 0.5697,
      "step": 650
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.615106862254971,
      "learning_rate": 5e-06,
      "loss": 0.5693,
      "step": 660
    },
    {
      "epoch": 2.175324675324675,
      "grad_norm": 0.566417720964845,
      "learning_rate": 5e-06,
      "loss": 0.577,
      "step": 670
    },
    {
      "epoch": 2.207792207792208,
      "grad_norm": 0.5480748428783726,
      "learning_rate": 5e-06,
      "loss": 0.5724,
      "step": 680
    },
    {
      "epoch": 2.24025974025974,
      "grad_norm": 0.6883572551516758,
      "learning_rate": 5e-06,
      "loss": 0.5663,
      "step": 690
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 0.7624758724871575,
      "learning_rate": 5e-06,
      "loss": 0.5721,
      "step": 700
    },
    {
      "epoch": 2.3051948051948052,
      "grad_norm": 0.5925041865618843,
      "learning_rate": 5e-06,
      "loss": 0.5718,
      "step": 710
    },
    {
      "epoch": 2.3376623376623376,
      "grad_norm": 0.5423034645452969,
      "learning_rate": 5e-06,
      "loss": 0.5681,
      "step": 720
    },
    {
      "epoch": 2.3701298701298703,
      "grad_norm": 0.5480316834860852,
      "learning_rate": 5e-06,
      "loss": 0.5722,
      "step": 730
    },
    {
      "epoch": 2.4025974025974026,
      "grad_norm": 0.5169062030347897,
      "learning_rate": 5e-06,
      "loss": 0.578,
      "step": 740
    },
    {
      "epoch": 2.435064935064935,
      "grad_norm": 0.5457808079840645,
      "learning_rate": 5e-06,
      "loss": 0.57,
      "step": 750
    },
    {
      "epoch": 2.4675324675324677,
      "grad_norm": 0.5470205045138103,
      "learning_rate": 5e-06,
      "loss": 0.5726,
      "step": 760
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.5125136364795218,
      "learning_rate": 5e-06,
      "loss": 0.5693,
      "step": 770
    },
    {
      "epoch": 2.5324675324675323,
      "grad_norm": 0.5945664415971015,
      "learning_rate": 5e-06,
      "loss": 0.5714,
      "step": 780
    },
    {
      "epoch": 2.564935064935065,
      "grad_norm": 0.5702694037641614,
      "learning_rate": 5e-06,
      "loss": 0.5689,
      "step": 790
    },
    {
      "epoch": 2.5974025974025974,
      "grad_norm": 0.5441374726350022,
      "learning_rate": 5e-06,
      "loss": 0.5742,
      "step": 800
    },
    {
      "epoch": 2.62987012987013,
      "grad_norm": 0.5674621294447999,
      "learning_rate": 5e-06,
      "loss": 0.5687,
      "step": 810
    },
    {
      "epoch": 2.6623376623376624,
      "grad_norm": 0.5997098488587294,
      "learning_rate": 5e-06,
      "loss": 0.5763,
      "step": 820
    },
    {
      "epoch": 2.6948051948051948,
      "grad_norm": 0.6199757649220302,
      "learning_rate": 5e-06,
      "loss": 0.5747,
      "step": 830
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.6911213249901123,
      "learning_rate": 5e-06,
      "loss": 0.5711,
      "step": 840
    },
    {
      "epoch": 2.75974025974026,
      "grad_norm": 0.5709123176208969,
      "learning_rate": 5e-06,
      "loss": 0.5701,
      "step": 850
    },
    {
      "epoch": 2.792207792207792,
      "grad_norm": 0.6304517541226137,
      "learning_rate": 5e-06,
      "loss": 0.5673,
      "step": 860
    },
    {
      "epoch": 2.824675324675325,
      "grad_norm": 0.6030037959776535,
      "learning_rate": 5e-06,
      "loss": 0.5713,
      "step": 870
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.5603204730571357,
      "learning_rate": 5e-06,
      "loss": 0.5749,
      "step": 880
    },
    {
      "epoch": 2.8896103896103895,
      "grad_norm": 0.5148606934943276,
      "learning_rate": 5e-06,
      "loss": 0.5671,
      "step": 890
    },
    {
      "epoch": 2.9220779220779223,
      "grad_norm": 0.770823574891512,
      "learning_rate": 5e-06,
      "loss": 0.5694,
      "step": 900
    },
    {
      "epoch": 2.9545454545454546,
      "grad_norm": 0.6707592403791355,
      "learning_rate": 5e-06,
      "loss": 0.5691,
      "step": 910
    },
    {
      "epoch": 2.987012987012987,
      "grad_norm": 0.7817460976590817,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 920
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6653555631637573,
      "eval_runtime": 29.9562,
      "eval_samples_per_second": 276.537,
      "eval_steps_per_second": 1.102,
      "step": 924
    },
    {
      "epoch": 3.0,
      "step": 924,
      "total_flos": 1547734414786560.0,
      "train_loss": 0.6307248511871735,
      "train_runtime": 5890.0632,
      "train_samples_per_second": 80.163,
      "train_steps_per_second": 0.157
    }
  ],
  "logging_steps": 10,
  "max_steps": 924,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1547734414786560.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}