{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 924, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032467532467532464, "grad_norm": 6.265174130511246, "learning_rate": 5e-06, "loss": 0.9365, "step": 10 }, { "epoch": 0.06493506493506493, "grad_norm": 1.3770366162111063, "learning_rate": 5e-06, "loss": 0.8189, "step": 20 }, { "epoch": 0.09740259740259741, "grad_norm": 1.0978187926506935, "learning_rate": 5e-06, "loss": 0.7833, "step": 30 }, { "epoch": 0.12987012987012986, "grad_norm": 0.981210956561077, "learning_rate": 5e-06, "loss": 0.7556, "step": 40 }, { "epoch": 0.16233766233766234, "grad_norm": 0.9693356495883646, "learning_rate": 5e-06, "loss": 0.7414, "step": 50 }, { "epoch": 0.19480519480519481, "grad_norm": 1.0030466632996962, "learning_rate": 5e-06, "loss": 0.7236, "step": 60 }, { "epoch": 0.22727272727272727, "grad_norm": 1.1260479581636729, "learning_rate": 5e-06, "loss": 0.7169, "step": 70 }, { "epoch": 0.2597402597402597, "grad_norm": 0.7460895709589158, "learning_rate": 5e-06, "loss": 0.7168, "step": 80 }, { "epoch": 0.2922077922077922, "grad_norm": 0.9486986195334304, "learning_rate": 5e-06, "loss": 0.7031, "step": 90 }, { "epoch": 0.3246753246753247, "grad_norm": 0.7051606863668234, "learning_rate": 5e-06, "loss": 0.6974, "step": 100 }, { "epoch": 0.35714285714285715, "grad_norm": 0.6756560441314118, "learning_rate": 5e-06, "loss": 0.692, "step": 110 }, { "epoch": 0.38961038961038963, "grad_norm": 0.5240739369726283, "learning_rate": 5e-06, "loss": 0.693, "step": 120 }, { "epoch": 0.42207792207792205, "grad_norm": 0.5785376996044719, "learning_rate": 5e-06, "loss": 0.689, "step": 130 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5933112722295861, "learning_rate": 5e-06, "loss": 0.691, "step": 140 }, { "epoch": 0.487012987012987, "grad_norm": 0.6077588775577111, "learning_rate": 5e-06, "loss": 0.6806, "step": 150 }, { "epoch": 0.5194805194805194, "grad_norm": 0.6238054182160374, "learning_rate": 5e-06, "loss": 0.6817, "step": 160 }, { "epoch": 0.551948051948052, "grad_norm": 0.6081002667408969, "learning_rate": 5e-06, "loss": 0.6807, "step": 170 }, { "epoch": 0.5844155844155844, "grad_norm": 0.6219764792866612, "learning_rate": 5e-06, "loss": 0.6773, "step": 180 }, { "epoch": 0.6168831168831169, "grad_norm": 0.739529394087955, "learning_rate": 5e-06, "loss": 0.6795, "step": 190 }, { "epoch": 0.6493506493506493, "grad_norm": 0.7524681424985254, "learning_rate": 5e-06, "loss": 0.6723, "step": 200 }, { "epoch": 0.6818181818181818, "grad_norm": 0.6010281827966147, "learning_rate": 5e-06, "loss": 0.6709, "step": 210 }, { "epoch": 0.7142857142857143, "grad_norm": 0.6611165599870378, "learning_rate": 5e-06, "loss": 0.6692, "step": 220 }, { "epoch": 0.7467532467532467, "grad_norm": 0.8344801352021102, "learning_rate": 5e-06, "loss": 0.6738, "step": 230 }, { "epoch": 0.7792207792207793, "grad_norm": 0.7851764850319622, "learning_rate": 5e-06, "loss": 0.6692, "step": 240 }, { "epoch": 0.8116883116883117, "grad_norm": 0.6068138322416587, "learning_rate": 5e-06, "loss": 0.6693, "step": 250 }, { "epoch": 0.8441558441558441, "grad_norm": 0.5781959225993195, "learning_rate": 5e-06, "loss": 0.6698, "step": 260 }, { "epoch": 0.8766233766233766, "grad_norm": 0.7049586430934481, "learning_rate": 5e-06, "loss": 0.672, "step": 270 }, { "epoch": 0.9090909090909091, "grad_norm": 0.6323170370591866, "learning_rate": 5e-06, "loss": 0.6668, "step": 280 }, { "epoch": 0.9415584415584416, "grad_norm": 0.881618301887001, "learning_rate": 5e-06, "loss": 0.6706, "step": 290 }, { "epoch": 0.974025974025974, "grad_norm": 0.5219254149696031, "learning_rate": 5e-06, "loss": 0.6679, "step": 300 }, { "epoch": 1.0, "eval_loss": 0.6691647171974182, "eval_runtime": 30.5714, "eval_samples_per_second": 270.972, "eval_steps_per_second": 1.079, "step": 308 }, { "epoch": 1.0064935064935066, "grad_norm": 0.7028333992981749, "learning_rate": 5e-06, "loss": 0.6597, "step": 310 }, { "epoch": 1.0389610389610389, "grad_norm": 0.620216384870711, "learning_rate": 5e-06, "loss": 0.6178, "step": 320 }, { "epoch": 1.0714285714285714, "grad_norm": 0.6279544966110486, "learning_rate": 5e-06, "loss": 0.6196, "step": 330 }, { "epoch": 1.103896103896104, "grad_norm": 0.47691022078448675, "learning_rate": 5e-06, "loss": 0.6213, "step": 340 }, { "epoch": 1.1363636363636362, "grad_norm": 0.6169659732755709, "learning_rate": 5e-06, "loss": 0.6189, "step": 350 }, { "epoch": 1.1688311688311688, "grad_norm": 0.6930896730291389, "learning_rate": 5e-06, "loss": 0.6179, "step": 360 }, { "epoch": 1.2012987012987013, "grad_norm": 0.5888468229519391, "learning_rate": 5e-06, "loss": 0.6193, "step": 370 }, { "epoch": 1.2337662337662338, "grad_norm": 0.5114807666495347, "learning_rate": 5e-06, "loss": 0.6205, "step": 380 }, { "epoch": 1.2662337662337662, "grad_norm": 0.576480885597218, "learning_rate": 5e-06, "loss": 0.6143, "step": 390 }, { "epoch": 1.2987012987012987, "grad_norm": 0.9781557440302872, "learning_rate": 5e-06, "loss": 0.616, "step": 400 }, { "epoch": 1.3311688311688312, "grad_norm": 0.5493968761484528, "learning_rate": 5e-06, "loss": 0.6181, "step": 410 }, { "epoch": 1.3636363636363638, "grad_norm": 0.8450188883114491, "learning_rate": 5e-06, "loss": 0.6186, "step": 420 }, { "epoch": 1.396103896103896, "grad_norm": 0.6672141224772778, "learning_rate": 5e-06, "loss": 0.6182, "step": 430 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5436445484738832, "learning_rate": 5e-06, "loss": 0.6147, "step": 440 }, { "epoch": 1.4610389610389611, "grad_norm": 0.5830504026660146, "learning_rate": 5e-06, "loss": 0.6179, "step": 450 }, { "epoch": 1.4935064935064934, "grad_norm": 0.6473257236943104, "learning_rate": 5e-06, "loss": 0.6199, "step": 460 }, { "epoch": 1.525974025974026, "grad_norm": 0.5427880278607804, "learning_rate": 5e-06, "loss": 0.6168, "step": 470 }, { "epoch": 1.5584415584415585, "grad_norm": 0.5689580314401272, "learning_rate": 5e-06, "loss": 0.6159, "step": 480 }, { "epoch": 1.5909090909090908, "grad_norm": 0.597927845953086, "learning_rate": 5e-06, "loss": 0.6175, "step": 490 }, { "epoch": 1.6233766233766234, "grad_norm": 0.6128642707216239, "learning_rate": 5e-06, "loss": 0.6163, "step": 500 }, { "epoch": 1.655844155844156, "grad_norm": 0.5455974938431143, "learning_rate": 5e-06, "loss": 0.6168, "step": 510 }, { "epoch": 1.6883116883116882, "grad_norm": 0.5153120159264221, "learning_rate": 5e-06, "loss": 0.6204, "step": 520 }, { "epoch": 1.7207792207792207, "grad_norm": 0.5767601324955324, "learning_rate": 5e-06, "loss": 0.619, "step": 530 }, { "epoch": 1.7532467532467533, "grad_norm": 0.5856685996311523, "learning_rate": 5e-06, "loss": 0.6195, "step": 540 }, { "epoch": 1.7857142857142856, "grad_norm": 0.5318505472371191, "learning_rate": 5e-06, "loss": 0.6142, "step": 550 }, { "epoch": 1.8181818181818183, "grad_norm": 0.5867934386348821, "learning_rate": 5e-06, "loss": 0.6163, "step": 560 }, { "epoch": 1.8506493506493507, "grad_norm": 0.5460100351131841, "learning_rate": 5e-06, "loss": 0.6209, "step": 570 }, { "epoch": 1.883116883116883, "grad_norm": 0.5930045996717794, "learning_rate": 5e-06, "loss": 0.6173, "step": 580 }, { "epoch": 1.9155844155844157, "grad_norm": 0.5210440244932204, "learning_rate": 5e-06, "loss": 0.6172, "step": 590 }, { "epoch": 1.948051948051948, "grad_norm": 0.678263024145128, "learning_rate": 5e-06, "loss": 0.6219, "step": 600 }, { "epoch": 1.9805194805194806, "grad_norm": 0.5193570456262979, "learning_rate": 5e-06, "loss": 0.6119, "step": 610 }, { "epoch": 2.0, "eval_loss": 0.6595985293388367, "eval_runtime": 30.7002, "eval_samples_per_second": 269.835, "eval_steps_per_second": 1.075, "step": 616 }, { "epoch": 2.012987012987013, "grad_norm": 0.997715568070811, "learning_rate": 5e-06, "loss": 0.5894, "step": 620 }, { "epoch": 2.0454545454545454, "grad_norm": 0.7592737794541236, "learning_rate": 5e-06, "loss": 0.5677, "step": 630 }, { "epoch": 2.0779220779220777, "grad_norm": 0.6056584838864539, "learning_rate": 5e-06, "loss": 0.5669, "step": 640 }, { "epoch": 2.1103896103896105, "grad_norm": 0.6211039916928865, "learning_rate": 5e-06, "loss": 0.5697, "step": 650 }, { "epoch": 2.142857142857143, "grad_norm": 0.615106862254971, "learning_rate": 5e-06, "loss": 0.5693, "step": 660 }, { "epoch": 2.175324675324675, "grad_norm": 0.566417720964845, "learning_rate": 5e-06, "loss": 0.577, "step": 670 }, { "epoch": 2.207792207792208, "grad_norm": 0.5480748428783726, "learning_rate": 5e-06, "loss": 0.5724, "step": 680 }, { "epoch": 2.24025974025974, "grad_norm": 0.6883572551516758, "learning_rate": 5e-06, "loss": 0.5663, "step": 690 }, { "epoch": 2.2727272727272725, "grad_norm": 0.7624758724871575, "learning_rate": 5e-06, "loss": 0.5721, "step": 700 }, { "epoch": 2.3051948051948052, "grad_norm": 0.5925041865618843, "learning_rate": 5e-06, "loss": 0.5718, "step": 710 }, { "epoch": 2.3376623376623376, "grad_norm": 0.5423034645452969, "learning_rate": 5e-06, "loss": 0.5681, "step": 720 }, { "epoch": 2.3701298701298703, "grad_norm": 0.5480316834860852, "learning_rate": 5e-06, "loss": 0.5722, "step": 730 }, { "epoch": 2.4025974025974026, "grad_norm": 0.5169062030347897, "learning_rate": 5e-06, "loss": 0.578, "step": 740 }, { "epoch": 2.435064935064935, "grad_norm": 0.5457808079840645, "learning_rate": 5e-06, "loss": 0.57, "step": 750 }, { "epoch": 2.4675324675324677, "grad_norm": 0.5470205045138103, "learning_rate": 5e-06, "loss": 0.5726, "step": 760 }, { "epoch": 2.5, "grad_norm": 0.5125136364795218, "learning_rate": 5e-06, "loss": 0.5693, "step": 770 }, { "epoch": 2.5324675324675323, "grad_norm": 0.5945664415971015, "learning_rate": 5e-06, "loss": 0.5714, "step": 780 }, { "epoch": 2.564935064935065, "grad_norm": 0.5702694037641614, "learning_rate": 5e-06, "loss": 0.5689, "step": 790 }, { "epoch": 2.5974025974025974, "grad_norm": 0.5441374726350022, "learning_rate": 5e-06, "loss": 0.5742, "step": 800 }, { "epoch": 2.62987012987013, "grad_norm": 0.5674621294447999, "learning_rate": 5e-06, "loss": 0.5687, "step": 810 }, { "epoch": 2.6623376623376624, "grad_norm": 0.5997098488587294, "learning_rate": 5e-06, "loss": 0.5763, "step": 820 }, { "epoch": 2.6948051948051948, "grad_norm": 0.6199757649220302, "learning_rate": 5e-06, "loss": 0.5747, "step": 830 }, { "epoch": 2.7272727272727275, "grad_norm": 0.6911213249901123, "learning_rate": 5e-06, "loss": 0.5711, "step": 840 }, { "epoch": 2.75974025974026, "grad_norm": 0.5709123176208969, "learning_rate": 5e-06, "loss": 0.5701, "step": 850 }, { "epoch": 2.792207792207792, "grad_norm": 0.6304517541226137, "learning_rate": 5e-06, "loss": 0.5673, "step": 860 }, { "epoch": 2.824675324675325, "grad_norm": 0.6030037959776535, "learning_rate": 5e-06, "loss": 0.5713, "step": 870 }, { "epoch": 2.857142857142857, "grad_norm": 0.5603204730571357, "learning_rate": 5e-06, "loss": 0.5749, "step": 880 }, { "epoch": 2.8896103896103895, "grad_norm": 0.5148606934943276, "learning_rate": 5e-06, "loss": 0.5671, "step": 890 }, { "epoch": 2.9220779220779223, "grad_norm": 0.770823574891512, "learning_rate": 5e-06, "loss": 0.5694, "step": 900 }, { "epoch": 2.9545454545454546, "grad_norm": 0.6707592403791355, "learning_rate": 5e-06, "loss": 0.5691, "step": 910 }, { "epoch": 2.987012987012987, "grad_norm": 0.7817460976590817, "learning_rate": 5e-06, "loss": 0.568, "step": 920 }, { "epoch": 3.0, "eval_loss": 0.6653555631637573, "eval_runtime": 29.9562, "eval_samples_per_second": 276.537, "eval_steps_per_second": 1.102, "step": 924 }, { "epoch": 3.0, "step": 924, "total_flos": 1547734414786560.0, "train_loss": 0.6307248511871735, "train_runtime": 5890.0632, "train_samples_per_second": 80.163, "train_steps_per_second": 0.157 } ], "logging_steps": 10, "max_steps": 924, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1547734414786560.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }