{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.024681201151789386, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020567667626491157, "grad_norm": 12.799294471740723, "learning_rate": 5e-06, "loss": 1.5984, "step": 1 }, { "epoch": 0.00041135335252982314, "grad_norm": 16.628677368164062, "learning_rate": 1e-05, "loss": 2.3592, "step": 2 }, { "epoch": 0.0006170300287947347, "grad_norm": 9.23403549194336, "learning_rate": 1.5e-05, "loss": 1.9703, "step": 3 }, { "epoch": 0.0008227067050596463, "grad_norm": 8.804163932800293, "learning_rate": 2e-05, "loss": 1.7174, "step": 4 }, { "epoch": 0.0010283833813245578, "grad_norm": 19.202816009521484, "learning_rate": 2.5e-05, "loss": 2.5644, "step": 5 }, { "epoch": 0.0012340600575894694, "grad_norm": null, "learning_rate": 2.5e-05, "loss": 1.9342, "step": 6 }, { "epoch": 0.001439736733854381, "grad_norm": 7.568014621734619, "learning_rate": 3e-05, "loss": 1.8679, "step": 7 }, { "epoch": 0.0016454134101192926, "grad_norm": 14.93575382232666, "learning_rate": 3.5e-05, "loss": 1.6565, "step": 8 }, { "epoch": 0.001851090086384204, "grad_norm": 11.807939529418945, "learning_rate": 4e-05, "loss": 2.067, "step": 9 }, { "epoch": 0.0020567667626491155, "grad_norm": 9.383672714233398, "learning_rate": 4.5e-05, "loss": 2.2929, "step": 10 }, { "epoch": 0.0022624434389140274, "grad_norm": 12.13580322265625, "learning_rate": 5e-05, "loss": 1.902, "step": 11 }, { "epoch": 0.0024681201151789387, "grad_norm": 26.6066837310791, "learning_rate": 4.9545454545454553e-05, "loss": 2.2335, "step": 12 }, { "epoch": 0.00267379679144385, "grad_norm": 22.572980880737305, "learning_rate": 4.909090909090909e-05, "loss": 1.3673, "step": 13 }, { "epoch": 0.002879473467708762, "grad_norm": 14.39148235321045, "learning_rate": 4.863636363636364e-05, "loss": 1.8704, "step": 14 }, { "epoch": 0.0030851501439736733, 
"grad_norm": 10.3450288772583, "learning_rate": 4.8181818181818186e-05, "loss": 1.0219, "step": 15 }, { "epoch": 0.003290826820238585, "grad_norm": 11.662192344665527, "learning_rate": 4.772727272727273e-05, "loss": 1.9658, "step": 16 }, { "epoch": 0.0034965034965034965, "grad_norm": 9.669842720031738, "learning_rate": 4.7272727272727275e-05, "loss": 1.7769, "step": 17 }, { "epoch": 0.003702180172768408, "grad_norm": null, "learning_rate": 4.7272727272727275e-05, "loss": 1.4503, "step": 18 }, { "epoch": 0.00390785684903332, "grad_norm": 24.309511184692383, "learning_rate": 4.681818181818182e-05, "loss": 1.1466, "step": 19 }, { "epoch": 0.004113533525298231, "grad_norm": 10.933331489562988, "learning_rate": 4.636363636363636e-05, "loss": 1.9125, "step": 20 }, { "epoch": 0.0043192102015631425, "grad_norm": 16.095943450927734, "learning_rate": 4.5909090909090914e-05, "loss": 2.1035, "step": 21 }, { "epoch": 0.004524886877828055, "grad_norm": 11.987975120544434, "learning_rate": 4.545454545454546e-05, "loss": 0.6702, "step": 22 }, { "epoch": 0.004730563554092966, "grad_norm": 8.843574523925781, "learning_rate": 4.5e-05, "loss": 0.5454, "step": 23 }, { "epoch": 0.0049362402303578775, "grad_norm": 8.445805549621582, "learning_rate": 4.454545454545455e-05, "loss": 1.7987, "step": 24 }, { "epoch": 0.005141916906622789, "grad_norm": 5.443497657775879, "learning_rate": 4.409090909090909e-05, "loss": 0.2675, "step": 25 }, { "epoch": 0.0053475935828877, "grad_norm": 5.370733737945557, "learning_rate": 4.3636363636363636e-05, "loss": 0.1773, "step": 26 }, { "epoch": 0.0055532702591526125, "grad_norm": 12.393692016601562, "learning_rate": 4.318181818181819e-05, "loss": 2.0755, "step": 27 }, { "epoch": 0.005758946935417524, "grad_norm": 11.817388534545898, "learning_rate": 4.2727272727272724e-05, "loss": 1.796, "step": 28 }, { "epoch": 0.005964623611682435, "grad_norm": 7.721453666687012, "learning_rate": 4.2272727272727275e-05, "loss": 1.461, "step": 29 }, { "epoch": 
0.006170300287947347, "grad_norm": 8.523295402526855, "learning_rate": 4.181818181818182e-05, "loss": 1.6376, "step": 30 }, { "epoch": 0.006375976964212258, "grad_norm": 14.518152236938477, "learning_rate": 4.1363636363636364e-05, "loss": 2.036, "step": 31 }, { "epoch": 0.00658165364047717, "grad_norm": 12.425220489501953, "learning_rate": 4.0909090909090915e-05, "loss": 0.4348, "step": 32 }, { "epoch": 0.006787330316742082, "grad_norm": 26.095151901245117, "learning_rate": 4.045454545454546e-05, "loss": 1.2404, "step": 33 }, { "epoch": 0.006993006993006993, "grad_norm": 4.784183979034424, "learning_rate": 4e-05, "loss": 1.309, "step": 34 }, { "epoch": 0.007198683669271904, "grad_norm": 13.188830375671387, "learning_rate": 3.954545454545455e-05, "loss": 1.5275, "step": 35 }, { "epoch": 0.007404360345536816, "grad_norm": 9.305349349975586, "learning_rate": 3.909090909090909e-05, "loss": 1.8645, "step": 36 }, { "epoch": 0.007610037021801728, "grad_norm": 10.391180038452148, "learning_rate": 3.8636363636363636e-05, "loss": 0.2851, "step": 37 }, { "epoch": 0.00781571369806664, "grad_norm": 2.905449390411377, "learning_rate": 3.818181818181819e-05, "loss": 0.0632, "step": 38 }, { "epoch": 0.008021390374331552, "grad_norm": 15.784213066101074, "learning_rate": 3.7727272727272725e-05, "loss": 0.7361, "step": 39 }, { "epoch": 0.008227067050596462, "grad_norm": 4.363598346710205, "learning_rate": 3.7272727272727276e-05, "loss": 0.0486, "step": 40 }, { "epoch": 0.008432743726861374, "grad_norm": 17.239139556884766, "learning_rate": 3.681818181818182e-05, "loss": 1.3015, "step": 41 }, { "epoch": 0.008638420403126285, "grad_norm": 1.9617282152175903, "learning_rate": 3.6363636363636364e-05, "loss": 0.0413, "step": 42 }, { "epoch": 0.008844097079391197, "grad_norm": 5.427540302276611, "learning_rate": 3.590909090909091e-05, "loss": 1.733, "step": 43 }, { "epoch": 0.00904977375565611, "grad_norm": 41.895721435546875, "learning_rate": 3.545454545454546e-05, "loss": 1.7908, 
"step": 44 }, { "epoch": 0.00925545043192102, "grad_norm": 6.50022554397583, "learning_rate": 3.5e-05, "loss": 2.393, "step": 45 }, { "epoch": 0.009461127108185932, "grad_norm": 13.534425735473633, "learning_rate": 3.454545454545455e-05, "loss": 1.0052, "step": 46 }, { "epoch": 0.009666803784450843, "grad_norm": 1.7201191186904907, "learning_rate": 3.409090909090909e-05, "loss": 0.0428, "step": 47 }, { "epoch": 0.009872480460715755, "grad_norm": 18.76580810546875, "learning_rate": 3.3636363636363636e-05, "loss": 0.8643, "step": 48 }, { "epoch": 0.010078157136980667, "grad_norm": 18.51691246032715, "learning_rate": 3.318181818181819e-05, "loss": 1.0954, "step": 49 }, { "epoch": 0.010283833813245578, "grad_norm": 8.370597839355469, "learning_rate": 3.272727272727273e-05, "loss": 0.6081, "step": 50 }, { "epoch": 0.01048951048951049, "grad_norm": 6.726954936981201, "learning_rate": 3.2272727272727276e-05, "loss": 0.0419, "step": 51 }, { "epoch": 0.0106951871657754, "grad_norm": 6.499707221984863, "learning_rate": 3.181818181818182e-05, "loss": 1.7456, "step": 52 }, { "epoch": 0.010900863842040313, "grad_norm": 17.723892211914062, "learning_rate": 3.1363636363636365e-05, "loss": 1.2133, "step": 53 }, { "epoch": 0.011106540518305225, "grad_norm": 7.051292419433594, "learning_rate": 3.090909090909091e-05, "loss": 0.2732, "step": 54 }, { "epoch": 0.011312217194570135, "grad_norm": 12.115779876708984, "learning_rate": 3.0454545454545456e-05, "loss": 2.4646, "step": 55 }, { "epoch": 0.011517893870835048, "grad_norm": 5.81415319442749, "learning_rate": 3e-05, "loss": 1.6826, "step": 56 }, { "epoch": 0.011723570547099958, "grad_norm": 13.005535125732422, "learning_rate": 2.954545454545455e-05, "loss": 0.8454, "step": 57 }, { "epoch": 0.01192924722336487, "grad_norm": 13.358834266662598, "learning_rate": 2.909090909090909e-05, "loss": 0.6585, "step": 58 }, { "epoch": 0.012134923899629783, "grad_norm": 7.232337474822998, "learning_rate": 2.863636363636364e-05, "loss": 2.0886, 
"step": 59 }, { "epoch": 0.012340600575894693, "grad_norm": 5.909549713134766, "learning_rate": 2.818181818181818e-05, "loss": 2.04, "step": 60 }, { "epoch": 0.012546277252159605, "grad_norm": 5.2378621101379395, "learning_rate": 2.772727272727273e-05, "loss": 1.3154, "step": 61 }, { "epoch": 0.012751953928424516, "grad_norm": 7.897792816162109, "learning_rate": 2.7272727272727273e-05, "loss": 1.7168, "step": 62 }, { "epoch": 0.012957630604689428, "grad_norm": 10.026203155517578, "learning_rate": 2.681818181818182e-05, "loss": 2.1631, "step": 63 }, { "epoch": 0.01316330728095434, "grad_norm": 7.848910808563232, "learning_rate": 2.636363636363636e-05, "loss": 0.895, "step": 64 }, { "epoch": 0.013368983957219251, "grad_norm": 8.935349464416504, "learning_rate": 2.590909090909091e-05, "loss": 1.3377, "step": 65 }, { "epoch": 0.013574660633484163, "grad_norm": 12.838030815124512, "learning_rate": 2.5454545454545454e-05, "loss": 1.0923, "step": 66 }, { "epoch": 0.013780337309749074, "grad_norm": 11.543920516967773, "learning_rate": 2.5e-05, "loss": 1.8728, "step": 67 }, { "epoch": 0.013986013986013986, "grad_norm": 5.111774444580078, "learning_rate": 2.4545454545454545e-05, "loss": 1.4745, "step": 68 }, { "epoch": 0.014191690662278898, "grad_norm": 9.102482795715332, "learning_rate": 2.4090909090909093e-05, "loss": 0.5785, "step": 69 }, { "epoch": 0.014397367338543809, "grad_norm": 10.797809600830078, "learning_rate": 2.3636363636363637e-05, "loss": 1.5844, "step": 70 }, { "epoch": 0.014603044014808721, "grad_norm": 6.701333999633789, "learning_rate": 2.318181818181818e-05, "loss": 1.6503, "step": 71 }, { "epoch": 0.014808720691073632, "grad_norm": 8.514144897460938, "learning_rate": 2.272727272727273e-05, "loss": 2.0404, "step": 72 }, { "epoch": 0.015014397367338544, "grad_norm": 4.390872001647949, "learning_rate": 2.2272727272727274e-05, "loss": 1.3714, "step": 73 }, { "epoch": 0.015220074043603456, "grad_norm": 11.0691556930542, "learning_rate": 
2.1818181818181818e-05, "loss": 1.9498, "step": 74 }, { "epoch": 0.015425750719868367, "grad_norm": 4.954442024230957, "learning_rate": 2.1363636363636362e-05, "loss": 1.5526, "step": 75 }, { "epoch": 0.01563142739613328, "grad_norm": 3.523308038711548, "learning_rate": 2.090909090909091e-05, "loss": 0.0933, "step": 76 }, { "epoch": 0.01583710407239819, "grad_norm": 7.044577121734619, "learning_rate": 2.0454545454545457e-05, "loss": 1.8968, "step": 77 }, { "epoch": 0.016042780748663103, "grad_norm": 12.184310913085938, "learning_rate": 2e-05, "loss": 1.9779, "step": 78 }, { "epoch": 0.016248457424928014, "grad_norm": 7.611854076385498, "learning_rate": 1.9545454545454546e-05, "loss": 1.1828, "step": 79 }, { "epoch": 0.016454134101192924, "grad_norm": 2.3979077339172363, "learning_rate": 1.9090909090909094e-05, "loss": 0.0637, "step": 80 }, { "epoch": 0.016659810777457835, "grad_norm": 7.704205513000488, "learning_rate": 1.8636363636363638e-05, "loss": 0.8134, "step": 81 }, { "epoch": 0.01686548745372275, "grad_norm": 5.452297210693359, "learning_rate": 1.8181818181818182e-05, "loss": 1.4938, "step": 82 }, { "epoch": 0.01707116412998766, "grad_norm": 6.996687889099121, "learning_rate": 1.772727272727273e-05, "loss": 0.8019, "step": 83 }, { "epoch": 0.01727684080625257, "grad_norm": 7.0274271965026855, "learning_rate": 1.7272727272727274e-05, "loss": 0.7567, "step": 84 }, { "epoch": 0.017482517482517484, "grad_norm": 14.325960159301758, "learning_rate": 1.6818181818181818e-05, "loss": 1.861, "step": 85 }, { "epoch": 0.017688194158782394, "grad_norm": 8.082893371582031, "learning_rate": 1.6363636363636366e-05, "loss": 1.7144, "step": 86 }, { "epoch": 0.017893870835047305, "grad_norm": 18.079805374145508, "learning_rate": 1.590909090909091e-05, "loss": 1.3443, "step": 87 }, { "epoch": 0.01809954751131222, "grad_norm": 7.730350971221924, "learning_rate": 1.5454545454545454e-05, "loss": 0.6243, "step": 88 }, { "epoch": 0.01830522418757713, "grad_norm": 
11.749229431152344, "learning_rate": 1.5e-05, "loss": 0.9159, "step": 89 }, { "epoch": 0.01851090086384204, "grad_norm": 1.573517918586731, "learning_rate": 1.4545454545454545e-05, "loss": 0.0361, "step": 90 }, { "epoch": 0.01871657754010695, "grad_norm": 12.70760440826416, "learning_rate": 1.409090909090909e-05, "loss": 0.6055, "step": 91 }, { "epoch": 0.018922254216371864, "grad_norm": 8.807103157043457, "learning_rate": 1.3636363636363637e-05, "loss": 1.4647, "step": 92 }, { "epoch": 0.019127930892636775, "grad_norm": 4.610854625701904, "learning_rate": 1.318181818181818e-05, "loss": 0.1759, "step": 93 }, { "epoch": 0.019333607568901685, "grad_norm": 4.804567813873291, "learning_rate": 1.2727272727272727e-05, "loss": 1.5768, "step": 94 }, { "epoch": 0.0195392842451666, "grad_norm": 8.914559364318848, "learning_rate": 1.2272727272727273e-05, "loss": 0.9042, "step": 95 }, { "epoch": 0.01974496092143151, "grad_norm": 10.30044174194336, "learning_rate": 1.1818181818181819e-05, "loss": 1.4115, "step": 96 }, { "epoch": 0.01995063759769642, "grad_norm": 1.7506386041641235, "learning_rate": 1.1363636363636365e-05, "loss": 0.0564, "step": 97 }, { "epoch": 0.020156314273961334, "grad_norm": 15.82219123840332, "learning_rate": 1.0909090909090909e-05, "loss": 1.3599, "step": 98 }, { "epoch": 0.020361990950226245, "grad_norm": 13.379084587097168, "learning_rate": 1.0454545454545455e-05, "loss": 0.6752, "step": 99 }, { "epoch": 0.020567667626491155, "grad_norm": 13.464095115661621, "learning_rate": 1e-05, "loss": 0.71, "step": 100 }, { "epoch": 0.020773344302756066, "grad_norm": 11.258218765258789, "learning_rate": 9.545454545454547e-06, "loss": 1.5134, "step": 101 }, { "epoch": 0.02097902097902098, "grad_norm": 6.195601463317871, "learning_rate": 9.090909090909091e-06, "loss": 1.9024, "step": 102 }, { "epoch": 0.02118469765528589, "grad_norm": 14.71764087677002, "learning_rate": 8.636363636363637e-06, "loss": 1.8721, "step": 103 }, { "epoch": 0.0213903743315508, "grad_norm": 
18.410600662231445, "learning_rate": 8.181818181818183e-06, "loss": 0.9641, "step": 104 }, { "epoch": 0.021596051007815715, "grad_norm": 14.327963829040527, "learning_rate": 7.727272727272727e-06, "loss": 1.3635, "step": 105 }, { "epoch": 0.021801727684080625, "grad_norm": 9.677972793579102, "learning_rate": 7.272727272727272e-06, "loss": 1.1723, "step": 106 }, { "epoch": 0.022007404360345536, "grad_norm": 8.000631332397461, "learning_rate": 6.818181818181818e-06, "loss": 0.6521, "step": 107 }, { "epoch": 0.02221308103661045, "grad_norm": 1.116436243057251, "learning_rate": 6.363636363636363e-06, "loss": 0.0357, "step": 108 }, { "epoch": 0.02241875771287536, "grad_norm": 1.7491281032562256, "learning_rate": 5.909090909090909e-06, "loss": 0.0469, "step": 109 }, { "epoch": 0.02262443438914027, "grad_norm": 1.2441469430923462, "learning_rate": 5.4545454545454545e-06, "loss": 0.0386, "step": 110 }, { "epoch": 0.02283011106540518, "grad_norm": 18.10247039794922, "learning_rate": 5e-06, "loss": 1.1133, "step": 111 }, { "epoch": 0.023035787741670095, "grad_norm": 8.674224853515625, "learning_rate": 4.5454545454545455e-06, "loss": 1.4932, "step": 112 }, { "epoch": 0.023241464417935006, "grad_norm": 10.565869331359863, "learning_rate": 4.0909090909090915e-06, "loss": 0.2315, "step": 113 }, { "epoch": 0.023447141094199916, "grad_norm": 7.652951240539551, "learning_rate": 3.636363636363636e-06, "loss": 1.8242, "step": 114 }, { "epoch": 0.02365281777046483, "grad_norm": 5.98936653137207, "learning_rate": 3.1818181818181817e-06, "loss": 1.9984, "step": 115 }, { "epoch": 0.02385849444672974, "grad_norm": 9.790857315063477, "learning_rate": 2.7272727272727272e-06, "loss": 1.9085, "step": 116 }, { "epoch": 0.02406417112299465, "grad_norm": 1.22812020778656, "learning_rate": 2.2727272727272728e-06, "loss": 0.0346, "step": 117 }, { "epoch": 0.024269847799259565, "grad_norm": 14.80505657196045, "learning_rate": 1.818181818181818e-06, "loss": 1.1611, "step": 118 }, { "epoch": 
0.024475524475524476, "grad_norm": 15.46601390838623, "learning_rate": 1.3636363636363636e-06, "loss": 2.3691, "step": 119 }, { "epoch": 0.024681201151789386, "grad_norm": 17.55849266052246, "learning_rate": 9.09090909090909e-07, "loss": 0.8722, "step": 120 } ], "logging_steps": 1, "max_steps": 120, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3875365448736768.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }