{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9914794732765299, "eval_steps": 500, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 12.876287460327148, "learning_rate": 1.4099996659119809e-05, "loss": 4.3352, "step": 1 }, { "epoch": 0.0, "grad_norm": 14.893763542175293, "learning_rate": 1.4099986636482399e-05, "loss": 3.9062, "step": 2 }, { "epoch": 0.0, "grad_norm": 14.747922897338867, "learning_rate": 1.409996993209727e-05, "loss": 2.1353, "step": 3 }, { "epoch": 0.0, "grad_norm": 9.3056640625, "learning_rate": 1.4099946545980254e-05, "loss": 1.2115, "step": 4 }, { "epoch": 0.0, "grad_norm": 16.262678146362305, "learning_rate": 1.4099916478153516e-05, "loss": 2.1268, "step": 5 }, { "epoch": 0.0, "grad_norm": 24.459659576416016, "learning_rate": 1.4099879728645554e-05, "loss": 2.7806, "step": 6 }, { "epoch": 0.0, "grad_norm": 13.100327491760254, "learning_rate": 1.4099836297491197e-05, "loss": 2.453, "step": 7 }, { "epoch": 0.0, "grad_norm": 10.096444129943848, "learning_rate": 1.4099786184731606e-05, "loss": 2.2396, "step": 8 }, { "epoch": 0.0, "grad_norm": 12.478560447692871, "learning_rate": 1.409972939041428e-05, "loss": 1.4557, "step": 9 }, { "epoch": 0.0, "grad_norm": 11.49439525604248, "learning_rate": 1.4099665914593043e-05, "loss": 1.7847, "step": 10 }, { "epoch": 0.0, "grad_norm": 20.96303939819336, "learning_rate": 1.4099595757328058e-05, "loss": 2.6818, "step": 11 }, { "epoch": 0.0, "grad_norm": 14.429734230041504, "learning_rate": 1.4099518918685816e-05, "loss": 3.0543, "step": 12 }, { "epoch": 0.0, "grad_norm": 13.766142845153809, "learning_rate": 1.4099435398739144e-05, "loss": 2.4683, "step": 13 }, { "epoch": 0.0, "grad_norm": 19.67556381225586, "learning_rate": 1.4099345197567196e-05, "loss": 6.3905, "step": 14 }, { "epoch": 0.0, "grad_norm": 11.328873634338379, "learning_rate": 1.4099248315255468e-05, "loss": 1.104, "step": 15 }, { "epoch": 0.0, "grad_norm": 12.752955436706543, "learning_rate": 1.4099144751895776e-05, "loss": 3.7395, "step": 16 }, { "epoch": 0.01, "grad_norm": 17.726831436157227, "learning_rate": 1.4099034507586277e-05, "loss": 5.7476, "step": 17 }, { "epoch": 0.01, "grad_norm": 14.659893035888672, "learning_rate": 1.4098917582431455e-05, "loss": 2.6832, "step": 18 }, { "epoch": 0.01, "grad_norm": 12.84981918334961, "learning_rate": 1.4098793976542129e-05, "loss": 1.827, "step": 19 }, { "epoch": 0.01, "grad_norm": 21.144001007080078, "learning_rate": 1.409866369003545e-05, "loss": 5.6839, "step": 20 }, { "epoch": 0.01, "grad_norm": 16.51398468017578, "learning_rate": 1.4098526723034896e-05, "loss": 7.7836, "step": 21 }, { "epoch": 0.01, "grad_norm": 14.258636474609375, "learning_rate": 1.4098383075670282e-05, "loss": 2.5929, "step": 22 }, { "epoch": 0.01, "grad_norm": 15.997798919677734, "learning_rate": 1.4098232748077751e-05, "loss": 3.2264, "step": 23 }, { "epoch": 0.01, "grad_norm": 12.296121597290039, "learning_rate": 1.4098075740399781e-05, "loss": 1.9712, "step": 24 }, { "epoch": 0.01, "grad_norm": 17.37089729309082, "learning_rate": 1.409791205278518e-05, "loss": 1.8801, "step": 25 }, { "epoch": 0.01, "grad_norm": 19.558774948120117, "learning_rate": 1.4097741685389079e-05, "loss": 3.0265, "step": 26 }, { "epoch": 0.01, "grad_norm": 9.684329986572266, "learning_rate": 1.4097564638372954e-05, "loss": 0.8368, "step": 27 }, { "epoch": 0.01, "grad_norm": 13.683094024658203, "learning_rate": 1.4097380911904599e-05, "loss": 1.2745, "step": 28 }, { "epoch": 0.01, "grad_norm": 13.189740180969238, "learning_rate": 1.4097190506158147e-05, "loss": 1.1601, "step": 29 }, { "epoch": 0.01, "grad_norm": 18.130149841308594, "learning_rate": 1.4096993421314056e-05, "loss": 1.8264, "step": 30 }, { "epoch": 0.01, "grad_norm": 19.705230712890625, "learning_rate": 1.409678965755912e-05, "loss": 1.8733, "step": 31 }, { "epoch": 0.01, "grad_norm": 11.09542465209961, "learning_rate": 1.4096579215086455e-05, "loss": 2.4501, "step": 32 }, { "epoch": 0.01, "grad_norm": 19.853260040283203, "learning_rate": 1.4096362094095518e-05, "loss": 2.7891, "step": 33 }, { "epoch": 0.01, "grad_norm": 14.29506778717041, "learning_rate": 1.4096138294792083e-05, "loss": 3.4722, "step": 34 }, { "epoch": 0.01, "grad_norm": 12.516790390014648, "learning_rate": 1.4095907817388264e-05, "loss": 2.1471, "step": 35 }, { "epoch": 0.01, "grad_norm": 13.4302978515625, "learning_rate": 1.4095670662102496e-05, "loss": 2.6135, "step": 36 }, { "epoch": 0.01, "grad_norm": 10.625223159790039, "learning_rate": 1.4095426829159551e-05, "loss": 1.1707, "step": 37 }, { "epoch": 0.01, "grad_norm": 17.80229949951172, "learning_rate": 1.4095176318790526e-05, "loss": 3.7891, "step": 38 }, { "epoch": 0.01, "grad_norm": 8.8538179397583, "learning_rate": 1.409491913123284e-05, "loss": 0.9273, "step": 39 }, { "epoch": 0.01, "grad_norm": 13.519434928894043, "learning_rate": 1.4094655266730254e-05, "loss": 2.6715, "step": 40 }, { "epoch": 0.01, "grad_norm": 10.914387702941895, "learning_rate": 1.4094384725532849e-05, "loss": 1.5982, "step": 41 }, { "epoch": 0.01, "grad_norm": 17.040918350219727, "learning_rate": 1.4094107507897032e-05, "loss": 3.0822, "step": 42 }, { "epoch": 0.01, "grad_norm": 11.09429931640625, "learning_rate": 1.4093823614085546e-05, "loss": 1.5496, "step": 43 }, { "epoch": 0.01, "grad_norm": 13.157367706298828, "learning_rate": 1.409353304436745e-05, "loss": 1.4657, "step": 44 }, { "epoch": 0.01, "grad_norm": 11.823859214782715, "learning_rate": 1.4093235799018141e-05, "loss": 1.637, "step": 45 }, { "epoch": 0.01, "grad_norm": 11.13479232788086, "learning_rate": 1.4092931878319336e-05, "loss": 2.6957, "step": 46 }, { "epoch": 0.01, "grad_norm": 12.315866470336914, "learning_rate": 1.4092621282559084e-05, "loss": 2.2981, "step": 47 }, { "epoch": 0.01, "grad_norm": 13.145035743713379, "learning_rate": 1.4092304012031754e-05, "loss": 2.8959, "step": 48 }, { "epoch": 0.02, "grad_norm": 12.662394523620605, "learning_rate": 1.4091980067038048e-05, "loss": 4.7688, "step": 49 }, { "epoch": 0.02, "grad_norm": 15.96580982208252, "learning_rate": 1.4091649447884988e-05, "loss": 2.0613, "step": 50 }, { "epoch": 0.02, "grad_norm": 13.999765396118164, "learning_rate": 1.4091312154885925e-05, "loss": 2.4087, "step": 51 }, { "epoch": 0.02, "grad_norm": 12.088006019592285, "learning_rate": 1.4090968188360538e-05, "loss": 1.6578, "step": 52 }, { "epoch": 0.02, "grad_norm": 17.08218002319336, "learning_rate": 1.4090617548634819e-05, "loss": 1.2898, "step": 53 }, { "epoch": 0.02, "grad_norm": 9.499032974243164, "learning_rate": 1.40902602360411e-05, "loss": 1.1168, "step": 54 }, { "epoch": 0.02, "grad_norm": 9.868734359741211, "learning_rate": 1.4089896250918028e-05, "loss": 1.3226, "step": 55 }, { "epoch": 0.02, "grad_norm": 15.432489395141602, "learning_rate": 1.4089525593610576e-05, "loss": 3.777, "step": 56 }, { "epoch": 0.02, "grad_norm": 15.179858207702637, "learning_rate": 1.4089148264470039e-05, "loss": 1.6965, "step": 57 }, { "epoch": 0.02, "grad_norm": 16.56346893310547, "learning_rate": 1.408876426385404e-05, "loss": 2.4936, "step": 58 }, { "epoch": 0.02, "grad_norm": 13.435903549194336, "learning_rate": 1.4088373592126525e-05, "loss": 3.2397, "step": 59 }, { "epoch": 0.02, "grad_norm": 17.36369514465332, "learning_rate": 1.4087976249657753e-05, "loss": 2.8435, "step": 60 }, { "epoch": 0.02, "grad_norm": 17.762537002563477, "learning_rate": 1.4087572236824316e-05, "loss": 5.0264, "step": 61 }, { "epoch": 0.02, "grad_norm": 16.8850040435791, "learning_rate": 1.4087161554009124e-05, "loss": 3.2372, "step": 62 }, { "epoch": 0.02, "grad_norm": 15.00660514831543, "learning_rate": 1.4086744201601408e-05, "loss": 3.6194, "step": 63 }, { "epoch": 0.02, "grad_norm": 15.710498809814453, "learning_rate": 1.4086320179996723e-05, "loss": 3.4525, "step": 64 }, { "epoch": 0.02, "grad_norm": 11.125383377075195, "learning_rate": 1.4085889489596941e-05, "loss": 1.8647, "step": 65 }, { "epoch": 0.02, "grad_norm": 13.840350151062012, "learning_rate": 1.4085452130810259e-05, "loss": 1.2356, "step": 66 }, { "epoch": 0.02, "grad_norm": 19.095186233520508, "learning_rate": 1.4085008104051187e-05, "loss": 5.6668, "step": 67 }, { "epoch": 0.02, "grad_norm": 28.46858024597168, "learning_rate": 1.4084557409740563e-05, "loss": 3.5707, "step": 68 }, { "epoch": 0.02, "grad_norm": 15.306602478027344, "learning_rate": 1.408410004830554e-05, "loss": 2.8939, "step": 69 }, { "epoch": 0.02, "grad_norm": 14.545845031738281, "learning_rate": 1.4083636020179586e-05, "loss": 1.1766, "step": 70 }, { "epoch": 0.02, "grad_norm": 16.976905822753906, "learning_rate": 1.4083165325802496e-05, "loss": 2.7053, "step": 71 }, { "epoch": 0.02, "grad_norm": 15.33132553100586, "learning_rate": 1.4082687965620378e-05, "loss": 1.5187, "step": 72 }, { "epoch": 0.02, "grad_norm": 14.40718936920166, "learning_rate": 1.4082203940085658e-05, "loss": 4.6465, "step": 73 }, { "epoch": 0.02, "grad_norm": 17.957759857177734, "learning_rate": 1.408171324965708e-05, "loss": 5.891, "step": 74 }, { "epoch": 0.02, "grad_norm": 11.309581756591797, "learning_rate": 1.4081215894799704e-05, "loss": 1.3456, "step": 75 }, { "epoch": 0.02, "grad_norm": 16.916112899780273, "learning_rate": 1.4080711875984907e-05, "loss": 7.3883, "step": 76 }, { "epoch": 0.02, "grad_norm": 17.052242279052734, "learning_rate": 1.408020119369038e-05, "loss": 4.8764, "step": 77 }, { "epoch": 0.02, "grad_norm": 13.979994773864746, "learning_rate": 1.4079683848400135e-05, "loss": 2.1218, "step": 78 }, { "epoch": 0.02, "grad_norm": 12.151473999023438, "learning_rate": 1.407915984060449e-05, "loss": 1.468, "step": 79 }, { "epoch": 0.02, "grad_norm": 14.50087833404541, "learning_rate": 1.4078629170800088e-05, "loss": 2.8174, "step": 80 }, { "epoch": 0.03, "grad_norm": 11.1065092086792, "learning_rate": 1.4078091839489877e-05, "loss": 2.0711, "step": 81 }, { "epoch": 0.03, "grad_norm": 18.599384307861328, "learning_rate": 1.4077547847183122e-05, "loss": 1.5431, "step": 82 }, { "epoch": 0.03, "grad_norm": 18.681108474731445, "learning_rate": 1.40769971943954e-05, "loss": 4.8315, "step": 83 }, { "epoch": 0.03, "grad_norm": 11.349267959594727, "learning_rate": 1.4076439881648607e-05, "loss": 5.1925, "step": 84 }, { "epoch": 0.03, "grad_norm": 22.059520721435547, "learning_rate": 1.4075875909470941e-05, "loss": 3.7472, "step": 85 }, { "epoch": 0.03, "grad_norm": 13.839190483093262, "learning_rate": 1.407530527839692e-05, "loss": 5.2631, "step": 86 }, { "epoch": 0.03, "grad_norm": 12.494612693786621, "learning_rate": 1.4074727988967365e-05, "loss": 2.383, "step": 87 }, { "epoch": 0.03, "grad_norm": 12.313505172729492, "learning_rate": 1.4074144041729414e-05, "loss": 1.9434, "step": 88 }, { "epoch": 0.03, "grad_norm": 27.630678176879883, "learning_rate": 1.4073553437236517e-05, "loss": 3.9264, "step": 89 }, { "epoch": 0.03, "grad_norm": 12.239396095275879, "learning_rate": 1.4072956176048426e-05, "loss": 2.1466, "step": 90 }, { "epoch": 0.03, "grad_norm": 15.377116203308105, "learning_rate": 1.4072352258731206e-05, "loss": 1.8366, "step": 91 }, { "epoch": 0.03, "grad_norm": 11.560379981994629, "learning_rate": 1.407174168585723e-05, "loss": 1.4054, "step": 92 }, { "epoch": 0.03, "grad_norm": 11.187500953674316, "learning_rate": 1.4071124458005179e-05, "loss": 2.5571, "step": 93 }, { "epoch": 0.03, "grad_norm": 12.817543029785156, "learning_rate": 1.4070500575760041e-05, "loss": 1.2708, "step": 94 }, { "epoch": 0.03, "grad_norm": 12.608696937561035, "learning_rate": 1.4069870039713115e-05, "loss": 2.3783, "step": 95 }, { "epoch": 0.03, "grad_norm": 15.780579566955566, "learning_rate": 1.4069232850461997e-05, "loss": 5.6665, "step": 96 }, { "epoch": 0.03, "grad_norm": 8.685426712036133, "learning_rate": 1.40685890086106e-05, "loss": 0.9811, "step": 97 }, { "epoch": 0.03, "grad_norm": 22.659534454345703, "learning_rate": 1.4067938514769132e-05, "loss": 8.1078, "step": 98 }, { "epoch": 0.03, "grad_norm": 12.488410949707031, "learning_rate": 1.4067281369554112e-05, "loss": 2.6526, "step": 99 }, { "epoch": 0.03, "grad_norm": 12.394866943359375, "learning_rate": 1.4066617573588359e-05, "loss": 1.8688, "step": 100 }, { "epoch": 0.03, "grad_norm": 18.25127410888672, "learning_rate": 1.4065947127500999e-05, "loss": 5.0283, "step": 101 }, { "epoch": 0.03, "grad_norm": 15.696410179138184, "learning_rate": 1.4065270031927457e-05, "loss": 4.6801, "step": 102 }, { "epoch": 0.03, "grad_norm": 15.974501609802246, "learning_rate": 1.4064586287509465e-05, "loss": 4.2069, "step": 103 }, { "epoch": 0.03, "grad_norm": 20.77216339111328, "learning_rate": 1.4063895894895051e-05, "loss": 6.9798, "step": 104 }, { "epoch": 0.03, "grad_norm": 16.946361541748047, "learning_rate": 1.4063198854738549e-05, "loss": 4.9877, "step": 105 }, { "epoch": 0.03, "grad_norm": 20.423288345336914, "learning_rate": 1.4062495167700588e-05, "loss": 5.478, "step": 106 }, { "epoch": 0.03, "grad_norm": 15.431811332702637, "learning_rate": 1.4061784834448102e-05, "loss": 5.1891, "step": 107 }, { "epoch": 0.03, "grad_norm": 12.419416427612305, "learning_rate": 1.4061067855654323e-05, "loss": 1.5543, "step": 108 }, { "epoch": 0.03, "grad_norm": 15.559722900390625, "learning_rate": 1.4060344231998774e-05, "loss": 2.5583, "step": 109 }, { "epoch": 0.03, "grad_norm": 11.121515274047852, "learning_rate": 1.405961396416729e-05, "loss": 1.8617, "step": 110 }, { "epoch": 0.03, "grad_norm": 13.224935531616211, "learning_rate": 1.4058877052851988e-05, "loss": 2.214, "step": 111 }, { "epoch": 0.03, "grad_norm": 10.346721649169922, "learning_rate": 1.4058133498751294e-05, "loss": 1.0174, "step": 112 }, { "epoch": 0.04, "grad_norm": 13.830920219421387, "learning_rate": 1.405738330256992e-05, "loss": 2.2093, "step": 113 }, { "epoch": 0.04, "grad_norm": 11.912773132324219, "learning_rate": 1.4056626465018882e-05, "loss": 1.6604, "step": 114 }, { "epoch": 0.04, "grad_norm": 13.618285179138184, "learning_rate": 1.4055862986815482e-05, "loss": 2.3275, "step": 115 }, { "epoch": 0.04, "grad_norm": 17.236412048339844, "learning_rate": 1.4055092868683323e-05, "loss": 5.0808, "step": 116 }, { "epoch": 0.04, "grad_norm": 19.668128967285156, "learning_rate": 1.4054316111352295e-05, "loss": 3.5998, "step": 117 }, { "epoch": 0.04, "grad_norm": 24.973814010620117, "learning_rate": 1.4053532715558585e-05, "loss": 2.1121, "step": 118 }, { "epoch": 0.04, "grad_norm": 12.31823444366455, "learning_rate": 1.405274268204467e-05, "loss": 1.8002, "step": 119 }, { "epoch": 0.04, "grad_norm": 10.556957244873047, "learning_rate": 1.4051946011559316e-05, "loss": 1.3042, "step": 120 }, { "epoch": 0.04, "grad_norm": 17.19139862060547, "learning_rate": 1.4051142704857584e-05, "loss": 4.2733, "step": 121 }, { "epoch": 0.04, "grad_norm": 22.5864315032959, "learning_rate": 1.405033276270082e-05, "loss": 5.6605, "step": 122 }, { "epoch": 0.04, "grad_norm": 18.1478328704834, "learning_rate": 1.4049516185856663e-05, "loss": 2.2058, "step": 123 }, { "epoch": 0.04, "grad_norm": 11.945579528808594, "learning_rate": 1.4048692975099036e-05, "loss": 1.4621, "step": 124 }, { "epoch": 0.04, "grad_norm": 13.165715217590332, "learning_rate": 1.4047863131208152e-05, "loss": 2.9207, "step": 125 }, { "epoch": 0.04, "grad_norm": 13.359014511108398, "learning_rate": 1.4047026654970509e-05, "loss": 1.7454, "step": 126 }, { "epoch": 0.04, "grad_norm": 14.294341087341309, "learning_rate": 1.4046183547178893e-05, "loss": 1.0382, "step": 127 }, { "epoch": 0.04, "grad_norm": 19.4971866607666, "learning_rate": 1.4045333808632374e-05, "loss": 4.2891, "step": 128 }, { "epoch": 0.04, "grad_norm": 9.049226760864258, "learning_rate": 1.4044477440136308e-05, "loss": 0.9562, "step": 129 }, { "epoch": 0.04, "grad_norm": 9.53789234161377, "learning_rate": 1.4043614442502331e-05, "loss": 1.303, "step": 130 }, { "epoch": 0.04, "grad_norm": 8.629888534545898, "learning_rate": 1.4042744816548363e-05, "loss": 1.0109, "step": 131 }, { "epoch": 0.04, "grad_norm": 11.556740760803223, "learning_rate": 1.4041868563098612e-05, "loss": 1.2822, "step": 132 }, { "epoch": 0.04, "grad_norm": 14.855392456054688, "learning_rate": 1.4040985682983557e-05, "loss": 2.5222, "step": 133 }, { "epoch": 0.04, "grad_norm": 20.333721160888672, "learning_rate": 1.4040096177039966e-05, "loss": 3.1756, "step": 134 }, { "epoch": 0.04, "grad_norm": 19.936588287353516, "learning_rate": 1.4039200046110882e-05, "loss": 2.2128, "step": 135 }, { "epoch": 0.04, "grad_norm": 11.913959503173828, "learning_rate": 1.4038297291045633e-05, "loss": 1.2175, "step": 136 }, { "epoch": 0.04, "grad_norm": 16.783187866210938, "learning_rate": 1.4037387912699815e-05, "loss": 2.6931, "step": 137 }, { "epoch": 0.04, "grad_norm": 10.196114540100098, "learning_rate": 1.4036471911935311e-05, "loss": 0.658, "step": 138 }, { "epoch": 0.04, "grad_norm": 14.82783031463623, "learning_rate": 1.4035549289620276e-05, "loss": 3.1209, "step": 139 }, { "epoch": 0.04, "grad_norm": 19.238548278808594, "learning_rate": 1.403462004662914e-05, "loss": 2.9366, "step": 140 }, { "epoch": 0.04, "grad_norm": 10.233245849609375, "learning_rate": 1.4033684183842608e-05, "loss": 0.8557, "step": 141 }, { "epoch": 0.04, "grad_norm": 13.334661483764648, "learning_rate": 1.4032741702147666e-05, "loss": 1.3891, "step": 142 }, { "epoch": 0.04, "grad_norm": 15.853718757629395, "learning_rate": 1.4031792602437564e-05, "loss": 2.0343, "step": 143 }, { "epoch": 0.04, "grad_norm": 10.005762100219727, "learning_rate": 1.4030836885611825e-05, "loss": 1.4376, "step": 144 }, { "epoch": 0.04, "grad_norm": 14.192896842956543, "learning_rate": 1.4029874552576251e-05, "loss": 2.3502, "step": 145 }, { "epoch": 0.05, "grad_norm": 16.873315811157227, "learning_rate": 1.4028905604242907e-05, "loss": 5.7526, "step": 146 }, { "epoch": 0.05, "grad_norm": 15.167844772338867, "learning_rate": 1.402793004153013e-05, "loss": 0.8518, "step": 147 }, { "epoch": 0.05, "grad_norm": 14.539708137512207, "learning_rate": 1.402694786536253e-05, "loss": 4.8231, "step": 148 }, { "epoch": 0.05, "grad_norm": 13.99748420715332, "learning_rate": 1.4025959076670982e-05, "loss": 1.6392, "step": 149 }, { "epoch": 0.05, "grad_norm": 13.567449569702148, "learning_rate": 1.4024963676392621e-05, "loss": 2.9904, "step": 150 }, { "epoch": 0.05, "grad_norm": 12.029122352600098, "learning_rate": 1.4023961665470862e-05, "loss": 2.3267, "step": 151 }, { "epoch": 0.05, "grad_norm": 14.277032852172852, "learning_rate": 1.4022953044855374e-05, "loss": 3.3196, "step": 152 }, { "epoch": 0.05, "grad_norm": 12.015501976013184, "learning_rate": 1.40219378155021e-05, "loss": 2.5454, "step": 153 }, { "epoch": 0.05, "grad_norm": 13.410118103027344, "learning_rate": 1.4020915978373233e-05, "loss": 1.3893, "step": 154 }, { "epoch": 0.05, "grad_norm": 17.930435180664062, "learning_rate": 1.4019887534437244e-05, "loss": 1.8514, "step": 155 }, { "epoch": 0.05, "grad_norm": 16.743331909179688, "learning_rate": 1.4018852484668857e-05, "loss": 2.6936, "step": 156 }, { "epoch": 0.05, "grad_norm": 13.879019737243652, "learning_rate": 1.4017810830049058e-05, "loss": 1.5375, "step": 157 }, { "epoch": 0.05, "grad_norm": 12.569742202758789, "learning_rate": 1.4016762571565094e-05, "loss": 3.0512, "step": 158 }, { "epoch": 0.05, "grad_norm": 9.77190113067627, "learning_rate": 1.4015707710210466e-05, "loss": 1.0528, "step": 159 }, { "epoch": 0.05, "grad_norm": 17.27251434326172, "learning_rate": 1.4014646246984943e-05, "loss": 2.5514, "step": 160 }, { "epoch": 0.05, "grad_norm": 14.10989761352539, "learning_rate": 1.4013578182894543e-05, "loss": 1.5854, "step": 161 }, { "epoch": 0.05, "grad_norm": 20.85812759399414, "learning_rate": 1.4012503518951542e-05, "loss": 1.6049, "step": 162 }, { "epoch": 0.05, "grad_norm": 13.70592975616455, "learning_rate": 1.4011422256174472e-05, "loss": 2.8355, "step": 163 }, { "epoch": 0.05, "grad_norm": 14.205408096313477, "learning_rate": 1.4010334395588117e-05, "loss": 2.7484, "step": 164 }, { "epoch": 0.05, "grad_norm": 19.103593826293945, "learning_rate": 1.400923993822352e-05, "loss": 1.7546, "step": 165 }, { "epoch": 0.05, "grad_norm": 11.702698707580566, "learning_rate": 1.4008138885117966e-05, "loss": 1.2193, "step": 166 }, { "epoch": 0.05, "grad_norm": 18.2764835357666, "learning_rate": 1.4007031237315003e-05, "loss": 2.5745, "step": 167 }, { "epoch": 0.05, "grad_norm": 13.94648551940918, "learning_rate": 1.400591699586442e-05, "loss": 1.5701, "step": 168 }, { "epoch": 0.05, "grad_norm": 11.69957160949707, "learning_rate": 1.400479616182226e-05, "loss": 1.8477, "step": 169 }, { "epoch": 0.05, "grad_norm": 23.002796173095703, "learning_rate": 1.4003668736250814e-05, "loss": 2.777, "step": 170 }, { "epoch": 0.05, "grad_norm": 12.981588363647461, "learning_rate": 1.4002534720218615e-05, "loss": 5.5741, "step": 171 }, { "epoch": 0.05, "grad_norm": 16.71035385131836, "learning_rate": 1.4001394114800454e-05, "loss": 4.5123, "step": 172 }, { "epoch": 0.05, "grad_norm": 13.110565185546875, "learning_rate": 1.4000246921077352e-05, "loss": 2.8851, "step": 173 }, { "epoch": 0.05, "grad_norm": 11.49026107788086, "learning_rate": 1.3999093140136585e-05, "loss": 3.6336, "step": 174 }, { "epoch": 0.05, "grad_norm": 10.91289234161377, "learning_rate": 1.3997932773071671e-05, "loss": 1.3063, "step": 175 }, { "epoch": 0.05, "grad_norm": 17.452695846557617, "learning_rate": 1.3996765820982366e-05, "loss": 3.5251, "step": 176 }, { "epoch": 0.05, "grad_norm": 18.41887855529785, "learning_rate": 1.3995592284974667e-05, "loss": 1.9724, "step": 177 }, { "epoch": 0.06, "grad_norm": 17.130496978759766, "learning_rate": 1.3994412166160817e-05, "loss": 2.8272, "step": 178 }, { "epoch": 0.06, "grad_norm": 11.504462242126465, "learning_rate": 1.3993225465659294e-05, "loss": 2.5295, "step": 179 }, { "epoch": 0.06, "grad_norm": 9.741791725158691, "learning_rate": 1.3992032184594812e-05, "loss": 0.7621, "step": 180 }, { "epoch": 0.06, "grad_norm": 13.722132682800293, "learning_rate": 1.3990832324098323e-05, "loss": 2.4034, "step": 181 }, { "epoch": 0.06, "grad_norm": 14.14046573638916, "learning_rate": 1.3989625885307021e-05, "loss": 2.0134, "step": 182 }, { "epoch": 0.06, "grad_norm": 15.722448348999023, "learning_rate": 1.3988412869364323e-05, "loss": 5.0702, "step": 183 }, { "epoch": 0.06, "grad_norm": 7.00212287902832, "learning_rate": 1.398719327741989e-05, "loss": 0.8542, "step": 184 }, { "epoch": 0.06, "grad_norm": 13.306962013244629, "learning_rate": 1.398596711062961e-05, "loss": 2.0083, "step": 185 }, { "epoch": 0.06, "grad_norm": 11.195137977600098, "learning_rate": 1.3984734370155603e-05, "loss": 1.4735, "step": 186 }, { "epoch": 0.06, "grad_norm": 14.470940589904785, "learning_rate": 1.3983495057166222e-05, "loss": 5.086, "step": 187 }, { "epoch": 0.06, "grad_norm": 14.928512573242188, "learning_rate": 1.398224917283605e-05, "loss": 2.2687, "step": 188 }, { "epoch": 0.06, "grad_norm": 17.653017044067383, "learning_rate": 1.398099671834589e-05, "loss": 5.4149, "step": 189 }, { "epoch": 0.06, "grad_norm": 11.542997360229492, "learning_rate": 1.3979737694882781e-05, "loss": 1.5893, "step": 190 }, { "epoch": 0.06, "grad_norm": 14.671969413757324, "learning_rate": 1.3978472103639984e-05, "loss": 2.5069, "step": 191 }, { "epoch": 0.06, "grad_norm": 18.104692459106445, "learning_rate": 1.3977199945816983e-05, "loss": 2.2704, "step": 192 }, { "epoch": 0.06, "grad_norm": 16.34786605834961, "learning_rate": 1.397592122261949e-05, "loss": 4.7383, "step": 193 }, { "epoch": 0.06, "grad_norm": 11.198836326599121, "learning_rate": 1.3974635935259439e-05, "loss": 2.0531, "step": 194 }, { "epoch": 0.06, "grad_norm": 9.655235290527344, "learning_rate": 1.3973344084954981e-05, "loss": 1.5275, "step": 195 }, { "epoch": 0.06, "grad_norm": 14.924242973327637, "learning_rate": 1.3972045672930486e-05, "loss": 4.6574, "step": 196 }, { "epoch": 0.06, "grad_norm": 17.535531997680664, "learning_rate": 1.3970740700416554e-05, "loss": 5.0241, "step": 197 }, { "epoch": 0.06, "grad_norm": 13.114269256591797, "learning_rate": 1.3969429168649992e-05, "loss": 5.2485, "step": 198 }, { "epoch": 0.06, "grad_norm": 12.427637100219727, "learning_rate": 1.3968111078873827e-05, "loss": 2.0893, "step": 199 }, { "epoch": 0.06, "grad_norm": 9.180805206298828, "learning_rate": 1.39667864323373e-05, "loss": 0.7392, "step": 200 }, { "epoch": 0.06, "grad_norm": 14.39584732055664, "learning_rate": 1.3965455230295872e-05, "loss": 3.086, "step": 201 }, { "epoch": 0.06, "grad_norm": 14.689746856689453, "learning_rate": 1.3964117474011208e-05, "loss": 1.6546, "step": 202 }, { "epoch": 0.06, "grad_norm": 17.625497817993164, "learning_rate": 1.3962773164751195e-05, "loss": 2.9516, "step": 203 }, { "epoch": 0.06, "grad_norm": 10.085124969482422, "learning_rate": 1.396142230378992e-05, "loss": 1.0635, "step": 204 }, { "epoch": 0.06, "grad_norm": 12.808405876159668, "learning_rate": 1.396006489240769e-05, "loss": 5.0727, "step": 205 }, { "epoch": 0.06, "grad_norm": 17.288103103637695, "learning_rate": 1.3958700931891013e-05, "loss": 3.326, "step": 206 }, { "epoch": 0.06, "grad_norm": 15.653545379638672, "learning_rate": 1.3957330423532604e-05, "loss": 1.8213, "step": 207 }, { "epoch": 0.06, "grad_norm": 11.816011428833008, "learning_rate": 1.3955953368631391e-05, "loss": 1.4498, "step": 208 }, { "epoch": 0.06, "grad_norm": 11.988968849182129, "learning_rate": 1.3954569768492497e-05, "loss": 1.7693, "step": 209 }, { "epoch": 0.07, "grad_norm": 14.014742851257324, "learning_rate": 1.3953179624427257e-05, "loss": 2.9494, "step": 210 }, { "epoch": 0.07, "grad_norm": 15.824225425720215, "learning_rate": 1.3951782937753205e-05, "loss": 4.9419, "step": 211 }, { "epoch": 0.07, "grad_norm": 18.75404167175293, "learning_rate": 1.395037970979407e-05, "loss": 4.0696, "step": 212 }, { "epoch": 0.07, "grad_norm": 22.080371856689453, "learning_rate": 1.394896994187979e-05, "loss": 4.0442, "step": 213 }, { "epoch": 0.07, "grad_norm": 10.431167602539062, "learning_rate": 1.3947553635346495e-05, "loss": 1.5332, "step": 214 }, { "epoch": 0.07, "grad_norm": 12.02080249786377, "learning_rate": 1.3946130791536518e-05, "loss": 2.3295, "step": 215 }, { "epoch": 0.07, "grad_norm": 12.056671142578125, "learning_rate": 1.394470141179838e-05, "loss": 5.4534, "step": 216 }, { "epoch": 0.07, "grad_norm": 6.8713698387146, "learning_rate": 1.3943265497486799e-05, "loss": 1.199, "step": 217 }, { "epoch": 0.07, "grad_norm": 18.43778419494629, "learning_rate": 1.3941823049962691e-05, "loss": 5.0473, "step": 218 }, { "epoch": 0.07, "grad_norm": 12.254149436950684, "learning_rate": 1.3940374070593162e-05, "loss": 2.1543, "step": 219 }, { "epoch": 0.07, "grad_norm": 19.002323150634766, "learning_rate": 1.3938918560751505e-05, "loss": 2.6575, "step": 220 }, { "epoch": 0.07, "grad_norm": 16.23662757873535, "learning_rate": 1.3937456521817205e-05, "loss": 6.1765, "step": 221 }, { "epoch": 0.07, "grad_norm": 17.02012825012207, "learning_rate": 1.3935987955175937e-05, "loss": 6.0835, "step": 222 }, { "epoch": 0.07, "grad_norm": 14.135772705078125, "learning_rate": 1.3934512862219556e-05, "loss": 1.0666, "step": 223 }, { "epoch": 0.07, "grad_norm": 13.325325965881348, "learning_rate": 1.3933031244346113e-05, "loss": 3.2572, "step": 224 }, { "epoch": 0.07, "grad_norm": 13.025094985961914, "learning_rate": 1.3931543102959834e-05, "loss": 1.2652, "step": 225 }, { "epoch": 0.07, "grad_norm": 14.37082290649414, "learning_rate": 1.393004843947113e-05, "loss": 2.2742, "step": 226 }, { "epoch": 0.07, "grad_norm": 10.832401275634766, "learning_rate": 1.3928547255296597e-05, "loss": 1.2151, "step": 227 }, { "epoch": 0.07, "grad_norm": 12.949665069580078, "learning_rate": 1.3927039551859006e-05, "loss": 2.5363, "step": 228 }, { "epoch": 0.07, "grad_norm": 11.304301261901855, "learning_rate": 1.3925525330587312e-05, "loss": 5.016, "step": 229 }, { "epoch": 0.07, "grad_norm": 13.987974166870117, "learning_rate": 1.3924004592916641e-05, "loss": 5.0027, "step": 230 }, { "epoch": 0.07, "grad_norm": 18.76968002319336, "learning_rate": 1.39224773402883e-05, "loss": 2.382, "step": 231 }, { "epoch": 0.07, "grad_norm": 14.265201568603516, "learning_rate": 1.392094357414977e-05, "loss": 1.8023, "step": 232 }, { "epoch": 0.07, "grad_norm": 9.563916206359863, "learning_rate": 1.3919403295954707e-05, "loss": 1.0785, "step": 233 }, { "epoch": 0.07, "grad_norm": 11.332926750183105, "learning_rate": 1.3917856507162931e-05, "loss": 1.4225, "step": 234 }, { "epoch": 0.07, "grad_norm": 15.125313758850098, "learning_rate": 1.3916303209240444e-05, "loss": 1.3242, "step": 235 }, { "epoch": 0.07, "grad_norm": 13.948397636413574, "learning_rate": 1.3914743403659406e-05, "loss": 1.6152, "step": 236 }, { "epoch": 0.07, "grad_norm": 12.970022201538086, "learning_rate": 1.3913177091898151e-05, "loss": 1.9887, "step": 237 }, { "epoch": 0.07, "grad_norm": 15.360918045043945, "learning_rate": 1.391160427544118e-05, "loss": 1.711, "step": 238 }, { "epoch": 0.07, "grad_norm": 13.110067367553711, "learning_rate": 1.3910024955779158e-05, "loss": 2.768, "step": 239 }, { "epoch": 0.07, "grad_norm": 15.645023345947266, "learning_rate": 1.3908439134408911e-05, "loss": 2.8457, "step": 240 }, { "epoch": 0.07, "grad_norm": 14.423534393310547, "learning_rate": 1.3906846812833429e-05, "loss": 1.2738, "step": 241 }, { "epoch": 0.07, "grad_norm": 15.688253402709961, "learning_rate": 1.3905247992561863e-05, "loss": 3.8152, "step": 242 }, { "epoch": 0.08, "grad_norm": 10.980319023132324, "learning_rate": 1.390364267510952e-05, "loss": 1.3614, "step": 243 }, { "epoch": 0.08, "grad_norm": 12.396924018859863, "learning_rate": 1.390203086199787e-05, "loss": 2.2875, "step": 244 }, { "epoch": 0.08, "grad_norm": 9.623881340026855, "learning_rate": 1.390041255475454e-05, "loss": 1.0153, "step": 245 }, { "epoch": 0.08, "grad_norm": 14.678177833557129, "learning_rate": 1.3898787754913304e-05, "loss": 2.5339, "step": 246 }, { "epoch": 0.08, "grad_norm": 18.33745765686035, "learning_rate": 1.3897156464014096e-05, "loss": 1.5622, "step": 247 }, { "epoch": 0.08, "grad_norm": 11.040255546569824, "learning_rate": 1.3895518683603002e-05, "loss": 1.2037, "step": 248 }, { "epoch": 0.08, "grad_norm": 20.958837509155273, "learning_rate": 1.3893874415232256e-05, "loss": 2.3203, "step": 249 }, { "epoch": 0.08, "grad_norm": 11.519689559936523, "learning_rate": 1.389222366046024e-05, "loss": 0.971, "step": 250 }, { "epoch": 0.08, "grad_norm": 10.441787719726562, "learning_rate": 1.3890566420851487e-05, "loss": 0.9101, "step": 251 }, { "epoch": 0.08, "grad_norm": 15.943977355957031, "learning_rate": 1.3888902697976677e-05, "loss": 1.9499, "step": 252 }, { "epoch": 0.08, "grad_norm": 18.179977416992188, "learning_rate": 1.3887232493412629e-05, "loss": 1.6955, "step": 253 }, { "epoch": 0.08, "grad_norm": 12.42634391784668, "learning_rate": 1.388555580874231e-05, "loss": 2.1465, "step": 254 }, { "epoch": 0.08, "grad_norm": 10.933977127075195, "learning_rate": 1.3883872645554827e-05, "loss": 0.9518, "step": 255 }, { "epoch": 0.08, "grad_norm": 20.83428382873535, "learning_rate": 1.3882183005445427e-05, "loss": 3.556, "step": 256 }, { "epoch": 0.08, "grad_norm": 13.598001480102539, "learning_rate": 1.3880486890015494e-05, "loss": 2.433, "step": 257 }, { "epoch": 0.08, "grad_norm": 15.594669342041016, "learning_rate": 1.3878784300872555e-05, "loss": 1.9596, "step": 258 }, { "epoch": 0.08, "grad_norm": 15.173829078674316, "learning_rate": 1.3877075239630266e-05, "loss": 2.4792, "step": 259 }, { "epoch": 0.08, "grad_norm": 10.16698169708252, "learning_rate": 1.387535970790842e-05, "loss": 0.8115, "step": 260 }, { "epoch": 0.08, "grad_norm": 24.462717056274414, "learning_rate": 1.3873637707332943e-05, "loss": 5.0937, "step": 261 }, { "epoch": 0.08, "grad_norm": 10.96473217010498, "learning_rate": 1.387190923953589e-05, "loss": 0.8035, "step": 262 }, { "epoch": 0.08, "grad_norm": 15.601380348205566, "learning_rate": 1.3870174306155446e-05, "loss": 2.7259, "step": 263 }, { "epoch": 0.08, "grad_norm": 13.688828468322754, "learning_rate": 1.3868432908835925e-05, "loss": 2.0762, "step": 264 }, { "epoch": 0.08, "grad_norm": 12.210821151733398, "learning_rate": 1.3866685049227767e-05, "loss": 2.1519, "step": 265 }, { "epoch": 0.08, "grad_norm": 14.092357635498047, "learning_rate": 1.3864930728987538e-05, "loss": 1.3382, "step": 266 }, { "epoch": 0.08, "grad_norm": 26.011722564697266, "learning_rate": 1.3863169949777922e-05, "loss": 3.3422, "step": 267 }, { "epoch": 0.08, "grad_norm": 10.784296989440918, "learning_rate": 1.386140271326773e-05, "loss": 1.18, "step": 268 }, { "epoch": 0.08, "grad_norm": 14.321930885314941, "learning_rate": 1.3859629021131891e-05, "loss": 5.137, "step": 269 }, { "epoch": 0.08, "grad_norm": 17.20169448852539, "learning_rate": 1.3857848875051453e-05, "loss": 2.4465, "step": 270 }, { "epoch": 0.08, "grad_norm": 16.64161491394043, "learning_rate": 1.385606227671358e-05, "loss": 2.3278, "step": 271 }, { "epoch": 0.08, "grad_norm": 18.673120498657227, "learning_rate": 1.3854269227811553e-05, "loss": 2.082, "step": 272 }, { "epoch": 0.08, "grad_norm": 13.86021900177002, "learning_rate": 1.3852469730044761e-05, "loss": 2.2902, "step": 273 }, { "epoch": 0.08, "grad_norm": 15.379273414611816, "learning_rate": 1.3850663785118716e-05, "loss": 4.6714, "step": 274 }, { "epoch": 0.09, "grad_norm": 13.341683387756348, "learning_rate": 1.384885139474503e-05, "loss": 1.8605, "step": 275 }, { "epoch": 0.09, "grad_norm": 10.289388656616211, "learning_rate": 1.3847032560641427e-05, "loss": 1.2596, "step": 276 }, { "epoch": 0.09, "grad_norm": 8.88835620880127, "learning_rate": 1.3845207284531742e-05, "loss": 0.8958, "step": 277 }, { "epoch": 0.09, "grad_norm": 18.075733184814453, "learning_rate": 1.3843375568145908e-05, "loss": 2.1797, "step": 278 }, { "epoch": 0.09, "grad_norm": 20.14925765991211, "learning_rate": 1.3841537413219967e-05, "loss": 2.904, "step": 279 }, { "epoch": 0.09, "grad_norm": 11.297093391418457, "learning_rate": 1.3839692821496065e-05, "loss": 1.7218, "step": 280 }, { "epoch": 0.09, "grad_norm": 11.621965408325195, "learning_rate": 1.3837841794722444e-05, "loss": 1.7314, "step": 281 }, { "epoch": 0.09, "grad_norm": 12.384159088134766, "learning_rate": 1.383598433465345e-05, "loss": 1.74, "step": 282 }, { "epoch": 0.09, "grad_norm": 16.62529754638672, "learning_rate": 1.3834120443049516e-05, "loss": 1.7181, "step": 283 }, { "epoch": 0.09, "grad_norm": 18.35346221923828, "learning_rate": 1.3832250121677187e-05, "loss": 1.9667, "step": 284 }, { "epoch": 0.09, "grad_norm": 16.861061096191406, "learning_rate": 1.3830373372309083e-05, "loss": 2.3541, "step": 285 }, { "epoch": 0.09, "grad_norm": 12.295331001281738, "learning_rate": 1.3828490196723935e-05, "loss": 2.0765, "step": 286 }, { "epoch": 0.09, "grad_norm": 23.350387573242188, "learning_rate": 1.3826600596706548e-05, "loss": 5.8657, "step": 287 }, { "epoch": 0.09, "grad_norm": 17.5737247467041, "learning_rate": 1.382470457404783e-05, "loss": 3.4766, "step": 288 }, { "epoch": 0.09, "grad_norm": 17.406465530395508, "learning_rate": 1.3822802130544763e-05, "loss": 1.1343, "step": 289 }, { "epoch": 0.09, "grad_norm": 10.825047492980957, "learning_rate": 1.3820893268000426e-05, "loss": 2.3149, "step": 290 }, { "epoch": 0.09, "grad_norm": 16.59432601928711, "learning_rate": 1.3818977988223976e-05, "loss": 5.5533, "step": 291 }, { "epoch": 0.09, "grad_norm": 17.41448974609375, "learning_rate": 1.381705629303065e-05, "loss": 4.7277, "step": 292 }, { "epoch": 0.09, "grad_norm": 12.198013305664062, "learning_rate": 1.3815128184241771e-05, "loss": 2.3272, "step": 293 }, { "epoch": 0.09, "grad_norm": 16.00592803955078, "learning_rate": 1.3813193663684735e-05, "loss": 2.0677, "step": 294 }, { "epoch": 0.09, "grad_norm": 16.295944213867188, "learning_rate": 1.381125273319302e-05, "loss": 3.5203, "step": 295 }, { "epoch": 0.09, "grad_norm": 15.805230140686035, "learning_rate": 1.3809305394606176e-05, "loss": 1.6731, "step": 296 }, { "epoch": 0.09, "grad_norm": 10.561698913574219, "learning_rate": 1.3807351649769828e-05, "loss": 1.9388, "step": 297 }, { "epoch": 0.09, "grad_norm": 10.345500946044922, "learning_rate": 1.3805391500535669e-05, "loss": 0.9415, "step": 298 }, { "epoch": 0.09, "grad_norm": 15.150613784790039, "learning_rate": 1.3803424948761468e-05, "loss": 1.6428, "step": 299 }, { "epoch": 0.09, "grad_norm": 18.358455657958984, "learning_rate": 1.3801451996311053e-05, "loss": 3.3602, "step": 300 }, { "epoch": 0.09, "grad_norm": 13.325264930725098, "learning_rate": 1.3799472645054333e-05, "loss": 1.349, "step": 301 }, { "epoch": 0.09, "grad_norm": 11.161480903625488, "learning_rate": 1.3797486896867266e-05, "loss": 1.3497, "step": 302 }, { "epoch": 0.09, "grad_norm": 19.63141441345215, "learning_rate": 1.379549475363188e-05, "loss": 2.321, "step": 303 }, { "epoch": 0.09, "grad_norm": 10.052618980407715, "learning_rate": 1.3793496217236267e-05, "loss": 2.1645, "step": 304 }, { "epoch": 0.09, "grad_norm": 12.28446102142334, "learning_rate": 1.3791491289574571e-05, "loss": 1.6359, "step": 305 }, { "epoch": 0.09, "grad_norm": 16.950092315673828, "learning_rate": 1.3789479972546999e-05, "loss": 2.7423, "step": 306 }, { "epoch": 0.1, "grad_norm": 17.705276489257812, "learning_rate": 1.378746226805981e-05, "loss": 2.0395, "step": 307 }, { "epoch": 0.1, "grad_norm": 17.328041076660156, "learning_rate": 1.378543817802532e-05, "loss": 3.1794, "step": 308 }, { "epoch": 0.1, "grad_norm": 18.853071212768555, "learning_rate": 1.3783407704361897e-05, "loss": 2.7778, "step": 309 }, { "epoch": 0.1, "grad_norm": 14.925019264221191, "learning_rate": 1.3781370848993955e-05, "loss": 3.5123, "step": 310 }, { "epoch": 0.1, "grad_norm": 18.84942054748535, "learning_rate": 1.377932761385196e-05, "loss": 2.1698, "step": 311 }, { "epoch": 0.1, "grad_norm": 10.201909065246582, "learning_rate": 1.3777278000872426e-05, "loss": 1.3381, "step": 312 }, { "epoch": 0.1, "grad_norm": 13.145633697509766, "learning_rate": 1.3775222011997907e-05, "loss": 2.2904, "step": 313 }, { "epoch": 0.1, "grad_norm": 16.376514434814453, "learning_rate": 1.3773159649177003e-05, "loss": 5.1685, "step": 314 }, { "epoch": 0.1, "grad_norm": 13.942585945129395, "learning_rate": 1.3771090914364354e-05, "loss": 2.2097, "step": 315 }, { "epoch": 0.1, "grad_norm": 13.932531356811523, "learning_rate": 1.3769015809520639e-05, "loss": 5.0479, "step": 316 }, { "epoch": 0.1, "grad_norm": 21.648984909057617, "learning_rate": 1.3766934336612577e-05, "loss": 6.155, "step": 317 }, { "epoch": 0.1, "grad_norm": 19.75204849243164, "learning_rate": 1.3764846497612917e-05, "loss": 1.9287, "step": 318 }, { "epoch": 0.1, "grad_norm": 15.067526817321777, "learning_rate": 1.3762752294500447e-05, "loss": 2.5628, "step": 319 }, { "epoch": 0.1, "grad_norm": 11.769791603088379, "learning_rate": 1.3760651729259985e-05, "loss": 1.8491, "step": 320 }, { "epoch": 0.1, "grad_norm": 13.101560592651367, "learning_rate": 1.3758544803882376e-05, "loss": 1.8547, "step": 321 }, { "epoch": 0.1, "grad_norm": 13.543452262878418, "learning_rate": 1.3756431520364499e-05, "loss": 4.2893, "step": 322 }, { "epoch": 0.1, "grad_norm": 22.693283081054688, "learning_rate": 1.375431188070925e-05, "loss": 2.9688, "step": 323 }, { "epoch": 0.1, "grad_norm": 15.341410636901855, "learning_rate": 1.3752185886925557e-05, "loss": 2.0474, "step": 324 }, { "epoch": 0.1, "grad_norm": 13.230315208435059, "learning_rate": 1.375005354102837e-05, "loss": 1.388, "step": 325 }, { "epoch": 0.1, "grad_norm": 15.137369155883789, "learning_rate": 1.3747914845038655e-05, "loss": 1.8743, "step": 326 }, { "epoch": 0.1, "grad_norm": 10.931097984313965, "learning_rate": 1.3745769800983398e-05, "loss": 0.8339, "step": 327 }, { "epoch": 0.1, "grad_norm": 17.05511474609375, "learning_rate": 1.3743618410895602e-05, "loss": 3.0675, "step": 328 }, { "epoch": 0.1, "grad_norm": 15.675482749938965, "learning_rate": 1.3741460676814283e-05, "loss": 2.7984, "step": 329 }, { "epoch": 0.1, "grad_norm": 16.468854904174805, "learning_rate": 1.3739296600784477e-05, "loss": 4.7487, "step": 330 }, { "epoch": 0.1, "grad_norm": 14.824066162109375, "learning_rate": 1.3737126184857218e-05, "loss": 2.1714, "step": 331 }, { "epoch": 0.1, "grad_norm": 16.126014709472656, "learning_rate": 1.3734949431089558e-05, "loss": 4.4093, "step": 332 }, { "epoch": 0.1, "grad_norm": 12.778393745422363, "learning_rate": 1.3732766341544554e-05, "loss": 2.6112, "step": 333 }, { "epoch": 0.1, "grad_norm": 12.41983699798584, "learning_rate": 1.3730576918291266e-05, "loss": 2.6381, "step": 334 }, { "epoch": 0.1, "grad_norm": 9.529387474060059, "learning_rate": 1.3728381163404758e-05, "loss": 1.1488, "step": 335 }, { "epoch": 0.1, "grad_norm": 10.54234790802002, "learning_rate": 1.3726179078966098e-05, "loss": 1.3704, "step": 336 }, { "epoch": 0.1, "grad_norm": 12.4400053024292, "learning_rate": 1.3723970667062345e-05, "loss": 1.4149, "step": 337 }, { "epoch": 0.1, "grad_norm": 10.626307487487793, "learning_rate": 1.3721755929786563e-05, "loss": 1.2803, "step": 338 }, { "epoch": 0.11, "grad_norm": 17.947769165039062, "learning_rate": 1.3719534869237809e-05, "loss": 6.1637, "step": 339 }, { "epoch": 0.11, "grad_norm": 14.783670425415039, "learning_rate": 1.371730748752113e-05, "loss": 3.3733, "step": 340 }, { "epoch": 0.11, "grad_norm": 15.04702091217041, "learning_rate": 1.3715073786747564e-05, "loss": 2.0311, "step": 341 }, { "epoch": 0.11, "grad_norm": 16.079864501953125, "learning_rate": 1.3712833769034142e-05, "loss": 2.1547, "step": 342 }, { "epoch": 0.11, "grad_norm": 10.409101486206055, "learning_rate": 1.371058743650388e-05, "loss": 1.2826, "step": 343 }, { "epoch": 0.11, "grad_norm": 11.11122989654541, "learning_rate": 1.3708334791285779e-05, "loss": 1.2537, "step": 344 }, { "epoch": 0.11, "grad_norm": 13.088030815124512, "learning_rate": 1.3706075835514823e-05, "loss": 1.6252, "step": 345 }, { "epoch": 0.11, "grad_norm": 15.13451862335205, "learning_rate": 1.3703810571331973e-05, "loss": 6.6792, "step": 346 }, { "epoch": 0.11, "grad_norm": 13.995061874389648, "learning_rate": 1.3701539000884178e-05, "loss": 1.1791, "step": 347 }, { "epoch": 0.11, "grad_norm": 13.764225959777832, "learning_rate": 1.3699261126324355e-05, "loss": 3.4514, "step": 348 }, { "epoch": 0.11, "grad_norm": 11.78808307647705, "learning_rate": 1.36969769498114e-05, "loss": 0.7701, "step": 349 }, { "epoch": 0.11, "grad_norm": 16.577268600463867, "learning_rate": 1.3694686473510184e-05, "loss": 4.2284, "step": 350 }, { "epoch": 0.11, "grad_norm": 11.578121185302734, "learning_rate": 1.369238969959154e-05, "loss": 1.2305, "step": 351 }, { "epoch": 0.11, "grad_norm": 16.763050079345703, "learning_rate": 1.369008663023228e-05, "loss": 2.4429, "step": 352 }, { "epoch": 0.11, "grad_norm": 12.74651050567627, "learning_rate": 1.3687777267615179e-05, "loss": 0.9373, "step": 353 }, { "epoch": 0.11, "grad_norm": 12.947549819946289, "learning_rate": 1.3685461613928972e-05, "loss": 2.1318, "step": 354 }, { "epoch": 0.11, "grad_norm": 9.887306213378906, "learning_rate": 1.3683139671368361e-05, "loss": 0.9985, "step": 355 }, { "epoch": 0.11, "grad_norm": 11.960177421569824, "learning_rate": 1.368081144213401e-05, "loss": 1.8042, "step": 356 }, { "epoch": 0.11, "grad_norm": 12.04848861694336, "learning_rate": 1.3678476928432535e-05, "loss": 2.0705, "step": 357 }, { "epoch": 0.11, "grad_norm": 18.955495834350586, "learning_rate": 1.3676136132476515e-05, "loss": 5.26, "step": 358 }, { "epoch": 0.11, "grad_norm": 10.15633487701416, "learning_rate": 1.367378905648448e-05, "loss": 0.9436, "step": 359 }, { "epoch": 0.11, "grad_norm": 17.71897315979004, "learning_rate": 1.367143570268091e-05, "loss": 1.9179, "step": 360 }, { "epoch": 0.11, "grad_norm": 16.045982360839844, "learning_rate": 1.366907607329624e-05, "loss": 1.6184, "step": 361 }, { "epoch": 0.11, "grad_norm": 12.041046142578125, "learning_rate": 1.3666710170566847e-05, "loss": 1.4901, "step": 362 }, { "epoch": 0.11, "grad_norm": 11.907164573669434, "learning_rate": 1.366433799673506e-05, "loss": 2.5307, "step": 363 }, { "epoch": 0.11, "grad_norm": 12.025712966918945, "learning_rate": 1.3661959554049143e-05, "loss": 1.6155, "step": 364 }, { "epoch": 0.11, "grad_norm": 10.814041137695312, "learning_rate": 1.3659574844763313e-05, "loss": 1.7437, "step": 365 }, { "epoch": 0.11, "grad_norm": 16.084716796875, "learning_rate": 1.3657183871137716e-05, "loss": 2.8582, "step": 366 }, { "epoch": 0.11, "grad_norm": 13.304903030395508, "learning_rate": 1.3654786635438439e-05, "loss": 3.8795, "step": 367 }, { "epoch": 0.11, "grad_norm": 12.790328025817871, "learning_rate": 1.3652383139937504e-05, "loss": 3.2627, "step": 368 }, { "epoch": 0.11, "grad_norm": 11.926814079284668, "learning_rate": 1.3649973386912866e-05, "loss": 1.9194, "step": 369 }, { "epoch": 0.11, "grad_norm": 15.041619300842285, "learning_rate": 1.3647557378648409e-05, "loss": 6.1905, "step": 370 }, { "epoch": 0.11, "grad_norm": 18.288606643676758, "learning_rate": 1.364513511743395e-05, "loss": 2.6999, "step": 371 }, { "epoch": 0.12, "grad_norm": 13.575446128845215, "learning_rate": 1.3642706605565228e-05, "loss": 2.5753, "step": 372 }, { "epoch": 0.12, "grad_norm": 11.08791446685791, "learning_rate": 1.3640271845343907e-05, "loss": 1.9656, "step": 373 }, { "epoch": 0.12, "grad_norm": 12.84411907196045, "learning_rate": 1.3637830839077572e-05, "loss": 1.5011, "step": 374 }, { "epoch": 0.12, "grad_norm": 13.908230781555176, "learning_rate": 1.3635383589079732e-05, "loss": 1.8645, "step": 375 }, { "epoch": 0.12, "grad_norm": 22.58487319946289, "learning_rate": 1.363293009766981e-05, "loss": 2.4851, "step": 376 }, { "epoch": 0.12, "grad_norm": 18.689922332763672, "learning_rate": 1.3630470367173147e-05, "loss": 2.8248, "step": 377 }, { "epoch": 0.12, "grad_norm": 18.89048194885254, "learning_rate": 1.3628004399920988e-05, "loss": 1.8686, "step": 378 }, { "epoch": 0.12, "grad_norm": 13.554402351379395, "learning_rate": 1.3625532198250505e-05, "loss": 1.7365, "step": 379 }, { "epoch": 0.12, "grad_norm": 12.678657531738281, "learning_rate": 1.3623053764504769e-05, "loss": 1.3599, "step": 380 }, { "epoch": 0.12, "grad_norm": 19.57718849182129, "learning_rate": 1.3620569101032753e-05, "loss": 1.7862, "step": 381 }, { "epoch": 0.12, "grad_norm": 9.472726821899414, "learning_rate": 1.3618078210189348e-05, "loss": 1.0532, "step": 382 }, { "epoch": 0.12, "grad_norm": 12.552109718322754, "learning_rate": 1.361558109433533e-05, "loss": 5.6745, "step": 383 }, { "epoch": 0.12, "grad_norm": 12.24566650390625, "learning_rate": 1.3613077755837395e-05, "loss": 4.911, "step": 384 }, { "epoch": 0.12, "grad_norm": 18.16314697265625, "learning_rate": 1.3610568197068114e-05, "loss": 1.9886, "step": 385 }, { "epoch": 0.12, "grad_norm": 14.017110824584961, "learning_rate": 1.3608052420405975e-05, "loss": 3.0355, "step": 386 }, { "epoch": 0.12, "grad_norm": 11.076988220214844, "learning_rate": 1.3605530428235346e-05, "loss": 1.3674, "step": 387 }, { "epoch": 0.12, "grad_norm": 17.93915367126465, "learning_rate": 1.3603002222946485e-05, "loss": 6.2092, "step": 388 }, { "epoch": 0.12, "grad_norm": 18.35929298400879, "learning_rate": 1.3600467806935547e-05, "loss": 2.6678, "step": 389 }, { "epoch": 0.12, "grad_norm": 10.46489429473877, "learning_rate": 1.3597927182604568e-05, "loss": 1.1638, "step": 390 }, { "epoch": 0.12, "grad_norm": 13.621467590332031, "learning_rate": 1.3595380352361468e-05, "loss": 2.3239, "step": 391 }, { "epoch": 0.12, "grad_norm": 9.548460006713867, "learning_rate": 1.3592827318620053e-05, "loss": 1.1291, "step": 392 }, { "epoch": 0.12, "grad_norm": 11.570243835449219, "learning_rate": 1.3590268083800002e-05, "loss": 1.7025, "step": 393 }, { "epoch": 0.12, "grad_norm": 15.140338897705078, "learning_rate": 1.3587702650326873e-05, "loss": 2.715, "step": 394 }, { "epoch": 0.12, "grad_norm": 18.13969612121582, "learning_rate": 1.3585131020632103e-05, "loss": 2.1997, "step": 395 }, { "epoch": 0.12, "grad_norm": 17.66150665283203, "learning_rate": 1.3582553197153e-05, "loss": 2.4102, "step": 396 }, { "epoch": 0.12, "grad_norm": 14.307275772094727, "learning_rate": 1.3579969182332738e-05, "loss": 3.3309, "step": 397 }, { "epoch": 0.12, "grad_norm": 13.166078567504883, "learning_rate": 1.3577378978620364e-05, "loss": 1.7019, "step": 398 }, { "epoch": 0.12, "grad_norm": 12.583813667297363, "learning_rate": 1.3574782588470789e-05, "loss": 1.9107, "step": 399 }, { "epoch": 0.12, "grad_norm": 10.487409591674805, "learning_rate": 1.3572180014344785e-05, "loss": 1.4231, "step": 400 }, { "epoch": 0.12, "grad_norm": 18.598587036132812, "learning_rate": 1.3569571258708989e-05, "loss": 2.8625, "step": 401 }, { "epoch": 0.12, "grad_norm": 12.454787254333496, "learning_rate": 1.3566956324035895e-05, "loss": 1.6535, "step": 402 }, { "epoch": 0.12, "grad_norm": 11.811790466308594, "learning_rate": 1.356433521280385e-05, "loss": 2.4083, "step": 403 }, { "epoch": 0.13, "grad_norm": 10.476715087890625, "learning_rate": 1.356170792749706e-05, "loss": 2.9695, "step": 404 }, { "epoch": 0.13, "grad_norm": 14.166971206665039, "learning_rate": 1.3559074470605582e-05, "loss": 1.5907, "step": 405 }, { "epoch": 0.13, "grad_norm": 11.776312828063965, "learning_rate": 1.3556434844625316e-05, "loss": 1.1981, "step": 406 }, { "epoch": 0.13, "grad_norm": 11.133995056152344, "learning_rate": 1.3553789052058017e-05, "loss": 1.6438, "step": 407 }, { "epoch": 0.13, "grad_norm": 19.32341766357422, "learning_rate": 1.3551137095411283e-05, "loss": 2.4166, "step": 408 }, { "epoch": 0.13, "grad_norm": 12.969614028930664, "learning_rate": 1.3548478977198546e-05, "loss": 1.886, "step": 409 }, { "epoch": 0.13, "grad_norm": 15.233928680419922, "learning_rate": 1.354581469993909e-05, "loss": 1.7834, "step": 410 }, { "epoch": 0.13, "grad_norm": 10.977611541748047, "learning_rate": 1.3543144266158026e-05, "loss": 1.0913, "step": 411 }, { "epoch": 0.13, "grad_norm": 15.674370765686035, "learning_rate": 1.3540467678386306e-05, "loss": 5.5188, "step": 412 }, { "epoch": 0.13, "grad_norm": 15.832097053527832, "learning_rate": 1.3537784939160712e-05, "loss": 3.5768, "step": 413 }, { "epoch": 0.13, "grad_norm": 14.40325927734375, "learning_rate": 1.3535096051023858e-05, "loss": 1.9102, "step": 414 }, { "epoch": 0.13, "grad_norm": 15.826641082763672, "learning_rate": 1.3532401016524185e-05, "loss": 2.6502, "step": 415 }, { "epoch": 0.13, "grad_norm": 20.26589584350586, "learning_rate": 1.3529699838215955e-05, "loss": 3.2667, "step": 416 }, { "epoch": 0.13, "grad_norm": 12.244672775268555, "learning_rate": 1.3526992518659262e-05, "loss": 1.8443, "step": 417 }, { "epoch": 0.13, "grad_norm": 17.309690475463867, "learning_rate": 1.3524279060420013e-05, "loss": 2.3227, "step": 418 }, { "epoch": 0.13, "grad_norm": 20.20816421508789, "learning_rate": 1.3521559466069934e-05, "loss": 2.2727, "step": 419 }, { "epoch": 0.13, "grad_norm": 13.342791557312012, "learning_rate": 1.3518833738186571e-05, "loss": 1.4227, "step": 420 }, { "epoch": 0.13, "grad_norm": 10.979636192321777, "learning_rate": 1.3516101879353277e-05, "loss": 2.3114, "step": 421 }, { "epoch": 0.13, "grad_norm": 15.170080184936523, "learning_rate": 1.351336389215922e-05, "loss": 2.8633, "step": 422 }, { "epoch": 0.13, "grad_norm": 18.355520248413086, "learning_rate": 1.3510619779199375e-05, "loss": 2.6312, "step": 423 }, { "epoch": 0.13, "grad_norm": 13.05256462097168, "learning_rate": 1.3507869543074524e-05, "loss": 1.1738, "step": 424 }, { "epoch": 0.13, "grad_norm": 11.840422630310059, "learning_rate": 1.350511318639125e-05, "loss": 2.0828, "step": 425 }, { "epoch": 0.13, "grad_norm": 15.11091423034668, "learning_rate": 1.3502350711761941e-05, "loss": 1.6218, "step": 426 }, { "epoch": 0.13, "grad_norm": 13.374017715454102, "learning_rate": 1.3499582121804774e-05, "loss": 1.4773, "step": 427 }, { "epoch": 0.13, "grad_norm": 12.468167304992676, "learning_rate": 1.3496807419143736e-05, "loss": 2.0516, "step": 428 }, { "epoch": 0.13, "grad_norm": 14.024086952209473, "learning_rate": 1.3494026606408593e-05, "loss": 0.9726, "step": 429 }, { "epoch": 0.13, "grad_norm": 10.078760147094727, "learning_rate": 1.3491239686234912e-05, "loss": 1.5166, "step": 430 }, { "epoch": 0.13, "grad_norm": 20.189712524414062, "learning_rate": 1.3488446661264046e-05, "loss": 4.3669, "step": 431 }, { "epoch": 0.13, "grad_norm": 14.545948028564453, "learning_rate": 1.3485647534143134e-05, "loss": 3.1417, "step": 432 }, { "epoch": 0.13, "grad_norm": 13.813590049743652, "learning_rate": 1.3482842307525094e-05, "loss": 1.9284, "step": 433 }, { "epoch": 0.13, "grad_norm": 6.276806354522705, "learning_rate": 1.348003098406863e-05, "loss": 0.9223, "step": 434 }, { "epoch": 0.13, "grad_norm": 23.6711368560791, "learning_rate": 1.3477213566438224e-05, "loss": 2.635, "step": 435 }, { "epoch": 0.14, "grad_norm": 10.814998626708984, "learning_rate": 1.3474390057304129e-05, "loss": 2.1365, "step": 436 }, { "epoch": 0.14, "grad_norm": 19.597017288208008, "learning_rate": 1.347156045934238e-05, "loss": 6.3379, "step": 437 }, { "epoch": 0.14, "grad_norm": 15.655862808227539, "learning_rate": 1.3468724775234775e-05, "loss": 1.3053, "step": 438 }, { "epoch": 0.14, "grad_norm": 14.103118896484375, "learning_rate": 1.3465883007668883e-05, "loss": 2.5197, "step": 439 }, { "epoch": 0.14, "grad_norm": 14.11824893951416, "learning_rate": 1.3463035159338038e-05, "loss": 4.5738, "step": 440 }, { "epoch": 0.14, "grad_norm": 22.6025447845459, "learning_rate": 1.3460181232941342e-05, "loss": 2.4246, "step": 441 }, { "epoch": 0.14, "grad_norm": 22.4508056640625, "learning_rate": 1.3457321231183648e-05, "loss": 3.8746, "step": 442 }, { "epoch": 0.14, "grad_norm": 17.415388107299805, "learning_rate": 1.3454455156775576e-05, "loss": 8.516, "step": 443 }, { "epoch": 0.14, "grad_norm": 12.681063652038574, "learning_rate": 1.3451583012433499e-05, "loss": 1.5769, "step": 444 }, { "epoch": 0.14, "grad_norm": 15.727627754211426, "learning_rate": 1.3448704800879536e-05, "loss": 2.9668, "step": 445 }, { "epoch": 0.14, "grad_norm": 12.169587135314941, "learning_rate": 1.3445820524841567e-05, "loss": 1.6273, "step": 446 }, { "epoch": 0.14, "grad_norm": 10.842358589172363, "learning_rate": 1.3442930187053217e-05, "loss": 1.948, "step": 447 }, { "epoch": 0.14, "grad_norm": 16.87100601196289, "learning_rate": 1.3440033790253847e-05, "loss": 2.1458, "step": 448 }, { "epoch": 0.14, "grad_norm": 14.896727561950684, "learning_rate": 1.3437131337188573e-05, "loss": 2.6653, "step": 449 }, { "epoch": 0.14, "grad_norm": 11.050118446350098, "learning_rate": 1.3434222830608246e-05, "loss": 1.8485, "step": 450 }, { "epoch": 0.14, "grad_norm": 23.6415958404541, "learning_rate": 1.3431308273269449e-05, "loss": 2.607, "step": 451 }, { "epoch": 0.14, "grad_norm": 13.02411937713623, "learning_rate": 1.3428387667934508e-05, "loss": 2.3825, "step": 452 }, { "epoch": 0.14, "grad_norm": 13.187039375305176, "learning_rate": 1.3425461017371479e-05, "loss": 1.4177, "step": 453 }, { "epoch": 0.14, "grad_norm": 11.099510192871094, "learning_rate": 1.3422528324354144e-05, "loss": 1.9126, "step": 454 }, { "epoch": 0.14, "grad_norm": 14.535717964172363, "learning_rate": 1.341958959166201e-05, "loss": 2.7452, "step": 455 }, { "epoch": 0.14, "grad_norm": 12.59811019897461, "learning_rate": 1.341664482208032e-05, "loss": 1.9522, "step": 456 }, { "epoch": 0.14, "grad_norm": 16.89012336730957, "learning_rate": 1.3413694018400026e-05, "loss": 6.0934, "step": 457 }, { "epoch": 0.14, "grad_norm": 17.078168869018555, "learning_rate": 1.3410737183417803e-05, "loss": 1.7698, "step": 458 }, { "epoch": 0.14, "grad_norm": 12.786270141601562, "learning_rate": 1.3407774319936045e-05, "loss": 1.9643, "step": 459 }, { "epoch": 0.14, "grad_norm": 17.108491897583008, "learning_rate": 1.3404805430762857e-05, "loss": 2.8331, "step": 460 }, { "epoch": 0.14, "grad_norm": 19.87467384338379, "learning_rate": 1.3401830518712055e-05, "loss": 5.4639, "step": 461 }, { "epoch": 0.14, "grad_norm": 12.71600341796875, "learning_rate": 1.3398849586603164e-05, "loss": 2.2668, "step": 462 }, { "epoch": 0.14, "grad_norm": 6.157910346984863, "learning_rate": 1.3395862637261415e-05, "loss": 0.699, "step": 463 }, { "epoch": 0.14, "grad_norm": 20.636333465576172, "learning_rate": 1.3392869673517742e-05, "loss": 3.6988, "step": 464 }, { "epoch": 0.14, "grad_norm": 14.185775756835938, "learning_rate": 1.3389870698208776e-05, "loss": 3.227, "step": 465 }, { "epoch": 0.14, "grad_norm": 27.643817901611328, "learning_rate": 1.3386865714176849e-05, "loss": 2.6686, "step": 466 }, { "epoch": 0.14, "grad_norm": 18.93267822265625, "learning_rate": 1.3383854724269985e-05, "loss": 5.1713, "step": 467 }, { "epoch": 0.15, "grad_norm": 11.180096626281738, "learning_rate": 1.3380837731341908e-05, "loss": 3.6741, "step": 468 }, { "epoch": 0.15, "grad_norm": 18.503711700439453, "learning_rate": 1.3377814738252022e-05, "loss": 5.9296, "step": 469 }, { "epoch": 0.15, "grad_norm": 17.148578643798828, "learning_rate": 1.337478574786542e-05, "loss": 2.8339, "step": 470 }, { "epoch": 0.15, "grad_norm": 12.4744873046875, "learning_rate": 1.3371750763052883e-05, "loss": 1.9957, "step": 471 }, { "epoch": 0.15, "grad_norm": 15.115225791931152, "learning_rate": 1.3368709786690868e-05, "loss": 2.205, "step": 472 }, { "epoch": 0.15, "grad_norm": 21.09969711303711, "learning_rate": 1.3365662821661518e-05, "loss": 2.9407, "step": 473 }, { "epoch": 0.15, "grad_norm": 13.111128807067871, "learning_rate": 1.3362609870852641e-05, "loss": 5.3922, "step": 474 }, { "epoch": 0.15, "grad_norm": 10.798126220703125, "learning_rate": 1.3359550937157732e-05, "loss": 0.9604, "step": 475 }, { "epoch": 0.15, "grad_norm": 10.442574501037598, "learning_rate": 1.3356486023475942e-05, "loss": 1.9992, "step": 476 }, { "epoch": 0.15, "grad_norm": 15.014910697937012, "learning_rate": 1.3353415132712097e-05, "loss": 2.1064, "step": 477 }, { "epoch": 0.15, "grad_norm": 13.74669075012207, "learning_rate": 1.3350338267776692e-05, "loss": 2.6105, "step": 478 }, { "epoch": 0.15, "grad_norm": 14.472315788269043, "learning_rate": 1.3347255431585876e-05, "loss": 3.1315, "step": 479 }, { "epoch": 0.15, "grad_norm": 13.927700996398926, "learning_rate": 1.334416662706146e-05, "loss": 1.6392, "step": 480 }, { "epoch": 0.15, "grad_norm": 23.360815048217773, "learning_rate": 1.3341071857130912e-05, "loss": 4.32, "step": 481 }, { "epoch": 0.15, "grad_norm": 12.88177490234375, "learning_rate": 1.3337971124727359e-05, "loss": 3.0116, "step": 482 }, { "epoch": 0.15, "grad_norm": 13.984797477722168, "learning_rate": 1.3334864432789568e-05, "loss": 2.155, "step": 483 }, { "epoch": 0.15, "grad_norm": 13.673538208007812, "learning_rate": 1.3331751784261964e-05, "loss": 2.4724, "step": 484 }, { "epoch": 0.15, "grad_norm": 17.153629302978516, "learning_rate": 1.3328633182094614e-05, "loss": 3.5844, "step": 485 }, { "epoch": 0.15, "grad_norm": 11.198391914367676, "learning_rate": 1.3325508629243226e-05, "loss": 1.4172, "step": 486 }, { "epoch": 0.15, "grad_norm": 15.56090259552002, "learning_rate": 1.332237812866915e-05, "loss": 4.2267, "step": 487 }, { "epoch": 0.15, "grad_norm": 21.435331344604492, "learning_rate": 1.3319241683339371e-05, "loss": 4.9318, "step": 488 }, { "epoch": 0.15, "grad_norm": 16.9794921875, "learning_rate": 1.3316099296226514e-05, "loss": 3.6705, "step": 489 }, { "epoch": 0.15, "grad_norm": 10.50817584991455, "learning_rate": 1.3312950970308827e-05, "loss": 1.3616, "step": 490 }, { "epoch": 0.15, "grad_norm": 16.41978645324707, "learning_rate": 1.3309796708570194e-05, "loss": 5.9281, "step": 491 }, { "epoch": 0.15, "grad_norm": 17.76799964904785, "learning_rate": 1.3306636514000118e-05, "loss": 1.8393, "step": 492 }, { "epoch": 0.15, "grad_norm": 15.68265438079834, "learning_rate": 1.3303470389593731e-05, "loss": 2.5935, "step": 493 }, { "epoch": 0.15, "grad_norm": 16.82168960571289, "learning_rate": 1.3300298338351783e-05, "loss": 5.2416, "step": 494 }, { "epoch": 0.15, "grad_norm": 10.42543888092041, "learning_rate": 1.3297120363280635e-05, "loss": 1.201, "step": 495 }, { "epoch": 0.15, "grad_norm": 14.174787521362305, "learning_rate": 1.3293936467392277e-05, "loss": 1.582, "step": 496 }, { "epoch": 0.15, "grad_norm": 16.048009872436523, "learning_rate": 1.3290746653704298e-05, "loss": 1.8858, "step": 497 }, { "epoch": 0.15, "grad_norm": 11.09557819366455, "learning_rate": 1.3287550925239899e-05, "loss": 1.8468, "step": 498 }, { "epoch": 0.15, "grad_norm": 13.926736831665039, "learning_rate": 1.3284349285027888e-05, "loss": 6.1665, "step": 499 }, { "epoch": 0.15, "grad_norm": 10.572114944458008, "learning_rate": 1.3281141736102673e-05, "loss": 1.5575, "step": 500 }, { "epoch": 0.16, "grad_norm": 17.857831954956055, "learning_rate": 1.3277928281504268e-05, "loss": 6.2458, "step": 501 }, { "epoch": 0.16, "grad_norm": 15.5122709274292, "learning_rate": 1.3274708924278278e-05, "loss": 2.3332, "step": 502 }, { "epoch": 0.16, "grad_norm": 16.162206649780273, "learning_rate": 1.3271483667475906e-05, "loss": 2.3407, "step": 503 }, { "epoch": 0.16, "grad_norm": 14.517176628112793, "learning_rate": 1.3268252514153947e-05, "loss": 5.2842, "step": 504 }, { "epoch": 0.16, "grad_norm": 19.05678939819336, "learning_rate": 1.326501546737478e-05, "loss": 2.3107, "step": 505 }, { "epoch": 0.16, "grad_norm": 11.515403747558594, "learning_rate": 1.3261772530206374e-05, "loss": 2.2464, "step": 506 }, { "epoch": 0.16, "grad_norm": 23.113420486450195, "learning_rate": 1.3258523705722276e-05, "loss": 6.0012, "step": 507 }, { "epoch": 0.16, "grad_norm": 15.704401969909668, "learning_rate": 1.3255268997001622e-05, "loss": 2.0706, "step": 508 }, { "epoch": 0.16, "grad_norm": 15.194841384887695, "learning_rate": 1.3252008407129117e-05, "loss": 3.7562, "step": 509 }, { "epoch": 0.16, "grad_norm": 16.791894912719727, "learning_rate": 1.3248741939195037e-05, "loss": 3.2929, "step": 510 }, { "epoch": 0.16, "grad_norm": 13.705718040466309, "learning_rate": 1.3245469596295241e-05, "loss": 1.7648, "step": 511 }, { "epoch": 0.16, "grad_norm": 14.339812278747559, "learning_rate": 1.3242191381531145e-05, "loss": 1.8999, "step": 512 }, { "epoch": 0.16, "grad_norm": 13.354348182678223, "learning_rate": 1.3238907298009734e-05, "loss": 1.6501, "step": 513 }, { "epoch": 0.16, "grad_norm": 16.52679443359375, "learning_rate": 1.3235617348843556e-05, "loss": 5.294, "step": 514 }, { "epoch": 0.16, "grad_norm": 23.997087478637695, "learning_rate": 1.3232321537150719e-05, "loss": 3.2144, "step": 515 }, { "epoch": 0.16, "grad_norm": 22.67416763305664, "learning_rate": 1.3229019866054885e-05, "loss": 5.2678, "step": 516 }, { "epoch": 0.16, "grad_norm": 14.221373558044434, "learning_rate": 1.3225712338685269e-05, "loss": 2.6089, "step": 517 }, { "epoch": 0.16, "grad_norm": 14.597294807434082, "learning_rate": 1.322239895817664e-05, "loss": 2.6505, "step": 518 }, { "epoch": 0.16, "grad_norm": 12.288494110107422, "learning_rate": 1.321907972766931e-05, "loss": 1.4499, "step": 519 }, { "epoch": 0.16, "grad_norm": 14.892966270446777, "learning_rate": 1.321575465030914e-05, "loss": 2.4837, "step": 520 }, { "epoch": 0.16, "grad_norm": 12.720625877380371, "learning_rate": 1.3212423729247526e-05, "loss": 2.8317, "step": 521 }, { "epoch": 0.16, "grad_norm": 10.73737907409668, "learning_rate": 1.3209086967641413e-05, "loss": 2.3667, "step": 522 }, { "epoch": 0.16, "grad_norm": 14.303455352783203, "learning_rate": 1.320574436865327e-05, "loss": 2.9824, "step": 523 }, { "epoch": 0.16, "grad_norm": 13.44922924041748, "learning_rate": 1.3202395935451102e-05, "loss": 1.7328, "step": 524 }, { "epoch": 0.16, "grad_norm": 19.608003616333008, "learning_rate": 1.319904167120845e-05, "loss": 5.1686, "step": 525 }, { "epoch": 0.16, "grad_norm": 15.829699516296387, "learning_rate": 1.3195681579104374e-05, "loss": 3.0279, "step": 526 }, { "epoch": 0.16, "grad_norm": 20.571901321411133, "learning_rate": 1.319231566232346e-05, "loss": 1.9538, "step": 527 }, { "epoch": 0.16, "grad_norm": 24.14697265625, "learning_rate": 1.3188943924055812e-05, "loss": 2.0403, "step": 528 }, { "epoch": 0.16, "grad_norm": 10.41229248046875, "learning_rate": 1.3185566367497059e-05, "loss": 1.7593, "step": 529 }, { "epoch": 0.16, "grad_norm": 12.556974411010742, "learning_rate": 1.3182182995848333e-05, "loss": 1.9916, "step": 530 }, { "epoch": 0.16, "grad_norm": 16.047039031982422, "learning_rate": 1.3178793812316288e-05, "loss": 1.9223, "step": 531 }, { "epoch": 0.16, "grad_norm": 10.137266159057617, "learning_rate": 1.3175398820113079e-05, "loss": 1.8314, "step": 532 }, { "epoch": 0.17, "grad_norm": 16.968952178955078, "learning_rate": 1.3171998022456371e-05, "loss": 2.4476, "step": 533 }, { "epoch": 0.17, "grad_norm": 14.670219421386719, "learning_rate": 1.3168591422569328e-05, "loss": 5.2239, "step": 534 }, { "epoch": 0.17, "grad_norm": 15.691137313842773, "learning_rate": 1.3165179023680615e-05, "loss": 2.215, "step": 535 }, { "epoch": 0.17, "grad_norm": 8.702433586120605, "learning_rate": 1.3161760829024394e-05, "loss": 1.2759, "step": 536 }, { "epoch": 0.17, "grad_norm": 13.136799812316895, "learning_rate": 1.3158336841840314e-05, "loss": 3.76, "step": 537 }, { "epoch": 0.17, "grad_norm": 10.581886291503906, "learning_rate": 1.3154907065373523e-05, "loss": 1.1901, "step": 538 }, { "epoch": 0.17, "grad_norm": 18.704313278198242, "learning_rate": 1.3151471502874648e-05, "loss": 2.523, "step": 539 }, { "epoch": 0.17, "grad_norm": 17.407930374145508, "learning_rate": 1.3148030157599808e-05, "loss": 2.3264, "step": 540 }, { "epoch": 0.17, "grad_norm": 18.318424224853516, "learning_rate": 1.3144583032810595e-05, "loss": 3.9198, "step": 541 }, { "epoch": 0.17, "grad_norm": 16.8125057220459, "learning_rate": 1.314113013177408e-05, "loss": 1.7953, "step": 542 }, { "epoch": 0.17, "grad_norm": 12.802943229675293, "learning_rate": 1.3137671457762812e-05, "loss": 2.2133, "step": 543 }, { "epoch": 0.17, "grad_norm": 15.229947090148926, "learning_rate": 1.3134207014054812e-05, "loss": 5.0568, "step": 544 }, { "epoch": 0.17, "grad_norm": 11.939119338989258, "learning_rate": 1.313073680393356e-05, "loss": 1.1138, "step": 545 }, { "epoch": 0.17, "grad_norm": 14.192569732666016, "learning_rate": 1.3127260830688016e-05, "loss": 5.0628, "step": 546 }, { "epoch": 0.17, "grad_norm": 20.657682418823242, "learning_rate": 1.3123779097612589e-05, "loss": 4.2917, "step": 547 }, { "epoch": 0.17, "grad_norm": 16.68743133544922, "learning_rate": 1.3120291608007155e-05, "loss": 2.6444, "step": 548 }, { "epoch": 0.17, "grad_norm": 18.14390754699707, "learning_rate": 1.3116798365177041e-05, "loss": 4.8532, "step": 549 }, { "epoch": 0.17, "grad_norm": 16.764875411987305, "learning_rate": 1.311329937243303e-05, "loss": 1.8359, "step": 550 }, { "epoch": 0.17, "grad_norm": 17.864337921142578, "learning_rate": 1.3109794633091354e-05, "loss": 3.1203, "step": 551 }, { "epoch": 0.17, "grad_norm": 11.85683536529541, "learning_rate": 1.3106284150473688e-05, "loss": 3.0269, "step": 552 }, { "epoch": 0.17, "grad_norm": 10.915852546691895, "learning_rate": 1.3102767927907157e-05, "loss": 1.3493, "step": 553 }, { "epoch": 0.17, "grad_norm": 12.88638973236084, "learning_rate": 1.3099245968724319e-05, "loss": 1.7223, "step": 554 }, { "epoch": 0.17, "grad_norm": 14.523603439331055, "learning_rate": 1.3095718276263174e-05, "loss": 2.2196, "step": 555 }, { "epoch": 0.17, "grad_norm": 15.751676559448242, "learning_rate": 1.3092184853867154e-05, "loss": 5.3548, "step": 556 }, { "epoch": 0.17, "grad_norm": 10.33692741394043, "learning_rate": 1.3088645704885122e-05, "loss": 1.6451, "step": 557 }, { "epoch": 0.17, "grad_norm": 17.207609176635742, "learning_rate": 1.3085100832671366e-05, "loss": 3.7781, "step": 558 }, { "epoch": 0.17, "grad_norm": 19.840381622314453, "learning_rate": 1.30815502405856e-05, "loss": 7.1694, "step": 559 }, { "epoch": 0.17, "grad_norm": 14.988025665283203, "learning_rate": 1.3077993931992962e-05, "loss": 4.8949, "step": 560 }, { "epoch": 0.17, "grad_norm": 11.11162281036377, "learning_rate": 1.3074431910264007e-05, "loss": 5.5227, "step": 561 }, { "epoch": 0.17, "grad_norm": 14.494366645812988, "learning_rate": 1.30708641787747e-05, "loss": 3.0651, "step": 562 }, { "epoch": 0.17, "grad_norm": 11.259400367736816, "learning_rate": 1.306729074090642e-05, "loss": 1.4138, "step": 563 }, { "epoch": 0.17, "grad_norm": 8.006839752197266, "learning_rate": 1.3063711600045956e-05, "loss": 1.6043, "step": 564 }, { "epoch": 0.18, "grad_norm": 19.398590087890625, "learning_rate": 1.3060126759585505e-05, "loss": 3.1046, "step": 565 }, { "epoch": 0.18, "grad_norm": 9.973245620727539, "learning_rate": 1.3056536222922657e-05, "loss": 1.1749, "step": 566 }, { "epoch": 0.18, "grad_norm": 14.982606887817383, "learning_rate": 1.3052939993460407e-05, "loss": 4.2511, "step": 567 }, { "epoch": 0.18, "grad_norm": 13.401564598083496, "learning_rate": 1.3049338074607144e-05, "loss": 2.8225, "step": 568 }, { "epoch": 0.18, "grad_norm": 11.368091583251953, "learning_rate": 1.3045730469776653e-05, "loss": 2.1884, "step": 569 }, { "epoch": 0.18, "grad_norm": 7.658648490905762, "learning_rate": 1.30421171823881e-05, "loss": 0.8779, "step": 570 }, { "epoch": 0.18, "grad_norm": 13.248800277709961, "learning_rate": 1.303849821586604e-05, "loss": 1.454, "step": 571 }, { "epoch": 0.18, "grad_norm": 19.093027114868164, "learning_rate": 1.3034873573640416e-05, "loss": 5.9006, "step": 572 }, { "epoch": 0.18, "grad_norm": 15.373260498046875, "learning_rate": 1.3031243259146544e-05, "loss": 1.8576, "step": 573 }, { "epoch": 0.18, "grad_norm": 21.09121322631836, "learning_rate": 1.302760727582512e-05, "loss": 2.4911, "step": 574 }, { "epoch": 0.18, "grad_norm": 15.990294456481934, "learning_rate": 1.3023965627122207e-05, "loss": 1.9747, "step": 575 }, { "epoch": 0.18, "grad_norm": 13.601783752441406, "learning_rate": 1.302031831648924e-05, "loss": 2.3455, "step": 576 }, { "epoch": 0.18, "grad_norm": 16.258012771606445, "learning_rate": 1.3016665347383024e-05, "loss": 2.3826, "step": 577 }, { "epoch": 0.18, "grad_norm": 11.222940444946289, "learning_rate": 1.3013006723265727e-05, "loss": 1.3702, "step": 578 }, { "epoch": 0.18, "grad_norm": 11.493943214416504, "learning_rate": 1.3009342447604867e-05, "loss": 2.4295, "step": 579 }, { "epoch": 0.18, "grad_norm": 13.920111656188965, "learning_rate": 1.300567252387333e-05, "loss": 3.0531, "step": 580 }, { "epoch": 0.18, "grad_norm": 18.79691505432129, "learning_rate": 1.3001996955549348e-05, "loss": 5.7553, "step": 581 }, { "epoch": 0.18, "grad_norm": 9.230389595031738, "learning_rate": 1.2998315746116507e-05, "loss": 1.6072, "step": 582 }, { "epoch": 0.18, "grad_norm": 14.716042518615723, "learning_rate": 1.2994628899063736e-05, "loss": 2.5715, "step": 583 }, { "epoch": 0.18, "grad_norm": 17.834688186645508, "learning_rate": 1.2990936417885309e-05, "loss": 3.0267, "step": 584 }, { "epoch": 0.18, "grad_norm": 10.204316139221191, "learning_rate": 1.2987238306080838e-05, "loss": 2.471, "step": 585 }, { "epoch": 0.18, "grad_norm": 18.392969131469727, "learning_rate": 1.2983534567155274e-05, "loss": 1.9733, "step": 586 }, { "epoch": 0.18, "grad_norm": 16.151996612548828, "learning_rate": 1.2979825204618902e-05, "loss": 4.7029, "step": 587 }, { "epoch": 0.18, "grad_norm": 21.312026977539062, "learning_rate": 1.2976110221987332e-05, "loss": 3.0766, "step": 588 }, { "epoch": 0.18, "grad_norm": 26.476966857910156, "learning_rate": 1.2972389622781504e-05, "loss": 1.6432, "step": 589 }, { "epoch": 0.18, "grad_norm": 19.200429916381836, "learning_rate": 1.2968663410527683e-05, "loss": 2.3434, "step": 590 }, { "epoch": 0.18, "grad_norm": 10.3663330078125, "learning_rate": 1.296493158875745e-05, "loss": 1.3629, "step": 591 }, { "epoch": 0.18, "grad_norm": 10.774557113647461, "learning_rate": 1.2961194161007703e-05, "loss": 1.1199, "step": 592 }, { "epoch": 0.18, "grad_norm": 16.63807487487793, "learning_rate": 1.2957451130820656e-05, "loss": 3.0353, "step": 593 }, { "epoch": 0.18, "grad_norm": 9.413061141967773, "learning_rate": 1.295370250174383e-05, "loss": 1.0713, "step": 594 }, { "epoch": 0.18, "grad_norm": 15.048751831054688, "learning_rate": 1.2949948277330057e-05, "loss": 2.6175, "step": 595 }, { "epoch": 0.18, "grad_norm": 13.444994926452637, "learning_rate": 1.2946188461137461e-05, "loss": 1.7131, "step": 596 }, { "epoch": 0.18, "grad_norm": 14.747013092041016, "learning_rate": 1.2942423056729482e-05, "loss": 2.4692, "step": 597 }, { "epoch": 0.19, "grad_norm": 12.79983139038086, "learning_rate": 1.2938652067674841e-05, "loss": 1.1351, "step": 598 }, { "epoch": 0.19, "grad_norm": 17.529685974121094, "learning_rate": 1.2934875497547564e-05, "loss": 2.5644, "step": 599 }, { "epoch": 0.19, "grad_norm": 14.526769638061523, "learning_rate": 1.2931093349926957e-05, "loss": 1.9986, "step": 600 }, { "epoch": 0.19, "grad_norm": 30.357128143310547, "learning_rate": 1.2927305628397617e-05, "loss": 2.2509, "step": 601 }, { "epoch": 0.19, "grad_norm": 18.066158294677734, "learning_rate": 1.2923512336549426e-05, "loss": 5.8853, "step": 602 }, { "epoch": 0.19, "grad_norm": 11.021641731262207, "learning_rate": 1.2919713477977541e-05, "loss": 3.4449, "step": 603 }, { "epoch": 0.19, "grad_norm": 14.591251373291016, "learning_rate": 1.2915909056282392e-05, "loss": 2.6123, "step": 604 }, { "epoch": 0.19, "grad_norm": 7.648482322692871, "learning_rate": 1.2912099075069694e-05, "loss": 1.0796, "step": 605 }, { "epoch": 0.19, "grad_norm": 13.213525772094727, "learning_rate": 1.2908283537950416e-05, "loss": 0.7585, "step": 606 }, { "epoch": 0.19, "grad_norm": 12.706143379211426, "learning_rate": 1.2904462448540806e-05, "loss": 1.93, "step": 607 }, { "epoch": 0.19, "grad_norm": 10.988866806030273, "learning_rate": 1.2900635810462365e-05, "loss": 1.1424, "step": 608 }, { "epoch": 0.19, "grad_norm": 13.18970775604248, "learning_rate": 1.2896803627341854e-05, "loss": 2.6315, "step": 609 }, { "epoch": 0.19, "grad_norm": 14.60741138458252, "learning_rate": 1.2892965902811291e-05, "loss": 3.0535, "step": 610 }, { "epoch": 0.19, "grad_norm": 11.429036140441895, "learning_rate": 1.2889122640507951e-05, "loss": 1.5899, "step": 611 }, { "epoch": 0.19, "grad_norm": 10.819948196411133, "learning_rate": 1.2885273844074347e-05, "loss": 1.2347, "step": 612 }, { "epoch": 0.19, "grad_norm": 7.144796848297119, "learning_rate": 1.288141951715825e-05, "loss": 0.8606, "step": 613 }, { "epoch": 0.19, "grad_norm": 21.950687408447266, "learning_rate": 1.287755966341266e-05, "loss": 2.5471, "step": 614 }, { "epoch": 0.19, "grad_norm": 20.954431533813477, "learning_rate": 1.2873694286495825e-05, "loss": 5.1857, "step": 615 }, { "epoch": 0.19, "grad_norm": 11.520883560180664, "learning_rate": 1.2869823390071218e-05, "loss": 1.6997, "step": 616 }, { "epoch": 0.19, "grad_norm": 20.543258666992188, "learning_rate": 1.2865946977807552e-05, "loss": 5.732, "step": 617 }, { "epoch": 0.19, "grad_norm": 12.407218933105469, "learning_rate": 1.2862065053378762e-05, "loss": 1.45, "step": 618 }, { "epoch": 0.19, "grad_norm": 10.259573936462402, "learning_rate": 1.2858177620464013e-05, "loss": 2.0572, "step": 619 }, { "epoch": 0.19, "grad_norm": 16.575164794921875, "learning_rate": 1.2854284682747688e-05, "loss": 2.4111, "step": 620 }, { "epoch": 0.19, "grad_norm": 19.309242248535156, "learning_rate": 1.285038624391938e-05, "loss": 4.1687, "step": 621 }, { "epoch": 0.19, "grad_norm": 21.330078125, "learning_rate": 1.2846482307673907e-05, "loss": 3.9547, "step": 622 }, { "epoch": 0.19, "grad_norm": 10.402637481689453, "learning_rate": 1.2842572877711293e-05, "loss": 1.5042, "step": 623 }, { "epoch": 0.19, "grad_norm": 12.396795272827148, "learning_rate": 1.2838657957736767e-05, "loss": 2.5599, "step": 624 }, { "epoch": 0.19, "grad_norm": 16.713336944580078, "learning_rate": 1.2834737551460762e-05, "loss": 2.0968, "step": 625 }, { "epoch": 0.19, "grad_norm": 13.006550788879395, "learning_rate": 1.2830811662598913e-05, "loss": 2.6239, "step": 626 }, { "epoch": 0.19, "grad_norm": 16.9903621673584, "learning_rate": 1.282688029487205e-05, "loss": 4.884, "step": 627 }, { "epoch": 0.19, "grad_norm": 18.870378494262695, "learning_rate": 1.2822943452006195e-05, "loss": 5.6409, "step": 628 }, { "epoch": 0.19, "grad_norm": 11.85020637512207, "learning_rate": 1.2819001137732556e-05, "loss": 1.9056, "step": 629 }, { "epoch": 0.2, "grad_norm": 15.124698638916016, "learning_rate": 1.2815053355787535e-05, "loss": 1.8045, "step": 630 }, { "epoch": 0.2, "grad_norm": 18.318387985229492, "learning_rate": 1.2811100109912706e-05, "loss": 3.9206, "step": 631 }, { "epoch": 0.2, "grad_norm": 15.948100090026855, "learning_rate": 1.2807141403854832e-05, "loss": 2.8284, "step": 632 }, { "epoch": 0.2, "grad_norm": 21.467557907104492, "learning_rate": 1.2803177241365842e-05, "loss": 2.9303, "step": 633 }, { "epoch": 0.2, "grad_norm": 13.317900657653809, "learning_rate": 1.2799207626202844e-05, "loss": 1.7439, "step": 634 }, { "epoch": 0.2, "grad_norm": 22.261106491088867, "learning_rate": 1.2795232562128104e-05, "loss": 4.7986, "step": 635 }, { "epoch": 0.2, "grad_norm": 18.067625045776367, "learning_rate": 1.2791252052909065e-05, "loss": 3.3529, "step": 636 }, { "epoch": 0.2, "grad_norm": 20.194984436035156, "learning_rate": 1.278726610231832e-05, "loss": 6.0626, "step": 637 }, { "epoch": 0.2, "grad_norm": 17.769323348999023, "learning_rate": 1.278327471413362e-05, "loss": 3.8201, "step": 638 }, { "epoch": 0.2, "grad_norm": 14.183991432189941, "learning_rate": 1.2779277892137882e-05, "loss": 3.6506, "step": 639 }, { "epoch": 0.2, "grad_norm": 12.68132495880127, "learning_rate": 1.2775275640119154e-05, "loss": 5.0845, "step": 640 }, { "epoch": 0.2, "grad_norm": 16.511796951293945, "learning_rate": 1.2771267961870646e-05, "loss": 2.6582, "step": 641 }, { "epoch": 0.2, "grad_norm": 13.455010414123535, "learning_rate": 1.2767254861190701e-05, "loss": 1.3631, "step": 642 }, { "epoch": 0.2, "grad_norm": 12.576130867004395, "learning_rate": 1.2763236341882811e-05, "loss": 2.3939, "step": 643 }, { "epoch": 0.2, "grad_norm": 12.145406723022461, "learning_rate": 1.2759212407755592e-05, "loss": 2.4653, "step": 644 }, { "epoch": 0.2, "grad_norm": 13.685088157653809, "learning_rate": 1.2755183062622799e-05, "loss": 4.6326, "step": 645 }, { "epoch": 0.2, "grad_norm": 14.559864044189453, "learning_rate": 1.2751148310303317e-05, "loss": 4.432, "step": 646 }, { "epoch": 0.2, "grad_norm": 15.920525550842285, "learning_rate": 1.274710815462115e-05, "loss": 2.2933, "step": 647 }, { "epoch": 0.2, "grad_norm": 7.618566036224365, "learning_rate": 1.2743062599405426e-05, "loss": 0.9646, "step": 648 }, { "epoch": 0.2, "grad_norm": 11.278463363647461, "learning_rate": 1.273901164849039e-05, "loss": 1.205, "step": 649 }, { "epoch": 0.2, "grad_norm": 14.634114265441895, "learning_rate": 1.2734955305715405e-05, "loss": 4.1018, "step": 650 }, { "epoch": 0.2, "grad_norm": 12.614447593688965, "learning_rate": 1.2730893574924937e-05, "loss": 2.924, "step": 651 }, { "epoch": 0.2, "grad_norm": 12.723915100097656, "learning_rate": 1.2726826459968565e-05, "loss": 2.6999, "step": 652 }, { "epoch": 0.2, "grad_norm": 17.578689575195312, "learning_rate": 1.2722753964700963e-05, "loss": 1.9297, "step": 653 }, { "epoch": 0.2, "grad_norm": 9.462774276733398, "learning_rate": 1.2718676092981914e-05, "loss": 1.9175, "step": 654 }, { "epoch": 0.2, "grad_norm": 14.999417304992676, "learning_rate": 1.2714592848676291e-05, "loss": 2.4162, "step": 655 }, { "epoch": 0.2, "grad_norm": 15.338152885437012, "learning_rate": 1.2710504235654058e-05, "loss": 2.0275, "step": 656 }, { "epoch": 0.2, "grad_norm": 10.914753913879395, "learning_rate": 1.2706410257790273e-05, "loss": 2.2412, "step": 657 }, { "epoch": 0.2, "grad_norm": 10.793363571166992, "learning_rate": 1.2702310918965072e-05, "loss": 1.4223, "step": 658 }, { "epoch": 0.2, "grad_norm": 13.35788631439209, "learning_rate": 1.2698206223063677e-05, "loss": 1.6989, "step": 659 }, { "epoch": 0.2, "grad_norm": 9.177985191345215, "learning_rate": 1.2694096173976381e-05, "loss": 1.2592, "step": 660 }, { "epoch": 0.2, "grad_norm": 21.1748104095459, "learning_rate": 1.2689980775598558e-05, "loss": 6.016, "step": 661 }, { "epoch": 0.21, "grad_norm": 16.317060470581055, "learning_rate": 1.2685860031830648e-05, "loss": 1.6618, "step": 662 }, { "epoch": 0.21, "grad_norm": 10.94815444946289, "learning_rate": 1.268173394657816e-05, "loss": 1.2826, "step": 663 }, { "epoch": 0.21, "grad_norm": 19.04676055908203, "learning_rate": 1.267760252375166e-05, "loss": 4.4989, "step": 664 }, { "epoch": 0.21, "grad_norm": 12.131474494934082, "learning_rate": 1.2673465767266773e-05, "loss": 1.3328, "step": 665 }, { "epoch": 0.21, "grad_norm": 11.820852279663086, "learning_rate": 1.266932368104419e-05, "loss": 1.5155, "step": 666 }, { "epoch": 0.21, "grad_norm": 22.58245849609375, "learning_rate": 1.2665176269009639e-05, "loss": 1.598, "step": 667 }, { "epoch": 0.21, "grad_norm": 20.890731811523438, "learning_rate": 1.2661023535093905e-05, "loss": 3.1583, "step": 668 }, { "epoch": 0.21, "grad_norm": 16.39836311340332, "learning_rate": 1.2656865483232815e-05, "loss": 3.6485, "step": 669 }, { "epoch": 0.21, "grad_norm": 14.225503921508789, "learning_rate": 1.265270211736723e-05, "loss": 1.0935, "step": 670 }, { "epoch": 0.21, "grad_norm": 17.626684188842773, "learning_rate": 1.264853344144306e-05, "loss": 2.3145, "step": 671 }, { "epoch": 0.21, "grad_norm": 11.931720733642578, "learning_rate": 1.2644359459411233e-05, "loss": 1.8545, "step": 672 }, { "epoch": 0.21, "grad_norm": 13.659822463989258, "learning_rate": 1.2640180175227718e-05, "loss": 3.7829, "step": 673 }, { "epoch": 0.21, "grad_norm": 16.827177047729492, "learning_rate": 1.2635995592853505e-05, "loss": 2.8283, "step": 674 }, { "epoch": 0.21, "grad_norm": 9.98625659942627, "learning_rate": 1.26318057162546e-05, "loss": 0.7866, "step": 675 }, { "epoch": 0.21, "grad_norm": 11.108333587646484, "learning_rate": 1.2627610549402034e-05, "loss": 1.6244, "step": 676 }, { "epoch": 0.21, "grad_norm": 13.255348205566406, "learning_rate": 1.2623410096271851e-05, "loss": 2.0722, "step": 677 }, { "epoch": 0.21, "grad_norm": 11.850077629089355, "learning_rate": 1.2619204360845104e-05, "loss": 2.4558, "step": 678 }, { "epoch": 0.21, "grad_norm": 9.59620189666748, "learning_rate": 1.2614993347107847e-05, "loss": 1.6123, "step": 679 }, { "epoch": 0.21, "grad_norm": 19.261030197143555, "learning_rate": 1.2610777059051148e-05, "loss": 2.0041, "step": 680 }, { "epoch": 0.21, "grad_norm": 12.192977905273438, "learning_rate": 1.2606555500671064e-05, "loss": 1.6192, "step": 681 }, { "epoch": 0.21, "grad_norm": 16.09978675842285, "learning_rate": 1.2602328675968651e-05, "loss": 3.3949, "step": 682 }, { "epoch": 0.21, "grad_norm": 22.485755920410156, "learning_rate": 1.2598096588949958e-05, "loss": 3.2858, "step": 683 }, { "epoch": 0.21, "grad_norm": 19.057348251342773, "learning_rate": 1.2593859243626016e-05, "loss": 4.2207, "step": 684 }, { "epoch": 0.21, "grad_norm": 11.834187507629395, "learning_rate": 1.2589616644012844e-05, "loss": 1.1678, "step": 685 }, { "epoch": 0.21, "grad_norm": 8.711618423461914, "learning_rate": 1.258536879413144e-05, "loss": 0.8833, "step": 686 }, { "epoch": 0.21, "grad_norm": 12.831104278564453, "learning_rate": 1.258111569800778e-05, "loss": 1.3791, "step": 687 }, { "epoch": 0.21, "grad_norm": 11.20345687866211, "learning_rate": 1.2576857359672804e-05, "loss": 2.4059, "step": 688 }, { "epoch": 0.21, "grad_norm": 10.189892768859863, "learning_rate": 1.2572593783162433e-05, "loss": 1.1859, "step": 689 }, { "epoch": 0.21, "grad_norm": 10.442441940307617, "learning_rate": 1.256832497251754e-05, "loss": 1.0376, "step": 690 }, { "epoch": 0.21, "grad_norm": 15.368931770324707, "learning_rate": 1.256405093178397e-05, "loss": 1.7561, "step": 691 }, { "epoch": 0.21, "grad_norm": 13.209639549255371, "learning_rate": 1.2559771665012516e-05, "loss": 2.3928, "step": 692 }, { "epoch": 0.21, "grad_norm": 12.749165534973145, "learning_rate": 1.2555487176258932e-05, "loss": 4.5768, "step": 693 }, { "epoch": 0.22, "grad_norm": 15.11466121673584, "learning_rate": 1.2551197469583912e-05, "loss": 3.2755, "step": 694 }, { "epoch": 0.22, "grad_norm": 14.278471946716309, "learning_rate": 1.25469025490531e-05, "loss": 1.6433, "step": 695 }, { "epoch": 0.22, "grad_norm": 17.323156356811523, "learning_rate": 1.2542602418737088e-05, "loss": 2.4764, "step": 696 }, { "epoch": 0.22, "grad_norm": 9.389161109924316, "learning_rate": 1.2538297082711393e-05, "loss": 1.1465, "step": 697 }, { "epoch": 0.22, "grad_norm": 19.667522430419922, "learning_rate": 1.2533986545056475e-05, "loss": 3.4705, "step": 698 }, { "epoch": 0.22, "grad_norm": 11.19377326965332, "learning_rate": 1.2529670809857722e-05, "loss": 1.4053, "step": 699 }, { "epoch": 0.22, "grad_norm": 21.029172897338867, "learning_rate": 1.2525349881205445e-05, "loss": 1.9345, "step": 700 }, { "epoch": 0.22, "grad_norm": 10.366401672363281, "learning_rate": 1.2521023763194882e-05, "loss": 2.8612, "step": 701 }, { "epoch": 0.22, "grad_norm": 13.524468421936035, "learning_rate": 1.2516692459926185e-05, "loss": 4.4014, "step": 702 }, { "epoch": 0.22, "grad_norm": 12.896738052368164, "learning_rate": 1.2512355975504422e-05, "loss": 1.7042, "step": 703 }, { "epoch": 0.22, "grad_norm": 18.412193298339844, "learning_rate": 1.250801431403957e-05, "loss": 4.6858, "step": 704 }, { "epoch": 0.22, "grad_norm": 12.425787925720215, "learning_rate": 1.250366747964652e-05, "loss": 1.6511, "step": 705 }, { "epoch": 0.22, "grad_norm": 16.743928909301758, "learning_rate": 1.2499315476445055e-05, "loss": 1.4475, "step": 706 }, { "epoch": 0.22, "grad_norm": 10.760894775390625, "learning_rate": 1.2494958308559864e-05, "loss": 1.4689, "step": 707 }, { "epoch": 0.22, "grad_norm": 12.353875160217285, "learning_rate": 1.2490595980120529e-05, "loss": 1.8056, "step": 708 }, { "epoch": 0.22, "grad_norm": 17.23015022277832, "learning_rate": 1.2486228495261522e-05, "loss": 2.3585, "step": 709 }, { "epoch": 0.22, "grad_norm": 11.202420234680176, "learning_rate": 1.2481855858122201e-05, "loss": 2.073, "step": 710 }, { "epoch": 0.22, "grad_norm": 16.27634620666504, "learning_rate": 1.2477478072846814e-05, "loss": 2.0413, "step": 711 }, { "epoch": 0.22, "grad_norm": 7.528199672698975, "learning_rate": 1.2473095143584479e-05, "loss": 0.8323, "step": 712 }, { "epoch": 0.22, "grad_norm": 13.777722358703613, "learning_rate": 1.2468707074489194e-05, "loss": 1.9507, "step": 713 }, { "epoch": 0.22, "grad_norm": 10.845873832702637, "learning_rate": 1.2464313869719831e-05, "loss": 2.0242, "step": 714 }, { "epoch": 0.22, "grad_norm": 18.222116470336914, "learning_rate": 1.2459915533440124e-05, "loss": 1.6319, "step": 715 }, { "epoch": 0.22, "grad_norm": 11.19504451751709, "learning_rate": 1.2455512069818672e-05, "loss": 1.1383, "step": 716 }, { "epoch": 0.22, "grad_norm": 12.988763809204102, "learning_rate": 1.2451103483028936e-05, "loss": 1.3111, "step": 717 }, { "epoch": 0.22, "grad_norm": 17.553518295288086, "learning_rate": 1.2446689777249232e-05, "loss": 2.2308, "step": 718 }, { "epoch": 0.22, "grad_norm": 20.2987003326416, "learning_rate": 1.2442270956662722e-05, "loss": 2.2082, "step": 719 }, { "epoch": 0.22, "grad_norm": 12.516862869262695, "learning_rate": 1.2437847025457428e-05, "loss": 1.6221, "step": 720 }, { "epoch": 0.22, "grad_norm": 21.578792572021484, "learning_rate": 1.24334179878262e-05, "loss": 6.6848, "step": 721 }, { "epoch": 0.22, "grad_norm": 17.269962310791016, "learning_rate": 1.2428983847966741e-05, "loss": 2.9452, "step": 722 }, { "epoch": 0.22, "grad_norm": 24.642614364624023, "learning_rate": 1.242454461008158e-05, "loss": 4.4548, "step": 723 }, { "epoch": 0.22, "grad_norm": 12.710604667663574, "learning_rate": 1.2420100278378088e-05, "loss": 1.6188, "step": 724 }, { "epoch": 0.22, "grad_norm": 12.620211601257324, "learning_rate": 1.241565085706845e-05, "loss": 1.69, "step": 725 }, { "epoch": 0.22, "grad_norm": 18.085607528686523, "learning_rate": 1.241119635036969e-05, "loss": 3.7645, "step": 726 }, { "epoch": 0.23, "grad_norm": 12.195068359375, "learning_rate": 1.240673676250364e-05, "loss": 2.1039, "step": 727 }, { "epoch": 0.23, "grad_norm": 19.199190139770508, "learning_rate": 1.2402272097696953e-05, "loss": 3.1963, "step": 728 }, { "epoch": 0.23, "grad_norm": 16.378055572509766, "learning_rate": 1.2397802360181093e-05, "loss": 1.9512, "step": 729 }, { "epoch": 0.23, "grad_norm": 11.149024963378906, "learning_rate": 1.2393327554192333e-05, "loss": 1.2123, "step": 730 }, { "epoch": 0.23, "grad_norm": 9.235203742980957, "learning_rate": 1.2388847683971747e-05, "loss": 1.1037, "step": 731 }, { "epoch": 0.23, "grad_norm": 9.219218254089355, "learning_rate": 1.238436275376521e-05, "loss": 1.3665, "step": 732 }, { "epoch": 0.23, "grad_norm": 22.13263702392578, "learning_rate": 1.2379872767823393e-05, "loss": 3.0206, "step": 733 }, { "epoch": 0.23, "grad_norm": 13.911103248596191, "learning_rate": 1.237537773040176e-05, "loss": 2.5949, "step": 734 }, { "epoch": 0.23, "grad_norm": 15.790095329284668, "learning_rate": 1.2370877645760557e-05, "loss": 3.5434, "step": 735 }, { "epoch": 0.23, "grad_norm": 18.366098403930664, "learning_rate": 1.2366372518164822e-05, "loss": 6.0189, "step": 736 }, { "epoch": 0.23, "grad_norm": 13.551661491394043, "learning_rate": 1.2361862351884366e-05, "loss": 1.5336, "step": 737 }, { "epoch": 0.23, "grad_norm": 13.93724250793457, "learning_rate": 1.2357347151193778e-05, "loss": 1.6109, "step": 738 }, { "epoch": 0.23, "grad_norm": 11.013720512390137, "learning_rate": 1.2352826920372418e-05, "loss": 1.5895, "step": 739 }, { "epoch": 0.23, "grad_norm": 10.229809761047363, "learning_rate": 1.2348301663704419e-05, "loss": 1.3494, "step": 740 }, { "epoch": 0.23, "grad_norm": 16.16636085510254, "learning_rate": 1.2343771385478661e-05, "loss": 2.0953, "step": 741 }, { "epoch": 0.23, "grad_norm": 16.416259765625, "learning_rate": 1.2339236089988801e-05, "loss": 3.5324, "step": 742 }, { "epoch": 0.23, "grad_norm": 20.941978454589844, "learning_rate": 1.2334695781533243e-05, "loss": 4.7186, "step": 743 }, { "epoch": 0.23, "grad_norm": 12.121553421020508, "learning_rate": 1.2330150464415144e-05, "loss": 1.2178, "step": 744 }, { "epoch": 0.23, "grad_norm": 12.87922477722168, "learning_rate": 1.2325600142942406e-05, "loss": 1.7759, "step": 745 }, { "epoch": 0.23, "grad_norm": 20.307462692260742, "learning_rate": 1.2321044821427681e-05, "loss": 3.7992, "step": 746 }, { "epoch": 0.23, "grad_norm": 15.959997177124023, "learning_rate": 1.2316484504188348e-05, "loss": 1.5344, "step": 747 }, { "epoch": 0.23, "grad_norm": 16.901805877685547, "learning_rate": 1.231191919554653e-05, "loss": 2.1397, "step": 748 }, { "epoch": 0.23, "grad_norm": 28.746898651123047, "learning_rate": 1.2307348899829073e-05, "loss": 2.3627, "step": 749 }, { "epoch": 0.23, "grad_norm": 18.00472640991211, "learning_rate": 1.2302773621367562e-05, "loss": 2.5594, "step": 750 }, { "epoch": 0.23, "grad_norm": 11.613627433776855, "learning_rate": 1.2298193364498294e-05, "loss": 2.1991, "step": 751 }, { "epoch": 0.23, "grad_norm": 15.26014518737793, "learning_rate": 1.2293608133562285e-05, "loss": 4.995, "step": 752 }, { "epoch": 0.23, "grad_norm": 11.997806549072266, "learning_rate": 1.2289017932905271e-05, "loss": 2.2007, "step": 753 }, { "epoch": 0.23, "grad_norm": 13.420331954956055, "learning_rate": 1.2284422766877692e-05, "loss": 2.6394, "step": 754 }, { "epoch": 0.23, "grad_norm": 14.628358840942383, "learning_rate": 1.2279822639834698e-05, "loss": 4.2071, "step": 755 }, { "epoch": 0.23, "grad_norm": 14.1344575881958, "learning_rate": 1.2275217556136138e-05, "loss": 1.6072, "step": 756 }, { "epoch": 0.23, "grad_norm": 15.807295799255371, "learning_rate": 1.2270607520146563e-05, "loss": 6.007, "step": 757 }, { "epoch": 0.23, "grad_norm": 6.969516754150391, "learning_rate": 1.2265992536235213e-05, "loss": 0.7291, "step": 758 }, { "epoch": 0.24, "grad_norm": 16.10032081604004, "learning_rate": 1.2261372608776021e-05, "loss": 3.4565, "step": 759 }, { "epoch": 0.24, "grad_norm": 11.56027889251709, "learning_rate": 1.2256747742147601e-05, "loss": 1.2174, "step": 760 }, { "epoch": 0.24, "grad_norm": 10.300409317016602, "learning_rate": 1.2252117940733255e-05, "loss": 1.1443, "step": 761 }, { "epoch": 0.24, "grad_norm": 13.430366516113281, "learning_rate": 1.2247483208920955e-05, "loss": 1.8809, "step": 762 }, { "epoch": 0.24, "grad_norm": 8.15425968170166, "learning_rate": 1.2242843551103349e-05, "loss": 0.8325, "step": 763 }, { "epoch": 0.24, "grad_norm": 13.97874641418457, "learning_rate": 1.2238198971677754e-05, "loss": 6.974, "step": 764 }, { "epoch": 0.24, "grad_norm": 8.714863777160645, "learning_rate": 1.2233549475046153e-05, "loss": 0.8588, "step": 765 }, { "epoch": 0.24, "grad_norm": 11.327173233032227, "learning_rate": 1.2228895065615184e-05, "loss": 2.2164, "step": 766 }, { "epoch": 0.24, "grad_norm": 14.299591064453125, "learning_rate": 1.2224235747796147e-05, "loss": 2.6141, "step": 767 }, { "epoch": 0.24, "grad_norm": 16.821212768554688, "learning_rate": 1.221957152600499e-05, "loss": 2.2579, "step": 768 }, { "epoch": 0.24, "grad_norm": 10.770112991333008, "learning_rate": 1.2214902404662314e-05, "loss": 2.2605, "step": 769 }, { "epoch": 0.24, "grad_norm": 15.763861656188965, "learning_rate": 1.2210228388193355e-05, "loss": 2.2584, "step": 770 }, { "epoch": 0.24, "grad_norm": 10.93879222869873, "learning_rate": 1.2205549481027998e-05, "loss": 1.5642, "step": 771 }, { "epoch": 0.24, "grad_norm": 10.689042091369629, "learning_rate": 1.2200865687600757e-05, "loss": 1.1944, "step": 772 }, { "epoch": 0.24, "grad_norm": 19.434879302978516, "learning_rate": 1.219617701235078e-05, "loss": 4.4623, "step": 773 }, { "epoch": 0.24, "grad_norm": 10.151241302490234, "learning_rate": 1.2191483459721837e-05, "loss": 1.1613, "step": 774 }, { "epoch": 0.24, "grad_norm": 12.30809211730957, "learning_rate": 1.2186785034162328e-05, "loss": 1.4964, "step": 775 }, { "epoch": 0.24, "grad_norm": 20.536462783813477, "learning_rate": 1.2182081740125266e-05, "loss": 7.906, "step": 776 }, { "epoch": 0.24, "grad_norm": 17.299827575683594, "learning_rate": 1.2177373582068284e-05, "loss": 3.6637, "step": 777 }, { "epoch": 0.24, "grad_norm": 16.93096923828125, "learning_rate": 1.2172660564453615e-05, "loss": 2.4282, "step": 778 }, { "epoch": 0.24, "grad_norm": 11.090471267700195, "learning_rate": 1.2167942691748105e-05, "loss": 1.3211, "step": 779 }, { "epoch": 0.24, "grad_norm": 13.850017547607422, "learning_rate": 1.2163219968423203e-05, "loss": 2.5532, "step": 780 }, { "epoch": 0.24, "grad_norm": 18.67405891418457, "learning_rate": 1.215849239895495e-05, "loss": 2.0105, "step": 781 }, { "epoch": 0.24, "grad_norm": 10.775568008422852, "learning_rate": 1.2153759987823984e-05, "loss": 0.8751, "step": 782 }, { "epoch": 0.24, "grad_norm": 19.743520736694336, "learning_rate": 1.2149022739515528e-05, "loss": 1.1554, "step": 783 }, { "epoch": 0.24, "grad_norm": 14.606451034545898, "learning_rate": 1.2144280658519392e-05, "loss": 1.904, "step": 784 }, { "epoch": 0.24, "grad_norm": 12.49717903137207, "learning_rate": 1.2139533749329968e-05, "loss": 1.2586, "step": 785 }, { "epoch": 0.24, "grad_norm": 14.731224060058594, "learning_rate": 1.2134782016446219e-05, "loss": 2.3142, "step": 786 }, { "epoch": 0.24, "grad_norm": 10.49559211730957, "learning_rate": 1.2130025464371686e-05, "loss": 1.1727, "step": 787 }, { "epoch": 0.24, "grad_norm": 13.33672046661377, "learning_rate": 1.2125264097614471e-05, "loss": 1.6586, "step": 788 }, { "epoch": 0.24, "grad_norm": 12.418742179870605, "learning_rate": 1.2120497920687241e-05, "loss": 1.5077, "step": 789 }, { "epoch": 0.24, "grad_norm": 15.242766380310059, "learning_rate": 1.2115726938107232e-05, "loss": 2.5938, "step": 790 }, { "epoch": 0.25, "grad_norm": 14.032227516174316, "learning_rate": 1.2110951154396216e-05, "loss": 1.9316, "step": 791 }, { "epoch": 0.25, "grad_norm": 14.62277603149414, "learning_rate": 1.210617057408053e-05, "loss": 2.3232, "step": 792 }, { "epoch": 0.25, "grad_norm": 16.518356323242188, "learning_rate": 1.210138520169105e-05, "loss": 2.179, "step": 793 }, { "epoch": 0.25, "grad_norm": 14.969557762145996, "learning_rate": 1.20965950417632e-05, "loss": 1.8217, "step": 794 }, { "epoch": 0.25, "grad_norm": 13.493850708007812, "learning_rate": 1.2091800098836934e-05, "loss": 4.8096, "step": 795 }, { "epoch": 0.25, "grad_norm": 12.271809577941895, "learning_rate": 1.2087000377456743e-05, "loss": 1.2367, "step": 796 }, { "epoch": 0.25, "grad_norm": 16.41793441772461, "learning_rate": 1.2082195882171647e-05, "loss": 5.0848, "step": 797 }, { "epoch": 0.25, "grad_norm": 10.58703327178955, "learning_rate": 1.2077386617535193e-05, "loss": 1.7298, "step": 798 }, { "epoch": 0.25, "grad_norm": 16.00687026977539, "learning_rate": 1.2072572588105442e-05, "loss": 2.0278, "step": 799 }, { "epoch": 0.25, "grad_norm": 12.369733810424805, "learning_rate": 1.2067753798444974e-05, "loss": 1.6132, "step": 800 }, { "epoch": 0.25, "grad_norm": 9.346023559570312, "learning_rate": 1.2062930253120885e-05, "loss": 1.0617, "step": 801 }, { "epoch": 0.25, "grad_norm": 16.150537490844727, "learning_rate": 1.2058101956704766e-05, "loss": 3.207, "step": 802 }, { "epoch": 0.25, "grad_norm": 16.184139251708984, "learning_rate": 1.205326891377273e-05, "loss": 2.0503, "step": 803 }, { "epoch": 0.25, "grad_norm": 14.664175987243652, "learning_rate": 1.204843112890537e-05, "loss": 1.9163, "step": 804 }, { "epoch": 0.25, "grad_norm": 14.745572090148926, "learning_rate": 1.204358860668778e-05, "loss": 1.4228, "step": 805 }, { "epoch": 0.25, "grad_norm": 13.390905380249023, "learning_rate": 1.2038741351709551e-05, "loss": 1.3138, "step": 806 }, { "epoch": 0.25, "grad_norm": 14.42654800415039, "learning_rate": 1.203388936856475e-05, "loss": 1.5961, "step": 807 }, { "epoch": 0.25, "grad_norm": 15.617491722106934, "learning_rate": 1.2029032661851928e-05, "loss": 1.5742, "step": 808 }, { "epoch": 0.25, "grad_norm": 15.204427719116211, "learning_rate": 1.2024171236174117e-05, "loss": 4.9142, "step": 809 }, { "epoch": 0.25, "grad_norm": 12.922470092773438, "learning_rate": 1.2019305096138813e-05, "loss": 3.9775, "step": 810 }, { "epoch": 0.25, "grad_norm": 12.659815788269043, "learning_rate": 1.2014434246357985e-05, "loss": 2.4097, "step": 811 }, { "epoch": 0.25, "grad_norm": 13.354887008666992, "learning_rate": 1.2009558691448071e-05, "loss": 1.5156, "step": 812 }, { "epoch": 0.25, "grad_norm": 12.20242691040039, "learning_rate": 1.200467843602996e-05, "loss": 4.2651, "step": 813 }, { "epoch": 0.25, "grad_norm": 13.313082695007324, "learning_rate": 1.1999793484729e-05, "loss": 2.9641, "step": 814 }, { "epoch": 0.25, "grad_norm": 16.646648406982422, "learning_rate": 1.1994903842174985e-05, "loss": 4.5349, "step": 815 }, { "epoch": 0.25, "grad_norm": 15.043517112731934, "learning_rate": 1.1990009513002166e-05, "loss": 3.5675, "step": 816 }, { "epoch": 0.25, "grad_norm": 14.295183181762695, "learning_rate": 1.198511050184922e-05, "loss": 1.8846, "step": 817 }, { "epoch": 0.25, "grad_norm": 10.43750286102295, "learning_rate": 1.198020681335928e-05, "loss": 1.2817, "step": 818 }, { "epoch": 0.25, "grad_norm": 15.210440635681152, "learning_rate": 1.1975298452179894e-05, "loss": 3.1719, "step": 819 }, { "epoch": 0.25, "grad_norm": 19.98702621459961, "learning_rate": 1.1970385422963052e-05, "loss": 1.5404, "step": 820 }, { "epoch": 0.25, "grad_norm": 14.712196350097656, "learning_rate": 1.1965467730365157e-05, "loss": 6.1263, "step": 821 }, { "epoch": 0.25, "grad_norm": 16.687849044799805, "learning_rate": 1.1960545379047047e-05, "loss": 2.2848, "step": 822 }, { "epoch": 0.25, "grad_norm": 16.070829391479492, "learning_rate": 1.1955618373673957e-05, "loss": 1.9679, "step": 823 }, { "epoch": 0.26, "grad_norm": 14.484185218811035, "learning_rate": 1.1950686718915552e-05, "loss": 2.8716, "step": 824 }, { "epoch": 0.26, "grad_norm": 11.286521911621094, "learning_rate": 1.1945750419445884e-05, "loss": 2.4105, "step": 825 }, { "epoch": 0.26, "grad_norm": 11.694199562072754, "learning_rate": 1.1940809479943422e-05, "loss": 0.9597, "step": 826 }, { "epoch": 0.26, "grad_norm": 21.48593521118164, "learning_rate": 1.1935863905091026e-05, "loss": 5.8054, "step": 827 }, { "epoch": 0.26, "grad_norm": 13.373125076293945, "learning_rate": 1.1930913699575955e-05, "loss": 3.0395, "step": 828 }, { "epoch": 0.26, "grad_norm": 12.502968788146973, "learning_rate": 1.1925958868089847e-05, "loss": 1.4822, "step": 829 }, { "epoch": 0.26, "grad_norm": 14.944729804992676, "learning_rate": 1.1920999415328728e-05, "loss": 1.9248, "step": 830 }, { "epoch": 0.26, "grad_norm": 13.607522964477539, "learning_rate": 1.1916035345993015e-05, "loss": 3.673, "step": 831 }, { "epoch": 0.26, "grad_norm": 11.033047676086426, "learning_rate": 1.1911066664787484e-05, "loss": 1.2412, "step": 832 }, { "epoch": 0.26, "grad_norm": 13.628982543945312, "learning_rate": 1.1906093376421291e-05, "loss": 1.7389, "step": 833 }, { "epoch": 0.26, "grad_norm": 20.70766258239746, "learning_rate": 1.1901115485607957e-05, "loss": 3.0788, "step": 834 }, { "epoch": 0.26, "grad_norm": 10.678114891052246, "learning_rate": 1.1896132997065364e-05, "loss": 1.2587, "step": 835 }, { "epoch": 0.26, "grad_norm": 13.934436798095703, "learning_rate": 1.1891145915515754e-05, "loss": 2.8138, "step": 836 }, { "epoch": 0.26, "grad_norm": 15.426923751831055, "learning_rate": 1.188615424568572e-05, "loss": 1.9037, "step": 837 }, { "epoch": 0.26, "grad_norm": 11.262444496154785, "learning_rate": 1.1881157992306204e-05, "loss": 1.4626, "step": 838 }, { "epoch": 0.26, "grad_norm": 19.111431121826172, "learning_rate": 1.187615716011249e-05, "loss": 4.544, "step": 839 }, { "epoch": 0.26, "grad_norm": 15.529975891113281, "learning_rate": 1.1871151753844207e-05, "loss": 3.2797, "step": 840 }, { "epoch": 0.26, "grad_norm": 18.435951232910156, "learning_rate": 1.1866141778245314e-05, "loss": 2.6301, "step": 841 }, { "epoch": 0.26, "grad_norm": 16.66749382019043, "learning_rate": 1.1861127238064103e-05, "loss": 2.8497, "step": 842 }, { "epoch": 0.26, "grad_norm": 18.804611206054688, "learning_rate": 1.1856108138053194e-05, "loss": 5.6834, "step": 843 }, { "epoch": 0.26, "grad_norm": 13.824563026428223, "learning_rate": 1.1851084482969523e-05, "loss": 5.7809, "step": 844 }, { "epoch": 0.26, "grad_norm": 14.626198768615723, "learning_rate": 1.1846056277574352e-05, "loss": 3.5987, "step": 845 }, { "epoch": 0.26, "grad_norm": 12.569183349609375, "learning_rate": 1.1841023526633243e-05, "loss": 1.5252, "step": 846 }, { "epoch": 0.26, "grad_norm": 17.080930709838867, "learning_rate": 1.1835986234916079e-05, "loss": 4.205, "step": 847 }, { "epoch": 0.26, "grad_norm": 18.488704681396484, "learning_rate": 1.183094440719704e-05, "loss": 2.2896, "step": 848 }, { "epoch": 0.26, "grad_norm": 10.405138969421387, "learning_rate": 1.1825898048254604e-05, "loss": 2.4845, "step": 849 }, { "epoch": 0.26, "grad_norm": 13.465655326843262, "learning_rate": 1.1820847162871548e-05, "loss": 2.3561, "step": 850 }, { "epoch": 0.26, "grad_norm": 12.562773704528809, "learning_rate": 1.181579175583493e-05, "loss": 2.2778, "step": 851 }, { "epoch": 0.26, "grad_norm": 16.159759521484375, "learning_rate": 1.1810731831936115e-05, "loss": 4.7586, "step": 852 }, { "epoch": 0.26, "grad_norm": 14.529434204101562, "learning_rate": 1.1805667395970719e-05, "loss": 1.5514, "step": 853 }, { "epoch": 0.26, "grad_norm": 16.40343475341797, "learning_rate": 1.1800598452738656e-05, "loss": 2.4718, "step": 854 }, { "epoch": 0.26, "grad_norm": 9.03700065612793, "learning_rate": 1.1795525007044106e-05, "loss": 0.8431, "step": 855 }, { "epoch": 0.27, "grad_norm": 20.445934295654297, "learning_rate": 1.1790447063695516e-05, "loss": 1.8509, "step": 856 }, { "epoch": 0.27, "grad_norm": 15.171225547790527, "learning_rate": 1.178536462750559e-05, "loss": 1.9565, "step": 857 }, { "epoch": 0.27, "grad_norm": 16.317176818847656, "learning_rate": 1.1780277703291302e-05, "loss": 2.3285, "step": 858 }, { "epoch": 0.27, "grad_norm": 14.370471954345703, "learning_rate": 1.1775186295873869e-05, "loss": 1.3541, "step": 859 }, { "epoch": 0.27, "grad_norm": 16.811317443847656, "learning_rate": 1.177009041007876e-05, "loss": 2.7627, "step": 860 }, { "epoch": 0.27, "grad_norm": 14.549324035644531, "learning_rate": 1.1764990050735694e-05, "loss": 2.925, "step": 861 }, { "epoch": 0.27, "grad_norm": 15.653358459472656, "learning_rate": 1.1759885222678624e-05, "loss": 1.5885, "step": 862 }, { "epoch": 0.27, "grad_norm": 12.484577178955078, "learning_rate": 1.1754775930745738e-05, "loss": 4.57, "step": 863 }, { "epoch": 0.27, "grad_norm": 10.131400108337402, "learning_rate": 1.1749662179779458e-05, "loss": 2.021, "step": 864 }, { "epoch": 0.27, "grad_norm": 13.61589241027832, "learning_rate": 1.1744543974626428e-05, "loss": 4.4905, "step": 865 }, { "epoch": 0.27, "grad_norm": 24.758455276489258, "learning_rate": 1.1739421320137521e-05, "loss": 2.7249, "step": 866 }, { "epoch": 0.27, "grad_norm": 14.530740737915039, "learning_rate": 1.173429422116782e-05, "loss": 1.3571, "step": 867 }, { "epoch": 0.27, "grad_norm": 22.10980796813965, "learning_rate": 1.172916268257662e-05, "loss": 6.113, "step": 868 }, { "epoch": 0.27, "grad_norm": 9.13565444946289, "learning_rate": 1.1724026709227428e-05, "loss": 2.1237, "step": 869 }, { "epoch": 0.27, "grad_norm": 13.099780082702637, "learning_rate": 1.1718886305987952e-05, "loss": 1.5074, "step": 870 }, { "epoch": 0.27, "grad_norm": 12.233208656311035, "learning_rate": 1.17137414777301e-05, "loss": 1.646, "step": 871 }, { "epoch": 0.27, "grad_norm": 13.301345825195312, "learning_rate": 1.170859222932997e-05, "loss": 0.8234, "step": 872 }, { "epoch": 0.27, "grad_norm": 15.48371696472168, "learning_rate": 1.1703438565667858e-05, "loss": 2.4947, "step": 873 }, { "epoch": 0.27, "grad_norm": 14.66010856628418, "learning_rate": 1.1698280491628229e-05, "loss": 4.3986, "step": 874 }, { "epoch": 0.27, "grad_norm": 16.98670768737793, "learning_rate": 1.1693118012099745e-05, "loss": 3.4212, "step": 875 }, { "epoch": 0.27, "grad_norm": 8.71246337890625, "learning_rate": 1.1687951131975234e-05, "loss": 0.7751, "step": 876 }, { "epoch": 0.27, "grad_norm": 9.071887016296387, "learning_rate": 1.1682779856151694e-05, "loss": 1.2658, "step": 877 }, { "epoch": 0.27, "grad_norm": 9.613807678222656, "learning_rate": 1.1677604189530297e-05, "loss": 2.1093, "step": 878 }, { "epoch": 0.27, "grad_norm": 18.976409912109375, "learning_rate": 1.1672424137016369e-05, "loss": 2.9181, "step": 879 }, { "epoch": 0.27, "grad_norm": 17.42270851135254, "learning_rate": 1.1667239703519393e-05, "loss": 3.4419, "step": 880 }, { "epoch": 0.27, "grad_norm": 18.03681755065918, "learning_rate": 1.1662050893953007e-05, "loss": 3.1427, "step": 881 }, { "epoch": 0.27, "grad_norm": 8.601777076721191, "learning_rate": 1.1656857713234999e-05, "loss": 0.9373, "step": 882 }, { "epoch": 0.27, "grad_norm": 14.285597801208496, "learning_rate": 1.165166016628729e-05, "loss": 1.3633, "step": 883 }, { "epoch": 0.27, "grad_norm": 12.230426788330078, "learning_rate": 1.1646458258035953e-05, "loss": 1.0753, "step": 884 }, { "epoch": 0.27, "grad_norm": 18.180387496948242, "learning_rate": 1.1641251993411182e-05, "loss": 2.7476, "step": 885 }, { "epoch": 0.27, "grad_norm": 20.50904655456543, "learning_rate": 1.1636041377347309e-05, "loss": 3.8631, "step": 886 }, { "epoch": 0.27, "grad_norm": 13.58751106262207, "learning_rate": 1.1630826414782781e-05, "loss": 2.4704, "step": 887 }, { "epoch": 0.28, "grad_norm": 13.96949291229248, "learning_rate": 1.1625607110660176e-05, "loss": 2.2461, "step": 888 }, { "epoch": 0.28, "grad_norm": 15.626188278198242, "learning_rate": 1.1620383469926176e-05, "loss": 2.2847, "step": 889 }, { "epoch": 0.28, "grad_norm": 17.77653694152832, "learning_rate": 1.161515549753158e-05, "loss": 4.4198, "step": 890 }, { "epoch": 0.28, "grad_norm": 10.263466835021973, "learning_rate": 1.1609923198431286e-05, "loss": 0.9664, "step": 891 }, { "epoch": 0.28, "grad_norm": 20.309823989868164, "learning_rate": 1.1604686577584302e-05, "loss": 2.1832, "step": 892 }, { "epoch": 0.28, "grad_norm": 14.75548267364502, "learning_rate": 1.1599445639953723e-05, "loss": 2.443, "step": 893 }, { "epoch": 0.28, "grad_norm": 10.767431259155273, "learning_rate": 1.159420039050674e-05, "loss": 1.0003, "step": 894 }, { "epoch": 0.28, "grad_norm": 14.956649780273438, "learning_rate": 1.158895083421463e-05, "loss": 3.4096, "step": 895 }, { "epoch": 0.28, "grad_norm": 14.170507431030273, "learning_rate": 1.1583696976052751e-05, "loss": 1.2359, "step": 896 }, { "epoch": 0.28, "grad_norm": 21.288522720336914, "learning_rate": 1.157843882100054e-05, "loss": 6.4301, "step": 897 }, { "epoch": 0.28, "grad_norm": 16.635541915893555, "learning_rate": 1.1573176374041503e-05, "loss": 3.6166, "step": 898 }, { "epoch": 0.28, "grad_norm": 12.411420822143555, "learning_rate": 1.1567909640163217e-05, "loss": 2.014, "step": 899 }, { "epoch": 0.28, "grad_norm": 9.194472312927246, "learning_rate": 1.1562638624357319e-05, "loss": 1.1898, "step": 900 }, { "epoch": 0.28, "grad_norm": 17.868114471435547, "learning_rate": 1.1557363331619512e-05, "loss": 2.3214, "step": 901 }, { "epoch": 0.28, "grad_norm": 14.548218727111816, "learning_rate": 1.1552083766949537e-05, "loss": 1.8704, "step": 902 }, { "epoch": 0.28, "grad_norm": 16.413217544555664, "learning_rate": 1.1546799935351202e-05, "loss": 1.5204, "step": 903 }, { "epoch": 0.28, "grad_norm": 17.646013259887695, "learning_rate": 1.1541511841832348e-05, "loss": 2.3315, "step": 904 }, { "epoch": 0.28, "grad_norm": 15.216915130615234, "learning_rate": 1.1536219491404855e-05, "loss": 2.9786, "step": 905 }, { "epoch": 0.28, "grad_norm": 14.426615715026855, "learning_rate": 1.1530922889084643e-05, "loss": 2.4699, "step": 906 }, { "epoch": 0.28, "grad_norm": 16.518457412719727, "learning_rate": 1.1525622039891659e-05, "loss": 5.3688, "step": 907 }, { "epoch": 0.28, "grad_norm": 16.908069610595703, "learning_rate": 1.1520316948849874e-05, "loss": 6.7208, "step": 908 }, { "epoch": 0.28, "grad_norm": 14.2049560546875, "learning_rate": 1.1515007620987282e-05, "loss": 1.9017, "step": 909 }, { "epoch": 0.28, "grad_norm": 16.437877655029297, "learning_rate": 1.1509694061335893e-05, "loss": 1.4594, "step": 910 }, { "epoch": 0.28, "grad_norm": 11.180994987487793, "learning_rate": 1.1504376274931718e-05, "loss": 2.1785, "step": 911 }, { "epoch": 0.28, "grad_norm": 12.00964641571045, "learning_rate": 1.149905426681479e-05, "loss": 1.1223, "step": 912 }, { "epoch": 0.28, "grad_norm": 20.051992416381836, "learning_rate": 1.1493728042029132e-05, "loss": 2.0832, "step": 913 }, { "epoch": 0.28, "grad_norm": 7.1900129318237305, "learning_rate": 1.1488397605622767e-05, "loss": 0.883, "step": 914 }, { "epoch": 0.28, "grad_norm": 12.989784240722656, "learning_rate": 1.1483062962647708e-05, "loss": 5.7096, "step": 915 }, { "epoch": 0.28, "grad_norm": 15.41681957244873, "learning_rate": 1.1477724118159955e-05, "loss": 1.1806, "step": 916 }, { "epoch": 0.28, "grad_norm": 13.027620315551758, "learning_rate": 1.1472381077219493e-05, "loss": 2.3941, "step": 917 }, { "epoch": 0.28, "grad_norm": 10.86893367767334, "learning_rate": 1.1467033844890278e-05, "loss": 1.9049, "step": 918 }, { "epoch": 0.28, "grad_norm": 17.04452896118164, "learning_rate": 1.146168242624025e-05, "loss": 3.0468, "step": 919 }, { "epoch": 0.29, "grad_norm": 15.610978126525879, "learning_rate": 1.1456326826341303e-05, "loss": 2.4455, "step": 920 }, { "epoch": 0.29, "grad_norm": 17.980670928955078, "learning_rate": 1.1450967050269304e-05, "loss": 2.5141, "step": 921 }, { "epoch": 0.29, "grad_norm": 11.66197681427002, "learning_rate": 1.1445603103104072e-05, "loss": 1.2295, "step": 922 }, { "epoch": 0.29, "grad_norm": 9.646084785461426, "learning_rate": 1.1440234989929381e-05, "loss": 1.0995, "step": 923 }, { "epoch": 0.29, "grad_norm": 11.297891616821289, "learning_rate": 1.1434862715832955e-05, "loss": 1.1596, "step": 924 }, { "epoch": 0.29, "grad_norm": 16.568077087402344, "learning_rate": 1.1429486285906462e-05, "loss": 5.4638, "step": 925 }, { "epoch": 0.29, "grad_norm": 14.074769973754883, "learning_rate": 1.1424105705245503e-05, "loss": 3.3299, "step": 926 }, { "epoch": 0.29, "grad_norm": 11.440634727478027, "learning_rate": 1.1418720978949622e-05, "loss": 2.032, "step": 927 }, { "epoch": 0.29, "grad_norm": 15.466567993164062, "learning_rate": 1.1413332112122284e-05, "loss": 1.7181, "step": 928 }, { "epoch": 0.29, "grad_norm": 10.589716911315918, "learning_rate": 1.140793910987088e-05, "loss": 1.3699, "step": 929 }, { "epoch": 0.29, "grad_norm": 11.145682334899902, "learning_rate": 1.1402541977306725e-05, "loss": 0.8361, "step": 930 }, { "epoch": 0.29, "grad_norm": 17.095247268676758, "learning_rate": 1.1397140719545047e-05, "loss": 3.4324, "step": 931 }, { "epoch": 0.29, "grad_norm": 16.94806480407715, "learning_rate": 1.1391735341704977e-05, "loss": 3.3401, "step": 932 }, { "epoch": 0.29, "grad_norm": 11.145195007324219, "learning_rate": 1.1386325848909559e-05, "loss": 2.567, "step": 933 }, { "epoch": 0.29, "grad_norm": 16.297367095947266, "learning_rate": 1.1380912246285735e-05, "loss": 2.6329, "step": 934 }, { "epoch": 0.29, "grad_norm": 10.540289878845215, "learning_rate": 1.1375494538964339e-05, "loss": 1.7408, "step": 935 }, { "epoch": 0.29, "grad_norm": 14.331464767456055, "learning_rate": 1.13700727320801e-05, "loss": 4.355, "step": 936 }, { "epoch": 0.29, "grad_norm": 14.369190216064453, "learning_rate": 1.1364646830771628e-05, "loss": 3.484, "step": 937 }, { "epoch": 0.29, "grad_norm": 14.361828804016113, "learning_rate": 1.1359216840181416e-05, "loss": 1.7254, "step": 938 }, { "epoch": 0.29, "grad_norm": 13.768353462219238, "learning_rate": 1.1353782765455831e-05, "loss": 4.9616, "step": 939 }, { "epoch": 0.29, "grad_norm": 15.60986614227295, "learning_rate": 1.1348344611745116e-05, "loss": 2.9609, "step": 940 }, { "epoch": 0.29, "grad_norm": 12.908799171447754, "learning_rate": 1.1342902384203372e-05, "loss": 1.27, "step": 941 }, { "epoch": 0.29, "grad_norm": 12.849720001220703, "learning_rate": 1.1337456087988565e-05, "loss": 2.6348, "step": 942 }, { "epoch": 0.29, "grad_norm": 17.09627342224121, "learning_rate": 1.1332005728262522e-05, "loss": 5.6902, "step": 943 }, { "epoch": 0.29, "grad_norm": 14.49213981628418, "learning_rate": 1.1326551310190911e-05, "loss": 1.6267, "step": 944 }, { "epoch": 0.29, "grad_norm": 14.392061233520508, "learning_rate": 1.1321092838943253e-05, "loss": 2.2689, "step": 945 }, { "epoch": 0.29, "grad_norm": 15.092098236083984, "learning_rate": 1.1315630319692912e-05, "loss": 1.7458, "step": 946 }, { "epoch": 0.29, "grad_norm": 12.850863456726074, "learning_rate": 1.1310163757617084e-05, "loss": 1.6304, "step": 947 }, { "epoch": 0.29, "grad_norm": 16.881410598754883, "learning_rate": 1.1304693157896797e-05, "loss": 2.6307, "step": 948 }, { "epoch": 0.29, "grad_norm": 10.893935203552246, "learning_rate": 1.1299218525716909e-05, "loss": 2.6573, "step": 949 }, { "epoch": 0.29, "grad_norm": 21.684673309326172, "learning_rate": 1.1293739866266097e-05, "loss": 2.6042, "step": 950 }, { "epoch": 0.29, "grad_norm": 24.443126678466797, "learning_rate": 1.128825718473686e-05, "loss": 8.0742, "step": 951 }, { "epoch": 0.29, "grad_norm": 12.001096725463867, "learning_rate": 1.12827704863255e-05, "loss": 1.7396, "step": 952 }, { "epoch": 0.3, "grad_norm": 11.469615936279297, "learning_rate": 1.127727977623213e-05, "loss": 2.2511, "step": 953 }, { "epoch": 0.3, "grad_norm": 15.373016357421875, "learning_rate": 1.127178505966067e-05, "loss": 2.7536, "step": 954 }, { "epoch": 0.3, "grad_norm": 13.056225776672363, "learning_rate": 1.1266286341818836e-05, "loss": 1.0603, "step": 955 }, { "epoch": 0.3, "grad_norm": 12.561933517456055, "learning_rate": 1.1260783627918125e-05, "loss": 1.4368, "step": 956 }, { "epoch": 0.3, "grad_norm": 19.184635162353516, "learning_rate": 1.125527692317384e-05, "loss": 1.4412, "step": 957 }, { "epoch": 0.3, "grad_norm": 14.610422134399414, "learning_rate": 1.1249766232805048e-05, "loss": 1.268, "step": 958 }, { "epoch": 0.3, "grad_norm": 9.994367599487305, "learning_rate": 1.1244251562034607e-05, "loss": 1.3654, "step": 959 }, { "epoch": 0.3, "grad_norm": 12.45901870727539, "learning_rate": 1.1238732916089138e-05, "loss": 3.1829, "step": 960 }, { "epoch": 0.3, "grad_norm": 16.608116149902344, "learning_rate": 1.123321030019904e-05, "loss": 3.3567, "step": 961 }, { "epoch": 0.3, "grad_norm": 16.66891098022461, "learning_rate": 1.1227683719598463e-05, "loss": 2.76, "step": 962 }, { "epoch": 0.3, "grad_norm": 13.403820037841797, "learning_rate": 1.1222153179525323e-05, "loss": 1.8815, "step": 963 }, { "epoch": 0.3, "grad_norm": 14.18431568145752, "learning_rate": 1.1216618685221285e-05, "loss": 1.7656, "step": 964 }, { "epoch": 0.3, "grad_norm": 15.747517585754395, "learning_rate": 1.1211080241931762e-05, "loss": 4.782, "step": 965 }, { "epoch": 0.3, "grad_norm": 14.531620979309082, "learning_rate": 1.120553785490591e-05, "loss": 2.3307, "step": 966 }, { "epoch": 0.3, "grad_norm": 12.81969165802002, "learning_rate": 1.1199991529396624e-05, "loss": 1.9334, "step": 967 }, { "epoch": 0.3, "grad_norm": 16.989093780517578, "learning_rate": 1.1194441270660528e-05, "loss": 3.0846, "step": 968 }, { "epoch": 0.3, "grad_norm": 24.283540725708008, "learning_rate": 1.118888708395798e-05, "loss": 2.8705, "step": 969 }, { "epoch": 0.3, "grad_norm": 11.448086738586426, "learning_rate": 1.1183328974553058e-05, "loss": 1.2765, "step": 970 }, { "epoch": 0.3, "grad_norm": 14.364688873291016, "learning_rate": 1.1177766947713552e-05, "loss": 1.5004, "step": 971 }, { "epoch": 0.3, "grad_norm": 18.686744689941406, "learning_rate": 1.1172201008710974e-05, "loss": 3.0106, "step": 972 }, { "epoch": 0.3, "grad_norm": 9.328819274902344, "learning_rate": 1.1166631162820538e-05, "loss": 1.6537, "step": 973 }, { "epoch": 0.3, "grad_norm": 15.435249328613281, "learning_rate": 1.1161057415321164e-05, "loss": 5.4632, "step": 974 }, { "epoch": 0.3, "grad_norm": 14.374683380126953, "learning_rate": 1.1155479771495465e-05, "loss": 5.2145, "step": 975 }, { "epoch": 0.3, "grad_norm": 13.833335876464844, "learning_rate": 1.1149898236629754e-05, "loss": 2.1236, "step": 976 }, { "epoch": 0.3, "grad_norm": 13.178450584411621, "learning_rate": 1.1144312816014025e-05, "loss": 2.7498, "step": 977 }, { "epoch": 0.3, "grad_norm": 13.957172393798828, "learning_rate": 1.1138723514941963e-05, "loss": 0.9344, "step": 978 }, { "epoch": 0.3, "grad_norm": 14.987807273864746, "learning_rate": 1.1133130338710917e-05, "loss": 2.0331, "step": 979 }, { "epoch": 0.3, "grad_norm": 17.33484649658203, "learning_rate": 1.1127533292621924e-05, "loss": 4.294, "step": 980 }, { "epoch": 0.3, "grad_norm": 13.04723834991455, "learning_rate": 1.1121932381979679e-05, "loss": 2.3455, "step": 981 }, { "epoch": 0.3, "grad_norm": 13.156903266906738, "learning_rate": 1.1116327612092546e-05, "loss": 1.359, "step": 982 }, { "epoch": 0.3, "grad_norm": 15.709280967712402, "learning_rate": 1.111071898827254e-05, "loss": 2.018, "step": 983 }, { "epoch": 0.3, "grad_norm": 23.15648651123047, "learning_rate": 1.1105106515835333e-05, "loss": 3.0358, "step": 984 }, { "epoch": 0.31, "grad_norm": 10.00515079498291, "learning_rate": 1.1099490200100241e-05, "loss": 1.1987, "step": 985 }, { "epoch": 0.31, "grad_norm": 10.764809608459473, "learning_rate": 1.1093870046390231e-05, "loss": 1.7425, "step": 986 }, { "epoch": 0.31, "grad_norm": 13.764495849609375, "learning_rate": 1.1088246060031897e-05, "loss": 2.7288, "step": 987 }, { "epoch": 0.31, "grad_norm": 13.281590461730957, "learning_rate": 1.108261824635547e-05, "loss": 3.4756, "step": 988 }, { "epoch": 0.31, "grad_norm": 14.334712028503418, "learning_rate": 1.1076986610694813e-05, "loss": 2.1193, "step": 989 }, { "epoch": 0.31, "grad_norm": 18.38178253173828, "learning_rate": 1.1071351158387402e-05, "loss": 1.5241, "step": 990 }, { "epoch": 0.31, "grad_norm": 12.551787376403809, "learning_rate": 1.1065711894774338e-05, "loss": 5.2846, "step": 991 }, { "epoch": 0.31, "grad_norm": 10.72806167602539, "learning_rate": 1.106006882520033e-05, "loss": 2.9111, "step": 992 }, { "epoch": 0.31, "grad_norm": 16.107839584350586, "learning_rate": 1.1054421955013695e-05, "loss": 2.7331, "step": 993 }, { "epoch": 0.31, "grad_norm": 12.12115478515625, "learning_rate": 1.1048771289566354e-05, "loss": 1.9713, "step": 994 }, { "epoch": 0.31, "grad_norm": 12.031820297241211, "learning_rate": 1.1043116834213822e-05, "loss": 1.9452, "step": 995 }, { "epoch": 0.31, "grad_norm": 15.122635841369629, "learning_rate": 1.103745859431521e-05, "loss": 2.1204, "step": 996 }, { "epoch": 0.31, "grad_norm": 6.159744739532471, "learning_rate": 1.1031796575233211e-05, "loss": 0.8231, "step": 997 }, { "epoch": 0.31, "grad_norm": 12.978343963623047, "learning_rate": 1.1026130782334106e-05, "loss": 1.0703, "step": 998 }, { "epoch": 0.31, "grad_norm": 11.258939743041992, "learning_rate": 1.1020461220987744e-05, "loss": 1.32, "step": 999 }, { "epoch": 0.31, "grad_norm": 15.83403205871582, "learning_rate": 1.1014787896567554e-05, "loss": 1.9714, "step": 1000 }, { "epoch": 0.31, "grad_norm": 14.88000774383545, "learning_rate": 1.1009110814450527e-05, "loss": 2.2317, "step": 1001 }, { "epoch": 0.31, "grad_norm": 13.894899368286133, "learning_rate": 1.100342998001722e-05, "loss": 1.7284, "step": 1002 }, { "epoch": 0.31, "grad_norm": 17.018301010131836, "learning_rate": 1.0997745398651735e-05, "loss": 2.0597, "step": 1003 }, { "epoch": 0.31, "grad_norm": 18.89592933654785, "learning_rate": 1.0992057075741742e-05, "loss": 4.1914, "step": 1004 }, { "epoch": 0.31, "grad_norm": 17.937870025634766, "learning_rate": 1.098636501667844e-05, "loss": 9.2263, "step": 1005 }, { "epoch": 0.31, "grad_norm": 22.56218147277832, "learning_rate": 1.0980669226856584e-05, "loss": 2.8435, "step": 1006 }, { "epoch": 0.31, "grad_norm": 18.284509658813477, "learning_rate": 1.0974969711674452e-05, "loss": 4.8165, "step": 1007 }, { "epoch": 0.31, "grad_norm": 15.159753799438477, "learning_rate": 1.0969266476533864e-05, "loss": 2.9532, "step": 1008 }, { "epoch": 0.31, "grad_norm": 12.554959297180176, "learning_rate": 1.0963559526840157e-05, "loss": 2.5036, "step": 1009 }, { "epoch": 0.31, "grad_norm": 13.92638111114502, "learning_rate": 1.0957848868002198e-05, "loss": 3.2055, "step": 1010 }, { "epoch": 0.31, "grad_norm": 15.291211128234863, "learning_rate": 1.0952134505432357e-05, "loss": 3.4983, "step": 1011 }, { "epoch": 0.31, "grad_norm": 10.960637092590332, "learning_rate": 1.0946416444546519e-05, "loss": 1.178, "step": 1012 }, { "epoch": 0.31, "grad_norm": 12.908455848693848, "learning_rate": 1.0940694690764083e-05, "loss": 2.153, "step": 1013 }, { "epoch": 0.31, "grad_norm": 15.675530433654785, "learning_rate": 1.0934969249507937e-05, "loss": 4.4807, "step": 1014 }, { "epoch": 0.31, "grad_norm": 12.855313301086426, "learning_rate": 1.0929240126204468e-05, "loss": 3.9184, "step": 1015 }, { "epoch": 0.31, "grad_norm": 8.087355613708496, "learning_rate": 1.0923507326283553e-05, "loss": 1.181, "step": 1016 }, { "epoch": 0.32, "grad_norm": 10.986209869384766, "learning_rate": 1.0917770855178553e-05, "loss": 1.4583, "step": 1017 }, { "epoch": 0.32, "grad_norm": 16.826547622680664, "learning_rate": 1.0912030718326308e-05, "loss": 4.0032, "step": 1018 }, { "epoch": 0.32, "grad_norm": 9.497063636779785, "learning_rate": 1.0906286921167134e-05, "loss": 1.0684, "step": 1019 }, { "epoch": 0.32, "grad_norm": 10.521944999694824, "learning_rate": 1.0900539469144812e-05, "loss": 1.7304, "step": 1020 }, { "epoch": 0.32, "grad_norm": 13.924857139587402, "learning_rate": 1.089478836770659e-05, "loss": 2.4864, "step": 1021 }, { "epoch": 0.32, "grad_norm": 12.12009334564209, "learning_rate": 1.0889033622303182e-05, "loss": 1.3925, "step": 1022 }, { "epoch": 0.32, "grad_norm": 12.136836051940918, "learning_rate": 1.0883275238388739e-05, "loss": 1.4287, "step": 1023 }, { "epoch": 0.32, "grad_norm": 12.186498641967773, "learning_rate": 1.0877513221420873e-05, "loss": 1.7405, "step": 1024 }, { "epoch": 0.32, "grad_norm": 17.94057273864746, "learning_rate": 1.087174757686064e-05, "loss": 3.8773, "step": 1025 }, { "epoch": 0.32, "grad_norm": 16.553104400634766, "learning_rate": 1.0865978310172522e-05, "loss": 2.3274, "step": 1026 }, { "epoch": 0.32, "grad_norm": 15.304353713989258, "learning_rate": 1.0860205426824449e-05, "loss": 5.7125, "step": 1027 }, { "epoch": 0.32, "grad_norm": 12.027484893798828, "learning_rate": 1.0854428932287768e-05, "loss": 1.0223, "step": 1028 }, { "epoch": 0.32, "grad_norm": 14.752253532409668, "learning_rate": 1.0848648832037257e-05, "loss": 2.5004, "step": 1029 }, { "epoch": 0.32, "grad_norm": 19.452241897583008, "learning_rate": 1.0842865131551098e-05, "loss": 2.8527, "step": 1030 }, { "epoch": 0.32, "grad_norm": 17.181283950805664, "learning_rate": 1.0837077836310903e-05, "loss": 3.4927, "step": 1031 }, { "epoch": 0.32, "grad_norm": 9.379412651062012, "learning_rate": 1.083128695180168e-05, "loss": 0.9177, "step": 1032 }, { "epoch": 0.32, "grad_norm": 11.588668823242188, "learning_rate": 1.0825492483511836e-05, "loss": 1.4227, "step": 1033 }, { "epoch": 0.32, "grad_norm": 11.405271530151367, "learning_rate": 1.0819694436933183e-05, "loss": 1.6208, "step": 1034 }, { "epoch": 0.32, "grad_norm": 16.79037094116211, "learning_rate": 1.081389281756092e-05, "loss": 2.357, "step": 1035 }, { "epoch": 0.32, "grad_norm": 13.481416702270508, "learning_rate": 1.0808087630893634e-05, "loss": 1.7349, "step": 1036 }, { "epoch": 0.32, "grad_norm": 12.72651481628418, "learning_rate": 1.0802278882433291e-05, "loss": 1.3169, "step": 1037 }, { "epoch": 0.32, "grad_norm": 15.26197624206543, "learning_rate": 1.0796466577685234e-05, "loss": 2.0093, "step": 1038 }, { "epoch": 0.32, "grad_norm": 13.19687271118164, "learning_rate": 1.0790650722158175e-05, "loss": 2.2271, "step": 1039 }, { "epoch": 0.32, "grad_norm": 11.644251823425293, "learning_rate": 1.0784831321364192e-05, "loss": 1.1088, "step": 1040 }, { "epoch": 0.32, "grad_norm": 18.361989974975586, "learning_rate": 1.0779008380818725e-05, "loss": 6.0244, "step": 1041 }, { "epoch": 0.32, "grad_norm": 20.438385009765625, "learning_rate": 1.0773181906040568e-05, "loss": 2.9884, "step": 1042 }, { "epoch": 0.32, "grad_norm": 15.359336853027344, "learning_rate": 1.0767351902551863e-05, "loss": 1.2271, "step": 1043 }, { "epoch": 0.32, "grad_norm": 16.786720275878906, "learning_rate": 1.0761518375878097e-05, "loss": 5.1943, "step": 1044 }, { "epoch": 0.32, "grad_norm": 15.47581672668457, "learning_rate": 1.0755681331548095e-05, "loss": 2.0553, "step": 1045 }, { "epoch": 0.32, "grad_norm": 13.731276512145996, "learning_rate": 1.0749840775094023e-05, "loss": 2.6001, "step": 1046 }, { "epoch": 0.32, "grad_norm": 12.26632022857666, "learning_rate": 1.0743996712051366e-05, "loss": 1.6236, "step": 1047 }, { "epoch": 0.32, "grad_norm": 12.464577674865723, "learning_rate": 1.0738149147958936e-05, "loss": 1.7641, "step": 1048 }, { "epoch": 0.33, "grad_norm": 17.070165634155273, "learning_rate": 1.0732298088358867e-05, "loss": 2.8956, "step": 1049 }, { "epoch": 0.33, "grad_norm": 17.91103172302246, "learning_rate": 1.07264435387966e-05, "loss": 3.1296, "step": 1050 }, { "epoch": 0.33, "grad_norm": 12.081746101379395, "learning_rate": 1.0720585504820885e-05, "loss": 2.3239, "step": 1051 }, { "epoch": 0.33, "grad_norm": 11.550629615783691, "learning_rate": 1.0714723991983782e-05, "loss": 2.0387, "step": 1052 }, { "epoch": 0.33, "grad_norm": 10.877471923828125, "learning_rate": 1.0708859005840637e-05, "loss": 0.9464, "step": 1053 }, { "epoch": 0.33, "grad_norm": 18.174089431762695, "learning_rate": 1.0702990551950092e-05, "loss": 2.5236, "step": 1054 }, { "epoch": 0.33, "grad_norm": 15.213211059570312, "learning_rate": 1.069711863587408e-05, "loss": 2.5513, "step": 1055 }, { "epoch": 0.33, "grad_norm": 17.575151443481445, "learning_rate": 1.0691243263177808e-05, "loss": 4.2446, "step": 1056 }, { "epoch": 0.33, "grad_norm": 17.19057273864746, "learning_rate": 1.0685364439429767e-05, "loss": 1.7243, "step": 1057 }, { "epoch": 0.33, "grad_norm": 12.374418258666992, "learning_rate": 1.0679482170201714e-05, "loss": 1.9149, "step": 1058 }, { "epoch": 0.33, "grad_norm": 19.93390464782715, "learning_rate": 1.0673596461068668e-05, "loss": 4.4496, "step": 1059 }, { "epoch": 0.33, "grad_norm": 13.360801696777344, "learning_rate": 1.0667707317608917e-05, "loss": 1.2565, "step": 1060 }, { "epoch": 0.33, "grad_norm": 14.6316556930542, "learning_rate": 1.0661814745403998e-05, "loss": 5.3259, "step": 1061 }, { "epoch": 0.33, "grad_norm": 16.264671325683594, "learning_rate": 1.0655918750038698e-05, "loss": 1.9203, "step": 1062 }, { "epoch": 0.33, "grad_norm": 25.273666381835938, "learning_rate": 1.065001933710105e-05, "loss": 7.867, "step": 1063 }, { "epoch": 0.33, "grad_norm": 17.659454345703125, "learning_rate": 1.0644116512182325e-05, "loss": 1.921, "step": 1064 }, { "epoch": 0.33, "grad_norm": 19.446399688720703, "learning_rate": 1.063821028087703e-05, "loss": 6.1517, "step": 1065 }, { "epoch": 0.33, "grad_norm": 13.461407661437988, "learning_rate": 1.0632300648782897e-05, "loss": 3.0714, "step": 1066 }, { "epoch": 0.33, "grad_norm": 19.374279022216797, "learning_rate": 1.0626387621500884e-05, "loss": 5.1723, "step": 1067 }, { "epoch": 0.33, "grad_norm": 19.21060562133789, "learning_rate": 1.0620471204635164e-05, "loss": 2.4489, "step": 1068 }, { "epoch": 0.33, "grad_norm": 14.720632553100586, "learning_rate": 1.0614551403793124e-05, "loss": 2.9885, "step": 1069 }, { "epoch": 0.33, "grad_norm": 16.278717041015625, "learning_rate": 1.060862822458536e-05, "loss": 4.4297, "step": 1070 }, { "epoch": 0.33, "grad_norm": 16.912240982055664, "learning_rate": 1.060270167262567e-05, "loss": 2.9691, "step": 1071 }, { "epoch": 0.33, "grad_norm": 20.76288414001465, "learning_rate": 1.0596771753531042e-05, "loss": 6.1958, "step": 1072 }, { "epoch": 0.33, "grad_norm": 15.202351570129395, "learning_rate": 1.0590838472921663e-05, "loss": 2.0175, "step": 1073 }, { "epoch": 0.33, "grad_norm": 14.269200325012207, "learning_rate": 1.0584901836420905e-05, "loss": 2.3095, "step": 1074 }, { "epoch": 0.33, "grad_norm": 16.454748153686523, "learning_rate": 1.0578961849655319e-05, "loss": 5.6113, "step": 1075 }, { "epoch": 0.33, "grad_norm": 17.027233123779297, "learning_rate": 1.0573018518254625e-05, "loss": 5.2381, "step": 1076 }, { "epoch": 0.33, "grad_norm": 13.088054656982422, "learning_rate": 1.0567071847851723e-05, "loss": 2.7067, "step": 1077 }, { "epoch": 0.33, "grad_norm": 12.304478645324707, "learning_rate": 1.0561121844082676e-05, "loss": 2.5341, "step": 1078 }, { "epoch": 0.33, "grad_norm": 17.764257431030273, "learning_rate": 1.0555168512586697e-05, "loss": 1.7237, "step": 1079 }, { "epoch": 0.33, "grad_norm": 23.56541633605957, "learning_rate": 1.0549211859006169e-05, "loss": 3.334, "step": 1080 }, { "epoch": 0.33, "grad_norm": 16.430952072143555, "learning_rate": 1.0543251888986605e-05, "loss": 2.1071, "step": 1081 }, { "epoch": 0.34, "grad_norm": 12.087030410766602, "learning_rate": 1.0537288608176673e-05, "loss": 2.5736, "step": 1082 }, { "epoch": 0.34, "grad_norm": 17.93031120300293, "learning_rate": 1.0531322022228181e-05, "loss": 5.9253, "step": 1083 }, { "epoch": 0.34, "grad_norm": 18.3150577545166, "learning_rate": 1.0525352136796061e-05, "loss": 2.781, "step": 1084 }, { "epoch": 0.34, "grad_norm": 12.447970390319824, "learning_rate": 1.0519378957538378e-05, "loss": 1.8882, "step": 1085 }, { "epoch": 0.34, "grad_norm": 12.831107139587402, "learning_rate": 1.0513402490116313e-05, "loss": 1.3537, "step": 1086 }, { "epoch": 0.34, "grad_norm": 18.853090286254883, "learning_rate": 1.050742274019417e-05, "loss": 7.4965, "step": 1087 }, { "epoch": 0.34, "grad_norm": 16.2085018157959, "learning_rate": 1.050143971343936e-05, "loss": 2.4386, "step": 1088 }, { "epoch": 0.34, "grad_norm": 16.624937057495117, "learning_rate": 1.0495453415522405e-05, "loss": 2.6718, "step": 1089 }, { "epoch": 0.34, "grad_norm": 28.173011779785156, "learning_rate": 1.0489463852116918e-05, "loss": 2.2093, "step": 1090 }, { "epoch": 0.34, "grad_norm": 9.845703125, "learning_rate": 1.0483471028899613e-05, "loss": 1.1327, "step": 1091 }, { "epoch": 0.34, "grad_norm": 13.236507415771484, "learning_rate": 1.0477474951550294e-05, "loss": 3.1908, "step": 1092 }, { "epoch": 0.34, "grad_norm": 7.714241981506348, "learning_rate": 1.0471475625751847e-05, "loss": 0.7375, "step": 1093 }, { "epoch": 0.34, "grad_norm": 16.44623374938965, "learning_rate": 1.046547305719024e-05, "loss": 1.7071, "step": 1094 }, { "epoch": 0.34, "grad_norm": 6.655475616455078, "learning_rate": 1.0459467251554508e-05, "loss": 0.668, "step": 1095 }, { "epoch": 0.34, "grad_norm": 12.086280822753906, "learning_rate": 1.0453458214536762e-05, "loss": 1.7233, "step": 1096 }, { "epoch": 0.34, "grad_norm": 16.326202392578125, "learning_rate": 1.0447445951832167e-05, "loss": 1.9609, "step": 1097 }, { "epoch": 0.34, "grad_norm": 9.662792205810547, "learning_rate": 1.0441430469138954e-05, "loss": 0.9869, "step": 1098 }, { "epoch": 0.34, "grad_norm": 12.256331443786621, "learning_rate": 1.0435411772158401e-05, "loss": 1.7208, "step": 1099 }, { "epoch": 0.34, "grad_norm": 23.35416030883789, "learning_rate": 1.042938986659483e-05, "loss": 2.9477, "step": 1100 }, { "epoch": 0.34, "grad_norm": 15.350637435913086, "learning_rate": 1.0423364758155609e-05, "loss": 1.8908, "step": 1101 }, { "epoch": 0.34, "grad_norm": 17.07219696044922, "learning_rate": 1.041733645255114e-05, "loss": 2.6845, "step": 1102 }, { "epoch": 0.34, "grad_norm": 19.381675720214844, "learning_rate": 1.0411304955494854e-05, "loss": 3.0835, "step": 1103 }, { "epoch": 0.34, "grad_norm": 10.797784805297852, "learning_rate": 1.040527027270321e-05, "loss": 1.382, "step": 1104 }, { "epoch": 0.34, "grad_norm": 18.23086166381836, "learning_rate": 1.0399232409895677e-05, "loss": 1.6303, "step": 1105 }, { "epoch": 0.34, "grad_norm": 9.812505722045898, "learning_rate": 1.0393191372794753e-05, "loss": 0.8937, "step": 1106 }, { "epoch": 0.34, "grad_norm": 19.530162811279297, "learning_rate": 1.0387147167125929e-05, "loss": 3.8252, "step": 1107 }, { "epoch": 0.34, "grad_norm": 21.945730209350586, "learning_rate": 1.0381099798617711e-05, "loss": 4.0083, "step": 1108 }, { "epoch": 0.34, "grad_norm": 21.621074676513672, "learning_rate": 1.0375049273001596e-05, "loss": 5.3178, "step": 1109 }, { "epoch": 0.34, "grad_norm": 13.84105110168457, "learning_rate": 1.0368995596012079e-05, "loss": 1.3034, "step": 1110 }, { "epoch": 0.34, "grad_norm": 16.137035369873047, "learning_rate": 1.036293877338663e-05, "loss": 1.0091, "step": 1111 }, { "epoch": 0.34, "grad_norm": 14.98267650604248, "learning_rate": 1.0356878810865713e-05, "loss": 4.8692, "step": 1112 }, { "epoch": 0.34, "grad_norm": 10.179339408874512, "learning_rate": 1.0350815714192765e-05, "loss": 2.0231, "step": 1113 }, { "epoch": 0.35, "grad_norm": 11.825215339660645, "learning_rate": 1.034474948911419e-05, "loss": 1.9089, "step": 1114 }, { "epoch": 0.35, "grad_norm": 15.7478609085083, "learning_rate": 1.0338680141379355e-05, "loss": 5.4506, "step": 1115 }, { "epoch": 0.35, "grad_norm": 16.455516815185547, "learning_rate": 1.0332607676740596e-05, "loss": 4.8066, "step": 1116 }, { "epoch": 0.35, "grad_norm": 11.75464916229248, "learning_rate": 1.0326532100953193e-05, "loss": 2.9567, "step": 1117 }, { "epoch": 0.35, "grad_norm": 12.472881317138672, "learning_rate": 1.0320453419775378e-05, "loss": 1.5355, "step": 1118 }, { "epoch": 0.35, "grad_norm": 14.322370529174805, "learning_rate": 1.0314371638968331e-05, "loss": 1.8469, "step": 1119 }, { "epoch": 0.35, "grad_norm": 12.017380714416504, "learning_rate": 1.0308286764296164e-05, "loss": 2.8139, "step": 1120 }, { "epoch": 0.35, "grad_norm": 10.160161972045898, "learning_rate": 1.0302198801525923e-05, "loss": 1.2088, "step": 1121 }, { "epoch": 0.35, "grad_norm": 20.627988815307617, "learning_rate": 1.029610775642758e-05, "loss": 4.5566, "step": 1122 }, { "epoch": 0.35, "grad_norm": 12.355149269104004, "learning_rate": 1.0290013634774031e-05, "loss": 1.3943, "step": 1123 }, { "epoch": 0.35, "grad_norm": 12.896719932556152, "learning_rate": 1.0283916442341085e-05, "loss": 2.469, "step": 1124 }, { "epoch": 0.35, "grad_norm": 25.20741081237793, "learning_rate": 1.0277816184907462e-05, "loss": 5.2271, "step": 1125 }, { "epoch": 0.35, "grad_norm": 13.427926063537598, "learning_rate": 1.0271712868254787e-05, "loss": 2.7404, "step": 1126 }, { "epoch": 0.35, "grad_norm": 14.849968910217285, "learning_rate": 1.0265606498167586e-05, "loss": 5.4348, "step": 1127 }, { "epoch": 0.35, "grad_norm": 15.994071960449219, "learning_rate": 1.0259497080433279e-05, "loss": 2.3489, "step": 1128 }, { "epoch": 0.35, "grad_norm": 16.78647804260254, "learning_rate": 1.0253384620842169e-05, "loss": 5.6616, "step": 1129 }, { "epoch": 0.35, "grad_norm": 12.617353439331055, "learning_rate": 1.0247269125187447e-05, "loss": 2.2434, "step": 1130 }, { "epoch": 0.35, "grad_norm": 11.257705688476562, "learning_rate": 1.0241150599265186e-05, "loss": 1.5982, "step": 1131 }, { "epoch": 0.35, "grad_norm": 14.668039321899414, "learning_rate": 1.0235029048874318e-05, "loss": 1.4743, "step": 1132 }, { "epoch": 0.35, "grad_norm": 13.072298049926758, "learning_rate": 1.0228904479816654e-05, "loss": 1.8054, "step": 1133 }, { "epoch": 0.35, "grad_norm": 13.091022491455078, "learning_rate": 1.022277689789686e-05, "loss": 1.8684, "step": 1134 }, { "epoch": 0.35, "grad_norm": 12.35072135925293, "learning_rate": 1.0216646308922457e-05, "loss": 2.476, "step": 1135 }, { "epoch": 0.35, "grad_norm": 8.345824241638184, "learning_rate": 1.021051271870382e-05, "loss": 1.0648, "step": 1136 }, { "epoch": 0.35, "grad_norm": 15.507397651672363, "learning_rate": 1.0204376133054166e-05, "loss": 6.1307, "step": 1137 }, { "epoch": 0.35, "grad_norm": 16.02056312561035, "learning_rate": 1.0198236557789548e-05, "loss": 2.7045, "step": 1138 }, { "epoch": 0.35, "grad_norm": 14.535933494567871, "learning_rate": 1.0192093998728858e-05, "loss": 2.0451, "step": 1139 }, { "epoch": 0.35, "grad_norm": 13.683833122253418, "learning_rate": 1.0185948461693812e-05, "loss": 1.9239, "step": 1140 }, { "epoch": 0.35, "grad_norm": 10.788311958312988, "learning_rate": 1.0179799952508952e-05, "loss": 1.2934, "step": 1141 }, { "epoch": 0.35, "grad_norm": 12.489497184753418, "learning_rate": 1.0173648477001632e-05, "loss": 1.8452, "step": 1142 }, { "epoch": 0.35, "grad_norm": 17.01746940612793, "learning_rate": 1.0167494041002023e-05, "loss": 2.1199, "step": 1143 }, { "epoch": 0.35, "grad_norm": 13.93575382232666, "learning_rate": 1.0161336650343097e-05, "loss": 2.2911, "step": 1144 }, { "epoch": 0.35, "grad_norm": 16.233535766601562, "learning_rate": 1.0155176310860629e-05, "loss": 4.7763, "step": 1145 }, { "epoch": 0.36, "grad_norm": 23.979536056518555, "learning_rate": 1.0149013028393188e-05, "loss": 3.326, "step": 1146 }, { "epoch": 0.36, "grad_norm": 17.001752853393555, "learning_rate": 1.0142846808782132e-05, "loss": 3.1564, "step": 1147 }, { "epoch": 0.36, "grad_norm": 23.573925018310547, "learning_rate": 1.0136677657871602e-05, "loss": 4.3396, "step": 1148 }, { "epoch": 0.36, "grad_norm": 13.067983627319336, "learning_rate": 1.0130505581508521e-05, "loss": 1.7841, "step": 1149 }, { "epoch": 0.36, "grad_norm": 19.41147804260254, "learning_rate": 1.0124330585542581e-05, "loss": 3.8713, "step": 1150 }, { "epoch": 0.36, "grad_norm": 12.91067886352539, "learning_rate": 1.0118152675826241e-05, "loss": 4.796, "step": 1151 }, { "epoch": 0.36, "grad_norm": 12.093422889709473, "learning_rate": 1.0111971858214724e-05, "loss": 1.9465, "step": 1152 }, { "epoch": 0.36, "grad_norm": 11.869210243225098, "learning_rate": 1.010578813856601e-05, "loss": 4.7352, "step": 1153 }, { "epoch": 0.36, "grad_norm": 11.492764472961426, "learning_rate": 1.0099601522740818e-05, "loss": 1.0493, "step": 1154 }, { "epoch": 0.36, "grad_norm": 12.02536678314209, "learning_rate": 1.0093412016602632e-05, "loss": 1.4691, "step": 1155 }, { "epoch": 0.36, "grad_norm": 19.572359085083008, "learning_rate": 1.0087219626017658e-05, "loss": 2.0801, "step": 1156 }, { "epoch": 0.36, "grad_norm": 11.340232849121094, "learning_rate": 1.0081024356854843e-05, "loss": 1.668, "step": 1157 }, { "epoch": 0.36, "grad_norm": 16.6257266998291, "learning_rate": 1.0074826214985863e-05, "loss": 1.0484, "step": 1158 }, { "epoch": 0.36, "grad_norm": 17.96036720275879, "learning_rate": 1.0068625206285112e-05, "loss": 2.1781, "step": 1159 }, { "epoch": 0.36, "grad_norm": 14.40553092956543, "learning_rate": 1.0062421336629707e-05, "loss": 1.4709, "step": 1160 }, { "epoch": 0.36, "grad_norm": 13.915143013000488, "learning_rate": 1.0056214611899471e-05, "loss": 1.6584, "step": 1161 }, { "epoch": 0.36, "grad_norm": 12.291485786437988, "learning_rate": 1.0050005037976941e-05, "loss": 4.2902, "step": 1162 }, { "epoch": 0.36, "grad_norm": 15.307587623596191, "learning_rate": 1.0043792620747342e-05, "loss": 5.2116, "step": 1163 }, { "epoch": 0.36, "grad_norm": 19.080533981323242, "learning_rate": 1.0037577366098603e-05, "loss": 2.7628, "step": 1164 }, { "epoch": 0.36, "grad_norm": 14.373787879943848, "learning_rate": 1.0031359279921343e-05, "loss": 2.3492, "step": 1165 }, { "epoch": 0.36, "grad_norm": 16.30225944519043, "learning_rate": 1.0025138368108858e-05, "loss": 5.119, "step": 1166 }, { "epoch": 0.36, "grad_norm": 17.795886993408203, "learning_rate": 1.0018914636557127e-05, "loss": 1.8234, "step": 1167 }, { "epoch": 0.36, "grad_norm": 13.938688278198242, "learning_rate": 1.00126880911648e-05, "loss": 3.5577, "step": 1168 }, { "epoch": 0.36, "grad_norm": 12.905827522277832, "learning_rate": 1.0006458737833194e-05, "loss": 0.7515, "step": 1169 }, { "epoch": 0.36, "grad_norm": 16.679336547851562, "learning_rate": 1.0000226582466286e-05, "loss": 4.5681, "step": 1170 }, { "epoch": 0.36, "grad_norm": 15.47931957244873, "learning_rate": 9.99399163097071e-06, "loss": 1.555, "step": 1171 }, { "epoch": 0.36, "grad_norm": 16.255754470825195, "learning_rate": 9.987753889255753e-06, "loss": 1.9615, "step": 1172 }, { "epoch": 0.36, "grad_norm": 14.039870262145996, "learning_rate": 9.981513363233342e-06, "loss": 2.3602, "step": 1173 }, { "epoch": 0.36, "grad_norm": 15.20569133758545, "learning_rate": 9.975270058818041e-06, "loss": 4.2076, "step": 1174 }, { "epoch": 0.36, "grad_norm": 9.11036205291748, "learning_rate": 9.969023981927054e-06, "loss": 1.4167, "step": 1175 }, { "epoch": 0.36, "grad_norm": 16.150285720825195, "learning_rate": 9.962775138480211e-06, "loss": 3.0512, "step": 1176 }, { "epoch": 0.36, "grad_norm": 17.30279541015625, "learning_rate": 9.956523534399956e-06, "loss": 3.5126, "step": 1177 }, { "epoch": 0.36, "grad_norm": 14.633407592773438, "learning_rate": 9.95026917561136e-06, "loss": 5.3493, "step": 1178 }, { "epoch": 0.37, "grad_norm": 21.873212814331055, "learning_rate": 9.944012068042102e-06, "loss": 5.6664, "step": 1179 }, { "epoch": 0.37, "grad_norm": 12.300834655761719, "learning_rate": 9.937752217622463e-06, "loss": 2.9006, "step": 1180 }, { "epoch": 0.37, "grad_norm": 15.483316421508789, "learning_rate": 9.931489630285323e-06, "loss": 2.9011, "step": 1181 }, { "epoch": 0.37, "grad_norm": 16.026229858398438, "learning_rate": 9.92522431196616e-06, "loss": 1.76, "step": 1182 }, { "epoch": 0.37, "grad_norm": 12.829645156860352, "learning_rate": 9.91895626860304e-06, "loss": 1.9771, "step": 1183 }, { "epoch": 0.37, "grad_norm": 8.540634155273438, "learning_rate": 9.912685506136613e-06, "loss": 1.0964, "step": 1184 }, { "epoch": 0.37, "grad_norm": 12.960216522216797, "learning_rate": 9.906412030510093e-06, "loss": 1.8555, "step": 1185 }, { "epoch": 0.37, "grad_norm": 17.319026947021484, "learning_rate": 9.900135847669287e-06, "loss": 2.5102, "step": 1186 }, { "epoch": 0.37, "grad_norm": 19.70650291442871, "learning_rate": 9.893856963562553e-06, "loss": 6.4272, "step": 1187 }, { "epoch": 0.37, "grad_norm": 10.795018196105957, "learning_rate": 9.88757538414081e-06, "loss": 1.5096, "step": 1188 }, { "epoch": 0.37, "grad_norm": 14.696696281433105, "learning_rate": 9.88129111535754e-06, "loss": 5.2524, "step": 1189 }, { "epoch": 0.37, "grad_norm": 13.470584869384766, "learning_rate": 9.875004163168761e-06, "loss": 1.545, "step": 1190 }, { "epoch": 0.37, "grad_norm": 15.876420021057129, "learning_rate": 9.868714533533047e-06, "loss": 1.419, "step": 1191 }, { "epoch": 0.37, "grad_norm": 12.037657737731934, "learning_rate": 9.862422232411503e-06, "loss": 1.228, "step": 1192 }, { "epoch": 0.37, "grad_norm": 16.153223037719727, "learning_rate": 9.856127265767766e-06, "loss": 2.9238, "step": 1193 }, { "epoch": 0.37, "grad_norm": 8.563721656799316, "learning_rate": 9.849829639568e-06, "loss": 0.9769, "step": 1194 }, { "epoch": 0.37, "grad_norm": 12.931380271911621, "learning_rate": 9.843529359780893e-06, "loss": 1.8056, "step": 1195 }, { "epoch": 0.37, "grad_norm": 14.038524627685547, "learning_rate": 9.83722643237764e-06, "loss": 2.5044, "step": 1196 }, { "epoch": 0.37, "grad_norm": 21.455846786499023, "learning_rate": 9.830920863331956e-06, "loss": 6.5308, "step": 1197 }, { "epoch": 0.37, "grad_norm": 9.880926132202148, "learning_rate": 9.824612658620052e-06, "loss": 1.3094, "step": 1198 }, { "epoch": 0.37, "grad_norm": 11.45067310333252, "learning_rate": 9.818301824220637e-06, "loss": 1.7067, "step": 1199 }, { "epoch": 0.37, "grad_norm": 11.626029968261719, "learning_rate": 9.811988366114913e-06, "loss": 2.5457, "step": 1200 }, { "epoch": 0.37, "grad_norm": 22.094850540161133, "learning_rate": 9.805672290286577e-06, "loss": 2.4551, "step": 1201 }, { "epoch": 0.37, "grad_norm": 15.181697845458984, "learning_rate": 9.799353602721792e-06, "loss": 1.7186, "step": 1202 }, { "epoch": 0.37, "grad_norm": 15.625029563903809, "learning_rate": 9.79303230940921e-06, "loss": 4.4049, "step": 1203 }, { "epoch": 0.37, "grad_norm": 14.522659301757812, "learning_rate": 9.786708416339944e-06, "loss": 1.8953, "step": 1204 }, { "epoch": 0.37, "grad_norm": 11.440786361694336, "learning_rate": 9.780381929507575e-06, "loss": 2.1252, "step": 1205 }, { "epoch": 0.37, "grad_norm": 15.497150421142578, "learning_rate": 9.774052854908138e-06, "loss": 3.1987, "step": 1206 }, { "epoch": 0.37, "grad_norm": 17.17487907409668, "learning_rate": 9.767721198540129e-06, "loss": 1.7985, "step": 1207 }, { "epoch": 0.37, "grad_norm": 11.13756275177002, "learning_rate": 9.761386966404483e-06, "loss": 1.5107, "step": 1208 }, { "epoch": 0.37, "grad_norm": 13.055180549621582, "learning_rate": 9.755050164504577e-06, "loss": 3.8065, "step": 1209 }, { "epoch": 0.37, "grad_norm": 12.56474494934082, "learning_rate": 9.748710798846232e-06, "loss": 2.9978, "step": 1210 }, { "epoch": 0.38, "grad_norm": 14.121736526489258, "learning_rate": 9.742368875437683e-06, "loss": 2.078, "step": 1211 }, { "epoch": 0.38, "grad_norm": 10.13382339477539, "learning_rate": 9.736024400289606e-06, "loss": 1.387, "step": 1212 }, { "epoch": 0.38, "grad_norm": 11.474137306213379, "learning_rate": 9.729677379415085e-06, "loss": 1.4034, "step": 1213 }, { "epoch": 0.38, "grad_norm": 15.429963111877441, "learning_rate": 9.723327818829618e-06, "loss": 1.1999, "step": 1214 }, { "epoch": 0.38, "grad_norm": 10.799750328063965, "learning_rate": 9.716975724551116e-06, "loss": 1.0906, "step": 1215 }, { "epoch": 0.38, "grad_norm": 12.66462230682373, "learning_rate": 9.710621102599883e-06, "loss": 2.6997, "step": 1216 }, { "epoch": 0.38, "grad_norm": 15.209826469421387, "learning_rate": 9.704263958998624e-06, "loss": 2.7838, "step": 1217 }, { "epoch": 0.38, "grad_norm": 9.077875137329102, "learning_rate": 9.697904299772434e-06, "loss": 1.6925, "step": 1218 }, { "epoch": 0.38, "grad_norm": 9.607660293579102, "learning_rate": 9.691542130948792e-06, "loss": 1.4831, "step": 1219 }, { "epoch": 0.38, "grad_norm": 15.674386978149414, "learning_rate": 9.685177458557548e-06, "loss": 2.0321, "step": 1220 }, { "epoch": 0.38, "grad_norm": 22.753320693969727, "learning_rate": 9.678810288630936e-06, "loss": 3.6639, "step": 1221 }, { "epoch": 0.38, "grad_norm": 16.516311645507812, "learning_rate": 9.672440627203556e-06, "loss": 7.3332, "step": 1222 }, { "epoch": 0.38, "grad_norm": 11.29770278930664, "learning_rate": 9.666068480312358e-06, "loss": 0.9376, "step": 1223 }, { "epoch": 0.38, "grad_norm": 18.074369430541992, "learning_rate": 9.659693853996658e-06, "loss": 2.0509, "step": 1224 }, { "epoch": 0.38, "grad_norm": 14.756071090698242, "learning_rate": 9.653316754298122e-06, "loss": 1.5116, "step": 1225 }, { "epoch": 0.38, "grad_norm": 18.59358024597168, "learning_rate": 9.646937187260755e-06, "loss": 4.5909, "step": 1226 }, { "epoch": 0.38, "grad_norm": 10.734687805175781, "learning_rate": 9.640555158930902e-06, "loss": 2.512, "step": 1227 }, { "epoch": 0.38, "grad_norm": 10.989273071289062, "learning_rate": 9.634170675357244e-06, "loss": 1.2963, "step": 1228 }, { "epoch": 0.38, "grad_norm": 16.843233108520508, "learning_rate": 9.627783742590786e-06, "loss": 3.031, "step": 1229 }, { "epoch": 0.38, "grad_norm": 11.894126892089844, "learning_rate": 9.621394366684852e-06, "loss": 2.2722, "step": 1230 }, { "epoch": 0.38, "grad_norm": 22.67670440673828, "learning_rate": 9.61500255369509e-06, "loss": 2.397, "step": 1231 }, { "epoch": 0.38, "grad_norm": 23.82426643371582, "learning_rate": 9.608608309679443e-06, "loss": 2.2971, "step": 1232 }, { "epoch": 0.38, "grad_norm": 13.017992973327637, "learning_rate": 9.602211640698177e-06, "loss": 1.5027, "step": 1233 }, { "epoch": 0.38, "grad_norm": 15.917731285095215, "learning_rate": 9.595812552813842e-06, "loss": 4.8058, "step": 1234 }, { "epoch": 0.38, "grad_norm": 13.124893188476562, "learning_rate": 9.589411052091286e-06, "loss": 2.5112, "step": 1235 }, { "epoch": 0.38, "grad_norm": 11.611188888549805, "learning_rate": 9.583007144597642e-06, "loss": 1.5805, "step": 1236 }, { "epoch": 0.38, "grad_norm": 10.515594482421875, "learning_rate": 9.576600836402324e-06, "loss": 1.3343, "step": 1237 }, { "epoch": 0.38, "grad_norm": 18.802034378051758, "learning_rate": 9.570192133577026e-06, "loss": 2.038, "step": 1238 }, { "epoch": 0.38, "grad_norm": 16.2142391204834, "learning_rate": 9.563781042195703e-06, "loss": 3.2628, "step": 1239 }, { "epoch": 0.38, "grad_norm": 15.4701566696167, "learning_rate": 9.557367568334583e-06, "loss": 2.5495, "step": 1240 }, { "epoch": 0.38, "grad_norm": 14.809575080871582, "learning_rate": 9.550951718072145e-06, "loss": 2.47, "step": 1241 }, { "epoch": 0.38, "grad_norm": 17.822465896606445, "learning_rate": 9.544533497489123e-06, "loss": 1.7422, "step": 1242 }, { "epoch": 0.39, "grad_norm": 11.080434799194336, "learning_rate": 9.538112912668501e-06, "loss": 1.3608, "step": 1243 }, { "epoch": 0.39, "grad_norm": 16.433822631835938, "learning_rate": 9.531689969695495e-06, "loss": 2.3869, "step": 1244 }, { "epoch": 0.39, "grad_norm": 19.75357437133789, "learning_rate": 9.525264674657563e-06, "loss": 5.278, "step": 1245 }, { "epoch": 0.39, "grad_norm": 9.760089874267578, "learning_rate": 9.518837033644393e-06, "loss": 1.2702, "step": 1246 }, { "epoch": 0.39, "grad_norm": 15.111987113952637, "learning_rate": 9.512407052747889e-06, "loss": 1.9018, "step": 1247 }, { "epoch": 0.39, "grad_norm": 17.314002990722656, "learning_rate": 9.50597473806218e-06, "loss": 3.2879, "step": 1248 }, { "epoch": 0.39, "grad_norm": 10.623244285583496, "learning_rate": 9.499540095683605e-06, "loss": 1.0273, "step": 1249 }, { "epoch": 0.39, "grad_norm": 8.672993659973145, "learning_rate": 9.49310313171071e-06, "loss": 1.1626, "step": 1250 }, { "epoch": 0.39, "grad_norm": 13.466089248657227, "learning_rate": 9.486663852244235e-06, "loss": 1.5149, "step": 1251 }, { "epoch": 0.39, "grad_norm": 17.448925018310547, "learning_rate": 9.480222263387122e-06, "loss": 4.1081, "step": 1252 }, { "epoch": 0.39, "grad_norm": 11.953042984008789, "learning_rate": 9.4737783712445e-06, "loss": 1.8349, "step": 1253 }, { "epoch": 0.39, "grad_norm": 9.215497970581055, "learning_rate": 9.467332181923682e-06, "loss": 1.0758, "step": 1254 }, { "epoch": 0.39, "grad_norm": 15.898959159851074, "learning_rate": 9.46088370153415e-06, "loss": 2.8912, "step": 1255 }, { "epoch": 0.39, "grad_norm": 14.336346626281738, "learning_rate": 9.45443293618757e-06, "loss": 3.0235, "step": 1256 }, { "epoch": 0.39, "grad_norm": 12.652429580688477, "learning_rate": 9.447979891997762e-06, "loss": 1.4084, "step": 1257 }, { "epoch": 0.39, "grad_norm": 17.48835563659668, "learning_rate": 9.441524575080714e-06, "loss": 1.9351, "step": 1258 }, { "epoch": 0.39, "grad_norm": 16.853721618652344, "learning_rate": 9.435066991554565e-06, "loss": 2.0826, "step": 1259 }, { "epoch": 0.39, "grad_norm": 11.199191093444824, "learning_rate": 9.428607147539601e-06, "loss": 1.7517, "step": 1260 }, { "epoch": 0.39, "grad_norm": 10.818768501281738, "learning_rate": 9.422145049158256e-06, "loss": 1.6581, "step": 1261 }, { "epoch": 0.39, "grad_norm": 14.718265533447266, "learning_rate": 9.415680702535092e-06, "loss": 2.4333, "step": 1262 }, { "epoch": 0.39, "grad_norm": 16.43951416015625, "learning_rate": 9.409214113796806e-06, "loss": 1.7366, "step": 1263 }, { "epoch": 0.39, "grad_norm": 13.1856050491333, "learning_rate": 9.402745289072221e-06, "loss": 5.1645, "step": 1264 }, { "epoch": 0.39, "grad_norm": 14.550506591796875, "learning_rate": 9.396274234492283e-06, "loss": 8.1488, "step": 1265 }, { "epoch": 0.39, "grad_norm": 15.178678512573242, "learning_rate": 9.389800956190039e-06, "loss": 1.7309, "step": 1266 }, { "epoch": 0.39, "grad_norm": 16.878833770751953, "learning_rate": 9.383325460300657e-06, "loss": 1.7562, "step": 1267 }, { "epoch": 0.39, "grad_norm": 14.218253135681152, "learning_rate": 9.376847752961399e-06, "loss": 3.9006, "step": 1268 }, { "epoch": 0.39, "grad_norm": 17.071884155273438, "learning_rate": 9.370367840311625e-06, "loss": 5.4157, "step": 1269 }, { "epoch": 0.39, "grad_norm": 18.59787940979004, "learning_rate": 9.363885728492784e-06, "loss": 3.4909, "step": 1270 }, { "epoch": 0.39, "grad_norm": 13.29552936553955, "learning_rate": 9.357401423648413e-06, "loss": 1.5064, "step": 1271 }, { "epoch": 0.39, "grad_norm": 15.856002807617188, "learning_rate": 9.350914931924126e-06, "loss": 4.8737, "step": 1272 }, { "epoch": 0.39, "grad_norm": 9.875200271606445, "learning_rate": 9.344426259467604e-06, "loss": 1.3462, "step": 1273 }, { "epoch": 0.39, "grad_norm": 14.881725311279297, "learning_rate": 9.337935412428602e-06, "loss": 2.0532, "step": 1274 }, { "epoch": 0.4, "grad_norm": 16.499113082885742, "learning_rate": 9.331442396958935e-06, "loss": 1.6289, "step": 1275 }, { "epoch": 0.4, "grad_norm": 22.04961395263672, "learning_rate": 9.324947219212467e-06, "loss": 4.3416, "step": 1276 }, { "epoch": 0.4, "grad_norm": 12.246705055236816, "learning_rate": 9.318449885345123e-06, "loss": 0.8192, "step": 1277 }, { "epoch": 0.4, "grad_norm": 25.260831832885742, "learning_rate": 9.311950401514857e-06, "loss": 3.559, "step": 1278 }, { "epoch": 0.4, "grad_norm": 16.55936622619629, "learning_rate": 9.305448773881671e-06, "loss": 1.8204, "step": 1279 }, { "epoch": 0.4, "grad_norm": 13.88330364227295, "learning_rate": 9.298945008607598e-06, "loss": 2.2244, "step": 1280 }, { "epoch": 0.4, "grad_norm": 10.431914329528809, "learning_rate": 9.292439111856692e-06, "loss": 2.5618, "step": 1281 }, { "epoch": 0.4, "grad_norm": 10.331016540527344, "learning_rate": 9.285931089795032e-06, "loss": 1.8892, "step": 1282 }, { "epoch": 0.4, "grad_norm": 7.850904941558838, "learning_rate": 9.279420948590708e-06, "loss": 0.8145, "step": 1283 }, { "epoch": 0.4, "grad_norm": 13.035999298095703, "learning_rate": 9.272908694413818e-06, "loss": 5.0167, "step": 1284 }, { "epoch": 0.4, "grad_norm": 13.117691040039062, "learning_rate": 9.26639433343647e-06, "loss": 1.2939, "step": 1285 }, { "epoch": 0.4, "grad_norm": 10.571340560913086, "learning_rate": 9.259877871832759e-06, "loss": 1.8576, "step": 1286 }, { "epoch": 0.4, "grad_norm": 15.457513809204102, "learning_rate": 9.253359315778775e-06, "loss": 3.5042, "step": 1287 }, { "epoch": 0.4, "grad_norm": 13.027514457702637, "learning_rate": 9.246838671452594e-06, "loss": 1.1628, "step": 1288 }, { "epoch": 0.4, "grad_norm": 10.462739944458008, "learning_rate": 9.240315945034274e-06, "loss": 1.2288, "step": 1289 }, { "epoch": 0.4, "grad_norm": 16.443859100341797, "learning_rate": 9.233791142705834e-06, "loss": 3.6546, "step": 1290 }, { "epoch": 0.4, "grad_norm": 12.439101219177246, "learning_rate": 9.227264270651276e-06, "loss": 2.3072, "step": 1291 }, { "epoch": 0.4, "grad_norm": 20.032997131347656, "learning_rate": 9.220735335056554e-06, "loss": 1.4739, "step": 1292 }, { "epoch": 0.4, "grad_norm": 17.97390365600586, "learning_rate": 9.214204342109585e-06, "loss": 2.3818, "step": 1293 }, { "epoch": 0.4, "grad_norm": 13.600703239440918, "learning_rate": 9.207671298000225e-06, "loss": 2.8134, "step": 1294 }, { "epoch": 0.4, "grad_norm": 12.583927154541016, "learning_rate": 9.201136208920286e-06, "loss": 2.2068, "step": 1295 }, { "epoch": 0.4, "grad_norm": 13.71022891998291, "learning_rate": 9.194599081063507e-06, "loss": 1.2469, "step": 1296 }, { "epoch": 0.4, "grad_norm": 17.376754760742188, "learning_rate": 9.188059920625567e-06, "loss": 2.8039, "step": 1297 }, { "epoch": 0.4, "grad_norm": 24.509260177612305, "learning_rate": 9.181518733804072e-06, "loss": 6.0718, "step": 1298 }, { "epoch": 0.4, "grad_norm": 10.079828262329102, "learning_rate": 9.174975526798544e-06, "loss": 1.4315, "step": 1299 }, { "epoch": 0.4, "grad_norm": 18.6185302734375, "learning_rate": 9.16843030581042e-06, "loss": 2.0979, "step": 1300 }, { "epoch": 0.4, "grad_norm": 22.407678604125977, "learning_rate": 9.161883077043049e-06, "loss": 3.9683, "step": 1301 }, { "epoch": 0.4, "grad_norm": 11.654013633728027, "learning_rate": 9.155333846701682e-06, "loss": 2.2643, "step": 1302 }, { "epoch": 0.4, "grad_norm": 9.130629539489746, "learning_rate": 9.148782620993464e-06, "loss": 1.0969, "step": 1303 }, { "epoch": 0.4, "grad_norm": 15.498295783996582, "learning_rate": 9.142229406127434e-06, "loss": 3.0316, "step": 1304 }, { "epoch": 0.4, "grad_norm": 12.560232162475586, "learning_rate": 9.135674208314516e-06, "loss": 3.3885, "step": 1305 }, { "epoch": 0.4, "grad_norm": 15.263082504272461, "learning_rate": 9.129117033767514e-06, "loss": 1.6005, "step": 1306 }, { "epoch": 0.4, "grad_norm": 14.036645889282227, "learning_rate": 9.122557888701104e-06, "loss": 3.1104, "step": 1307 }, { "epoch": 0.41, "grad_norm": 12.84864616394043, "learning_rate": 9.115996779331826e-06, "loss": 1.8887, "step": 1308 }, { "epoch": 0.41, "grad_norm": 12.828146934509277, "learning_rate": 9.109433711878092e-06, "loss": 2.8117, "step": 1309 }, { "epoch": 0.41, "grad_norm": 12.600400924682617, "learning_rate": 9.102868692560162e-06, "loss": 4.8946, "step": 1310 }, { "epoch": 0.41, "grad_norm": 15.0155611038208, "learning_rate": 9.096301727600144e-06, "loss": 3.2513, "step": 1311 }, { "epoch": 0.41, "grad_norm": 12.661171913146973, "learning_rate": 9.089732823221997e-06, "loss": 1.1866, "step": 1312 }, { "epoch": 0.41, "grad_norm": 22.12264060974121, "learning_rate": 9.083161985651512e-06, "loss": 1.7744, "step": 1313 }, { "epoch": 0.41, "grad_norm": 14.373431205749512, "learning_rate": 9.076589221116318e-06, "loss": 1.8977, "step": 1314 }, { "epoch": 0.41, "grad_norm": 17.13340187072754, "learning_rate": 9.070014535845864e-06, "loss": 4.0427, "step": 1315 }, { "epoch": 0.41, "grad_norm": 19.54230499267578, "learning_rate": 9.063437936071424e-06, "loss": 3.5795, "step": 1316 }, { "epoch": 0.41, "grad_norm": 23.94930076599121, "learning_rate": 9.056859428026088e-06, "loss": 3.257, "step": 1317 }, { "epoch": 0.41, "grad_norm": 13.285595893859863, "learning_rate": 9.050279017944747e-06, "loss": 1.9428, "step": 1318 }, { "epoch": 0.41, "grad_norm": 10.203804969787598, "learning_rate": 9.043696712064099e-06, "loss": 1.1842, "step": 1319 }, { "epoch": 0.41, "grad_norm": 19.172683715820312, "learning_rate": 9.037112516622643e-06, "loss": 4.4253, "step": 1320 }, { "epoch": 0.41, "grad_norm": 11.565506935119629, "learning_rate": 9.030526437860664e-06, "loss": 2.852, "step": 1321 }, { "epoch": 0.41, "grad_norm": 8.095751762390137, "learning_rate": 9.02393848202023e-06, "loss": 1.434, "step": 1322 }, { "epoch": 0.41, "grad_norm": 16.259660720825195, "learning_rate": 9.017348655345196e-06, "loss": 1.5106, "step": 1323 }, { "epoch": 0.41, "grad_norm": 13.453413963317871, "learning_rate": 9.01075696408118e-06, "loss": 1.8904, "step": 1324 }, { "epoch": 0.41, "grad_norm": 17.38831901550293, "learning_rate": 9.004163414475576e-06, "loss": 6.3193, "step": 1325 }, { "epoch": 0.41, "grad_norm": 14.774956703186035, "learning_rate": 8.997568012777534e-06, "loss": 1.5099, "step": 1326 }, { "epoch": 0.41, "grad_norm": 17.295808792114258, "learning_rate": 8.990970765237961e-06, "loss": 5.6794, "step": 1327 }, { "epoch": 0.41, "grad_norm": 11.591071128845215, "learning_rate": 8.984371678109513e-06, "loss": 1.3968, "step": 1328 }, { "epoch": 0.41, "grad_norm": 14.689584732055664, "learning_rate": 8.97777075764659e-06, "loss": 2.0836, "step": 1329 }, { "epoch": 0.41, "grad_norm": 18.209320068359375, "learning_rate": 8.971168010105329e-06, "loss": 1.9112, "step": 1330 }, { "epoch": 0.41, "grad_norm": 18.5660343170166, "learning_rate": 8.9645634417436e-06, "loss": 2.5502, "step": 1331 }, { "epoch": 0.41, "grad_norm": 9.953259468078613, "learning_rate": 8.957957058820999e-06, "loss": 0.8663, "step": 1332 }, { "epoch": 0.41, "grad_norm": 13.316336631774902, "learning_rate": 8.951348867598838e-06, "loss": 2.494, "step": 1333 }, { "epoch": 0.41, "grad_norm": 9.528247833251953, "learning_rate": 8.944738874340146e-06, "loss": 1.0284, "step": 1334 }, { "epoch": 0.41, "grad_norm": 12.781307220458984, "learning_rate": 8.93812708530966e-06, "loss": 2.5631, "step": 1335 }, { "epoch": 0.41, "grad_norm": 21.69401741027832, "learning_rate": 8.931513506773818e-06, "loss": 3.1673, "step": 1336 }, { "epoch": 0.41, "grad_norm": 11.969874382019043, "learning_rate": 8.924898145000754e-06, "loss": 2.8022, "step": 1337 }, { "epoch": 0.41, "grad_norm": 14.627640724182129, "learning_rate": 8.918281006260297e-06, "loss": 2.124, "step": 1338 }, { "epoch": 0.41, "grad_norm": 13.691817283630371, "learning_rate": 8.911662096823946e-06, "loss": 1.3643, "step": 1339 }, { "epoch": 0.42, "grad_norm": 16.14912223815918, "learning_rate": 8.905041422964897e-06, "loss": 2.7159, "step": 1340 }, { "epoch": 0.42, "grad_norm": 21.14331817626953, "learning_rate": 8.898418990958003e-06, "loss": 2.4386, "step": 1341 }, { "epoch": 0.42, "grad_norm": 8.493985176086426, "learning_rate": 8.891794807079791e-06, "loss": 1.0417, "step": 1342 }, { "epoch": 0.42, "grad_norm": 7.941287994384766, "learning_rate": 8.88516887760845e-06, "loss": 0.9287, "step": 1343 }, { "epoch": 0.42, "grad_norm": 15.730162620544434, "learning_rate": 8.878541208823817e-06, "loss": 1.849, "step": 1344 }, { "epoch": 0.42, "grad_norm": 11.380057334899902, "learning_rate": 8.871911807007382e-06, "loss": 2.0177, "step": 1345 }, { "epoch": 0.42, "grad_norm": 13.666074752807617, "learning_rate": 8.86528067844227e-06, "loss": 1.9661, "step": 1346 }, { "epoch": 0.42, "grad_norm": 9.915221214294434, "learning_rate": 8.858647829413261e-06, "loss": 1.5253, "step": 1347 }, { "epoch": 0.42, "grad_norm": 19.663179397583008, "learning_rate": 8.852013266206741e-06, "loss": 2.7269, "step": 1348 }, { "epoch": 0.42, "grad_norm": 14.238304138183594, "learning_rate": 8.845376995110742e-06, "loss": 1.6626, "step": 1349 }, { "epoch": 0.42, "grad_norm": 16.315189361572266, "learning_rate": 8.8387390224149e-06, "loss": 4.0402, "step": 1350 }, { "epoch": 0.42, "grad_norm": 8.602688789367676, "learning_rate": 8.832099354410471e-06, "loss": 0.7924, "step": 1351 }, { "epoch": 0.42, "grad_norm": 28.27524185180664, "learning_rate": 8.825457997390316e-06, "loss": 7.6666, "step": 1352 }, { "epoch": 0.42, "grad_norm": 13.535933494567871, "learning_rate": 8.818814957648897e-06, "loss": 1.5314, "step": 1353 }, { "epoch": 0.42, "grad_norm": 12.956826210021973, "learning_rate": 8.812170241482271e-06, "loss": 1.9593, "step": 1354 }, { "epoch": 0.42, "grad_norm": 15.91325855255127, "learning_rate": 8.805523855188082e-06, "loss": 6.0726, "step": 1355 }, { "epoch": 0.42, "grad_norm": 10.713898658752441, "learning_rate": 8.798875805065565e-06, "loss": 1.4066, "step": 1356 }, { "epoch": 0.42, "grad_norm": 11.808228492736816, "learning_rate": 8.792226097415512e-06, "loss": 1.284, "step": 1357 }, { "epoch": 0.42, "grad_norm": 11.787436485290527, "learning_rate": 8.785574738540314e-06, "loss": 1.9648, "step": 1358 }, { "epoch": 0.42, "grad_norm": 10.245705604553223, "learning_rate": 8.778921734743904e-06, "loss": 1.0906, "step": 1359 }, { "epoch": 0.42, "grad_norm": 9.439340591430664, "learning_rate": 8.772267092331784e-06, "loss": 1.159, "step": 1360 }, { "epoch": 0.42, "grad_norm": 24.344266891479492, "learning_rate": 8.765610817611006e-06, "loss": 3.629, "step": 1361 }, { "epoch": 0.42, "grad_norm": 11.541095733642578, "learning_rate": 8.758952916890174e-06, "loss": 1.2108, "step": 1362 }, { "epoch": 0.42, "grad_norm": 12.004585266113281, "learning_rate": 8.752293396479428e-06, "loss": 4.9698, "step": 1363 }, { "epoch": 0.42, "grad_norm": 14.796895980834961, "learning_rate": 8.74563226269044e-06, "loss": 5.4403, "step": 1364 }, { "epoch": 0.42, "grad_norm": 21.8315486907959, "learning_rate": 8.738969521836422e-06, "loss": 3.5713, "step": 1365 }, { "epoch": 0.42, "grad_norm": 13.761943817138672, "learning_rate": 8.732305180232102e-06, "loss": 1.5461, "step": 1366 }, { "epoch": 0.42, "grad_norm": 13.959177017211914, "learning_rate": 8.72563924419372e-06, "loss": 3.3256, "step": 1367 }, { "epoch": 0.42, "grad_norm": 13.9632568359375, "learning_rate": 8.71897172003904e-06, "loss": 2.1056, "step": 1368 }, { "epoch": 0.42, "grad_norm": 21.102760314941406, "learning_rate": 8.71230261408732e-06, "loss": 3.0056, "step": 1369 }, { "epoch": 0.42, "grad_norm": 8.353947639465332, "learning_rate": 8.705631932659325e-06, "loss": 1.3257, "step": 1370 }, { "epoch": 0.42, "grad_norm": 11.950407028198242, "learning_rate": 8.698959682077306e-06, "loss": 0.9416, "step": 1371 }, { "epoch": 0.43, "grad_norm": 10.873854637145996, "learning_rate": 8.692285868665007e-06, "loss": 1.8988, "step": 1372 }, { "epoch": 0.43, "grad_norm": 16.22323226928711, "learning_rate": 8.68561049874765e-06, "loss": 3.3277, "step": 1373 }, { "epoch": 0.43, "grad_norm": 12.70744514465332, "learning_rate": 8.678933578651935e-06, "loss": 2.6929, "step": 1374 }, { "epoch": 0.43, "grad_norm": 14.4379301071167, "learning_rate": 8.672255114706025e-06, "loss": 1.7289, "step": 1375 }, { "epoch": 0.43, "grad_norm": 14.871354103088379, "learning_rate": 8.665575113239554e-06, "loss": 1.4159, "step": 1376 }, { "epoch": 0.43, "grad_norm": 12.862020492553711, "learning_rate": 8.658893580583613e-06, "loss": 1.3267, "step": 1377 }, { "epoch": 0.43, "grad_norm": 12.802543640136719, "learning_rate": 8.652210523070735e-06, "loss": 2.1546, "step": 1378 }, { "epoch": 0.43, "grad_norm": 15.65637493133545, "learning_rate": 8.645525947034904e-06, "loss": 2.8595, "step": 1379 }, { "epoch": 0.43, "grad_norm": 11.872232437133789, "learning_rate": 8.63883985881155e-06, "loss": 1.7195, "step": 1380 }, { "epoch": 0.43, "grad_norm": 11.933382987976074, "learning_rate": 8.632152264737524e-06, "loss": 1.3872, "step": 1381 }, { "epoch": 0.43, "grad_norm": 10.490205764770508, "learning_rate": 8.62546317115111e-06, "loss": 1.3498, "step": 1382 }, { "epoch": 0.43, "grad_norm": 16.36244773864746, "learning_rate": 8.618772584392016e-06, "loss": 3.7617, "step": 1383 }, { "epoch": 0.43, "grad_norm": 9.92510986328125, "learning_rate": 8.612080510801363e-06, "loss": 1.2954, "step": 1384 }, { "epoch": 0.43, "grad_norm": 11.136679649353027, "learning_rate": 8.605386956721675e-06, "loss": 1.1087, "step": 1385 }, { "epoch": 0.43, "grad_norm": 15.491793632507324, "learning_rate": 8.59869192849689e-06, "loss": 2.1803, "step": 1386 }, { "epoch": 0.43, "grad_norm": 16.49875831604004, "learning_rate": 8.591995432472336e-06, "loss": 5.0587, "step": 1387 }, { "epoch": 0.43, "grad_norm": 11.55068302154541, "learning_rate": 8.58529747499473e-06, "loss": 2.4693, "step": 1388 }, { "epoch": 0.43, "grad_norm": 9.14673137664795, "learning_rate": 8.578598062412183e-06, "loss": 1.4954, "step": 1389 }, { "epoch": 0.43, "grad_norm": 12.553098678588867, "learning_rate": 8.571897201074177e-06, "loss": 1.6114, "step": 1390 }, { "epoch": 0.43, "grad_norm": 17.280818939208984, "learning_rate": 8.565194897331574e-06, "loss": 3.3327, "step": 1391 }, { "epoch": 0.43, "grad_norm": 13.013246536254883, "learning_rate": 8.558491157536597e-06, "loss": 0.8913, "step": 1392 }, { "epoch": 0.43, "grad_norm": 14.715285301208496, "learning_rate": 8.551785988042829e-06, "loss": 1.5306, "step": 1393 }, { "epoch": 0.43, "grad_norm": 14.097271919250488, "learning_rate": 8.545079395205217e-06, "loss": 6.3287, "step": 1394 }, { "epoch": 0.43, "grad_norm": 13.477266311645508, "learning_rate": 8.538371385380045e-06, "loss": 2.2853, "step": 1395 }, { "epoch": 0.43, "grad_norm": 12.329754829406738, "learning_rate": 8.531661964924951e-06, "loss": 1.3214, "step": 1396 }, { "epoch": 0.43, "grad_norm": 15.463603019714355, "learning_rate": 8.524951140198904e-06, "loss": 5.906, "step": 1397 }, { "epoch": 0.43, "grad_norm": 11.154081344604492, "learning_rate": 8.518238917562208e-06, "loss": 1.3551, "step": 1398 }, { "epoch": 0.43, "grad_norm": 17.804229736328125, "learning_rate": 8.511525303376481e-06, "loss": 2.2535, "step": 1399 }, { "epoch": 0.43, "grad_norm": 12.435379028320312, "learning_rate": 8.504810304004674e-06, "loss": 1.1441, "step": 1400 }, { "epoch": 0.43, "grad_norm": 13.349984169006348, "learning_rate": 8.498093925811045e-06, "loss": 2.4738, "step": 1401 }, { "epoch": 0.43, "grad_norm": 11.454378128051758, "learning_rate": 8.491376175161158e-06, "loss": 1.077, "step": 1402 }, { "epoch": 0.43, "grad_norm": 19.132919311523438, "learning_rate": 8.484657058421874e-06, "loss": 4.4179, "step": 1403 }, { "epoch": 0.44, "grad_norm": 9.526947021484375, "learning_rate": 8.477936581961358e-06, "loss": 1.3308, "step": 1404 }, { "epoch": 0.44, "grad_norm": 18.873443603515625, "learning_rate": 8.471214752149058e-06, "loss": 4.2097, "step": 1405 }, { "epoch": 0.44, "grad_norm": 12.703725814819336, "learning_rate": 8.464491575355702e-06, "loss": 2.342, "step": 1406 }, { "epoch": 0.44, "grad_norm": 13.23741626739502, "learning_rate": 8.4577670579533e-06, "loss": 1.5917, "step": 1407 }, { "epoch": 0.44, "grad_norm": 14.981216430664062, "learning_rate": 8.451041206315132e-06, "loss": 2.0091, "step": 1408 }, { "epoch": 0.44, "grad_norm": 18.34746742248535, "learning_rate": 8.444314026815741e-06, "loss": 7.7797, "step": 1409 }, { "epoch": 0.44, "grad_norm": 15.101547241210938, "learning_rate": 8.437585525830926e-06, "loss": 5.0375, "step": 1410 }, { "epoch": 0.44, "grad_norm": 22.929941177368164, "learning_rate": 8.430855709737744e-06, "loss": 3.8084, "step": 1411 }, { "epoch": 0.44, "grad_norm": 17.568201065063477, "learning_rate": 8.424124584914495e-06, "loss": 1.9364, "step": 1412 }, { "epoch": 0.44, "grad_norm": 10.255859375, "learning_rate": 8.417392157740718e-06, "loss": 1.0762, "step": 1413 }, { "epoch": 0.44, "grad_norm": 15.532630920410156, "learning_rate": 8.41065843459719e-06, "loss": 5.2018, "step": 1414 }, { "epoch": 0.44, "grad_norm": 14.951210975646973, "learning_rate": 8.403923421865912e-06, "loss": 1.2097, "step": 1415 }, { "epoch": 0.44, "grad_norm": 15.789084434509277, "learning_rate": 8.397187125930114e-06, "loss": 1.6282, "step": 1416 }, { "epoch": 0.44, "grad_norm": 13.053868293762207, "learning_rate": 8.390449553174235e-06, "loss": 2.3437, "step": 1417 }, { "epoch": 0.44, "grad_norm": 17.386356353759766, "learning_rate": 8.383710709983924e-06, "loss": 2.5349, "step": 1418 }, { "epoch": 0.44, "grad_norm": 14.942131042480469, "learning_rate": 8.376970602746046e-06, "loss": 1.484, "step": 1419 }, { "epoch": 0.44, "grad_norm": 13.587835311889648, "learning_rate": 8.370229237848645e-06, "loss": 5.3537, "step": 1420 }, { "epoch": 0.44, "grad_norm": 15.863311767578125, "learning_rate": 8.363486621680974e-06, "loss": 1.8422, "step": 1421 }, { "epoch": 0.44, "grad_norm": 15.610124588012695, "learning_rate": 8.356742760633463e-06, "loss": 1.3224, "step": 1422 }, { "epoch": 0.44, "grad_norm": 13.711703300476074, "learning_rate": 8.349997661097723e-06, "loss": 3.4481, "step": 1423 }, { "epoch": 0.44, "grad_norm": 16.342193603515625, "learning_rate": 8.343251329466537e-06, "loss": 2.0732, "step": 1424 }, { "epoch": 0.44, "grad_norm": 25.153188705444336, "learning_rate": 8.336503772133865e-06, "loss": 5.9056, "step": 1425 }, { "epoch": 0.44, "grad_norm": 13.40982723236084, "learning_rate": 8.329754995494821e-06, "loss": 2.0932, "step": 1426 }, { "epoch": 0.44, "grad_norm": 13.98735523223877, "learning_rate": 8.32300500594567e-06, "loss": 2.5621, "step": 1427 }, { "epoch": 0.44, "grad_norm": 12.737622261047363, "learning_rate": 8.316253809883834e-06, "loss": 1.433, "step": 1428 }, { "epoch": 0.44, "grad_norm": 11.427014350891113, "learning_rate": 8.309501413707881e-06, "loss": 1.0897, "step": 1429 }, { "epoch": 0.44, "grad_norm": 19.95100975036621, "learning_rate": 8.302747823817512e-06, "loss": 5.7549, "step": 1430 }, { "epoch": 0.44, "grad_norm": 11.455700874328613, "learning_rate": 8.295993046613552e-06, "loss": 1.9752, "step": 1431 }, { "epoch": 0.44, "grad_norm": 13.410773277282715, "learning_rate": 8.289237088497966e-06, "loss": 2.232, "step": 1432 }, { "epoch": 0.44, "grad_norm": 9.498603820800781, "learning_rate": 8.282479955873832e-06, "loss": 1.2018, "step": 1433 }, { "epoch": 0.44, "grad_norm": 13.638213157653809, "learning_rate": 8.275721655145333e-06, "loss": 2.6624, "step": 1434 }, { "epoch": 0.44, "grad_norm": 13.837607383728027, "learning_rate": 8.268962192717772e-06, "loss": 2.8477, "step": 1435 }, { "epoch": 0.44, "grad_norm": 8.231826782226562, "learning_rate": 8.262201574997547e-06, "loss": 1.0987, "step": 1436 }, { "epoch": 0.45, "grad_norm": 14.587052345275879, "learning_rate": 8.255439808392148e-06, "loss": 6.8916, "step": 1437 }, { "epoch": 0.45, "grad_norm": 10.058676719665527, "learning_rate": 8.248676899310161e-06, "loss": 1.1781, "step": 1438 }, { "epoch": 0.45, "grad_norm": 13.14787483215332, "learning_rate": 8.241912854161249e-06, "loss": 1.4953, "step": 1439 }, { "epoch": 0.45, "grad_norm": 13.948318481445312, "learning_rate": 8.235147679356154e-06, "loss": 1.6075, "step": 1440 }, { "epoch": 0.45, "grad_norm": 13.693050384521484, "learning_rate": 8.228381381306688e-06, "loss": 1.8597, "step": 1441 }, { "epoch": 0.45, "grad_norm": 15.089034080505371, "learning_rate": 8.221613966425727e-06, "loss": 1.687, "step": 1442 }, { "epoch": 0.45, "grad_norm": 9.826478004455566, "learning_rate": 8.21484544112721e-06, "loss": 1.4379, "step": 1443 }, { "epoch": 0.45, "grad_norm": 12.025218963623047, "learning_rate": 8.208075811826121e-06, "loss": 1.1459, "step": 1444 }, { "epoch": 0.45, "grad_norm": 15.948945045471191, "learning_rate": 8.201305084938495e-06, "loss": 1.8357, "step": 1445 }, { "epoch": 0.45, "grad_norm": 20.572792053222656, "learning_rate": 8.194533266881407e-06, "loss": 3.8687, "step": 1446 }, { "epoch": 0.45, "grad_norm": 19.228185653686523, "learning_rate": 8.187760364072969e-06, "loss": 1.8467, "step": 1447 }, { "epoch": 0.45, "grad_norm": 12.649337768554688, "learning_rate": 8.18098638293231e-06, "loss": 2.9462, "step": 1448 }, { "epoch": 0.45, "grad_norm": 23.110645294189453, "learning_rate": 8.174211329879596e-06, "loss": 2.1425, "step": 1449 }, { "epoch": 0.45, "grad_norm": 14.246688842773438, "learning_rate": 8.167435211335998e-06, "loss": 3.2245, "step": 1450 }, { "epoch": 0.45, "grad_norm": 25.37168312072754, "learning_rate": 8.160658033723702e-06, "loss": 3.2721, "step": 1451 }, { "epoch": 0.45, "grad_norm": 8.314859390258789, "learning_rate": 8.153879803465893e-06, "loss": 1.0462, "step": 1452 }, { "epoch": 0.45, "grad_norm": 11.370716094970703, "learning_rate": 8.14710052698676e-06, "loss": 1.8953, "step": 1453 }, { "epoch": 0.45, "grad_norm": 17.507314682006836, "learning_rate": 8.14032021071148e-06, "loss": 1.5388, "step": 1454 }, { "epoch": 0.45, "grad_norm": 13.845351219177246, "learning_rate": 8.133538861066215e-06, "loss": 6.5737, "step": 1455 }, { "epoch": 0.45, "grad_norm": 17.046667098999023, "learning_rate": 8.12675648447811e-06, "loss": 3.2323, "step": 1456 }, { "epoch": 0.45, "grad_norm": 10.533256530761719, "learning_rate": 8.119973087375277e-06, "loss": 1.7338, "step": 1457 }, { "epoch": 0.45, "grad_norm": 25.88167381286621, "learning_rate": 8.1131886761868e-06, "loss": 5.856, "step": 1458 }, { "epoch": 0.45, "grad_norm": 16.5262451171875, "learning_rate": 8.106403257342725e-06, "loss": 3.2691, "step": 1459 }, { "epoch": 0.45, "grad_norm": 19.846923828125, "learning_rate": 8.099616837274047e-06, "loss": 3.5881, "step": 1460 }, { "epoch": 0.45, "grad_norm": 13.813898086547852, "learning_rate": 8.092829422412718e-06, "loss": 3.8796, "step": 1461 }, { "epoch": 0.45, "grad_norm": 15.77165412902832, "learning_rate": 8.086041019191626e-06, "loss": 2.4237, "step": 1462 }, { "epoch": 0.45, "grad_norm": 13.56583023071289, "learning_rate": 8.0792516340446e-06, "loss": 2.3957, "step": 1463 }, { "epoch": 0.45, "grad_norm": 11.302949905395508, "learning_rate": 8.072461273406399e-06, "loss": 5.0651, "step": 1464 }, { "epoch": 0.45, "grad_norm": 13.356122016906738, "learning_rate": 8.065669943712703e-06, "loss": 1.3629, "step": 1465 }, { "epoch": 0.45, "grad_norm": 17.472110748291016, "learning_rate": 8.058877651400112e-06, "loss": 4.4384, "step": 1466 }, { "epoch": 0.45, "grad_norm": 18.280729293823242, "learning_rate": 8.052084402906144e-06, "loss": 2.0282, "step": 1467 }, { "epoch": 0.45, "grad_norm": 14.480973243713379, "learning_rate": 8.045290204669218e-06, "loss": 2.558, "step": 1468 }, { "epoch": 0.46, "grad_norm": 17.44344711303711, "learning_rate": 8.03849506312865e-06, "loss": 2.1654, "step": 1469 }, { "epoch": 0.46, "grad_norm": 16.810806274414062, "learning_rate": 8.031698984724656e-06, "loss": 2.3641, "step": 1470 }, { "epoch": 0.46, "grad_norm": 15.981531143188477, "learning_rate": 8.024901975898337e-06, "loss": 3.1273, "step": 1471 }, { "epoch": 0.46, "grad_norm": 18.475582122802734, "learning_rate": 8.018104043091681e-06, "loss": 1.7297, "step": 1472 }, { "epoch": 0.46, "grad_norm": 15.845691680908203, "learning_rate": 8.011305192747539e-06, "loss": 5.2557, "step": 1473 }, { "epoch": 0.46, "grad_norm": 14.488033294677734, "learning_rate": 8.004505431309649e-06, "loss": 4.55, "step": 1474 }, { "epoch": 0.46, "grad_norm": 10.891829490661621, "learning_rate": 7.997704765222598e-06, "loss": 4.3927, "step": 1475 }, { "epoch": 0.46, "grad_norm": 17.33945083618164, "learning_rate": 7.990903200931836e-06, "loss": 5.5007, "step": 1476 }, { "epoch": 0.46, "grad_norm": 22.563276290893555, "learning_rate": 7.984100744883664e-06, "loss": 3.6111, "step": 1477 }, { "epoch": 0.46, "grad_norm": 10.822282791137695, "learning_rate": 7.97729740352523e-06, "loss": 1.7473, "step": 1478 }, { "epoch": 0.46, "grad_norm": 22.096668243408203, "learning_rate": 7.97049318330452e-06, "loss": 3.3873, "step": 1479 }, { "epoch": 0.46, "grad_norm": 11.21049976348877, "learning_rate": 7.96368809067035e-06, "loss": 2.4169, "step": 1480 }, { "epoch": 0.46, "grad_norm": 18.811819076538086, "learning_rate": 7.956882132072366e-06, "loss": 4.7143, "step": 1481 }, { "epoch": 0.46, "grad_norm": 10.399057388305664, "learning_rate": 7.950075313961034e-06, "loss": 1.4499, "step": 1482 }, { "epoch": 0.46, "grad_norm": 14.199051856994629, "learning_rate": 7.943267642787636e-06, "loss": 1.3204, "step": 1483 }, { "epoch": 0.46, "grad_norm": 10.940443992614746, "learning_rate": 7.936459125004257e-06, "loss": 2.3052, "step": 1484 }, { "epoch": 0.46, "grad_norm": 18.73052406311035, "learning_rate": 7.929649767063791e-06, "loss": 1.48, "step": 1485 }, { "epoch": 0.46, "grad_norm": 9.472209930419922, "learning_rate": 7.92283957541993e-06, "loss": 1.8377, "step": 1486 }, { "epoch": 0.46, "grad_norm": 9.428339958190918, "learning_rate": 7.916028556527143e-06, "loss": 0.9211, "step": 1487 }, { "epoch": 0.46, "grad_norm": 18.35672378540039, "learning_rate": 7.909216716840696e-06, "loss": 6.0892, "step": 1488 }, { "epoch": 0.46, "grad_norm": 16.624366760253906, "learning_rate": 7.902404062816633e-06, "loss": 5.7379, "step": 1489 }, { "epoch": 0.46, "grad_norm": 8.005545616149902, "learning_rate": 7.89559060091176e-06, "loss": 0.5775, "step": 1490 }, { "epoch": 0.46, "grad_norm": 14.2269287109375, "learning_rate": 7.888776337583654e-06, "loss": 4.0219, "step": 1491 }, { "epoch": 0.46, "grad_norm": 16.19810676574707, "learning_rate": 7.881961279290658e-06, "loss": 2.1824, "step": 1492 }, { "epoch": 0.46, "grad_norm": 17.658531188964844, "learning_rate": 7.875145432491855e-06, "loss": 4.4864, "step": 1493 }, { "epoch": 0.46, "grad_norm": 17.673933029174805, "learning_rate": 7.868328803647085e-06, "loss": 6.9672, "step": 1494 }, { "epoch": 0.46, "grad_norm": 9.747373580932617, "learning_rate": 7.861511399216929e-06, "loss": 1.101, "step": 1495 }, { "epoch": 0.46, "grad_norm": 12.639095306396484, "learning_rate": 7.854693225662701e-06, "loss": 5.1661, "step": 1496 }, { "epoch": 0.46, "grad_norm": 13.06181812286377, "learning_rate": 7.847874289446439e-06, "loss": 1.5592, "step": 1497 }, { "epoch": 0.46, "grad_norm": 18.21415138244629, "learning_rate": 7.841054597030913e-06, "loss": 2.8689, "step": 1498 }, { "epoch": 0.46, "grad_norm": 16.860862731933594, "learning_rate": 7.834234154879605e-06, "loss": 2.0175, "step": 1499 }, { "epoch": 0.46, "grad_norm": 15.08491039276123, "learning_rate": 7.827412969456708e-06, "loss": 1.1815, "step": 1500 }, { "epoch": 0.47, "grad_norm": 13.941483497619629, "learning_rate": 7.820591047227116e-06, "loss": 1.7555, "step": 1501 }, { "epoch": 0.47, "grad_norm": 16.32042694091797, "learning_rate": 7.81376839465643e-06, "loss": 4.6121, "step": 1502 }, { "epoch": 0.47, "grad_norm": 10.87294864654541, "learning_rate": 7.806945018210935e-06, "loss": 1.4974, "step": 1503 }, { "epoch": 0.47, "grad_norm": 12.907079696655273, "learning_rate": 7.800120924357604e-06, "loss": 2.2645, "step": 1504 }, { "epoch": 0.47, "grad_norm": 7.117086887359619, "learning_rate": 7.793296119564092e-06, "loss": 0.8516, "step": 1505 }, { "epoch": 0.47, "grad_norm": 15.370654106140137, "learning_rate": 7.786470610298726e-06, "loss": 1.7695, "step": 1506 }, { "epoch": 0.47, "grad_norm": 16.78589630126953, "learning_rate": 7.779644403030505e-06, "loss": 3.1191, "step": 1507 }, { "epoch": 0.47, "grad_norm": 10.148062705993652, "learning_rate": 7.772817504229082e-06, "loss": 1.8681, "step": 1508 }, { "epoch": 0.47, "grad_norm": 12.689626693725586, "learning_rate": 7.765989920364767e-06, "loss": 2.2163, "step": 1509 }, { "epoch": 0.47, "grad_norm": 14.091196060180664, "learning_rate": 7.759161657908529e-06, "loss": 2.1025, "step": 1510 }, { "epoch": 0.47, "grad_norm": 9.173744201660156, "learning_rate": 7.752332723331967e-06, "loss": 1.3896, "step": 1511 }, { "epoch": 0.47, "grad_norm": 17.604738235473633, "learning_rate": 7.745503123107325e-06, "loss": 2.8429, "step": 1512 }, { "epoch": 0.47, "grad_norm": 8.202844619750977, "learning_rate": 7.738672863707473e-06, "loss": 0.9668, "step": 1513 }, { "epoch": 0.47, "grad_norm": 17.14142417907715, "learning_rate": 7.731841951605915e-06, "loss": 4.8675, "step": 1514 }, { "epoch": 0.47, "grad_norm": 13.663723945617676, "learning_rate": 7.72501039327676e-06, "loss": 2.7225, "step": 1515 }, { "epoch": 0.47, "grad_norm": 15.818323135375977, "learning_rate": 7.71817819519474e-06, "loss": 5.5613, "step": 1516 }, { "epoch": 0.47, "grad_norm": 13.62367057800293, "learning_rate": 7.711345363835186e-06, "loss": 3.1708, "step": 1517 }, { "epoch": 0.47, "grad_norm": 13.760972023010254, "learning_rate": 7.70451190567404e-06, "loss": 2.2154, "step": 1518 }, { "epoch": 0.47, "grad_norm": 16.931819915771484, "learning_rate": 7.697677827187824e-06, "loss": 1.8667, "step": 1519 }, { "epoch": 0.47, "grad_norm": 13.671175003051758, "learning_rate": 7.69084313485366e-06, "loss": 2.2861, "step": 1520 }, { "epoch": 0.47, "grad_norm": 12.393848419189453, "learning_rate": 7.684007835149245e-06, "loss": 1.9357, "step": 1521 }, { "epoch": 0.47, "grad_norm": 11.369022369384766, "learning_rate": 7.67717193455285e-06, "loss": 1.3349, "step": 1522 }, { "epoch": 0.47, "grad_norm": 15.161210060119629, "learning_rate": 7.670335439543325e-06, "loss": 2.1201, "step": 1523 }, { "epoch": 0.47, "grad_norm": 15.16026782989502, "learning_rate": 7.663498356600075e-06, "loss": 2.0202, "step": 1524 }, { "epoch": 0.47, "grad_norm": 6.391641616821289, "learning_rate": 7.656660692203063e-06, "loss": 0.7958, "step": 1525 }, { "epoch": 0.47, "grad_norm": 14.75525951385498, "learning_rate": 7.649822452832805e-06, "loss": 2.917, "step": 1526 }, { "epoch": 0.47, "grad_norm": 16.168161392211914, "learning_rate": 7.642983644970363e-06, "loss": 4.8536, "step": 1527 }, { "epoch": 0.47, "grad_norm": 17.493988037109375, "learning_rate": 7.636144275097334e-06, "loss": 5.3522, "step": 1528 }, { "epoch": 0.47, "grad_norm": 16.88285255432129, "learning_rate": 7.629304349695853e-06, "loss": 6.0407, "step": 1529 }, { "epoch": 0.47, "grad_norm": 16.397403717041016, "learning_rate": 7.622463875248577e-06, "loss": 2.8976, "step": 1530 }, { "epoch": 0.47, "grad_norm": 20.834239959716797, "learning_rate": 7.615622858238685e-06, "loss": 2.0012, "step": 1531 }, { "epoch": 0.47, "grad_norm": 13.2479887008667, "learning_rate": 7.608781305149871e-06, "loss": 1.7579, "step": 1532 }, { "epoch": 0.47, "grad_norm": 8.735552787780762, "learning_rate": 7.6019392224663325e-06, "loss": 0.7553, "step": 1533 }, { "epoch": 0.48, "grad_norm": 14.764557838439941, "learning_rate": 7.595096616672776e-06, "loss": 3.706, "step": 1534 }, { "epoch": 0.48, "grad_norm": 10.473408699035645, "learning_rate": 7.588253494254403e-06, "loss": 1.5389, "step": 1535 }, { "epoch": 0.48, "grad_norm": 15.223381996154785, "learning_rate": 7.581409861696897e-06, "loss": 4.5035, "step": 1536 }, { "epoch": 0.48, "grad_norm": 14.932395935058594, "learning_rate": 7.574565725486429e-06, "loss": 3.2436, "step": 1537 }, { "epoch": 0.48, "grad_norm": 15.13408088684082, "learning_rate": 7.567721092109656e-06, "loss": 1.5579, "step": 1538 }, { "epoch": 0.48, "grad_norm": 13.730198860168457, "learning_rate": 7.560875968053692e-06, "loss": 2.021, "step": 1539 }, { "epoch": 0.48, "grad_norm": 27.59801483154297, "learning_rate": 7.5540303598061235e-06, "loss": 3.208, "step": 1540 }, { "epoch": 0.48, "grad_norm": 16.457393646240234, "learning_rate": 7.547184273854998e-06, "loss": 2.2529, "step": 1541 }, { "epoch": 0.48, "grad_norm": 13.959450721740723, "learning_rate": 7.540337716688809e-06, "loss": 2.5864, "step": 1542 }, { "epoch": 0.48, "grad_norm": 14.607627868652344, "learning_rate": 7.533490694796502e-06, "loss": 4.8759, "step": 1543 }, { "epoch": 0.48, "grad_norm": 14.483607292175293, "learning_rate": 7.526643214667463e-06, "loss": 1.1964, "step": 1544 }, { "epoch": 0.48, "grad_norm": 17.84099578857422, "learning_rate": 7.519795282791509e-06, "loss": 1.5788, "step": 1545 }, { "epoch": 0.48, "grad_norm": 11.038904190063477, "learning_rate": 7.5129469056588865e-06, "loss": 1.5326, "step": 1546 }, { "epoch": 0.48, "grad_norm": 18.416704177856445, "learning_rate": 7.506098089760266e-06, "loss": 5.7714, "step": 1547 }, { "epoch": 0.48, "grad_norm": 20.013147354125977, "learning_rate": 7.4992488415867285e-06, "loss": 2.925, "step": 1548 }, { "epoch": 0.48, "grad_norm": 15.000293731689453, "learning_rate": 7.492399167629774e-06, "loss": 5.5733, "step": 1549 }, { "epoch": 0.48, "grad_norm": 15.219602584838867, "learning_rate": 7.485549074381296e-06, "loss": 4.8389, "step": 1550 }, { "epoch": 0.48, "grad_norm": 12.503700256347656, "learning_rate": 7.4786985683335895e-06, "loss": 1.1435, "step": 1551 }, { "epoch": 0.48, "grad_norm": 14.639745712280273, "learning_rate": 7.471847655979343e-06, "loss": 2.0322, "step": 1552 }, { "epoch": 0.48, "grad_norm": 13.07108211517334, "learning_rate": 7.46499634381163e-06, "loss": 2.7517, "step": 1553 }, { "epoch": 0.48, "grad_norm": 12.419761657714844, "learning_rate": 7.4581446383238945e-06, "loss": 1.1141, "step": 1554 }, { "epoch": 0.48, "grad_norm": 17.002758026123047, "learning_rate": 7.451292546009965e-06, "loss": 1.991, "step": 1555 }, { "epoch": 0.48, "grad_norm": 14.649391174316406, "learning_rate": 7.444440073364034e-06, "loss": 1.7529, "step": 1556 }, { "epoch": 0.48, "grad_norm": 17.31776237487793, "learning_rate": 7.437587226880645e-06, "loss": 2.3223, "step": 1557 }, { "epoch": 0.48, "grad_norm": 18.440576553344727, "learning_rate": 7.430734013054705e-06, "loss": 2.425, "step": 1558 }, { "epoch": 0.48, "grad_norm": 12.171570777893066, "learning_rate": 7.42388043838147e-06, "loss": 2.0255, "step": 1559 }, { "epoch": 0.48, "grad_norm": 16.10645866394043, "learning_rate": 7.417026509356532e-06, "loss": 1.5669, "step": 1560 }, { "epoch": 0.48, "grad_norm": 16.45768165588379, "learning_rate": 7.410172232475823e-06, "loss": 2.2515, "step": 1561 }, { "epoch": 0.48, "grad_norm": 13.46390438079834, "learning_rate": 7.403317614235602e-06, "loss": 1.03, "step": 1562 }, { "epoch": 0.48, "grad_norm": 12.802998542785645, "learning_rate": 7.396462661132454e-06, "loss": 0.9395, "step": 1563 }, { "epoch": 0.48, "grad_norm": 16.94371223449707, "learning_rate": 7.389607379663279e-06, "loss": 5.0141, "step": 1564 }, { "epoch": 0.48, "grad_norm": 12.43297004699707, "learning_rate": 7.3827517763252906e-06, "loss": 4.5835, "step": 1565 }, { "epoch": 0.49, "grad_norm": 12.801984786987305, "learning_rate": 7.375895857616006e-06, "loss": 0.9518, "step": 1566 }, { "epoch": 0.49, "grad_norm": 13.065658569335938, "learning_rate": 7.369039630033244e-06, "loss": 2.1409, "step": 1567 }, { "epoch": 0.49, "grad_norm": 12.188167572021484, "learning_rate": 7.362183100075111e-06, "loss": 1.1734, "step": 1568 }, { "epoch": 0.49, "grad_norm": 12.572041511535645, "learning_rate": 7.355326274240001e-06, "loss": 2.2057, "step": 1569 }, { "epoch": 0.49, "grad_norm": 14.975041389465332, "learning_rate": 7.348469159026596e-06, "loss": 3.8706, "step": 1570 }, { "epoch": 0.49, "grad_norm": 10.509519577026367, "learning_rate": 7.341611760933841e-06, "loss": 1.0018, "step": 1571 }, { "epoch": 0.49, "grad_norm": 15.144615173339844, "learning_rate": 7.3347540864609555e-06, "loss": 3.0489, "step": 1572 }, { "epoch": 0.49, "grad_norm": 20.42429542541504, "learning_rate": 7.327896142107424e-06, "loss": 5.1546, "step": 1573 }, { "epoch": 0.49, "grad_norm": 16.69227409362793, "learning_rate": 7.321037934372978e-06, "loss": 2.9772, "step": 1574 }, { "epoch": 0.49, "grad_norm": 12.439099311828613, "learning_rate": 7.314179469757607e-06, "loss": 1.4989, "step": 1575 }, { "epoch": 0.49, "grad_norm": 9.062239646911621, "learning_rate": 7.307320754761536e-06, "loss": 1.4532, "step": 1576 }, { "epoch": 0.49, "grad_norm": 21.60405158996582, "learning_rate": 7.300461795885237e-06, "loss": 2.0011, "step": 1577 }, { "epoch": 0.49, "grad_norm": 13.495698928833008, "learning_rate": 7.293602599629405e-06, "loss": 1.81, "step": 1578 }, { "epoch": 0.49, "grad_norm": 11.463644981384277, "learning_rate": 7.286743172494962e-06, "loss": 1.8246, "step": 1579 }, { "epoch": 0.49, "grad_norm": 13.608999252319336, "learning_rate": 7.279883520983053e-06, "loss": 1.7617, "step": 1580 }, { "epoch": 0.49, "grad_norm": 12.266855239868164, "learning_rate": 7.273023651595029e-06, "loss": 1.3808, "step": 1581 }, { "epoch": 0.49, "grad_norm": 16.99720573425293, "learning_rate": 7.26616357083245e-06, "loss": 3.412, "step": 1582 }, { "epoch": 0.49, "grad_norm": 12.8319673538208, "learning_rate": 7.25930328519708e-06, "loss": 2.5051, "step": 1583 }, { "epoch": 0.49, "grad_norm": 14.773306846618652, "learning_rate": 7.252442801190875e-06, "loss": 1.4383, "step": 1584 }, { "epoch": 0.49, "grad_norm": 13.512967109680176, "learning_rate": 7.245582125315974e-06, "loss": 1.763, "step": 1585 }, { "epoch": 0.49, "grad_norm": 16.133769989013672, "learning_rate": 7.238721264074704e-06, "loss": 2.7118, "step": 1586 }, { "epoch": 0.49, "grad_norm": 14.554754257202148, "learning_rate": 7.2318602239695685e-06, "loss": 1.2321, "step": 1587 }, { "epoch": 0.49, "grad_norm": 14.490348815917969, "learning_rate": 7.224999011503238e-06, "loss": 1.9627, "step": 1588 }, { "epoch": 0.49, "grad_norm": 13.49413013458252, "learning_rate": 7.21813763317854e-06, "loss": 2.0481, "step": 1589 }, { "epoch": 0.49, "grad_norm": 12.60959243774414, "learning_rate": 7.211276095498471e-06, "loss": 1.4864, "step": 1590 }, { "epoch": 0.49, "grad_norm": 14.422796249389648, "learning_rate": 7.204414404966173e-06, "loss": 1.6967, "step": 1591 }, { "epoch": 0.49, "grad_norm": 14.81104564666748, "learning_rate": 7.197552568084928e-06, "loss": 1.9473, "step": 1592 }, { "epoch": 0.49, "grad_norm": 15.132305145263672, "learning_rate": 7.190690591358166e-06, "loss": 5.9458, "step": 1593 }, { "epoch": 0.49, "grad_norm": 16.62466049194336, "learning_rate": 7.183828481289444e-06, "loss": 4.3815, "step": 1594 }, { "epoch": 0.49, "grad_norm": 13.80553150177002, "learning_rate": 7.176966244382446e-06, "loss": 6.7008, "step": 1595 }, { "epoch": 0.49, "grad_norm": 12.077177047729492, "learning_rate": 7.170103887140977e-06, "loss": 1.7794, "step": 1596 }, { "epoch": 0.49, "grad_norm": 17.30846405029297, "learning_rate": 7.163241416068956e-06, "loss": 1.4915, "step": 1597 }, { "epoch": 0.5, "grad_norm": 13.239306449890137, "learning_rate": 7.156378837670409e-06, "loss": 1.6999, "step": 1598 }, { "epoch": 0.5, "grad_norm": 14.074838638305664, "learning_rate": 7.149516158449465e-06, "loss": 3.0839, "step": 1599 }, { "epoch": 0.5, "grad_norm": 18.072620391845703, "learning_rate": 7.142653384910344e-06, "loss": 3.7544, "step": 1600 }, { "epoch": 0.5, "grad_norm": 19.225902557373047, "learning_rate": 7.1357905235573654e-06, "loss": 6.3921, "step": 1601 }, { "epoch": 0.5, "grad_norm": 14.992647171020508, "learning_rate": 7.1289275808949235e-06, "loss": 5.8246, "step": 1602 }, { "epoch": 0.5, "grad_norm": 14.987789154052734, "learning_rate": 7.122064563427488e-06, "loss": 2.5646, "step": 1603 }, { "epoch": 0.5, "grad_norm": 13.11773681640625, "learning_rate": 7.115201477659608e-06, "loss": 4.8927, "step": 1604 }, { "epoch": 0.5, "grad_norm": 16.583314895629883, "learning_rate": 7.108338330095892e-06, "loss": 1.8204, "step": 1605 }, { "epoch": 0.5, "grad_norm": 9.47932243347168, "learning_rate": 7.101475127241006e-06, "loss": 1.191, "step": 1606 }, { "epoch": 0.5, "grad_norm": 11.21604061126709, "learning_rate": 7.0946118755996695e-06, "loss": 1.4498, "step": 1607 }, { "epoch": 0.5, "grad_norm": 14.17425537109375, "learning_rate": 7.087748581676651e-06, "loss": 1.8093, "step": 1608 }, { "epoch": 0.5, "grad_norm": 12.51001262664795, "learning_rate": 7.080885251976759e-06, "loss": 1.7423, "step": 1609 }, { "epoch": 0.5, "grad_norm": 14.638405799865723, "learning_rate": 7.074021893004827e-06, "loss": 1.3783, "step": 1610 }, { "epoch": 0.5, "grad_norm": 13.604153633117676, "learning_rate": 7.06715851126573e-06, "loss": 5.2793, "step": 1611 }, { "epoch": 0.5, "grad_norm": 15.841768264770508, "learning_rate": 7.060295113264354e-06, "loss": 5.742, "step": 1612 }, { "epoch": 0.5, "grad_norm": 16.581165313720703, "learning_rate": 7.053431705505605e-06, "loss": 2.986, "step": 1613 }, { "epoch": 0.5, "grad_norm": 23.555326461791992, "learning_rate": 7.046568294494397e-06, "loss": 2.2116, "step": 1614 }, { "epoch": 0.5, "grad_norm": 16.073575973510742, "learning_rate": 7.039704886735647e-06, "loss": 2.3679, "step": 1615 }, { "epoch": 0.5, "grad_norm": 21.275436401367188, "learning_rate": 7.032841488734272e-06, "loss": 2.3565, "step": 1616 }, { "epoch": 0.5, "grad_norm": 21.215978622436523, "learning_rate": 7.025978106995174e-06, "loss": 2.5173, "step": 1617 }, { "epoch": 0.5, "grad_norm": 18.45775032043457, "learning_rate": 7.019114748023242e-06, "loss": 2.4042, "step": 1618 }, { "epoch": 0.5, "grad_norm": 13.314290046691895, "learning_rate": 7.01225141832335e-06, "loss": 2.5132, "step": 1619 }, { "epoch": 0.5, "grad_norm": 24.080631256103516, "learning_rate": 7.005388124400333e-06, "loss": 2.0978, "step": 1620 }, { "epoch": 0.5, "grad_norm": 14.010071754455566, "learning_rate": 6.9985248727589945e-06, "loss": 5.0885, "step": 1621 }, { "epoch": 0.5, "grad_norm": 8.733227729797363, "learning_rate": 6.991661669904109e-06, "loss": 0.8556, "step": 1622 }, { "epoch": 0.5, "grad_norm": 15.029264450073242, "learning_rate": 6.984798522340394e-06, "loss": 1.3525, "step": 1623 }, { "epoch": 0.5, "grad_norm": 16.164785385131836, "learning_rate": 6.977935436572511e-06, "loss": 1.9743, "step": 1624 }, { "epoch": 0.5, "grad_norm": 10.85263729095459, "learning_rate": 6.971072419105078e-06, "loss": 2.056, "step": 1625 }, { "epoch": 0.5, "grad_norm": 14.404804229736328, "learning_rate": 6.964209476442636e-06, "loss": 3.1708, "step": 1626 }, { "epoch": 0.5, "grad_norm": 13.646852493286133, "learning_rate": 6.957346615089655e-06, "loss": 2.62, "step": 1627 }, { "epoch": 0.5, "grad_norm": 23.522275924682617, "learning_rate": 6.9504838415505375e-06, "loss": 1.4201, "step": 1628 }, { "epoch": 0.5, "grad_norm": 16.97157096862793, "learning_rate": 6.943621162329593e-06, "loss": 2.1315, "step": 1629 }, { "epoch": 0.51, "grad_norm": 10.685445785522461, "learning_rate": 6.936758583931044e-06, "loss": 1.3641, "step": 1630 }, { "epoch": 0.51, "grad_norm": 15.746297836303711, "learning_rate": 6.929896112859023e-06, "loss": 2.0654, "step": 1631 }, { "epoch": 0.51, "grad_norm": 17.248628616333008, "learning_rate": 6.923033755617555e-06, "loss": 5.3468, "step": 1632 }, { "epoch": 0.51, "grad_norm": 11.610515594482422, "learning_rate": 6.916171518710556e-06, "loss": 2.0751, "step": 1633 }, { "epoch": 0.51, "grad_norm": 18.109254837036133, "learning_rate": 6.9093094086418344e-06, "loss": 4.59, "step": 1634 }, { "epoch": 0.51, "grad_norm": 19.82781410217285, "learning_rate": 6.902447431915073e-06, "loss": 3.4236, "step": 1635 }, { "epoch": 0.51, "grad_norm": 16.590665817260742, "learning_rate": 6.895585595033829e-06, "loss": 1.6634, "step": 1636 }, { "epoch": 0.51, "grad_norm": 16.019447326660156, "learning_rate": 6.88872390450153e-06, "loss": 2.4793, "step": 1637 }, { "epoch": 0.51, "grad_norm": 12.343634605407715, "learning_rate": 6.881862366821462e-06, "loss": 1.5978, "step": 1638 }, { "epoch": 0.51, "grad_norm": 19.105314254760742, "learning_rate": 6.875000988496763e-06, "loss": 3.1883, "step": 1639 }, { "epoch": 0.51, "grad_norm": 12.569679260253906, "learning_rate": 6.868139776030432e-06, "loss": 2.315, "step": 1640 }, { "epoch": 0.51, "grad_norm": 13.843108177185059, "learning_rate": 6.8612787359252966e-06, "loss": 2.538, "step": 1641 }, { "epoch": 0.51, "grad_norm": 13.337444305419922, "learning_rate": 6.854417874684026e-06, "loss": 1.852, "step": 1642 }, { "epoch": 0.51, "grad_norm": 13.132347106933594, "learning_rate": 6.847557198809127e-06, "loss": 0.8586, "step": 1643 }, { "epoch": 0.51, "grad_norm": 13.987866401672363, "learning_rate": 6.840696714802922e-06, "loss": 2.9912, "step": 1644 }, { "epoch": 0.51, "grad_norm": 17.009370803833008, "learning_rate": 6.8338364291675495e-06, "loss": 4.2418, "step": 1645 }, { "epoch": 0.51, "grad_norm": 12.241983413696289, "learning_rate": 6.826976348404973e-06, "loss": 2.1306, "step": 1646 }, { "epoch": 0.51, "grad_norm": 14.771096229553223, "learning_rate": 6.8201164790169494e-06, "loss": 4.413, "step": 1647 }, { "epoch": 0.51, "grad_norm": 8.216975212097168, "learning_rate": 6.813256827505037e-06, "loss": 1.2711, "step": 1648 }, { "epoch": 0.51, "grad_norm": 17.802562713623047, "learning_rate": 6.806397400370596e-06, "loss": 1.8284, "step": 1649 }, { "epoch": 0.51, "grad_norm": 10.894693374633789, "learning_rate": 6.7995382041147645e-06, "loss": 1.1183, "step": 1650 }, { "epoch": 0.51, "grad_norm": 15.150440216064453, "learning_rate": 6.792679245238464e-06, "loss": 4.8429, "step": 1651 }, { "epoch": 0.51, "grad_norm": 13.411111831665039, "learning_rate": 6.785820530242395e-06, "loss": 2.5226, "step": 1652 }, { "epoch": 0.51, "grad_norm": 7.378268241882324, "learning_rate": 6.7789620656270235e-06, "loss": 0.6978, "step": 1653 }, { "epoch": 0.51, "grad_norm": 15.269695281982422, "learning_rate": 6.772103857892577e-06, "loss": 3.3351, "step": 1654 }, { "epoch": 0.51, "grad_norm": 15.948509216308594, "learning_rate": 6.765245913539046e-06, "loss": 8.719, "step": 1655 }, { "epoch": 0.51, "grad_norm": 11.423715591430664, "learning_rate": 6.7583882390661615e-06, "loss": 1.6719, "step": 1656 }, { "epoch": 0.51, "grad_norm": 10.186993598937988, "learning_rate": 6.751530840973407e-06, "loss": 1.3835, "step": 1657 }, { "epoch": 0.51, "grad_norm": 12.548033714294434, "learning_rate": 6.74467372576e-06, "loss": 1.9285, "step": 1658 }, { "epoch": 0.51, "grad_norm": 16.2474308013916, "learning_rate": 6.737816899924892e-06, "loss": 1.4393, "step": 1659 }, { "epoch": 0.51, "grad_norm": 12.502729415893555, "learning_rate": 6.730960369966756e-06, "loss": 0.9805, "step": 1660 }, { "epoch": 0.51, "grad_norm": 14.059015274047852, "learning_rate": 6.724104142383995e-06, "loss": 2.7396, "step": 1661 }, { "epoch": 0.51, "grad_norm": 14.44599723815918, "learning_rate": 6.717248223674711e-06, "loss": 1.7818, "step": 1662 }, { "epoch": 0.52, "grad_norm": 9.191947937011719, "learning_rate": 6.710392620336721e-06, "loss": 1.2289, "step": 1663 }, { "epoch": 0.52, "grad_norm": 14.764534950256348, "learning_rate": 6.703537338867547e-06, "loss": 1.9445, "step": 1664 }, { "epoch": 0.52, "grad_norm": 13.833651542663574, "learning_rate": 6.696682385764401e-06, "loss": 1.4044, "step": 1665 }, { "epoch": 0.52, "grad_norm": 11.326014518737793, "learning_rate": 6.689827767524177e-06, "loss": 1.3071, "step": 1666 }, { "epoch": 0.52, "grad_norm": 14.583909034729004, "learning_rate": 6.6829734906434685e-06, "loss": 4.8377, "step": 1667 }, { "epoch": 0.52, "grad_norm": 12.085514068603516, "learning_rate": 6.676119561618531e-06, "loss": 1.1961, "step": 1668 }, { "epoch": 0.52, "grad_norm": 19.26174545288086, "learning_rate": 6.669265986945295e-06, "loss": 6.4908, "step": 1669 }, { "epoch": 0.52, "grad_norm": 11.472599983215332, "learning_rate": 6.662412773119357e-06, "loss": 1.5512, "step": 1670 }, { "epoch": 0.52, "grad_norm": 11.320133209228516, "learning_rate": 6.655559926635969e-06, "loss": 1.3109, "step": 1671 }, { "epoch": 0.52, "grad_norm": 17.44525909423828, "learning_rate": 6.648707453990034e-06, "loss": 2.0053, "step": 1672 }, { "epoch": 0.52, "grad_norm": 14.975166320800781, "learning_rate": 6.641855361676106e-06, "loss": 2.8781, "step": 1673 }, { "epoch": 0.52, "grad_norm": 16.109355926513672, "learning_rate": 6.635003656188374e-06, "loss": 5.3664, "step": 1674 }, { "epoch": 0.52, "grad_norm": 16.004348754882812, "learning_rate": 6.628152344020658e-06, "loss": 3.1915, "step": 1675 }, { "epoch": 0.52, "grad_norm": 14.833353996276855, "learning_rate": 6.621301431666412e-06, "loss": 2.4988, "step": 1676 }, { "epoch": 0.52, "grad_norm": 14.966736793518066, "learning_rate": 6.614450925618708e-06, "loss": 1.7351, "step": 1677 }, { "epoch": 0.52, "grad_norm": 9.273491859436035, "learning_rate": 6.607600832370228e-06, "loss": 1.276, "step": 1678 }, { "epoch": 0.52, "grad_norm": 9.694404602050781, "learning_rate": 6.600751158413272e-06, "loss": 2.3457, "step": 1679 }, { "epoch": 0.52, "grad_norm": 17.113826751708984, "learning_rate": 6.593901910239737e-06, "loss": 4.0584, "step": 1680 }, { "epoch": 0.52, "grad_norm": 18.73774528503418, "learning_rate": 6.587053094341113e-06, "loss": 4.5252, "step": 1681 }, { "epoch": 0.52, "grad_norm": 17.822935104370117, "learning_rate": 6.580204717208493e-06, "loss": 2.3868, "step": 1682 }, { "epoch": 0.52, "grad_norm": 16.173681259155273, "learning_rate": 6.573356785332539e-06, "loss": 2.5625, "step": 1683 }, { "epoch": 0.52, "grad_norm": 10.989896774291992, "learning_rate": 6.566509305203497e-06, "loss": 1.3217, "step": 1684 }, { "epoch": 0.52, "grad_norm": 11.736306190490723, "learning_rate": 6.559662283311192e-06, "loss": 1.5926, "step": 1685 }, { "epoch": 0.52, "grad_norm": 19.356401443481445, "learning_rate": 6.552815726145004e-06, "loss": 4.2014, "step": 1686 }, { "epoch": 0.52, "grad_norm": 15.877721786499023, "learning_rate": 6.545969640193877e-06, "loss": 3.4804, "step": 1687 }, { "epoch": 0.52, "grad_norm": 17.891929626464844, "learning_rate": 6.53912403194631e-06, "loss": 4.1462, "step": 1688 }, { "epoch": 0.52, "grad_norm": 13.31762409210205, "learning_rate": 6.532278907890346e-06, "loss": 2.0393, "step": 1689 }, { "epoch": 0.52, "grad_norm": 8.635902404785156, "learning_rate": 6.5254342745135695e-06, "loss": 0.7918, "step": 1690 }, { "epoch": 0.52, "grad_norm": 13.983869552612305, "learning_rate": 6.518590138303105e-06, "loss": 2.2377, "step": 1691 }, { "epoch": 0.52, "grad_norm": 19.41148567199707, "learning_rate": 6.511746505745599e-06, "loss": 4.7251, "step": 1692 }, { "epoch": 0.52, "grad_norm": 16.147748947143555, "learning_rate": 6.5049033833272235e-06, "loss": 1.7898, "step": 1693 }, { "epoch": 0.52, "grad_norm": 17.849939346313477, "learning_rate": 6.498060777533669e-06, "loss": 2.1782, "step": 1694 }, { "epoch": 0.53, "grad_norm": 11.791414260864258, "learning_rate": 6.491218694850132e-06, "loss": 2.4036, "step": 1695 }, { "epoch": 0.53, "grad_norm": 17.15569496154785, "learning_rate": 6.484377141761316e-06, "loss": 1.7765, "step": 1696 }, { "epoch": 0.53, "grad_norm": 15.212052345275879, "learning_rate": 6.477536124751425e-06, "loss": 1.9845, "step": 1697 }, { "epoch": 0.53, "grad_norm": 18.090255737304688, "learning_rate": 6.47069565030415e-06, "loss": 2.8683, "step": 1698 }, { "epoch": 0.53, "grad_norm": 17.535499572753906, "learning_rate": 6.463855724902666e-06, "loss": 3.7178, "step": 1699 }, { "epoch": 0.53, "grad_norm": 15.417211532592773, "learning_rate": 6.45701635502964e-06, "loss": 2.572, "step": 1700 }, { "epoch": 0.53, "grad_norm": 16.360429763793945, "learning_rate": 6.450177547167198e-06, "loss": 2.4635, "step": 1701 }, { "epoch": 0.53, "grad_norm": 13.608604431152344, "learning_rate": 6.443339307796939e-06, "loss": 1.9137, "step": 1702 }, { "epoch": 0.53, "grad_norm": 16.365615844726562, "learning_rate": 6.436501643399927e-06, "loss": 3.9508, "step": 1703 }, { "epoch": 0.53, "grad_norm": 32.33718490600586, "learning_rate": 6.429664560456678e-06, "loss": 2.7259, "step": 1704 }, { "epoch": 0.53, "grad_norm": 23.710081100463867, "learning_rate": 6.42282806544715e-06, "loss": 2.7618, "step": 1705 }, { "epoch": 0.53, "grad_norm": 17.952068328857422, "learning_rate": 6.415992164850757e-06, "loss": 2.031, "step": 1706 }, { "epoch": 0.53, "grad_norm": 15.720380783081055, "learning_rate": 6.409156865146342e-06, "loss": 1.9651, "step": 1707 }, { "epoch": 0.53, "grad_norm": 11.775768280029297, "learning_rate": 6.402322172812176e-06, "loss": 0.8801, "step": 1708 }, { "epoch": 0.53, "grad_norm": 7.3918023109436035, "learning_rate": 6.395488094325961e-06, "loss": 0.9, "step": 1709 }, { "epoch": 0.53, "grad_norm": 12.462408065795898, "learning_rate": 6.3886546361648145e-06, "loss": 2.0359, "step": 1710 }, { "epoch": 0.53, "grad_norm": 9.982486724853516, "learning_rate": 6.381821804805262e-06, "loss": 2.5078, "step": 1711 }, { "epoch": 0.53, "grad_norm": 15.839106559753418, "learning_rate": 6.374989606723242e-06, "loss": 4.6393, "step": 1712 }, { "epoch": 0.53, "grad_norm": 20.70285987854004, "learning_rate": 6.368158048394088e-06, "loss": 6.814, "step": 1713 }, { "epoch": 0.53, "grad_norm": 10.38366985321045, "learning_rate": 6.361327136292526e-06, "loss": 1.0995, "step": 1714 }, { "epoch": 0.53, "grad_norm": 12.559454917907715, "learning_rate": 6.354496876892677e-06, "loss": 2.4292, "step": 1715 }, { "epoch": 0.53, "grad_norm": 12.452824592590332, "learning_rate": 6.347667276668035e-06, "loss": 2.3951, "step": 1716 }, { "epoch": 0.53, "grad_norm": 13.784807205200195, "learning_rate": 6.340838342091473e-06, "loss": 1.9206, "step": 1717 }, { "epoch": 0.53, "grad_norm": 15.844255447387695, "learning_rate": 6.334010079635234e-06, "loss": 2.962, "step": 1718 }, { "epoch": 0.53, "grad_norm": 14.486095428466797, "learning_rate": 6.327182495770922e-06, "loss": 3.1889, "step": 1719 }, { "epoch": 0.53, "grad_norm": 13.291302680969238, "learning_rate": 6.320355596969496e-06, "loss": 2.257, "step": 1720 }, { "epoch": 0.53, "grad_norm": 13.94046688079834, "learning_rate": 6.313529389701275e-06, "loss": 3.026, "step": 1721 }, { "epoch": 0.53, "grad_norm": 15.41063404083252, "learning_rate": 6.306703880435911e-06, "loss": 1.7961, "step": 1722 }, { "epoch": 0.53, "grad_norm": 17.081377029418945, "learning_rate": 6.299879075642397e-06, "loss": 2.9324, "step": 1723 }, { "epoch": 0.53, "grad_norm": 16.9708309173584, "learning_rate": 6.293054981789066e-06, "loss": 2.3154, "step": 1724 }, { "epoch": 0.53, "grad_norm": 11.27573013305664, "learning_rate": 6.286231605343573e-06, "loss": 1.9319, "step": 1725 }, { "epoch": 0.53, "grad_norm": 16.793363571166992, "learning_rate": 6.279408952772883e-06, "loss": 1.8106, "step": 1726 }, { "epoch": 0.54, "grad_norm": 11.946924209594727, "learning_rate": 6.272587030543293e-06, "loss": 1.7559, "step": 1727 }, { "epoch": 0.54, "grad_norm": 17.80219078063965, "learning_rate": 6.2657658451203954e-06, "loss": 2.3931, "step": 1728 }, { "epoch": 0.54, "grad_norm": 16.310911178588867, "learning_rate": 6.258945402969087e-06, "loss": 3.2346, "step": 1729 }, { "epoch": 0.54, "grad_norm": 16.45994758605957, "learning_rate": 6.2521257105535616e-06, "loss": 2.502, "step": 1730 }, { "epoch": 0.54, "grad_norm": 17.91693878173828, "learning_rate": 6.2453067743373025e-06, "loss": 4.5985, "step": 1731 }, { "epoch": 0.54, "grad_norm": 16.53362464904785, "learning_rate": 6.238488600783072e-06, "loss": 2.1074, "step": 1732 }, { "epoch": 0.54, "grad_norm": 17.455644607543945, "learning_rate": 6.2316711963529155e-06, "loss": 5.2828, "step": 1733 }, { "epoch": 0.54, "grad_norm": 10.842496871948242, "learning_rate": 6.224854567508148e-06, "loss": 1.6192, "step": 1734 }, { "epoch": 0.54, "grad_norm": 13.071833610534668, "learning_rate": 6.218038720709345e-06, "loss": 1.0014, "step": 1735 }, { "epoch": 0.54, "grad_norm": 10.850285530090332, "learning_rate": 6.211223662416347e-06, "loss": 1.2009, "step": 1736 }, { "epoch": 0.54, "grad_norm": 14.262269020080566, "learning_rate": 6.204409399088243e-06, "loss": 1.4618, "step": 1737 }, { "epoch": 0.54, "grad_norm": 18.779258728027344, "learning_rate": 6.197595937183369e-06, "loss": 4.6567, "step": 1738 }, { "epoch": 0.54, "grad_norm": 15.584208488464355, "learning_rate": 6.190783283159304e-06, "loss": 2.299, "step": 1739 }, { "epoch": 0.54, "grad_norm": 14.968876838684082, "learning_rate": 6.183971443472859e-06, "loss": 2.518, "step": 1740 }, { "epoch": 0.54, "grad_norm": 14.949677467346191, "learning_rate": 6.177160424580071e-06, "loss": 1.4009, "step": 1741 }, { "epoch": 0.54, "grad_norm": 13.316316604614258, "learning_rate": 6.1703502329362096e-06, "loss": 2.442, "step": 1742 }, { "epoch": 0.54, "grad_norm": 17.885141372680664, "learning_rate": 6.163540874995746e-06, "loss": 2.819, "step": 1743 }, { "epoch": 0.54, "grad_norm": 12.24764633178711, "learning_rate": 6.156732357212366e-06, "loss": 2.331, "step": 1744 }, { "epoch": 0.54, "grad_norm": 10.27761173248291, "learning_rate": 6.149924686038967e-06, "loss": 1.0084, "step": 1745 }, { "epoch": 0.54, "grad_norm": 11.830199241638184, "learning_rate": 6.143117867927637e-06, "loss": 1.8183, "step": 1746 }, { "epoch": 0.54, "grad_norm": 14.59326171875, "learning_rate": 6.136311909329651e-06, "loss": 2.6811, "step": 1747 }, { "epoch": 0.54, "grad_norm": 14.359524726867676, "learning_rate": 6.1295068166954815e-06, "loss": 1.511, "step": 1748 }, { "epoch": 0.54, "grad_norm": 13.620701789855957, "learning_rate": 6.122702596474772e-06, "loss": 1.9805, "step": 1749 }, { "epoch": 0.54, "grad_norm": 12.86448860168457, "learning_rate": 6.115899255116336e-06, "loss": 1.7906, "step": 1750 }, { "epoch": 0.54, "grad_norm": 12.143721580505371, "learning_rate": 6.109096799068166e-06, "loss": 2.337, "step": 1751 }, { "epoch": 0.54, "grad_norm": 14.398964881896973, "learning_rate": 6.1022952347774054e-06, "loss": 4.1856, "step": 1752 }, { "epoch": 0.54, "grad_norm": 12.750125885009766, "learning_rate": 6.095494568690353e-06, "loss": 1.9975, "step": 1753 }, { "epoch": 0.54, "grad_norm": 15.612987518310547, "learning_rate": 6.088694807252462e-06, "loss": 1.4707, "step": 1754 }, { "epoch": 0.54, "grad_norm": 9.288985252380371, "learning_rate": 6.081895956908323e-06, "loss": 1.0508, "step": 1755 }, { "epoch": 0.54, "grad_norm": 13.260685920715332, "learning_rate": 6.0750980241016635e-06, "loss": 1.7748, "step": 1756 }, { "epoch": 0.54, "grad_norm": 16.665273666381836, "learning_rate": 6.068301015275347e-06, "loss": 5.0814, "step": 1757 }, { "epoch": 0.54, "grad_norm": 12.461769104003906, "learning_rate": 6.061504936871353e-06, "loss": 1.2751, "step": 1758 }, { "epoch": 0.55, "grad_norm": 11.850014686584473, "learning_rate": 6.054709795330784e-06, "loss": 2.1483, "step": 1759 }, { "epoch": 0.55, "grad_norm": 11.211248397827148, "learning_rate": 6.047915597093859e-06, "loss": 2.166, "step": 1760 }, { "epoch": 0.55, "grad_norm": 13.773746490478516, "learning_rate": 6.04112234859989e-06, "loss": 3.4716, "step": 1761 }, { "epoch": 0.55, "grad_norm": 19.844953536987305, "learning_rate": 6.034330056287299e-06, "loss": 4.2191, "step": 1762 }, { "epoch": 0.55, "grad_norm": 18.28035545349121, "learning_rate": 6.027538726593603e-06, "loss": 6.1553, "step": 1763 }, { "epoch": 0.55, "grad_norm": 14.571393013000488, "learning_rate": 6.020748365955403e-06, "loss": 2.0841, "step": 1764 }, { "epoch": 0.55, "grad_norm": 16.416486740112305, "learning_rate": 6.0139589808083746e-06, "loss": 2.7342, "step": 1765 }, { "epoch": 0.55, "grad_norm": 11.846794128417969, "learning_rate": 6.007170577587284e-06, "loss": 2.8779, "step": 1766 }, { "epoch": 0.55, "grad_norm": 14.38280200958252, "learning_rate": 6.000383162725954e-06, "loss": 2.1532, "step": 1767 }, { "epoch": 0.55, "grad_norm": 12.458091735839844, "learning_rate": 5.993596742657276e-06, "loss": 2.2165, "step": 1768 }, { "epoch": 0.55, "grad_norm": 15.689918518066406, "learning_rate": 5.986811323813201e-06, "loss": 4.93, "step": 1769 }, { "epoch": 0.55, "grad_norm": 15.026179313659668, "learning_rate": 5.9800269126247255e-06, "loss": 2.9913, "step": 1770 }, { "epoch": 0.55, "grad_norm": 15.321549415588379, "learning_rate": 5.973243515521892e-06, "loss": 4.5672, "step": 1771 }, { "epoch": 0.55, "grad_norm": 13.819236755371094, "learning_rate": 5.966461138933786e-06, "loss": 3.0129, "step": 1772 }, { "epoch": 0.55, "grad_norm": 14.934184074401855, "learning_rate": 5.9596797892885224e-06, "loss": 1.6563, "step": 1773 }, { "epoch": 0.55, "grad_norm": 13.410440444946289, "learning_rate": 5.952899473013241e-06, "loss": 1.7962, "step": 1774 }, { "epoch": 0.55, "grad_norm": 13.91740894317627, "learning_rate": 5.946120196534109e-06, "loss": 2.1482, "step": 1775 }, { "epoch": 0.55, "grad_norm": 12.390769958496094, "learning_rate": 5.939341966276302e-06, "loss": 1.7179, "step": 1776 }, { "epoch": 0.55, "grad_norm": 15.820837020874023, "learning_rate": 5.932564788664004e-06, "loss": 3.3391, "step": 1777 }, { "epoch": 0.55, "grad_norm": 13.530945777893066, "learning_rate": 5.9257886701204064e-06, "loss": 5.1542, "step": 1778 }, { "epoch": 0.55, "grad_norm": 13.225460052490234, "learning_rate": 5.919013617067692e-06, "loss": 5.2362, "step": 1779 }, { "epoch": 0.55, "grad_norm": 16.23628044128418, "learning_rate": 5.9122396359270325e-06, "loss": 3.1039, "step": 1780 }, { "epoch": 0.55, "grad_norm": 16.73491668701172, "learning_rate": 5.905466733118594e-06, "loss": 8.1694, "step": 1781 }, { "epoch": 0.55, "grad_norm": 16.948612213134766, "learning_rate": 5.8986949150615075e-06, "loss": 2.5072, "step": 1782 }, { "epoch": 0.55, "grad_norm": 13.731588363647461, "learning_rate": 5.89192418817388e-06, "loss": 2.9993, "step": 1783 }, { "epoch": 0.55, "grad_norm": 15.084639549255371, "learning_rate": 5.885154558872792e-06, "loss": 2.9031, "step": 1784 }, { "epoch": 0.55, "grad_norm": 13.718095779418945, "learning_rate": 5.878386033574273e-06, "loss": 2.0702, "step": 1785 }, { "epoch": 0.55, "grad_norm": 12.161492347717285, "learning_rate": 5.871618618693314e-06, "loss": 3.5457, "step": 1786 }, { "epoch": 0.55, "grad_norm": 16.949766159057617, "learning_rate": 5.864852320643849e-06, "loss": 3.3263, "step": 1787 }, { "epoch": 0.55, "grad_norm": 13.630675315856934, "learning_rate": 5.8580871458387525e-06, "loss": 2.5225, "step": 1788 }, { "epoch": 0.55, "grad_norm": 15.821239471435547, "learning_rate": 5.85132310068984e-06, "loss": 1.9543, "step": 1789 }, { "epoch": 0.55, "grad_norm": 12.509420394897461, "learning_rate": 5.844560191607854e-06, "loss": 2.6313, "step": 1790 }, { "epoch": 0.55, "grad_norm": 9.080439567565918, "learning_rate": 5.837798425002454e-06, "loss": 0.8224, "step": 1791 }, { "epoch": 0.56, "grad_norm": 13.666498184204102, "learning_rate": 5.831037807282229e-06, "loss": 2.6361, "step": 1792 }, { "epoch": 0.56, "grad_norm": 14.119965553283691, "learning_rate": 5.824278344854669e-06, "loss": 1.4584, "step": 1793 }, { "epoch": 0.56, "grad_norm": 10.555170059204102, "learning_rate": 5.81752004412617e-06, "loss": 1.4938, "step": 1794 }, { "epoch": 0.56, "grad_norm": 20.039859771728516, "learning_rate": 5.810762911502034e-06, "loss": 1.9742, "step": 1795 }, { "epoch": 0.56, "grad_norm": 19.904695510864258, "learning_rate": 5.8040069533864486e-06, "loss": 5.4046, "step": 1796 }, { "epoch": 0.56, "grad_norm": 15.574432373046875, "learning_rate": 5.7972521761824896e-06, "loss": 2.7755, "step": 1797 }, { "epoch": 0.56, "grad_norm": 10.398809432983398, "learning_rate": 5.790498586292119e-06, "loss": 1.321, "step": 1798 }, { "epoch": 0.56, "grad_norm": 14.65013313293457, "learning_rate": 5.783746190116167e-06, "loss": 3.4494, "step": 1799 }, { "epoch": 0.56, "grad_norm": 10.127032279968262, "learning_rate": 5.7769949940543306e-06, "loss": 2.4075, "step": 1800 }, { "epoch": 0.56, "grad_norm": 17.058229446411133, "learning_rate": 5.770245004505181e-06, "loss": 4.8438, "step": 1801 }, { "epoch": 0.56, "grad_norm": 13.680752754211426, "learning_rate": 5.763496227866136e-06, "loss": 2.608, "step": 1802 }, { "epoch": 0.56, "grad_norm": 11.628544807434082, "learning_rate": 5.756748670533461e-06, "loss": 2.0497, "step": 1803 }, { "epoch": 0.56, "grad_norm": 14.606917381286621, "learning_rate": 5.75000233890228e-06, "loss": 1.8469, "step": 1804 }, { "epoch": 0.56, "grad_norm": 12.258424758911133, "learning_rate": 5.743257239366539e-06, "loss": 1.6223, "step": 1805 }, { "epoch": 0.56, "grad_norm": 13.137167930603027, "learning_rate": 5.7365133783190266e-06, "loss": 2.2344, "step": 1806 }, { "epoch": 0.56, "grad_norm": 14.135931015014648, "learning_rate": 5.729770762151356e-06, "loss": 4.8953, "step": 1807 }, { "epoch": 0.56, "grad_norm": 12.336984634399414, "learning_rate": 5.723029397253957e-06, "loss": 2.1529, "step": 1808 }, { "epoch": 0.56, "grad_norm": 14.23254108428955, "learning_rate": 5.7162892900160755e-06, "loss": 1.3341, "step": 1809 }, { "epoch": 0.56, "grad_norm": 15.352858543395996, "learning_rate": 5.709550446825767e-06, "loss": 2.2873, "step": 1810 }, { "epoch": 0.56, "grad_norm": 14.300989151000977, "learning_rate": 5.7028128740698884e-06, "loss": 4.2608, "step": 1811 }, { "epoch": 0.56, "grad_norm": 9.150107383728027, "learning_rate": 5.696076578134088e-06, "loss": 1.1745, "step": 1812 }, { "epoch": 0.56, "grad_norm": 12.48327922821045, "learning_rate": 5.689341565402811e-06, "loss": 1.9706, "step": 1813 }, { "epoch": 0.56, "grad_norm": 15.741283416748047, "learning_rate": 5.6826078422592834e-06, "loss": 3.9089, "step": 1814 }, { "epoch": 0.56, "grad_norm": 18.629844665527344, "learning_rate": 5.675875415085506e-06, "loss": 4.4346, "step": 1815 }, { "epoch": 0.56, "grad_norm": 9.940484046936035, "learning_rate": 5.669144290262256e-06, "loss": 2.4403, "step": 1816 }, { "epoch": 0.56, "grad_norm": 19.276111602783203, "learning_rate": 5.6624144741690745e-06, "loss": 2.4949, "step": 1817 }, { "epoch": 0.56, "grad_norm": 10.42538833618164, "learning_rate": 5.655685973184258e-06, "loss": 2.155, "step": 1818 }, { "epoch": 0.56, "grad_norm": 10.172608375549316, "learning_rate": 5.648958793684869e-06, "loss": 2.2949, "step": 1819 }, { "epoch": 0.56, "grad_norm": 8.87829303741455, "learning_rate": 5.642232942046701e-06, "loss": 1.1954, "step": 1820 }, { "epoch": 0.56, "grad_norm": 10.703292846679688, "learning_rate": 5.635508424644298e-06, "loss": 1.1126, "step": 1821 }, { "epoch": 0.56, "grad_norm": 17.50180435180664, "learning_rate": 5.628785247850944e-06, "loss": 5.5155, "step": 1822 }, { "epoch": 0.56, "grad_norm": 15.151026725769043, "learning_rate": 5.622063418038643e-06, "loss": 2.6611, "step": 1823 }, { "epoch": 0.57, "grad_norm": 13.896838188171387, "learning_rate": 5.615342941578126e-06, "loss": 1.8171, "step": 1824 }, { "epoch": 0.57, "grad_norm": 21.477943420410156, "learning_rate": 5.608623824838845e-06, "loss": 4.216, "step": 1825 }, { "epoch": 0.57, "grad_norm": 12.832585334777832, "learning_rate": 5.6019060741889564e-06, "loss": 1.3468, "step": 1826 }, { "epoch": 0.57, "grad_norm": 11.833240509033203, "learning_rate": 5.595189695995325e-06, "loss": 3.0509, "step": 1827 }, { "epoch": 0.57, "grad_norm": 14.890311241149902, "learning_rate": 5.588474696623519e-06, "loss": 5.038, "step": 1828 }, { "epoch": 0.57, "grad_norm": 13.075677871704102, "learning_rate": 5.5817610824377955e-06, "loss": 2.7405, "step": 1829 }, { "epoch": 0.57, "grad_norm": 12.089751243591309, "learning_rate": 5.575048859801096e-06, "loss": 1.1481, "step": 1830 }, { "epoch": 0.57, "grad_norm": 17.9642391204834, "learning_rate": 5.568338035075049e-06, "loss": 1.942, "step": 1831 }, { "epoch": 0.57, "grad_norm": 20.85904312133789, "learning_rate": 5.561628614619956e-06, "loss": 3.8661, "step": 1832 }, { "epoch": 0.57, "grad_norm": 9.438445091247559, "learning_rate": 5.554920604794785e-06, "loss": 1.3036, "step": 1833 }, { "epoch": 0.57, "grad_norm": 11.098956108093262, "learning_rate": 5.548214011957172e-06, "loss": 1.763, "step": 1834 }, { "epoch": 0.57, "grad_norm": 16.185504913330078, "learning_rate": 5.5415088424634065e-06, "loss": 2.5151, "step": 1835 }, { "epoch": 0.57, "grad_norm": 12.786161422729492, "learning_rate": 5.534805102668425e-06, "loss": 1.4129, "step": 1836 }, { "epoch": 0.57, "grad_norm": 14.491531372070312, "learning_rate": 5.528102798925823e-06, "loss": 2.4639, "step": 1837 }, { "epoch": 0.57, "grad_norm": 13.029722213745117, "learning_rate": 5.521401937587819e-06, "loss": 2.2583, "step": 1838 }, { "epoch": 0.57, "grad_norm": 13.83185863494873, "learning_rate": 5.51470252500527e-06, "loss": 1.6569, "step": 1839 }, { "epoch": 0.57, "grad_norm": 17.041156768798828, "learning_rate": 5.5080045675276666e-06, "loss": 1.8535, "step": 1840 }, { "epoch": 0.57, "grad_norm": 14.895615577697754, "learning_rate": 5.501308071503113e-06, "loss": 1.8716, "step": 1841 }, { "epoch": 0.57, "grad_norm": 14.872110366821289, "learning_rate": 5.494613043278324e-06, "loss": 1.9894, "step": 1842 }, { "epoch": 0.57, "grad_norm": 14.688057899475098, "learning_rate": 5.487919489198638e-06, "loss": 2.6675, "step": 1843 }, { "epoch": 0.57, "grad_norm": 12.86407470703125, "learning_rate": 5.481227415607984e-06, "loss": 1.2891, "step": 1844 }, { "epoch": 0.57, "grad_norm": 14.483810424804688, "learning_rate": 5.47453682884889e-06, "loss": 2.4939, "step": 1845 }, { "epoch": 0.57, "grad_norm": 12.794442176818848, "learning_rate": 5.467847735262478e-06, "loss": 5.7496, "step": 1846 }, { "epoch": 0.57, "grad_norm": 13.446375846862793, "learning_rate": 5.461160141188452e-06, "loss": 3.5324, "step": 1847 }, { "epoch": 0.57, "grad_norm": 10.231986045837402, "learning_rate": 5.454474052965096e-06, "loss": 1.116, "step": 1848 }, { "epoch": 0.57, "grad_norm": 14.563923835754395, "learning_rate": 5.447789476929268e-06, "loss": 2.5878, "step": 1849 }, { "epoch": 0.57, "grad_norm": 13.371225357055664, "learning_rate": 5.4411064194163895e-06, "loss": 1.1454, "step": 1850 }, { "epoch": 0.57, "grad_norm": 15.974309921264648, "learning_rate": 5.434424886760445e-06, "loss": 2.1937, "step": 1851 }, { "epoch": 0.57, "grad_norm": 17.46932601928711, "learning_rate": 5.427744885293975e-06, "loss": 2.9106, "step": 1852 }, { "epoch": 0.57, "grad_norm": 9.957886695861816, "learning_rate": 5.4210664213480676e-06, "loss": 2.1941, "step": 1853 }, { "epoch": 0.57, "grad_norm": 9.03040885925293, "learning_rate": 5.41438950125235e-06, "loss": 1.5123, "step": 1854 }, { "epoch": 0.57, "grad_norm": 15.304936408996582, "learning_rate": 5.407714131334994e-06, "loss": 2.1541, "step": 1855 }, { "epoch": 0.58, "grad_norm": 14.13270092010498, "learning_rate": 5.401040317922695e-06, "loss": 2.0546, "step": 1856 }, { "epoch": 0.58, "grad_norm": 18.850942611694336, "learning_rate": 5.394368067340675e-06, "loss": 4.6624, "step": 1857 }, { "epoch": 0.58, "grad_norm": 10.505763053894043, "learning_rate": 5.387697385912681e-06, "loss": 2.6678, "step": 1858 }, { "epoch": 0.58, "grad_norm": 17.811540603637695, "learning_rate": 5.381028279960963e-06, "loss": 3.3744, "step": 1859 }, { "epoch": 0.58, "grad_norm": 11.627664566040039, "learning_rate": 5.3743607558062794e-06, "loss": 2.7316, "step": 1860 }, { "epoch": 0.58, "grad_norm": 19.460006713867188, "learning_rate": 5.367694819767901e-06, "loss": 5.1478, "step": 1861 }, { "epoch": 0.58, "grad_norm": 17.877521514892578, "learning_rate": 5.3610304781635805e-06, "loss": 1.6275, "step": 1862 }, { "epoch": 0.58, "grad_norm": 15.138850212097168, "learning_rate": 5.35436773730956e-06, "loss": 2.4314, "step": 1863 }, { "epoch": 0.58, "grad_norm": 19.668495178222656, "learning_rate": 5.347706603520575e-06, "loss": 2.4506, "step": 1864 }, { "epoch": 0.58, "grad_norm": 12.425284385681152, "learning_rate": 5.341047083109828e-06, "loss": 1.6937, "step": 1865 }, { "epoch": 0.58, "grad_norm": 15.025413513183594, "learning_rate": 5.334389182388994e-06, "loss": 2.6373, "step": 1866 }, { "epoch": 0.58, "grad_norm": 12.790462493896484, "learning_rate": 5.327732907668218e-06, "loss": 1.6166, "step": 1867 }, { "epoch": 0.58, "grad_norm": 19.806283950805664, "learning_rate": 5.321078265256098e-06, "loss": 3.5507, "step": 1868 }, { "epoch": 0.58, "grad_norm": 13.250811576843262, "learning_rate": 5.314425261459687e-06, "loss": 1.5591, "step": 1869 }, { "epoch": 0.58, "grad_norm": 14.386801719665527, "learning_rate": 5.307773902584487e-06, "loss": 1.7944, "step": 1870 }, { "epoch": 0.58, "grad_norm": 13.107830047607422, "learning_rate": 5.30112419493444e-06, "loss": 2.4954, "step": 1871 }, { "epoch": 0.58, "grad_norm": 15.570332527160645, "learning_rate": 5.294476144811917e-06, "loss": 4.9374, "step": 1872 }, { "epoch": 0.58, "grad_norm": 13.250971794128418, "learning_rate": 5.2878297585177304e-06, "loss": 2.2374, "step": 1873 }, { "epoch": 0.58, "grad_norm": 14.113117218017578, "learning_rate": 5.281185042351105e-06, "loss": 2.4667, "step": 1874 }, { "epoch": 0.58, "grad_norm": 10.013445854187012, "learning_rate": 5.274542002609686e-06, "loss": 1.7648, "step": 1875 }, { "epoch": 0.58, "grad_norm": 15.309882164001465, "learning_rate": 5.267900645589531e-06, "loss": 4.6993, "step": 1876 }, { "epoch": 0.58, "grad_norm": 17.022863388061523, "learning_rate": 5.261260977585103e-06, "loss": 2.9688, "step": 1877 }, { "epoch": 0.58, "grad_norm": 18.01882553100586, "learning_rate": 5.254623004889259e-06, "loss": 3.1605, "step": 1878 }, { "epoch": 0.58, "grad_norm": 9.066216468811035, "learning_rate": 5.2479867337932595e-06, "loss": 0.7356, "step": 1879 }, { "epoch": 0.58, "grad_norm": 13.521183967590332, "learning_rate": 5.241352170586743e-06, "loss": 2.7033, "step": 1880 }, { "epoch": 0.58, "grad_norm": 14.285184860229492, "learning_rate": 5.234719321557728e-06, "loss": 1.6921, "step": 1881 }, { "epoch": 0.58, "grad_norm": 17.454221725463867, "learning_rate": 5.2280881929926205e-06, "loss": 6.739, "step": 1882 }, { "epoch": 0.58, "grad_norm": 12.460721015930176, "learning_rate": 5.221458791176186e-06, "loss": 2.4783, "step": 1883 }, { "epoch": 0.58, "grad_norm": 16.056852340698242, "learning_rate": 5.21483112239155e-06, "loss": 2.9951, "step": 1884 }, { "epoch": 0.58, "grad_norm": 15.775067329406738, "learning_rate": 5.208205192920209e-06, "loss": 2.9683, "step": 1885 }, { "epoch": 0.58, "grad_norm": 15.0515775680542, "learning_rate": 5.201581009041998e-06, "loss": 2.7254, "step": 1886 }, { "epoch": 0.58, "grad_norm": 14.527585983276367, "learning_rate": 5.194958577035105e-06, "loss": 3.066, "step": 1887 }, { "epoch": 0.58, "grad_norm": 18.543670654296875, "learning_rate": 5.188337903176055e-06, "loss": 2.9837, "step": 1888 }, { "epoch": 0.59, "grad_norm": 13.011702537536621, "learning_rate": 5.181718993739707e-06, "loss": 2.7422, "step": 1889 }, { "epoch": 0.59, "grad_norm": 16.387422561645508, "learning_rate": 5.175101854999245e-06, "loss": 3.4034, "step": 1890 }, { "epoch": 0.59, "grad_norm": 10.453161239624023, "learning_rate": 5.168486493226182e-06, "loss": 1.6797, "step": 1891 }, { "epoch": 0.59, "grad_norm": 13.211752891540527, "learning_rate": 5.161872914690342e-06, "loss": 1.725, "step": 1892 }, { "epoch": 0.59, "grad_norm": 14.550117492675781, "learning_rate": 5.1552611256598555e-06, "loss": 2.9156, "step": 1893 }, { "epoch": 0.59, "grad_norm": 15.480829238891602, "learning_rate": 5.148651132401165e-06, "loss": 1.5313, "step": 1894 }, { "epoch": 0.59, "grad_norm": 13.496414184570312, "learning_rate": 5.142042941179003e-06, "loss": 2.5291, "step": 1895 }, { "epoch": 0.59, "grad_norm": 13.85752010345459, "learning_rate": 5.135436558256399e-06, "loss": 1.4814, "step": 1896 }, { "epoch": 0.59, "grad_norm": 13.321053504943848, "learning_rate": 5.128831989894673e-06, "loss": 2.3097, "step": 1897 }, { "epoch": 0.59, "grad_norm": 17.116474151611328, "learning_rate": 5.122229242353413e-06, "loss": 4.8876, "step": 1898 }, { "epoch": 0.59, "grad_norm": 17.035852432250977, "learning_rate": 5.115628321890488e-06, "loss": 2.701, "step": 1899 }, { "epoch": 0.59, "grad_norm": 15.801783561706543, "learning_rate": 5.1090292347620405e-06, "loss": 1.6895, "step": 1900 }, { "epoch": 0.59, "grad_norm": 16.488258361816406, "learning_rate": 5.1024319872224695e-06, "loss": 4.6689, "step": 1901 }, { "epoch": 0.59, "grad_norm": 17.51988983154297, "learning_rate": 5.095836585524424e-06, "loss": 2.0975, "step": 1902 }, { "epoch": 0.59, "grad_norm": 13.482269287109375, "learning_rate": 5.0892430359188205e-06, "loss": 1.374, "step": 1903 }, { "epoch": 0.59, "grad_norm": 12.935653686523438, "learning_rate": 5.082651344654806e-06, "loss": 2.5465, "step": 1904 }, { "epoch": 0.59, "grad_norm": 16.7248592376709, "learning_rate": 5.076061517979769e-06, "loss": 2.3183, "step": 1905 }, { "epoch": 0.59, "grad_norm": 19.343509674072266, "learning_rate": 5.069473562139337e-06, "loss": 4.0582, "step": 1906 }, { "epoch": 0.59, "grad_norm": 11.782135009765625, "learning_rate": 5.062887483377358e-06, "loss": 6.0777, "step": 1907 }, { "epoch": 0.59, "grad_norm": 16.98220443725586, "learning_rate": 5.056303287935901e-06, "loss": 1.7017, "step": 1908 }, { "epoch": 0.59, "grad_norm": 12.168288230895996, "learning_rate": 5.0497209820552556e-06, "loss": 0.841, "step": 1909 }, { "epoch": 0.59, "grad_norm": 11.430391311645508, "learning_rate": 5.043140571973915e-06, "loss": 4.2754, "step": 1910 }, { "epoch": 0.59, "grad_norm": 10.67452335357666, "learning_rate": 5.0365620639285754e-06, "loss": 1.0665, "step": 1911 }, { "epoch": 0.59, "grad_norm": 14.424131393432617, "learning_rate": 5.029985464154138e-06, "loss": 2.9558, "step": 1912 }, { "epoch": 0.59, "grad_norm": 17.27757453918457, "learning_rate": 5.023410778883685e-06, "loss": 4.7597, "step": 1913 }, { "epoch": 0.59, "grad_norm": 17.786062240600586, "learning_rate": 5.016838014348489e-06, "loss": 3.6501, "step": 1914 }, { "epoch": 0.59, "grad_norm": 9.792977333068848, "learning_rate": 5.010267176778005e-06, "loss": 4.768, "step": 1915 }, { "epoch": 0.59, "grad_norm": 14.924040794372559, "learning_rate": 5.003698272399859e-06, "loss": 4.1235, "step": 1916 }, { "epoch": 0.59, "grad_norm": 14.380438804626465, "learning_rate": 4.997131307439839e-06, "loss": 2.0261, "step": 1917 }, { "epoch": 0.59, "grad_norm": 10.831319808959961, "learning_rate": 4.99056628812191e-06, "loss": 2.286, "step": 1918 }, { "epoch": 0.59, "grad_norm": 14.513365745544434, "learning_rate": 4.984003220668176e-06, "loss": 3.7361, "step": 1919 }, { "epoch": 0.59, "grad_norm": 12.797149658203125, "learning_rate": 4.9774421112988976e-06, "loss": 5.2536, "step": 1920 }, { "epoch": 0.6, "grad_norm": 28.588335037231445, "learning_rate": 4.970882966232487e-06, "loss": 3.5549, "step": 1921 }, { "epoch": 0.6, "grad_norm": 15.213167190551758, "learning_rate": 4.964325791685487e-06, "loss": 4.5068, "step": 1922 }, { "epoch": 0.6, "grad_norm": 10.606828689575195, "learning_rate": 4.9577705938725665e-06, "loss": 1.4982, "step": 1923 }, { "epoch": 0.6, "grad_norm": 10.224977493286133, "learning_rate": 4.951217379006538e-06, "loss": 0.9959, "step": 1924 }, { "epoch": 0.6, "grad_norm": 16.293413162231445, "learning_rate": 4.9446661532983206e-06, "loss": 3.7225, "step": 1925 }, { "epoch": 0.6, "grad_norm": 17.75890350341797, "learning_rate": 4.938116922956952e-06, "loss": 3.3565, "step": 1926 }, { "epoch": 0.6, "grad_norm": 18.865028381347656, "learning_rate": 4.931569694189582e-06, "loss": 2.1867, "step": 1927 }, { "epoch": 0.6, "grad_norm": 15.345128059387207, "learning_rate": 4.925024473201459e-06, "loss": 4.05, "step": 1928 }, { "epoch": 0.6, "grad_norm": 14.990592956542969, "learning_rate": 4.91848126619593e-06, "loss": 3.244, "step": 1929 }, { "epoch": 0.6, "grad_norm": 9.98999309539795, "learning_rate": 4.911940079374434e-06, "loss": 1.2111, "step": 1930 }, { "epoch": 0.6, "grad_norm": 15.376320838928223, "learning_rate": 4.905400918936496e-06, "loss": 3.0751, "step": 1931 }, { "epoch": 0.6, "grad_norm": 15.502862930297852, "learning_rate": 4.898863791079717e-06, "loss": 1.6701, "step": 1932 }, { "epoch": 0.6, "grad_norm": 16.26798439025879, "learning_rate": 4.892328701999776e-06, "loss": 6.2586, "step": 1933 }, { "epoch": 0.6, "grad_norm": 17.54841423034668, "learning_rate": 4.885795657890418e-06, "loss": 2.0316, "step": 1934 }, { "epoch": 0.6, "grad_norm": 23.011951446533203, "learning_rate": 4.879264664943445e-06, "loss": 3.3383, "step": 1935 }, { "epoch": 0.6, "grad_norm": 14.206123352050781, "learning_rate": 4.8727357293487254e-06, "loss": 1.4116, "step": 1936 }, { "epoch": 0.6, "grad_norm": 12.767412185668945, "learning_rate": 4.866208857294168e-06, "loss": 1.9756, "step": 1937 }, { "epoch": 0.6, "grad_norm": 11.327007293701172, "learning_rate": 4.859684054965728e-06, "loss": 1.6458, "step": 1938 }, { "epoch": 0.6, "grad_norm": 8.891633987426758, "learning_rate": 4.853161328547408e-06, "loss": 1.5201, "step": 1939 }, { "epoch": 0.6, "grad_norm": 25.075712203979492, "learning_rate": 4.846640684221227e-06, "loss": 4.8558, "step": 1940 }, { "epoch": 0.6, "grad_norm": 18.45266342163086, "learning_rate": 4.840122128167242e-06, "loss": 2.6664, "step": 1941 }, { "epoch": 0.6, "grad_norm": 12.337541580200195, "learning_rate": 4.833605666563532e-06, "loss": 1.2926, "step": 1942 }, { "epoch": 0.6, "grad_norm": 11.597476959228516, "learning_rate": 4.827091305586184e-06, "loss": 1.9385, "step": 1943 }, { "epoch": 0.6, "grad_norm": 19.565479278564453, "learning_rate": 4.820579051409294e-06, "loss": 2.9348, "step": 1944 }, { "epoch": 0.6, "grad_norm": 12.648226737976074, "learning_rate": 4.81406891020497e-06, "loss": 1.6047, "step": 1945 }, { "epoch": 0.6, "grad_norm": 12.827557563781738, "learning_rate": 4.807560888143311e-06, "loss": 2.0912, "step": 1946 }, { "epoch": 0.6, "grad_norm": 12.061546325683594, "learning_rate": 4.8010549913924045e-06, "loss": 1.3529, "step": 1947 }, { "epoch": 0.6, "grad_norm": 16.40416717529297, "learning_rate": 4.794551226118331e-06, "loss": 2.5774, "step": 1948 }, { "epoch": 0.6, "grad_norm": 16.391469955444336, "learning_rate": 4.788049598485146e-06, "loss": 2.5718, "step": 1949 }, { "epoch": 0.6, "grad_norm": 8.078376770019531, "learning_rate": 4.7815501146548795e-06, "loss": 1.0108, "step": 1950 }, { "epoch": 0.6, "grad_norm": 13.772455215454102, "learning_rate": 4.775052780787534e-06, "loss": 1.3883, "step": 1951 }, { "epoch": 0.6, "grad_norm": 12.186179161071777, "learning_rate": 4.7685576030410676e-06, "loss": 2.1278, "step": 1952 }, { "epoch": 0.61, "grad_norm": 18.81841278076172, "learning_rate": 4.762064587571399e-06, "loss": 2.2821, "step": 1953 }, { "epoch": 0.61, "grad_norm": 20.560026168823242, "learning_rate": 4.755573740532398e-06, "loss": 3.2618, "step": 1954 }, { "epoch": 0.61, "grad_norm": 15.923328399658203, "learning_rate": 4.7490850680758734e-06, "loss": 5.6892, "step": 1955 }, { "epoch": 0.61, "grad_norm": 13.167757987976074, "learning_rate": 4.742598576351587e-06, "loss": 0.7712, "step": 1956 }, { "epoch": 0.61, "grad_norm": 20.166093826293945, "learning_rate": 4.736114271507217e-06, "loss": 3.5002, "step": 1957 }, { "epoch": 0.61, "grad_norm": 16.612627029418945, "learning_rate": 4.729632159688375e-06, "loss": 8.2562, "step": 1958 }, { "epoch": 0.61, "grad_norm": 12.197168350219727, "learning_rate": 4.723152247038602e-06, "loss": 1.71, "step": 1959 }, { "epoch": 0.61, "grad_norm": 12.618730545043945, "learning_rate": 4.7166745396993444e-06, "loss": 0.7106, "step": 1960 }, { "epoch": 0.61, "grad_norm": 9.799409866333008, "learning_rate": 4.71019904380996e-06, "loss": 1.5867, "step": 1961 }, { "epoch": 0.61, "grad_norm": 27.86197853088379, "learning_rate": 4.703725765507719e-06, "loss": 2.0034, "step": 1962 }, { "epoch": 0.61, "grad_norm": 18.016983032226562, "learning_rate": 4.697254710927778e-06, "loss": 3.8767, "step": 1963 }, { "epoch": 0.61, "grad_norm": 15.817669868469238, "learning_rate": 4.690785886203195e-06, "loss": 1.6525, "step": 1964 }, { "epoch": 0.61, "grad_norm": 14.399222373962402, "learning_rate": 4.68431929746491e-06, "loss": 2.8169, "step": 1965 }, { "epoch": 0.61, "grad_norm": 15.781970024108887, "learning_rate": 4.677854950841746e-06, "loss": 2.5669, "step": 1966 }, { "epoch": 0.61, "grad_norm": 23.760482788085938, "learning_rate": 4.671392852460399e-06, "loss": 2.4262, "step": 1967 }, { "epoch": 0.61, "grad_norm": 11.917473793029785, "learning_rate": 4.664933008445436e-06, "loss": 1.3121, "step": 1968 }, { "epoch": 0.61, "grad_norm": 14.976125717163086, "learning_rate": 4.658475424919288e-06, "loss": 2.0666, "step": 1969 }, { "epoch": 0.61, "grad_norm": 20.271209716796875, "learning_rate": 4.65202010800224e-06, "loss": 4.8662, "step": 1970 }, { "epoch": 0.61, "grad_norm": 13.959456443786621, "learning_rate": 4.645567063812433e-06, "loss": 1.4667, "step": 1971 }, { "epoch": 0.61, "grad_norm": 15.600061416625977, "learning_rate": 4.639116298465852e-06, "loss": 1.6943, "step": 1972 }, { "epoch": 0.61, "grad_norm": 11.672968864440918, "learning_rate": 4.632667818076319e-06, "loss": 2.446, "step": 1973 }, { "epoch": 0.61, "grad_norm": 10.582765579223633, "learning_rate": 4.6262216287555e-06, "loss": 0.8156, "step": 1974 }, { "epoch": 0.61, "grad_norm": 13.40039348602295, "learning_rate": 4.619777736612878e-06, "loss": 1.8299, "step": 1975 }, { "epoch": 0.61, "grad_norm": 11.918106079101562, "learning_rate": 4.613336147755764e-06, "loss": 1.7873, "step": 1976 }, { "epoch": 0.61, "grad_norm": 9.903485298156738, "learning_rate": 4.6068968682892905e-06, "loss": 0.9619, "step": 1977 }, { "epoch": 0.61, "grad_norm": 15.035801887512207, "learning_rate": 4.600459904316396e-06, "loss": 1.7628, "step": 1978 }, { "epoch": 0.61, "grad_norm": 14.6061429977417, "learning_rate": 4.594025261937818e-06, "loss": 2.7498, "step": 1979 }, { "epoch": 0.61, "grad_norm": 17.603635787963867, "learning_rate": 4.587592947252112e-06, "loss": 2.1893, "step": 1980 }, { "epoch": 0.61, "grad_norm": 13.130147933959961, "learning_rate": 4.581162966355609e-06, "loss": 2.1399, "step": 1981 }, { "epoch": 0.61, "grad_norm": 13.292744636535645, "learning_rate": 4.5747353253424365e-06, "loss": 3.2051, "step": 1982 }, { "epoch": 0.61, "grad_norm": 14.927051544189453, "learning_rate": 4.568310030304507e-06, "loss": 5.1818, "step": 1983 }, { "epoch": 0.61, "grad_norm": 8.793292999267578, "learning_rate": 4.561887087331501e-06, "loss": 1.0002, "step": 1984 }, { "epoch": 0.62, "grad_norm": 16.968029022216797, "learning_rate": 4.555466502510876e-06, "loss": 4.2384, "step": 1985 }, { "epoch": 0.62, "grad_norm": 11.477005004882812, "learning_rate": 4.549048281927855e-06, "loss": 1.1315, "step": 1986 }, { "epoch": 0.62, "grad_norm": 18.990814208984375, "learning_rate": 4.542632431665419e-06, "loss": 3.8732, "step": 1987 }, { "epoch": 0.62, "grad_norm": 12.12499713897705, "learning_rate": 4.536218957804297e-06, "loss": 2.9292, "step": 1988 }, { "epoch": 0.62, "grad_norm": 9.619987487792969, "learning_rate": 4.529807866422976e-06, "loss": 1.3448, "step": 1989 }, { "epoch": 0.62, "grad_norm": 12.029317855834961, "learning_rate": 4.523399163597677e-06, "loss": 1.5169, "step": 1990 }, { "epoch": 0.62, "grad_norm": 10.02697467803955, "learning_rate": 4.5169928554023594e-06, "loss": 1.5848, "step": 1991 }, { "epoch": 0.62, "grad_norm": 13.536266326904297, "learning_rate": 4.510588947908717e-06, "loss": 1.9503, "step": 1992 }, { "epoch": 0.62, "grad_norm": 8.811053276062012, "learning_rate": 4.504187447186161e-06, "loss": 1.652, "step": 1993 }, { "epoch": 0.62, "grad_norm": 10.713563919067383, "learning_rate": 4.497788359301823e-06, "loss": 3.1022, "step": 1994 }, { "epoch": 0.62, "grad_norm": 18.214853286743164, "learning_rate": 4.491391690320558e-06, "loss": 2.3007, "step": 1995 }, { "epoch": 0.62, "grad_norm": 13.674553871154785, "learning_rate": 4.484997446304915e-06, "loss": 1.1045, "step": 1996 }, { "epoch": 0.62, "grad_norm": 10.581334114074707, "learning_rate": 4.478605633315148e-06, "loss": 1.129, "step": 1997 }, { "epoch": 0.62, "grad_norm": 23.164079666137695, "learning_rate": 4.472216257409216e-06, "loss": 5.2292, "step": 1998 }, { "epoch": 0.62, "grad_norm": 16.813941955566406, "learning_rate": 4.465829324642759e-06, "loss": 2.1862, "step": 1999 }, { "epoch": 0.62, "grad_norm": 11.300929069519043, "learning_rate": 4.459444841069098e-06, "loss": 1.0576, "step": 2000 }, { "epoch": 0.62, "grad_norm": 17.166240692138672, "learning_rate": 4.4530628127392454e-06, "loss": 1.7541, "step": 2001 }, { "epoch": 0.62, "grad_norm": 15.232935905456543, "learning_rate": 4.44668324570188e-06, "loss": 1.8813, "step": 2002 }, { "epoch": 0.62, "grad_norm": 17.39579200744629, "learning_rate": 4.440306146003342e-06, "loss": 3.1383, "step": 2003 }, { "epoch": 0.62, "grad_norm": 16.826940536499023, "learning_rate": 4.433931519687644e-06, "loss": 2.4366, "step": 2004 }, { "epoch": 0.62, "grad_norm": 13.890028953552246, "learning_rate": 4.427559372796447e-06, "loss": 2.2855, "step": 2005 }, { "epoch": 0.62, "grad_norm": 22.110166549682617, "learning_rate": 4.421189711369063e-06, "loss": 2.8979, "step": 2006 }, { "epoch": 0.62, "grad_norm": 15.890730857849121, "learning_rate": 4.414822541442454e-06, "loss": 2.046, "step": 2007 }, { "epoch": 0.62, "grad_norm": 15.0035400390625, "learning_rate": 4.408457869051212e-06, "loss": 5.1188, "step": 2008 }, { "epoch": 0.62, "grad_norm": 24.372314453125, "learning_rate": 4.402095700227567e-06, "loss": 2.7475, "step": 2009 }, { "epoch": 0.62, "grad_norm": 19.236316680908203, "learning_rate": 4.395736041001377e-06, "loss": 4.2975, "step": 2010 }, { "epoch": 0.62, "grad_norm": 14.21186637878418, "learning_rate": 4.3893788974001204e-06, "loss": 1.0334, "step": 2011 }, { "epoch": 0.62, "grad_norm": 14.599188804626465, "learning_rate": 4.383024275448886e-06, "loss": 1.8703, "step": 2012 }, { "epoch": 0.62, "grad_norm": 10.629423141479492, "learning_rate": 4.376672181170383e-06, "loss": 2.0404, "step": 2013 }, { "epoch": 0.62, "grad_norm": 14.760478019714355, "learning_rate": 4.370322620584918e-06, "loss": 1.4733, "step": 2014 }, { "epoch": 0.62, "grad_norm": 11.846567153930664, "learning_rate": 4.363975599710395e-06, "loss": 1.4684, "step": 2015 }, { "epoch": 0.62, "grad_norm": 10.186484336853027, "learning_rate": 4.357631124562318e-06, "loss": 1.3573, "step": 2016 }, { "epoch": 0.62, "grad_norm": 11.071416854858398, "learning_rate": 4.351289201153772e-06, "loss": 0.8062, "step": 2017 }, { "epoch": 0.63, "grad_norm": 14.283429145812988, "learning_rate": 4.344949835495421e-06, "loss": 1.7092, "step": 2018 }, { "epoch": 0.63, "grad_norm": 27.083303451538086, "learning_rate": 4.338613033595518e-06, "loss": 2.3482, "step": 2019 }, { "epoch": 0.63, "grad_norm": 13.069631576538086, "learning_rate": 4.332278801459873e-06, "loss": 3.7772, "step": 2020 }, { "epoch": 0.63, "grad_norm": 12.28091049194336, "learning_rate": 4.325947145091861e-06, "loss": 4.7075, "step": 2021 }, { "epoch": 0.63, "grad_norm": 11.421253204345703, "learning_rate": 4.3196180704924275e-06, "loss": 2.554, "step": 2022 }, { "epoch": 0.63, "grad_norm": 12.664969444274902, "learning_rate": 4.313291583660058e-06, "loss": 1.8915, "step": 2023 }, { "epoch": 0.63, "grad_norm": 13.2357177734375, "learning_rate": 4.306967690590791e-06, "loss": 3.0887, "step": 2024 }, { "epoch": 0.63, "grad_norm": 9.014602661132812, "learning_rate": 4.300646397278208e-06, "loss": 1.2436, "step": 2025 }, { "epoch": 0.63, "grad_norm": 15.402137756347656, "learning_rate": 4.294327709713425e-06, "loss": 4.4077, "step": 2026 }, { "epoch": 0.63, "grad_norm": 19.77958869934082, "learning_rate": 4.2880116338850855e-06, "loss": 3.137, "step": 2027 }, { "epoch": 0.63, "grad_norm": 19.124786376953125, "learning_rate": 4.281698175779366e-06, "loss": 3.9406, "step": 2028 }, { "epoch": 0.63, "grad_norm": 13.08989143371582, "learning_rate": 4.2753873413799505e-06, "loss": 1.8425, "step": 2029 }, { "epoch": 0.63, "grad_norm": 15.249897003173828, "learning_rate": 4.269079136668045e-06, "loss": 2.791, "step": 2030 }, { "epoch": 0.63, "grad_norm": 11.198994636535645, "learning_rate": 4.2627735676223604e-06, "loss": 1.4869, "step": 2031 }, { "epoch": 0.63, "grad_norm": 14.075993537902832, "learning_rate": 4.25647064021911e-06, "loss": 1.2025, "step": 2032 }, { "epoch": 0.63, "grad_norm": 12.997515678405762, "learning_rate": 4.250170360432e-06, "loss": 1.106, "step": 2033 }, { "epoch": 0.63, "grad_norm": 11.242206573486328, "learning_rate": 4.2438727342322375e-06, "loss": 1.3455, "step": 2034 }, { "epoch": 0.63, "grad_norm": 15.24692153930664, "learning_rate": 4.237577767588501e-06, "loss": 1.9019, "step": 2035 }, { "epoch": 0.63, "grad_norm": 14.23598861694336, "learning_rate": 4.231285466466954e-06, "loss": 3.0527, "step": 2036 }, { "epoch": 0.63, "grad_norm": 12.469017028808594, "learning_rate": 4.224995836831241e-06, "loss": 1.6078, "step": 2037 }, { "epoch": 0.63, "grad_norm": 19.05677032470703, "learning_rate": 4.218708884642465e-06, "loss": 3.422, "step": 2038 }, { "epoch": 0.63, "grad_norm": 13.673805236816406, "learning_rate": 4.21242461585919e-06, "loss": 3.2013, "step": 2039 }, { "epoch": 0.63, "grad_norm": 17.894207000732422, "learning_rate": 4.206143036437449e-06, "loss": 6.1287, "step": 2040 }, { "epoch": 0.63, "grad_norm": 13.790003776550293, "learning_rate": 4.199864152330714e-06, "loss": 2.0449, "step": 2041 }, { "epoch": 0.63, "grad_norm": 11.329513549804688, "learning_rate": 4.193587969489906e-06, "loss": 1.8032, "step": 2042 }, { "epoch": 0.63, "grad_norm": 12.738219261169434, "learning_rate": 4.1873144938633905e-06, "loss": 0.8659, "step": 2043 }, { "epoch": 0.63, "grad_norm": 12.105698585510254, "learning_rate": 4.18104373139696e-06, "loss": 2.2742, "step": 2044 }, { "epoch": 0.63, "grad_norm": 10.643797874450684, "learning_rate": 4.17477568803384e-06, "loss": 1.5166, "step": 2045 }, { "epoch": 0.63, "grad_norm": 12.171561241149902, "learning_rate": 4.16851036971468e-06, "loss": 2.284, "step": 2046 }, { "epoch": 0.63, "grad_norm": 11.268860816955566, "learning_rate": 4.16224778237754e-06, "loss": 4.9508, "step": 2047 }, { "epoch": 0.63, "grad_norm": 14.62899398803711, "learning_rate": 4.1559879319578994e-06, "loss": 5.0698, "step": 2048 }, { "epoch": 0.63, "grad_norm": 15.234814643859863, "learning_rate": 4.14973082438864e-06, "loss": 3.7426, "step": 2049 }, { "epoch": 0.64, "grad_norm": 23.05950355529785, "learning_rate": 4.1434764656000465e-06, "loss": 3.1389, "step": 2050 }, { "epoch": 0.64, "grad_norm": 14.896961212158203, "learning_rate": 4.137224861519792e-06, "loss": 1.9389, "step": 2051 }, { "epoch": 0.64, "grad_norm": 16.13572120666504, "learning_rate": 4.1309760180729464e-06, "loss": 1.6235, "step": 2052 }, { "epoch": 0.64, "grad_norm": 7.377883434295654, "learning_rate": 4.124729941181961e-06, "loss": 0.8666, "step": 2053 }, { "epoch": 0.64, "grad_norm": 10.743086814880371, "learning_rate": 4.118486636766659e-06, "loss": 1.2833, "step": 2054 }, { "epoch": 0.64, "grad_norm": 8.719217300415039, "learning_rate": 4.112246110744247e-06, "loss": 1.0122, "step": 2055 }, { "epoch": 0.64, "grad_norm": 11.55544376373291, "learning_rate": 4.10600836902929e-06, "loss": 2.1538, "step": 2056 }, { "epoch": 0.64, "grad_norm": 10.313085556030273, "learning_rate": 4.099773417533717e-06, "loss": 4.89, "step": 2057 }, { "epoch": 0.64, "grad_norm": 17.50877571105957, "learning_rate": 4.093541262166809e-06, "loss": 2.432, "step": 2058 }, { "epoch": 0.64, "grad_norm": 15.130254745483398, "learning_rate": 4.0873119088352035e-06, "loss": 5.1446, "step": 2059 }, { "epoch": 0.64, "grad_norm": 19.7545108795166, "learning_rate": 4.081085363442874e-06, "loss": 4.3046, "step": 2060 }, { "epoch": 0.64, "grad_norm": 13.939685821533203, "learning_rate": 4.074861631891144e-06, "loss": 3.2587, "step": 2061 }, { "epoch": 0.64, "grad_norm": 7.611458778381348, "learning_rate": 4.0686407200786595e-06, "loss": 0.8931, "step": 2062 }, { "epoch": 0.64, "grad_norm": 15.956474304199219, "learning_rate": 4.062422633901396e-06, "loss": 2.0707, "step": 2063 }, { "epoch": 0.64, "grad_norm": 13.552716255187988, "learning_rate": 4.0562073792526615e-06, "loss": 1.3171, "step": 2064 }, { "epoch": 0.64, "grad_norm": 10.638819694519043, "learning_rate": 4.049994962023063e-06, "loss": 1.3762, "step": 2065 }, { "epoch": 0.64, "grad_norm": 12.033987998962402, "learning_rate": 4.0437853881005285e-06, "loss": 3.3857, "step": 2066 }, { "epoch": 0.64, "grad_norm": 13.575906753540039, "learning_rate": 4.037578663370295e-06, "loss": 1.4654, "step": 2067 }, { "epoch": 0.64, "grad_norm": 15.086124420166016, "learning_rate": 4.031374793714891e-06, "loss": 2.941, "step": 2068 }, { "epoch": 0.64, "grad_norm": 14.043783187866211, "learning_rate": 4.025173785014139e-06, "loss": 2.1311, "step": 2069 }, { "epoch": 0.64, "grad_norm": 13.548436164855957, "learning_rate": 4.0189756431451575e-06, "loss": 2.8225, "step": 2070 }, { "epoch": 0.64, "grad_norm": 20.42125701904297, "learning_rate": 4.012780373982344e-06, "loss": 0.8218, "step": 2071 }, { "epoch": 0.64, "grad_norm": 14.614411354064941, "learning_rate": 4.0065879833973694e-06, "loss": 2.4332, "step": 2072 }, { "epoch": 0.64, "grad_norm": 17.507421493530273, "learning_rate": 4.000398477259182e-06, "loss": 3.5555, "step": 2073 }, { "epoch": 0.64, "grad_norm": 10.591798782348633, "learning_rate": 3.994211861433995e-06, "loss": 1.6596, "step": 2074 }, { "epoch": 0.64, "grad_norm": 18.85674285888672, "learning_rate": 3.988028141785275e-06, "loss": 3.2954, "step": 2075 }, { "epoch": 0.64, "grad_norm": 12.868513107299805, "learning_rate": 3.9818473241737585e-06, "loss": 1.5311, "step": 2076 }, { "epoch": 0.64, "grad_norm": 13.88658332824707, "learning_rate": 3.97566941445742e-06, "loss": 2.2858, "step": 2077 }, { "epoch": 0.64, "grad_norm": 9.721027374267578, "learning_rate": 3.9694944184914804e-06, "loss": 1.3336, "step": 2078 }, { "epoch": 0.64, "grad_norm": 13.591902732849121, "learning_rate": 3.9633223421284e-06, "loss": 1.4743, "step": 2079 }, { "epoch": 0.64, "grad_norm": 21.40350341796875, "learning_rate": 3.957153191217872e-06, "loss": 2.8149, "step": 2080 }, { "epoch": 0.64, "grad_norm": 20.24881935119629, "learning_rate": 3.950986971606814e-06, "loss": 5.8637, "step": 2081 }, { "epoch": 0.65, "grad_norm": 12.11694049835205, "learning_rate": 3.944823689139373e-06, "loss": 1.6955, "step": 2082 }, { "epoch": 0.65, "grad_norm": 19.558984756469727, "learning_rate": 3.938663349656905e-06, "loss": 2.7874, "step": 2083 }, { "epoch": 0.65, "grad_norm": 12.004746437072754, "learning_rate": 3.932505958997977e-06, "loss": 1.889, "step": 2084 }, { "epoch": 0.65, "grad_norm": 9.026920318603516, "learning_rate": 3.926351522998371e-06, "loss": 0.8632, "step": 2085 }, { "epoch": 0.65, "grad_norm": 14.440336227416992, "learning_rate": 3.920200047491051e-06, "loss": 2.5162, "step": 2086 }, { "epoch": 0.65, "grad_norm": 11.653973579406738, "learning_rate": 3.9140515383061885e-06, "loss": 1.6231, "step": 2087 }, { "epoch": 0.65, "grad_norm": 16.52773094177246, "learning_rate": 3.907906001271145e-06, "loss": 2.3722, "step": 2088 }, { "epoch": 0.65, "grad_norm": 13.918020248413086, "learning_rate": 3.9017634422104545e-06, "loss": 3.0244, "step": 2089 }, { "epoch": 0.65, "grad_norm": 15.039247512817383, "learning_rate": 3.895623866945835e-06, "loss": 2.6583, "step": 2090 }, { "epoch": 0.65, "grad_norm": 14.738115310668945, "learning_rate": 3.88948728129618e-06, "loss": 2.2832, "step": 2091 }, { "epoch": 0.65, "grad_norm": 15.432753562927246, "learning_rate": 3.883353691077543e-06, "loss": 4.0483, "step": 2092 }, { "epoch": 0.65, "grad_norm": 10.476043701171875, "learning_rate": 3.8772231021031416e-06, "loss": 1.7617, "step": 2093 }, { "epoch": 0.65, "grad_norm": 15.05960750579834, "learning_rate": 3.871095520183348e-06, "loss": 1.767, "step": 2094 }, { "epoch": 0.65, "grad_norm": 10.401081085205078, "learning_rate": 3.864970951125685e-06, "loss": 1.6661, "step": 2095 }, { "epoch": 0.65, "grad_norm": 12.15285587310791, "learning_rate": 3.858849400734815e-06, "loss": 0.8678, "step": 2096 }, { "epoch": 0.65, "grad_norm": 11.537773132324219, "learning_rate": 3.852730874812552e-06, "loss": 3.9522, "step": 2097 }, { "epoch": 0.65, "grad_norm": 8.932913780212402, "learning_rate": 3.846615379157833e-06, "loss": 0.8396, "step": 2098 }, { "epoch": 0.65, "grad_norm": 12.99303913116455, "learning_rate": 3.8405029195667235e-06, "loss": 1.286, "step": 2099 }, { "epoch": 0.65, "grad_norm": 12.972646713256836, "learning_rate": 3.834393501832415e-06, "loss": 1.9852, "step": 2100 }, { "epoch": 0.65, "grad_norm": 9.994357109069824, "learning_rate": 3.828287131745215e-06, "loss": 1.8358, "step": 2101 }, { "epoch": 0.65, "grad_norm": 11.910358428955078, "learning_rate": 3.822183815092538e-06, "loss": 1.6349, "step": 2102 }, { "epoch": 0.65, "grad_norm": 17.40057373046875, "learning_rate": 3.8160835576589156e-06, "loss": 5.6253, "step": 2103 }, { "epoch": 0.65, "grad_norm": 17.70642852783203, "learning_rate": 3.8099863652259697e-06, "loss": 2.2104, "step": 2104 }, { "epoch": 0.65, "grad_norm": 14.252694129943848, "learning_rate": 3.8038922435724186e-06, "loss": 2.2713, "step": 2105 }, { "epoch": 0.65, "grad_norm": 18.08612632751465, "learning_rate": 3.7978011984740795e-06, "loss": 4.3866, "step": 2106 }, { "epoch": 0.65, "grad_norm": 13.621631622314453, "learning_rate": 3.791713235703839e-06, "loss": 2.4693, "step": 2107 }, { "epoch": 0.65, "grad_norm": 11.246854782104492, "learning_rate": 3.78562836103167e-06, "loss": 1.8537, "step": 2108 }, { "epoch": 0.65, "grad_norm": 10.283203125, "learning_rate": 3.7795465802246238e-06, "loss": 0.9031, "step": 2109 }, { "epoch": 0.65, "grad_norm": 18.088642120361328, "learning_rate": 3.77346789904681e-06, "loss": 1.7515, "step": 2110 }, { "epoch": 0.65, "grad_norm": 9.636170387268066, "learning_rate": 3.7673923232594055e-06, "loss": 0.918, "step": 2111 }, { "epoch": 0.65, "grad_norm": 18.71010971069336, "learning_rate": 3.7613198586206453e-06, "loss": 4.5926, "step": 2112 }, { "epoch": 0.65, "grad_norm": 14.63417911529541, "learning_rate": 3.7552505108858128e-06, "loss": 2.1638, "step": 2113 }, { "epoch": 0.65, "grad_norm": 12.265777587890625, "learning_rate": 3.7491842858072365e-06, "loss": 1.8688, "step": 2114 }, { "epoch": 0.66, "grad_norm": 12.674768447875977, "learning_rate": 3.7431211891342885e-06, "loss": 2.0126, "step": 2115 }, { "epoch": 0.66, "grad_norm": 12.914712905883789, "learning_rate": 3.7370612266133733e-06, "loss": 2.3795, "step": 2116 }, { "epoch": 0.66, "grad_norm": 9.552215576171875, "learning_rate": 3.731004403987924e-06, "loss": 1.2995, "step": 2117 }, { "epoch": 0.66, "grad_norm": 15.453949928283691, "learning_rate": 3.724950726998404e-06, "loss": 3.2372, "step": 2118 }, { "epoch": 0.66, "grad_norm": 12.629971504211426, "learning_rate": 3.7189002013822904e-06, "loss": 2.3727, "step": 2119 }, { "epoch": 0.66, "grad_norm": 15.596941947937012, "learning_rate": 3.7128528328740733e-06, "loss": 3.0927, "step": 2120 }, { "epoch": 0.66, "grad_norm": 17.61478042602539, "learning_rate": 3.7068086272052516e-06, "loss": 3.1227, "step": 2121 }, { "epoch": 0.66, "grad_norm": 14.685725212097168, "learning_rate": 3.700767590104325e-06, "loss": 0.7904, "step": 2122 }, { "epoch": 0.66, "grad_norm": 13.112590789794922, "learning_rate": 3.6947297272967926e-06, "loss": 2.4491, "step": 2123 }, { "epoch": 0.66, "grad_norm": 12.789117813110352, "learning_rate": 3.6886950445051465e-06, "loss": 1.0209, "step": 2124 }, { "epoch": 0.66, "grad_norm": 12.015466690063477, "learning_rate": 3.6826635474488588e-06, "loss": 2.0642, "step": 2125 }, { "epoch": 0.66, "grad_norm": 18.760639190673828, "learning_rate": 3.6766352418443893e-06, "loss": 1.7846, "step": 2126 }, { "epoch": 0.66, "grad_norm": 18.239532470703125, "learning_rate": 3.670610133405172e-06, "loss": 5.6831, "step": 2127 }, { "epoch": 0.66, "grad_norm": 14.637475967407227, "learning_rate": 3.6645882278416003e-06, "loss": 1.2087, "step": 2128 }, { "epoch": 0.66, "grad_norm": 13.088603973388672, "learning_rate": 3.6585695308610465e-06, "loss": 1.7061, "step": 2129 }, { "epoch": 0.66, "grad_norm": 17.58694076538086, "learning_rate": 3.652554048167834e-06, "loss": 5.1298, "step": 2130 }, { "epoch": 0.66, "grad_norm": 13.216436386108398, "learning_rate": 3.6465417854632377e-06, "loss": 1.4959, "step": 2131 }, { "epoch": 0.66, "grad_norm": 14.837897300720215, "learning_rate": 3.640532748445491e-06, "loss": 1.5755, "step": 2132 }, { "epoch": 0.66, "grad_norm": 14.177626609802246, "learning_rate": 3.6345269428097607e-06, "loss": 2.7501, "step": 2133 }, { "epoch": 0.66, "grad_norm": 16.52323341369629, "learning_rate": 3.6285243742481533e-06, "loss": 3.8536, "step": 2134 }, { "epoch": 0.66, "grad_norm": 13.3431396484375, "learning_rate": 3.622525048449708e-06, "loss": 2.1923, "step": 2135 }, { "epoch": 0.66, "grad_norm": 12.451409339904785, "learning_rate": 3.6165289711003897e-06, "loss": 1.0444, "step": 2136 }, { "epoch": 0.66, "grad_norm": 9.489715576171875, "learning_rate": 3.610536147883083e-06, "loss": 1.2464, "step": 2137 }, { "epoch": 0.66, "grad_norm": 10.092480659484863, "learning_rate": 3.604546584477596e-06, "loss": 2.2864, "step": 2138 }, { "epoch": 0.66, "grad_norm": 21.5133056640625, "learning_rate": 3.598560286560639e-06, "loss": 2.9101, "step": 2139 }, { "epoch": 0.66, "grad_norm": 12.151141166687012, "learning_rate": 3.592577259805829e-06, "loss": 1.4456, "step": 2140 }, { "epoch": 0.66, "grad_norm": 12.522343635559082, "learning_rate": 3.58659750988369e-06, "loss": 3.3415, "step": 2141 }, { "epoch": 0.66, "grad_norm": 14.61464786529541, "learning_rate": 3.5806210424616257e-06, "loss": 2.2732, "step": 2142 }, { "epoch": 0.66, "grad_norm": 14.204218864440918, "learning_rate": 3.574647863203939e-06, "loss": 4.3799, "step": 2143 }, { "epoch": 0.66, "grad_norm": 11.515316009521484, "learning_rate": 3.5686779777718193e-06, "loss": 2.4421, "step": 2144 }, { "epoch": 0.66, "grad_norm": 15.56559944152832, "learning_rate": 3.5627113918233276e-06, "loss": 3.1241, "step": 2145 }, { "epoch": 0.66, "grad_norm": 18.81997299194336, "learning_rate": 3.5567481110133953e-06, "loss": 4.0341, "step": 2146 }, { "epoch": 0.67, "grad_norm": 11.367809295654297, "learning_rate": 3.5507881409938328e-06, "loss": 1.2594, "step": 2147 }, { "epoch": 0.67, "grad_norm": 10.987909317016602, "learning_rate": 3.5448314874133033e-06, "loss": 1.0973, "step": 2148 }, { "epoch": 0.67, "grad_norm": 18.935338973999023, "learning_rate": 3.5388781559173267e-06, "loss": 5.2275, "step": 2149 }, { "epoch": 0.67, "grad_norm": 13.536104202270508, "learning_rate": 3.5329281521482783e-06, "loss": 1.608, "step": 2150 }, { "epoch": 0.67, "grad_norm": 20.221797943115234, "learning_rate": 3.526981481745377e-06, "loss": 2.6814, "step": 2151 }, { "epoch": 0.67, "grad_norm": 12.287632942199707, "learning_rate": 3.5210381503446822e-06, "loss": 1.271, "step": 2152 }, { "epoch": 0.67, "grad_norm": 11.780200004577637, "learning_rate": 3.5150981635790942e-06, "loss": 1.9295, "step": 2153 }, { "epoch": 0.67, "grad_norm": 10.957992553710938, "learning_rate": 3.509161527078336e-06, "loss": 1.063, "step": 2154 }, { "epoch": 0.67, "grad_norm": 14.497386932373047, "learning_rate": 3.503228246468959e-06, "loss": 5.2318, "step": 2155 }, { "epoch": 0.67, "grad_norm": 15.921914100646973, "learning_rate": 3.4972983273743326e-06, "loss": 5.0767, "step": 2156 }, { "epoch": 0.67, "grad_norm": 17.922636032104492, "learning_rate": 3.4913717754146416e-06, "loss": 1.8815, "step": 2157 }, { "epoch": 0.67, "grad_norm": 12.503357887268066, "learning_rate": 3.4854485962068766e-06, "loss": 1.9732, "step": 2158 }, { "epoch": 0.67, "grad_norm": 19.81252098083496, "learning_rate": 3.4795287953648377e-06, "loss": 3.1647, "step": 2159 }, { "epoch": 0.67, "grad_norm": 8.471256256103516, "learning_rate": 3.4736123784991183e-06, "loss": 1.1755, "step": 2160 }, { "epoch": 0.67, "grad_norm": 10.54334545135498, "learning_rate": 3.4676993512171023e-06, "loss": 0.9531, "step": 2161 }, { "epoch": 0.67, "grad_norm": 14.592487335205078, "learning_rate": 3.4617897191229724e-06, "loss": 2.5702, "step": 2162 }, { "epoch": 0.67, "grad_norm": 16.094675064086914, "learning_rate": 3.455883487817677e-06, "loss": 3.1361, "step": 2163 }, { "epoch": 0.67, "grad_norm": 13.244039535522461, "learning_rate": 3.449980662898951e-06, "loss": 1.4257, "step": 2164 }, { "epoch": 0.67, "grad_norm": 10.110610008239746, "learning_rate": 3.4440812499613035e-06, "loss": 1.3565, "step": 2165 }, { "epoch": 0.67, "grad_norm": 15.21812629699707, "learning_rate": 3.4381852545960046e-06, "loss": 3.301, "step": 2166 }, { "epoch": 0.67, "grad_norm": 13.889304161071777, "learning_rate": 3.4322926823910824e-06, "loss": 2.4784, "step": 2167 }, { "epoch": 0.67, "grad_norm": 19.99413299560547, "learning_rate": 3.4264035389313316e-06, "loss": 2.0284, "step": 2168 }, { "epoch": 0.67, "grad_norm": 14.55456256866455, "learning_rate": 3.4205178297982878e-06, "loss": 2.0917, "step": 2169 }, { "epoch": 0.67, "grad_norm": 12.45078182220459, "learning_rate": 3.4146355605702333e-06, "loss": 1.7994, "step": 2170 }, { "epoch": 0.67, "grad_norm": 12.798308372497559, "learning_rate": 3.4087567368221925e-06, "loss": 1.5656, "step": 2171 }, { "epoch": 0.67, "grad_norm": 20.649717330932617, "learning_rate": 3.402881364125923e-06, "loss": 3.8393, "step": 2172 }, { "epoch": 0.67, "grad_norm": 9.314234733581543, "learning_rate": 3.3970094480499083e-06, "loss": 2.6209, "step": 2173 }, { "epoch": 0.67, "grad_norm": 12.697712898254395, "learning_rate": 3.3911409941593645e-06, "loss": 4.9514, "step": 2174 }, { "epoch": 0.67, "grad_norm": 18.51091194152832, "learning_rate": 3.3852760080162196e-06, "loss": 3.1904, "step": 2175 }, { "epoch": 0.67, "grad_norm": 21.123741149902344, "learning_rate": 3.379414495179115e-06, "loss": 8.2013, "step": 2176 }, { "epoch": 0.67, "grad_norm": 12.869108200073242, "learning_rate": 3.3735564612034024e-06, "loss": 1.6296, "step": 2177 }, { "epoch": 0.67, "grad_norm": 8.646051406860352, "learning_rate": 3.3677019116411356e-06, "loss": 0.834, "step": 2178 }, { "epoch": 0.68, "grad_norm": 17.026458740234375, "learning_rate": 3.361850852041064e-06, "loss": 2.2227, "step": 2179 }, { "epoch": 0.68, "grad_norm": 14.233617782592773, "learning_rate": 3.3560032879486353e-06, "loss": 2.5529, "step": 2180 }, { "epoch": 0.68, "grad_norm": 18.442110061645508, "learning_rate": 3.3501592249059784e-06, "loss": 3.5626, "step": 2181 }, { "epoch": 0.68, "grad_norm": 17.84872817993164, "learning_rate": 3.3443186684519025e-06, "loss": 4.7617, "step": 2182 }, { "epoch": 0.68, "grad_norm": 15.51069450378418, "learning_rate": 3.338481624121906e-06, "loss": 6.0042, "step": 2183 }, { "epoch": 0.68, "grad_norm": 12.388606071472168, "learning_rate": 3.33264809744814e-06, "loss": 2.0355, "step": 2184 }, { "epoch": 0.68, "grad_norm": 12.3096342086792, "learning_rate": 3.326818093959433e-06, "loss": 1.449, "step": 2185 }, { "epoch": 0.68, "grad_norm": 14.437499046325684, "learning_rate": 3.3209916191812763e-06, "loss": 4.6765, "step": 2186 }, { "epoch": 0.68, "grad_norm": 11.381603240966797, "learning_rate": 3.31516867863581e-06, "loss": 2.78, "step": 2187 }, { "epoch": 0.68, "grad_norm": 10.969701766967773, "learning_rate": 3.309349277841825e-06, "loss": 1.3352, "step": 2188 }, { "epoch": 0.68, "grad_norm": 14.634147644042969, "learning_rate": 3.3035334223147673e-06, "loss": 1.3593, "step": 2189 }, { "epoch": 0.68, "grad_norm": 13.996097564697266, "learning_rate": 3.297721117566709e-06, "loss": 2.0998, "step": 2190 }, { "epoch": 0.68, "grad_norm": 12.386398315429688, "learning_rate": 3.2919123691063667e-06, "loss": 1.8084, "step": 2191 }, { "epoch": 0.68, "grad_norm": 15.382070541381836, "learning_rate": 3.286107182439081e-06, "loss": 5.2602, "step": 2192 }, { "epoch": 0.68, "grad_norm": 21.93962860107422, "learning_rate": 3.280305563066819e-06, "loss": 2.948, "step": 2193 }, { "epoch": 0.68, "grad_norm": 18.650524139404297, "learning_rate": 3.2745075164881645e-06, "loss": 4.9346, "step": 2194 }, { "epoch": 0.68, "grad_norm": 17.64752960205078, "learning_rate": 3.2687130481983222e-06, "loss": 2.3879, "step": 2195 }, { "epoch": 0.68, "grad_norm": 14.175460815429688, "learning_rate": 3.2629221636890975e-06, "loss": 2.6153, "step": 2196 }, { "epoch": 0.68, "grad_norm": 12.955414772033691, "learning_rate": 3.257134868448903e-06, "loss": 1.0509, "step": 2197 }, { "epoch": 0.68, "grad_norm": 16.26597785949707, "learning_rate": 3.2513511679627473e-06, "loss": 2.6854, "step": 2198 }, { "epoch": 0.68, "grad_norm": 14.234977722167969, "learning_rate": 3.245571067712234e-06, "loss": 1.5645, "step": 2199 }, { "epoch": 0.68, "grad_norm": 8.558510780334473, "learning_rate": 3.239794573175552e-06, "loss": 0.9032, "step": 2200 }, { "epoch": 0.68, "grad_norm": 14.81620979309082, "learning_rate": 3.234021689827479e-06, "loss": 3.2927, "step": 2201 }, { "epoch": 0.68, "grad_norm": 13.267828941345215, "learning_rate": 3.2282524231393627e-06, "loss": 2.4144, "step": 2202 }, { "epoch": 0.68, "grad_norm": 12.47176456451416, "learning_rate": 3.2224867785791255e-06, "loss": 6.066, "step": 2203 }, { "epoch": 0.68, "grad_norm": 13.159913063049316, "learning_rate": 3.2167247616112605e-06, "loss": 2.0575, "step": 2204 }, { "epoch": 0.68, "grad_norm": 17.272306442260742, "learning_rate": 3.2109663776968216e-06, "loss": 5.412, "step": 2205 }, { "epoch": 0.68, "grad_norm": 20.280254364013672, "learning_rate": 3.2052116322934085e-06, "loss": 2.2086, "step": 2206 }, { "epoch": 0.68, "grad_norm": 14.724082946777344, "learning_rate": 3.1994605308551892e-06, "loss": 3.3149, "step": 2207 }, { "epoch": 0.68, "grad_norm": 14.12836742401123, "learning_rate": 3.193713078832869e-06, "loss": 1.3179, "step": 2208 }, { "epoch": 0.68, "grad_norm": 11.377018928527832, "learning_rate": 3.187969281673692e-06, "loss": 2.8663, "step": 2209 }, { "epoch": 0.68, "grad_norm": 12.249744415283203, "learning_rate": 3.1822291448214475e-06, "loss": 1.7564, "step": 2210 }, { "epoch": 0.69, "grad_norm": 13.492171287536621, "learning_rate": 3.1764926737164476e-06, "loss": 1.8586, "step": 2211 }, { "epoch": 0.69, "grad_norm": 8.95061206817627, "learning_rate": 3.1707598737955327e-06, "loss": 0.9182, "step": 2212 }, { "epoch": 0.69, "grad_norm": 16.02012062072754, "learning_rate": 3.1650307504920644e-06, "loss": 1.7154, "step": 2213 }, { "epoch": 0.69, "grad_norm": 14.300397872924805, "learning_rate": 3.1593053092359195e-06, "loss": 2.4413, "step": 2214 }, { "epoch": 0.69, "grad_norm": 11.821404457092285, "learning_rate": 3.1535835554534806e-06, "loss": 1.5034, "step": 2215 }, { "epoch": 0.69, "grad_norm": 12.430465698242188, "learning_rate": 3.1478654945676463e-06, "loss": 2.1052, "step": 2216 }, { "epoch": 0.69, "grad_norm": 16.182870864868164, "learning_rate": 3.142151131997804e-06, "loss": 2.2128, "step": 2217 }, { "epoch": 0.69, "grad_norm": 14.833673477172852, "learning_rate": 3.136440473159843e-06, "loss": 1.4605, "step": 2218 }, { "epoch": 0.69, "grad_norm": 9.153757095336914, "learning_rate": 3.1307335234661376e-06, "loss": 0.9416, "step": 2219 }, { "epoch": 0.69, "grad_norm": 9.109410285949707, "learning_rate": 3.1250302883255502e-06, "loss": 1.2267, "step": 2220 }, { "epoch": 0.69, "grad_norm": 8.069206237792969, "learning_rate": 3.119330773143418e-06, "loss": 1.1063, "step": 2221 }, { "epoch": 0.69, "grad_norm": 11.913293838500977, "learning_rate": 3.113634983321561e-06, "loss": 1.8163, "step": 2222 }, { "epoch": 0.69, "grad_norm": 17.14433479309082, "learning_rate": 3.1079429242582614e-06, "loss": 3.1109, "step": 2223 }, { "epoch": 0.69, "grad_norm": 15.804655075073242, "learning_rate": 3.102254601348264e-06, "loss": 2.6792, "step": 2224 }, { "epoch": 0.69, "grad_norm": 11.95259952545166, "learning_rate": 3.0965700199827813e-06, "loss": 1.0476, "step": 2225 }, { "epoch": 0.69, "grad_norm": 13.387849807739258, "learning_rate": 3.090889185549475e-06, "loss": 1.4064, "step": 2226 }, { "epoch": 0.69, "grad_norm": 12.103630065917969, "learning_rate": 3.0852121034324464e-06, "loss": 1.5432, "step": 2227 }, { "epoch": 0.69, "grad_norm": 8.44951343536377, "learning_rate": 3.0795387790122575e-06, "loss": 0.7952, "step": 2228 }, { "epoch": 0.69, "grad_norm": 13.66195297241211, "learning_rate": 3.0738692176658966e-06, "loss": 3.0626, "step": 2229 }, { "epoch": 0.69, "grad_norm": 20.553163528442383, "learning_rate": 3.0682034247667883e-06, "loss": 5.6836, "step": 2230 }, { "epoch": 0.69, "grad_norm": 13.766483306884766, "learning_rate": 3.062541405684791e-06, "loss": 1.8825, "step": 2231 }, { "epoch": 0.69, "grad_norm": 10.576692581176758, "learning_rate": 3.0568831657861797e-06, "loss": 1.3714, "step": 2232 }, { "epoch": 0.69, "grad_norm": 12.139288902282715, "learning_rate": 3.0512287104336494e-06, "loss": 1.2121, "step": 2233 }, { "epoch": 0.69, "grad_norm": 15.085342407226562, "learning_rate": 3.045578044986308e-06, "loss": 2.1256, "step": 2234 }, { "epoch": 0.69, "grad_norm": 10.91439151763916, "learning_rate": 3.0399311747996738e-06, "loss": 2.0942, "step": 2235 }, { "epoch": 0.69, "grad_norm": 20.10934829711914, "learning_rate": 3.0342881052256634e-06, "loss": 3.1579, "step": 2236 }, { "epoch": 0.69, "grad_norm": 10.833390235900879, "learning_rate": 3.0286488416125994e-06, "loss": 1.3159, "step": 2237 }, { "epoch": 0.69, "grad_norm": 18.666641235351562, "learning_rate": 3.0230133893051884e-06, "loss": 2.6354, "step": 2238 }, { "epoch": 0.69, "grad_norm": 14.653985023498535, "learning_rate": 3.0173817536445305e-06, "loss": 5.5188, "step": 2239 }, { "epoch": 0.69, "grad_norm": 6.806528568267822, "learning_rate": 3.0117539399681053e-06, "loss": 0.6842, "step": 2240 }, { "epoch": 0.69, "grad_norm": 9.189783096313477, "learning_rate": 3.0061299536097723e-06, "loss": 1.1556, "step": 2241 }, { "epoch": 0.69, "grad_norm": 9.61166763305664, "learning_rate": 3.0005097998997586e-06, "loss": 1.2695, "step": 2242 }, { "epoch": 0.69, "grad_norm": 14.16010856628418, "learning_rate": 2.9948934841646695e-06, "loss": 4.5669, "step": 2243 }, { "epoch": 0.7, "grad_norm": 12.887718200683594, "learning_rate": 2.9892810117274626e-06, "loss": 2.2022, "step": 2244 }, { "epoch": 0.7, "grad_norm": 10.175048828125, "learning_rate": 2.983672387907454e-06, "loss": 1.3168, "step": 2245 }, { "epoch": 0.7, "grad_norm": 14.184016227722168, "learning_rate": 2.97806761802032e-06, "loss": 3.7449, "step": 2246 }, { "epoch": 0.7, "grad_norm": 11.650161743164062, "learning_rate": 2.972466707378079e-06, "loss": 3.1845, "step": 2247 }, { "epoch": 0.7, "grad_norm": 20.005102157592773, "learning_rate": 2.966869661289084e-06, "loss": 4.9742, "step": 2248 }, { "epoch": 0.7, "grad_norm": 12.802370071411133, "learning_rate": 2.9612764850580405e-06, "loss": 2.6339, "step": 2249 }, { "epoch": 0.7, "grad_norm": 11.79102611541748, "learning_rate": 2.955687183985976e-06, "loss": 3.4665, "step": 2250 }, { "epoch": 0.7, "grad_norm": 13.28844165802002, "learning_rate": 2.9501017633702462e-06, "loss": 1.7935, "step": 2251 }, { "epoch": 0.7, "grad_norm": 17.05403709411621, "learning_rate": 2.9445202285045356e-06, "loss": 4.087, "step": 2252 }, { "epoch": 0.7, "grad_norm": 13.470844268798828, "learning_rate": 2.9389425846788387e-06, "loss": 2.7735, "step": 2253 }, { "epoch": 0.7, "grad_norm": 13.51559066772461, "learning_rate": 2.9333688371794643e-06, "loss": 2.176, "step": 2254 }, { "epoch": 0.7, "grad_norm": 12.657980918884277, "learning_rate": 2.9277989912890284e-06, "loss": 1.8321, "step": 2255 }, { "epoch": 0.7, "grad_norm": 10.267308235168457, "learning_rate": 2.922233052286451e-06, "loss": 1.1327, "step": 2256 }, { "epoch": 0.7, "grad_norm": 13.085373878479004, "learning_rate": 2.916671025446944e-06, "loss": 1.5981, "step": 2257 }, { "epoch": 0.7, "grad_norm": 18.13713836669922, "learning_rate": 2.91111291604202e-06, "loss": 6.3258, "step": 2258 }, { "epoch": 0.7, "grad_norm": 15.099580764770508, "learning_rate": 2.9055587293394733e-06, "loss": 1.563, "step": 2259 }, { "epoch": 0.7, "grad_norm": 14.139883995056152, "learning_rate": 2.9000084706033795e-06, "loss": 3.1056, "step": 2260 }, { "epoch": 0.7, "grad_norm": 14.964570045471191, "learning_rate": 2.894462145094093e-06, "loss": 5.665, "step": 2261 }, { "epoch": 0.7, "grad_norm": 19.927221298217773, "learning_rate": 2.8889197580682424e-06, "loss": 2.7615, "step": 2262 }, { "epoch": 0.7, "grad_norm": 19.620908737182617, "learning_rate": 2.883381314778717e-06, "loss": 3.5992, "step": 2263 }, { "epoch": 0.7, "grad_norm": 15.602147102355957, "learning_rate": 2.877846820474679e-06, "loss": 3.0023, "step": 2264 }, { "epoch": 0.7, "grad_norm": 15.21510124206543, "learning_rate": 2.872316280401539e-06, "loss": 3.4408, "step": 2265 }, { "epoch": 0.7, "grad_norm": 13.735747337341309, "learning_rate": 2.86678969980096e-06, "loss": 1.3311, "step": 2266 }, { "epoch": 0.7, "grad_norm": 15.913676261901855, "learning_rate": 2.8612670839108617e-06, "loss": 1.8771, "step": 2267 }, { "epoch": 0.7, "grad_norm": 24.34699249267578, "learning_rate": 2.855748437965395e-06, "loss": 5.8405, "step": 2268 }, { "epoch": 0.7, "grad_norm": 9.142714500427246, "learning_rate": 2.850233767194954e-06, "loss": 1.0924, "step": 2269 }, { "epoch": 0.7, "grad_norm": 12.947314262390137, "learning_rate": 2.8447230768261632e-06, "loss": 4.7454, "step": 2270 }, { "epoch": 0.7, "grad_norm": 11.288124084472656, "learning_rate": 2.8392163720818763e-06, "loss": 1.3408, "step": 2271 }, { "epoch": 0.7, "grad_norm": 11.081491470336914, "learning_rate": 2.8337136581811654e-06, "loss": 1.6393, "step": 2272 }, { "epoch": 0.7, "grad_norm": 16.007856369018555, "learning_rate": 2.8282149403393293e-06, "loss": 1.7916, "step": 2273 }, { "epoch": 0.7, "grad_norm": 14.11226749420166, "learning_rate": 2.8227202237678712e-06, "loss": 2.4778, "step": 2274 }, { "epoch": 0.7, "grad_norm": 8.911184310913086, "learning_rate": 2.8172295136745035e-06, "loss": 1.3889, "step": 2275 }, { "epoch": 0.71, "grad_norm": 14.678910255432129, "learning_rate": 2.811742815263144e-06, "loss": 1.8128, "step": 2276 }, { "epoch": 0.71, "grad_norm": 12.682074546813965, "learning_rate": 2.8062601337339043e-06, "loss": 1.5763, "step": 2277 }, { "epoch": 0.71, "grad_norm": 12.303553581237793, "learning_rate": 2.8007814742830922e-06, "loss": 2.4316, "step": 2278 }, { "epoch": 0.71, "grad_norm": 13.91916275024414, "learning_rate": 2.7953068421032044e-06, "loss": 3.1608, "step": 2279 }, { "epoch": 0.71, "grad_norm": 16.613800048828125, "learning_rate": 2.789836242382919e-06, "loss": 2.4289, "step": 2280 }, { "epoch": 0.71, "grad_norm": 15.66029167175293, "learning_rate": 2.7843696803070877e-06, "loss": 2.2365, "step": 2281 }, { "epoch": 0.71, "grad_norm": 19.280305862426758, "learning_rate": 2.778907161056749e-06, "loss": 1.7019, "step": 2282 }, { "epoch": 0.71, "grad_norm": 14.321356773376465, "learning_rate": 2.7734486898090925e-06, "loss": 1.0882, "step": 2283 }, { "epoch": 0.71, "grad_norm": 9.382092475891113, "learning_rate": 2.76799427173748e-06, "loss": 1.1687, "step": 2284 }, { "epoch": 0.71, "grad_norm": 11.250184059143066, "learning_rate": 2.7625439120114352e-06, "loss": 1.5645, "step": 2285 }, { "epoch": 0.71, "grad_norm": 15.593734741210938, "learning_rate": 2.7570976157966304e-06, "loss": 2.7318, "step": 2286 }, { "epoch": 0.71, "grad_norm": 13.554723739624023, "learning_rate": 2.7516553882548847e-06, "loss": 0.8545, "step": 2287 }, { "epoch": 0.71, "grad_norm": 16.75957489013672, "learning_rate": 2.746217234544169e-06, "loss": 2.6925, "step": 2288 }, { "epoch": 0.71, "grad_norm": 16.92119026184082, "learning_rate": 2.740783159818586e-06, "loss": 2.4305, "step": 2289 }, { "epoch": 0.71, "grad_norm": 14.165618896484375, "learning_rate": 2.7353531692283744e-06, "loss": 1.7753, "step": 2290 }, { "epoch": 0.71, "grad_norm": 20.619165420532227, "learning_rate": 2.7299272679199026e-06, "loss": 6.1848, "step": 2291 }, { "epoch": 0.71, "grad_norm": 12.482929229736328, "learning_rate": 2.724505461035663e-06, "loss": 1.3093, "step": 2292 }, { "epoch": 0.71, "grad_norm": 18.101463317871094, "learning_rate": 2.7190877537142655e-06, "loss": 3.2845, "step": 2293 }, { "epoch": 0.71, "grad_norm": 22.22005844116211, "learning_rate": 2.713674151090442e-06, "loss": 2.3354, "step": 2294 }, { "epoch": 0.71, "grad_norm": 12.901947021484375, "learning_rate": 2.7082646582950244e-06, "loss": 3.2988, "step": 2295 }, { "epoch": 0.71, "grad_norm": 11.771439552307129, "learning_rate": 2.702859280454956e-06, "loss": 2.8109, "step": 2296 }, { "epoch": 0.71, "grad_norm": 20.15504264831543, "learning_rate": 2.6974580226932765e-06, "loss": 2.5444, "step": 2297 }, { "epoch": 0.71, "grad_norm": 24.20153045654297, "learning_rate": 2.6920608901291203e-06, "loss": 1.683, "step": 2298 }, { "epoch": 0.71, "grad_norm": 20.239795684814453, "learning_rate": 2.6866678878777185e-06, "loss": 2.9259, "step": 2299 }, { "epoch": 0.71, "grad_norm": 19.217615127563477, "learning_rate": 2.6812790210503795e-06, "loss": 1.3868, "step": 2300 }, { "epoch": 0.71, "grad_norm": 12.615150451660156, "learning_rate": 2.6758942947544954e-06, "loss": 2.0394, "step": 2301 }, { "epoch": 0.71, "grad_norm": 13.607015609741211, "learning_rate": 2.6705137140935384e-06, "loss": 2.4767, "step": 2302 }, { "epoch": 0.71, "grad_norm": 27.116342544555664, "learning_rate": 2.665137284167047e-06, "loss": 3.3526, "step": 2303 }, { "epoch": 0.71, "grad_norm": 15.009269714355469, "learning_rate": 2.659765010070619e-06, "loss": 2.6932, "step": 2304 }, { "epoch": 0.71, "grad_norm": 12.386222839355469, "learning_rate": 2.654396896895929e-06, "loss": 1.9814, "step": 2305 }, { "epoch": 0.71, "grad_norm": 10.858598709106445, "learning_rate": 2.649032949730697e-06, "loss": 1.7919, "step": 2306 }, { "epoch": 0.71, "grad_norm": 21.328723907470703, "learning_rate": 2.6436731736586954e-06, "loss": 3.7241, "step": 2307 }, { "epoch": 0.72, "grad_norm": 10.507683753967285, "learning_rate": 2.63831757375975e-06, "loss": 1.5231, "step": 2308 }, { "epoch": 0.72, "grad_norm": 11.425612449645996, "learning_rate": 2.6329661551097206e-06, "loss": 1.3472, "step": 2309 }, { "epoch": 0.72, "grad_norm": 14.5155611038208, "learning_rate": 2.627618922780509e-06, "loss": 2.5453, "step": 2310 }, { "epoch": 0.72, "grad_norm": 14.072335243225098, "learning_rate": 2.622275881840047e-06, "loss": 2.5709, "step": 2311 }, { "epoch": 0.72, "grad_norm": 11.811149597167969, "learning_rate": 2.616937037352295e-06, "loss": 1.7393, "step": 2312 }, { "epoch": 0.72, "grad_norm": 15.45902156829834, "learning_rate": 2.611602394377233e-06, "loss": 5.5844, "step": 2313 }, { "epoch": 0.72, "grad_norm": 10.028008460998535, "learning_rate": 2.6062719579708676e-06, "loss": 1.6212, "step": 2314 }, { "epoch": 0.72, "grad_norm": 15.309538841247559, "learning_rate": 2.6009457331852094e-06, "loss": 2.0222, "step": 2315 }, { "epoch": 0.72, "grad_norm": 12.404561996459961, "learning_rate": 2.5956237250682816e-06, "loss": 2.1896, "step": 2316 }, { "epoch": 0.72, "grad_norm": 12.74881362915039, "learning_rate": 2.5903059386641104e-06, "loss": 1.7404, "step": 2317 }, { "epoch": 0.72, "grad_norm": 11.10749340057373, "learning_rate": 2.584992379012719e-06, "loss": 1.5973, "step": 2318 }, { "epoch": 0.72, "grad_norm": 10.463422775268555, "learning_rate": 2.5796830511501252e-06, "loss": 1.3745, "step": 2319 }, { "epoch": 0.72, "grad_norm": 10.964384078979492, "learning_rate": 2.574377960108342e-06, "loss": 1.6189, "step": 2320 }, { "epoch": 0.72, "grad_norm": 11.361332893371582, "learning_rate": 2.5690771109153587e-06, "loss": 1.3829, "step": 2321 }, { "epoch": 0.72, "grad_norm": 17.673812866210938, "learning_rate": 2.5637805085951448e-06, "loss": 2.4582, "step": 2322 }, { "epoch": 0.72, "grad_norm": 18.889476776123047, "learning_rate": 2.558488158167653e-06, "loss": 4.2102, "step": 2323 }, { "epoch": 0.72, "grad_norm": 23.341211318969727, "learning_rate": 2.5532000646488008e-06, "loss": 2.0677, "step": 2324 }, { "epoch": 0.72, "grad_norm": 12.450510025024414, "learning_rate": 2.547916233050463e-06, "loss": 1.6574, "step": 2325 }, { "epoch": 0.72, "grad_norm": 12.01343822479248, "learning_rate": 2.542636668380491e-06, "loss": 1.1428, "step": 2326 }, { "epoch": 0.72, "grad_norm": 23.03483009338379, "learning_rate": 2.537361375642682e-06, "loss": 2.9922, "step": 2327 }, { "epoch": 0.72, "grad_norm": 11.131219863891602, "learning_rate": 2.5320903598367836e-06, "loss": 1.4113, "step": 2328 }, { "epoch": 0.72, "grad_norm": 14.456703186035156, "learning_rate": 2.526823625958498e-06, "loss": 2.5972, "step": 2329 }, { "epoch": 0.72, "grad_norm": 11.621821403503418, "learning_rate": 2.5215611789994617e-06, "loss": 2.6253, "step": 2330 }, { "epoch": 0.72, "grad_norm": 14.902911186218262, "learning_rate": 2.51630302394725e-06, "loss": 3.9074, "step": 2331 }, { "epoch": 0.72, "grad_norm": 16.977209091186523, "learning_rate": 2.5110491657853716e-06, "loss": 4.3072, "step": 2332 }, { "epoch": 0.72, "grad_norm": 16.70897674560547, "learning_rate": 2.5057996094932624e-06, "loss": 2.3071, "step": 2333 }, { "epoch": 0.72, "grad_norm": 14.035141944885254, "learning_rate": 2.500554360046278e-06, "loss": 3.3543, "step": 2334 }, { "epoch": 0.72, "grad_norm": 10.244784355163574, "learning_rate": 2.4953134224156995e-06, "loss": 1.8466, "step": 2335 }, { "epoch": 0.72, "grad_norm": 14.622149467468262, "learning_rate": 2.4900768015687146e-06, "loss": 4.5747, "step": 2336 }, { "epoch": 0.72, "grad_norm": 17.696693420410156, "learning_rate": 2.4848445024684204e-06, "loss": 5.5025, "step": 2337 }, { "epoch": 0.72, "grad_norm": 9.789216995239258, "learning_rate": 2.4796165300738265e-06, "loss": 0.898, "step": 2338 }, { "epoch": 0.72, "grad_norm": 18.043411254882812, "learning_rate": 2.4743928893398266e-06, "loss": 1.8609, "step": 2339 }, { "epoch": 0.73, "grad_norm": 12.23833179473877, "learning_rate": 2.469173585217219e-06, "loss": 2.3504, "step": 2340 }, { "epoch": 0.73, "grad_norm": 16.880558013916016, "learning_rate": 2.4639586226526928e-06, "loss": 5.6096, "step": 2341 }, { "epoch": 0.73, "grad_norm": 15.475369453430176, "learning_rate": 2.458748006588819e-06, "loss": 4.6972, "step": 2342 }, { "epoch": 0.73, "grad_norm": 15.600180625915527, "learning_rate": 2.4535417419640467e-06, "loss": 2.1673, "step": 2343 }, { "epoch": 0.73, "grad_norm": 18.367475509643555, "learning_rate": 2.448339833712709e-06, "loss": 1.7335, "step": 2344 }, { "epoch": 0.73, "grad_norm": 18.15220832824707, "learning_rate": 2.4431422867650023e-06, "loss": 3.1931, "step": 2345 }, { "epoch": 0.73, "grad_norm": 15.09410572052002, "learning_rate": 2.4379491060469934e-06, "loss": 2.4334, "step": 2346 }, { "epoch": 0.73, "grad_norm": 11.841636657714844, "learning_rate": 2.432760296480609e-06, "loss": 2.0151, "step": 2347 }, { "epoch": 0.73, "grad_norm": 14.133190155029297, "learning_rate": 2.427575862983633e-06, "loss": 1.5586, "step": 2348 }, { "epoch": 0.73, "grad_norm": 10.86910343170166, "learning_rate": 2.422395810469702e-06, "loss": 3.1122, "step": 2349 }, { "epoch": 0.73, "grad_norm": 13.083471298217773, "learning_rate": 2.4172201438483046e-06, "loss": 2.3513, "step": 2350 }, { "epoch": 0.73, "grad_norm": 16.80009651184082, "learning_rate": 2.412048868024767e-06, "loss": 7.2501, "step": 2351 }, { "epoch": 0.73, "grad_norm": 15.160103797912598, "learning_rate": 2.406881987900256e-06, "loss": 3.9078, "step": 2352 }, { "epoch": 0.73, "grad_norm": 25.479278564453125, "learning_rate": 2.401719508371772e-06, "loss": 2.1761, "step": 2353 }, { "epoch": 0.73, "grad_norm": 15.644292831420898, "learning_rate": 2.396561434332146e-06, "loss": 2.5117, "step": 2354 }, { "epoch": 0.73, "grad_norm": 9.581793785095215, "learning_rate": 2.3914077706700286e-06, "loss": 1.1493, "step": 2355 }, { "epoch": 0.73, "grad_norm": 13.066975593566895, "learning_rate": 2.3862585222699007e-06, "loss": 1.4776, "step": 2356 }, { "epoch": 0.73, "grad_norm": 10.616497993469238, "learning_rate": 2.381113694012049e-06, "loss": 1.3081, "step": 2357 }, { "epoch": 0.73, "grad_norm": 10.0732421875, "learning_rate": 2.3759732907725716e-06, "loss": 1.9863, "step": 2358 }, { "epoch": 0.73, "grad_norm": 10.824091911315918, "learning_rate": 2.370837317423383e-06, "loss": 1.1054, "step": 2359 }, { "epoch": 0.73, "grad_norm": 23.744216918945312, "learning_rate": 2.365705778832184e-06, "loss": 2.0633, "step": 2360 }, { "epoch": 0.73, "grad_norm": 11.767815589904785, "learning_rate": 2.3605786798624796e-06, "loss": 1.481, "step": 2361 }, { "epoch": 0.73, "grad_norm": 23.761098861694336, "learning_rate": 2.3554560253735722e-06, "loss": 3.4828, "step": 2362 }, { "epoch": 0.73, "grad_norm": 24.555688858032227, "learning_rate": 2.350337820220545e-06, "loss": 1.5381, "step": 2363 }, { "epoch": 0.73, "grad_norm": 22.43702507019043, "learning_rate": 2.3452240692542623e-06, "loss": 6.4765, "step": 2364 }, { "epoch": 0.73, "grad_norm": 8.012927055358887, "learning_rate": 2.340114777321377e-06, "loss": 1.3097, "step": 2365 }, { "epoch": 0.73, "grad_norm": 17.169174194335938, "learning_rate": 2.335009949264306e-06, "loss": 5.5977, "step": 2366 }, { "epoch": 0.73, "grad_norm": 16.629274368286133, "learning_rate": 2.3299095899212405e-06, "loss": 2.5058, "step": 2367 }, { "epoch": 0.73, "grad_norm": 12.617037773132324, "learning_rate": 2.324813704126134e-06, "loss": 3.6558, "step": 2368 }, { "epoch": 0.73, "grad_norm": 13.532926559448242, "learning_rate": 2.3197222967087014e-06, "loss": 1.8785, "step": 2369 }, { "epoch": 0.73, "grad_norm": 15.670842170715332, "learning_rate": 2.3146353724944096e-06, "loss": 2.7407, "step": 2370 }, { "epoch": 0.73, "grad_norm": 13.534579277038574, "learning_rate": 2.3095529363044857e-06, "loss": 1.2841, "step": 2371 }, { "epoch": 0.73, "grad_norm": 19.370758056640625, "learning_rate": 2.304474992955894e-06, "loss": 2.2037, "step": 2372 }, { "epoch": 0.74, "grad_norm": 12.927643775939941, "learning_rate": 2.299401547261344e-06, "loss": 1.8976, "step": 2373 }, { "epoch": 0.74, "grad_norm": 19.296655654907227, "learning_rate": 2.2943326040292827e-06, "loss": 1.4712, "step": 2374 }, { "epoch": 0.74, "grad_norm": 17.31259536743164, "learning_rate": 2.2892681680638883e-06, "loss": 4.794, "step": 2375 }, { "epoch": 0.74, "grad_norm": 14.219792366027832, "learning_rate": 2.284208244165067e-06, "loss": 2.6991, "step": 2376 }, { "epoch": 0.74, "grad_norm": 11.96985149383545, "learning_rate": 2.2791528371284547e-06, "loss": 1.4655, "step": 2377 }, { "epoch": 0.74, "grad_norm": 10.814947128295898, "learning_rate": 2.2741019517453987e-06, "loss": 1.4603, "step": 2378 }, { "epoch": 0.74, "grad_norm": 14.676878929138184, "learning_rate": 2.269055592802961e-06, "loss": 3.1338, "step": 2379 }, { "epoch": 0.74, "grad_norm": 10.400547981262207, "learning_rate": 2.2640137650839237e-06, "loss": 1.724, "step": 2380 }, { "epoch": 0.74, "grad_norm": 10.442301750183105, "learning_rate": 2.2589764733667593e-06, "loss": 1.5981, "step": 2381 }, { "epoch": 0.74, "grad_norm": 15.594927787780762, "learning_rate": 2.2539437224256494e-06, "loss": 4.5581, "step": 2382 }, { "epoch": 0.74, "grad_norm": 17.237245559692383, "learning_rate": 2.248915517030477e-06, "loss": 4.9534, "step": 2383 }, { "epoch": 0.74, "grad_norm": 14.386374473571777, "learning_rate": 2.2438918619468073e-06, "loss": 2.1693, "step": 2384 }, { "epoch": 0.74, "grad_norm": 18.36848258972168, "learning_rate": 2.2388727619358956e-06, "loss": 2.2726, "step": 2385 }, { "epoch": 0.74, "grad_norm": 10.761434555053711, "learning_rate": 2.2338582217546863e-06, "loss": 1.4383, "step": 2386 }, { "epoch": 0.74, "grad_norm": 13.74380111694336, "learning_rate": 2.228848246155794e-06, "loss": 1.5859, "step": 2387 }, { "epoch": 0.74, "grad_norm": 9.30272102355957, "learning_rate": 2.223842839887511e-06, "loss": 0.9477, "step": 2388 }, { "epoch": 0.74, "grad_norm": 15.387125015258789, "learning_rate": 2.218842007693798e-06, "loss": 4.0006, "step": 2389 }, { "epoch": 0.74, "grad_norm": 10.58398723602295, "learning_rate": 2.2138457543142815e-06, "loss": 1.5678, "step": 2390 }, { "epoch": 0.74, "grad_norm": 16.065271377563477, "learning_rate": 2.2088540844842454e-06, "loss": 4.272, "step": 2391 }, { "epoch": 0.74, "grad_norm": 16.36202621459961, "learning_rate": 2.2038670029346358e-06, "loss": 3.4879, "step": 2392 }, { "epoch": 0.74, "grad_norm": 15.290525436401367, "learning_rate": 2.1988845143920443e-06, "loss": 1.4566, "step": 2393 }, { "epoch": 0.74, "grad_norm": 13.585494995117188, "learning_rate": 2.1939066235787106e-06, "loss": 2.9677, "step": 2394 }, { "epoch": 0.74, "grad_norm": 21.19524574279785, "learning_rate": 2.1889333352125187e-06, "loss": 7.8869, "step": 2395 }, { "epoch": 0.74, "grad_norm": 12.337952613830566, "learning_rate": 2.183964654006988e-06, "loss": 1.1113, "step": 2396 }, { "epoch": 0.74, "grad_norm": 19.695003509521484, "learning_rate": 2.1790005846712713e-06, "loss": 2.2414, "step": 2397 }, { "epoch": 0.74, "grad_norm": 15.903563499450684, "learning_rate": 2.174041131910156e-06, "loss": 4.9721, "step": 2398 }, { "epoch": 0.74, "grad_norm": 13.483502388000488, "learning_rate": 2.169086300424048e-06, "loss": 2.0157, "step": 2399 }, { "epoch": 0.74, "grad_norm": 18.105079650878906, "learning_rate": 2.1641360949089727e-06, "loss": 5.2384, "step": 2400 }, { "epoch": 0.74, "grad_norm": 14.571303367614746, "learning_rate": 2.15919052005658e-06, "loss": 1.2627, "step": 2401 }, { "epoch": 0.74, "grad_norm": 29.374582290649414, "learning_rate": 2.1542495805541187e-06, "loss": 3.0239, "step": 2402 }, { "epoch": 0.74, "grad_norm": 18.31540298461914, "learning_rate": 2.1493132810844506e-06, "loss": 6.1591, "step": 2403 }, { "epoch": 0.74, "grad_norm": 12.622870445251465, "learning_rate": 2.1443816263260427e-06, "loss": 2.1064, "step": 2404 }, { "epoch": 0.75, "grad_norm": 21.38667106628418, "learning_rate": 2.139454620952956e-06, "loss": 3.1859, "step": 2405 }, { "epoch": 0.75, "grad_norm": 12.097481727600098, "learning_rate": 2.1345322696348416e-06, "loss": 1.2453, "step": 2406 }, { "epoch": 0.75, "grad_norm": 16.879291534423828, "learning_rate": 2.1296145770369503e-06, "loss": 1.3218, "step": 2407 }, { "epoch": 0.75, "grad_norm": 16.653841018676758, "learning_rate": 2.1247015478201074e-06, "loss": 1.2542, "step": 2408 }, { "epoch": 0.75, "grad_norm": 10.436256408691406, "learning_rate": 2.1197931866407225e-06, "loss": 1.5462, "step": 2409 }, { "epoch": 0.75, "grad_norm": 11.92463207244873, "learning_rate": 2.1148894981507804e-06, "loss": 1.1283, "step": 2410 }, { "epoch": 0.75, "grad_norm": 18.83355712890625, "learning_rate": 2.109990486997837e-06, "loss": 2.3514, "step": 2411 }, { "epoch": 0.75, "grad_norm": 9.721569061279297, "learning_rate": 2.1050961578250142e-06, "loss": 1.099, "step": 2412 }, { "epoch": 0.75, "grad_norm": 14.510601043701172, "learning_rate": 2.100206515271002e-06, "loss": 1.9727, "step": 2413 }, { "epoch": 0.75, "grad_norm": 14.138338088989258, "learning_rate": 2.0953215639700404e-06, "loss": 2.0488, "step": 2414 }, { "epoch": 0.75, "grad_norm": 15.770391464233398, "learning_rate": 2.0904413085519295e-06, "loss": 3.2804, "step": 2415 }, { "epoch": 0.75, "grad_norm": 11.164030075073242, "learning_rate": 2.0855657536420156e-06, "loss": 1.9102, "step": 2416 }, { "epoch": 0.75, "grad_norm": 15.141904830932617, "learning_rate": 2.0806949038611903e-06, "loss": 2.0403, "step": 2417 }, { "epoch": 0.75, "grad_norm": 14.000855445861816, "learning_rate": 2.0758287638258846e-06, "loss": 2.5367, "step": 2418 }, { "epoch": 0.75, "grad_norm": 23.206893920898438, "learning_rate": 2.070967338148072e-06, "loss": 2.5601, "step": 2419 }, { "epoch": 0.75, "grad_norm": 16.114166259765625, "learning_rate": 2.0661106314352506e-06, "loss": 2.7923, "step": 2420 }, { "epoch": 0.75, "grad_norm": 11.76025676727295, "learning_rate": 2.061258648290447e-06, "loss": 1.8805, "step": 2421 }, { "epoch": 0.75, "grad_norm": 13.431325912475586, "learning_rate": 2.0564113933122185e-06, "loss": 2.7144, "step": 2422 }, { "epoch": 0.75, "grad_norm": 20.068647384643555, "learning_rate": 2.0515688710946337e-06, "loss": 4.914, "step": 2423 }, { "epoch": 0.75, "grad_norm": 24.792346954345703, "learning_rate": 2.0467310862272714e-06, "loss": 5.9553, "step": 2424 }, { "epoch": 0.75, "grad_norm": 12.623862266540527, "learning_rate": 2.0418980432952343e-06, "loss": 2.0634, "step": 2425 }, { "epoch": 0.75, "grad_norm": 12.7319974899292, "learning_rate": 2.0370697468791187e-06, "loss": 3.1398, "step": 2426 }, { "epoch": 0.75, "grad_norm": 15.475244522094727, "learning_rate": 2.032246201555026e-06, "loss": 2.2193, "step": 2427 }, { "epoch": 0.75, "grad_norm": 14.422100067138672, "learning_rate": 2.0274274118945586e-06, "loss": 3.1081, "step": 2428 }, { "epoch": 0.75, "grad_norm": 13.511394500732422, "learning_rate": 2.022613382464808e-06, "loss": 2.309, "step": 2429 }, { "epoch": 0.75, "grad_norm": 7.807738780975342, "learning_rate": 2.017804117828353e-06, "loss": 1.4154, "step": 2430 }, { "epoch": 0.75, "grad_norm": 11.46981430053711, "learning_rate": 2.0129996225432587e-06, "loss": 2.2389, "step": 2431 }, { "epoch": 0.75, "grad_norm": 10.154441833496094, "learning_rate": 2.0081999011630687e-06, "loss": 1.2579, "step": 2432 }, { "epoch": 0.75, "grad_norm": 9.019061088562012, "learning_rate": 2.003404958236801e-06, "loss": 1.5883, "step": 2433 }, { "epoch": 0.75, "grad_norm": 13.277936935424805, "learning_rate": 1.99861479830895e-06, "loss": 4.5607, "step": 2434 }, { "epoch": 0.75, "grad_norm": 15.37705135345459, "learning_rate": 1.9938294259194714e-06, "loss": 4.9968, "step": 2435 }, { "epoch": 0.75, "grad_norm": 15.416961669921875, "learning_rate": 1.989048845603786e-06, "loss": 2.2994, "step": 2436 }, { "epoch": 0.76, "grad_norm": 10.432685852050781, "learning_rate": 1.984273061892771e-06, "loss": 1.6858, "step": 2437 }, { "epoch": 0.76, "grad_norm": 11.396636009216309, "learning_rate": 1.979502079312759e-06, "loss": 1.3837, "step": 2438 }, { "epoch": 0.76, "grad_norm": 12.962885856628418, "learning_rate": 1.9747359023855308e-06, "loss": 2.4708, "step": 2439 }, { "epoch": 0.76, "grad_norm": 12.278225898742676, "learning_rate": 1.969974535628316e-06, "loss": 3.5953, "step": 2440 }, { "epoch": 0.76, "grad_norm": 9.124987602233887, "learning_rate": 1.965217983553783e-06, "loss": 0.9435, "step": 2441 }, { "epoch": 0.76, "grad_norm": 14.870964050292969, "learning_rate": 1.9604662506700326e-06, "loss": 2.4343, "step": 2442 }, { "epoch": 0.76, "grad_norm": 12.670650482177734, "learning_rate": 1.955719341480609e-06, "loss": 3.1242, "step": 2443 }, { "epoch": 0.76, "grad_norm": 16.715402603149414, "learning_rate": 1.950977260484476e-06, "loss": 1.8638, "step": 2444 }, { "epoch": 0.76, "grad_norm": 15.901594161987305, "learning_rate": 1.9462400121760185e-06, "loss": 2.4353, "step": 2445 }, { "epoch": 0.76, "grad_norm": 10.394169807434082, "learning_rate": 1.9415076010450515e-06, "loss": 1.7736, "step": 2446 }, { "epoch": 0.76, "grad_norm": 11.738875389099121, "learning_rate": 1.936780031576799e-06, "loss": 1.1602, "step": 2447 }, { "epoch": 0.76, "grad_norm": 17.010435104370117, "learning_rate": 1.9320573082518947e-06, "loss": 5.2137, "step": 2448 }, { "epoch": 0.76, "grad_norm": 12.7792329788208, "learning_rate": 1.9273394355463864e-06, "loss": 1.8585, "step": 2449 }, { "epoch": 0.76, "grad_norm": 11.063002586364746, "learning_rate": 1.9226264179317185e-06, "loss": 2.4567, "step": 2450 }, { "epoch": 0.76, "grad_norm": 12.918591499328613, "learning_rate": 1.917918259874735e-06, "loss": 0.7596, "step": 2451 }, { "epoch": 0.76, "grad_norm": 15.485755920410156, "learning_rate": 1.9132149658376742e-06, "loss": 2.6869, "step": 2452 }, { "epoch": 0.76, "grad_norm": 9.972342491149902, "learning_rate": 1.908516540278166e-06, "loss": 1.1617, "step": 2453 }, { "epoch": 0.76, "grad_norm": 22.043930053710938, "learning_rate": 1.903822987649223e-06, "loss": 2.5623, "step": 2454 }, { "epoch": 0.76, "grad_norm": 10.817049980163574, "learning_rate": 1.899134312399245e-06, "loss": 1.2553, "step": 2455 }, { "epoch": 0.76, "grad_norm": 11.989843368530273, "learning_rate": 1.8944505189720034e-06, "loss": 1.54, "step": 2456 }, { "epoch": 0.76, "grad_norm": 15.252662658691406, "learning_rate": 1.889771611806647e-06, "loss": 3.6877, "step": 2457 }, { "epoch": 0.76, "grad_norm": 13.54154109954834, "learning_rate": 1.88509759533769e-06, "loss": 1.6751, "step": 2458 }, { "epoch": 0.76, "grad_norm": 13.486143112182617, "learning_rate": 1.8804284739950129e-06, "loss": 3.4475, "step": 2459 }, { "epoch": 0.76, "grad_norm": 9.09095287322998, "learning_rate": 1.8757642522038544e-06, "loss": 1.2124, "step": 2460 }, { "epoch": 0.76, "grad_norm": 10.430471420288086, "learning_rate": 1.8711049343848184e-06, "loss": 1.4979, "step": 2461 }, { "epoch": 0.76, "grad_norm": 13.80158805847168, "learning_rate": 1.8664505249538497e-06, "loss": 1.4916, "step": 2462 }, { "epoch": 0.76, "grad_norm": 13.338501930236816, "learning_rate": 1.8618010283222455e-06, "loss": 1.3531, "step": 2463 }, { "epoch": 0.76, "grad_norm": 17.377933502197266, "learning_rate": 1.8571564488966517e-06, "loss": 2.4318, "step": 2464 }, { "epoch": 0.76, "grad_norm": 14.895021438598633, "learning_rate": 1.8525167910790465e-06, "loss": 5.0413, "step": 2465 }, { "epoch": 0.76, "grad_norm": 15.143299102783203, "learning_rate": 1.8478820592667463e-06, "loss": 5.4828, "step": 2466 }, { "epoch": 0.76, "grad_norm": 11.624774932861328, "learning_rate": 1.8432522578523997e-06, "loss": 2.1534, "step": 2467 }, { "epoch": 0.76, "grad_norm": 13.456793785095215, "learning_rate": 1.8386273912239786e-06, "loss": 1.6974, "step": 2468 }, { "epoch": 0.76, "grad_norm": 15.70180606842041, "learning_rate": 1.8340074637647865e-06, "loss": 6.4944, "step": 2469 }, { "epoch": 0.77, "grad_norm": 9.85049057006836, "learning_rate": 1.8293924798534375e-06, "loss": 1.6007, "step": 2470 }, { "epoch": 0.77, "grad_norm": 16.042861938476562, "learning_rate": 1.824782443863862e-06, "loss": 1.3233, "step": 2471 }, { "epoch": 0.77, "grad_norm": 9.435417175292969, "learning_rate": 1.8201773601653036e-06, "loss": 1.0364, "step": 2472 }, { "epoch": 0.77, "grad_norm": 6.530979156494141, "learning_rate": 1.8155772331223098e-06, "loss": 0.9042, "step": 2473 }, { "epoch": 0.77, "grad_norm": 8.234294891357422, "learning_rate": 1.8109820670947296e-06, "loss": 1.4322, "step": 2474 }, { "epoch": 0.77, "grad_norm": 9.820496559143066, "learning_rate": 1.8063918664377154e-06, "loss": 1.0702, "step": 2475 }, { "epoch": 0.77, "grad_norm": 11.390840530395508, "learning_rate": 1.8018066355017072e-06, "loss": 1.2658, "step": 2476 }, { "epoch": 0.77, "grad_norm": 18.800928115844727, "learning_rate": 1.7972263786324366e-06, "loss": 6.5576, "step": 2477 }, { "epoch": 0.77, "grad_norm": 17.33761978149414, "learning_rate": 1.792651100170926e-06, "loss": 4.825, "step": 2478 }, { "epoch": 0.77, "grad_norm": 16.361677169799805, "learning_rate": 1.788080804453474e-06, "loss": 1.3772, "step": 2479 }, { "epoch": 0.77, "grad_norm": 22.153621673583984, "learning_rate": 1.7835154958116536e-06, "loss": 5.4864, "step": 2480 }, { "epoch": 0.77, "grad_norm": 14.149209976196289, "learning_rate": 1.7789551785723201e-06, "loss": 2.4508, "step": 2481 }, { "epoch": 0.77, "grad_norm": 14.432504653930664, "learning_rate": 1.7743998570575935e-06, "loss": 1.1354, "step": 2482 }, { "epoch": 0.77, "grad_norm": 16.748077392578125, "learning_rate": 1.7698495355848553e-06, "loss": 2.4884, "step": 2483 }, { "epoch": 0.77, "grad_norm": 18.71636199951172, "learning_rate": 1.7653042184667574e-06, "loss": 5.2609, "step": 2484 }, { "epoch": 0.77, "grad_norm": 14.906200408935547, "learning_rate": 1.7607639100112006e-06, "loss": 2.6443, "step": 2485 }, { "epoch": 0.77, "grad_norm": 10.056180953979492, "learning_rate": 1.7562286145213414e-06, "loss": 1.2123, "step": 2486 }, { "epoch": 0.77, "grad_norm": 10.615121841430664, "learning_rate": 1.751698336295585e-06, "loss": 1.2911, "step": 2487 }, { "epoch": 0.77, "grad_norm": 9.8436861038208, "learning_rate": 1.747173079627582e-06, "loss": 1.458, "step": 2488 }, { "epoch": 0.77, "grad_norm": 16.317079544067383, "learning_rate": 1.7426528488062211e-06, "loss": 1.7317, "step": 2489 }, { "epoch": 0.77, "grad_norm": 10.851561546325684, "learning_rate": 1.7381376481156337e-06, "loss": 1.1819, "step": 2490 }, { "epoch": 0.77, "grad_norm": 8.39650821685791, "learning_rate": 1.7336274818351788e-06, "loss": 1.8603, "step": 2491 }, { "epoch": 0.77, "grad_norm": 12.600789070129395, "learning_rate": 1.7291223542394436e-06, "loss": 3.2663, "step": 2492 }, { "epoch": 0.77, "grad_norm": 12.00397777557373, "learning_rate": 1.7246222695982422e-06, "loss": 1.7545, "step": 2493 }, { "epoch": 0.77, "grad_norm": 18.165191650390625, "learning_rate": 1.7201272321766084e-06, "loss": 4.9447, "step": 2494 }, { "epoch": 0.77, "grad_norm": 9.986980438232422, "learning_rate": 1.7156372462347903e-06, "loss": 1.6906, "step": 2495 }, { "epoch": 0.77, "grad_norm": 19.583627700805664, "learning_rate": 1.711152316028254e-06, "loss": 4.1174, "step": 2496 }, { "epoch": 0.77, "grad_norm": 15.571231842041016, "learning_rate": 1.7066724458076684e-06, "loss": 2.7608, "step": 2497 }, { "epoch": 0.77, "grad_norm": 24.01877784729004, "learning_rate": 1.7021976398189062e-06, "loss": 3.5909, "step": 2498 }, { "epoch": 0.77, "grad_norm": 12.039501190185547, "learning_rate": 1.697727902303047e-06, "loss": 2.7924, "step": 2499 }, { "epoch": 0.77, "grad_norm": 16.051227569580078, "learning_rate": 1.6932632374963631e-06, "loss": 2.0519, "step": 2500 }, { "epoch": 0.77, "grad_norm": 18.50823211669922, "learning_rate": 1.6888036496303116e-06, "loss": 4.4319, "step": 2501 }, { "epoch": 0.78, "grad_norm": 21.07451057434082, "learning_rate": 1.6843491429315508e-06, "loss": 2.6907, "step": 2502 }, { "epoch": 0.78, "grad_norm": 15.278072357177734, "learning_rate": 1.679899721621915e-06, "loss": 1.422, "step": 2503 }, { "epoch": 0.78, "grad_norm": 12.383358001708984, "learning_rate": 1.6754553899184191e-06, "loss": 2.0169, "step": 2504 }, { "epoch": 0.78, "grad_norm": 16.6717529296875, "learning_rate": 1.6710161520332601e-06, "loss": 3.1959, "step": 2505 }, { "epoch": 0.78, "grad_norm": 8.391064643859863, "learning_rate": 1.666582012173801e-06, "loss": 0.8575, "step": 2506 }, { "epoch": 0.78, "grad_norm": 15.009490966796875, "learning_rate": 1.6621529745425744e-06, "loss": 2.3967, "step": 2507 }, { "epoch": 0.78, "grad_norm": 9.876742362976074, "learning_rate": 1.6577290433372782e-06, "loss": 2.0324, "step": 2508 }, { "epoch": 0.78, "grad_norm": 18.65607261657715, "learning_rate": 1.6533102227507705e-06, "loss": 3.8017, "step": 2509 }, { "epoch": 0.78, "grad_norm": 11.521139144897461, "learning_rate": 1.6488965169710638e-06, "loss": 1.4466, "step": 2510 }, { "epoch": 0.78, "grad_norm": 10.26103687286377, "learning_rate": 1.6444879301813294e-06, "loss": 2.4333, "step": 2511 }, { "epoch": 0.78, "grad_norm": 9.338981628417969, "learning_rate": 1.6400844665598781e-06, "loss": 1.7815, "step": 2512 }, { "epoch": 0.78, "grad_norm": 10.998225212097168, "learning_rate": 1.635686130280171e-06, "loss": 1.176, "step": 2513 }, { "epoch": 0.78, "grad_norm": 13.324702262878418, "learning_rate": 1.631292925510807e-06, "loss": 0.9974, "step": 2514 }, { "epoch": 0.78, "grad_norm": 11.276659965515137, "learning_rate": 1.6269048564155235e-06, "loss": 0.9944, "step": 2515 }, { "epoch": 0.78, "grad_norm": 14.636279106140137, "learning_rate": 1.6225219271531878e-06, "loss": 1.7558, "step": 2516 }, { "epoch": 0.78, "grad_norm": 20.32813262939453, "learning_rate": 1.6181441418777997e-06, "loss": 1.9024, "step": 2517 }, { "epoch": 0.78, "grad_norm": 7.731251239776611, "learning_rate": 1.6137715047384803e-06, "loss": 1.06, "step": 2518 }, { "epoch": 0.78, "grad_norm": 13.945781707763672, "learning_rate": 1.609404019879471e-06, "loss": 2.0694, "step": 2519 }, { "epoch": 0.78, "grad_norm": 12.413304328918457, "learning_rate": 1.6050416914401355e-06, "loss": 3.1534, "step": 2520 }, { "epoch": 0.78, "grad_norm": 14.440204620361328, "learning_rate": 1.6006845235549472e-06, "loss": 1.7909, "step": 2521 }, { "epoch": 0.78, "grad_norm": 13.835144996643066, "learning_rate": 1.5963325203534811e-06, "loss": 1.6483, "step": 2522 }, { "epoch": 0.78, "grad_norm": 12.865160942077637, "learning_rate": 1.5919856859604304e-06, "loss": 1.72, "step": 2523 }, { "epoch": 0.78, "grad_norm": 10.06251049041748, "learning_rate": 1.587644024495581e-06, "loss": 1.3708, "step": 2524 }, { "epoch": 0.78, "grad_norm": 13.970108032226562, "learning_rate": 1.583307540073816e-06, "loss": 1.6351, "step": 2525 }, { "epoch": 0.78, "grad_norm": 14.133593559265137, "learning_rate": 1.578976236805119e-06, "loss": 1.863, "step": 2526 }, { "epoch": 0.78, "grad_norm": 11.517053604125977, "learning_rate": 1.5746501187945555e-06, "loss": 1.1233, "step": 2527 }, { "epoch": 0.78, "grad_norm": 13.115801811218262, "learning_rate": 1.5703291901422795e-06, "loss": 1.1439, "step": 2528 }, { "epoch": 0.78, "grad_norm": 15.460615158081055, "learning_rate": 1.5660134549435257e-06, "loss": 1.745, "step": 2529 }, { "epoch": 0.78, "grad_norm": 13.269757270812988, "learning_rate": 1.561702917288609e-06, "loss": 1.1088, "step": 2530 }, { "epoch": 0.78, "grad_norm": 15.581395149230957, "learning_rate": 1.5573975812629126e-06, "loss": 2.7008, "step": 2531 }, { "epoch": 0.78, "grad_norm": 15.502326965332031, "learning_rate": 1.5530974509468995e-06, "loss": 5.0785, "step": 2532 }, { "epoch": 0.78, "grad_norm": 19.524160385131836, "learning_rate": 1.5488025304160903e-06, "loss": 4.5823, "step": 2533 }, { "epoch": 0.79, "grad_norm": 13.589733123779297, "learning_rate": 1.5445128237410704e-06, "loss": 2.689, "step": 2534 }, { "epoch": 0.79, "grad_norm": 13.67539119720459, "learning_rate": 1.5402283349874845e-06, "loss": 2.7477, "step": 2535 }, { "epoch": 0.79, "grad_norm": 13.214372634887695, "learning_rate": 1.5359490682160318e-06, "loss": 3.1485, "step": 2536 }, { "epoch": 0.79, "grad_norm": 18.086790084838867, "learning_rate": 1.5316750274824598e-06, "loss": 2.002, "step": 2537 }, { "epoch": 0.79, "grad_norm": 15.74965763092041, "learning_rate": 1.5274062168375691e-06, "loss": 1.3451, "step": 2538 }, { "epoch": 0.79, "grad_norm": 16.10420036315918, "learning_rate": 1.5231426403271976e-06, "loss": 3.2361, "step": 2539 }, { "epoch": 0.79, "grad_norm": 15.154731750488281, "learning_rate": 1.5188843019922215e-06, "loss": 1.6669, "step": 2540 }, { "epoch": 0.79, "grad_norm": 10.527267456054688, "learning_rate": 1.5146312058685597e-06, "loss": 1.3462, "step": 2541 }, { "epoch": 0.79, "grad_norm": 10.436562538146973, "learning_rate": 1.5103833559871566e-06, "loss": 2.5334, "step": 2542 }, { "epoch": 0.79, "grad_norm": 18.57342529296875, "learning_rate": 1.5061407563739853e-06, "loss": 2.7942, "step": 2543 }, { "epoch": 0.79, "grad_norm": 13.763381958007812, "learning_rate": 1.5019034110500433e-06, "loss": 2.4587, "step": 2544 }, { "epoch": 0.79, "grad_norm": 13.633790969848633, "learning_rate": 1.4976713240313496e-06, "loss": 2.1891, "step": 2545 }, { "epoch": 0.79, "grad_norm": 12.143424987792969, "learning_rate": 1.4934444993289352e-06, "loss": 2.2721, "step": 2546 }, { "epoch": 0.79, "grad_norm": 13.036821365356445, "learning_rate": 1.489222940948852e-06, "loss": 1.8695, "step": 2547 }, { "epoch": 0.79, "grad_norm": 9.992802619934082, "learning_rate": 1.485006652892153e-06, "loss": 1.069, "step": 2548 }, { "epoch": 0.79, "grad_norm": 10.13967514038086, "learning_rate": 1.4807956391548982e-06, "loss": 1.1058, "step": 2549 }, { "epoch": 0.79, "grad_norm": 9.486446380615234, "learning_rate": 1.4765899037281503e-06, "loss": 1.4127, "step": 2550 }, { "epoch": 0.79, "grad_norm": 11.577086448669434, "learning_rate": 1.4723894505979675e-06, "loss": 1.7006, "step": 2551 }, { "epoch": 0.79, "grad_norm": 11.6814603805542, "learning_rate": 1.4681942837454012e-06, "loss": 2.0487, "step": 2552 }, { "epoch": 0.79, "grad_norm": 16.907302856445312, "learning_rate": 1.4640044071464966e-06, "loss": 4.4646, "step": 2553 }, { "epoch": 0.79, "grad_norm": 10.21091365814209, "learning_rate": 1.4598198247722822e-06, "loss": 1.5015, "step": 2554 }, { "epoch": 0.79, "grad_norm": 10.397065162658691, "learning_rate": 1.4556405405887654e-06, "loss": 1.2597, "step": 2555 }, { "epoch": 0.79, "grad_norm": 12.379748344421387, "learning_rate": 1.4514665585569415e-06, "loss": 2.5205, "step": 2556 }, { "epoch": 0.79, "grad_norm": 12.382526397705078, "learning_rate": 1.4472978826327705e-06, "loss": 3.0737, "step": 2557 }, { "epoch": 0.79, "grad_norm": 12.831366539001465, "learning_rate": 1.4431345167671862e-06, "loss": 2.0951, "step": 2558 }, { "epoch": 0.79, "grad_norm": 15.526214599609375, "learning_rate": 1.4389764649060951e-06, "loss": 2.0521, "step": 2559 }, { "epoch": 0.79, "grad_norm": 7.109898567199707, "learning_rate": 1.4348237309903618e-06, "loss": 0.6138, "step": 2560 }, { "epoch": 0.79, "grad_norm": 13.867196083068848, "learning_rate": 1.43067631895581e-06, "loss": 1.8706, "step": 2561 }, { "epoch": 0.79, "grad_norm": 19.599706649780273, "learning_rate": 1.4265342327332263e-06, "loss": 2.4417, "step": 2562 }, { "epoch": 0.79, "grad_norm": 18.613494873046875, "learning_rate": 1.4223974762483427e-06, "loss": 2.3428, "step": 2563 }, { "epoch": 0.79, "grad_norm": 8.591452598571777, "learning_rate": 1.4182660534218415e-06, "loss": 0.9621, "step": 2564 }, { "epoch": 0.79, "grad_norm": 13.627431869506836, "learning_rate": 1.4141399681693522e-06, "loss": 3.8242, "step": 2565 }, { "epoch": 0.8, "grad_norm": 20.302942276000977, "learning_rate": 1.4100192244014433e-06, "loss": 3.7236, "step": 2566 }, { "epoch": 0.8, "grad_norm": 16.992944717407227, "learning_rate": 1.4059038260236196e-06, "loss": 2.4187, "step": 2567 }, { "epoch": 0.8, "grad_norm": 14.403325080871582, "learning_rate": 1.4017937769363246e-06, "loss": 2.3914, "step": 2568 }, { "epoch": 0.8, "grad_norm": 18.10266876220703, "learning_rate": 1.397689081034929e-06, "loss": 2.1453, "step": 2569 }, { "epoch": 0.8, "grad_norm": 9.529816627502441, "learning_rate": 1.3935897422097285e-06, "loss": 1.4158, "step": 2570 }, { "epoch": 0.8, "grad_norm": 13.614334106445312, "learning_rate": 1.3894957643459425e-06, "loss": 1.75, "step": 2571 }, { "epoch": 0.8, "grad_norm": 24.91597557067871, "learning_rate": 1.3854071513237115e-06, "loss": 3.66, "step": 2572 }, { "epoch": 0.8, "grad_norm": 15.958589553833008, "learning_rate": 1.381323907018087e-06, "loss": 1.594, "step": 2573 }, { "epoch": 0.8, "grad_norm": 20.908681869506836, "learning_rate": 1.3772460352990386e-06, "loss": 2.5328, "step": 2574 }, { "epoch": 0.8, "grad_norm": 15.671060562133789, "learning_rate": 1.3731735400314382e-06, "loss": 1.8023, "step": 2575 }, { "epoch": 0.8, "grad_norm": 21.42799949645996, "learning_rate": 1.3691064250750629e-06, "loss": 2.8076, "step": 2576 }, { "epoch": 0.8, "grad_norm": 12.156183242797852, "learning_rate": 1.365044694284597e-06, "loss": 0.9839, "step": 2577 }, { "epoch": 0.8, "grad_norm": 15.905706405639648, "learning_rate": 1.3609883515096112e-06, "loss": 3.1947, "step": 2578 }, { "epoch": 0.8, "grad_norm": 13.29395866394043, "learning_rate": 1.3569374005945752e-06, "loss": 1.5666, "step": 2579 }, { "epoch": 0.8, "grad_norm": 13.16615104675293, "learning_rate": 1.3528918453788517e-06, "loss": 1.1203, "step": 2580 }, { "epoch": 0.8, "grad_norm": 17.40990447998047, "learning_rate": 1.3488516896966847e-06, "loss": 1.161, "step": 2581 }, { "epoch": 0.8, "grad_norm": 14.006482124328613, "learning_rate": 1.3448169373772002e-06, "loss": 3.4506, "step": 2582 }, { "epoch": 0.8, "grad_norm": 17.400257110595703, "learning_rate": 1.3407875922444088e-06, "loss": 5.522, "step": 2583 }, { "epoch": 0.8, "grad_norm": 15.19652271270752, "learning_rate": 1.3367636581171906e-06, "loss": 2.3077, "step": 2584 }, { "epoch": 0.8, "grad_norm": 18.77398681640625, "learning_rate": 1.3327451388092991e-06, "loss": 2.8604, "step": 2585 }, { "epoch": 0.8, "grad_norm": 11.453078269958496, "learning_rate": 1.3287320381293562e-06, "loss": 4.7492, "step": 2586 }, { "epoch": 0.8, "grad_norm": 15.172926902770996, "learning_rate": 1.3247243598808481e-06, "loss": 2.9449, "step": 2587 }, { "epoch": 0.8, "grad_norm": 14.221792221069336, "learning_rate": 1.3207221078621198e-06, "loss": 2.237, "step": 2588 }, { "epoch": 0.8, "grad_norm": 12.888528823852539, "learning_rate": 1.3167252858663798e-06, "loss": 2.8871, "step": 2589 }, { "epoch": 0.8, "grad_norm": 11.226152420043945, "learning_rate": 1.3127338976816826e-06, "loss": 1.6073, "step": 2590 }, { "epoch": 0.8, "grad_norm": 13.393001556396484, "learning_rate": 1.308747947090937e-06, "loss": 2.7073, "step": 2591 }, { "epoch": 0.8, "grad_norm": 11.04410457611084, "learning_rate": 1.3047674378718965e-06, "loss": 1.8482, "step": 2592 }, { "epoch": 0.8, "grad_norm": 14.1553316116333, "learning_rate": 1.3007923737971589e-06, "loss": 2.3972, "step": 2593 }, { "epoch": 0.8, "grad_norm": 13.165019035339355, "learning_rate": 1.2968227586341577e-06, "loss": 1.7444, "step": 2594 }, { "epoch": 0.8, "grad_norm": 12.589109420776367, "learning_rate": 1.2928585961451686e-06, "loss": 1.3718, "step": 2595 }, { "epoch": 0.8, "grad_norm": 16.85529136657715, "learning_rate": 1.2888998900872945e-06, "loss": 3.3007, "step": 2596 }, { "epoch": 0.8, "grad_norm": 16.978525161743164, "learning_rate": 1.2849466442124665e-06, "loss": 3.3992, "step": 2597 }, { "epoch": 0.8, "grad_norm": 14.961907386779785, "learning_rate": 1.2809988622674466e-06, "loss": 2.7419, "step": 2598 }, { "epoch": 0.81, "grad_norm": 12.649197578430176, "learning_rate": 1.277056547993809e-06, "loss": 1.668, "step": 2599 }, { "epoch": 0.81, "grad_norm": 12.195618629455566, "learning_rate": 1.2731197051279508e-06, "loss": 1.4284, "step": 2600 }, { "epoch": 0.81, "grad_norm": 15.158956527709961, "learning_rate": 1.2691883374010877e-06, "loss": 1.3633, "step": 2601 }, { "epoch": 0.81, "grad_norm": 20.267412185668945, "learning_rate": 1.2652624485392395e-06, "loss": 5.4692, "step": 2602 }, { "epoch": 0.81, "grad_norm": 25.416479110717773, "learning_rate": 1.261342042263234e-06, "loss": 3.8892, "step": 2603 }, { "epoch": 0.81, "grad_norm": 15.120532989501953, "learning_rate": 1.257427122288708e-06, "loss": 1.7587, "step": 2604 }, { "epoch": 0.81, "grad_norm": 12.286882400512695, "learning_rate": 1.2535176923260937e-06, "loss": 1.3639, "step": 2605 }, { "epoch": 0.81, "grad_norm": 11.099992752075195, "learning_rate": 1.2496137560806214e-06, "loss": 1.2341, "step": 2606 }, { "epoch": 0.81, "grad_norm": 16.102886199951172, "learning_rate": 1.245715317252315e-06, "loss": 1.8498, "step": 2607 }, { "epoch": 0.81, "grad_norm": 16.775665283203125, "learning_rate": 1.2418223795359877e-06, "loss": 1.6815, "step": 2608 }, { "epoch": 0.81, "grad_norm": 7.794978618621826, "learning_rate": 1.237934946621237e-06, "loss": 0.7872, "step": 2609 }, { "epoch": 0.81, "grad_norm": 12.038506507873535, "learning_rate": 1.2340530221924487e-06, "loss": 1.549, "step": 2610 }, { "epoch": 0.81, "grad_norm": 11.117688179016113, "learning_rate": 1.2301766099287831e-06, "loss": 1.0437, "step": 2611 }, { "epoch": 0.81, "grad_norm": 12.886580467224121, "learning_rate": 1.2263057135041772e-06, "loss": 1.0861, "step": 2612 }, { "epoch": 0.81, "grad_norm": 15.002225875854492, "learning_rate": 1.2224403365873406e-06, "loss": 2.5014, "step": 2613 }, { "epoch": 0.81, "grad_norm": 11.995274543762207, "learning_rate": 1.2185804828417515e-06, "loss": 1.7224, "step": 2614 }, { "epoch": 0.81, "grad_norm": 15.890597343444824, "learning_rate": 1.2147261559256523e-06, "loss": 2.7263, "step": 2615 }, { "epoch": 0.81, "grad_norm": 15.805421829223633, "learning_rate": 1.2108773594920514e-06, "loss": 1.3961, "step": 2616 }, { "epoch": 0.81, "grad_norm": 14.018182754516602, "learning_rate": 1.207034097188711e-06, "loss": 2.0382, "step": 2617 }, { "epoch": 0.81, "grad_norm": 14.638632774353027, "learning_rate": 1.2031963726581484e-06, "loss": 2.6442, "step": 2618 }, { "epoch": 0.81, "grad_norm": 17.171119689941406, "learning_rate": 1.1993641895376371e-06, "loss": 2.7081, "step": 2619 }, { "epoch": 0.81, "grad_norm": 16.15960121154785, "learning_rate": 1.1955375514591967e-06, "loss": 5.6566, "step": 2620 }, { "epoch": 0.81, "grad_norm": 9.930737495422363, "learning_rate": 1.1917164620495836e-06, "loss": 1.0565, "step": 2621 }, { "epoch": 0.81, "grad_norm": 18.560791015625, "learning_rate": 1.1879009249303071e-06, "loss": 1.7435, "step": 2622 }, { "epoch": 0.81, "grad_norm": 16.398456573486328, "learning_rate": 1.1840909437176075e-06, "loss": 2.5659, "step": 2623 }, { "epoch": 0.81, "grad_norm": 24.315637588500977, "learning_rate": 1.1802865220224601e-06, "loss": 2.9749, "step": 2624 }, { "epoch": 0.81, "grad_norm": 10.489819526672363, "learning_rate": 1.176487663450574e-06, "loss": 1.0255, "step": 2625 }, { "epoch": 0.81, "grad_norm": 14.841395378112793, "learning_rate": 1.1726943716023828e-06, "loss": 2.2453, "step": 2626 }, { "epoch": 0.81, "grad_norm": 20.017536163330078, "learning_rate": 1.168906650073044e-06, "loss": 2.8686, "step": 2627 }, { "epoch": 0.81, "grad_norm": 20.4256591796875, "learning_rate": 1.1651245024524376e-06, "loss": 2.2901, "step": 2628 }, { "epoch": 0.81, "grad_norm": 13.860508918762207, "learning_rate": 1.1613479323251603e-06, "loss": 3.3471, "step": 2629 }, { "epoch": 0.81, "grad_norm": 12.17288589477539, "learning_rate": 1.1575769432705193e-06, "loss": 2.8286, "step": 2630 }, { "epoch": 0.82, "grad_norm": 10.398018836975098, "learning_rate": 1.1538115388625396e-06, "loss": 1.0916, "step": 2631 }, { "epoch": 0.82, "grad_norm": 15.136306762695312, "learning_rate": 1.1500517226699459e-06, "loss": 2.655, "step": 2632 }, { "epoch": 0.82, "grad_norm": 11.895465850830078, "learning_rate": 1.146297498256171e-06, "loss": 1.1909, "step": 2633 }, { "epoch": 0.82, "grad_norm": 16.685121536254883, "learning_rate": 1.1425488691793465e-06, "loss": 2.6348, "step": 2634 }, { "epoch": 0.82, "grad_norm": 18.08514404296875, "learning_rate": 1.1388058389922978e-06, "loss": 3.771, "step": 2635 }, { "epoch": 0.82, "grad_norm": 14.868639945983887, "learning_rate": 1.1350684112425515e-06, "loss": 5.4822, "step": 2636 }, { "epoch": 0.82, "grad_norm": 11.509803771972656, "learning_rate": 1.131336589472318e-06, "loss": 1.8023, "step": 2637 }, { "epoch": 0.82, "grad_norm": 10.3806791305542, "learning_rate": 1.127610377218495e-06, "loss": 1.6137, "step": 2638 }, { "epoch": 0.82, "grad_norm": 13.26029109954834, "learning_rate": 1.1238897780126684e-06, "loss": 2.9263, "step": 2639 }, { "epoch": 0.82, "grad_norm": 9.973566055297852, "learning_rate": 1.1201747953810984e-06, "loss": 1.2468, "step": 2640 }, { "epoch": 0.82, "grad_norm": 11.38296890258789, "learning_rate": 1.1164654328447256e-06, "loss": 1.5823, "step": 2641 }, { "epoch": 0.82, "grad_norm": 8.711637496948242, "learning_rate": 1.1127616939191625e-06, "loss": 1.1944, "step": 2642 }, { "epoch": 0.82, "grad_norm": 15.474139213562012, "learning_rate": 1.109063582114693e-06, "loss": 3.4932, "step": 2643 }, { "epoch": 0.82, "grad_norm": 13.657920837402344, "learning_rate": 1.1053711009362643e-06, "loss": 4.9638, "step": 2644 }, { "epoch": 0.82, "grad_norm": 18.262088775634766, "learning_rate": 1.1016842538834934e-06, "loss": 2.6687, "step": 2645 }, { "epoch": 0.82, "grad_norm": 13.732860565185547, "learning_rate": 1.0980030444506517e-06, "loss": 3.2846, "step": 2646 }, { "epoch": 0.82, "grad_norm": 11.444441795349121, "learning_rate": 1.0943274761266708e-06, "loss": 2.0819, "step": 2647 }, { "epoch": 0.82, "grad_norm": 15.731082916259766, "learning_rate": 1.0906575523951347e-06, "loss": 2.9561, "step": 2648 }, { "epoch": 0.82, "grad_norm": 12.473064422607422, "learning_rate": 1.0869932767342762e-06, "loss": 1.7959, "step": 2649 }, { "epoch": 0.82, "grad_norm": 18.323083877563477, "learning_rate": 1.0833346526169758e-06, "loss": 2.6552, "step": 2650 }, { "epoch": 0.82, "grad_norm": 12.62148666381836, "learning_rate": 1.0796816835107606e-06, "loss": 1.621, "step": 2651 }, { "epoch": 0.82, "grad_norm": 12.119149208068848, "learning_rate": 1.0760343728777956e-06, "loss": 1.3556, "step": 2652 }, { "epoch": 0.82, "grad_norm": 10.881817817687988, "learning_rate": 1.0723927241748803e-06, "loss": 1.7563, "step": 2653 }, { "epoch": 0.82, "grad_norm": 11.731474876403809, "learning_rate": 1.068756740853456e-06, "loss": 1.1546, "step": 2654 }, { "epoch": 0.82, "grad_norm": 18.9011173248291, "learning_rate": 1.0651264263595845e-06, "loss": 4.6636, "step": 2655 }, { "epoch": 0.82, "grad_norm": 13.250627517700195, "learning_rate": 1.0615017841339591e-06, "loss": 1.9599, "step": 2656 }, { "epoch": 0.82, "grad_norm": 10.588768005371094, "learning_rate": 1.0578828176119018e-06, "loss": 4.6439, "step": 2657 }, { "epoch": 0.82, "grad_norm": 14.118270874023438, "learning_rate": 1.0542695302233486e-06, "loss": 1.6572, "step": 2658 }, { "epoch": 0.82, "grad_norm": 16.81142234802246, "learning_rate": 1.0506619253928547e-06, "loss": 3.0529, "step": 2659 }, { "epoch": 0.82, "grad_norm": 15.711450576782227, "learning_rate": 1.0470600065395927e-06, "loss": 3.9799, "step": 2660 }, { "epoch": 0.82, "grad_norm": 12.071063041687012, "learning_rate": 1.0434637770773433e-06, "loss": 2.9833, "step": 2661 }, { "epoch": 0.82, "grad_norm": 12.90408706665039, "learning_rate": 1.0398732404144961e-06, "loss": 1.855, "step": 2662 }, { "epoch": 0.83, "grad_norm": 14.79298210144043, "learning_rate": 1.0362883999540434e-06, "loss": 3.5146, "step": 2663 }, { "epoch": 0.83, "grad_norm": 16.10693359375, "learning_rate": 1.0327092590935814e-06, "loss": 2.9858, "step": 2664 }, { "epoch": 0.83, "grad_norm": 17.625398635864258, "learning_rate": 1.0291358212253015e-06, "loss": 5.5017, "step": 2665 }, { "epoch": 0.83, "grad_norm": 15.117279052734375, "learning_rate": 1.025568089735994e-06, "loss": 5.5637, "step": 2666 }, { "epoch": 0.83, "grad_norm": 12.486377716064453, "learning_rate": 1.0220060680070375e-06, "loss": 2.4685, "step": 2667 }, { "epoch": 0.83, "grad_norm": 14.307598114013672, "learning_rate": 1.0184497594144005e-06, "loss": 1.7946, "step": 2668 }, { "epoch": 0.83, "grad_norm": 10.106240272521973, "learning_rate": 1.0148991673286366e-06, "loss": 1.846, "step": 2669 }, { "epoch": 0.83, "grad_norm": 8.210283279418945, "learning_rate": 1.0113542951148806e-06, "loss": 0.7165, "step": 2670 }, { "epoch": 0.83, "grad_norm": 16.936012268066406, "learning_rate": 1.007815146132846e-06, "loss": 2.8782, "step": 2671 }, { "epoch": 0.83, "grad_norm": 11.700289726257324, "learning_rate": 1.004281723736826e-06, "loss": 5.2996, "step": 2672 }, { "epoch": 0.83, "grad_norm": 11.953660011291504, "learning_rate": 1.0007540312756809e-06, "loss": 1.2916, "step": 2673 }, { "epoch": 0.83, "grad_norm": 10.936169624328613, "learning_rate": 9.972320720928423e-07, "loss": 1.2098, "step": 2674 }, { "epoch": 0.83, "grad_norm": 13.651212692260742, "learning_rate": 9.937158495263123e-07, "loss": 1.5848, "step": 2675 }, { "epoch": 0.83, "grad_norm": 16.4189510345459, "learning_rate": 9.902053669086485e-07, "loss": 1.9543, "step": 2676 }, { "epoch": 0.83, "grad_norm": 15.403413772583008, "learning_rate": 9.867006275669705e-07, "loss": 4.6017, "step": 2677 }, { "epoch": 0.83, "grad_norm": 16.291194915771484, "learning_rate": 9.832016348229598e-07, "loss": 1.5132, "step": 2678 }, { "epoch": 0.83, "grad_norm": 13.160598754882812, "learning_rate": 9.797083919928462e-07, "loss": 2.8031, "step": 2679 }, { "epoch": 0.83, "grad_norm": 12.4559907913208, "learning_rate": 9.76220902387411e-07, "loss": 2.9015, "step": 2680 }, { "epoch": 0.83, "grad_norm": 15.56799602508545, "learning_rate": 9.727391693119844e-07, "loss": 3.8978, "step": 2681 }, { "epoch": 0.83, "grad_norm": 10.057963371276855, "learning_rate": 9.692631960664399e-07, "loss": 1.8247, "step": 2682 }, { "epoch": 0.83, "grad_norm": 22.983314514160156, "learning_rate": 9.657929859451903e-07, "loss": 6.4451, "step": 2683 }, { "epoch": 0.83, "grad_norm": 15.508147239685059, "learning_rate": 9.62328542237189e-07, "loss": 3.1221, "step": 2684 }, { "epoch": 0.83, "grad_norm": 11.232625961303711, "learning_rate": 9.588698682259217e-07, "loss": 1.0586, "step": 2685 }, { "epoch": 0.83, "grad_norm": 22.873655319213867, "learning_rate": 9.554169671894061e-07, "loss": 3.8315, "step": 2686 }, { "epoch": 0.83, "grad_norm": 13.896849632263184, "learning_rate": 9.519698424001922e-07, "loss": 5.5813, "step": 2687 }, { "epoch": 0.83, "grad_norm": 15.250011444091797, "learning_rate": 9.485284971253514e-07, "loss": 2.6697, "step": 2688 }, { "epoch": 0.83, "grad_norm": 9.72279167175293, "learning_rate": 9.450929346264781e-07, "loss": 1.4129, "step": 2689 }, { "epoch": 0.83, "grad_norm": 21.19035530090332, "learning_rate": 9.416631581596877e-07, "loss": 2.8044, "step": 2690 }, { "epoch": 0.83, "grad_norm": 17.99174690246582, "learning_rate": 9.382391709756093e-07, "loss": 2.5248, "step": 2691 }, { "epoch": 0.83, "grad_norm": 18.81049156188965, "learning_rate": 9.348209763193857e-07, "loss": 1.5409, "step": 2692 }, { "epoch": 0.83, "grad_norm": 9.588401794433594, "learning_rate": 9.314085774306725e-07, "loss": 2.1723, "step": 2693 }, { "epoch": 0.83, "grad_norm": 14.631446838378906, "learning_rate": 9.2800197754363e-07, "loss": 1.3401, "step": 2694 }, { "epoch": 0.84, "grad_norm": 29.79292106628418, "learning_rate": 9.246011798869209e-07, "loss": 4.9964, "step": 2695 }, { "epoch": 0.84, "grad_norm": 16.508506774902344, "learning_rate": 9.212061876837127e-07, "loss": 1.0711, "step": 2696 }, { "epoch": 0.84, "grad_norm": 17.153032302856445, "learning_rate": 9.17817004151669e-07, "loss": 2.5525, "step": 2697 }, { "epoch": 0.84, "grad_norm": 14.247258186340332, "learning_rate": 9.144336325029431e-07, "loss": 1.8873, "step": 2698 }, { "epoch": 0.84, "grad_norm": 12.099528312683105, "learning_rate": 9.110560759441885e-07, "loss": 1.1924, "step": 2699 }, { "epoch": 0.84, "grad_norm": 7.610776424407959, "learning_rate": 9.076843376765418e-07, "loss": 1.0963, "step": 2700 }, { "epoch": 0.84, "grad_norm": 17.039730072021484, "learning_rate": 9.043184208956258e-07, "loss": 4.1729, "step": 2701 }, { "epoch": 0.84, "grad_norm": 17.46630859375, "learning_rate": 9.009583287915495e-07, "loss": 4.3558, "step": 2702 }, { "epoch": 0.84, "grad_norm": 14.001320838928223, "learning_rate": 8.976040645488973e-07, "loss": 1.4631, "step": 2703 }, { "epoch": 0.84, "grad_norm": 15.730828285217285, "learning_rate": 8.942556313467316e-07, "loss": 3.5943, "step": 2704 }, { "epoch": 0.84, "grad_norm": 16.972795486450195, "learning_rate": 8.909130323585884e-07, "loss": 4.9959, "step": 2705 }, { "epoch": 0.84, "grad_norm": 14.503647804260254, "learning_rate": 8.875762707524742e-07, "loss": 3.0095, "step": 2706 }, { "epoch": 0.84, "grad_norm": 13.763020515441895, "learning_rate": 8.842453496908605e-07, "loss": 1.6531, "step": 2707 }, { "epoch": 0.84, "grad_norm": 19.466567993164062, "learning_rate": 8.809202723306905e-07, "loss": 2.8807, "step": 2708 }, { "epoch": 0.84, "grad_norm": 10.884944915771484, "learning_rate": 8.776010418233612e-07, "loss": 1.0522, "step": 2709 }, { "epoch": 0.84, "grad_norm": 16.030908584594727, "learning_rate": 8.742876613147325e-07, "loss": 1.9042, "step": 2710 }, { "epoch": 0.84, "grad_norm": 14.34588623046875, "learning_rate": 8.709801339451175e-07, "loss": 1.8041, "step": 2711 }, { "epoch": 0.84, "grad_norm": 17.672372817993164, "learning_rate": 8.676784628492829e-07, "loss": 3.5281, "step": 2712 }, { "epoch": 0.84, "grad_norm": 16.09285545349121, "learning_rate": 8.643826511564443e-07, "loss": 5.1449, "step": 2713 }, { "epoch": 0.84, "grad_norm": 23.066343307495117, "learning_rate": 8.610927019902668e-07, "loss": 5.009, "step": 2714 }, { "epoch": 0.84, "grad_norm": 13.979827880859375, "learning_rate": 8.578086184688562e-07, "loss": 1.3423, "step": 2715 }, { "epoch": 0.84, "grad_norm": 18.696386337280273, "learning_rate": 8.545304037047584e-07, "loss": 2.1157, "step": 2716 }, { "epoch": 0.84, "grad_norm": 15.990952491760254, "learning_rate": 8.512580608049616e-07, "loss": 2.9161, "step": 2717 }, { "epoch": 0.84, "grad_norm": 11.257905006408691, "learning_rate": 8.479915928708854e-07, "loss": 1.6421, "step": 2718 }, { "epoch": 0.84, "grad_norm": 14.700502395629883, "learning_rate": 8.44731002998378e-07, "loss": 2.5394, "step": 2719 }, { "epoch": 0.84, "grad_norm": 15.877397537231445, "learning_rate": 8.414762942777231e-07, "loss": 2.4409, "step": 2720 }, { "epoch": 0.84, "grad_norm": 10.039705276489258, "learning_rate": 8.382274697936282e-07, "loss": 1.0026, "step": 2721 }, { "epoch": 0.84, "grad_norm": 16.870891571044922, "learning_rate": 8.349845326252203e-07, "loss": 4.0844, "step": 2722 }, { "epoch": 0.84, "grad_norm": 11.927734375, "learning_rate": 8.317474858460537e-07, "loss": 2.5039, "step": 2723 }, { "epoch": 0.84, "grad_norm": 14.01502513885498, "learning_rate": 8.285163325240943e-07, "loss": 2.0861, "step": 2724 }, { "epoch": 0.84, "grad_norm": 14.202402114868164, "learning_rate": 8.252910757217231e-07, "loss": 3.7536, "step": 2725 }, { "epoch": 0.84, "grad_norm": 17.157291412353516, "learning_rate": 8.220717184957337e-07, "loss": 3.1874, "step": 2726 }, { "epoch": 0.84, "grad_norm": 18.783302307128906, "learning_rate": 8.188582638973286e-07, "loss": 3.0514, "step": 2727 }, { "epoch": 0.85, "grad_norm": 16.83509063720703, "learning_rate": 8.156507149721133e-07, "loss": 5.7338, "step": 2728 }, { "epoch": 0.85, "grad_norm": 13.39986515045166, "learning_rate": 8.124490747601021e-07, "loss": 3.11, "step": 2729 }, { "epoch": 0.85, "grad_norm": 12.768985748291016, "learning_rate": 8.092533462957026e-07, "loss": 5.6912, "step": 2730 }, { "epoch": 0.85, "grad_norm": 16.61513328552246, "learning_rate": 8.06063532607723e-07, "loss": 2.2014, "step": 2731 }, { "epoch": 0.85, "grad_norm": 12.662651062011719, "learning_rate": 8.028796367193647e-07, "loss": 1.2689, "step": 2732 }, { "epoch": 0.85, "grad_norm": 11.922401428222656, "learning_rate": 7.997016616482206e-07, "loss": 1.6796, "step": 2733 }, { "epoch": 0.85, "grad_norm": 11.682415962219238, "learning_rate": 7.965296104062696e-07, "loss": 1.2255, "step": 2734 }, { "epoch": 0.85, "grad_norm": 12.56028938293457, "learning_rate": 7.933634859998832e-07, "loss": 0.9541, "step": 2735 }, { "epoch": 0.85, "grad_norm": 13.88955020904541, "learning_rate": 7.902032914298073e-07, "loss": 2.2162, "step": 2736 }, { "epoch": 0.85, "grad_norm": 16.706653594970703, "learning_rate": 7.870490296911723e-07, "loss": 1.963, "step": 2737 }, { "epoch": 0.85, "grad_norm": 15.079216957092285, "learning_rate": 7.839007037734862e-07, "loss": 3.8992, "step": 2738 }, { "epoch": 0.85, "grad_norm": 12.343116760253906, "learning_rate": 7.8075831666063e-07, "loss": 2.2811, "step": 2739 }, { "epoch": 0.85, "grad_norm": 15.426636695861816, "learning_rate": 7.77621871330851e-07, "loss": 1.6619, "step": 2740 }, { "epoch": 0.85, "grad_norm": 15.37838077545166, "learning_rate": 7.744913707567756e-07, "loss": 3.2722, "step": 2741 }, { "epoch": 0.85, "grad_norm": 13.811568260192871, "learning_rate": 7.713668179053881e-07, "loss": 1.3035, "step": 2742 }, { "epoch": 0.85, "grad_norm": 9.749939918518066, "learning_rate": 7.682482157380361e-07, "loss": 1.1341, "step": 2743 }, { "epoch": 0.85, "grad_norm": 16.110702514648438, "learning_rate": 7.651355672104327e-07, "loss": 2.4466, "step": 2744 }, { "epoch": 0.85, "grad_norm": 11.312182426452637, "learning_rate": 7.62028875272643e-07, "loss": 2.4678, "step": 2745 }, { "epoch": 0.85, "grad_norm": 8.107397079467773, "learning_rate": 7.589281428690886e-07, "loss": 1.0105, "step": 2746 }, { "epoch": 0.85, "grad_norm": 18.883792877197266, "learning_rate": 7.558333729385425e-07, "loss": 5.5305, "step": 2747 }, { "epoch": 0.85, "grad_norm": 15.187335014343262, "learning_rate": 7.527445684141272e-07, "loss": 2.1495, "step": 2748 }, { "epoch": 0.85, "grad_norm": 11.755721092224121, "learning_rate": 7.496617322233088e-07, "loss": 4.6224, "step": 2749 }, { "epoch": 0.85, "grad_norm": 18.182004928588867, "learning_rate": 7.465848672879036e-07, "loss": 3.0456, "step": 2750 }, { "epoch": 0.85, "grad_norm": 13.412226676940918, "learning_rate": 7.435139765240602e-07, "loss": 2.3355, "step": 2751 }, { "epoch": 0.85, "grad_norm": 15.734976768493652, "learning_rate": 7.404490628422693e-07, "loss": 2.1919, "step": 2752 }, { "epoch": 0.85, "grad_norm": 18.193538665771484, "learning_rate": 7.373901291473594e-07, "loss": 5.7131, "step": 2753 }, { "epoch": 0.85, "grad_norm": 13.647350311279297, "learning_rate": 7.343371783384843e-07, "loss": 2.2852, "step": 2754 }, { "epoch": 0.85, "grad_norm": 15.154494285583496, "learning_rate": 7.312902133091318e-07, "loss": 3.8039, "step": 2755 }, { "epoch": 0.85, "grad_norm": 13.07168960571289, "learning_rate": 7.282492369471184e-07, "loss": 4.9915, "step": 2756 }, { "epoch": 0.85, "grad_norm": 13.386819839477539, "learning_rate": 7.252142521345818e-07, "loss": 1.7995, "step": 2757 }, { "epoch": 0.85, "grad_norm": 22.815553665161133, "learning_rate": 7.221852617479788e-07, "loss": 4.8514, "step": 2758 }, { "epoch": 0.85, "grad_norm": 11.641252517700195, "learning_rate": 7.191622686580927e-07, "loss": 0.8264, "step": 2759 }, { "epoch": 0.86, "grad_norm": 9.47198486328125, "learning_rate": 7.161452757300145e-07, "loss": 0.7502, "step": 2760 }, { "epoch": 0.86, "grad_norm": 10.305670738220215, "learning_rate": 7.131342858231538e-07, "loss": 1.82, "step": 2761 }, { "epoch": 0.86, "grad_norm": 16.490869522094727, "learning_rate": 7.101293017912268e-07, "loss": 2.5695, "step": 2762 }, { "epoch": 0.86, "grad_norm": 16.534713745117188, "learning_rate": 7.071303264822608e-07, "loss": 1.915, "step": 2763 }, { "epoch": 0.86, "grad_norm": 15.01233959197998, "learning_rate": 7.041373627385852e-07, "loss": 2.5617, "step": 2764 }, { "epoch": 0.86, "grad_norm": 17.969585418701172, "learning_rate": 7.011504133968359e-07, "loss": 2.0232, "step": 2765 }, { "epoch": 0.86, "grad_norm": 11.038057327270508, "learning_rate": 6.981694812879459e-07, "loss": 1.5787, "step": 2766 }, { "epoch": 0.86, "grad_norm": 12.305198669433594, "learning_rate": 6.951945692371438e-07, "loss": 1.3824, "step": 2767 }, { "epoch": 0.86, "grad_norm": 9.185097694396973, "learning_rate": 6.92225680063956e-07, "loss": 0.9478, "step": 2768 }, { "epoch": 0.86, "grad_norm": 15.012513160705566, "learning_rate": 6.892628165821988e-07, "loss": 3.1556, "step": 2769 }, { "epoch": 0.86, "grad_norm": 11.005261421203613, "learning_rate": 6.863059815999756e-07, "loss": 1.8806, "step": 2770 }, { "epoch": 0.86, "grad_norm": 14.431744575500488, "learning_rate": 6.83355177919681e-07, "loss": 1.5041, "step": 2771 }, { "epoch": 0.86, "grad_norm": 13.604820251464844, "learning_rate": 6.804104083379899e-07, "loss": 1.8549, "step": 2772 }, { "epoch": 0.86, "grad_norm": 16.08254051208496, "learning_rate": 6.774716756458579e-07, "loss": 3.1237, "step": 2773 }, { "epoch": 0.86, "grad_norm": 24.12819480895996, "learning_rate": 6.745389826285227e-07, "loss": 2.3856, "step": 2774 }, { "epoch": 0.86, "grad_norm": 16.128646850585938, "learning_rate": 6.716123320654928e-07, "loss": 3.163, "step": 2775 }, { "epoch": 0.86, "grad_norm": 8.595212936401367, "learning_rate": 6.686917267305512e-07, "loss": 0.8763, "step": 2776 }, { "epoch": 0.86, "grad_norm": 12.716238975524902, "learning_rate": 6.657771693917553e-07, "loss": 1.8973, "step": 2777 }, { "epoch": 0.86, "grad_norm": 10.591482162475586, "learning_rate": 6.628686628114276e-07, "loss": 1.7251, "step": 2778 }, { "epoch": 0.86, "grad_norm": 19.033681869506836, "learning_rate": 6.599662097461526e-07, "loss": 3.174, "step": 2779 }, { "epoch": 0.86, "grad_norm": 14.152606010437012, "learning_rate": 6.570698129467846e-07, "loss": 2.4005, "step": 2780 }, { "epoch": 0.86, "grad_norm": 10.492902755737305, "learning_rate": 6.54179475158433e-07, "loss": 0.7674, "step": 2781 }, { "epoch": 0.86, "grad_norm": 14.48991870880127, "learning_rate": 6.512951991204649e-07, "loss": 1.5215, "step": 2782 }, { "epoch": 0.86, "grad_norm": 11.252331733703613, "learning_rate": 6.484169875665041e-07, "loss": 1.5332, "step": 2783 }, { "epoch": 0.86, "grad_norm": 12.655040740966797, "learning_rate": 6.455448432244255e-07, "loss": 2.0131, "step": 2784 }, { "epoch": 0.86, "grad_norm": 14.361101150512695, "learning_rate": 6.426787688163527e-07, "loss": 3.2265, "step": 2785 }, { "epoch": 0.86, "grad_norm": 18.582107543945312, "learning_rate": 6.3981876705866e-07, "loss": 3.6963, "step": 2786 }, { "epoch": 0.86, "grad_norm": 11.305501937866211, "learning_rate": 6.36964840661962e-07, "loss": 1.7268, "step": 2787 }, { "epoch": 0.86, "grad_norm": 14.70504093170166, "learning_rate": 6.341169923311187e-07, "loss": 1.5501, "step": 2788 }, { "epoch": 0.86, "grad_norm": 11.306252479553223, "learning_rate": 6.312752247652273e-07, "loss": 3.1718, "step": 2789 }, { "epoch": 0.86, "grad_norm": 16.00751495361328, "learning_rate": 6.284395406576224e-07, "loss": 2.9333, "step": 2790 }, { "epoch": 0.86, "grad_norm": 14.812768936157227, "learning_rate": 6.25609942695871e-07, "loss": 4.0762, "step": 2791 }, { "epoch": 0.87, "grad_norm": 18.853811264038086, "learning_rate": 6.227864335617784e-07, "loss": 3.1675, "step": 2792 }, { "epoch": 0.87, "grad_norm": 13.854783058166504, "learning_rate": 6.199690159313715e-07, "loss": 1.1855, "step": 2793 }, { "epoch": 0.87, "grad_norm": 14.672539710998535, "learning_rate": 6.171576924749069e-07, "loss": 1.5224, "step": 2794 }, { "epoch": 0.87, "grad_norm": 11.445911407470703, "learning_rate": 6.143524658568685e-07, "loss": 1.1172, "step": 2795 }, { "epoch": 0.87, "grad_norm": 13.489899635314941, "learning_rate": 6.11553338735955e-07, "loss": 2.6757, "step": 2796 }, { "epoch": 0.87, "grad_norm": 21.521224975585938, "learning_rate": 6.087603137650882e-07, "loss": 3.4554, "step": 2797 }, { "epoch": 0.87, "grad_norm": 13.998003005981445, "learning_rate": 6.059733935914089e-07, "loss": 4.7792, "step": 2798 }, { "epoch": 0.87, "grad_norm": 17.415874481201172, "learning_rate": 6.03192580856267e-07, "loss": 4.8494, "step": 2799 }, { "epoch": 0.87, "grad_norm": 16.41072654724121, "learning_rate": 6.004178781952261e-07, "loss": 4.1986, "step": 2800 }, { "epoch": 0.87, "grad_norm": 11.820795059204102, "learning_rate": 5.976492882380614e-07, "loss": 1.0387, "step": 2801 }, { "epoch": 0.87, "grad_norm": 10.374963760375977, "learning_rate": 5.948868136087504e-07, "loss": 1.26, "step": 2802 }, { "epoch": 0.87, "grad_norm": 13.477418899536133, "learning_rate": 5.921304569254768e-07, "loss": 5.2774, "step": 2803 }, { "epoch": 0.87, "grad_norm": 9.892913818359375, "learning_rate": 5.893802208006254e-07, "loss": 1.2195, "step": 2804 }, { "epoch": 0.87, "grad_norm": 13.48777961730957, "learning_rate": 5.866361078407806e-07, "loss": 2.0262, "step": 2805 }, { "epoch": 0.87, "grad_norm": 14.145560264587402, "learning_rate": 5.83898120646724e-07, "loss": 2.3832, "step": 2806 }, { "epoch": 0.87, "grad_norm": 16.891164779663086, "learning_rate": 5.811662618134309e-07, "loss": 1.7395, "step": 2807 }, { "epoch": 0.87, "grad_norm": 13.735100746154785, "learning_rate": 5.784405339300669e-07, "loss": 2.7466, "step": 2808 }, { "epoch": 0.87, "grad_norm": 9.554878234863281, "learning_rate": 5.757209395799886e-07, "loss": 1.7883, "step": 2809 }, { "epoch": 0.87, "grad_norm": 15.633171081542969, "learning_rate": 5.730074813407394e-07, "loss": 1.6478, "step": 2810 }, { "epoch": 0.87, "grad_norm": 17.242467880249023, "learning_rate": 5.703001617840447e-07, "loss": 1.5473, "step": 2811 }, { "epoch": 0.87, "grad_norm": 11.249731063842773, "learning_rate": 5.675989834758162e-07, "loss": 0.9579, "step": 2812 }, { "epoch": 0.87, "grad_norm": 19.327224731445312, "learning_rate": 5.649039489761428e-07, "loss": 3.9334, "step": 2813 }, { "epoch": 0.87, "grad_norm": 18.37837028503418, "learning_rate": 5.622150608392878e-07, "loss": 5.0965, "step": 2814 }, { "epoch": 0.87, "grad_norm": 17.791837692260742, "learning_rate": 5.595323216136943e-07, "loss": 3.4107, "step": 2815 }, { "epoch": 0.87, "grad_norm": 10.968727111816406, "learning_rate": 5.568557338419748e-07, "loss": 0.996, "step": 2816 }, { "epoch": 0.87, "grad_norm": 18.726015090942383, "learning_rate": 5.54185300060912e-07, "loss": 4.8876, "step": 2817 }, { "epoch": 0.87, "grad_norm": 8.49133586883545, "learning_rate": 5.515210228014545e-07, "loss": 1.0867, "step": 2818 }, { "epoch": 0.87, "grad_norm": 14.631171226501465, "learning_rate": 5.488629045887192e-07, "loss": 1.7388, "step": 2819 }, { "epoch": 0.87, "grad_norm": 16.90894317626953, "learning_rate": 5.462109479419823e-07, "loss": 1.7231, "step": 2820 }, { "epoch": 0.87, "grad_norm": 18.553396224975586, "learning_rate": 5.435651553746843e-07, "loss": 1.7818, "step": 2821 }, { "epoch": 0.87, "grad_norm": 13.470282554626465, "learning_rate": 5.409255293944203e-07, "loss": 2.7616, "step": 2822 }, { "epoch": 0.87, "grad_norm": 6.711654186248779, "learning_rate": 5.382920725029411e-07, "loss": 0.6761, "step": 2823 }, { "epoch": 0.87, "grad_norm": 15.698296546936035, "learning_rate": 5.356647871961512e-07, "loss": 3.7117, "step": 2824 }, { "epoch": 0.88, "grad_norm": 10.690383911132812, "learning_rate": 5.330436759641078e-07, "loss": 5.3394, "step": 2825 }, { "epoch": 0.88, "grad_norm": 16.919147491455078, "learning_rate": 5.304287412910115e-07, "loss": 2.627, "step": 2826 }, { "epoch": 0.88, "grad_norm": 11.312337875366211, "learning_rate": 5.278199856552158e-07, "loss": 1.9178, "step": 2827 }, { "epoch": 0.88, "grad_norm": 14.843708992004395, "learning_rate": 5.252174115292121e-07, "loss": 2.1143, "step": 2828 }, { "epoch": 0.88, "grad_norm": 19.56746482849121, "learning_rate": 5.226210213796357e-07, "loss": 5.143, "step": 2829 }, { "epoch": 0.88, "grad_norm": 17.91864013671875, "learning_rate": 5.200308176672634e-07, "loss": 5.6695, "step": 2830 }, { "epoch": 0.88, "grad_norm": 12.32984447479248, "learning_rate": 5.174468028470023e-07, "loss": 1.4222, "step": 2831 }, { "epoch": 0.88, "grad_norm": 18.887392044067383, "learning_rate": 5.148689793678974e-07, "loss": 1.6787, "step": 2832 }, { "epoch": 0.88, "grad_norm": 11.73823356628418, "learning_rate": 5.122973496731283e-07, "loss": 2.3282, "step": 2833 }, { "epoch": 0.88, "grad_norm": 19.994966506958008, "learning_rate": 5.097319162000009e-07, "loss": 2.7318, "step": 2834 }, { "epoch": 0.88, "grad_norm": 13.204672813415527, "learning_rate": 5.071726813799483e-07, "loss": 1.5666, "step": 2835 }, { "epoch": 0.88, "grad_norm": 15.245928764343262, "learning_rate": 5.046196476385316e-07, "loss": 1.7705, "step": 2836 }, { "epoch": 0.88, "grad_norm": 10.357982635498047, "learning_rate": 5.020728173954322e-07, "loss": 5.0124, "step": 2837 }, { "epoch": 0.88, "grad_norm": 26.01468276977539, "learning_rate": 4.995321930644537e-07, "loss": 4.5425, "step": 2838 }, { "epoch": 0.88, "grad_norm": 14.262585639953613, "learning_rate": 4.969977770535164e-07, "loss": 2.3728, "step": 2839 }, { "epoch": 0.88, "grad_norm": 15.951447486877441, "learning_rate": 4.944695717646566e-07, "loss": 2.1417, "step": 2840 }, { "epoch": 0.88, "grad_norm": 11.154837608337402, "learning_rate": 4.919475795940252e-07, "loss": 1.6735, "step": 2841 }, { "epoch": 0.88, "grad_norm": 15.00161361694336, "learning_rate": 4.894318029318848e-07, "loss": 2.6789, "step": 2842 }, { "epoch": 0.88, "grad_norm": 13.821235656738281, "learning_rate": 4.869222441626074e-07, "loss": 5.2744, "step": 2843 }, { "epoch": 0.88, "grad_norm": 19.3443660736084, "learning_rate": 4.844189056646698e-07, "loss": 2.4512, "step": 2844 }, { "epoch": 0.88, "grad_norm": 9.478118896484375, "learning_rate": 4.819217898106546e-07, "loss": 1.2044, "step": 2845 }, { "epoch": 0.88, "grad_norm": 11.809135437011719, "learning_rate": 4.794308989672472e-07, "loss": 2.4362, "step": 2846 }, { "epoch": 0.88, "grad_norm": 10.744396209716797, "learning_rate": 4.769462354952327e-07, "loss": 0.7586, "step": 2847 }, { "epoch": 0.88, "grad_norm": 17.750654220581055, "learning_rate": 4.744678017494944e-07, "loss": 1.5162, "step": 2848 }, { "epoch": 0.88, "grad_norm": 13.151311874389648, "learning_rate": 4.719956000790113e-07, "loss": 1.59, "step": 2849 }, { "epoch": 0.88, "grad_norm": 12.47098445892334, "learning_rate": 4.6952963282685497e-07, "loss": 1.8405, "step": 2850 }, { "epoch": 0.88, "grad_norm": 16.631267547607422, "learning_rate": 4.6706990233019073e-07, "loss": 2.7837, "step": 2851 }, { "epoch": 0.88, "grad_norm": 9.20441722869873, "learning_rate": 4.646164109202683e-07, "loss": 1.9357, "step": 2852 }, { "epoch": 0.88, "grad_norm": 14.25259017944336, "learning_rate": 4.621691609224279e-07, "loss": 3.417, "step": 2853 }, { "epoch": 0.88, "grad_norm": 15.585476875305176, "learning_rate": 4.5972815465609407e-07, "loss": 1.8087, "step": 2854 }, { "epoch": 0.88, "grad_norm": 21.953487396240234, "learning_rate": 4.572933944347732e-07, "loss": 3.7204, "step": 2855 }, { "epoch": 0.88, "grad_norm": 10.49705696105957, "learning_rate": 4.548648825660498e-07, "loss": 1.5159, "step": 2856 }, { "epoch": 0.89, "grad_norm": 10.401366233825684, "learning_rate": 4.524426213515912e-07, "loss": 1.3898, "step": 2857 }, { "epoch": 0.89, "grad_norm": 12.927726745605469, "learning_rate": 4.5002661308713556e-07, "loss": 5.4374, "step": 2858 }, { "epoch": 0.89, "grad_norm": 11.92420482635498, "learning_rate": 4.4761686006249766e-07, "loss": 2.0195, "step": 2859 }, { "epoch": 0.89, "grad_norm": 14.106597900390625, "learning_rate": 4.452133645615623e-07, "loss": 4.3874, "step": 2860 }, { "epoch": 0.89, "grad_norm": 10.929544448852539, "learning_rate": 4.428161288622855e-07, "loss": 2.0026, "step": 2861 }, { "epoch": 0.89, "grad_norm": 16.906898498535156, "learning_rate": 4.4042515523668703e-07, "loss": 5.4916, "step": 2862 }, { "epoch": 0.89, "grad_norm": 14.238280296325684, "learning_rate": 4.380404459508562e-07, "loss": 4.5458, "step": 2863 }, { "epoch": 0.89, "grad_norm": 11.196687698364258, "learning_rate": 4.3566200326494157e-07, "loss": 1.3555, "step": 2864 }, { "epoch": 0.89, "grad_norm": 19.44993782043457, "learning_rate": 4.332898294331532e-07, "loss": 2.3461, "step": 2865 }, { "epoch": 0.89, "grad_norm": 22.673126220703125, "learning_rate": 4.309239267037614e-07, "loss": 5.4574, "step": 2866 }, { "epoch": 0.89, "grad_norm": 17.445295333862305, "learning_rate": 4.285642973190916e-07, "loss": 5.0254, "step": 2867 }, { "epoch": 0.89, "grad_norm": 15.001561164855957, "learning_rate": 4.262109435155215e-07, "loss": 2.0039, "step": 2868 }, { "epoch": 0.89, "grad_norm": 13.708137512207031, "learning_rate": 4.2386386752348646e-07, "loss": 2.3565, "step": 2869 }, { "epoch": 0.89, "grad_norm": 10.443163871765137, "learning_rate": 4.2152307156746634e-07, "loss": 1.2412, "step": 2870 }, { "epoch": 0.89, "grad_norm": 22.866369247436523, "learning_rate": 4.191885578659916e-07, "loss": 5.0355, "step": 2871 }, { "epoch": 0.89, "grad_norm": 16.499114990234375, "learning_rate": 4.1686032863164085e-07, "loss": 3.6915, "step": 2872 }, { "epoch": 0.89, "grad_norm": 10.884692192077637, "learning_rate": 4.145383860710302e-07, "loss": 1.6315, "step": 2873 }, { "epoch": 0.89, "grad_norm": 12.003686904907227, "learning_rate": 4.122227323848225e-07, "loss": 2.196, "step": 2874 }, { "epoch": 0.89, "grad_norm": 15.553420066833496, "learning_rate": 4.099133697677193e-07, "loss": 5.855, "step": 2875 }, { "epoch": 0.89, "grad_norm": 13.435556411743164, "learning_rate": 4.0761030040845973e-07, "loss": 1.3082, "step": 2876 }, { "epoch": 0.89, "grad_norm": 26.332712173461914, "learning_rate": 4.05313526489817e-07, "loss": 2.9261, "step": 2877 }, { "epoch": 0.89, "grad_norm": 16.73967933654785, "learning_rate": 4.030230501885993e-07, "loss": 5.3095, "step": 2878 }, { "epoch": 0.89, "grad_norm": 16.076337814331055, "learning_rate": 4.0073887367564516e-07, "loss": 7.312, "step": 2879 }, { "epoch": 0.89, "grad_norm": 14.803072929382324, "learning_rate": 3.984609991158226e-07, "loss": 3.0275, "step": 2880 }, { "epoch": 0.89, "grad_norm": 10.968694686889648, "learning_rate": 3.9618942866802756e-07, "loss": 1.1406, "step": 2881 }, { "epoch": 0.89, "grad_norm": 11.786733627319336, "learning_rate": 3.9392416448517924e-07, "loss": 1.3895, "step": 2882 }, { "epoch": 0.89, "grad_norm": 14.944122314453125, "learning_rate": 3.916652087142217e-07, "loss": 2.8045, "step": 2883 }, { "epoch": 0.89, "grad_norm": 17.218578338623047, "learning_rate": 3.894125634961199e-07, "loss": 1.7795, "step": 2884 }, { "epoch": 0.89, "grad_norm": 11.743535041809082, "learning_rate": 3.8716623096585873e-07, "loss": 1.3452, "step": 2885 }, { "epoch": 0.89, "grad_norm": 9.584161758422852, "learning_rate": 3.849262132524373e-07, "loss": 1.0609, "step": 2886 }, { "epoch": 0.89, "grad_norm": 14.182889938354492, "learning_rate": 3.8269251247887283e-07, "loss": 2.6492, "step": 2887 }, { "epoch": 0.89, "grad_norm": 16.427518844604492, "learning_rate": 3.8046513076219284e-07, "loss": 4.0594, "step": 2888 }, { "epoch": 0.9, "grad_norm": 19.689199447631836, "learning_rate": 3.78244070213437e-07, "loss": 4.664, "step": 2889 }, { "epoch": 0.9, "grad_norm": 12.083978652954102, "learning_rate": 3.7602933293765565e-07, "loss": 3.2096, "step": 2890 }, { "epoch": 0.9, "grad_norm": 14.33427906036377, "learning_rate": 3.738209210339037e-07, "loss": 3.754, "step": 2891 }, { "epoch": 0.9, "grad_norm": 11.477331161499023, "learning_rate": 3.716188365952413e-07, "loss": 1.8141, "step": 2892 }, { "epoch": 0.9, "grad_norm": 17.20378875732422, "learning_rate": 3.694230817087345e-07, "loss": 4.5588, "step": 2893 }, { "epoch": 0.9, "grad_norm": 15.989679336547852, "learning_rate": 3.6723365845544847e-07, "loss": 5.9715, "step": 2894 }, { "epoch": 0.9, "grad_norm": 14.15831470489502, "learning_rate": 3.6505056891044337e-07, "loss": 1.9312, "step": 2895 }, { "epoch": 0.9, "grad_norm": 11.890592575073242, "learning_rate": 3.6287381514278386e-07, "loss": 1.6506, "step": 2896 }, { "epoch": 0.9, "grad_norm": 17.143720626831055, "learning_rate": 3.6070339921552503e-07, "loss": 3.1037, "step": 2897 }, { "epoch": 0.9, "grad_norm": 9.737471580505371, "learning_rate": 3.585393231857153e-07, "loss": 1.0532, "step": 2898 }, { "epoch": 0.9, "grad_norm": 14.59965991973877, "learning_rate": 3.5638158910439836e-07, "loss": 2.3546, "step": 2899 }, { "epoch": 0.9, "grad_norm": 13.561890602111816, "learning_rate": 3.542301990166027e-07, "loss": 2.0995, "step": 2900 }, { "epoch": 0.9, "grad_norm": 16.774505615234375, "learning_rate": 3.5208515496134557e-07, "loss": 1.2884, "step": 2901 }, { "epoch": 0.9, "grad_norm": 12.472853660583496, "learning_rate": 3.4994645897163075e-07, "loss": 1.6445, "step": 2902 }, { "epoch": 0.9, "grad_norm": 12.874034881591797, "learning_rate": 3.4781411307444306e-07, "loss": 1.0967, "step": 2903 }, { "epoch": 0.9, "grad_norm": 12.599327087402344, "learning_rate": 3.456881192907505e-07, "loss": 1.184, "step": 2904 }, { "epoch": 0.9, "grad_norm": 13.708553314208984, "learning_rate": 3.435684796355029e-07, "loss": 1.7495, "step": 2905 }, { "epoch": 0.9, "grad_norm": 14.182079315185547, "learning_rate": 3.4145519611762485e-07, "loss": 2.5004, "step": 2906 }, { "epoch": 0.9, "grad_norm": 16.279220581054688, "learning_rate": 3.393482707400165e-07, "loss": 2.1366, "step": 2907 }, { "epoch": 0.9, "grad_norm": 15.938751220703125, "learning_rate": 3.37247705499554e-07, "loss": 3.5088, "step": 2908 }, { "epoch": 0.9, "grad_norm": 13.326894760131836, "learning_rate": 3.3515350238708466e-07, "loss": 2.4735, "step": 2909 }, { "epoch": 0.9, "grad_norm": 17.642976760864258, "learning_rate": 3.330656633874248e-07, "loss": 3.1495, "step": 2910 }, { "epoch": 0.9, "grad_norm": 18.092041015625, "learning_rate": 3.309841904793617e-07, "loss": 3.6098, "step": 2911 }, { "epoch": 0.9, "grad_norm": 10.679511070251465, "learning_rate": 3.2890908563564784e-07, "loss": 2.8008, "step": 2912 }, { "epoch": 0.9, "grad_norm": 9.876250267028809, "learning_rate": 3.268403508229979e-07, "loss": 0.7715, "step": 2913 }, { "epoch": 0.9, "grad_norm": 11.585461616516113, "learning_rate": 3.247779880020936e-07, "loss": 1.6819, "step": 2914 }, { "epoch": 0.9, "grad_norm": 23.991846084594727, "learning_rate": 3.2272199912757546e-07, "loss": 3.7919, "step": 2915 }, { "epoch": 0.9, "grad_norm": 15.386876106262207, "learning_rate": 3.206723861480395e-07, "loss": 1.9322, "step": 2916 }, { "epoch": 0.9, "grad_norm": 12.495972633361816, "learning_rate": 3.18629151006046e-07, "loss": 1.1509, "step": 2917 }, { "epoch": 0.9, "grad_norm": 20.994876861572266, "learning_rate": 3.1659229563810424e-07, "loss": 4.6183, "step": 2918 }, { "epoch": 0.9, "grad_norm": 18.07956886291504, "learning_rate": 3.145618219746802e-07, "loss": 2.7403, "step": 2919 }, { "epoch": 0.9, "grad_norm": 11.444549560546875, "learning_rate": 3.1253773194019116e-07, "loss": 3.1144, "step": 2920 }, { "epoch": 0.91, "grad_norm": 13.558086395263672, "learning_rate": 3.105200274530024e-07, "loss": 6.6997, "step": 2921 }, { "epoch": 0.91, "grad_norm": 17.11377716064453, "learning_rate": 3.085087104254305e-07, "loss": 5.6643, "step": 2922 }, { "epoch": 0.91, "grad_norm": 9.213759422302246, "learning_rate": 3.065037827637346e-07, "loss": 1.05, "step": 2923 }, { "epoch": 0.91, "grad_norm": 14.40109634399414, "learning_rate": 3.0450524636812055e-07, "loss": 1.237, "step": 2924 }, { "epoch": 0.91, "grad_norm": 19.069467544555664, "learning_rate": 3.0251310313273436e-07, "loss": 2.2991, "step": 2925 }, { "epoch": 0.91, "grad_norm": 15.392562866210938, "learning_rate": 3.0052735494566775e-07, "loss": 2.6311, "step": 2926 }, { "epoch": 0.91, "grad_norm": 13.271350860595703, "learning_rate": 2.985480036889458e-07, "loss": 2.2844, "step": 2927 }, { "epoch": 0.91, "grad_norm": 18.259136199951172, "learning_rate": 2.9657505123853376e-07, "loss": 2.5049, "step": 2928 }, { "epoch": 0.91, "grad_norm": 12.51673698425293, "learning_rate": 2.9460849946433253e-07, "loss": 1.6506, "step": 2929 }, { "epoch": 0.91, "grad_norm": 14.291834831237793, "learning_rate": 2.926483502301739e-07, "loss": 1.4081, "step": 2930 }, { "epoch": 0.91, "grad_norm": 17.517236709594727, "learning_rate": 2.906946053938244e-07, "loss": 2.5833, "step": 2931 }, { "epoch": 0.91, "grad_norm": 13.520174980163574, "learning_rate": 2.887472668069808e-07, "loss": 2.3661, "step": 2932 }, { "epoch": 0.91, "grad_norm": 12.30176830291748, "learning_rate": 2.868063363152659e-07, "loss": 2.445, "step": 2933 }, { "epoch": 0.91, "grad_norm": 18.104732513427734, "learning_rate": 2.8487181575823033e-07, "loss": 2.9647, "step": 2934 }, { "epoch": 0.91, "grad_norm": 17.55752182006836, "learning_rate": 2.829437069693509e-07, "loss": 5.7919, "step": 2935 }, { "epoch": 0.91, "grad_norm": 14.471856117248535, "learning_rate": 2.8102201177602596e-07, "loss": 2.0401, "step": 2936 }, { "epoch": 0.91, "grad_norm": 8.837617874145508, "learning_rate": 2.7910673199957454e-07, "loss": 1.6894, "step": 2937 }, { "epoch": 0.91, "grad_norm": 15.523258209228516, "learning_rate": 2.7719786945523714e-07, "loss": 3.0205, "step": 2938 }, { "epoch": 0.91, "grad_norm": 15.296859741210938, "learning_rate": 2.752954259521719e-07, "loss": 3.5861, "step": 2939 }, { "epoch": 0.91, "grad_norm": 13.045928955078125, "learning_rate": 2.733994032934521e-07, "loss": 3.6416, "step": 2940 }, { "epoch": 0.91, "grad_norm": 17.02593994140625, "learning_rate": 2.7150980327606636e-07, "loss": 3.4657, "step": 2941 }, { "epoch": 0.91, "grad_norm": 10.85177993774414, "learning_rate": 2.6962662769091695e-07, "loss": 1.2278, "step": 2942 }, { "epoch": 0.91, "grad_norm": 12.752093315124512, "learning_rate": 2.677498783228158e-07, "loss": 1.7542, "step": 2943 }, { "epoch": 0.91, "grad_norm": 13.72518539428711, "learning_rate": 2.658795569504847e-07, "loss": 2.2336, "step": 2944 }, { "epoch": 0.91, "grad_norm": 10.09422492980957, "learning_rate": 2.6401566534655284e-07, "loss": 1.113, "step": 2945 }, { "epoch": 0.91, "grad_norm": 12.671250343322754, "learning_rate": 2.6215820527755593e-07, "loss": 2.0924, "step": 2946 }, { "epoch": 0.91, "grad_norm": 14.427495956420898, "learning_rate": 2.6030717850393565e-07, "loss": 4.8395, "step": 2947 }, { "epoch": 0.91, "grad_norm": 9.298993110656738, "learning_rate": 2.5846258678003323e-07, "loss": 0.7284, "step": 2948 }, { "epoch": 0.91, "grad_norm": 12.819926261901855, "learning_rate": 2.566244318540941e-07, "loss": 2.285, "step": 2949 }, { "epoch": 0.91, "grad_norm": 10.828763961791992, "learning_rate": 2.5479271546826016e-07, "loss": 1.5147, "step": 2950 }, { "epoch": 0.91, "grad_norm": 13.688743591308594, "learning_rate": 2.5296743935857376e-07, "loss": 2.0614, "step": 2951 }, { "epoch": 0.91, "grad_norm": 18.027328491210938, "learning_rate": 2.511486052549705e-07, "loss": 5.5247, "step": 2952 }, { "epoch": 0.91, "grad_norm": 11.425618171691895, "learning_rate": 2.4933621488128485e-07, "loss": 2.7768, "step": 2953 }, { "epoch": 0.92, "grad_norm": 12.18440055847168, "learning_rate": 2.4753026995523893e-07, "loss": 1.2978, "step": 2954 }, { "epoch": 0.92, "grad_norm": 15.074543952941895, "learning_rate": 2.4573077218844843e-07, "loss": 2.2961, "step": 2955 }, { "epoch": 0.92, "grad_norm": 22.014270782470703, "learning_rate": 2.439377232864206e-07, "loss": 2.1697, "step": 2956 }, { "epoch": 0.92, "grad_norm": 16.891254425048828, "learning_rate": 2.4215112494854824e-07, "loss": 2.7444, "step": 2957 }, { "epoch": 0.92, "grad_norm": 14.182608604431152, "learning_rate": 2.4037097886810964e-07, "loss": 2.7952, "step": 2958 }, { "epoch": 0.92, "grad_norm": 16.573148727416992, "learning_rate": 2.3859728673227103e-07, "loss": 2.1803, "step": 2959 }, { "epoch": 0.92, "grad_norm": 9.943724632263184, "learning_rate": 2.3683005022207926e-07, "loss": 1.5011, "step": 2960 }, { "epoch": 0.92, "grad_norm": 20.083030700683594, "learning_rate": 2.3506927101246287e-07, "loss": 2.7564, "step": 2961 }, { "epoch": 0.92, "grad_norm": 12.7942533493042, "learning_rate": 2.3331495077223274e-07, "loss": 4.9782, "step": 2962 }, { "epoch": 0.92, "grad_norm": 20.043697357177734, "learning_rate": 2.3156709116407503e-07, "loss": 5.3203, "step": 2963 }, { "epoch": 0.92, "grad_norm": 16.594661712646484, "learning_rate": 2.2982569384455443e-07, "loss": 6.6035, "step": 2964 }, { "epoch": 0.92, "grad_norm": 16.41570281982422, "learning_rate": 2.2809076046411082e-07, "loss": 2.6747, "step": 2965 }, { "epoch": 0.92, "grad_norm": 10.65675163269043, "learning_rate": 2.2636229266705798e-07, "loss": 1.5477, "step": 2966 }, { "epoch": 0.92, "grad_norm": 12.77856731414795, "learning_rate": 2.2464029209158018e-07, "loss": 1.2037, "step": 2967 }, { "epoch": 0.92, "grad_norm": 12.271510124206543, "learning_rate": 2.2292476036973393e-07, "loss": 1.5294, "step": 2968 }, { "epoch": 0.92, "grad_norm": 14.198921203613281, "learning_rate": 2.212156991274456e-07, "loss": 3.7179, "step": 2969 }, { "epoch": 0.92, "grad_norm": 17.764951705932617, "learning_rate": 2.1951310998450583e-07, "loss": 2.4399, "step": 2970 }, { "epoch": 0.92, "grad_norm": 16.994083404541016, "learning_rate": 2.1781699455457524e-07, "loss": 4.2348, "step": 2971 }, { "epoch": 0.92, "grad_norm": 18.061304092407227, "learning_rate": 2.1612735444517553e-07, "loss": 3.7625, "step": 2972 }, { "epoch": 0.92, "grad_norm": 12.471445083618164, "learning_rate": 2.1444419125769131e-07, "loss": 2.0589, "step": 2973 }, { "epoch": 0.92, "grad_norm": 14.454485893249512, "learning_rate": 2.1276750658737224e-07, "loss": 1.3984, "step": 2974 }, { "epoch": 0.92, "grad_norm": 22.4586181640625, "learning_rate": 2.110973020233238e-07, "loss": 1.4389, "step": 2975 }, { "epoch": 0.92, "grad_norm": 16.39283561706543, "learning_rate": 2.0943357914851265e-07, "loss": 5.4744, "step": 2976 }, { "epoch": 0.92, "grad_norm": 16.580299377441406, "learning_rate": 2.077763395397604e-07, "loss": 1.9475, "step": 2977 }, { "epoch": 0.92, "grad_norm": 16.463586807250977, "learning_rate": 2.0612558476774527e-07, "loss": 2.557, "step": 2978 }, { "epoch": 0.92, "grad_norm": 22.39612579345703, "learning_rate": 2.0448131639699801e-07, "loss": 4.7626, "step": 2979 }, { "epoch": 0.92, "grad_norm": 17.558107376098633, "learning_rate": 2.0284353598590362e-07, "loss": 3.7814, "step": 2980 }, { "epoch": 0.92, "grad_norm": 20.783864974975586, "learning_rate": 2.0121224508669576e-07, "loss": 3.2047, "step": 2981 }, { "epoch": 0.92, "grad_norm": 16.20199203491211, "learning_rate": 1.995874452454607e-07, "loss": 2.1022, "step": 2982 }, { "epoch": 0.92, "grad_norm": 22.333803176879883, "learning_rate": 1.9796913800212956e-07, "loss": 2.7157, "step": 2983 }, { "epoch": 0.92, "grad_norm": 17.68119239807129, "learning_rate": 1.9635732489048134e-07, "loss": 1.9325, "step": 2984 }, { "epoch": 0.92, "grad_norm": 13.95864200592041, "learning_rate": 1.9475200743813983e-07, "loss": 1.6473, "step": 2985 }, { "epoch": 0.93, "grad_norm": 16.2840633392334, "learning_rate": 1.9315318716657281e-07, "loss": 3.1788, "step": 2986 }, { "epoch": 0.93, "grad_norm": 15.319509506225586, "learning_rate": 1.9156086559108976e-07, "loss": 9.2511, "step": 2987 }, { "epoch": 0.93, "grad_norm": 18.31650161743164, "learning_rate": 1.899750442208426e-07, "loss": 3.4509, "step": 2988 }, { "epoch": 0.93, "grad_norm": 18.275609970092773, "learning_rate": 1.8839572455881937e-07, "loss": 3.013, "step": 2989 }, { "epoch": 0.93, "grad_norm": 10.587852478027344, "learning_rate": 1.868229081018483e-07, "loss": 1.4377, "step": 2990 }, { "epoch": 0.93, "grad_norm": 14.27921199798584, "learning_rate": 1.8525659634059525e-07, "loss": 2.0313, "step": 2991 }, { "epoch": 0.93, "grad_norm": 10.033051490783691, "learning_rate": 1.8369679075955767e-07, "loss": 1.1735, "step": 2992 }, { "epoch": 0.93, "grad_norm": 18.491239547729492, "learning_rate": 1.821434928370684e-07, "loss": 5.3385, "step": 2993 }, { "epoch": 0.93, "grad_norm": 12.413246154785156, "learning_rate": 1.80596704045294e-07, "loss": 1.5649, "step": 2994 }, { "epoch": 0.93, "grad_norm": 9.972743034362793, "learning_rate": 1.7905642585022874e-07, "loss": 1.9612, "step": 2995 }, { "epoch": 0.93, "grad_norm": 12.768912315368652, "learning_rate": 1.7752265971169908e-07, "loss": 1.735, "step": 2996 }, { "epoch": 0.93, "grad_norm": 13.970714569091797, "learning_rate": 1.7599540708335906e-07, "loss": 2.5504, "step": 2997 }, { "epoch": 0.93, "grad_norm": 14.848921775817871, "learning_rate": 1.7447466941268872e-07, "loss": 3.7117, "step": 2998 }, { "epoch": 0.93, "grad_norm": 7.468012809753418, "learning_rate": 1.7296044814099336e-07, "loss": 0.9068, "step": 2999 }, { "epoch": 0.93, "grad_norm": 10.256609916687012, "learning_rate": 1.7145274470340347e-07, "loss": 1.6465, "step": 3000 }, { "epoch": 0.93, "grad_norm": 15.5140380859375, "learning_rate": 1.6995156052886927e-07, "loss": 4.7481, "step": 3001 }, { "epoch": 0.93, "grad_norm": 8.324983596801758, "learning_rate": 1.684568970401655e-07, "loss": 0.8189, "step": 3002 }, { "epoch": 0.93, "grad_norm": 17.819440841674805, "learning_rate": 1.6696875565388651e-07, "loss": 2.9557, "step": 3003 }, { "epoch": 0.93, "grad_norm": 13.389687538146973, "learning_rate": 1.6548713778044263e-07, "loss": 2.1643, "step": 3004 }, { "epoch": 0.93, "grad_norm": 15.54904556274414, "learning_rate": 1.640120448240638e-07, "loss": 3.0798, "step": 3005 }, { "epoch": 0.93, "grad_norm": 14.200133323669434, "learning_rate": 1.6254347818279507e-07, "loss": 2.8567, "step": 3006 }, { "epoch": 0.93, "grad_norm": 9.55955696105957, "learning_rate": 1.610814392484958e-07, "loss": 1.0734, "step": 3007 }, { "epoch": 0.93, "grad_norm": 15.413692474365234, "learning_rate": 1.5962592940683793e-07, "loss": 3.0384, "step": 3008 }, { "epoch": 0.93, "grad_norm": 11.215126991271973, "learning_rate": 1.5817695003730858e-07, "loss": 0.9001, "step": 3009 }, { "epoch": 0.93, "grad_norm": 12.100921630859375, "learning_rate": 1.5673450251320202e-07, "loss": 1.7762, "step": 3010 }, { "epoch": 0.93, "grad_norm": 13.793547630310059, "learning_rate": 1.5529858820162204e-07, "loss": 2.3921, "step": 3011 }, { "epoch": 0.93, "grad_norm": 18.29058265686035, "learning_rate": 1.538692084634829e-07, "loss": 4.5271, "step": 3012 }, { "epoch": 0.93, "grad_norm": 9.583614349365234, "learning_rate": 1.5244636465350446e-07, "loss": 1.0375, "step": 3013 }, { "epoch": 0.93, "grad_norm": 16.553199768066406, "learning_rate": 1.5103005812020983e-07, "loss": 2.4258, "step": 3014 }, { "epoch": 0.93, "grad_norm": 19.734880447387695, "learning_rate": 1.4962029020593018e-07, "loss": 7.1062, "step": 3015 }, { "epoch": 0.93, "grad_norm": 11.817924499511719, "learning_rate": 1.482170622467961e-07, "loss": 1.8422, "step": 3016 }, { "epoch": 0.93, "grad_norm": 14.2987699508667, "learning_rate": 1.468203755727422e-07, "loss": 1.4806, "step": 3017 }, { "epoch": 0.94, "grad_norm": 14.055093765258789, "learning_rate": 1.4543023150750256e-07, "loss": 1.55, "step": 3018 }, { "epoch": 0.94, "grad_norm": 19.72188949584961, "learning_rate": 1.440466313686098e-07, "loss": 2.4603, "step": 3019 }, { "epoch": 0.94, "grad_norm": 12.469618797302246, "learning_rate": 1.4266957646739674e-07, "loss": 1.2462, "step": 3020 }, { "epoch": 0.94, "grad_norm": 12.99949836730957, "learning_rate": 1.4129906810898935e-07, "loss": 2.0896, "step": 3021 }, { "epoch": 0.94, "grad_norm": 15.28267765045166, "learning_rate": 1.3993510759231143e-07, "loss": 4.4144, "step": 3022 }, { "epoch": 0.94, "grad_norm": 13.911250114440918, "learning_rate": 1.3857769621007986e-07, "loss": 4.2494, "step": 3023 }, { "epoch": 0.94, "grad_norm": 15.130582809448242, "learning_rate": 1.372268352488063e-07, "loss": 3.7505, "step": 3024 }, { "epoch": 0.94, "grad_norm": 20.72568130493164, "learning_rate": 1.3588252598879153e-07, "loss": 2.0615, "step": 3025 }, { "epoch": 0.94, "grad_norm": 18.65958023071289, "learning_rate": 1.3454476970412798e-07, "loss": 4.6672, "step": 3026 }, { "epoch": 0.94, "grad_norm": 17.683170318603516, "learning_rate": 1.3321356766269963e-07, "loss": 2.7539, "step": 3027 }, { "epoch": 0.94, "grad_norm": 11.837944030761719, "learning_rate": 1.3188892112617414e-07, "loss": 3.0784, "step": 3028 }, { "epoch": 0.94, "grad_norm": 15.589051246643066, "learning_rate": 1.3057083135000768e-07, "loss": 1.944, "step": 3029 }, { "epoch": 0.94, "grad_norm": 13.951064109802246, "learning_rate": 1.292592995834456e-07, "loss": 2.2353, "step": 3030 }, { "epoch": 0.94, "grad_norm": 13.508352279663086, "learning_rate": 1.2795432706951309e-07, "loss": 2.7971, "step": 3031 }, { "epoch": 0.94, "grad_norm": 15.353891372680664, "learning_rate": 1.2665591504502062e-07, "loss": 1.9832, "step": 3032 }, { "epoch": 0.94, "grad_norm": 11.772056579589844, "learning_rate": 1.2536406474056165e-07, "loss": 2.069, "step": 3033 }, { "epoch": 0.94, "grad_norm": 19.559324264526367, "learning_rate": 1.2407877738050943e-07, "loss": 2.9631, "step": 3034 }, { "epoch": 0.94, "grad_norm": 15.815930366516113, "learning_rate": 1.2280005418301708e-07, "loss": 5.6007, "step": 3035 }, { "epoch": 0.94, "grad_norm": 15.518260955810547, "learning_rate": 1.215278963600175e-07, "loss": 1.9506, "step": 3036 }, { "epoch": 0.94, "grad_norm": 20.898035049438477, "learning_rate": 1.2026230511722032e-07, "loss": 2.5529, "step": 3037 }, { "epoch": 0.94, "grad_norm": 8.328916549682617, "learning_rate": 1.1900328165411105e-07, "loss": 0.758, "step": 3038 }, { "epoch": 0.94, "grad_norm": 16.16352081298828, "learning_rate": 1.1775082716395116e-07, "loss": 1.857, "step": 3039 }, { "epoch": 0.94, "grad_norm": 15.048077583312988, "learning_rate": 1.165049428337772e-07, "loss": 4.866, "step": 3040 }, { "epoch": 0.94, "grad_norm": 17.457170486450195, "learning_rate": 1.1526562984439694e-07, "loss": 3.0742, "step": 3041 }, { "epoch": 0.94, "grad_norm": 11.811216354370117, "learning_rate": 1.1403288937039175e-07, "loss": 1.2268, "step": 3042 }, { "epoch": 0.94, "grad_norm": 13.016590118408203, "learning_rate": 1.1280672258011184e-07, "loss": 2.0475, "step": 3043 }, { "epoch": 0.94, "grad_norm": 9.95260238647461, "learning_rate": 1.1158713063567786e-07, "loss": 1.129, "step": 3044 }, { "epoch": 0.94, "grad_norm": 11.272063255310059, "learning_rate": 1.1037411469298089e-07, "loss": 1.6434, "step": 3045 }, { "epoch": 0.94, "grad_norm": 13.511055946350098, "learning_rate": 1.0916767590167698e-07, "loss": 1.6796, "step": 3046 }, { "epoch": 0.94, "grad_norm": 10.552799224853516, "learning_rate": 1.0796781540518867e-07, "loss": 1.3846, "step": 3047 }, { "epoch": 0.94, "grad_norm": 13.904885292053223, "learning_rate": 1.0677453434070661e-07, "loss": 0.9898, "step": 3048 }, { "epoch": 0.94, "grad_norm": 12.032923698425293, "learning_rate": 1.0558783383918326e-07, "loss": 1.9281, "step": 3049 }, { "epoch": 0.95, "grad_norm": 14.337904930114746, "learning_rate": 1.0440771502533292e-07, "loss": 1.507, "step": 3050 }, { "epoch": 0.95, "grad_norm": 17.677717208862305, "learning_rate": 1.0323417901763561e-07, "loss": 4.5251, "step": 3051 }, { "epoch": 0.95, "grad_norm": 17.238025665283203, "learning_rate": 1.0206722692832925e-07, "loss": 4.1081, "step": 3052 }, { "epoch": 0.95, "grad_norm": 13.109561920166016, "learning_rate": 1.0090685986341439e-07, "loss": 1.8459, "step": 3053 }, { "epoch": 0.95, "grad_norm": 13.919336318969727, "learning_rate": 9.975307892264793e-08, "loss": 3.0997, "step": 3054 }, { "epoch": 0.95, "grad_norm": 13.939140319824219, "learning_rate": 9.8605885199547e-08, "loss": 2.0562, "step": 3055 }, { "epoch": 0.95, "grad_norm": 10.790946960449219, "learning_rate": 9.74652797813843e-08, "loss": 1.135, "step": 3056 }, { "epoch": 0.95, "grad_norm": 7.619593620300293, "learning_rate": 9.633126374918733e-08, "loss": 1.3455, "step": 3057 }, { "epoch": 0.95, "grad_norm": 15.63952922821045, "learning_rate": 9.52038381777407e-08, "loss": 3.9738, "step": 3058 }, { "epoch": 0.95, "grad_norm": 13.779718399047852, "learning_rate": 9.408300413558064e-08, "loss": 1.712, "step": 3059 }, { "epoch": 0.95, "grad_norm": 11.850090026855469, "learning_rate": 9.296876268499823e-08, "loss": 1.4411, "step": 3060 }, { "epoch": 0.95, "grad_norm": 9.475018501281738, "learning_rate": 9.186111488203457e-08, "loss": 4.717, "step": 3061 }, { "epoch": 0.95, "grad_norm": 10.185152053833008, "learning_rate": 9.076006177648243e-08, "loss": 1.4226, "step": 3062 }, { "epoch": 0.95, "grad_norm": 15.603704452514648, "learning_rate": 8.966560441188388e-08, "loss": 2.6218, "step": 3063 }, { "epoch": 0.95, "grad_norm": 16.038780212402344, "learning_rate": 8.85777438255295e-08, "loss": 2.4836, "step": 3064 }, { "epoch": 0.95, "grad_norm": 16.55335807800293, "learning_rate": 8.74964810484592e-08, "loss": 1.6314, "step": 3065 }, { "epoch": 0.95, "grad_norm": 16.07515525817871, "learning_rate": 8.642181710545822e-08, "loss": 2.2336, "step": 3066 }, { "epoch": 0.95, "grad_norm": 10.747037887573242, "learning_rate": 8.535375301505804e-08, "loss": 0.8815, "step": 3067 }, { "epoch": 0.95, "grad_norm": 15.257098197937012, "learning_rate": 8.429228978953389e-08, "loss": 8.3597, "step": 3068 }, { "epoch": 0.95, "grad_norm": 15.84339714050293, "learning_rate": 8.323742843490879e-08, "loss": 2.8884, "step": 3069 }, { "epoch": 0.95, "grad_norm": 18.105701446533203, "learning_rate": 8.218916995094328e-08, "loss": 2.0507, "step": 3070 }, { "epoch": 0.95, "grad_norm": 13.747038841247559, "learning_rate": 8.114751533114333e-08, "loss": 5.9551, "step": 3071 }, { "epoch": 0.95, "grad_norm": 10.658041000366211, "learning_rate": 8.011246556275555e-08, "loss": 1.7899, "step": 3072 }, { "epoch": 0.95, "grad_norm": 27.648420333862305, "learning_rate": 7.908402162676728e-08, "loss": 2.5052, "step": 3073 }, { "epoch": 0.95, "grad_norm": 16.756755828857422, "learning_rate": 7.806218449790182e-08, "loss": 1.9382, "step": 3074 }, { "epoch": 0.95, "grad_norm": 15.293566703796387, "learning_rate": 7.704695514462554e-08, "loss": 1.938, "step": 3075 }, { "epoch": 0.95, "grad_norm": 15.618069648742676, "learning_rate": 7.603833452913845e-08, "loss": 6.5341, "step": 3076 }, { "epoch": 0.95, "grad_norm": 12.356037139892578, "learning_rate": 7.503632360737888e-08, "loss": 1.3101, "step": 3077 }, { "epoch": 0.95, "grad_norm": 13.837542533874512, "learning_rate": 7.40409233290204e-08, "loss": 1.2993, "step": 3078 }, { "epoch": 0.95, "grad_norm": 9.97978401184082, "learning_rate": 7.30521346374702e-08, "loss": 1.4106, "step": 3079 }, { "epoch": 0.95, "grad_norm": 13.803317070007324, "learning_rate": 7.206995846986914e-08, "loss": 4.2463, "step": 3080 }, { "epoch": 0.95, "grad_norm": 15.993304252624512, "learning_rate": 7.109439575709405e-08, "loss": 1.9746, "step": 3081 }, { "epoch": 0.95, "grad_norm": 14.088014602661133, "learning_rate": 7.012544742375074e-08, "loss": 1.7334, "step": 3082 }, { "epoch": 0.96, "grad_norm": 10.91285514831543, "learning_rate": 6.916311438817549e-08, "loss": 0.7295, "step": 3083 }, { "epoch": 0.96, "grad_norm": 8.432781219482422, "learning_rate": 6.820739756243829e-08, "loss": 1.5952, "step": 3084 }, { "epoch": 0.96, "grad_norm": 14.103729248046875, "learning_rate": 6.725829785233487e-08, "loss": 1.9413, "step": 3085 }, { "epoch": 0.96, "grad_norm": 15.959000587463379, "learning_rate": 6.631581615739155e-08, "loss": 2.0251, "step": 3086 }, { "epoch": 0.96, "grad_norm": 14.486614227294922, "learning_rate": 6.537995337086198e-08, "loss": 2.2165, "step": 3087 }, { "epoch": 0.96, "grad_norm": 20.323394775390625, "learning_rate": 6.445071037972566e-08, "loss": 2.8939, "step": 3088 }, { "epoch": 0.96, "grad_norm": 15.170675277709961, "learning_rate": 6.352808806468945e-08, "loss": 5.9418, "step": 3089 }, { "epoch": 0.96, "grad_norm": 25.27256965637207, "learning_rate": 6.261208730018529e-08, "loss": 1.3611, "step": 3090 }, { "epoch": 0.96, "grad_norm": 13.025799751281738, "learning_rate": 6.170270895436855e-08, "loss": 1.794, "step": 3091 }, { "epoch": 0.96, "grad_norm": 26.44091033935547, "learning_rate": 6.079995388911727e-08, "loss": 3.4504, "step": 3092 }, { "epoch": 0.96, "grad_norm": 15.841899871826172, "learning_rate": 5.990382296003459e-08, "loss": 2.8998, "step": 3093 }, { "epoch": 0.96, "grad_norm": 14.168822288513184, "learning_rate": 5.901431701644315e-08, "loss": 2.3676, "step": 3094 }, { "epoch": 0.96, "grad_norm": 18.31540870666504, "learning_rate": 5.81314369013891e-08, "loss": 4.4452, "step": 3095 }, { "epoch": 0.96, "grad_norm": 12.244515419006348, "learning_rate": 5.725518345163656e-08, "loss": 1.7953, "step": 3096 }, { "epoch": 0.96, "grad_norm": 16.17121696472168, "learning_rate": 5.638555749766998e-08, "loss": 2.4318, "step": 3097 }, { "epoch": 0.96, "grad_norm": 20.633235931396484, "learning_rate": 5.552255986369262e-08, "loss": 1.9895, "step": 3098 }, { "epoch": 0.96, "grad_norm": 12.283513069152832, "learning_rate": 5.46661913676257e-08, "loss": 1.7769, "step": 3099 }, { "epoch": 0.96, "grad_norm": 15.421818733215332, "learning_rate": 5.381645282110687e-08, "loss": 2.0828, "step": 3100 }, { "epoch": 0.96, "grad_norm": 18.69306755065918, "learning_rate": 5.2973345029491796e-08, "loss": 3.2615, "step": 3101 }, { "epoch": 0.96, "grad_norm": 9.494669914245605, "learning_rate": 5.213686879184941e-08, "loss": 1.1403, "step": 3102 }, { "epoch": 0.96, "grad_norm": 19.049734115600586, "learning_rate": 5.130702490096508e-08, "loss": 1.6026, "step": 3103 }, { "epoch": 0.96, "grad_norm": 11.46169662475586, "learning_rate": 5.048381414333827e-08, "loss": 1.7153, "step": 3104 }, { "epoch": 0.96, "grad_norm": 9.212632179260254, "learning_rate": 4.966723729918015e-08, "loss": 0.8264, "step": 3105 }, { "epoch": 0.96, "grad_norm": 17.575876235961914, "learning_rate": 4.8857295142416755e-08, "loss": 3.2923, "step": 3106 }, { "epoch": 0.96, "grad_norm": 15.128787994384766, "learning_rate": 4.8053988440684294e-08, "loss": 1.3332, "step": 3107 }, { "epoch": 0.96, "grad_norm": 17.256290435791016, "learning_rate": 4.7257317955331486e-08, "loss": 4.6119, "step": 3108 }, { "epoch": 0.96, "grad_norm": 16.599822998046875, "learning_rate": 4.646728444141565e-08, "loss": 4.0102, "step": 3109 }, { "epoch": 0.96, "grad_norm": 16.660905838012695, "learning_rate": 4.568388864770504e-08, "loss": 1.9701, "step": 3110 }, { "epoch": 0.96, "grad_norm": 15.143543243408203, "learning_rate": 4.4907131316677315e-08, "loss": 1.8301, "step": 3111 }, { "epoch": 0.96, "grad_norm": 13.858985900878906, "learning_rate": 4.413701318451793e-08, "loss": 1.576, "step": 3112 }, { "epoch": 0.96, "grad_norm": 16.94219970703125, "learning_rate": 4.337353498111781e-08, "loss": 2.0578, "step": 3113 }, { "epoch": 0.96, "grad_norm": 14.752725601196289, "learning_rate": 4.261669743007885e-08, "loss": 3.3421, "step": 3114 }, { "epoch": 0.97, "grad_norm": 15.993329048156738, "learning_rate": 4.1866501248706036e-08, "loss": 6.4533, "step": 3115 }, { "epoch": 0.97, "grad_norm": 11.602298736572266, "learning_rate": 4.11229471480114e-08, "loss": 1.2179, "step": 3116 }, { "epoch": 0.97, "grad_norm": 10.814712524414062, "learning_rate": 4.038603583271089e-08, "loss": 1.283, "step": 3117 }, { "epoch": 0.97, "grad_norm": 14.151748657226562, "learning_rate": 3.9655768001225895e-08, "loss": 2.2273, "step": 3118 }, { "epoch": 0.97, "grad_norm": 14.59679126739502, "learning_rate": 3.89321443456786e-08, "loss": 2.1082, "step": 3119 }, { "epoch": 0.97, "grad_norm": 12.78852367401123, "learning_rate": 3.821516555189821e-08, "loss": 1.6505, "step": 3120 }, { "epoch": 0.97, "grad_norm": 14.897425651550293, "learning_rate": 3.7504832299412366e-08, "loss": 1.7861, "step": 3121 }, { "epoch": 0.97, "grad_norm": 16.76914405822754, "learning_rate": 3.6801145261451817e-08, "loss": 4.539, "step": 3122 }, { "epoch": 0.97, "grad_norm": 10.46123218536377, "learning_rate": 3.610410510494888e-08, "loss": 1.2907, "step": 3123 }, { "epoch": 0.97, "grad_norm": 15.13853645324707, "learning_rate": 3.5413712490535053e-08, "loss": 2.8722, "step": 3124 }, { "epoch": 0.97, "grad_norm": 16.077030181884766, "learning_rate": 3.4729968072542645e-08, "loss": 1.5336, "step": 3125 }, { "epoch": 0.97, "grad_norm": 17.59398651123047, "learning_rate": 3.4052872499001574e-08, "loss": 3.9501, "step": 3126 }, { "epoch": 0.97, "grad_norm": 18.4521541595459, "learning_rate": 3.3382426411641764e-08, "loss": 1.9943, "step": 3127 }, { "epoch": 0.97, "grad_norm": 16.40480613708496, "learning_rate": 3.2718630445889206e-08, "loss": 1.7157, "step": 3128 }, { "epoch": 0.97, "grad_norm": 14.485817909240723, "learning_rate": 3.206148523086832e-08, "loss": 1.8532, "step": 3129 }, { "epoch": 0.97, "grad_norm": 18.246952056884766, "learning_rate": 3.14109913894004e-08, "loss": 4.0219, "step": 3130 }, { "epoch": 0.97, "grad_norm": 12.424639701843262, "learning_rate": 3.076714953800199e-08, "loss": 1.6797, "step": 3131 }, { "epoch": 0.97, "grad_norm": 14.660819053649902, "learning_rate": 3.012996028688575e-08, "loss": 2.8193, "step": 3132 }, { "epoch": 0.97, "grad_norm": 13.350130081176758, "learning_rate": 2.949942423995884e-08, "loss": 3.0054, "step": 3133 }, { "epoch": 0.97, "grad_norm": 11.52187728881836, "learning_rate": 2.887554199482214e-08, "loss": 2.7958, "step": 3134 }, { "epoch": 0.97, "grad_norm": 12.523505210876465, "learning_rate": 2.8258314142771835e-08, "loss": 1.1539, "step": 3135 }, { "epoch": 0.97, "grad_norm": 15.402833938598633, "learning_rate": 2.764774126879626e-08, "loss": 2.2724, "step": 3136 }, { "epoch": 0.97, "grad_norm": 19.74759864807129, "learning_rate": 2.7043823951575144e-08, "loss": 1.6459, "step": 3137 }, { "epoch": 0.97, "grad_norm": 11.21536636352539, "learning_rate": 2.644656276348349e-08, "loss": 1.2842, "step": 3138 }, { "epoch": 0.97, "grad_norm": 18.24991226196289, "learning_rate": 2.585595827058535e-08, "loss": 2.9906, "step": 3139 }, { "epoch": 0.97, "grad_norm": 12.516609191894531, "learning_rate": 2.527201103263615e-08, "loss": 1.2808, "step": 3140 }, { "epoch": 0.97, "grad_norm": 13.044878005981445, "learning_rate": 2.4694721603082695e-08, "loss": 1.7913, "step": 3141 }, { "epoch": 0.97, "grad_norm": 19.25539779663086, "learning_rate": 2.4124090529060035e-08, "loss": 2.0492, "step": 3142 }, { "epoch": 0.97, "grad_norm": 11.908823013305664, "learning_rate": 2.3560118351393824e-08, "loss": 1.1524, "step": 3143 }, { "epoch": 0.97, "grad_norm": 11.41439151763916, "learning_rate": 2.3002805604599526e-08, "loss": 0.7916, "step": 3144 }, { "epoch": 0.97, "grad_norm": 10.581273078918457, "learning_rate": 2.245215281687929e-08, "loss": 1.5594, "step": 3145 }, { "epoch": 0.97, "grad_norm": 12.786311149597168, "learning_rate": 2.1908160510124297e-08, "loss": 1.3038, "step": 3146 }, { "epoch": 0.98, "grad_norm": 11.598114967346191, "learning_rate": 2.1370829199913194e-08, "loss": 1.4897, "step": 3147 }, { "epoch": 0.98, "grad_norm": 9.448434829711914, "learning_rate": 2.0840159395508963e-08, "loss": 1.2894, "step": 3148 }, { "epoch": 0.98, "grad_norm": 13.518309593200684, "learning_rate": 2.0316151599865964e-08, "loss": 1.4369, "step": 3149 }, { "epoch": 0.98, "grad_norm": 9.300780296325684, "learning_rate": 1.979880630961976e-08, "loss": 1.6881, "step": 3150 }, { "epoch": 0.98, "grad_norm": 18.67576789855957, "learning_rate": 1.9288124015094174e-08, "loss": 2.6745, "step": 3151 }, { "epoch": 0.98, "grad_norm": 10.411372184753418, "learning_rate": 1.8784105200296566e-08, "loss": 0.9355, "step": 3152 }, { "epoch": 0.98, "grad_norm": 10.240246772766113, "learning_rate": 1.8286750342920988e-08, "loss": 1.6935, "step": 3153 }, { "epoch": 0.98, "grad_norm": 16.537799835205078, "learning_rate": 1.77960599143427e-08, "loss": 1.5408, "step": 3154 }, { "epoch": 0.98, "grad_norm": 14.367791175842285, "learning_rate": 1.7312034379622853e-08, "loss": 5.0619, "step": 3155 }, { "epoch": 0.98, "grad_norm": 12.820769309997559, "learning_rate": 1.6834674197504595e-08, "loss": 1.0659, "step": 3156 }, { "epoch": 0.98, "grad_norm": 12.2417573928833, "learning_rate": 1.636397982041462e-08, "loss": 4.2424, "step": 3157 }, { "epoch": 0.98, "grad_norm": 10.1566801071167, "learning_rate": 1.5899951694462396e-08, "loss": 1.7313, "step": 3158 }, { "epoch": 0.98, "grad_norm": 19.42092514038086, "learning_rate": 1.5442590259437807e-08, "loss": 2.3418, "step": 3159 }, { "epoch": 0.98, "grad_norm": 16.40899658203125, "learning_rate": 1.499189594881273e-08, "loss": 3.196, "step": 3160 }, { "epoch": 0.98, "grad_norm": 17.25589942932129, "learning_rate": 1.4547869189741808e-08, "loss": 2.7817, "step": 3161 }, { "epoch": 0.98, "grad_norm": 7.910188674926758, "learning_rate": 1.4110510403058546e-08, "loss": 0.5826, "step": 3162 }, { "epoch": 0.98, "grad_norm": 10.37157154083252, "learning_rate": 1.3679820003276866e-08, "loss": 0.8978, "step": 3163 }, { "epoch": 0.98, "grad_norm": 18.129549026489258, "learning_rate": 1.3255798398591895e-08, "loss": 3.2344, "step": 3164 }, { "epoch": 0.98, "grad_norm": 17.443370819091797, "learning_rate": 1.2838445990876835e-08, "loss": 2.0711, "step": 3165 }, { "epoch": 0.98, "grad_norm": 13.30059814453125, "learning_rate": 1.2427763175685308e-08, "loss": 3.252, "step": 3166 }, { "epoch": 0.98, "grad_norm": 12.549692153930664, "learning_rate": 1.202375034224823e-08, "loss": 1.5618, "step": 3167 }, { "epoch": 0.98, "grad_norm": 19.086090087890625, "learning_rate": 1.1626407873477718e-08, "loss": 1.8955, "step": 3168 }, { "epoch": 0.98, "grad_norm": 12.417947769165039, "learning_rate": 1.1235736145959269e-08, "loss": 1.935, "step": 3169 }, { "epoch": 0.98, "grad_norm": 13.785294532775879, "learning_rate": 1.0851735529961149e-08, "loss": 5.1338, "step": 3170 }, { "epoch": 0.98, "grad_norm": 11.394896507263184, "learning_rate": 1.0474406389425783e-08, "loss": 1.197, "step": 3171 }, { "epoch": 0.98, "grad_norm": 13.535491943359375, "learning_rate": 1.010374908197289e-08, "loss": 2.2509, "step": 3172 }, { "epoch": 0.98, "grad_norm": 15.352787017822266, "learning_rate": 9.739763958900261e-09, "loss": 1.9027, "step": 3173 }, { "epoch": 0.98, "grad_norm": 16.808731079101562, "learning_rate": 9.382451365180627e-09, "loss": 3.5206, "step": 3174 }, { "epoch": 0.98, "grad_norm": 17.150636672973633, "learning_rate": 9.031811639463234e-09, "loss": 1.1038, "step": 3175 }, { "epoch": 0.98, "grad_norm": 9.438704490661621, "learning_rate": 8.687845114073828e-09, "loss": 1.4847, "step": 3176 }, { "epoch": 0.98, "grad_norm": 14.827310562133789, "learning_rate": 8.350552115011539e-09, "loss": 2.1309, "step": 3177 }, { "epoch": 0.98, "grad_norm": 12.88956356048584, "learning_rate": 8.019932961952003e-09, "loss": 1.8953, "step": 3178 }, { "epoch": 0.98, "grad_norm": 21.6605224609375, "learning_rate": 7.6959879682458e-09, "loss": 6.0578, "step": 3179 }, { "epoch": 0.99, "grad_norm": 13.935561180114746, "learning_rate": 7.378717440916882e-09, "loss": 1.2751, "step": 3180 }, { "epoch": 0.99, "grad_norm": 16.300317764282227, "learning_rate": 7.068121680664153e-09, "loss": 2.9739, "step": 3181 }, { "epoch": 0.99, "grad_norm": 15.14452838897705, "learning_rate": 6.764200981859886e-09, "loss": 1.763, "step": 3182 }, { "epoch": 0.99, "grad_norm": 12.049345016479492, "learning_rate": 6.4669556325513025e-09, "loss": 1.719, "step": 3183 }, { "epoch": 0.99, "grad_norm": 21.240371704101562, "learning_rate": 6.176385914455868e-09, "loss": 3.6554, "step": 3184 }, { "epoch": 0.99, "grad_norm": 15.055821418762207, "learning_rate": 5.892492102967556e-09, "loss": 3.6622, "step": 3185 }, { "epoch": 0.99, "grad_norm": 12.99221134185791, "learning_rate": 5.615274467152154e-09, "loss": 2.4352, "step": 3186 }, { "epoch": 0.99, "grad_norm": 14.632514953613281, "learning_rate": 5.344733269745694e-09, "loss": 1.2119, "step": 3187 }, { "epoch": 0.99, "grad_norm": 15.027057647705078, "learning_rate": 5.080868767159935e-09, "loss": 4.8931, "step": 3188 }, { "epoch": 0.99, "grad_norm": 13.23940658569336, "learning_rate": 4.8236812094768826e-09, "loss": 2.2475, "step": 3189 }, { "epoch": 0.99, "grad_norm": 8.672513008117676, "learning_rate": 4.5731708404487875e-09, "loss": 0.9679, "step": 3190 }, { "epoch": 0.99, "grad_norm": 11.244919776916504, "learning_rate": 4.329337897503627e-09, "loss": 1.4577, "step": 3191 }, { "epoch": 0.99, "grad_norm": 15.41777229309082, "learning_rate": 4.0921826117372775e-09, "loss": 1.9401, "step": 3192 }, { "epoch": 0.99, "grad_norm": 12.77115249633789, "learning_rate": 3.861705207916644e-09, "loss": 1.5778, "step": 3193 }, { "epoch": 0.99, "grad_norm": 14.487737655639648, "learning_rate": 3.637905904482791e-09, "loss": 1.9504, "step": 3194 }, { "epoch": 0.99, "grad_norm": 11.114299774169922, "learning_rate": 3.4207849135446836e-09, "loss": 1.9605, "step": 3195 }, { "epoch": 0.99, "grad_norm": 12.995927810668945, "learning_rate": 3.210342440881531e-09, "loss": 1.1768, "step": 3196 }, { "epoch": 0.99, "grad_norm": 18.027769088745117, "learning_rate": 3.0065786859451382e-09, "loss": 5.1756, "step": 3197 }, { "epoch": 0.99, "grad_norm": 16.118772506713867, "learning_rate": 2.8094938418552095e-09, "loss": 2.346, "step": 3198 }, { "epoch": 0.99, "grad_norm": 12.7620849609375, "learning_rate": 2.6190880954024777e-09, "loss": 1.8738, "step": 3199 }, { "epoch": 0.99, "grad_norm": 10.473453521728516, "learning_rate": 2.4353616270479216e-09, "loss": 1.2152, "step": 3200 } ], "logging_steps": 1, "max_steps": 3227, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }