{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9995586406130883,
  "eval_steps": 500,
  "global_step": 4671,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003209886450266822,
      "grad_norm": 4.5,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 1.969,
      "step": 5
    },
    {
      "epoch": 0.006419772900533644,
      "grad_norm": 2.703125,
      "learning_rate": 2.5714285714285714e-05,
      "loss": 1.9039,
      "step": 10
    },
    {
      "epoch": 0.009629659350800466,
      "grad_norm": 2.515625,
      "learning_rate": 4e-05,
      "loss": 1.7918,
      "step": 15
    },
    {
      "epoch": 0.012839545801067288,
      "grad_norm": 2.15625,
      "learning_rate": 5.428571428571428e-05,
      "loss": 1.6624,
      "step": 20
    },
    {
      "epoch": 0.01604943225133411,
      "grad_norm": 1.9296875,
      "learning_rate": 6.857142857142858e-05,
      "loss": 1.5578,
      "step": 25
    },
    {
      "epoch": 0.01925931870160093,
      "grad_norm": 1.921875,
      "learning_rate": 8.285714285714287e-05,
      "loss": 1.4779,
      "step": 30
    },
    {
      "epoch": 0.022469205151867754,
      "grad_norm": 1.875,
      "learning_rate": 9.714285714285715e-05,
      "loss": 1.4165,
      "step": 35
    },
    {
      "epoch": 0.025679091602134576,
      "grad_norm": 1.8671875,
      "learning_rate": 9.999986223659144e-05,
      "loss": 1.3396,
      "step": 40
    },
    {
      "epoch": 0.028888978052401395,
      "grad_norm": 1.765625,
      "learning_rate": 9.999930257447894e-05,
      "loss": 1.3222,
      "step": 45
    },
    {
      "epoch": 0.03209886450266822,
      "grad_norm": 1.7578125,
      "learning_rate": 9.99983124098696e-05,
      "loss": 1.271,
      "step": 50
    },
    {
      "epoch": 0.03530875095293504,
      "grad_norm": 1.8046875,
      "learning_rate": 9.99968917541308e-05,
      "loss": 1.2353,
      "step": 55
    },
    {
      "epoch": 0.03851863740320186,
      "grad_norm": 1.8125,
      "learning_rate": 9.999504062357203e-05,
      "loss": 1.2284,
      "step": 60
    },
    {
      "epoch": 0.04172852385346868,
      "grad_norm": 1.640625,
      "learning_rate": 9.999275903944482e-05,
      "loss": 1.2037,
      "step": 65
    },
    {
      "epoch": 0.04493841030373551,
      "grad_norm": 1.765625,
      "learning_rate": 9.99900470279424e-05,
      "loss": 1.1832,
      "step": 70
    },
    {
      "epoch": 0.048148296754002326,
      "grad_norm": 1.7265625,
      "learning_rate": 9.998690462019939e-05,
      "loss": 1.1533,
      "step": 75
    },
    {
      "epoch": 0.05135818320426915,
      "grad_norm": 1.7734375,
      "learning_rate": 9.998333185229152e-05,
      "loss": 1.1481,
      "step": 80
    },
    {
      "epoch": 0.05456806965453597,
      "grad_norm": 1.90625,
      "learning_rate": 9.99793287652352e-05,
      "loss": 1.1369,
      "step": 85
    },
    {
      "epoch": 0.05777795610480279,
      "grad_norm": 1.765625,
      "learning_rate": 9.997489540498695e-05,
      "loss": 1.1191,
      "step": 90
    },
    {
      "epoch": 0.060987842555069616,
      "grad_norm": 1.7421875,
      "learning_rate": 9.9970031822443e-05,
      "loss": 1.1189,
      "step": 95
    },
    {
      "epoch": 0.06419772900533643,
      "grad_norm": 1.625,
      "learning_rate": 9.996473807343865e-05,
      "loss": 1.0978,
      "step": 100
    },
    {
      "epoch": 0.06740761545560325,
      "grad_norm": 1.9375,
      "learning_rate": 9.995901421874761e-05,
      "loss": 1.0831,
      "step": 105
    },
    {
      "epoch": 0.07061750190587009,
      "grad_norm": 1.9609375,
      "learning_rate": 9.995286032408134e-05,
      "loss": 1.0734,
      "step": 110
    },
    {
      "epoch": 0.0738273883561369,
      "grad_norm": 1.7890625,
      "learning_rate": 9.994627646008827e-05,
      "loss": 1.0588,
      "step": 115
    },
    {
      "epoch": 0.07703727480640372,
      "grad_norm": 2.0625,
      "learning_rate": 9.993926270235301e-05,
      "loss": 1.0553,
      "step": 120
    },
    {
      "epoch": 0.08024716125667054,
      "grad_norm": 1.5625,
      "learning_rate": 9.993181913139545e-05,
      "loss": 1.0605,
      "step": 125
    },
    {
      "epoch": 0.08345704770693736,
      "grad_norm": 1.6796875,
      "learning_rate": 9.992394583266989e-05,
      "loss": 1.0296,
      "step": 130
    },
    {
      "epoch": 0.0866669341572042,
      "grad_norm": 1.65625,
      "learning_rate": 9.991564289656398e-05,
      "loss": 1.0441,
      "step": 135
    },
    {
      "epoch": 0.08987682060747101,
      "grad_norm": 1.4609375,
      "learning_rate": 9.990691041839778e-05,
      "loss": 1.0367,
      "step": 140
    },
    {
      "epoch": 0.09308670705773783,
      "grad_norm": 1.625,
      "learning_rate": 9.989774849842257e-05,
      "loss": 1.0188,
      "step": 145
    },
    {
      "epoch": 0.09629659350800465,
      "grad_norm": 1.6328125,
      "learning_rate": 9.988815724181975e-05,
      "loss": 1.0121,
      "step": 150
    },
    {
      "epoch": 0.09950647995827147,
      "grad_norm": 1.6953125,
      "learning_rate": 9.987813675869966e-05,
      "loss": 1.0097,
      "step": 155
    },
    {
      "epoch": 0.1027163664085383,
      "grad_norm": 1.8984375,
      "learning_rate": 9.98676871641002e-05,
      "loss": 1.0222,
      "step": 160
    },
    {
      "epoch": 0.10592625285880512,
      "grad_norm": 1.921875,
      "learning_rate": 9.98568085779857e-05,
      "loss": 0.9847,
      "step": 165
    },
    {
      "epoch": 0.10913613930907194,
      "grad_norm": 1.6484375,
      "learning_rate": 9.984550112524535e-05,
      "loss": 1.0177,
      "step": 170
    },
    {
      "epoch": 0.11234602575933876,
      "grad_norm": 1.7109375,
      "learning_rate": 9.983376493569186e-05,
      "loss": 0.986,
      "step": 175
    },
    {
      "epoch": 0.11555591220960558,
      "grad_norm": 1.6875,
      "learning_rate": 9.982160014406001e-05,
      "loss": 0.996,
      "step": 180
    },
    {
      "epoch": 0.11876579865987241,
      "grad_norm": 1.6015625,
      "learning_rate": 9.980900689000498e-05,
      "loss": 0.9572,
      "step": 185
    },
    {
      "epoch": 0.12197568511013923,
      "grad_norm": 1.5703125,
      "learning_rate": 9.979598531810088e-05,
      "loss": 0.9589,
      "step": 190
    },
    {
      "epoch": 0.12518557156040605,
      "grad_norm": 1.7109375,
      "learning_rate": 9.978253557783898e-05,
      "loss": 0.9885,
      "step": 195
    },
    {
      "epoch": 0.12839545801067287,
      "grad_norm": 1.5390625,
      "learning_rate": 9.97686578236261e-05,
      "loss": 0.9701,
      "step": 200
    },
    {
      "epoch": 0.1316053444609397,
      "grad_norm": 1.6484375,
      "learning_rate": 9.97543522147827e-05,
      "loss": 0.96,
      "step": 205
    },
    {
      "epoch": 0.1348152309112065,
      "grad_norm": 1.6484375,
      "learning_rate": 9.97396189155412e-05,
      "loss": 0.9497,
      "step": 210
    },
    {
      "epoch": 0.13802511736147333,
      "grad_norm": 1.7734375,
      "learning_rate": 9.9724458095044e-05,
      "loss": 0.9269,
      "step": 215
    },
    {
      "epoch": 0.14123500381174017,
      "grad_norm": 1.6015625,
      "learning_rate": 9.970886992734156e-05,
      "loss": 0.9376,
      "step": 220
    },
    {
      "epoch": 0.144444890262007,
      "grad_norm": 1.59375,
      "learning_rate": 9.969285459139044e-05,
      "loss": 0.9344,
      "step": 225
    },
    {
      "epoch": 0.1476547767122738,
      "grad_norm": 1.5625,
      "learning_rate": 9.967641227105115e-05,
      "loss": 0.9316,
      "step": 230
    },
    {
      "epoch": 0.15086466316254063,
      "grad_norm": 1.6875,
      "learning_rate": 9.965954315508615e-05,
      "loss": 0.9611,
      "step": 235
    },
    {
      "epoch": 0.15407454961280745,
      "grad_norm": 1.5859375,
      "learning_rate": 9.964224743715759e-05,
      "loss": 0.9371,
      "step": 240
    },
    {
      "epoch": 0.15728443606307427,
      "grad_norm": 1.7265625,
      "learning_rate": 9.962452531582519e-05,
      "loss": 0.9436,
      "step": 245
    },
    {
      "epoch": 0.1604943225133411,
      "grad_norm": 1.65625,
      "learning_rate": 9.960637699454385e-05,
      "loss": 0.9463,
      "step": 250
    },
    {
      "epoch": 0.1637042089636079,
      "grad_norm": 1.6875,
      "learning_rate": 9.95878026816614e-05,
      "loss": 0.9082,
      "step": 255
    },
    {
      "epoch": 0.16691409541387472,
      "grad_norm": 1.7578125,
      "learning_rate": 9.95688025904161e-05,
      "loss": 0.9109,
      "step": 260
    },
    {
      "epoch": 0.17012398186414154,
      "grad_norm": 1.6328125,
      "learning_rate": 9.954937693893438e-05,
      "loss": 0.9137,
      "step": 265
    },
    {
      "epoch": 0.1733338683144084,
      "grad_norm": 1.703125,
      "learning_rate": 9.952952595022813e-05,
      "loss": 0.9238,
      "step": 270
    },
    {
      "epoch": 0.1765437547646752,
      "grad_norm": 1.84375,
      "learning_rate": 9.950924985219228e-05,
      "loss": 0.9301,
      "step": 275
    },
    {
      "epoch": 0.17975364121494203,
      "grad_norm": 1.7265625,
      "learning_rate": 9.94885488776021e-05,
      "loss": 0.8841,
      "step": 280
    },
    {
      "epoch": 0.18296352766520885,
      "grad_norm": 1.7734375,
      "learning_rate": 9.946742326411057e-05,
      "loss": 0.8775,
      "step": 285
    },
    {
      "epoch": 0.18617341411547567,
      "grad_norm": 1.703125,
      "learning_rate": 9.944587325424566e-05,
      "loss": 0.8849,
      "step": 290
    },
    {
      "epoch": 0.18938330056574249,
      "grad_norm": 1.5546875,
      "learning_rate": 9.942389909540753e-05,
      "loss": 0.9084,
      "step": 295
    },
    {
      "epoch": 0.1925931870160093,
      "grad_norm": 1.65625,
      "learning_rate": 9.940150103986565e-05,
      "loss": 0.8777,
      "step": 300
    },
    {
      "epoch": 0.19580307346627612,
      "grad_norm": 1.7734375,
      "learning_rate": 9.9378679344756e-05,
      "loss": 0.8883,
      "step": 305
    },
    {
      "epoch": 0.19901295991654294,
      "grad_norm": 1.5625,
      "learning_rate": 9.935543427207801e-05,
      "loss": 0.8874,
      "step": 310
    },
    {
      "epoch": 0.20222284636680976,
      "grad_norm": 1.7890625,
      "learning_rate": 9.933176608869166e-05,
      "loss": 0.8846,
      "step": 315
    },
    {
      "epoch": 0.2054327328170766,
      "grad_norm": 1.75,
      "learning_rate": 9.930767506631427e-05,
      "loss": 0.9083,
      "step": 320
    },
    {
      "epoch": 0.20864261926734343,
      "grad_norm": 1.6796875,
      "learning_rate": 9.928316148151756e-05,
      "loss": 0.9058,
      "step": 325
    },
    {
      "epoch": 0.21185250571761025,
      "grad_norm": 1.5625,
      "learning_rate": 9.925822561572435e-05,
      "loss": 0.8871,
      "step": 330
    },
    {
      "epoch": 0.21506239216787706,
      "grad_norm": 1.625,
      "learning_rate": 9.923286775520537e-05,
      "loss": 0.8707,
      "step": 335
    },
    {
      "epoch": 0.21827227861814388,
      "grad_norm": 1.6953125,
      "learning_rate": 9.920708819107593e-05,
      "loss": 0.8788,
      "step": 340
    },
    {
      "epoch": 0.2214821650684107,
      "grad_norm": 1.625,
      "learning_rate": 9.918088721929266e-05,
      "loss": 0.867,
      "step": 345
    },
    {
      "epoch": 0.22469205151867752,
      "grad_norm": 1.59375,
      "learning_rate": 9.915426514065007e-05,
      "loss": 0.8763,
      "step": 350
    },
    {
      "epoch": 0.22790193796894434,
      "grad_norm": 1.6875,
      "learning_rate": 9.912722226077709e-05,
      "loss": 0.8843,
      "step": 355
    },
    {
      "epoch": 0.23111182441921116,
      "grad_norm": 1.5703125,
      "learning_rate": 9.90997588901335e-05,
      "loss": 0.8689,
      "step": 360
    },
    {
      "epoch": 0.234321710869478,
      "grad_norm": 1.6953125,
      "learning_rate": 9.907187534400655e-05,
      "loss": 0.8666,
      "step": 365
    },
    {
      "epoch": 0.23753159731974482,
      "grad_norm": 1.6171875,
      "learning_rate": 9.90435719425071e-05,
      "loss": 0.8511,
      "step": 370
    },
    {
      "epoch": 0.24074148377001164,
      "grad_norm": 1.6953125,
      "learning_rate": 9.90148490105662e-05,
      "loss": 0.8491,
      "step": 375
    },
    {
      "epoch": 0.24395137022027846,
      "grad_norm": 1.8359375,
      "learning_rate": 9.898570687793107e-05,
      "loss": 0.8691,
      "step": 380
    },
    {
      "epoch": 0.24716125667054528,
      "grad_norm": 1.46875,
      "learning_rate": 9.895614587916162e-05,
      "loss": 0.8243,
      "step": 385
    },
    {
      "epoch": 0.2503711431208121,
      "grad_norm": 1.40625,
      "learning_rate": 9.892616635362637e-05,
      "loss": 0.8645,
      "step": 390
    },
    {
      "epoch": 0.2535810295710789,
      "grad_norm": 1.6171875,
      "learning_rate": 9.889576864549867e-05,
      "loss": 0.8191,
      "step": 395
    },
    {
      "epoch": 0.25679091602134574,
      "grad_norm": 1.5703125,
      "learning_rate": 9.886495310375275e-05,
      "loss": 0.8665,
      "step": 400
    },
    {
      "epoch": 0.26000080247161256,
      "grad_norm": 1.421875,
      "learning_rate": 9.883372008215962e-05,
      "loss": 0.8695,
      "step": 405
    },
    {
      "epoch": 0.2632106889218794,
      "grad_norm": 1.5,
      "learning_rate": 9.880206993928313e-05,
      "loss": 0.8283,
      "step": 410
    },
    {
      "epoch": 0.2664205753721462,
      "grad_norm": 1.421875,
      "learning_rate": 9.87700030384758e-05,
      "loss": 0.823,
      "step": 415
    },
    {
      "epoch": 0.269630461822413,
      "grad_norm": 1.5390625,
      "learning_rate": 9.873751974787461e-05,
      "loss": 0.8196,
      "step": 420
    },
    {
      "epoch": 0.27284034827267983,
      "grad_norm": 1.546875,
      "learning_rate": 9.870462044039685e-05,
      "loss": 0.8504,
      "step": 425
    },
    {
      "epoch": 0.27605023472294665,
      "grad_norm": 1.625,
      "learning_rate": 9.867130549373578e-05,
      "loss": 0.8519,
      "step": 430
    },
    {
      "epoch": 0.27926012117321347,
      "grad_norm": 1.6171875,
      "learning_rate": 9.863757529035633e-05,
      "loss": 0.8589,
      "step": 435
    },
    {
      "epoch": 0.28247000762348035,
      "grad_norm": 1.5859375,
      "learning_rate": 9.860343021749065e-05,
      "loss": 0.8209,
      "step": 440
    },
    {
      "epoch": 0.28567989407374716,
      "grad_norm": 1.53125,
      "learning_rate": 9.856887066713378e-05,
      "loss": 0.8453,
      "step": 445
    },
    {
      "epoch": 0.288889780524014,
      "grad_norm": 1.6484375,
      "learning_rate": 9.853389703603901e-05,
      "loss": 0.8433,
      "step": 450
    },
    {
      "epoch": 0.2920996669742808,
      "grad_norm": 1.640625,
      "learning_rate": 9.849850972571344e-05,
      "loss": 0.8281,
      "step": 455
    },
    {
      "epoch": 0.2953095534245476,
      "grad_norm": 1.5703125,
      "learning_rate": 9.84627091424133e-05,
      "loss": 0.8292,
      "step": 460
    },
    {
      "epoch": 0.29851943987481444,
      "grad_norm": 1.40625,
      "learning_rate": 9.84264956971393e-05,
      "loss": 0.8199,
      "step": 465
    },
    {
      "epoch": 0.30172932632508126,
      "grad_norm": 1.4765625,
      "learning_rate": 9.838986980563193e-05,
      "loss": 0.8263,
      "step": 470
    },
    {
      "epoch": 0.3049392127753481,
      "grad_norm": 1.5546875,
      "learning_rate": 9.835283188836673e-05,
      "loss": 0.8324,
      "step": 475
    },
    {
      "epoch": 0.3081490992256149,
      "grad_norm": 1.515625,
      "learning_rate": 9.831538237054931e-05,
      "loss": 0.8085,
      "step": 480
    },
    {
      "epoch": 0.3113589856758817,
      "grad_norm": 1.640625,
      "learning_rate": 9.827752168211064e-05,
      "loss": 0.8375,
      "step": 485
    },
    {
      "epoch": 0.31456887212614854,
      "grad_norm": 1.6015625,
      "learning_rate": 9.823925025770206e-05,
      "loss": 0.8027,
      "step": 490
    },
    {
      "epoch": 0.31777875857641535,
      "grad_norm": 1.6953125,
      "learning_rate": 9.82005685366902e-05,
      "loss": 0.8309,
      "step": 495
    },
    {
      "epoch": 0.3209886450266822,
      "grad_norm": 1.5546875,
      "learning_rate": 9.816147696315206e-05,
      "loss": 0.8218,
      "step": 500
    },
    {
      "epoch": 0.3209886450266822,
      "eval_loss": 0.7136461138725281,
      "eval_runtime": 2.3986,
      "eval_samples_per_second": 83.382,
      "eval_steps_per_second": 83.382,
      "step": 500
    },
    {
      "epoch": 0.324198531476949,
      "grad_norm": 1.5703125,
      "learning_rate": 9.812197598586987e-05,
      "loss": 0.7931,
      "step": 505
    },
    {
      "epoch": 0.3274084179272158,
      "grad_norm": 1.6953125,
      "learning_rate": 9.808206605832591e-05,
      "loss": 0.8032,
      "step": 510
    },
    {
      "epoch": 0.33061830437748263,
      "grad_norm": 1.4921875,
      "learning_rate": 9.80417476386973e-05,
      "loss": 0.8131,
      "step": 515
    },
    {
      "epoch": 0.33382819082774945,
      "grad_norm": 1.625,
      "learning_rate": 9.800102118985082e-05,
      "loss": 0.7943,
      "step": 520
    },
    {
      "epoch": 0.33703807727801627,
      "grad_norm": 1.703125,
      "learning_rate": 9.795988717933751e-05,
      "loss": 0.8233,
      "step": 525
    },
    {
      "epoch": 0.3402479637282831,
      "grad_norm": 1.5234375,
      "learning_rate": 9.79183460793873e-05,
      "loss": 0.8013,
      "step": 530
    },
    {
      "epoch": 0.3434578501785499,
      "grad_norm": 1.7578125,
      "learning_rate": 9.78763983669037e-05,
      "loss": 0.8121,
      "step": 535
    },
    {
      "epoch": 0.3466677366288168,
      "grad_norm": 1.5546875,
      "learning_rate": 9.783404452345815e-05,
      "loss": 0.8053,
      "step": 540
    },
    {
      "epoch": 0.3498776230790836,
      "grad_norm": 1.640625,
      "learning_rate": 9.779128503528468e-05,
      "loss": 0.7825,
      "step": 545
    },
    {
      "epoch": 0.3530875095293504,
      "grad_norm": 1.5,
      "learning_rate": 9.774812039327415e-05,
      "loss": 0.7883,
      "step": 550
    },
    {
      "epoch": 0.35629739597961724,
      "grad_norm": 1.515625,
      "learning_rate": 9.770455109296878e-05,
      "loss": 0.8132,
      "step": 555
    },
    {
      "epoch": 0.35950728242988406,
      "grad_norm": 1.6484375,
      "learning_rate": 9.76605776345563e-05,
      "loss": 0.7793,
      "step": 560
    },
    {
      "epoch": 0.3627171688801509,
      "grad_norm": 1.5703125,
      "learning_rate": 9.761620052286438e-05,
      "loss": 0.7936,
      "step": 565
    },
    {
      "epoch": 0.3659270553304177,
      "grad_norm": 1.5078125,
      "learning_rate": 9.757142026735464e-05,
      "loss": 0.782,
      "step": 570
    },
    {
      "epoch": 0.3691369417806845,
      "grad_norm": 1.390625,
      "learning_rate": 9.752623738211698e-05,
      "loss": 0.7888,
      "step": 575
    },
    {
      "epoch": 0.37234682823095133,
      "grad_norm": 1.46875,
      "learning_rate": 9.748065238586357e-05,
      "loss": 0.8042,
      "step": 580
    },
    {
      "epoch": 0.37555671468121815,
      "grad_norm": 1.453125,
      "learning_rate": 9.743466580192297e-05,
      "loss": 0.7862,
      "step": 585
    },
    {
      "epoch": 0.37876660113148497,
      "grad_norm": 1.5234375,
      "learning_rate": 9.738827815823399e-05,
      "loss": 0.7994,
      "step": 590
    },
    {
      "epoch": 0.3819764875817518,
      "grad_norm": 1.5546875,
      "learning_rate": 9.734148998733981e-05,
      "loss": 0.7933,
      "step": 595
    },
    {
      "epoch": 0.3851863740320186,
      "grad_norm": 1.5078125,
      "learning_rate": 9.729430182638173e-05,
      "loss": 0.7957,
      "step": 600
    },
    {
      "epoch": 0.3883962604822854,
      "grad_norm": 1.53125,
      "learning_rate": 9.724671421709304e-05,
      "loss": 0.788,
      "step": 605
    },
    {
      "epoch": 0.39160614693255225,
      "grad_norm": 1.5625,
      "learning_rate": 9.719872770579284e-05,
      "loss": 0.7994,
      "step": 610
    },
    {
      "epoch": 0.39481603338281906,
      "grad_norm": 1.625,
      "learning_rate": 9.71503428433797e-05,
      "loss": 0.7882,
      "step": 615
    },
    {
      "epoch": 0.3980259198330859,
      "grad_norm": 1.4375,
      "learning_rate": 9.710156018532542e-05,
      "loss": 0.7768,
      "step": 620
    },
    {
      "epoch": 0.4012358062833527,
      "grad_norm": 1.5859375,
      "learning_rate": 9.705238029166855e-05,
      "loss": 0.7844,
      "step": 625
    },
    {
      "epoch": 0.4044456927336195,
      "grad_norm": 1.390625,
      "learning_rate": 9.700280372700807e-05,
      "loss": 0.7825,
      "step": 630
    },
    {
      "epoch": 0.4076555791838864,
      "grad_norm": 1.3515625,
      "learning_rate": 9.695283106049682e-05,
      "loss": 0.7749,
      "step": 635
    },
    {
      "epoch": 0.4108654656341532,
      "grad_norm": 1.578125,
      "learning_rate": 9.6902462865835e-05,
      "loss": 0.7849,
      "step": 640
    },
    {
      "epoch": 0.41407535208442003,
      "grad_norm": 1.5234375,
      "learning_rate": 9.68516997212636e-05,
      "loss": 0.7684,
      "step": 645
    },
    {
      "epoch": 0.41728523853468685,
      "grad_norm": 1.2890625,
      "learning_rate": 9.680054220955774e-05,
      "loss": 0.763,
      "step": 650
    },
    {
      "epoch": 0.42049512498495367,
      "grad_norm": 1.5859375,
      "learning_rate": 9.674899091801996e-05,
      "loss": 0.7771,
      "step": 655
    },
    {
      "epoch": 0.4237050114352205,
      "grad_norm": 1.46875,
      "learning_rate": 9.669704643847358e-05,
      "loss": 0.7729,
      "step": 660
    },
    {
      "epoch": 0.4269148978854873,
      "grad_norm": 1.4609375,
      "learning_rate": 9.664470936725571e-05,
      "loss": 0.7644,
      "step": 665
    },
    {
      "epoch": 0.43012478433575413,
      "grad_norm": 1.4609375,
      "learning_rate": 9.659198030521063e-05,
      "loss": 0.7702,
      "step": 670
    },
    {
      "epoch": 0.43333467078602095,
      "grad_norm": 1.3671875,
      "learning_rate": 9.653885985768273e-05,
      "loss": 0.7859,
      "step": 675
    },
    {
      "epoch": 0.43654455723628777,
      "grad_norm": 1.5078125,
      "learning_rate": 9.648534863450962e-05,
      "loss": 0.7817,
      "step": 680
    },
    {
      "epoch": 0.4397544436865546,
      "grad_norm": 1.625,
      "learning_rate": 9.643144725001514e-05,
      "loss": 0.7604,
      "step": 685
    },
    {
      "epoch": 0.4429643301368214,
      "grad_norm": 1.5625,
      "learning_rate": 9.637715632300229e-05,
      "loss": 0.7772,
      "step": 690
    },
    {
      "epoch": 0.4461742165870882,
      "grad_norm": 1.65625,
      "learning_rate": 9.632247647674606e-05,
      "loss": 0.7653,
      "step": 695
    },
    {
      "epoch": 0.44938410303735504,
      "grad_norm": 1.609375,
      "learning_rate": 9.626740833898648e-05,
      "loss": 0.7522,
      "step": 700
    },
    {
      "epoch": 0.45259398948762186,
      "grad_norm": 1.4453125,
      "learning_rate": 9.621195254192114e-05,
      "loss": 0.7729,
      "step": 705
    },
    {
      "epoch": 0.4558038759378887,
      "grad_norm": 1.421875,
      "learning_rate": 9.615610972219816e-05,
      "loss": 0.7425,
      "step": 710
    },
    {
      "epoch": 0.4590137623881555,
      "grad_norm": 1.5078125,
      "learning_rate": 9.609988052090872e-05,
      "loss": 0.7838,
      "step": 715
    },
    {
      "epoch": 0.4622236488384223,
      "grad_norm": 1.546875,
      "learning_rate": 9.604326558357983e-05,
      "loss": 0.7653,
      "step": 720
    },
    {
      "epoch": 0.46543353528868914,
      "grad_norm": 1.578125,
      "learning_rate": 9.598626556016682e-05,
      "loss": 0.7702,
      "step": 725
    },
    {
      "epoch": 0.468643421738956,
      "grad_norm": 1.4296875,
      "learning_rate": 9.59288811050459e-05,
      "loss": 0.7565,
      "step": 730
    },
    {
      "epoch": 0.47185330818922283,
      "grad_norm": 1.6015625,
      "learning_rate": 9.587111287700672e-05,
      "loss": 0.7352,
      "step": 735
    },
    {
      "epoch": 0.47506319463948965,
      "grad_norm": 1.3671875,
      "learning_rate": 9.581296153924468e-05,
      "loss": 0.7715,
      "step": 740
    },
    {
      "epoch": 0.47827308108975647,
      "grad_norm": 1.5078125,
      "learning_rate": 9.575442775935348e-05,
      "loss": 0.7536,
      "step": 745
    },
    {
      "epoch": 0.4814829675400233,
      "grad_norm": 1.4296875,
      "learning_rate": 9.569551220931725e-05,
      "loss": 0.7404,
      "step": 750
    },
    {
      "epoch": 0.4846928539902901,
      "grad_norm": 1.5546875,
      "learning_rate": 9.563621556550306e-05,
      "loss": 0.7383,
      "step": 755
    },
    {
      "epoch": 0.4879027404405569,
      "grad_norm": 1.5,
      "learning_rate": 9.557653850865293e-05,
      "loss": 0.7391,
      "step": 760
    },
    {
      "epoch": 0.49111262689082374,
      "grad_norm": 1.4140625,
      "learning_rate": 9.551648172387624e-05,
      "loss": 0.751,
      "step": 765
    },
    {
      "epoch": 0.49432251334109056,
      "grad_norm": 1.3125,
      "learning_rate": 9.545604590064167e-05,
      "loss": 0.7483,
      "step": 770
    },
    {
      "epoch": 0.4975323997913574,
      "grad_norm": 1.5234375,
      "learning_rate": 9.539523173276942e-05,
      "loss": 0.7284,
      "step": 775
    },
    {
      "epoch": 0.5007422862416242,
      "grad_norm": 1.5390625,
      "learning_rate": 9.533403991842317e-05,
      "loss": 0.7356,
      "step": 780
    },
    {
      "epoch": 0.5039521726918911,
      "grad_norm": 1.609375,
      "learning_rate": 9.527247116010207e-05,
      "loss": 0.7591,
      "step": 785
    },
    {
      "epoch": 0.5071620591421578,
      "grad_norm": 1.421875,
      "learning_rate": 9.521052616463272e-05,
      "loss": 0.7411,
      "step": 790
    },
    {
      "epoch": 0.5103719455924247,
      "grad_norm": 1.546875,
      "learning_rate": 9.5148205643161e-05,
      "loss": 0.7574,
      "step": 795
    },
    {
      "epoch": 0.5135818320426915,
      "grad_norm": 1.4609375,
      "learning_rate": 9.5085510311144e-05,
      "loss": 0.7262,
      "step": 800
    },
    {
      "epoch": 0.5167917184929584,
      "grad_norm": 1.4921875,
      "learning_rate": 9.502244088834164e-05,
      "loss": 0.7584,
      "step": 805
    },
    {
      "epoch": 0.5200016049432251,
      "grad_norm": 1.421875,
      "learning_rate": 9.495899809880858e-05,
      "loss": 0.7261,
      "step": 810
    },
    {
      "epoch": 0.523211491393492,
      "grad_norm": 1.6328125,
      "learning_rate": 9.489518267088583e-05,
      "loss": 0.7463,
      "step": 815
    },
    {
      "epoch": 0.5264213778437588,
      "grad_norm": 1.4609375,
      "learning_rate": 9.483099533719234e-05,
      "loss": 0.7477,
      "step": 820
    },
    {
      "epoch": 0.5296312642940256,
      "grad_norm": 1.453125,
      "learning_rate": 9.476643683461672e-05,
      "loss": 0.7441,
      "step": 825
    },
    {
      "epoch": 0.5328411507442924,
      "grad_norm": 1.53125,
      "learning_rate": 9.470150790430863e-05,
      "loss": 0.7433,
      "step": 830
    },
    {
      "epoch": 0.5360510371945593,
      "grad_norm": 1.4609375,
      "learning_rate": 9.463620929167039e-05,
      "loss": 0.7414,
      "step": 835
    },
    {
      "epoch": 0.539260923644826,
      "grad_norm": 1.4140625,
      "learning_rate": 9.457054174634837e-05,
      "loss": 0.7412,
      "step": 840
    },
    {
      "epoch": 0.5424708100950929,
      "grad_norm": 1.640625,
      "learning_rate": 9.450450602222435e-05,
      "loss": 0.7164,
      "step": 845
    },
    {
      "epoch": 0.5456806965453597,
      "grad_norm": 1.53125,
      "learning_rate": 9.443810287740697e-05,
      "loss": 0.755,
      "step": 850
    },
    {
      "epoch": 0.5488905829956265,
      "grad_norm": 1.4765625,
      "learning_rate": 9.437133307422294e-05,
      "loss": 0.7512,
      "step": 855
    },
    {
      "epoch": 0.5521004694458933,
      "grad_norm": 1.5625,
      "learning_rate": 9.430419737920828e-05,
      "loss": 0.7385,
      "step": 860
    },
    {
      "epoch": 0.5553103558961602,
      "grad_norm": 1.515625,
      "learning_rate": 9.42366965630996e-05,
      "loss": 0.7316,
      "step": 865
    },
    {
      "epoch": 0.5585202423464269,
      "grad_norm": 1.390625,
      "learning_rate": 9.416883140082512e-05,
      "loss": 0.7297,
      "step": 870
    },
    {
      "epoch": 0.5617301287966938,
      "grad_norm": 1.5,
      "learning_rate": 9.410060267149596e-05,
      "loss": 0.7208,
      "step": 875
    },
    {
      "epoch": 0.5649400152469607,
      "grad_norm": 1.359375,
      "learning_rate": 9.403201115839704e-05,
      "loss": 0.7288,
      "step": 880
    },
    {
      "epoch": 0.5681499016972275,
      "grad_norm": 1.4375,
      "learning_rate": 9.396305764897813e-05,
      "loss": 0.7133,
      "step": 885
    },
    {
      "epoch": 0.5713597881474943,
      "grad_norm": 1.4921875,
      "learning_rate": 9.389374293484483e-05,
      "loss": 0.7036,
      "step": 890
    },
    {
      "epoch": 0.5745696745977611,
      "grad_norm": 1.5234375,
      "learning_rate": 9.382406781174949e-05,
      "loss": 0.7332,
      "step": 895
    },
    {
      "epoch": 0.577779561048028,
      "grad_norm": 1.484375,
      "learning_rate": 9.3754033079582e-05,
      "loss": 0.7343,
      "step": 900
    },
    {
      "epoch": 0.5809894474982947,
      "grad_norm": 1.546875,
      "learning_rate": 9.368363954236075e-05,
      "loss": 0.7119,
      "step": 905
    },
    {
      "epoch": 0.5841993339485616,
      "grad_norm": 1.5703125,
      "learning_rate": 9.361288800822321e-05,
      "loss": 0.7339,
      "step": 910
    },
    {
      "epoch": 0.5874092203988284,
      "grad_norm": 1.453125,
      "learning_rate": 9.354177928941687e-05,
      "loss": 0.7163,
      "step": 915
    },
    {
      "epoch": 0.5906191068490952,
      "grad_norm": 1.4453125,
      "learning_rate": 9.347031420228969e-05,
      "loss": 0.7281,
      "step": 920
    },
    {
      "epoch": 0.593828993299362,
      "grad_norm": 1.5,
      "learning_rate": 9.339849356728092e-05,
      "loss": 0.7072,
      "step": 925
    },
    {
      "epoch": 0.5970388797496289,
      "grad_norm": 1.46875,
      "learning_rate": 9.332631820891154e-05,
      "loss": 0.729,
      "step": 930
    },
    {
      "epoch": 0.6002487661998956,
      "grad_norm": 1.4609375,
      "learning_rate": 9.325378895577491e-05,
      "loss": 0.7341,
      "step": 935
    },
    {
      "epoch": 0.6034586526501625,
      "grad_norm": 1.4140625,
      "learning_rate": 9.318090664052713e-05,
      "loss": 0.708,
      "step": 940
    },
    {
      "epoch": 0.6066685391004293,
      "grad_norm": 1.3515625,
      "learning_rate": 9.310767209987763e-05,
      "loss": 0.7191,
      "step": 945
    },
    {
      "epoch": 0.6098784255506962,
      "grad_norm": 1.6796875,
      "learning_rate": 9.303408617457943e-05,
      "loss": 0.7114,
      "step": 950
    },
    {
      "epoch": 0.6130883120009629,
      "grad_norm": 1.484375,
      "learning_rate": 9.296014970941958e-05,
      "loss": 0.704,
      "step": 955
    },
    {
      "epoch": 0.6162981984512298,
      "grad_norm": 1.3359375,
      "learning_rate": 9.288586355320938e-05,
      "loss": 0.704,
      "step": 960
    },
    {
      "epoch": 0.6195080849014966,
      "grad_norm": 1.359375,
      "learning_rate": 9.281122855877473e-05,
      "loss": 0.7112,
      "step": 965
    },
    {
      "epoch": 0.6227179713517634,
      "grad_norm": 1.4765625,
      "learning_rate": 9.273624558294627e-05,
      "loss": 0.6998,
      "step": 970
    },
    {
      "epoch": 0.6259278578020303,
      "grad_norm": 1.421875,
      "learning_rate": 9.266091548654958e-05,
      "loss": 0.7114,
      "step": 975
    },
    {
      "epoch": 0.6291377442522971,
      "grad_norm": 1.5625,
      "learning_rate": 9.258523913439522e-05,
      "loss": 0.7307,
      "step": 980
    },
    {
      "epoch": 0.632347630702564,
      "grad_norm": 1.53125,
      "learning_rate": 9.250921739526896e-05,
      "loss": 0.7257,
      "step": 985
    },
    {
      "epoch": 0.6355575171528307,
      "grad_norm": 1.578125,
      "learning_rate": 9.243285114192163e-05,
      "loss": 0.7261,
      "step": 990
    },
    {
      "epoch": 0.6387674036030976,
      "grad_norm": 1.40625,
      "learning_rate": 9.235614125105922e-05,
      "loss": 0.7139,
      "step": 995
    },
    {
      "epoch": 0.6419772900533643,
      "grad_norm": 1.390625,
      "learning_rate": 9.227908860333275e-05,
      "loss": 0.7136,
      "step": 1000
    },
    {
      "epoch": 0.6419772900533643,
      "eval_loss": 0.6108266711235046,
      "eval_runtime": 2.3924,
      "eval_samples_per_second": 83.597,
      "eval_steps_per_second": 83.597,
      "step": 1000
    },
    {
      "epoch": 0.6451871765036312,
      "grad_norm": 1.5,
      "learning_rate": 9.220169408332821e-05,
      "loss": 0.6998,
      "step": 1005
    },
    {
      "epoch": 0.648397062953898,
      "grad_norm": 1.4375,
      "learning_rate": 9.212395857955637e-05,
      "loss": 0.7121,
      "step": 1010
    },
    {
      "epoch": 0.6516069494041649,
      "grad_norm": 1.390625,
      "learning_rate": 9.204588298444257e-05,
      "loss": 0.7275,
      "step": 1015
    },
    {
      "epoch": 0.6548168358544316,
      "grad_norm": 1.40625,
      "learning_rate": 9.196746819431652e-05,
      "loss": 0.7063,
      "step": 1020
    },
    {
      "epoch": 0.6580267223046985,
      "grad_norm": 1.6171875,
      "learning_rate": 9.188871510940198e-05,
      "loss": 0.7275,
      "step": 1025
    },
    {
      "epoch": 0.6612366087549653,
      "grad_norm": 1.375,
      "learning_rate": 9.180962463380642e-05,
      "loss": 0.6942,
      "step": 1030
    },
    {
      "epoch": 0.6644464952052321,
      "grad_norm": 1.3828125,
      "learning_rate": 9.173019767551064e-05,
      "loss": 0.7184,
      "step": 1035
    },
    {
      "epoch": 0.6676563816554989,
      "grad_norm": 1.515625,
      "learning_rate": 9.165043514635836e-05,
      "loss": 0.7054,
      "step": 1040
    },
    {
      "epoch": 0.6708662681057658,
      "grad_norm": 1.53125,
      "learning_rate": 9.157033796204579e-05,
      "loss": 0.7166,
      "step": 1045
    },
    {
      "epoch": 0.6740761545560325,
      "grad_norm": 1.53125,
      "learning_rate": 9.148990704211103e-05,
      "loss": 0.7031,
      "step": 1050
    },
    {
      "epoch": 0.6772860410062994,
      "grad_norm": 1.453125,
      "learning_rate": 9.140914330992356e-05,
      "loss": 0.7071,
      "step": 1055
    },
    {
      "epoch": 0.6804959274565662,
      "grad_norm": 1.5,
      "learning_rate": 9.132804769267364e-05,
      "loss": 0.7117,
      "step": 1060
    },
    {
      "epoch": 0.683705813906833,
      "grad_norm": 1.703125,
      "learning_rate": 9.124662112136169e-05,
      "loss": 0.7063,
      "step": 1065
    },
    {
      "epoch": 0.6869157003570998,
      "grad_norm": 1.7265625,
      "learning_rate": 9.116486453078755e-05,
      "loss": 0.7007,
      "step": 1070
    },
    {
      "epoch": 0.6901255868073667,
      "grad_norm": 1.5703125,
      "learning_rate": 9.108277885953975e-05,
      "loss": 0.6956,
      "step": 1075
    },
    {
      "epoch": 0.6933354732576336,
      "grad_norm": 1.4140625,
      "learning_rate": 9.100036504998483e-05,
      "loss": 0.6968,
      "step": 1080
    },
    {
      "epoch": 0.6965453597079003,
      "grad_norm": 1.3203125,
      "learning_rate": 9.091762404825639e-05,
      "loss": 0.7131,
      "step": 1085
    },
    {
      "epoch": 0.6997552461581672,
      "grad_norm": 1.46875,
      "learning_rate": 9.08345568042443e-05,
      "loss": 0.6982,
      "step": 1090
    },
    {
      "epoch": 0.702965132608434,
      "grad_norm": 1.5546875,
      "learning_rate": 9.075116427158379e-05,
      "loss": 0.6743,
      "step": 1095
    },
    {
      "epoch": 0.7061750190587008,
      "grad_norm": 1.359375,
      "learning_rate": 9.06674474076445e-05,
      "loss": 0.6925,
      "step": 1100
    },
    {
      "epoch": 0.7093849055089676,
      "grad_norm": 1.4375,
      "learning_rate": 9.058340717351948e-05,
      "loss": 0.6849,
      "step": 1105
    },
    {
      "epoch": 0.7125947919592345,
      "grad_norm": 1.3828125,
      "learning_rate": 9.049904453401412e-05,
      "loss": 0.6815,
      "step": 1110
    },
    {
      "epoch": 0.7158046784095012,
      "grad_norm": 1.4453125,
      "learning_rate": 9.04143604576352e-05,
      "loss": 0.6905,
      "step": 1115
    },
    {
      "epoch": 0.7190145648597681,
      "grad_norm": 1.4453125,
      "learning_rate": 9.032935591657961e-05,
      "loss": 0.69,
      "step": 1120
    },
    {
      "epoch": 0.7222244513100349,
      "grad_norm": 1.3828125,
      "learning_rate": 9.02440318867233e-05,
      "loss": 0.6861,
      "step": 1125
    },
    {
      "epoch": 0.7254343377603018,
      "grad_norm": 1.484375,
      "learning_rate": 9.015838934761003e-05,
      "loss": 0.7338,
      "step": 1130
    },
    {
      "epoch": 0.7286442242105685,
      "grad_norm": 1.3515625,
      "learning_rate": 9.007242928244014e-05,
      "loss": 0.6787,
      "step": 1135
    },
    {
      "epoch": 0.7318541106608354,
      "grad_norm": 1.421875,
      "learning_rate": 8.998615267805922e-05,
      "loss": 0.6793,
      "step": 1140
    },
    {
      "epoch": 0.7350639971111022,
      "grad_norm": 1.3671875,
      "learning_rate": 8.98995605249469e-05,
      "loss": 0.6791,
      "step": 1145
    },
    {
      "epoch": 0.738273883561369,
      "grad_norm": 1.4140625,
      "learning_rate": 8.981265381720533e-05,
      "loss": 0.7028,
      "step": 1150
    },
    {
      "epoch": 0.7414837700116358,
      "grad_norm": 1.453125,
      "learning_rate": 8.972543355254785e-05,
      "loss": 0.712,
      "step": 1155
    },
    {
      "epoch": 0.7446936564619027,
      "grad_norm": 1.4453125,
      "learning_rate": 8.963790073228757e-05,
      "loss": 0.6749,
      "step": 1160
    },
    {
      "epoch": 0.7479035429121694,
      "grad_norm": 1.53125,
      "learning_rate": 8.955005636132573e-05,
      "loss": 0.6844,
      "step": 1165
    },
    {
      "epoch": 0.7511134293624363,
      "grad_norm": 1.296875,
      "learning_rate": 8.946190144814034e-05,
      "loss": 0.6753,
      "step": 1170
    },
    {
      "epoch": 0.7543233158127032,
      "grad_norm": 1.4296875,
      "learning_rate": 8.937343700477449e-05,
      "loss": 0.6809,
      "step": 1175
    },
    {
      "epoch": 0.7575332022629699,
      "grad_norm": 1.3515625,
      "learning_rate": 8.928466404682478e-05,
      "loss": 0.7046,
      "step": 1180
    },
    {
      "epoch": 0.7607430887132368,
      "grad_norm": 1.3515625,
      "learning_rate": 8.91955835934296e-05,
      "loss": 0.6763,
      "step": 1185
    },
    {
      "epoch": 0.7639529751635036,
      "grad_norm": 1.359375,
      "learning_rate": 8.910619666725755e-05,
      "loss": 0.6788,
      "step": 1190
    },
    {
      "epoch": 0.7671628616137705,
      "grad_norm": 1.40625,
      "learning_rate": 8.901650429449553e-05,
      "loss": 0.6874,
      "step": 1195
    },
    {
      "epoch": 0.7703727480640372,
      "grad_norm": 1.3125,
      "learning_rate": 8.892650750483715e-05,
      "loss": 0.7008,
      "step": 1200
    },
    {
      "epoch": 0.7735826345143041,
      "grad_norm": 1.3203125,
      "learning_rate": 8.883620733147073e-05,
      "loss": 0.6946,
      "step": 1205
    },
    {
      "epoch": 0.7767925209645709,
      "grad_norm": 1.3671875,
      "learning_rate": 8.874560481106758e-05,
      "loss": 0.6845,
      "step": 1210
    },
    {
      "epoch": 0.7800024074148377,
      "grad_norm": 1.3359375,
      "learning_rate": 8.865470098376995e-05,
      "loss": 0.7019,
      "step": 1215
    },
    {
      "epoch": 0.7832122938651045,
      "grad_norm": 1.4140625,
      "learning_rate": 8.856349689317933e-05,
      "loss": 0.6611,
      "step": 1220
    },
    {
      "epoch": 0.7864221803153714,
      "grad_norm": 1.4453125,
      "learning_rate": 8.847199358634415e-05,
      "loss": 0.6769,
      "step": 1225
    },
    {
      "epoch": 0.7896320667656381,
      "grad_norm": 1.3359375,
      "learning_rate": 8.838019211374804e-05,
      "loss": 0.6684,
      "step": 1230
    },
    {
      "epoch": 0.792841953215905,
      "grad_norm": 1.3046875,
      "learning_rate": 8.828809352929762e-05,
      "loss": 0.6799,
      "step": 1235
    },
    {
      "epoch": 0.7960518396661718,
      "grad_norm": 1.8125,
      "learning_rate": 8.81956988903104e-05,
      "loss": 0.685,
      "step": 1240
    },
    {
      "epoch": 0.7992617261164386,
      "grad_norm": 1.265625,
      "learning_rate": 8.810300925750277e-05,
      "loss": 0.6874,
      "step": 1245
    },
    {
      "epoch": 0.8024716125667054,
      "grad_norm": 1.5625,
      "learning_rate": 8.801002569497763e-05,
      "loss": 0.6856,
      "step": 1250
    },
    {
      "epoch": 0.8056814990169723,
      "grad_norm": 1.3359375,
      "learning_rate": 8.791674927021234e-05,
      "loss": 0.68,
      "step": 1255
    },
    {
      "epoch": 0.808891385467239,
      "grad_norm": 1.2734375,
      "learning_rate": 8.782318105404636e-05,
      "loss": 0.6473,
      "step": 1260
    },
    {
      "epoch": 0.8121012719175059,
      "grad_norm": 1.46875,
      "learning_rate": 8.772932212066906e-05,
      "loss": 0.6721,
      "step": 1265
    },
    {
      "epoch": 0.8153111583677728,
      "grad_norm": 1.484375,
      "learning_rate": 8.763517354760726e-05,
      "loss": 0.6675,
      "step": 1270
    },
    {
      "epoch": 0.8185210448180396,
      "grad_norm": 1.3125,
      "learning_rate": 8.754073641571295e-05,
      "loss": 0.6856,
      "step": 1275
    },
    {
      "epoch": 0.8217309312683064,
      "grad_norm": 1.3515625,
      "learning_rate": 8.744601180915087e-05,
      "loss": 0.6938,
      "step": 1280
    },
    {
      "epoch": 0.8249408177185732,
      "grad_norm": 1.296875,
      "learning_rate": 8.7351000815386e-05,
      "loss": 0.6785,
      "step": 1285
    },
    {
      "epoch": 0.8281507041688401,
      "grad_norm": 1.3515625,
      "learning_rate": 8.72557045251712e-05,
      "loss": 0.6697,
      "step": 1290
    },
    {
      "epoch": 0.8313605906191068,
      "grad_norm": 1.40625,
      "learning_rate": 8.716012403253455e-05,
      "loss": 0.6647,
      "step": 1295
    },
    {
      "epoch": 0.8345704770693737,
      "grad_norm": 1.3125,
      "learning_rate": 8.706426043476687e-05,
      "loss": 0.6776,
      "step": 1300
    },
    {
      "epoch": 0.8377803635196405,
      "grad_norm": 1.40625,
      "learning_rate": 8.696811483240915e-05,
      "loss": 0.6689,
      "step": 1305
    },
    {
      "epoch": 0.8409902499699073,
      "grad_norm": 1.515625,
      "learning_rate": 8.687168832923981e-05,
      "loss": 0.6667,
      "step": 1310
    },
    {
      "epoch": 0.8442001364201741,
      "grad_norm": 1.3828125,
      "learning_rate": 8.67749820322621e-05,
      "loss": 0.694,
      "step": 1315
    },
    {
      "epoch": 0.847410022870441,
      "grad_norm": 1.3828125,
      "learning_rate": 8.667799705169142e-05,
      "loss": 0.6682,
      "step": 1320
    },
    {
      "epoch": 0.8506199093207077,
      "grad_norm": 1.4296875,
      "learning_rate": 8.65807345009425e-05,
      "loss": 0.6942,
      "step": 1325
    },
    {
      "epoch": 0.8538297957709746,
      "grad_norm": 1.40625,
      "learning_rate": 8.648319549661668e-05,
      "loss": 0.6832,
      "step": 1330
    },
    {
      "epoch": 0.8570396822212414,
      "grad_norm": 1.3515625,
      "learning_rate": 8.638538115848902e-05,
      "loss": 0.673,
      "step": 1335
    },
    {
      "epoch": 0.8602495686715083,
      "grad_norm": 1.4921875,
      "learning_rate": 8.628729260949555e-05,
      "loss": 0.6954,
      "step": 1340
    },
    {
      "epoch": 0.863459455121775,
      "grad_norm": 1.59375,
      "learning_rate": 8.618893097572027e-05,
      "loss": 0.6585,
      "step": 1345
    },
    {
      "epoch": 0.8666693415720419,
      "grad_norm": 1.4296875,
      "learning_rate": 8.60902973863823e-05,
      "loss": 0.6733,
      "step": 1350
    },
    {
      "epoch": 0.8698792280223087,
      "grad_norm": 1.484375,
      "learning_rate": 8.599139297382286e-05,
      "loss": 0.6714,
      "step": 1355
    },
    {
      "epoch": 0.8730891144725755,
      "grad_norm": 1.46875,
      "learning_rate": 8.58922188734923e-05,
      "loss": 0.6733,
      "step": 1360
    },
    {
      "epoch": 0.8762990009228424,
      "grad_norm": 1.4453125,
      "learning_rate": 8.579277622393708e-05,
      "loss": 0.6771,
      "step": 1365
    },
    {
      "epoch": 0.8795088873731092,
      "grad_norm": 1.5703125,
      "learning_rate": 8.569306616678667e-05,
      "loss": 0.6702,
      "step": 1370
    },
    {
      "epoch": 0.882718773823376,
      "grad_norm": 1.421875,
      "learning_rate": 8.559308984674047e-05,
      "loss": 0.6461,
      "step": 1375
    },
    {
      "epoch": 0.8859286602736428,
      "grad_norm": 1.4609375,
      "learning_rate": 8.549284841155461e-05,
      "loss": 0.6836,
      "step": 1380
    },
    {
      "epoch": 0.8891385467239097,
      "grad_norm": 1.390625,
      "learning_rate": 8.539234301202885e-05,
      "loss": 0.6547,
      "step": 1385
    },
    {
      "epoch": 0.8923484331741764,
      "grad_norm": 1.515625,
      "learning_rate": 8.529157480199335e-05,
      "loss": 0.664,
      "step": 1390
    },
    {
      "epoch": 0.8955583196244433,
      "grad_norm": 1.2890625,
      "learning_rate": 8.519054493829535e-05,
      "loss": 0.6625,
      "step": 1395
    },
    {
      "epoch": 0.8987682060747101,
      "grad_norm": 1.3125,
      "learning_rate": 8.508925458078599e-05,
      "loss": 0.6582,
      "step": 1400
    },
    {
      "epoch": 0.901978092524977,
      "grad_norm": 1.3515625,
      "learning_rate": 8.498770489230699e-05,
      "loss": 0.6432,
      "step": 1405
    },
    {
      "epoch": 0.9051879789752437,
      "grad_norm": 1.25,
      "learning_rate": 8.488589703867714e-05,
      "loss": 0.6775,
      "step": 1410
    },
    {
      "epoch": 0.9083978654255106,
      "grad_norm": 1.3203125,
      "learning_rate": 8.478383218867918e-05,
      "loss": 0.6847,
      "step": 1415
    },
    {
      "epoch": 0.9116077518757774,
      "grad_norm": 1.3515625,
      "learning_rate": 8.468151151404616e-05,
      "loss": 0.6691,
      "step": 1420
    },
    {
      "epoch": 0.9148176383260442,
      "grad_norm": 1.3828125,
      "learning_rate": 8.457893618944808e-05,
      "loss": 0.6618,
      "step": 1425
    },
    {
      "epoch": 0.918027524776311,
      "grad_norm": 1.421875,
      "learning_rate": 8.447610739247838e-05,
      "loss": 0.6755,
      "step": 1430
    },
    {
      "epoch": 0.9212374112265779,
      "grad_norm": 1.25,
      "learning_rate": 8.437302630364046e-05,
      "loss": 0.6673,
      "step": 1435
    },
    {
      "epoch": 0.9244472976768446,
      "grad_norm": 1.359375,
      "learning_rate": 8.426969410633411e-05,
      "loss": 0.6582,
      "step": 1440
    },
    {
      "epoch": 0.9276571841271115,
      "grad_norm": 1.296875,
      "learning_rate": 8.416611198684187e-05,
      "loss": 0.6667,
      "step": 1445
    },
    {
      "epoch": 0.9308670705773783,
      "grad_norm": 1.3828125,
      "learning_rate": 8.406228113431552e-05,
      "loss": 0.6716,
      "step": 1450
    },
    {
      "epoch": 0.9340769570276451,
      "grad_norm": 1.2890625,
      "learning_rate": 8.395820274076229e-05,
      "loss": 0.6746,
      "step": 1455
    },
    {
      "epoch": 0.937286843477912,
      "grad_norm": 1.3359375,
      "learning_rate": 8.385387800103132e-05,
      "loss": 0.6511,
      "step": 1460
    },
    {
      "epoch": 0.9404967299281788,
      "grad_norm": 1.4453125,
      "learning_rate": 8.374930811279983e-05,
      "loss": 0.667,
      "step": 1465
    },
    {
      "epoch": 0.9437066163784457,
      "grad_norm": 1.296875,
      "learning_rate": 8.364449427655942e-05,
      "loss": 0.6766,
      "step": 1470
    },
    {
      "epoch": 0.9469165028287124,
      "grad_norm": 1.4453125,
      "learning_rate": 8.353943769560228e-05,
      "loss": 0.6468,
      "step": 1475
    },
    {
      "epoch": 0.9501263892789793,
      "grad_norm": 1.359375,
      "learning_rate": 8.343413957600744e-05,
      "loss": 0.6427,
      "step": 1480
    },
    {
      "epoch": 0.9533362757292461,
      "grad_norm": 1.5546875,
      "learning_rate": 8.332860112662673e-05,
      "loss": 0.6207,
      "step": 1485
    },
    {
      "epoch": 0.9565461621795129,
      "grad_norm": 1.2890625,
      "learning_rate": 8.322282355907117e-05,
      "loss": 0.6548,
      "step": 1490
    },
    {
      "epoch": 0.9597560486297797,
      "grad_norm": 1.546875,
      "learning_rate": 8.311680808769682e-05,
      "loss": 0.6662,
      "step": 1495
    },
    {
      "epoch": 0.9629659350800466,
      "grad_norm": 1.3828125,
      "learning_rate": 8.301055592959101e-05,
      "loss": 0.6488,
      "step": 1500
    },
    {
      "epoch": 0.9629659350800466,
      "eval_loss": 0.5586946606636047,
      "eval_runtime": 2.4022,
      "eval_samples_per_second": 83.255,
      "eval_steps_per_second": 83.255,
      "step": 1500
    },
    {
      "epoch": 0.9661758215303133,
      "grad_norm": 1.3203125,
      "learning_rate": 8.290406830455828e-05,
      "loss": 0.6723,
      "step": 1505
    },
    {
      "epoch": 0.9693857079805802,
      "grad_norm": 1.359375,
      "learning_rate": 8.279734643510636e-05,
      "loss": 0.653,
      "step": 1510
    },
    {
      "epoch": 0.972595594430847,
      "grad_norm": 1.359375,
      "learning_rate": 8.269039154643224e-05,
      "loss": 0.6535,
      "step": 1515
    },
    {
      "epoch": 0.9758054808811139,
      "grad_norm": 1.3828125,
      "learning_rate": 8.258320486640798e-05,
      "loss": 0.6498,
      "step": 1520
    },
    {
      "epoch": 0.9790153673313806,
      "grad_norm": 1.3515625,
      "learning_rate": 8.24757876255667e-05,
      "loss": 0.6531,
      "step": 1525
    },
    {
      "epoch": 0.9822252537816475,
      "grad_norm": 1.40625,
      "learning_rate": 8.23681410570884e-05,
      "loss": 0.6698,
      "step": 1530
    },
    {
      "epoch": 0.9854351402319143,
      "grad_norm": 1.4765625,
      "learning_rate": 8.226026639678582e-05,
      "loss": 0.658,
      "step": 1535
    },
    {
      "epoch": 0.9886450266821811,
      "grad_norm": 1.2578125,
      "learning_rate": 8.215216488309032e-05,
      "loss": 0.6606,
      "step": 1540
    },
    {
      "epoch": 0.9918549131324479,
      "grad_norm": 1.3515625,
      "learning_rate": 8.204383775703752e-05,
      "loss": 0.6519,
      "step": 1545
    },
    {
      "epoch": 0.9950647995827148,
      "grad_norm": 1.40625,
      "learning_rate": 8.19352862622532e-05,
      "loss": 0.6452,
      "step": 1550
    },
    {
      "epoch": 0.9982746860329815,
      "grad_norm": 1.3515625,
      "learning_rate": 8.182651164493889e-05,
      "loss": 0.6567,
      "step": 1555
    },
    {
      "epoch": 0.9995586406130883,
      "eval_loss": 0.5523168444633484,
      "eval_runtime": 2.4204,
      "eval_samples_per_second": 82.63,
      "eval_steps_per_second": 82.63,
      "step": 1557
    },
    {
      "epoch": 1.00192593187016,
      "grad_norm": 1.3125,
      "learning_rate": 8.171751515385769e-05,
      "loss": 0.7609,
      "step": 1560
    },
    {
      "epoch": 1.0051358183204269,
      "grad_norm": 1.2265625,
      "learning_rate": 8.160829804031982e-05,
      "loss": 0.615,
      "step": 1565
    },
    {
      "epoch": 1.0083457047706936,
      "grad_norm": 1.3671875,
      "learning_rate": 8.149886155816835e-05,
      "loss": 0.6382,
      "step": 1570
    },
    {
      "epoch": 1.0115555912209606,
      "grad_norm": 1.40625,
      "learning_rate": 8.138920696376476e-05,
      "loss": 0.6391,
      "step": 1575
    },
    {
      "epoch": 1.0147654776712274,
      "grad_norm": 1.390625,
      "learning_rate": 8.127933551597449e-05,
      "loss": 0.6365,
      "step": 1580
    },
    {
      "epoch": 1.0179753641214941,
      "grad_norm": 1.34375,
      "learning_rate": 8.116924847615254e-05,
      "loss": 0.6269,
      "step": 1585
    },
    {
      "epoch": 1.0211852505717611,
      "grad_norm": 1.28125,
      "learning_rate": 8.105894710812897e-05,
      "loss": 0.6414,
      "step": 1590
    },
    {
      "epoch": 1.024395137022028,
      "grad_norm": 1.421875,
      "learning_rate": 8.094843267819438e-05,
      "loss": 0.6218,
      "step": 1595
    },
    {
      "epoch": 1.0276050234722947,
      "grad_norm": 1.4453125,
      "learning_rate": 8.083770645508535e-05,
      "loss": 0.6456,
      "step": 1600
    },
    {
      "epoch": 1.0308149099225614,
      "grad_norm": 1.453125,
      "learning_rate": 8.072676970996997e-05,
      "loss": 0.6349,
      "step": 1605
    },
    {
      "epoch": 1.0340247963728284,
      "grad_norm": 1.3046875,
      "learning_rate": 8.061562371643312e-05,
      "loss": 0.5872,
      "step": 1610
    },
    {
      "epoch": 1.0372346828230952,
      "grad_norm": 1.375,
      "learning_rate": 8.050426975046196e-05,
      "loss": 0.6129,
      "step": 1615
    },
    {
      "epoch": 1.040444569273362,
      "grad_norm": 1.34375,
      "learning_rate": 8.039270909043119e-05,
      "loss": 0.6275,
      "step": 1620
    },
    {
      "epoch": 1.0436544557236287,
      "grad_norm": 1.40625,
      "learning_rate": 8.028094301708843e-05,
      "loss": 0.6198,
      "step": 1625
    },
    {
      "epoch": 1.0468643421738957,
      "grad_norm": 1.4609375,
      "learning_rate": 8.016897281353954e-05,
      "loss": 0.6125,
      "step": 1630
    },
    {
      "epoch": 1.0500742286241624,
      "grad_norm": 1.3828125,
      "learning_rate": 8.00567997652338e-05,
      "loss": 0.6076,
      "step": 1635
    },
    {
      "epoch": 1.0532841150744292,
      "grad_norm": 1.5625,
      "learning_rate": 7.994442515994922e-05,
      "loss": 0.6153,
      "step": 1640
    },
    {
      "epoch": 1.056494001524696,
      "grad_norm": 1.28125,
      "learning_rate": 7.983185028777773e-05,
      "loss": 0.614,
      "step": 1645
    },
    {
      "epoch": 1.059703887974963,
      "grad_norm": 1.3828125,
      "learning_rate": 7.971907644111043e-05,
      "loss": 0.6287,
      "step": 1650
    },
    {
      "epoch": 1.0629137744252297,
      "grad_norm": 1.4375,
      "learning_rate": 7.960610491462265e-05,
      "loss": 0.6234,
      "step": 1655
    },
    {
      "epoch": 1.0661236608754965,
      "grad_norm": 1.390625,
      "learning_rate": 7.949293700525914e-05,
      "loss": 0.6352,
      "step": 1660
    },
    {
      "epoch": 1.0693335473257632,
      "grad_norm": 1.328125,
      "learning_rate": 7.93795740122192e-05,
      "loss": 0.6275,
      "step": 1665
    },
    {
      "epoch": 1.0725434337760302,
      "grad_norm": 1.2734375,
      "learning_rate": 7.926601723694178e-05,
      "loss": 0.6266,
      "step": 1670
    },
    {
      "epoch": 1.075753320226297,
      "grad_norm": 1.40625,
      "learning_rate": 7.915226798309042e-05,
      "loss": 0.6111,
      "step": 1675
    },
    {
      "epoch": 1.0789632066765638,
      "grad_norm": 1.3828125,
      "learning_rate": 7.903832755653844e-05,
      "loss": 0.6032,
      "step": 1680
    },
    {
      "epoch": 1.0821730931268307,
      "grad_norm": 1.5859375,
      "learning_rate": 7.892419726535385e-05,
      "loss": 0.6113,
      "step": 1685
    },
    {
      "epoch": 1.0853829795770975,
      "grad_norm": 1.359375,
      "learning_rate": 7.880987841978435e-05,
      "loss": 0.6332,
      "step": 1690
    },
    {
      "epoch": 1.0885928660273643,
      "grad_norm": 1.46875,
      "learning_rate": 7.86953723322423e-05,
      "loss": 0.6419,
      "step": 1695
    },
    {
      "epoch": 1.091802752477631,
      "grad_norm": 1.3203125,
      "learning_rate": 7.858068031728968e-05,
      "loss": 0.6249,
      "step": 1700
    },
    {
      "epoch": 1.095012638927898,
      "grad_norm": 1.3203125,
      "learning_rate": 7.846580369162293e-05,
      "loss": 0.6075,
      "step": 1705
    },
    {
      "epoch": 1.0982225253781648,
      "grad_norm": 1.3828125,
      "learning_rate": 7.83507437740579e-05,
      "loss": 0.6379,
      "step": 1710
    },
    {
      "epoch": 1.1014324118284315,
      "grad_norm": 1.390625,
      "learning_rate": 7.823550188551466e-05,
      "loss": 0.6165,
      "step": 1715
    },
    {
      "epoch": 1.1046422982786983,
      "grad_norm": 1.328125,
      "learning_rate": 7.812007934900238e-05,
      "loss": 0.6106,
      "step": 1720
    },
    {
      "epoch": 1.1078521847289653,
      "grad_norm": 1.25,
      "learning_rate": 7.800447748960408e-05,
      "loss": 0.6132,
      "step": 1725
    },
    {
      "epoch": 1.111062071179232,
      "grad_norm": 1.390625,
      "learning_rate": 7.788869763446154e-05,
      "loss": 0.6224,
      "step": 1730
    },
    {
      "epoch": 1.1142719576294988,
      "grad_norm": 1.4609375,
      "learning_rate": 7.777274111275988e-05,
      "loss": 0.6353,
      "step": 1735
    },
    {
      "epoch": 1.1174818440797656,
      "grad_norm": 1.2421875,
      "learning_rate": 7.765660925571245e-05,
      "loss": 0.6289,
      "step": 1740
    },
    {
      "epoch": 1.1206917305300326,
      "grad_norm": 1.2890625,
      "learning_rate": 7.754030339654552e-05,
      "loss": 0.6091,
      "step": 1745
    },
    {
      "epoch": 1.1239016169802993,
      "grad_norm": 1.2578125,
      "learning_rate": 7.74238248704829e-05,
      "loss": 0.6119,
      "step": 1750
    },
    {
      "epoch": 1.127111503430566,
      "grad_norm": 1.2421875,
      "learning_rate": 7.730717501473073e-05,
      "loss": 0.6173,
      "step": 1755
    },
    {
      "epoch": 1.1303213898808329,
      "grad_norm": 1.34375,
      "learning_rate": 7.719035516846201e-05,
      "loss": 0.6184,
      "step": 1760
    },
    {
      "epoch": 1.1335312763310998,
      "grad_norm": 1.390625,
      "learning_rate": 7.707336667280128e-05,
      "loss": 0.6061,
      "step": 1765
    },
    {
      "epoch": 1.1367411627813666,
      "grad_norm": 1.4296875,
      "learning_rate": 7.695621087080924e-05,
      "loss": 0.6265,
      "step": 1770
    },
    {
      "epoch": 1.1399510492316334,
      "grad_norm": 1.328125,
      "learning_rate": 7.683888910746735e-05,
      "loss": 0.6272,
      "step": 1775
    },
    {
      "epoch": 1.1431609356819004,
      "grad_norm": 1.359375,
      "learning_rate": 7.672140272966227e-05,
      "loss": 0.6162,
      "step": 1780
    },
    {
      "epoch": 1.1463708221321671,
      "grad_norm": 1.40625,
      "learning_rate": 7.660375308617054e-05,
      "loss": 0.6165,
      "step": 1785
    },
    {
      "epoch": 1.1495807085824339,
      "grad_norm": 1.328125,
      "learning_rate": 7.648594152764304e-05,
      "loss": 0.5994,
      "step": 1790
    },
    {
      "epoch": 1.1527905950327006,
      "grad_norm": 1.328125,
      "learning_rate": 7.636796940658942e-05,
      "loss": 0.6298,
      "step": 1795
    },
    {
      "epoch": 1.1560004814829676,
      "grad_norm": 1.4375,
      "learning_rate": 7.62498380773627e-05,
      "loss": 0.6124,
      "step": 1800
    },
    {
      "epoch": 1.1592103679332344,
      "grad_norm": 1.34375,
      "learning_rate": 7.613154889614362e-05,
      "loss": 0.6236,
      "step": 1805
    },
    {
      "epoch": 1.1624202543835012,
      "grad_norm": 1.3046875,
      "learning_rate": 7.601310322092511e-05,
      "loss": 0.6148,
      "step": 1810
    },
    {
      "epoch": 1.165630140833768,
      "grad_norm": 1.3671875,
      "learning_rate": 7.589450241149671e-05,
      "loss": 0.6119,
      "step": 1815
    },
    {
      "epoch": 1.168840027284035,
      "grad_norm": 1.4375,
      "learning_rate": 7.577574782942893e-05,
      "loss": 0.6034,
      "step": 1820
    },
    {
      "epoch": 1.1720499137343017,
      "grad_norm": 1.4375,
      "learning_rate": 7.565684083805762e-05,
      "loss": 0.6049,
      "step": 1825
    },
    {
      "epoch": 1.1752598001845684,
      "grad_norm": 1.3359375,
      "learning_rate": 7.553778280246835e-05,
      "loss": 0.6314,
      "step": 1830
    },
    {
      "epoch": 1.1784696866348352,
      "grad_norm": 1.359375,
      "learning_rate": 7.541857508948072e-05,
      "loss": 0.6015,
      "step": 1835
    },
    {
      "epoch": 1.1816795730851022,
      "grad_norm": 1.34375,
      "learning_rate": 7.529921906763266e-05,
      "loss": 0.6085,
      "step": 1840
    },
    {
      "epoch": 1.184889459535369,
      "grad_norm": 1.3671875,
      "learning_rate": 7.517971610716473e-05,
      "loss": 0.6071,
      "step": 1845
    },
    {
      "epoch": 1.1880993459856357,
      "grad_norm": 1.296875,
      "learning_rate": 7.50600675800044e-05,
      "loss": 0.6237,
      "step": 1850
    },
    {
      "epoch": 1.1913092324359025,
      "grad_norm": 1.3984375,
      "learning_rate": 7.494027485975027e-05,
      "loss": 0.6062,
      "step": 1855
    },
    {
      "epoch": 1.1945191188861695,
      "grad_norm": 1.359375,
      "learning_rate": 7.482033932165631e-05,
      "loss": 0.6111,
      "step": 1860
    },
    {
      "epoch": 1.1977290053364362,
      "grad_norm": 1.265625,
      "learning_rate": 7.470026234261611e-05,
      "loss": 0.5957,
      "step": 1865
    },
    {
      "epoch": 1.200938891786703,
      "grad_norm": 1.390625,
      "learning_rate": 7.4580045301147e-05,
      "loss": 0.6054,
      "step": 1870
    },
    {
      "epoch": 1.20414877823697,
      "grad_norm": 1.3828125,
      "learning_rate": 7.44596895773743e-05,
      "loss": 0.6264,
      "step": 1875
    },
    {
      "epoch": 1.2073586646872367,
      "grad_norm": 1.2578125,
      "learning_rate": 7.433919655301543e-05,
      "loss": 0.5918,
      "step": 1880
    },
    {
      "epoch": 1.2105685511375035,
      "grad_norm": 1.40625,
      "learning_rate": 7.421856761136405e-05,
      "loss": 0.6138,
      "step": 1885
    },
    {
      "epoch": 1.2137784375877703,
      "grad_norm": 1.3515625,
      "learning_rate": 7.409780413727423e-05,
      "loss": 0.623,
      "step": 1890
    },
    {
      "epoch": 1.2169883240380373,
      "grad_norm": 1.234375,
      "learning_rate": 7.397690751714444e-05,
      "loss": 0.6118,
      "step": 1895
    },
    {
      "epoch": 1.220198210488304,
      "grad_norm": 1.3515625,
      "learning_rate": 7.385587913890175e-05,
      "loss": 0.5957,
      "step": 1900
    },
    {
      "epoch": 1.2234080969385708,
      "grad_norm": 1.53125,
      "learning_rate": 7.373472039198583e-05,
      "loss": 0.6201,
      "step": 1905
    },
    {
      "epoch": 1.2266179833888375,
      "grad_norm": 1.328125,
      "learning_rate": 7.361343266733307e-05,
      "loss": 0.6029,
      "step": 1910
    },
    {
      "epoch": 1.2298278698391045,
      "grad_norm": 1.2265625,
      "learning_rate": 7.34920173573605e-05,
      "loss": 0.6052,
      "step": 1915
    },
    {
      "epoch": 1.2330377562893713,
      "grad_norm": 1.34375,
      "learning_rate": 7.337047585594987e-05,
      "loss": 0.6155,
      "step": 1920
    },
    {
      "epoch": 1.236247642739638,
      "grad_norm": 1.2890625,
      "learning_rate": 7.324880955843167e-05,
      "loss": 0.5776,
      "step": 1925
    },
    {
      "epoch": 1.2394575291899048,
      "grad_norm": 1.3984375,
      "learning_rate": 7.312701986156909e-05,
      "loss": 0.6156,
      "step": 1930
    },
    {
      "epoch": 1.2426674156401718,
      "grad_norm": 1.578125,
      "learning_rate": 7.300510816354194e-05,
      "loss": 0.6011,
      "step": 1935
    },
    {
      "epoch": 1.2458773020904386,
| "grad_norm": 1.671875, |
| "learning_rate": 7.288307586393066e-05, |
| "loss": 0.6094, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.2490871885407053, |
| "grad_norm": 1.390625, |
| "learning_rate": 7.276092436370024e-05, |
| "loss": 0.6074, |
| "step": 1945 |
| }, |
| { |
| "epoch": 1.252297074990972, |
| "grad_norm": 1.328125, |
| "learning_rate": 7.263865506518411e-05, |
| "loss": 0.6002, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.255506961441239, |
| "grad_norm": 1.359375, |
| "learning_rate": 7.251626937206806e-05, |
| "loss": 0.5956, |
| "step": 1955 |
| }, |
| { |
| "epoch": 1.2587168478915058, |
| "grad_norm": 1.375, |
| "learning_rate": 7.239376868937415e-05, |
| "loss": 0.6026, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.2619267343417726, |
| "grad_norm": 1.4453125, |
| "learning_rate": 7.227115442344452e-05, |
| "loss": 0.6136, |
| "step": 1965 |
| }, |
| { |
| "epoch": 1.2651366207920396, |
| "grad_norm": 1.3515625, |
| "learning_rate": 7.214842798192526e-05, |
| "loss": 0.6092, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.2683465072423064, |
| "grad_norm": 1.453125, |
| "learning_rate": 7.202559077375033e-05, |
| "loss": 0.6232, |
| "step": 1975 |
| }, |
| { |
| "epoch": 1.2715563936925731, |
| "grad_norm": 1.28125, |
| "learning_rate": 7.190264420912526e-05, |
| "loss": 0.6139, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.2747662801428399, |
| "grad_norm": 1.2890625, |
| "learning_rate": 7.177958969951104e-05, |
| "loss": 0.6085, |
| "step": 1985 |
| }, |
| { |
| "epoch": 1.2779761665931066, |
| "grad_norm": 1.484375, |
| "learning_rate": 7.165642865760794e-05, |
| "loss": 0.631, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.2811860530433736, |
| "grad_norm": 1.4765625, |
| "learning_rate": 7.15331624973392e-05, |
| "loss": 0.6131, |
| "step": 1995 |
| }, |
| { |
| "epoch": 1.2843959394936404, |
| "grad_norm": 1.4453125, |
| "learning_rate": 7.140979263383488e-05, |
| "loss": 0.6102, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.2843959394936404, |
| "eval_loss": 0.5290513038635254, |
| "eval_runtime": 2.3691, |
| "eval_samples_per_second": 84.421, |
| "eval_steps_per_second": 84.421, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.2876058259439072, |
| "grad_norm": 1.34375, |
| "learning_rate": 7.128632048341553e-05, |
| "loss": 0.6014, |
| "step": 2005 |
| }, |
| { |
| "epoch": 1.2908157123941741, |
| "grad_norm": 1.25, |
| "learning_rate": 7.116274746357605e-05, |
| "loss": 0.6291, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.294025598844441, |
| "grad_norm": 1.265625, |
| "learning_rate": 7.103907499296934e-05, |
| "loss": 0.5853, |
| "step": 2015 |
| }, |
| { |
| "epoch": 1.2972354852947077, |
| "grad_norm": 1.2578125, |
| "learning_rate": 7.091530449138994e-05, |
| "loss": 0.6215, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.3004453717449747, |
| "grad_norm": 1.203125, |
| "learning_rate": 7.079143737975795e-05, |
| "loss": 0.5965, |
| "step": 2025 |
| }, |
| { |
| "epoch": 1.3036552581952414, |
| "grad_norm": 1.3984375, |
| "learning_rate": 7.066747508010243e-05, |
| "loss": 0.6179, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.3068651446455082, |
| "grad_norm": 1.265625, |
| "learning_rate": 7.054341901554537e-05, |
| "loss": 0.5941, |
| "step": 2035 |
| }, |
| { |
| "epoch": 1.310075031095775, |
| "grad_norm": 1.390625, |
| "learning_rate": 7.04192706102851e-05, |
| "loss": 0.6157, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.3132849175460417, |
| "grad_norm": 1.3828125, |
| "learning_rate": 7.029503128958009e-05, |
| "loss": 0.6025, |
| "step": 2045 |
| }, |
| { |
| "epoch": 1.3164948039963087, |
| "grad_norm": 1.2421875, |
| "learning_rate": 7.017070247973255e-05, |
| "loss": 0.5932, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.3197046904465755, |
| "grad_norm": 1.4921875, |
| "learning_rate": 7.004628560807202e-05, |
| "loss": 0.5958, |
| "step": 2055 |
| }, |
| { |
| "epoch": 1.3229145768968422, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.992178210293905e-05, |
| "loss": 0.6041, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.3261244633471092, |
| "grad_norm": 1.3828125, |
| "learning_rate": 6.979719339366876e-05, |
| "loss": 0.6126, |
| "step": 2065 |
| }, |
| { |
| "epoch": 1.329334349797376, |
| "grad_norm": 1.4921875, |
| "learning_rate": 6.96725209105744e-05, |
| "loss": 0.5878, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.3325442362476427, |
| "grad_norm": 1.3203125, |
| "learning_rate": 6.954776608493104e-05, |
| "loss": 0.6037, |
| "step": 2075 |
| }, |
| { |
| "epoch": 1.3357541226979095, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.942293034895899e-05, |
| "loss": 0.5986, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.3389640091481763, |
| "grad_norm": 1.4140625, |
| "learning_rate": 6.929801513580747e-05, |
| "loss": 0.6124, |
| "step": 2085 |
| }, |
| { |
| "epoch": 1.3421738955984432, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.917302187953811e-05, |
| "loss": 0.613, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.34538378204871, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.904795201510852e-05, |
| "loss": 0.5869, |
| "step": 2095 |
| }, |
| { |
| "epoch": 1.3485936684989768, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.892280697835576e-05, |
| "loss": 0.6194, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.3518035549492438, |
| "grad_norm": 1.3828125, |
| "learning_rate": 6.879758820597991e-05, |
| "loss": 0.5933, |
| "step": 2105 |
| }, |
| { |
| "epoch": 1.3550134413995105, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.867229713552754e-05, |
| "loss": 0.6055, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.3582233278497773, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.854693520537524e-05, |
| "loss": 0.6052, |
| "step": 2115 |
| }, |
| { |
| "epoch": 1.3614332143000443, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.842150385471307e-05, |
| "loss": 0.6174, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.364643100750311, |
| "grad_norm": 1.3828125, |
| "learning_rate": 6.829600452352806e-05, |
| "loss": 0.595, |
| "step": 2125 |
| }, |
| { |
| "epoch": 1.3678529872005778, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.817043865258774e-05, |
| "loss": 0.5939, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.3710628736508446, |
| "grad_norm": 1.3359375, |
| "learning_rate": 6.804480768342341e-05, |
| "loss": 0.6006, |
| "step": 2135 |
| }, |
| { |
| "epoch": 1.3742727601011113, |
| "grad_norm": 1.3515625, |
| "learning_rate": 6.791911305831382e-05, |
| "loss": 0.5961, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.3774826465513783, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.779335622026847e-05, |
| "loss": 0.6069, |
| "step": 2145 |
| }, |
| { |
| "epoch": 1.380692533001645, |
| "grad_norm": 1.2734375, |
| "learning_rate": 6.76675386130111e-05, |
| "loss": 0.6059, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.3839024194519118, |
| "grad_norm": 1.3828125, |
| "learning_rate": 6.754166168096306e-05, |
| "loss": 0.5894, |
| "step": 2155 |
| }, |
| { |
| "epoch": 1.3871123059021788, |
| "grad_norm": 1.4296875, |
| "learning_rate": 6.741572686922676e-05, |
| "loss": 0.6092, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.3903221923524456, |
| "grad_norm": 1.328125, |
| "learning_rate": 6.728973562356917e-05, |
| "loss": 0.5937, |
| "step": 2165 |
| }, |
| { |
| "epoch": 1.3935320788027123, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.716368939040503e-05, |
| "loss": 0.5971, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.3967419652529791, |
| "grad_norm": 1.296875, |
| "learning_rate": 6.703758961678041e-05, |
| "loss": 0.5985, |
| "step": 2175 |
| }, |
| { |
| "epoch": 1.3999518517032459, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.691143775035606e-05, |
| "loss": 0.6064, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.4031617381535129, |
| "grad_norm": 1.3515625, |
| "learning_rate": 6.678523523939074e-05, |
| "loss": 0.6034, |
| "step": 2185 |
| }, |
| { |
| "epoch": 1.4063716246037796, |
| "grad_norm": 1.296875, |
| "learning_rate": 6.66589835327246e-05, |
| "loss": 0.5948, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.4095815110540464, |
| "grad_norm": 1.28125, |
| "learning_rate": 6.653268407976258e-05, |
| "loss": 0.5751, |
| "step": 2195 |
| }, |
| { |
| "epoch": 1.4127913975043134, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.640633833045783e-05, |
| "loss": 0.5678, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.4160012839545801, |
| "grad_norm": 1.28125, |
| "learning_rate": 6.627994773529489e-05, |
| "loss": 0.5837, |
| "step": 2205 |
| }, |
| { |
| "epoch": 1.419211170404847, |
| "grad_norm": 1.375, |
| "learning_rate": 6.615351374527323e-05, |
| "loss": 0.5856, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.4224210568551139, |
| "grad_norm": 1.3828125, |
| "learning_rate": 6.602703781189043e-05, |
| "loss": 0.5824, |
| "step": 2215 |
| }, |
| { |
| "epoch": 1.4256309433053806, |
| "grad_norm": 1.296875, |
| "learning_rate": 6.590052138712567e-05, |
| "loss": 0.6043, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.4288408297556474, |
| "grad_norm": 1.3515625, |
| "learning_rate": 6.57739659234229e-05, |
| "loss": 0.5831, |
| "step": 2225 |
| }, |
| { |
| "epoch": 1.4320507162059142, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.564737287367434e-05, |
| "loss": 0.6001, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.435260602656181, |
| "grad_norm": 1.25, |
| "learning_rate": 6.552074369120363e-05, |
| "loss": 0.6059, |
| "step": 2235 |
| }, |
| { |
| "epoch": 1.438470489106448, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.539407982974925e-05, |
| "loss": 0.5936, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.4416803755567147, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.52673827434478e-05, |
| "loss": 0.6078, |
| "step": 2245 |
| }, |
| { |
| "epoch": 1.4448902620069815, |
| "grad_norm": 1.3359375, |
| "learning_rate": 6.514065388681736e-05, |
| "loss": 0.6106, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.4481001484572484, |
| "grad_norm": 1.3515625, |
| "learning_rate": 6.501389471474066e-05, |
| "loss": 0.5819, |
| "step": 2255 |
| }, |
| { |
| "epoch": 1.4513100349075152, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.48871066824485e-05, |
| "loss": 0.5873, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.454519921357782, |
| "grad_norm": 1.328125, |
| "learning_rate": 6.476029124550303e-05, |
| "loss": 0.586, |
| "step": 2265 |
| }, |
| { |
| "epoch": 1.4577298078080487, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.463344985978095e-05, |
| "loss": 0.6004, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.4609396942583155, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.450658398145692e-05, |
| "loss": 0.5848, |
| "step": 2275 |
| }, |
| { |
| "epoch": 1.4641495807085825, |
| "grad_norm": 1.453125, |
| "learning_rate": 6.437969506698678e-05, |
| "loss": 0.6111, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.4673594671588492, |
| "grad_norm": 1.375, |
| "learning_rate": 6.425278457309075e-05, |
| "loss": 0.5844, |
| "step": 2285 |
| }, |
| { |
| "epoch": 1.470569353609116, |
| "grad_norm": 1.328125, |
| "learning_rate": 6.41258539567369e-05, |
| "loss": 0.5919, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.473779240059383, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.399890467512422e-05, |
| "loss": 0.5992, |
| "step": 2295 |
| }, |
| { |
| "epoch": 1.4769891265096498, |
| "grad_norm": 1.4453125, |
| "learning_rate": 6.387193818566605e-05, |
| "loss": 0.5969, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.4801990129599165, |
| "grad_norm": 1.28125, |
| "learning_rate": 6.374495594597322e-05, |
| "loss": 0.6171, |
| "step": 2305 |
| }, |
| { |
| "epoch": 1.4834088994101835, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.361795941383746e-05, |
| "loss": 0.5789, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.4866187858604503, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.349095004721447e-05, |
| "loss": 0.6131, |
| "step": 2315 |
| }, |
| { |
| "epoch": 1.489828672310717, |
| "grad_norm": 1.3203125, |
| "learning_rate": 6.336392930420738e-05, |
| "loss": 0.5972, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.4930385587609838, |
| "grad_norm": 1.3984375, |
| "learning_rate": 6.323689864304991e-05, |
| "loss": 0.5947, |
| "step": 2325 |
| }, |
| { |
| "epoch": 1.4962484452112506, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.31098595220896e-05, |
| "loss": 0.5936, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.4994583316615175, |
| "grad_norm": 1.328125, |
| "learning_rate": 6.298281339977119e-05, |
| "loss": 0.5879, |
| "step": 2335 |
| }, |
| { |
| "epoch": 1.5026682181117843, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.28557617346197e-05, |
| "loss": 0.5841, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.505878104562051, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.272870598522385e-05, |
| "loss": 0.5699, |
| "step": 2345 |
| }, |
| { |
| "epoch": 1.509087991012318, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.260164761021923e-05, |
| "loss": 0.6094, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.5122978774625848, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.247458806827157e-05, |
| "loss": 0.5969, |
| "step": 2355 |
| }, |
| { |
| "epoch": 1.5155077639128516, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.234752881806001e-05, |
| "loss": 0.5865, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.5187176503631186, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.222047131826032e-05, |
| "loss": 0.5898, |
| "step": 2365 |
| }, |
| { |
| "epoch": 1.521927536813385, |
| "grad_norm": 1.3359375, |
| "learning_rate": 6.20934170275282e-05, |
| "loss": 0.6127, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.525137423263652, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.196636740448247e-05, |
| "loss": 0.5926, |
| "step": 2375 |
| }, |
| { |
| "epoch": 1.5283473097139189, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.183932390768842e-05, |
| "loss": 0.582, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.5315571961641856, |
| "grad_norm": 1.2734375, |
| "learning_rate": 6.171228799564095e-05, |
| "loss": 0.57, |
| "step": 2385 |
| }, |
| { |
| "epoch": 1.5347670826144526, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.158526112674792e-05, |
| "loss": 0.5735, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.5379769690647194, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.145824475931338e-05, |
| "loss": 0.5763, |
| "step": 2395 |
| }, |
| { |
| "epoch": 1.5411868555149861, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.133124035152078e-05, |
| "loss": 0.595, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.5443967419652531, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.120424936141631e-05, |
| "loss": 0.5876, |
| "step": 2405 |
| }, |
| { |
| "epoch": 1.5476066284155197, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.10772732468921e-05, |
| "loss": 0.597, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.5508165148657866, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.095031346566951e-05, |
| "loss": 0.5945, |
| "step": 2415 |
| }, |
| { |
| "epoch": 1.5540264013160534, |
| "grad_norm": 1.390625, |
| "learning_rate": 6.082337147528239e-05, |
| "loss": 0.5841, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.5572362877663202, |
| "grad_norm": 1.25, |
| "learning_rate": 6.069644873306034e-05, |
| "loss": 0.5778, |
| "step": 2425 |
| }, |
| { |
| "epoch": 1.5604461742165872, |
| "grad_norm": 1.375, |
| "learning_rate": 6.0569546696112014e-05, |
| "loss": 0.5909, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.563656060666854, |
| "grad_norm": 1.3984375, |
| "learning_rate": 6.04426668213083e-05, |
| "loss": 0.6037, |
| "step": 2435 |
| }, |
| { |
| "epoch": 1.5668659471171207, |
| "grad_norm": 1.328125, |
| "learning_rate": 6.031581056526574e-05, |
| "loss": 0.6011, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.5700758335673877, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.018897938432966e-05, |
| "loss": 0.5872, |
| "step": 2445 |
| }, |
| { |
| "epoch": 1.5732857200176542, |
| "grad_norm": 1.296875, |
| "learning_rate": 6.0062174734557554e-05, |
| "loss": 0.5904, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.5764956064679212, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.99353980717023e-05, |
| "loss": 0.5861, |
| "step": 2455 |
| }, |
| { |
| "epoch": 1.5797054929181882, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.9808650851195517e-05, |
| "loss": 0.5767, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.5829153793684547, |
| "grad_norm": 1.5, |
| "learning_rate": 5.968193452813079e-05, |
| "loss": 0.6083, |
| "step": 2465 |
| }, |
| { |
| "epoch": 1.5861252658187217, |
| "grad_norm": 1.421875, |
| "learning_rate": 5.9555250557247e-05, |
| "loss": 0.5851, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.5893351522689885, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.9428600392911624e-05, |
| "loss": 0.5828, |
| "step": 2475 |
| }, |
| { |
| "epoch": 1.5925450387192552, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.9301985489103984e-05, |
| "loss": 0.5983, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.5957549251695222, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.917540729939869e-05, |
| "loss": 0.5621, |
| "step": 2485 |
| }, |
| { |
| "epoch": 1.598964811619789, |
| "grad_norm": 1.3671875, |
| "learning_rate": 5.904886727694879e-05, |
| "loss": 0.5646, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.6021746980700557, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.8922366874469195e-05, |
| "loss": 0.596, |
| "step": 2495 |
| }, |
| { |
| "epoch": 1.6053845845203227, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.879590754421995e-05, |
| "loss": 0.6159, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.6053845845203227, |
| "eval_loss": 0.4981262981891632, |
| "eval_runtime": 2.3761, |
| "eval_samples_per_second": 84.173, |
| "eval_steps_per_second": 84.173, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.6085944709705893, |
| "grad_norm": 1.3828125, |
| "learning_rate": 5.866949073798958e-05, |
| "loss": 0.6173, |
| "step": 2505 |
| }, |
| { |
| "epoch": 1.6118043574208563, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.854311790707845e-05, |
| "loss": 0.5769, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.615014243871123, |
| "grad_norm": 1.3203125, |
| "learning_rate": 5.8416790502282026e-05, |
| "loss": 0.5856, |
| "step": 2515 |
| }, |
| { |
| "epoch": 1.6182241303213898, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.829050997387432e-05, |
| "loss": 0.5743, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.6214340167716568, |
| "grad_norm": 1.3203125, |
| "learning_rate": 5.816427777159117e-05, |
| "loss": 0.5854, |
| "step": 2525 |
| }, |
| { |
| "epoch": 1.6246439032219235, |
| "grad_norm": 1.3515625, |
| "learning_rate": 5.8038095344613595e-05, |
| "loss": 0.5837, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.6278537896721903, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.791196414155121e-05, |
| "loss": 0.6061, |
| "step": 2535 |
| }, |
| { |
| "epoch": 1.6310636761224573, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.778588561042556e-05, |
| "loss": 0.5856, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.6342735625727238, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.76598611986535e-05, |
| "loss": 0.5721, |
| "step": 2545 |
| }, |
| { |
| "epoch": 1.6374834490229908, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.753389235303055e-05, |
| "loss": 0.5907, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.6406933354732578, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.7407980519714346e-05, |
| "loss": 0.5801, |
| "step": 2555 |
| }, |
| { |
| "epoch": 1.6439032219235243, |
| "grad_norm": 1.5, |
| "learning_rate": 5.728212714420804e-05, |
| "loss": 0.5794, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.6471131083737913, |
| "grad_norm": 1.3515625, |
| "learning_rate": 5.71563336713436e-05, |
| "loss": 0.5779, |
| "step": 2565 |
| }, |
| { |
| "epoch": 1.650322994824058, |
| "grad_norm": 1.4765625, |
| "learning_rate": 5.7030601545265336e-05, |
| "loss": 0.5851, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.6535328812743249, |
| "grad_norm": 1.3359375, |
| "learning_rate": 5.6904932209413276e-05, |
| "loss": 0.5868, |
| "step": 2575 |
| }, |
| { |
| "epoch": 1.6567427677245918, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.6779327106506594e-05, |
| "loss": 0.5722, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.6599526541748586, |
| "grad_norm": 1.328125, |
| "learning_rate": 5.665378767852704e-05, |
| "loss": 0.5988, |
| "step": 2585 |
| }, |
| { |
| "epoch": 1.6631625406251254, |
| "grad_norm": 1.25, |
| "learning_rate": 5.652831536670242e-05, |
| "loss": 0.5766, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.6663724270753923, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.640291161149e-05, |
| "loss": 0.592, |
| "step": 2595 |
| }, |
| { |
| "epoch": 1.669582313525659, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.627757785256006e-05, |
| "loss": 0.5893, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.6727921999759259, |
| "grad_norm": 1.359375, |
| "learning_rate": 5.615231552877921e-05, |
| "loss": 0.5747, |
| "step": 2605 |
| }, |
| { |
| "epoch": 1.6760020864261926, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.602712607819404e-05, |
| "loss": 0.5804, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.6792119728764594, |
| "grad_norm": 1.3515625, |
| "learning_rate": 5.590201093801449e-05, |
| "loss": 0.5734, |
| "step": 2615 |
| }, |
| { |
| "epoch": 1.6824218593267264, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.577697154459742e-05, |
| "loss": 0.5708, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.6856317457769932, |
| "grad_norm": 1.3359375, |
| "learning_rate": 5.565200933343009e-05, |
| "loss": 0.5863, |
| "step": 2625 |
| }, |
| { |
| "epoch": 1.68884163222726, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.5527125739113686e-05, |
| "loss": 0.5846, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.692051518677527, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.540232219534685e-05, |
| "loss": 0.5533, |
| "step": 2635 |
| }, |
| { |
| "epoch": 1.6952614051277934, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.527760013490922e-05, |
| "loss": 0.5916, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.6984712915780604, |
| "grad_norm": 1.328125, |
| "learning_rate": 5.515296098964499e-05, |
| "loss": 0.5641, |
| "step": 2645 |
| }, |
| { |
| "epoch": 1.7016811780283274, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.502840619044645e-05, |
| "loss": 0.5737, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.704891064478594, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.490393716723757e-05, |
| "loss": 0.5728, |
| "step": 2655 |
| }, |
| { |
| "epoch": 1.708100950928861, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.477955534895762e-05, |
| "loss": 0.5614, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.7113108373791277, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.465526216354471e-05, |
| "loss": 0.5819, |
| "step": 2665 |
| }, |
| { |
| "epoch": 1.7145207238293945, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.453105903791942e-05, |
| "loss": 0.5709, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.7177306102796615, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.44069473979684e-05, |
| "loss": 0.5951, |
| "step": 2675 |
| }, |
| { |
| "epoch": 1.7209404967299282, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.428292866852808e-05, |
| "loss": 0.5705, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.724150383180195, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.4159004273368166e-05, |
| "loss": 0.5787, |
| "step": 2685 |
| }, |
| { |
| "epoch": 1.727360269630462, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.4035175635175464e-05, |
| "loss": 0.5832, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.7305701560807285, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.3911444175537394e-05, |
| "loss": 0.5888, |
| "step": 2695 |
| }, |
| { |
| "epoch": 1.7337800425309955, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.3787811314925776e-05, |
| "loss": 0.5695, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.7369899289812623, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.3664278472680496e-05, |
| "loss": 0.569, |
| "step": 2705 |
| }, |
| { |
| "epoch": 1.740199815431529, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.3540847066993173e-05, |
| "loss": 0.5853, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.743409701881796, |
| "grad_norm": 1.25, |
| "learning_rate": 5.341751851489091e-05, |
| "loss": 0.589, |
| "step": 2715 |
| }, |
| { |
| "epoch": 1.7466195883320628, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.329429423222003e-05, |
| "loss": 0.5679, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.7498294747823295, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.3171175633629835e-05, |
| "loss": 0.5823, |
| "step": 2725 |
| }, |
| { |
| "epoch": 1.7530393612325965, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.3048164132556285e-05, |
| "loss": 0.5561, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.756249247682863, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.292526114120589e-05, |
| "loss": 0.5701, |
| "step": 2735 |
| }, |
| { |
| "epoch": 1.75945913413313, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.28024680705394e-05, |
| "loss": 0.5779, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.762669020583397, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.267978633025568e-05, |
| "loss": 0.5607, |
| "step": 2745 |
| }, |
| { |
| "epoch": 1.7658789070336636, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.255721732877546e-05, |
| "loss": 0.5862, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.7690887934839306, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.243476247322521e-05, |
| "loss": 0.5764, |
| "step": 2755 |
| }, |
| { |
| "epoch": 1.7722986799341973, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.2312423169420955e-05, |
| "loss": 0.5814, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.775508566384464, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.219020082185219e-05, |
| "loss": 0.5808, |
| "step": 2765 |
| }, |
| { |
| "epoch": 1.778718452834731, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.206809683366569e-05, |
| "loss": 0.58, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.7819283392849978, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.1946112606649435e-05, |
| "loss": 0.5723, |
| "step": 2775 |
| }, |
| { |
| "epoch": 1.7851382257352646, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.182424954121652e-05, |
| "loss": 0.5789, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.7883481121855316, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.170250903638909e-05, |
| "loss": 0.5726, |
| "step": 2785 |
| }, |
| { |
| "epoch": 1.7915579986357981, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.158089248978221e-05, |
| "loss": 0.5718, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.794767885086065, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.1459401297587916e-05, |
| "loss": 0.5845, |
| "step": 2795 |
| }, |
| { |
| "epoch": 1.7979777715363319, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.1338036854559113e-05, |
| "loss": 0.563, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.8011876579865986, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.1216800553993606e-05, |
| "loss": 0.5841, |
| "step": 2805 |
| }, |
| { |
| "epoch": 1.8043975444368656, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.109569378771808e-05, |
| "loss": 0.5648, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.8076074308871324, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.097471794607214e-05, |
| "loss": 0.5768, |
| "step": 2815 |
| }, |
| { |
| "epoch": 1.8108173173373991, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.0853874417892324e-05, |
| "loss": 0.5596, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.8140272037876661, |
| "grad_norm": 1.3671875, |
| "learning_rate": 5.07331645904962e-05, |
| "loss": 0.5873, |
| "step": 2825 |
| }, |
| { |
| "epoch": 1.8172370902379327, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.061258984966636e-05, |
| "loss": 0.5807, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.8204469766881997, |
| "grad_norm": 1.3359375, |
| "learning_rate": 5.049215157963464e-05, |
| "loss": 0.5814, |
| "step": 2835 |
| }, |
| { |
| "epoch": 1.8236568631384664, |
| "grad_norm": 1.4453125, |
| "learning_rate": 5.03718511630661e-05, |
| "loss": 0.5727, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.8268667495887332, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.025168998104322e-05, |
| "loss": 0.5731, |
| "step": 2845 |
| }, |
| { |
| "epoch": 1.8300766360390002, |
| "grad_norm": 1.25, |
| "learning_rate": 5.013166941304999e-05, |
| "loss": 0.5664, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.833286522489267, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.0011790836956197e-05, |
| "loss": 0.5812, |
| "step": 2855 |
| }, |
| { |
| "epoch": 1.8364964089395337, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.989205562900144e-05, |
| "loss": 0.5715, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.8397062953898007, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.9772465163779474e-05, |
| "loss": 0.5785, |
| "step": 2865 |
| }, |
| { |
| "epoch": 1.8429161818400674, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.9653020814222315e-05, |
| "loss": 0.5813, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.8461260682903342, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.9533723951584554e-05, |
| "loss": 0.59, |
| "step": 2875 |
| }, |
| { |
| "epoch": 1.8493359547406012, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.94145759454276e-05, |
| "loss": 0.565, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.8525458411908677, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.929557816360391e-05, |
| "loss": 0.5839, |
| "step": 2885 |
| }, |
| { |
| "epoch": 1.8557557276411347, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.9176731972241376e-05, |
| "loss": 0.5755, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.8589656140914015, |
| "grad_norm": 1.3125, |
| "learning_rate": 4.905803873572755e-05, |
| "loss": 0.571, |
| "step": 2895 |
| }, |
| { |
| "epoch": 1.8621755005416682, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.8939499816694035e-05, |
| "loss": 0.572, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.8653853869919352, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.882111657600081e-05, |
| "loss": 0.5559, |
| "step": 2905 |
| }, |
| { |
| "epoch": 1.868595273442202, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.8702890372720664e-05, |
| "loss": 0.5792, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.8718051598924688, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.85848225641235e-05, |
| "loss": 0.5611, |
| "step": 2915 |
| }, |
| { |
| "epoch": 1.8750150463427357, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.8466914505660834e-05, |
| "loss": 0.5663, |
| "step": 2920 |
| }, |
| { |
| "epoch": 1.8782249327930023, |
| "grad_norm": 1.3828125, |
| "learning_rate": 4.834916755095022e-05, |
| "loss": 0.5914, |
| "step": 2925 |
| }, |
| { |
| "epoch": 1.8814348192432693, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.823158305175967e-05, |
| "loss": 0.5712, |
| "step": 2930 |
| }, |
| { |
| "epoch": 1.884644705693536, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.811416235799216e-05, |
| "loss": 0.5957, |
| "step": 2935 |
| }, |
| { |
| "epoch": 1.8878545921438028, |
| "grad_norm": 1.375, |
| "learning_rate": 4.7996906817670155e-05, |
| "loss": 0.5872, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.8910644785940698, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.78798177769201e-05, |
| "loss": 0.5604, |
| "step": 2945 |
| }, |
| { |
| "epoch": 1.8942743650443365, |
| "grad_norm": 1.359375, |
| "learning_rate": 4.7762896579956966e-05, |
| "loss": 0.556, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.8974842514946033, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.764614456906886e-05, |
| "loss": 0.5577, |
| "step": 2955 |
| }, |
| { |
| "epoch": 1.9006941379448703, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.752956308460155e-05, |
| "loss": 0.584, |
| "step": 2960 |
| }, |
| { |
| "epoch": 1.903904024395137, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.741315346494314e-05, |
| "loss": 0.5625, |
| "step": 2965 |
| }, |
| { |
| "epoch": 1.9071139108454038, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.729691704650867e-05, |
| "loss": 0.5684, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.9103237972956708, |
| "grad_norm": 1.3359375, |
| "learning_rate": 4.718085516372478e-05, |
| "loss": 0.5851, |
| "step": 2975 |
| }, |
| { |
| "epoch": 1.9135336837459374, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.70649691490144e-05, |
| "loss": 0.5637, |
| "step": 2980 |
| }, |
| { |
| "epoch": 1.9167435701962043, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.694926033278142e-05, |
| "loss": 0.5792, |
| "step": 2985 |
| }, |
| { |
| "epoch": 1.919953456646471, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.683373004339547e-05, |
| "loss": 0.5406, |
| "step": 2990 |
| }, |
| { |
| "epoch": 1.9231633430967379, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.6718379607176634e-05, |
| "loss": 0.5777, |
| "step": 2995 |
| }, |
| { |
| "epoch": 1.9263732295470049, |
| "grad_norm": 1.25, |
| "learning_rate": 4.6603210348380235e-05, |
| "loss": 0.5742, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.9263732295470049, |
| "eval_loss": 0.48648878931999207, |
| "eval_runtime": 2.4037, |
| "eval_samples_per_second": 83.204, |
| "eval_steps_per_second": 83.204, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.9295831159972716, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.64882235891816e-05, |
| "loss": 0.5662, |
| "step": 3005 |
| }, |
| { |
| "epoch": 1.9327930024475384, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.637342064966095e-05, |
| "loss": 0.5972, |
| "step": 3010 |
| }, |
| { |
| "epoch": 1.9360028888978054, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.625880284778818e-05, |
| "loss": 0.5682, |
| "step": 3015 |
| }, |
| { |
| "epoch": 1.939212775348072, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.614437149940776e-05, |
| "loss": 0.5703, |
| "step": 3020 |
| }, |
| { |
| "epoch": 1.942422661798339, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.603012791822362e-05, |
| "loss": 0.5611, |
| "step": 3025 |
| }, |
| { |
| "epoch": 1.9456325482486057, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.591607341578407e-05, |
| "loss": 0.5471, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.9488424346988724, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.580220930146675e-05, |
| "loss": 0.5398, |
| "step": 3035 |
| }, |
| { |
| "epoch": 1.9520523211491394, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.568853688246357e-05, |
| "loss": 0.5864, |
| "step": 3040 |
| }, |
| { |
| "epoch": 1.9552622075994062, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.557505746376576e-05, |
| "loss": 0.5662, |
| "step": 3045 |
| }, |
| { |
| "epoch": 1.958472094049673, |
| "grad_norm": 1.25, |
| "learning_rate": 4.546177234814881e-05, |
| "loss": 0.5745, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.96168198049994, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.53486828361576e-05, |
| "loss": 0.5486, |
| "step": 3055 |
| }, |
| { |
| "epoch": 1.9648918669502067, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.523579022609139e-05, |
| "loss": 0.5703, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.9681017534004734, |
| "grad_norm": 1.3125, |
| "learning_rate": 4.512309581398896e-05, |
| "loss": 0.5627, |
| "step": 3065 |
| }, |
| { |
| "epoch": 1.9713116398507404, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.5010600893613714e-05, |
| "loss": 0.5839, |
| "step": 3070 |
| }, |
| { |
| "epoch": 1.974521526301007, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.489830675643888e-05, |
| "loss": 0.5638, |
| "step": 3075 |
| }, |
| { |
| "epoch": 1.977731412751274, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.478621469163259e-05, |
| "loss": 0.5709, |
| "step": 3080 |
| }, |
| { |
| "epoch": 1.9809412992015407, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.4674325986043145e-05, |
| "loss": 0.558, |
| "step": 3085 |
| }, |
| { |
| "epoch": 1.9841511856518075, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.456264192418422e-05, |
| "loss": 0.5639, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.9873610721020745, |
| "grad_norm": 1.25, |
| "learning_rate": 4.445116378822014e-05, |
| "loss": 0.5742, |
| "step": 3095 |
| }, |
| { |
| "epoch": 1.9905709585523412, |
| "grad_norm": 1.25, |
| "learning_rate": 4.433989285795112e-05, |
| "loss": 0.5653, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.993780845002608, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.4228830410798594e-05, |
| "loss": 0.581, |
| "step": 3105 |
| }, |
| { |
| "epoch": 1.996990731452875, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.411797772179059e-05, |
| "loss": 0.5658, |
| "step": 3110 |
| }, |
| { |
| "epoch": 1.9995586406130883, |
| "eval_loss": 0.48290687799453735, |
| "eval_runtime": 2.4097, |
| "eval_samples_per_second": 82.996, |
| "eval_steps_per_second": 82.996, |
| "step": 3114 |
| }, |
| { |
| "epoch": 2.000641977290053, |
| "grad_norm": 3.296875, |
| "learning_rate": 4.4007336063547e-05, |
| "loss": 0.6695, |
| "step": 3115 |
| }, |
| { |
| "epoch": 2.00385186374032, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.389690670626507e-05, |
| "loss": 0.5518, |
| "step": 3120 |
| }, |
| { |
| "epoch": 2.007061750190587, |
| "grad_norm": 1.25, |
| "learning_rate": 4.378669091770474e-05, |
| "loss": 0.5527, |
| "step": 3125 |
| }, |
| { |
| "epoch": 2.0102716366408537, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.367668996317413e-05, |
| "loss": 0.5517, |
| "step": 3130 |
| }, |
| { |
| "epoch": 2.0134815230911207, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.3566905105515035e-05, |
| "loss": 0.5451, |
| "step": 3135 |
| }, |
| { |
| "epoch": 2.0166914095413873, |
| "grad_norm": 1.25, |
| "learning_rate": 4.345733760508832e-05, |
| "loss": 0.5342, |
| "step": 3140 |
| }, |
| { |
| "epoch": 2.0199012959916542, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.334798871975963e-05, |
| "loss": 0.5445, |
| "step": 3145 |
| }, |
| { |
| "epoch": 2.0231111824419212, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.3238859704884784e-05, |
| "loss": 0.5442, |
| "step": 3150 |
| }, |
| { |
| "epoch": 2.0263210688921878, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.312995181329543e-05, |
| "loss": 0.5367, |
| "step": 3155 |
| }, |
| { |
| "epoch": 2.0295309553424548, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.3021266295284665e-05, |
| "loss": 0.5466, |
| "step": 3160 |
| }, |
| { |
| "epoch": 2.0327408417927217, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.291280439859269e-05, |
| "loss": 0.5709, |
| "step": 3165 |
| }, |
| { |
| "epoch": 2.0359507282429883, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.280456736839245e-05, |
| "loss": 0.5409, |
| "step": 3170 |
| }, |
| { |
| "epoch": 2.0391606146932553, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.269655644727536e-05, |
| "loss": 0.5526, |
| "step": 3175 |
| }, |
| { |
| "epoch": 2.0423705011435223, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.258877287523707e-05, |
| "loss": 0.539, |
| "step": 3180 |
| }, |
| { |
| "epoch": 2.045580387593789, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.2481217889663156e-05, |
| "loss": 0.5503, |
| "step": 3185 |
| }, |
| { |
| "epoch": 2.048790274044056, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.237389272531499e-05, |
| "loss": 0.5537, |
| "step": 3190 |
| }, |
| { |
| "epoch": 2.0520001604943223, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.2266798614315505e-05, |
| "loss": 0.544, |
| "step": 3195 |
| }, |
| { |
| "epoch": 2.0552100469445893, |
| "grad_norm": 1.3671875, |
| "learning_rate": 4.2159936786135115e-05, |
| "loss": 0.5358, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.0584199333948563, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.2053308467577516e-05, |
| "loss": 0.5185, |
| "step": 3205 |
| }, |
| { |
| "epoch": 2.061629819845123, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.1946914882765684e-05, |
| "loss": 0.5666, |
| "step": 3210 |
| }, |
| { |
| "epoch": 2.06483970629539, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.184075725312776e-05, |
| "loss": 0.5325, |
| "step": 3215 |
| }, |
| { |
| "epoch": 2.068049592745657, |
| "grad_norm": 1.25, |
| "learning_rate": 4.173483679738309e-05, |
| "loss": 0.5484, |
| "step": 3220 |
| }, |
| { |
| "epoch": 2.0712594791959233, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.162915473152816e-05, |
| "loss": 0.5483, |
| "step": 3225 |
| }, |
| { |
| "epoch": 2.0744693656461903, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.152371226882268e-05, |
| "loss": 0.5411, |
| "step": 3230 |
| }, |
| { |
| "epoch": 2.077679252096457, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.141851061977565e-05, |
| "loss": 0.5503, |
| "step": 3235 |
| }, |
| { |
| "epoch": 2.080889138546724, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.131355099213149e-05, |
| "loss": 0.552, |
| "step": 3240 |
| }, |
| { |
| "epoch": 2.084099024996991, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.120883459085611e-05, |
| "loss": 0.5297, |
| "step": 3245 |
| }, |
| { |
| "epoch": 2.0873089114472574, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.110436261812313e-05, |
| "loss": 0.5324, |
| "step": 3250 |
| }, |
| { |
| "epoch": 2.0905187978975244, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.100013627330006e-05, |
| "loss": 0.5355, |
| "step": 3255 |
| }, |
| { |
| "epoch": 2.0937286843477914, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.089615675293452e-05, |
| "loss": 0.5508, |
| "step": 3260 |
| }, |
| { |
| "epoch": 2.096938570798058, |
| "grad_norm": 1.25, |
| "learning_rate": 4.0792425250740544e-05, |
| "loss": 0.5185, |
| "step": 3265 |
| }, |
| { |
| "epoch": 2.100148457248325, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.0688942957584825e-05, |
| "loss": 0.5783, |
| "step": 3270 |
| }, |
| { |
| "epoch": 2.103358343698592, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.058571106147307e-05, |
| "loss": 0.5403, |
| "step": 3275 |
| }, |
| { |
| "epoch": 2.1065682301488584, |
| "grad_norm": 1.3359375, |
| "learning_rate": 4.048273074753637e-05, |
| "loss": 0.5358, |
| "step": 3280 |
| }, |
| { |
| "epoch": 2.1097781165991254, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.038000319801756e-05, |
| "loss": 0.5203, |
| "step": 3285 |
| }, |
| { |
| "epoch": 2.112988003049392, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.0277529592257676e-05, |
| "loss": 0.5501, |
| "step": 3290 |
| }, |
| { |
| "epoch": 2.116197889499659, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.017531110668244e-05, |
| "loss": 0.5677, |
| "step": 3295 |
| }, |
| { |
| "epoch": 2.119407775949926, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.0073348914788684e-05, |
| "loss": 0.536, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.1226176624001925, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.997164418713093e-05, |
| "loss": 0.553, |
| "step": 3305 |
| }, |
| { |
| "epoch": 2.1258275488504594, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.987019809130794e-05, |
| "loss": 0.5614, |
| "step": 3310 |
| }, |
| { |
| "epoch": 2.1290374353007264, |
| "grad_norm": 1.25, |
| "learning_rate": 3.9769011791949305e-05, |
| "loss": 0.5337, |
| "step": 3315 |
| }, |
| { |
| "epoch": 2.132247321750993, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.9668086450702086e-05, |
| "loss": 0.5257, |
| "step": 3320 |
| }, |
| { |
| "epoch": 2.13545720820126, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.956742322621747e-05, |
| "loss": 0.5379, |
| "step": 3325 |
| }, |
| { |
| "epoch": 2.1386670946515265, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.946702327413746e-05, |
| "loss": 0.5356, |
| "step": 3330 |
| }, |
| { |
| "epoch": 2.1418769811017935, |
| "grad_norm": 1.3203125, |
| "learning_rate": 3.936688774708163e-05, |
| "loss": 0.5343, |
| "step": 3335 |
| }, |
| { |
| "epoch": 2.1450868675520605, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.926701779463389e-05, |
| "loss": 0.5452, |
| "step": 3340 |
| }, |
| { |
| "epoch": 2.148296754002327, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.916741456332926e-05, |
| "loss": 0.5443, |
| "step": 3345 |
| }, |
| { |
| "epoch": 2.151506640452594, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.906807919664073e-05, |
| "loss": 0.5368, |
| "step": 3350 |
| }, |
| { |
| "epoch": 2.154716526902861, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.8969012834966135e-05, |
| "loss": 0.5436, |
| "step": 3355 |
| }, |
| { |
| "epoch": 2.1579264133531275, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.8870216615615045e-05, |
| "loss": 0.5238, |
| "step": 3360 |
| }, |
| { |
| "epoch": 2.1611362998033945, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.877169167279575e-05, |
| "loss": 0.5483, |
| "step": 3365 |
| }, |
| { |
| "epoch": 2.1643461862536615, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.867343913760218e-05, |
| "loss": 0.5313, |
| "step": 3370 |
| }, |
| { |
| "epoch": 2.167556072703928, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.857546013800095e-05, |
| "loss": 0.539, |
| "step": 3375 |
| }, |
| { |
| "epoch": 2.170765959154195, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.847775579881844e-05, |
| "loss": 0.5385, |
| "step": 3380 |
| }, |
| { |
| "epoch": 2.1739758456044616, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.8380327241727804e-05, |
| "loss": 0.5496, |
| "step": 3385 |
| }, |
| { |
| "epoch": 2.1771857320547285, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.828317558523619e-05, |
| "loss": 0.545, |
| "step": 3390 |
| }, |
| { |
| "epoch": 2.1803956185049955, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.818630194467181e-05, |
| "loss": 0.5343, |
| "step": 3395 |
| }, |
| { |
| "epoch": 2.183605504955262, |
| "grad_norm": 1.3828125, |
| "learning_rate": 3.8089707432171193e-05, |
| "loss": 0.5325, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.186815391405529, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.799339315666641e-05, |
| "loss": 0.547, |
| "step": 3405 |
| }, |
| { |
| "epoch": 2.190025277855796, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.789736022387231e-05, |
| "loss": 0.5448, |
| "step": 3410 |
| }, |
| { |
| "epoch": 2.1932351643060626, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.780160973627386e-05, |
| "loss": 0.5431, |
| "step": 3415 |
| }, |
| { |
| "epoch": 2.1964450507563296, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.770614279311348e-05, |
| "loss": 0.5599, |
| "step": 3420 |
| }, |
| { |
| "epoch": 2.1996549372065965, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.7610960490378415e-05, |
| "loss": 0.5474, |
| "step": 3425 |
| }, |
| { |
| "epoch": 2.202864823656863, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.751606392078816e-05, |
| "loss": 0.5688, |
| "step": 3430 |
| }, |
| { |
| "epoch": 2.20607471010713, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.74214541737819e-05, |
| "loss": 0.5326, |
| "step": 3435 |
| }, |
| { |
| "epoch": 2.2092845965573966, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.732713233550606e-05, |
| "loss": 0.5303, |
| "step": 3440 |
| }, |
| { |
| "epoch": 2.2124944830076636, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.723309948880176e-05, |
| "loss": 0.5402, |
| "step": 3445 |
| }, |
| { |
| "epoch": 2.2157043694579306, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.713935671319239e-05, |
| "loss": 0.5268, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.218914255908197, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.704590508487129e-05, |
| "loss": 0.5613, |
| "step": 3455 |
| }, |
| { |
| "epoch": 2.222124142358464, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.695274567668933e-05, |
| "loss": 0.5533, |
| "step": 3460 |
| }, |
| { |
| "epoch": 2.2253340288087307, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.6859879558142594e-05, |
| "loss": 0.5403, |
| "step": 3465 |
| }, |
| { |
| "epoch": 2.2285439152589976, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.6767307795360145e-05, |
| "loss": 0.5304, |
| "step": 3470 |
| }, |
| { |
| "epoch": 2.2317538017092646, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.6675031451091755e-05, |
| "loss": 0.5323, |
| "step": 3475 |
| }, |
| { |
| "epoch": 2.234963688159531, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.65830515846957e-05, |
| "loss": 0.5299, |
| "step": 3480 |
| }, |
| { |
| "epoch": 2.238173574609798, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.64913692521266e-05, |
| "loss": 0.5645, |
| "step": 3485 |
| }, |
| { |
| "epoch": 2.241383461060065, |
| "grad_norm": 1.375, |
| "learning_rate": 3.6399985505923295e-05, |
| "loss": 0.5453, |
| "step": 3490 |
| }, |
| { |
| "epoch": 2.2445933475103317, |
| "grad_norm": 1.25, |
| "learning_rate": 3.6308901395196825e-05, |
| "loss": 0.5387, |
| "step": 3495 |
| }, |
| { |
| "epoch": 2.2478032339605987, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.621811796561827e-05, |
| "loss": 0.5512, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.2478032339605987, |
| "eval_loss": 0.4768131375312805, |
| "eval_runtime": 2.3764, |
| "eval_samples_per_second": 84.16, |
| "eval_steps_per_second": 84.16, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.2510131204108657, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.6127636259406837e-05, |
| "loss": 0.555, |
| "step": 3505 |
| }, |
| { |
| "epoch": 2.254223006861132, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.6037457315317844e-05, |
| "loss": 0.5454, |
| "step": 3510 |
| }, |
| { |
| "epoch": 2.257432893311399, |
| "grad_norm": 1.25, |
| "learning_rate": 3.5947582168630855e-05, |
| "loss": 0.535, |
| "step": 3515 |
| }, |
| { |
| "epoch": 2.2606427797616657, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.585801185113771e-05, |
| "loss": 0.5461, |
| "step": 3520 |
| }, |
| { |
| "epoch": 2.2638526662119327, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.576874739113073e-05, |
| "loss": 0.527, |
| "step": 3525 |
| }, |
| { |
| "epoch": 2.2670625526621997, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.567978981339095e-05, |
| "loss": 0.5364, |
| "step": 3530 |
| }, |
| { |
| "epoch": 2.2702724391124662, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.559114013917624e-05, |
| "loss": 0.5366, |
| "step": 3535 |
| }, |
| { |
| "epoch": 2.273482325562733, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.5502799386209726e-05, |
| "loss": 0.5386, |
| "step": 3540 |
| }, |
| { |
| "epoch": 2.276692212013, |
| "grad_norm": 1.125, |
| "learning_rate": 3.5414768568667974e-05, |
| "loss": 0.5391, |
| "step": 3545 |
| }, |
| { |
| "epoch": 2.2799020984632667, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.532704869716943e-05, |
| "loss": 0.5342, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.2831119849135337, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.523964077876279e-05, |
| "loss": 0.5506, |
| "step": 3555 |
| }, |
| { |
| "epoch": 2.2863218713638007, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.5152545816915446e-05, |
| "loss": 0.561, |
| "step": 3560 |
| }, |
| { |
| "epoch": 2.2895317578140673, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.506576481150194e-05, |
| "loss": 0.5429, |
| "step": 3565 |
| }, |
| { |
| "epoch": 2.2927416442643342, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.497929875879254e-05, |
| "loss": 0.5374, |
| "step": 3570 |
| }, |
| { |
| "epoch": 2.295951530714601, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.4893148651441735e-05, |
| "loss": 0.5634, |
| "step": 3575 |
| }, |
| { |
| "epoch": 2.2991614171648678, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.480731547847688e-05, |
| "loss": 0.5394, |
| "step": 3580 |
| }, |
| { |
| "epoch": 2.3023713036151348, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.472180022528686e-05, |
| "loss": 0.5342, |
| "step": 3585 |
| }, |
| { |
| "epoch": 2.3055811900654013, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.4636603873610735e-05, |
| "loss": 0.547, |
| "step": 3590 |
| }, |
| { |
| "epoch": 2.3087910765156683, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.455172740152648e-05, |
| "loss": 0.5421, |
| "step": 3595 |
| }, |
| { |
| "epoch": 2.3120009629659353, |
| "grad_norm": 1.2890625, |
| "learning_rate": 3.446717178343976e-05, |
| "loss": 0.5562, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.315210849416202, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.438293799007276e-05, |
| "loss": 0.5358, |
| "step": 3605 |
| }, |
| { |
| "epoch": 2.318420735866469, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.429902698845302e-05, |
| "loss": 0.5555, |
| "step": 3610 |
| }, |
| { |
| "epoch": 2.321630622316736, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.421543974190234e-05, |
| "loss": 0.5414, |
| "step": 3615 |
| }, |
| { |
| "epoch": 2.3248405087670023, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.4132177210025724e-05, |
| "loss": 0.5336, |
| "step": 3620 |
| }, |
| { |
| "epoch": 2.3280503952172693, |
| "grad_norm": 1.3359375, |
| "learning_rate": 3.404924034870036e-05, |
| "loss": 0.5351, |
| "step": 3625 |
| }, |
| { |
| "epoch": 2.331260281667536, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.396663011006465e-05, |
| "loss": 0.5523, |
| "step": 3630 |
| }, |
| { |
| "epoch": 2.334470168117803, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.388434744250726e-05, |
| "loss": 0.5347, |
| "step": 3635 |
| }, |
| { |
| "epoch": 2.33768005456807, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.3802393290656274e-05, |
| "loss": 0.5387, |
| "step": 3640 |
| }, |
| { |
| "epoch": 2.3408899410183364, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.372076859536831e-05, |
| "loss": 0.5309, |
| "step": 3645 |
| }, |
| { |
| "epoch": 2.3440998274686033, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.363947429371772e-05, |
| "loss": 0.5531, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.34730971391887, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.355851131898585e-05, |
| "loss": 0.5437, |
| "step": 3655 |
| }, |
| { |
| "epoch": 2.350519600369137, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.347788060065036e-05, |
| "loss": 0.5143, |
| "step": 3660 |
| }, |
| { |
| "epoch": 2.353729486819404, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.339758306437445e-05, |
| "loss": 0.532, |
| "step": 3665 |
| }, |
| { |
| "epoch": 2.3569393732696704, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.331761963199634e-05, |
| "loss": 0.5535, |
| "step": 3670 |
| }, |
| { |
| "epoch": 2.3601492597199374, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.3237991221518636e-05, |
| "loss": 0.5384, |
| "step": 3675 |
| }, |
| { |
| "epoch": 2.3633591461702044, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.3158698747097784e-05, |
| "loss": 0.5444, |
| "step": 3680 |
| }, |
| { |
| "epoch": 2.366569032620471, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.30797431190336e-05, |
| "loss": 0.5392, |
| "step": 3685 |
| }, |
| { |
| "epoch": 2.369778919070738, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.300112524375881e-05, |
| "loss": 0.5505, |
| "step": 3690 |
| }, |
| { |
| "epoch": 2.372988805521005, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.2922846023828645e-05, |
| "loss": 0.5432, |
| "step": 3695 |
| }, |
| { |
| "epoch": 2.3761986919712714, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.2844906357910476e-05, |
| "loss": 0.5294, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.3794085784215384, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.2767307140773494e-05, |
| "loss": 0.5619, |
| "step": 3705 |
| }, |
| { |
| "epoch": 2.382618464871805, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.2690049263278455e-05, |
| "loss": 0.5422, |
| "step": 3710 |
| }, |
| { |
| "epoch": 2.385828351322072, |
| "grad_norm": 1.3203125, |
| "learning_rate": 3.261313361236743e-05, |
| "loss": 0.5413, |
| "step": 3715 |
| }, |
| { |
| "epoch": 2.389038237772339, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.253656107105362e-05, |
| "loss": 0.535, |
| "step": 3720 |
| }, |
| { |
| "epoch": 2.3922481242226055, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.246033251841126e-05, |
| "loss": 0.5228, |
| "step": 3725 |
| }, |
| { |
| "epoch": 2.3954580106728725, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.238444882956548e-05, |
| "loss": 0.5378, |
| "step": 3730 |
| }, |
| { |
| "epoch": 2.3986678971231394, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.230891087568229e-05, |
| "loss": 0.5469, |
| "step": 3735 |
| }, |
| { |
| "epoch": 2.401877783573406, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.2233719523958563e-05, |
| "loss": 0.5509, |
| "step": 3740 |
| }, |
| { |
| "epoch": 2.405087670023673, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.2158875637612053e-05, |
| "loss": 0.5212, |
| "step": 3745 |
| }, |
| { |
| "epoch": 2.40829755647394, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.208438007587156e-05, |
| "loss": 0.5221, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.4115074429242065, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.201023369396699e-05, |
| "loss": 0.5311, |
| "step": 3755 |
| }, |
| { |
| "epoch": 2.4147173293744735, |
| "grad_norm": 1.15625, |
| "learning_rate": 3.193643734311958e-05, |
| "loss": 0.5403, |
| "step": 3760 |
| }, |
| { |
| "epoch": 2.41792721582474, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.1862991870532106e-05, |
| "loss": 0.548, |
| "step": 3765 |
| }, |
| { |
| "epoch": 2.421137102275007, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.1789898119379156e-05, |
| "loss": 0.5466, |
| "step": 3770 |
| }, |
| { |
| "epoch": 2.424346988725274, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.171715692879748e-05, |
| "loss": 0.5336, |
| "step": 3775 |
| }, |
| { |
| "epoch": 2.4275568751755405, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.164476913387631e-05, |
| "loss": 0.5341, |
| "step": 3780 |
| }, |
| { |
| "epoch": 2.4307667616258075, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.1572735565647815e-05, |
| "loss": 0.5335, |
| "step": 3785 |
| }, |
| { |
| "epoch": 2.4339766480760745, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.1501057051077535e-05, |
| "loss": 0.5309, |
| "step": 3790 |
| }, |
| { |
| "epoch": 2.437186534526341, |
| "grad_norm": 1.328125, |
| "learning_rate": 3.142973441305488e-05, |
| "loss": 0.5451, |
| "step": 3795 |
| }, |
| { |
| "epoch": 2.440396420976608, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.135876847038371e-05, |
| "loss": 0.5381, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.443606307426875, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.1288160037772953e-05, |
| "loss": 0.5474, |
| "step": 3805 |
| }, |
| { |
| "epoch": 2.4468161938771416, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.121790992582717e-05, |
| "loss": 0.5424, |
| "step": 3810 |
| }, |
| { |
| "epoch": 2.4500260803274085, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.1148018941037324e-05, |
| "loss": 0.5475, |
| "step": 3815 |
| }, |
| { |
| "epoch": 2.453235966777675, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.10784878857715e-05, |
| "loss": 0.5341, |
| "step": 3820 |
| }, |
| { |
| "epoch": 2.456445853227942, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.100931755826569e-05, |
| "loss": 0.5365, |
| "step": 3825 |
| }, |
| { |
| "epoch": 2.459655739678209, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.094050875261462e-05, |
| "loss": 0.5628, |
| "step": 3830 |
| }, |
| { |
| "epoch": 2.4628656261284756, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.087206225876266e-05, |
| "loss": 0.54, |
| "step": 3835 |
| }, |
| { |
| "epoch": 2.4660755125787426, |
| "grad_norm": 1.296875, |
| "learning_rate": 3.080397886249472e-05, |
| "loss": 0.5375, |
| "step": 3840 |
| }, |
| { |
| "epoch": 2.469285399029009, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.073625934542727e-05, |
| "loss": 0.5427, |
| "step": 3845 |
| }, |
| { |
| "epoch": 2.472495285479276, |
| "grad_norm": 1.3828125, |
| "learning_rate": 3.0668904484999334e-05, |
| "loss": 0.5511, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.475705171929543, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.060191505446357e-05, |
| "loss": 0.5377, |
| "step": 3855 |
| }, |
| { |
| "epoch": 2.4789150583798096, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.0535291822877405e-05, |
| "loss": 0.533, |
| "step": 3860 |
| }, |
| { |
| "epoch": 2.4821249448300766, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.0469035555094194e-05, |
| "loss": 0.5372, |
| "step": 3865 |
| }, |
| { |
| "epoch": 2.4853348312803436, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.040314701175445e-05, |
| "loss": 0.544, |
| "step": 3870 |
| }, |
| { |
| "epoch": 2.48854471773061, |
| "grad_norm": 1.25, |
| "learning_rate": 3.0337626949277105e-05, |
| "loss": 0.5307, |
| "step": 3875 |
| }, |
| { |
| "epoch": 2.491754604180877, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.0272476119850835e-05, |
| "loss": 0.5482, |
| "step": 3880 |
| }, |
| { |
| "epoch": 2.494964490631144, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.020769527142541e-05, |
| "loss": 0.5412, |
| "step": 3885 |
| }, |
| { |
| "epoch": 2.4981743770814107, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.0143285147703114e-05, |
| "loss": 0.5554, |
| "step": 3890 |
| }, |
| { |
| "epoch": 2.5013842635316776, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.0079246488130197e-05, |
| "loss": 0.5369, |
| "step": 3895 |
| }, |
| { |
| "epoch": 2.504594149981944, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.0015580027888424e-05, |
| "loss": 0.5504, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.507804036432211, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.9952286497886572e-05, |
| "loss": 0.5287, |
| "step": 3905 |
| }, |
| { |
| "epoch": 2.511013922882478, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.9889366624752118e-05, |
| "loss": 0.5553, |
| "step": 3910 |
| }, |
| { |
| "epoch": 2.5142238093327447, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.9826821130822807e-05, |
| "loss": 0.5343, |
| "step": 3915 |
| }, |
| { |
| "epoch": 2.5174336957830117, |
| "grad_norm": 1.25, |
| "learning_rate": 2.9764650734138434e-05, |
| "loss": 0.5326, |
| "step": 3920 |
| }, |
| { |
| "epoch": 2.5206435822332782, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.9702856148432573e-05, |
| "loss": 0.5366, |
| "step": 3925 |
| }, |
| { |
| "epoch": 2.523853468683545, |
| "grad_norm": 1.3671875, |
| "learning_rate": 2.9641438083124372e-05, |
| "loss": 0.5335, |
| "step": 3930 |
| }, |
| { |
| "epoch": 2.527063355133812, |
| "grad_norm": 1.1484375, |
| "learning_rate": 2.958039724331042e-05, |
| "loss": 0.518, |
| "step": 3935 |
| }, |
| { |
| "epoch": 2.530273241584079, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.9519734329756666e-05, |
| "loss": 0.5379, |
| "step": 3940 |
| }, |
| { |
| "epoch": 2.5334831280343457, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.9459450038890333e-05, |
| "loss": 0.5287, |
| "step": 3945 |
| }, |
| { |
| "epoch": 2.5366930144846127, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.9399545062791967e-05, |
| "loss": 0.5245, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.5399029009348792, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.9340020089187492e-05, |
| "loss": 0.541, |
| "step": 3955 |
| }, |
| { |
| "epoch": 2.5431127873851462, |
| "grad_norm": 1.25, |
| "learning_rate": 2.928087580144026e-05, |
| "loss": 0.5299, |
| "step": 3960 |
| }, |
| { |
| "epoch": 2.546322673835413, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.9222112878543273e-05, |
| "loss": 0.527, |
| "step": 3965 |
| }, |
| { |
| "epoch": 2.5495325602856798, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.9163731995111333e-05, |
| "loss": 0.5581, |
| "step": 3970 |
| }, |
| { |
| "epoch": 2.5527424467359467, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.9105733821373333e-05, |
| "loss": 0.5499, |
| "step": 3975 |
| }, |
| { |
| "epoch": 2.5559523331862133, |
| "grad_norm": 1.25, |
| "learning_rate": 2.9048119023164555e-05, |
| "loss": 0.5265, |
| "step": 3980 |
| }, |
| { |
| "epoch": 2.5591622196364803, |
| "grad_norm": 1.1640625, |
| "learning_rate": 2.8990888261919024e-05, |
| "loss": 0.5433, |
| "step": 3985 |
| }, |
| { |
| "epoch": 2.5623721060867473, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.8934042194661913e-05, |
| "loss": 0.5503, |
| "step": 3990 |
| }, |
| { |
| "epoch": 2.5655819925370142, |
| "grad_norm": 1.265625, |
| "learning_rate": 2.8877581474001986e-05, |
| "loss": 0.5327, |
| "step": 3995 |
| }, |
| { |
| "epoch": 2.568791878987281, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.8821506748124132e-05, |
| "loss": 0.5499, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.568791878987281, |
| "eval_loss": 0.4683253765106201, |
| "eval_runtime": 2.4022, |
| "eval_samples_per_second": 83.257, |
| "eval_steps_per_second": 83.257, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.5720017654375478, |
| "grad_norm": 1.34375, |
| "learning_rate": 2.8765818660781912e-05, |
| "loss": 0.5244, |
| "step": 4005 |
| }, |
| { |
| "epoch": 2.5752116518878143, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.8710517851290174e-05, |
| "loss": 0.5457, |
| "step": 4010 |
| }, |
| { |
| "epoch": 2.5784215383380813, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.865560495451769e-05, |
| "loss": 0.539, |
| "step": 4015 |
| }, |
| { |
| "epoch": 2.5816314247883483, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.8601080600879892e-05, |
| "loss": 0.5469, |
| "step": 4020 |
| }, |
| { |
| "epoch": 2.584841311238615, |
| "grad_norm": 1.265625, |
| "learning_rate": 2.854694541633165e-05, |
| "loss": 0.5536, |
| "step": 4025 |
| }, |
| { |
| "epoch": 2.588051197688882, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.8493200022360027e-05, |
| "loss": 0.5324, |
| "step": 4030 |
| }, |
| { |
| "epoch": 2.5912610841391484, |
| "grad_norm": 1.2890625, |
| "learning_rate": 2.8439845035977214e-05, |
| "loss": 0.519, |
| "step": 4035 |
| }, |
| { |
| "epoch": 2.5944709705894153, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.838688106971339e-05, |
| "loss": 0.534, |
| "step": 4040 |
| }, |
| { |
| "epoch": 2.5976808570396823, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.8334308731609722e-05, |
| "loss": 0.5333, |
| "step": 4045 |
| }, |
| { |
| "epoch": 2.6008907434899493, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.8282128625211378e-05, |
| "loss": 0.5319, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.604100629940216, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.8230341349560603e-05, |
| "loss": 0.5411, |
| "step": 4055 |
| }, |
| { |
| "epoch": 2.607310516390483, |
| "grad_norm": 1.25, |
| "learning_rate": 2.8178947499189812e-05, |
| "loss": 0.5493, |
| "step": 4060 |
| }, |
| { |
| "epoch": 2.6105204028407494, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.812794766411481e-05, |
| "loss": 0.5491, |
| "step": 4065 |
| }, |
| { |
| "epoch": 2.6137302892910164, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.8077342429827992e-05, |
| "loss": 0.5423, |
| "step": 4070 |
| }, |
| { |
| "epoch": 2.6169401757412833, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.802713237729162e-05, |
| "loss": 0.5493, |
| "step": 4075 |
| }, |
| { |
| "epoch": 2.62015006219155, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.797731808293116e-05, |
| "loss": 0.5503, |
| "step": 4080 |
| }, |
| { |
| "epoch": 2.623359948641817, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.7927900118628652e-05, |
| "loss": 0.5297, |
| "step": 4085 |
| }, |
| { |
| "epoch": 2.6265698350920834, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.787887905171619e-05, |
| "loss": 0.5406, |
| "step": 4090 |
| }, |
| { |
| "epoch": 2.6297797215423504, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.7830255444969332e-05, |
| "loss": 0.531, |
| "step": 4095 |
| }, |
| { |
| "epoch": 2.6329896079926174, |
| "grad_norm": 1.40625, |
| "learning_rate": 2.7782029856600715e-05, |
| "loss": 0.5403, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.636199494442884, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.77342028402536e-05, |
| "loss": 0.5568, |
| "step": 4105 |
| }, |
| { |
| "epoch": 2.639409380893151, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.7686774944995526e-05, |
| "loss": 0.5364, |
| "step": 4110 |
| }, |
| { |
| "epoch": 2.6426192673434175, |
| "grad_norm": 1.25, |
| "learning_rate": 2.763974671531201e-05, |
| "loss": 0.5501, |
| "step": 4115 |
| }, |
| { |
| "epoch": 2.6458291537936844, |
| "grad_norm": 1.3671875, |
| "learning_rate": 2.759311869110032e-05, |
| "loss": 0.5469, |
| "step": 4120 |
| }, |
| { |
| "epoch": 2.6490390402439514, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.7546891407663216e-05, |
| "loss": 0.5401, |
| "step": 4125 |
| }, |
| { |
| "epoch": 2.6522489266942184, |
| "grad_norm": 1.2890625, |
| "learning_rate": 2.7501065395702864e-05, |
| "loss": 0.5465, |
| "step": 4130 |
| }, |
| { |
| "epoch": 2.655458813144485, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.745564118131472e-05, |
| "loss": 0.5332, |
| "step": 4135 |
| }, |
| { |
| "epoch": 2.658668699594752, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.741061928598149e-05, |
| "loss": 0.5376, |
| "step": 4140 |
| }, |
| { |
| "epoch": 2.6618785860450185, |
| "grad_norm": 1.25, |
| "learning_rate": 2.736600022656714e-05, |
| "loss": 0.5382, |
| "step": 4145 |
| }, |
| { |
| "epoch": 2.6650884724952855, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.7321784515310965e-05, |
| "loss": 0.5494, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.6682983589455525, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.7277972659821727e-05, |
| "loss": 0.5511, |
| "step": 4155 |
| }, |
| { |
| "epoch": 2.671508245395819, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.723456516307178e-05, |
| "loss": 0.552, |
| "step": 4160 |
| }, |
| { |
| "epoch": 2.674718131846086, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.7191562523391363e-05, |
| "loss": 0.5295, |
| "step": 4165 |
| }, |
| { |
| "epoch": 2.6779280182963525, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.7148965234462807e-05, |
| "loss": 0.5491, |
| "step": 4170 |
| }, |
| { |
| "epoch": 2.6811379047466195, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.7106773785314937e-05, |
| "loss": 0.5218, |
| "step": 4175 |
| }, |
| { |
| "epoch": 2.6843477911968865, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.70649886603174e-05, |
| "loss": 0.5303, |
| "step": 4180 |
| }, |
| { |
| "epoch": 2.6875576776471535, |
| "grad_norm": 1.25, |
| "learning_rate": 2.7023610339175127e-05, |
| "loss": 0.5344, |
| "step": 4185 |
| }, |
| { |
| "epoch": 2.69076756409742, |
| "grad_norm": 1.1640625, |
| "learning_rate": 2.698263929692285e-05, |
| "loss": 0.5482, |
| "step": 4190 |
| }, |
| { |
| "epoch": 2.693977450547687, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.6942076003919596e-05, |
| "loss": 0.5198, |
| "step": 4195 |
| }, |
| { |
| "epoch": 2.6971873369979535, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.6901920925843338e-05, |
| "loss": 0.5366, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.7003972234482205, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.6862174523685618e-05, |
| "loss": 0.5151, |
| "step": 4205 |
| }, |
| { |
| "epoch": 2.7036071098984875, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.6822837253746258e-05, |
| "loss": 0.5174, |
| "step": 4210 |
| }, |
| { |
| "epoch": 2.706816996348754, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.6783909567628153e-05, |
| "loss": 0.5391, |
| "step": 4215 |
| }, |
| { |
| "epoch": 2.710026882799021, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.674539191223202e-05, |
| "loss": 0.5445, |
| "step": 4220 |
| }, |
| { |
| "epoch": 2.7132367692492876, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.6707284729751346e-05, |
| "loss": 0.5197, |
| "step": 4225 |
| }, |
| { |
| "epoch": 2.7164466556995546, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.666958845766726e-05, |
| "loss": 0.5375, |
| "step": 4230 |
| }, |
| { |
| "epoch": 2.7196565421498216, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.663230352874352e-05, |
| "loss": 0.5285, |
| "step": 4235 |
| }, |
| { |
| "epoch": 2.7228664286000885, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.659543037102154e-05, |
| "loss": 0.5429, |
| "step": 4240 |
| }, |
| { |
| "epoch": 2.726076315050355, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.6558969407815525e-05, |
| "loss": 0.5288, |
| "step": 4245 |
| }, |
| { |
| "epoch": 2.729286201500622, |
| "grad_norm": 1.265625, |
| "learning_rate": 2.652292105770753e-05, |
| "loss": 0.527, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.7324960879508886, |
| "grad_norm": 1.1484375, |
| "learning_rate": 2.648728573454271e-05, |
| "loss": 0.5219, |
| "step": 4255 |
| }, |
| { |
| "epoch": 2.7357059744011556, |
| "grad_norm": 1.2890625, |
| "learning_rate": 2.6452063847424564e-05, |
| "loss": 0.5412, |
| "step": 4260 |
| }, |
| { |
| "epoch": 2.7389158608514226, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.6417255800710215e-05, |
| "loss": 0.5495, |
| "step": 4265 |
| }, |
| { |
| "epoch": 2.742125747301689, |
| "grad_norm": 1.3671875, |
| "learning_rate": 2.6382861994005792e-05, |
| "loss": 0.5353, |
| "step": 4270 |
| }, |
| { |
| "epoch": 2.745335633751956, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.6348882822161826e-05, |
| "loss": 0.5386, |
| "step": 4275 |
| }, |
| { |
| "epoch": 2.7485455202022226, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.6315318675268724e-05, |
| "loss": 0.55, |
| "step": 4280 |
| }, |
| { |
| "epoch": 2.7517554066524896, |
| "grad_norm": 1.25, |
| "learning_rate": 2.6282169938652306e-05, |
| "loss": 0.5401, |
| "step": 4285 |
| }, |
| { |
| "epoch": 2.7549652931027566, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.6249436992869342e-05, |
| "loss": 0.5289, |
| "step": 4290 |
| }, |
| { |
| "epoch": 2.758175179553023, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.6217120213703222e-05, |
| "loss": 0.541, |
| "step": 4295 |
| }, |
| { |
| "epoch": 2.76138506600329, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.6185219972159626e-05, |
| "loss": 0.5263, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.7645949524535567, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.6153736634462252e-05, |
| "loss": 0.5247, |
| "step": 4305 |
| }, |
| { |
| "epoch": 2.7678048389038237, |
| "grad_norm": 1.1640625, |
| "learning_rate": 2.6122670562048645e-05, |
| "loss": 0.5476, |
| "step": 4310 |
| }, |
| { |
| "epoch": 2.7710147253540907, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.6092022111566007e-05, |
| "loss": 0.5246, |
| "step": 4315 |
| }, |
| { |
| "epoch": 2.7742246118043576, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.6061791634867146e-05, |
| "loss": 0.5191, |
| "step": 4320 |
| }, |
| { |
| "epoch": 2.777434498254624, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.6031979479006395e-05, |
| "loss": 0.5341, |
| "step": 4325 |
| }, |
| { |
| "epoch": 2.780644384704891, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.6002585986235656e-05, |
| "loss": 0.5375, |
| "step": 4330 |
| }, |
| { |
| "epoch": 2.7838542711551577, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.5973611494000462e-05, |
| "loss": 0.5502, |
| "step": 4335 |
| }, |
| { |
| "epoch": 2.7870641576054247, |
| "grad_norm": 1.375, |
| "learning_rate": 2.5945056334936092e-05, |
| "loss": 0.5263, |
| "step": 4340 |
| }, |
| { |
| "epoch": 2.7902740440556917, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.5916920836863772e-05, |
| "loss": 0.5388, |
| "step": 4345 |
| }, |
| { |
| "epoch": 2.7934839305059582, |
| "grad_norm": 1.390625, |
| "learning_rate": 2.58892053227869e-05, |
| "loss": 0.5378, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.796693816956225, |
| "grad_norm": 1.2890625, |
| "learning_rate": 2.5861910110887344e-05, |
| "loss": 0.5333, |
| "step": 4355 |
| }, |
| { |
| "epoch": 2.7999037034064918, |
| "grad_norm": 1.1484375, |
| "learning_rate": 2.5835035514521776e-05, |
| "loss": 0.5295, |
| "step": 4360 |
| }, |
| { |
| "epoch": 2.8031135898567587, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.58085818422181e-05, |
| "loss": 0.5308, |
| "step": 4365 |
| }, |
| { |
| "epoch": 2.8063234763070257, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.5782549397671872e-05, |
| "loss": 0.5339, |
| "step": 4370 |
| }, |
| { |
| "epoch": 2.8095333627572927, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.575693847974286e-05, |
| "loss": 0.543, |
| "step": 4375 |
| }, |
| { |
| "epoch": 2.8127432492075592, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.5731749382451565e-05, |
| "loss": 0.5417, |
| "step": 4380 |
| }, |
| { |
| "epoch": 2.8159531356578262, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.5706982394975875e-05, |
| "loss": 0.5473, |
| "step": 4385 |
| }, |
| { |
| "epoch": 2.8191630221080928, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.568263780164775e-05, |
| "loss": 0.536, |
| "step": 4390 |
| }, |
| { |
| "epoch": 2.8223729085583598, |
| "grad_norm": 1.3125, |
| "learning_rate": 2.5658715881949946e-05, |
| "loss": 0.5271, |
| "step": 4395 |
| }, |
| { |
| "epoch": 2.8255827950086267, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.5635216910512793e-05, |
| "loss": 0.5437, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.8287926814588933, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.561214115711107e-05, |
| "loss": 0.5294, |
| "step": 4405 |
| }, |
| { |
| "epoch": 2.8320025679091603, |
| "grad_norm": 1.3046875, |
| "learning_rate": 2.558948888666088e-05, |
| "loss": 0.5353, |
| "step": 4410 |
| }, |
| { |
| "epoch": 2.835212454359427, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.556726035921665e-05, |
| "loss": 0.544, |
| "step": 4415 |
| }, |
| { |
| "epoch": 2.838422340809694, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.5545455829968078e-05, |
| "loss": 0.5282, |
| "step": 4420 |
| }, |
| { |
| "epoch": 2.841632227259961, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.552407554923729e-05, |
| "loss": 0.5423, |
| "step": 4425 |
| }, |
| { |
| "epoch": 2.8448421137102278, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.550311976247588e-05, |
| "loss": 0.5348, |
| "step": 4430 |
| }, |
| { |
| "epoch": 2.8480520001604943, |
| "grad_norm": 1.25, |
| "learning_rate": 2.548258871026216e-05, |
| "loss": 0.5591, |
| "step": 4435 |
| }, |
| { |
| "epoch": 2.8512618866107613, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.5462482628298357e-05, |
| "loss": 0.5325, |
| "step": 4440 |
| }, |
| { |
| "epoch": 2.854471773061028, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.544280174740792e-05, |
| "loss": 0.534, |
| "step": 4445 |
| }, |
| { |
| "epoch": 2.857681659511295, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.542354629353288e-05, |
| "loss": 0.534, |
| "step": 4450 |
| }, |
| { |
| "epoch": 2.860891545961562, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.540471648773124e-05, |
| "loss": 0.5599, |
| "step": 4455 |
| }, |
| { |
| "epoch": 2.8641014324118284, |
| "grad_norm": 1.375, |
| "learning_rate": 2.5386312546174434e-05, |
| "loss": 0.5492, |
| "step": 4460 |
| }, |
| { |
| "epoch": 2.8673113188620953, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.5368334680144884e-05, |
| "loss": 0.5301, |
| "step": 4465 |
| }, |
| { |
| "epoch": 2.870521205312362, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.535078309603351e-05, |
| "loss": 0.5193, |
| "step": 4470 |
| }, |
| { |
| "epoch": 2.873731091762629, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.5333657995337422e-05, |
| "loss": 0.5296, |
| "step": 4475 |
| }, |
| { |
| "epoch": 2.876940978212896, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.5316959574657583e-05, |
| "loss": 0.5139, |
| "step": 4480 |
| }, |
| { |
| "epoch": 2.8801508646631624, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.5300688025696517e-05, |
| "loss": 0.5349, |
| "step": 4485 |
| }, |
| { |
| "epoch": 2.8833607511134294, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.5284843535256182e-05, |
| "loss": 0.5442, |
| "step": 4490 |
| }, |
| { |
| "epoch": 2.886570637563696, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.5269426285235753e-05, |
| "loss": 0.5328, |
| "step": 4495 |
| }, |
| { |
| "epoch": 2.889780524013963, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.5254436452629594e-05, |
| "loss": 0.5126, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.889780524013963, |
| "eval_loss": 0.4651297628879547, |
| "eval_runtime": 2.403, |
| "eval_samples_per_second": 83.23, |
| "eval_steps_per_second": 83.23, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.89299041046423, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.523987420952516e-05, |
| "loss": 0.5352, |
| "step": 4505 |
| }, |
| { |
| "epoch": 2.896200296914497, |
| "grad_norm": 1.1484375, |
| "learning_rate": 2.5225739723101105e-05, |
| "loss": 0.5321, |
| "step": 4510 |
| }, |
| { |
| "epoch": 2.8994101833647634, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.521203315562528e-05, |
| "loss": 0.5323, |
| "step": 4515 |
| }, |
| { |
| "epoch": 2.9026200698150304, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.5198754664452913e-05, |
| "loss": 0.5468, |
| "step": 4520 |
| }, |
| { |
| "epoch": 2.905829956265297, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.5185904402024808e-05, |
| "loss": 0.53, |
| "step": 4525 |
| }, |
| { |
| "epoch": 2.909039842715564, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.5173482515865582e-05, |
| "loss": 0.5181, |
| "step": 4530 |
| }, |
| { |
| "epoch": 2.912249729165831, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.5161489148581962e-05, |
| "loss": 0.5294, |
| "step": 4535 |
| }, |
| { |
| "epoch": 2.9154596156160975, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.514992443786116e-05, |
| "loss": 0.5339, |
| "step": 4540 |
| }, |
| { |
| "epoch": 2.9186695020663644, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.51387885164693e-05, |
| "loss": 0.5416, |
| "step": 4545 |
| }, |
| { |
| "epoch": 2.921879388516631, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.512808151224988e-05, |
| "loss": 0.546, |
| "step": 4550 |
| }, |
| { |
| "epoch": 2.925089274966898, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.5117803548122305e-05, |
| "loss": 0.552, |
| "step": 4555 |
| }, |
| { |
| "epoch": 2.928299161417165, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.510795474208048e-05, |
| "loss": 0.5195, |
| "step": 4560 |
| }, |
| { |
| "epoch": 2.931509047867432, |
| "grad_norm": 1.1640625, |
| "learning_rate": 2.5098535207191458e-05, |
| "loss": 0.5446, |
| "step": 4565 |
| }, |
| { |
| "epoch": 2.9347189343176985, |
| "grad_norm": 1.125, |
| "learning_rate": 2.5089545051594136e-05, |
| "loss": 0.5417, |
| "step": 4570 |
| }, |
| { |
| "epoch": 2.9379288207679655, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.5080984378498023e-05, |
| "loss": 0.5301, |
| "step": 4575 |
| }, |
| { |
| "epoch": 2.941138707218232, |
| "grad_norm": 1.3203125, |
| "learning_rate": 2.507285328618204e-05, |
| "loss": 0.5464, |
| "step": 4580 |
| }, |
| { |
| "epoch": 2.944348593668499, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.506515186799341e-05, |
| "loss": 0.5348, |
| "step": 4585 |
| }, |
| { |
| "epoch": 2.947558480118766, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.5057880212346564e-05, |
| "loss": 0.5296, |
| "step": 4590 |
| }, |
| { |
| "epoch": 2.9507683665690325, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.505103840272215e-05, |
| "loss": 0.5267, |
| "step": 4595 |
| }, |
| { |
| "epoch": 2.9539782530192995, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.5044626517666054e-05, |
| "loss": 0.5286, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.957188139469566, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.5038644630788517e-05, |
| "loss": 0.5401, |
| "step": 4605 |
| }, |
| { |
| "epoch": 2.960398025919833, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.5033092810763275e-05, |
| "loss": 0.5278, |
| "step": 4610 |
| }, |
| { |
| "epoch": 2.9636079123701, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.5027971121326776e-05, |
| "loss": 0.5218, |
| "step": 4615 |
| }, |
| { |
| "epoch": 2.966817798820367, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.5023279621277444e-05, |
| "loss": 0.5288, |
| "step": 4620 |
| }, |
| { |
| "epoch": 2.9700276852706335, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.5019018364475026e-05, |
| "loss": 0.5382, |
| "step": 4625 |
| }, |
| { |
| "epoch": 2.9732375717209005, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.5015187399839936e-05, |
| "loss": 0.5431, |
| "step": 4630 |
| }, |
| { |
| "epoch": 2.976447458171167, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.501178677135272e-05, |
| "loss": 0.5417, |
| "step": 4635 |
| }, |
| { |
| "epoch": 2.979657344621434, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.5008816518053547e-05, |
| "loss": 0.5141, |
| "step": 4640 |
| }, |
| { |
| "epoch": 2.982867231071701, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.500627667404176e-05, |
| "loss": 0.5438, |
| "step": 4645 |
| }, |
| { |
| "epoch": 2.9860771175219676, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.5004167268475475e-05, |
| "loss": 0.5386, |
| "step": 4650 |
| }, |
| { |
| "epoch": 2.9892870039722346, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.500248832557126e-05, |
| "loss": 0.5358, |
| "step": 4655 |
| }, |
| { |
| "epoch": 2.992496890422501, |
| "grad_norm": 1.21875, |
| "learning_rate": 2.5001239864603847e-05, |
| "loss": 0.5446, |
| "step": 4660 |
| }, |
| { |
| "epoch": 2.995706776872768, |
| "grad_norm": 1.1640625, |
| "learning_rate": 2.500042189990593e-05, |
| "loss": 0.5492, |
| "step": 4665 |
| }, |
| { |
| "epoch": 2.998916663323035, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.5000034440867958e-05, |
| "loss": 0.5393, |
| "step": 4670 |
| }, |
| { |
| "epoch": 2.9995586406130883, |
| "eval_loss": 0.4636688232421875, |
| "eval_runtime": 2.4088, |
| "eval_samples_per_second": 83.028, |
| "eval_steps_per_second": 83.028, |
| "step": 4671 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 4671, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.12480186236928e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
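
A minimal sketch (not part of the original log): assuming the table above is the contents of a `trainer_state.json` file saved to disk by the Trainer, the `log_history` entries can be loaded with the standard library and plotted to inspect the train/eval loss curves. The file name, and the use of matplotlib for plotting, are illustrative assumptions rather than anything stated by the log itself.

```python
# Sketch: load the trainer state shown above and plot its loss history.
# Assumes the JSON was saved as "trainer_state.json" in the working directory.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; the periodic evaluation records carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot([e["step"] for e in train_logs],
         [e["loss"] for e in train_logs], label="train loss")
plt.plot([e["step"] for e in eval_logs],
         [e["eval_loss"] for e in eval_logs], marker="o", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()
```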