{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3711, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.199110818936898, "learning_rate": 2.688172043010753e-08, "loss": 1.4014, "step": 1 }, { "epoch": 0.0, "grad_norm": 9.989882847960846, "learning_rate": 1.3440860215053764e-07, "loss": 1.3859, "step": 5 }, { "epoch": 0.01, "grad_norm": 9.747066516747724, "learning_rate": 2.688172043010753e-07, "loss": 1.3825, "step": 10 }, { "epoch": 0.01, "grad_norm": 9.384052273650843, "learning_rate": 4.032258064516129e-07, "loss": 1.3849, "step": 15 }, { "epoch": 0.02, "grad_norm": 8.217416051519756, "learning_rate": 5.376344086021506e-07, "loss": 1.344, "step": 20 }, { "epoch": 0.02, "grad_norm": 7.775033382898672, "learning_rate": 6.720430107526882e-07, "loss": 1.3045, "step": 25 }, { "epoch": 0.02, "grad_norm": 5.859730788492799, "learning_rate": 8.064516129032258e-07, "loss": 1.1789, "step": 30 }, { "epoch": 0.03, "grad_norm": 4.5037768296284275, "learning_rate": 9.408602150537635e-07, "loss": 1.1216, "step": 35 }, { "epoch": 0.03, "grad_norm": 3.9112344357897246, "learning_rate": 1.0752688172043011e-06, "loss": 0.9737, "step": 40 }, { "epoch": 0.04, "grad_norm": 2.4871545798077372, "learning_rate": 1.2096774193548388e-06, "loss": 0.9145, "step": 45 }, { "epoch": 0.04, "grad_norm": 2.1000135895884755, "learning_rate": 1.3440860215053765e-06, "loss": 0.8803, "step": 50 }, { "epoch": 0.04, "grad_norm": 1.793127732263159, "learning_rate": 1.4784946236559141e-06, "loss": 0.8556, "step": 55 }, { "epoch": 0.05, "grad_norm": 1.1890104490396676, "learning_rate": 1.6129032258064516e-06, "loss": 0.8304, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.8602215318991128, "learning_rate": 1.7473118279569895e-06, "loss": 0.812, "step": 65 }, { "epoch": 0.06, "grad_norm": 0.6745735865168175, "learning_rate": 1.881720430107527e-06, "loss": 0.7952, "step": 70 }, { "epoch": 0.06, "grad_norm": 0.5380388650340882, "learning_rate": 2.0161290322580646e-06, "loss": 0.7932, "step": 75 }, { "epoch": 0.06, "grad_norm": 0.46811390833009625, "learning_rate": 2.1505376344086023e-06, "loss": 0.7887, "step": 80 }, { "epoch": 0.07, "grad_norm": 0.44008888635031934, "learning_rate": 2.28494623655914e-06, "loss": 0.7837, "step": 85 }, { "epoch": 0.07, "grad_norm": 0.4311119192890121, "learning_rate": 2.4193548387096776e-06, "loss": 0.7824, "step": 90 }, { "epoch": 0.08, "grad_norm": 0.4127078938414022, "learning_rate": 2.5537634408602153e-06, "loss": 0.7764, "step": 95 }, { "epoch": 0.08, "grad_norm": 0.4172098342493398, "learning_rate": 2.688172043010753e-06, "loss": 0.7702, "step": 100 }, { "epoch": 0.08, "grad_norm": 0.40651384646032246, "learning_rate": 2.822580645161291e-06, "loss": 0.7663, "step": 105 }, { "epoch": 0.09, "grad_norm": 0.3873970372029015, "learning_rate": 2.9569892473118283e-06, "loss": 0.7657, "step": 110 }, { "epoch": 0.09, "grad_norm": 0.38714894596269434, "learning_rate": 3.091397849462366e-06, "loss": 0.7626, "step": 115 }, { "epoch": 0.1, "grad_norm": 0.3842442460374844, "learning_rate": 3.225806451612903e-06, "loss": 0.7566, "step": 120 }, { "epoch": 0.1, "grad_norm": 0.37597550079933006, "learning_rate": 3.360215053763441e-06, "loss": 0.7539, "step": 125 }, { "epoch": 0.11, "grad_norm": 0.38331613185747304, "learning_rate": 3.494623655913979e-06, "loss": 0.7525, "step": 130 }, { "epoch": 0.11, "grad_norm": 0.37340334580174317, "learning_rate": 3.6290322580645166e-06, "loss": 0.7499, "step": 135 }, { "epoch": 0.11, "grad_norm": 0.37401777084114646, "learning_rate": 3.763440860215054e-06, "loss": 0.7511, "step": 140 }, { "epoch": 0.12, "grad_norm": 0.380882896168636, "learning_rate": 3.8978494623655915e-06, "loss": 0.7472, "step": 145 }, { "epoch": 0.12, "grad_norm": 0.37762040250094286, "learning_rate": 4.032258064516129e-06, "loss": 0.7429, "step": 150 }, { "epoch": 0.13, "grad_norm": 0.3645960352231046, "learning_rate": 4.166666666666667e-06, "loss": 0.7433, "step": 155 }, { "epoch": 0.13, "grad_norm": 0.3622586183025161, "learning_rate": 4.3010752688172045e-06, "loss": 0.7389, "step": 160 }, { "epoch": 0.13, "grad_norm": 0.3708144125946508, "learning_rate": 4.435483870967742e-06, "loss": 0.7421, "step": 165 }, { "epoch": 0.14, "grad_norm": 0.3800010658344533, "learning_rate": 4.56989247311828e-06, "loss": 0.738, "step": 170 }, { "epoch": 0.14, "grad_norm": 0.3723426038832112, "learning_rate": 4.7043010752688175e-06, "loss": 0.7379, "step": 175 }, { "epoch": 0.15, "grad_norm": 0.35966330774763483, "learning_rate": 4.838709677419355e-06, "loss": 0.7347, "step": 180 }, { "epoch": 0.15, "grad_norm": 0.35263251355032904, "learning_rate": 4.973118279569893e-06, "loss": 0.7357, "step": 185 }, { "epoch": 0.15, "grad_norm": 0.3663915182296496, "learning_rate": 5.1075268817204305e-06, "loss": 0.732, "step": 190 }, { "epoch": 0.16, "grad_norm": 0.36399529329830316, "learning_rate": 5.241935483870968e-06, "loss": 0.728, "step": 195 }, { "epoch": 0.16, "grad_norm": 0.3628053235769218, "learning_rate": 5.376344086021506e-06, "loss": 0.7264, "step": 200 }, { "epoch": 0.17, "grad_norm": 0.36494371864402037, "learning_rate": 5.510752688172043e-06, "loss": 0.7273, "step": 205 }, { "epoch": 0.17, "grad_norm": 0.3668928103811396, "learning_rate": 5.645161290322582e-06, "loss": 0.7228, "step": 210 }, { "epoch": 0.17, "grad_norm": 0.39394554326463743, "learning_rate": 5.779569892473119e-06, "loss": 0.7289, "step": 215 }, { "epoch": 0.18, "grad_norm": 0.35626080279969996, "learning_rate": 5.9139784946236566e-06, "loss": 0.7196, "step": 220 }, { "epoch": 0.18, "grad_norm": 0.36412637691787236, "learning_rate": 6.048387096774194e-06, "loss": 0.7159, "step": 225 }, { "epoch": 0.19, "grad_norm": 0.3702086470046273, "learning_rate": 6.182795698924732e-06, "loss": 0.7224, "step": 230 }, { "epoch": 0.19, "grad_norm": 0.48391080582336166, "learning_rate": 6.3172043010752696e-06, "loss": 0.7169, "step": 235 }, { "epoch": 0.19, "grad_norm": 0.3596882573452128, "learning_rate": 6.451612903225806e-06, "loss": 0.715, "step": 240 }, { "epoch": 0.2, "grad_norm": 0.35860023152829335, "learning_rate": 6.586021505376344e-06, "loss": 0.7162, "step": 245 }, { "epoch": 0.2, "grad_norm": 0.3628478104137735, "learning_rate": 6.720430107526882e-06, "loss": 0.7103, "step": 250 }, { "epoch": 0.21, "grad_norm": 0.3652499373094438, "learning_rate": 6.854838709677419e-06, "loss": 0.7159, "step": 255 }, { "epoch": 0.21, "grad_norm": 0.3662100716752448, "learning_rate": 6.989247311827958e-06, "loss": 0.7162, "step": 260 }, { "epoch": 0.21, "grad_norm": 0.37536119684678626, "learning_rate": 7.1236559139784956e-06, "loss": 0.715, "step": 265 }, { "epoch": 0.22, "grad_norm": 0.348328590947327, "learning_rate": 7.258064516129033e-06, "loss": 0.7116, "step": 270 }, { "epoch": 0.22, "grad_norm": 0.37523231929759776, "learning_rate": 7.392473118279571e-06, "loss": 0.7112, "step": 275 }, { "epoch": 0.23, "grad_norm": 0.38242371203620945, "learning_rate": 7.526881720430108e-06, "loss": 0.7044, "step": 280 }, { "epoch": 0.23, "grad_norm": 0.3750131944581939, "learning_rate": 7.661290322580646e-06, "loss": 0.7147, "step": 285 }, { "epoch": 0.23, "grad_norm": 0.6159762445912138, "learning_rate": 7.795698924731183e-06, "loss": 0.7074, "step": 290 }, { "epoch": 0.24, "grad_norm": 0.35924098414728584, "learning_rate": 7.93010752688172e-06, "loss": 0.7038, "step": 295 }, { "epoch": 0.24, "grad_norm": 0.3783780796519916, "learning_rate": 8.064516129032258e-06, "loss": 0.6997, "step": 300 }, { "epoch": 0.25, "grad_norm": 0.36792413754340864, "learning_rate": 8.198924731182797e-06, "loss": 0.7046, "step": 305 }, { "epoch": 0.25, "grad_norm": 0.39374647540478763, "learning_rate": 8.333333333333334e-06, "loss": 0.6963, "step": 310 }, { "epoch": 0.25, "grad_norm": 0.3577112349933175, "learning_rate": 8.467741935483872e-06, "loss": 0.6968, "step": 315 }, { "epoch": 0.26, "grad_norm": 0.36008102131369457, "learning_rate": 8.602150537634409e-06, "loss": 0.6979, "step": 320 }, { "epoch": 0.26, "grad_norm": 0.37405758129417915, "learning_rate": 8.736559139784948e-06, "loss": 0.7011, "step": 325 }, { "epoch": 0.27, "grad_norm": 0.384442291139764, "learning_rate": 8.870967741935484e-06, "loss": 0.6998, "step": 330 }, { "epoch": 0.27, "grad_norm": 0.36611354602190427, "learning_rate": 9.005376344086021e-06, "loss": 0.6972, "step": 335 }, { "epoch": 0.27, "grad_norm": 0.3701683011176801, "learning_rate": 9.13978494623656e-06, "loss": 0.6986, "step": 340 }, { "epoch": 0.28, "grad_norm": 0.35936659120843445, "learning_rate": 9.274193548387097e-06, "loss": 0.6939, "step": 345 }, { "epoch": 0.28, "grad_norm": 0.35499687307239947, "learning_rate": 9.408602150537635e-06, "loss": 0.6951, "step": 350 }, { "epoch": 0.29, "grad_norm": 0.3713492586179027, "learning_rate": 9.543010752688174e-06, "loss": 0.6921, "step": 355 }, { "epoch": 0.29, "grad_norm": 0.37661352499625905, "learning_rate": 9.67741935483871e-06, "loss": 0.6888, "step": 360 }, { "epoch": 0.3, "grad_norm": 0.37007420023678395, "learning_rate": 9.811827956989249e-06, "loss": 0.6987, "step": 365 }, { "epoch": 0.3, "grad_norm": 0.40966457136165624, "learning_rate": 9.946236559139786e-06, "loss": 0.6946, "step": 370 }, { "epoch": 0.3, "grad_norm": 0.3969484873326542, "learning_rate": 9.999980081843653e-06, "loss": 0.6919, "step": 375 }, { "epoch": 0.31, "grad_norm": 0.3952499582851307, "learning_rate": 9.999858360351776e-06, "loss": 0.6884, "step": 380 }, { "epoch": 0.31, "grad_norm": 0.4443527376278083, "learning_rate": 9.999625985701018e-06, "loss": 0.692, "step": 385 }, { "epoch": 0.32, "grad_norm": 0.4055375266413501, "learning_rate": 9.999282963034126e-06, "loss": 0.6803, "step": 390 }, { "epoch": 0.32, "grad_norm": 0.3740033940526823, "learning_rate": 9.998829299942623e-06, "loss": 0.6874, "step": 395 }, { "epoch": 0.32, "grad_norm": 0.4171063833372501, "learning_rate": 9.998265006466642e-06, "loss": 0.6853, "step": 400 }, { "epoch": 0.33, "grad_norm": 0.3704166014565844, "learning_rate": 9.997590095094712e-06, "loss": 0.6824, "step": 405 }, { "epoch": 0.33, "grad_norm": 0.37457308170926557, "learning_rate": 9.996804580763466e-06, "loss": 0.6823, "step": 410 }, { "epoch": 0.34, "grad_norm": 0.3555974679739251, "learning_rate": 9.99590848085733e-06, "loss": 0.68, "step": 415 }, { "epoch": 0.34, "grad_norm": 0.3658074710512278, "learning_rate": 9.99490181520812e-06, "loss": 0.6834, "step": 420 }, { "epoch": 0.34, "grad_norm": 0.37720132194010725, "learning_rate": 9.993784606094612e-06, "loss": 0.6843, "step": 425 }, { "epoch": 0.35, "grad_norm": 0.35365926282788546, "learning_rate": 9.992556878242055e-06, "loss": 0.6842, "step": 430 }, { "epoch": 0.35, "grad_norm": 0.3647832413674546, "learning_rate": 9.991218658821609e-06, "loss": 0.6827, "step": 435 }, { "epoch": 0.36, "grad_norm": 0.38754899812853516, "learning_rate": 9.989769977449752e-06, "loss": 0.674, "step": 440 }, { "epoch": 0.36, "grad_norm": 0.3602440303135335, "learning_rate": 9.98821086618763e-06, "loss": 0.6816, "step": 445 }, { "epoch": 0.36, "grad_norm": 0.4101690297120354, "learning_rate": 9.986541359540337e-06, "loss": 0.6833, "step": 450 }, { "epoch": 0.37, "grad_norm": 0.3962342607497507, "learning_rate": 9.984761494456155e-06, "loss": 0.6878, "step": 455 }, { "epoch": 0.37, "grad_norm": 0.34738234117858896, "learning_rate": 9.982871310325738e-06, "loss": 0.6736, "step": 460 }, { "epoch": 0.38, "grad_norm": 0.3516239459380077, "learning_rate": 9.98087084898124e-06, "loss": 0.6804, "step": 465 }, { "epoch": 0.38, "grad_norm": 0.3820795893422941, "learning_rate": 9.978760154695393e-06, "loss": 0.6763, "step": 470 }, { "epoch": 0.38, "grad_norm": 0.39963700199954716, "learning_rate": 9.976539274180515e-06, "loss": 0.6781, "step": 475 }, { "epoch": 0.39, "grad_norm": 0.3618397789523355, "learning_rate": 9.974208256587488e-06, "loss": 0.6738, "step": 480 }, { "epoch": 0.39, "grad_norm": 0.3860711991204987, "learning_rate": 9.971767153504662e-06, "loss": 0.6742, "step": 485 }, { "epoch": 0.4, "grad_norm": 0.35444028710533076, "learning_rate": 9.969216018956726e-06, "loss": 0.6791, "step": 490 }, { "epoch": 0.4, "grad_norm": 0.35921023788079653, "learning_rate": 9.966554909403495e-06, "loss": 0.6679, "step": 495 }, { "epoch": 0.4, "grad_norm": 0.35994546203864347, "learning_rate": 9.963783883738674e-06, "loss": 0.6767, "step": 500 }, { "epoch": 0.41, "grad_norm": 0.4338089504928549, "learning_rate": 9.960903003288551e-06, "loss": 0.6743, "step": 505 }, { "epoch": 0.41, "grad_norm": 0.4788314066419054, "learning_rate": 9.957912331810633e-06, "loss": 0.6765, "step": 510 }, { "epoch": 0.42, "grad_norm": 0.3610678995239796, "learning_rate": 9.954811935492249e-06, "loss": 0.6757, "step": 515 }, { "epoch": 0.42, "grad_norm": 0.3621646666026914, "learning_rate": 9.951601882949064e-06, "loss": 0.6784, "step": 520 }, { "epoch": 0.42, "grad_norm": 0.37655643619681534, "learning_rate": 9.94828224522359e-06, "loss": 0.6767, "step": 525 }, { "epoch": 0.43, "grad_norm": 0.3746684397156581, "learning_rate": 9.944853095783583e-06, "loss": 0.6701, "step": 530 }, { "epoch": 0.43, "grad_norm": 0.35960034975122823, "learning_rate": 9.94131451052044e-06, "loss": 0.6663, "step": 535 }, { "epoch": 0.44, "grad_norm": 0.3665490393357652, "learning_rate": 9.9376665677475e-06, "loss": 0.6681, "step": 540 }, { "epoch": 0.44, "grad_norm": 0.3656536234628412, "learning_rate": 9.933909348198339e-06, "loss": 0.6809, "step": 545 }, { "epoch": 0.44, "grad_norm": 0.35965008993053127, "learning_rate": 9.930042935024945e-06, "loss": 0.6711, "step": 550 }, { "epoch": 0.45, "grad_norm": 0.3486755293786901, "learning_rate": 9.926067413795914e-06, "loss": 0.6776, "step": 555 }, { "epoch": 0.45, "grad_norm": 0.3604743842077875, "learning_rate": 9.921982872494534e-06, "loss": 0.6751, "step": 560 }, { "epoch": 0.46, "grad_norm": 0.388341712779346, "learning_rate": 9.917789401516843e-06, "loss": 0.6714, "step": 565 }, { "epoch": 0.46, "grad_norm": 0.3831088958933713, "learning_rate": 9.913487093669633e-06, "loss": 0.67, "step": 570 }, { "epoch": 0.46, "grad_norm": 0.35216661085299206, "learning_rate": 9.909076044168393e-06, "loss": 0.6692, "step": 575 }, { "epoch": 0.47, "grad_norm": 0.348669001195607, "learning_rate": 9.9045563506352e-06, "loss": 0.6707, "step": 580 }, { "epoch": 0.47, "grad_norm": 0.34052401845766106, "learning_rate": 9.899928113096562e-06, "loss": 0.6661, "step": 585 }, { "epoch": 0.48, "grad_norm": 0.352477912022679, "learning_rate": 9.8951914339812e-06, "loss": 0.6691, "step": 590 }, { "epoch": 0.48, "grad_norm": 0.3392772590600422, "learning_rate": 9.890346418117784e-06, "loss": 0.6679, "step": 595 }, { "epoch": 0.49, "grad_norm": 0.3442231921488033, "learning_rate": 9.885393172732615e-06, "loss": 0.6656, "step": 600 }, { "epoch": 0.49, "grad_norm": 0.3837943347013059, "learning_rate": 9.880331807447248e-06, "loss": 0.6691, "step": 605 }, { "epoch": 0.49, "grad_norm": 0.34921430608490944, "learning_rate": 9.875162434276066e-06, "loss": 0.661, "step": 610 }, { "epoch": 0.5, "grad_norm": 0.3401060995316854, "learning_rate": 9.869885167623808e-06, "loss": 0.6636, "step": 615 }, { "epoch": 0.5, "grad_norm": 0.3595374615479717, "learning_rate": 9.864500124283024e-06, "loss": 0.6692, "step": 620 }, { "epoch": 0.51, "grad_norm": 0.35108442288203384, "learning_rate": 9.859007423431502e-06, "loss": 0.6662, "step": 625 }, { "epoch": 0.51, "grad_norm": 0.3634144966287671, "learning_rate": 9.853407186629626e-06, "loss": 0.6617, "step": 630 }, { "epoch": 0.51, "grad_norm": 0.3629403894982744, "learning_rate": 9.847699537817686e-06, "loss": 0.6669, "step": 635 }, { "epoch": 0.52, "grad_norm": 0.3465721461723826, "learning_rate": 9.841884603313133e-06, "loss": 0.6651, "step": 640 }, { "epoch": 0.52, "grad_norm": 0.35776987111346503, "learning_rate": 9.835962511807786e-06, "loss": 0.6644, "step": 645 }, { "epoch": 0.53, "grad_norm": 0.3473984069791367, "learning_rate": 9.829933394364989e-06, "loss": 0.6687, "step": 650 }, { "epoch": 0.53, "grad_norm": 0.34984520913669104, "learning_rate": 9.823797384416693e-06, "loss": 0.6653, "step": 655 }, { "epoch": 0.53, "grad_norm": 0.3881248625819974, "learning_rate": 9.817554617760529e-06, "loss": 0.6659, "step": 660 }, { "epoch": 0.54, "grad_norm": 0.3610235282185538, "learning_rate": 9.811205232556776e-06, "loss": 0.6656, "step": 665 }, { "epoch": 0.54, "grad_norm": 0.35310942198925005, "learning_rate": 9.804749369325322e-06, "loss": 0.665, "step": 670 }, { "epoch": 0.55, "grad_norm": 0.34221786900401696, "learning_rate": 9.798187170942546e-06, "loss": 0.6638, "step": 675 }, { "epoch": 0.55, "grad_norm": 0.34938476225455084, "learning_rate": 9.791518782638157e-06, "loss": 0.6653, "step": 680 }, { "epoch": 0.55, "grad_norm": 0.3368312787326202, "learning_rate": 9.784744351991985e-06, "loss": 0.6601, "step": 685 }, { "epoch": 0.56, "grad_norm": 0.3436822017037321, "learning_rate": 9.777864028930705e-06, "loss": 0.6642, "step": 690 }, { "epoch": 0.56, "grad_norm": 0.41751718623483947, "learning_rate": 9.770877965724525e-06, "loss": 0.6559, "step": 695 }, { "epoch": 0.57, "grad_norm": 0.3687415900231575, "learning_rate": 9.763786316983821e-06, "loss": 0.6575, "step": 700 }, { "epoch": 0.57, "grad_norm": 0.3501054522366654, "learning_rate": 9.756589239655705e-06, "loss": 0.6628, "step": 705 }, { "epoch": 0.57, "grad_norm": 0.34752871041065136, "learning_rate": 9.749286893020555e-06, "loss": 0.6601, "step": 710 }, { "epoch": 0.58, "grad_norm": 0.3931473327210209, "learning_rate": 9.741879438688495e-06, "loss": 0.6644, "step": 715 }, { "epoch": 0.58, "grad_norm": 0.3690748686330639, "learning_rate": 9.734367040595813e-06, "loss": 0.6584, "step": 720 }, { "epoch": 0.59, "grad_norm": 0.34447098474373605, "learning_rate": 9.726749865001337e-06, "loss": 0.6619, "step": 725 }, { "epoch": 0.59, "grad_norm": 0.391068980298796, "learning_rate": 9.719028080482746e-06, "loss": 0.6628, "step": 730 }, { "epoch": 0.59, "grad_norm": 0.41854253031252836, "learning_rate": 9.711201857932854e-06, "loss": 0.6639, "step": 735 }, { "epoch": 0.6, "grad_norm": 0.3628510904407675, "learning_rate": 9.703271370555818e-06, "loss": 0.6651, "step": 740 }, { "epoch": 0.6, "grad_norm": 0.3590532658316531, "learning_rate": 9.695236793863303e-06, "loss": 0.6629, "step": 745 }, { "epoch": 0.61, "grad_norm": 0.39193190760810226, "learning_rate": 9.687098305670606e-06, "loss": 0.657, "step": 750 }, { "epoch": 0.61, "grad_norm": 0.35312142633675936, "learning_rate": 9.678856086092714e-06, "loss": 0.6571, "step": 755 }, { "epoch": 0.61, "grad_norm": 0.33684100353341007, "learning_rate": 9.67051031754032e-06, "loss": 0.655, "step": 760 }, { "epoch": 0.62, "grad_norm": 0.3566134373195649, "learning_rate": 9.66206118471579e-06, "loss": 0.6587, "step": 765 }, { "epoch": 0.62, "grad_norm": 0.36109420760686667, "learning_rate": 9.653508874609066e-06, "loss": 0.6594, "step": 770 }, { "epoch": 0.63, "grad_norm": 0.3558119215011896, "learning_rate": 9.644853576493536e-06, "loss": 0.6581, "step": 775 }, { "epoch": 0.63, "grad_norm": 0.3524042076448574, "learning_rate": 9.63609548192184e-06, "loss": 0.6584, "step": 780 }, { "epoch": 0.63, "grad_norm": 0.33978204140733553, "learning_rate": 9.627234784721637e-06, "loss": 0.6537, "step": 785 }, { "epoch": 0.64, "grad_norm": 0.34892897529330064, "learning_rate": 9.618271680991311e-06, "loss": 0.6595, "step": 790 }, { "epoch": 0.64, "grad_norm": 0.3623130495968145, "learning_rate": 9.609206369095626e-06, "loss": 0.6576, "step": 795 }, { "epoch": 0.65, "grad_norm": 0.3365641848245978, "learning_rate": 9.600039049661345e-06, "loss": 0.6584, "step": 800 }, { "epoch": 0.65, "grad_norm": 0.34851414518760016, "learning_rate": 9.590769925572786e-06, "loss": 0.6559, "step": 805 }, { "epoch": 0.65, "grad_norm": 0.3495874060541229, "learning_rate": 9.581399201967336e-06, "loss": 0.6546, "step": 810 }, { "epoch": 0.66, "grad_norm": 0.3361516211780684, "learning_rate": 9.5719270862309e-06, "loss": 0.6565, "step": 815 }, { "epoch": 0.66, "grad_norm": 0.3366404147081797, "learning_rate": 9.562353787993321e-06, "loss": 0.6581, "step": 820 }, { "epoch": 0.67, "grad_norm": 0.3491818920298391, "learning_rate": 9.552679519123742e-06, "loss": 0.6534, "step": 825 }, { "epoch": 0.67, "grad_norm": 0.3957012445983893, "learning_rate": 9.542904493725909e-06, "loss": 0.652, "step": 830 }, { "epoch": 0.68, "grad_norm": 0.35636903762807237, "learning_rate": 9.533028928133436e-06, "loss": 0.6536, "step": 835 }, { "epoch": 0.68, "grad_norm": 0.363323356883413, "learning_rate": 9.523053040905024e-06, "loss": 0.6496, "step": 840 }, { "epoch": 0.68, "grad_norm": 0.3559212416400485, "learning_rate": 9.51297705281961e-06, "loss": 0.6558, "step": 845 }, { "epoch": 0.69, "grad_norm": 0.3629534857361547, "learning_rate": 9.502801186871493e-06, "loss": 0.6554, "step": 850 }, { "epoch": 0.69, "grad_norm": 0.342960712873135, "learning_rate": 9.4925256682654e-06, "loss": 0.6512, "step": 855 }, { "epoch": 0.7, "grad_norm": 0.339301357548824, "learning_rate": 9.482150724411486e-06, "loss": 0.6523, "step": 860 }, { "epoch": 0.7, "grad_norm": 0.3386659735255176, "learning_rate": 9.471676584920322e-06, "loss": 0.6535, "step": 865 }, { "epoch": 0.7, "grad_norm": 0.34205158026270543, "learning_rate": 9.461103481597795e-06, "loss": 0.6598, "step": 870 }, { "epoch": 0.71, "grad_norm": 0.32796187495344087, "learning_rate": 9.450431648439991e-06, "loss": 0.6568, "step": 875 }, { "epoch": 0.71, "grad_norm": 0.34482933589171144, "learning_rate": 9.439661321628012e-06, "loss": 0.652, "step": 880 }, { "epoch": 0.72, "grad_norm": 0.3351343141014817, "learning_rate": 9.428792739522747e-06, "loss": 0.6553, "step": 885 }, { "epoch": 0.72, "grad_norm": 0.33476291276445075, "learning_rate": 9.417826142659596e-06, "loss": 0.6527, "step": 890 }, { "epoch": 0.72, "grad_norm": 0.3463998801569577, "learning_rate": 9.406761773743155e-06, "loss": 0.653, "step": 895 }, { "epoch": 0.73, "grad_norm": 0.34281860407905, "learning_rate": 9.395599877641832e-06, "loss": 0.6501, "step": 900 }, { "epoch": 0.73, "grad_norm": 0.35478646464938335, "learning_rate": 9.384340701382437e-06, "loss": 0.6487, "step": 905 }, { "epoch": 0.74, "grad_norm": 0.362665631594131, "learning_rate": 9.372984494144714e-06, "loss": 0.6505, "step": 910 }, { "epoch": 0.74, "grad_norm": 0.3515020684939492, "learning_rate": 9.361531507255824e-06, "loss": 0.6545, "step": 915 }, { "epoch": 0.74, "grad_norm": 0.36181604859025146, "learning_rate": 9.349981994184783e-06, "loss": 0.6466, "step": 920 }, { "epoch": 0.75, "grad_norm": 0.360396459257514, "learning_rate": 9.33833621053685e-06, "loss": 0.6527, "step": 925 }, { "epoch": 0.75, "grad_norm": 0.3287486775760895, "learning_rate": 9.326594414047877e-06, "loss": 0.6524, "step": 930 }, { "epoch": 0.76, "grad_norm": 0.3331803270016328, "learning_rate": 9.3147568645786e-06, "loss": 0.648, "step": 935 }, { "epoch": 0.76, "grad_norm": 0.34363832036110464, "learning_rate": 9.302823824108891e-06, "loss": 0.6521, "step": 940 }, { "epoch": 0.76, "grad_norm": 0.34947199842456267, "learning_rate": 9.290795556731951e-06, "loss": 0.6487, "step": 945 }, { "epoch": 0.77, "grad_norm": 0.3577227937894934, "learning_rate": 9.278672328648482e-06, "loss": 0.6495, "step": 950 }, { "epoch": 0.77, "grad_norm": 0.3505552936476622, "learning_rate": 9.266454408160779e-06, "loss": 0.6516, "step": 955 }, { "epoch": 0.78, "grad_norm": 0.34603635918112285, "learning_rate": 9.254142065666802e-06, "loss": 0.6507, "step": 960 }, { "epoch": 0.78, "grad_norm": 0.34791455875093, "learning_rate": 9.241735573654184e-06, "loss": 0.6524, "step": 965 }, { "epoch": 0.78, "grad_norm": 0.3480678275414852, "learning_rate": 9.229235206694215e-06, "loss": 0.654, "step": 970 }, { "epoch": 0.79, "grad_norm": 0.3604324129709174, "learning_rate": 9.216641241435748e-06, "loss": 0.653, "step": 975 }, { "epoch": 0.79, "grad_norm": 0.3457908890786536, "learning_rate": 9.203953956599085e-06, "loss": 0.6479, "step": 980 }, { "epoch": 0.8, "grad_norm": 0.33808972141996535, "learning_rate": 9.191173632969812e-06, "loss": 0.6508, "step": 985 }, { "epoch": 0.8, "grad_norm": 0.3745057735077035, "learning_rate": 9.178300553392574e-06, "loss": 0.6488, "step": 990 }, { "epoch": 0.8, "grad_norm": 0.3906591238871828, "learning_rate": 9.165335002764828e-06, "loss": 0.6545, "step": 995 }, { "epoch": 0.81, "grad_norm": 0.34736320521251546, "learning_rate": 9.152277268030528e-06, "loss": 0.6456, "step": 1000 }, { "epoch": 0.81, "grad_norm": 0.3442244519131892, "learning_rate": 9.139127638173781e-06, "loss": 0.6489, "step": 1005 }, { "epoch": 0.82, "grad_norm": 0.36248373613010026, "learning_rate": 9.125886404212446e-06, "loss": 0.6444, "step": 1010 }, { "epoch": 0.82, "grad_norm": 0.34544606924971905, "learning_rate": 9.112553859191696e-06, "loss": 0.6448, "step": 1015 }, { "epoch": 0.82, "grad_norm": 0.34687946884001414, "learning_rate": 9.099130298177538e-06, "loss": 0.6443, "step": 1020 }, { "epoch": 0.83, "grad_norm": 0.3513777059583754, "learning_rate": 9.085616018250269e-06, "loss": 0.6488, "step": 1025 }, { "epoch": 0.83, "grad_norm": 0.3537267934695264, "learning_rate": 9.07201131849792e-06, "loss": 0.6519, "step": 1030 }, { "epoch": 0.84, "grad_norm": 0.33654090385818075, "learning_rate": 9.058316500009617e-06, "loss": 0.6502, "step": 1035 }, { "epoch": 0.84, "grad_norm": 0.3556483695118823, "learning_rate": 9.044531865868928e-06, "loss": 0.645, "step": 1040 }, { "epoch": 0.84, "grad_norm": 0.35750528290535094, "learning_rate": 9.030657721147165e-06, "loss": 0.6443, "step": 1045 }, { "epoch": 0.85, "grad_norm": 0.3435026549213838, "learning_rate": 9.016694372896609e-06, "loss": 0.6463, "step": 1050 }, { "epoch": 0.85, "grad_norm": 0.3423855657493052, "learning_rate": 9.002642130143733e-06, "loss": 0.6466, "step": 1055 }, { "epoch": 0.86, "grad_norm": 0.3423368465935837, "learning_rate": 8.988501303882358e-06, "loss": 0.6423, "step": 1060 }, { "epoch": 0.86, "grad_norm": 0.3618645676831919, "learning_rate": 8.974272207066767e-06, "loss": 0.6484, "step": 1065 }, { "epoch": 0.86, "grad_norm": 0.3463596904983005, "learning_rate": 8.959955154604784e-06, "loss": 0.6467, "step": 1070 }, { "epoch": 0.87, "grad_norm": 0.3346540167677227, "learning_rate": 8.945550463350801e-06, "loss": 0.6398, "step": 1075 }, { "epoch": 0.87, "grad_norm": 0.3418784422840604, "learning_rate": 8.931058452098767e-06, "loss": 0.6464, "step": 1080 }, { "epoch": 0.88, "grad_norm": 0.3483139766095285, "learning_rate": 8.91647944157513e-06, "loss": 0.6423, "step": 1085 }, { "epoch": 0.88, "grad_norm": 0.3358632385562698, "learning_rate": 8.901813754431747e-06, "loss": 0.6504, "step": 1090 }, { "epoch": 0.89, "grad_norm": 0.3293092393931251, "learning_rate": 8.887061715238732e-06, "loss": 0.6447, "step": 1095 }, { "epoch": 0.89, "grad_norm": 0.33826280157276917, "learning_rate": 8.87222365047728e-06, "loss": 0.6436, "step": 1100 }, { "epoch": 0.89, "grad_norm": 0.3462116287463627, "learning_rate": 8.857299888532444e-06, "loss": 0.6424, "step": 1105 }, { "epoch": 0.9, "grad_norm": 0.3501869661384335, "learning_rate": 8.842290759685857e-06, "loss": 0.6465, "step": 1110 }, { "epoch": 0.9, "grad_norm": 0.3433093832987501, "learning_rate": 8.827196596108435e-06, "loss": 0.6407, "step": 1115 }, { "epoch": 0.91, "grad_norm": 0.33288705092784, "learning_rate": 8.812017731853017e-06, "loss": 0.6465, "step": 1120 }, { "epoch": 0.91, "grad_norm": 0.417965330160086, "learning_rate": 8.796754502846975e-06, "loss": 0.6499, "step": 1125 }, { "epoch": 0.91, "grad_norm": 0.35443282453141656, "learning_rate": 8.781407246884777e-06, "loss": 0.6437, "step": 1130 }, { "epoch": 0.92, "grad_norm": 0.33593357952046854, "learning_rate": 8.765976303620517e-06, "loss": 0.6452, "step": 1135 }, { "epoch": 0.92, "grad_norm": 0.34887806456819376, "learning_rate": 8.750462014560391e-06, "loss": 0.6419, "step": 1140 }, { "epoch": 0.93, "grad_norm": 0.35136308719232856, "learning_rate": 8.734864723055145e-06, "loss": 0.6405, "step": 1145 }, { "epoch": 0.93, "grad_norm": 0.3444483118633577, "learning_rate": 8.71918477429247e-06, "loss": 0.6451, "step": 1150 }, { "epoch": 0.93, "grad_norm": 0.3393040816013692, "learning_rate": 8.703422515289374e-06, "loss": 0.6417, "step": 1155 }, { "epoch": 0.94, "grad_norm": 0.34469068010850457, "learning_rate": 8.687578294884483e-06, "loss": 0.6477, "step": 1160 }, { "epoch": 0.94, "grad_norm": 0.34131339477385103, "learning_rate": 8.671652463730343e-06, "loss": 0.643, "step": 1165 }, { "epoch": 0.95, "grad_norm": 0.33446502733218086, "learning_rate": 8.655645374285637e-06, "loss": 0.6431, "step": 1170 }, { "epoch": 0.95, "grad_norm": 0.3637948787124547, "learning_rate": 8.639557380807409e-06, "loss": 0.6472, "step": 1175 }, { "epoch": 0.95, "grad_norm": 0.34447049363923515, "learning_rate": 8.623388839343201e-06, "loss": 0.6437, "step": 1180 }, { "epoch": 0.96, "grad_norm": 0.5366339410735489, "learning_rate": 8.60714010772319e-06, "loss": 0.6457, "step": 1185 }, { "epoch": 0.96, "grad_norm": 0.43835203931911665, "learning_rate": 8.590811545552255e-06, "loss": 0.6469, "step": 1190 }, { "epoch": 0.97, "grad_norm": 0.40577779766570266, "learning_rate": 8.574403514202034e-06, "loss": 0.6434, "step": 1195 }, { "epoch": 0.97, "grad_norm": 0.3562768271573592, "learning_rate": 8.55791637680291e-06, "loss": 0.6431, "step": 1200 }, { "epoch": 0.97, "grad_norm": 0.348152309097625, "learning_rate": 8.54135049823599e-06, "loss": 0.64, "step": 1205 }, { "epoch": 0.98, "grad_norm": 0.3335098499578987, "learning_rate": 8.524706245125014e-06, "loss": 0.6444, "step": 1210 }, { "epoch": 0.98, "grad_norm": 0.33940776316788257, "learning_rate": 8.507983985828256e-06, "loss": 0.6404, "step": 1215 }, { "epoch": 0.99, "grad_norm": 0.3515281469753622, "learning_rate": 8.491184090430365e-06, "loss": 0.6446, "step": 1220 }, { "epoch": 0.99, "grad_norm": 0.3596274542888677, "learning_rate": 8.474306930734169e-06, "loss": 0.6424, "step": 1225 }, { "epoch": 0.99, "grad_norm": 0.394475047075805, "learning_rate": 8.457352880252456e-06, "loss": 0.6378, "step": 1230 }, { "epoch": 1.0, "grad_norm": 0.3859544263561606, "learning_rate": 8.440322314199702e-06, "loss": 0.6437, "step": 1235 }, { "epoch": 1.0, "eval_loss": 0.6421173810958862, "eval_runtime": 172.8209, "eval_samples_per_second": 48.235, "eval_steps_per_second": 0.758, "step": 1237 }, { "epoch": 1.0, "grad_norm": 0.35612683367928233, "learning_rate": 8.423215609483776e-06, "loss": 0.6167, "step": 1240 }, { "epoch": 1.01, "grad_norm": 0.3848188301086038, "learning_rate": 8.406033144697579e-06, "loss": 0.6075, "step": 1245 }, { "epoch": 1.01, "grad_norm": 0.3511745244034202, "learning_rate": 8.388775300110687e-06, "loss": 0.6068, "step": 1250 }, { "epoch": 1.01, "grad_norm": 0.34042665371361913, "learning_rate": 8.371442457660925e-06, "loss": 0.6081, "step": 1255 }, { "epoch": 1.02, "grad_norm": 0.34169228077773073, "learning_rate": 8.35403500094591e-06, "loss": 0.6069, "step": 1260 }, { "epoch": 1.02, "grad_norm": 0.38230987322877763, "learning_rate": 8.336553315214572e-06, "loss": 0.61, "step": 1265 }, { "epoch": 1.03, "grad_norm": 0.3448992045565583, "learning_rate": 8.318997787358622e-06, "loss": 0.6069, "step": 1270 }, { "epoch": 1.03, "grad_norm": 0.3514140024460396, "learning_rate": 8.301368805903988e-06, "loss": 0.6088, "step": 1275 }, { "epoch": 1.03, "grad_norm": 0.3669329708587005, "learning_rate": 8.283666761002217e-06, "loss": 0.6105, "step": 1280 }, { "epoch": 1.04, "grad_norm": 0.34787691906917084, "learning_rate": 8.265892044421846e-06, "loss": 0.6051, "step": 1285 }, { "epoch": 1.04, "grad_norm": 0.34720296046324794, "learning_rate": 8.248045049539726e-06, "loss": 0.6069, "step": 1290 }, { "epoch": 1.05, "grad_norm": 0.3392706804077391, "learning_rate": 8.230126171332318e-06, "loss": 0.6021, "step": 1295 }, { "epoch": 1.05, "grad_norm": 0.3596710598152859, "learning_rate": 8.212135806366951e-06, "loss": 0.6073, "step": 1300 }, { "epoch": 1.05, "grad_norm": 0.36935710849927605, "learning_rate": 8.194074352793048e-06, "loss": 0.6037, "step": 1305 }, { "epoch": 1.06, "grad_norm": 0.34636991004789447, "learning_rate": 8.175942210333308e-06, "loss": 0.6052, "step": 1310 }, { "epoch": 1.06, "grad_norm": 0.3497342938833143, "learning_rate": 8.157739780274869e-06, "loss": 0.6031, "step": 1315 }, { "epoch": 1.07, "grad_norm": 0.34205768024779826, "learning_rate": 8.139467465460417e-06, "loss": 0.609, "step": 1320 }, { "epoch": 1.07, "grad_norm": 0.3538862658622611, "learning_rate": 8.121125670279283e-06, "loss": 0.6068, "step": 1325 }, { "epoch": 1.08, "grad_norm": 0.3635065696395182, "learning_rate": 8.102714800658477e-06, "loss": 0.6057, "step": 1330 }, { "epoch": 1.08, "grad_norm": 0.3482547725467819, "learning_rate": 8.08423526405372e-06, "loss": 0.6099, "step": 1335 }, { "epoch": 1.08, "grad_norm": 0.3526458396232908, "learning_rate": 8.065687469440416e-06, "loss": 0.6079, "step": 1340 }, { "epoch": 1.09, "grad_norm": 0.3408668689194207, "learning_rate": 8.047071827304605e-06, "loss": 0.6009, "step": 1345 }, { "epoch": 1.09, "grad_norm": 0.34513367543254103, "learning_rate": 8.028388749633882e-06, "loss": 0.6063, "step": 1350 }, { "epoch": 1.1, "grad_norm": 0.35682466664799783, "learning_rate": 8.009638649908271e-06, "loss": 0.6025, "step": 1355 }, { "epoch": 1.1, "grad_norm": 0.3725953671989954, "learning_rate": 7.99082194309108e-06, "loss": 0.6028, "step": 1360 }, { "epoch": 1.1, "grad_norm": 0.3371954908218463, "learning_rate": 7.971939045619715e-06, "loss": 0.6073, "step": 1365 }, { "epoch": 1.11, "grad_norm": 0.34037547998817075, "learning_rate": 7.952990375396468e-06, "loss": 0.6024, "step": 1370 }, { "epoch": 1.11, "grad_norm": 0.3431896095258258, "learning_rate": 7.933976351779256e-06, "loss": 0.6047, "step": 1375 }, { "epoch": 1.12, "grad_norm": 0.33329531276617064, "learning_rate": 7.914897395572362e-06, "loss": 0.6086, "step": 1380 }, { "epoch": 1.12, "grad_norm": 0.3432559003873149, "learning_rate": 7.895753929017096e-06, "loss": 0.6029, "step": 1385 }, { "epoch": 1.12, "grad_norm": 0.3588662629623745, "learning_rate": 7.876546375782471e-06, "loss": 0.6082, "step": 1390 }, { "epoch": 1.13, "grad_norm": 0.3622204446417168, "learning_rate": 7.857275160955818e-06, "loss": 0.6082, "step": 1395 }, { "epoch": 1.13, "grad_norm": 0.363387160580236, "learning_rate": 7.837940711033374e-06, "loss": 0.6085, "step": 1400 }, { "epoch": 1.14, "grad_norm": 0.3403870308924992, "learning_rate": 7.818543453910856e-06, "loss": 0.6088, "step": 1405 }, { "epoch": 1.14, "grad_norm": 0.3533841045162882, "learning_rate": 7.79908381887398e-06, "loss": 0.6039, "step": 1410 }, { "epoch": 1.14, "grad_norm": 0.3508163520107966, "learning_rate": 7.77956223658896e-06, "loss": 0.6058, "step": 1415 }, { "epoch": 1.15, "grad_norm": 0.3541507369254433, "learning_rate": 7.759979139092987e-06, "loss": 0.6061, "step": 1420 }, { "epoch": 1.15, "grad_norm": 0.3419728282062195, "learning_rate": 7.740334959784656e-06, "loss": 0.6044, "step": 1425 }, { "epoch": 1.16, "grad_norm": 0.3715118314981658, "learning_rate": 7.720630133414385e-06, "loss": 0.6023, "step": 1430 }, { "epoch": 1.16, "grad_norm": 0.36230217561022143, "learning_rate": 7.700865096074778e-06, "loss": 0.607, "step": 1435 }, { "epoch": 1.16, "grad_norm": 0.3354777157419597, "learning_rate": 7.681040285190995e-06, "loss": 0.5977, "step": 1440 }, { "epoch": 1.17, "grad_norm": 0.358326990322836, "learning_rate": 7.661156139511053e-06, "loss": 0.6044, "step": 1445 }, { "epoch": 1.17, "grad_norm": 0.3446758320156605, "learning_rate": 7.64121309909612e-06, "loss": 0.6062, "step": 1450 }, { "epoch": 1.18, "grad_norm": 0.34050288705840426, "learning_rate": 7.621211605310792e-06, "loss": 0.6086, "step": 1455 }, { "epoch": 1.18, "grad_norm": 0.3291089897253489, "learning_rate": 7.601152100813295e-06, "loss": 0.607, "step": 1460 }, { "epoch": 1.18, "grad_norm": 0.3440479089947698, "learning_rate": 7.581035029545714e-06, "loss": 0.6043, "step": 1465 }, { "epoch": 1.19, "grad_norm": 0.352069971579739, "learning_rate": 7.5608608367241595e-06, "loss": 0.6041, "step": 1470 }, { "epoch": 1.19, "grad_norm": 0.3457882783757163, "learning_rate": 7.540629968828912e-06, "loss": 0.6078, "step": 1475 }, { "epoch": 1.2, "grad_norm": 0.3865305038262356, "learning_rate": 7.5203428735945415e-06, "loss": 0.6069, "step": 1480 }, { "epoch": 1.2, "grad_norm": 0.3451790193774824, "learning_rate": 7.500000000000001e-06, "loss": 0.6098, "step": 1485 }, { "epoch": 1.2, "grad_norm": 0.3702748135352684, "learning_rate": 7.47960179825869e-06, "loss": 0.607, "step": 1490 }, { "epoch": 1.21, "grad_norm": 0.3746936388549009, "learning_rate": 7.459148719808487e-06, "loss": 0.6066, "step": 1495 }, { "epoch": 1.21, "grad_norm": 0.36373198087446573, "learning_rate": 7.438641217301763e-06, "loss": 0.6009, "step": 1500 }, { "epoch": 1.22, "grad_norm": 0.34894326786219354, "learning_rate": 7.418079744595364e-06, "loss": 0.5968, "step": 1505 }, { "epoch": 1.22, "grad_norm": 0.3553089974400645, "learning_rate": 7.397464756740558e-06, "loss": 0.6064, "step": 1510 }, { "epoch": 1.22, "grad_norm": 0.34758493230870313, "learning_rate": 7.376796709972975e-06, "loss": 0.6051, "step": 1515 }, { "epoch": 1.23, "grad_norm": 0.34621710870141925, "learning_rate": 7.356076061702509e-06, "loss": 0.5999, "step": 1520 }, { "epoch": 1.23, "grad_norm": 0.3558061754286146, "learning_rate": 7.3353032705031826e-06, "loss": 0.6009, "step": 1525 }, { "epoch": 1.24, "grad_norm": 0.38773105335939717, "learning_rate": 7.314478796103015e-06, "loss": 0.601, "step": 1530 }, { "epoch": 1.24, "grad_norm": 0.33140917558611876, "learning_rate": 7.293603099373836e-06, "loss": 0.6024, "step": 1535 }, { "epoch": 1.24, "grad_norm": 0.35057360105977203, "learning_rate": 7.27267664232109e-06, "loss": 0.6026, "step": 1540 }, { "epoch": 1.25, "grad_norm": 0.35287320330696237, "learning_rate": 7.251699888073611e-06, "loss": 0.6019, "step": 1545 }, { "epoch": 1.25, "grad_norm": 0.34770120904263296, "learning_rate": 7.230673300873377e-06, "loss": 0.5997, "step": 1550 }, { "epoch": 1.26, "grad_norm": 0.3391153668486388, "learning_rate": 7.2095973460652265e-06, "loss": 0.599, "step": 1555 }, { "epoch": 1.26, "grad_norm": 0.3458143815878256, "learning_rate": 7.188472490086569e-06, "loss": 0.6062, "step": 1560 }, { "epoch": 1.27, "grad_norm": 0.3333067606809811, "learning_rate": 7.167299200457058e-06, "loss": 0.6017, "step": 1565 }, { "epoch": 1.27, "grad_norm": 0.3457283500961698, "learning_rate": 7.146077945768242e-06, "loss": 0.6039, "step": 1570 }, { "epoch": 1.27, "grad_norm": 0.34879856708145884, "learning_rate": 7.124809195673199e-06, "loss": 0.598, "step": 1575 }, { "epoch": 1.28, "grad_norm": 0.3393255348773791, "learning_rate": 7.103493420876142e-06, "loss": 0.6037, "step": 1580 }, { "epoch": 1.28, "grad_norm": 0.36196695404642815, "learning_rate": 7.082131093121994e-06, "loss": 0.6002, "step": 1585 }, { "epoch": 1.29, "grad_norm": 0.34627092743642773, "learning_rate": 7.060722685185961e-06, "loss": 0.606, "step": 1590 }, { "epoch": 1.29, "grad_norm": 0.3495988534567465, "learning_rate": 7.039268670863055e-06, "loss": 0.6044, "step": 1595 }, { "epoch": 1.29, "grad_norm": 0.3394880087894413, "learning_rate": 7.017769524957618e-06, "loss": 0.6082, "step": 1600 }, { "epoch": 1.3, "grad_norm": 0.34663689518347884, "learning_rate": 6.996225723272812e-06, "loss": 0.6083, "step": 1605 }, { "epoch": 1.3, "grad_norm": 0.34850358542149085, "learning_rate": 6.974637742600081e-06, "loss": 0.6009, "step": 1610 }, { "epoch": 1.31, "grad_norm": 0.36985804632767555, "learning_rate": 6.953006060708615e-06, "loss": 0.6004, "step": 1615 }, { "epoch": 1.31, "grad_norm": 0.35280758222907044, "learning_rate": 6.931331156334759e-06, "loss": 0.6038, "step": 1620 }, { "epoch": 1.31, "grad_norm": 0.3521399286587422, "learning_rate": 6.909613509171432e-06, "loss": 0.6101, "step": 1625 }, { "epoch": 1.32, "grad_norm": 0.33743550679619694, "learning_rate": 6.8878535998575e-06, "loss": 0.6028, "step": 1630 }, { "epoch": 1.32, "grad_norm": 0.3571455186929166, "learning_rate": 6.866051909967145e-06, "loss": 0.6012, "step": 1635 }, { "epoch": 1.33, "grad_norm": 0.3741314288440183, "learning_rate": 6.8442089219992085e-06, "loss": 0.6042, "step": 1640 }, { "epoch": 1.33, "grad_norm": 0.3273376833577142, "learning_rate": 6.822325119366506e-06, "loss": 0.6032, "step": 1645 }, { "epoch": 1.33, "grad_norm": 0.35624114760159614, "learning_rate": 6.800400986385136e-06, "loss": 0.6057, "step": 1650 }, { "epoch": 1.34, "grad_norm": 0.33704052485179053, "learning_rate": 6.778437008263757e-06, "loss": 0.6035, "step": 1655 }, { "epoch": 1.34, "grad_norm": 0.3503836557615777, "learning_rate": 6.756433671092853e-06, "loss": 0.6055, "step": 1660 }, { "epoch": 1.35, "grad_norm": 0.34801898829661443, "learning_rate": 6.734391461833968e-06, "loss": 0.5975, "step": 1665 }, { "epoch": 1.35, "grad_norm": 0.3842975652087955, "learning_rate": 6.71231086830894e-06, "loss": 0.5998, "step": 1670 }, { "epoch": 1.35, "grad_norm": 0.3355482774181833, "learning_rate": 6.690192379189095e-06, "loss": 0.5988, "step": 1675 }, { "epoch": 1.36, "grad_norm": 0.35442350484894886, "learning_rate": 6.668036483984441e-06, "loss": 0.6003, "step": 1680 }, { "epoch": 1.36, "grad_norm": 0.35372118352699317, "learning_rate": 6.645843673032824e-06, "loss": 0.6, "step": 1685 }, { "epoch": 1.37, "grad_norm": 0.3422594998602562, "learning_rate": 6.623614437489086e-06, "loss": 0.6017, "step": 1690 }, { "epoch": 1.37, "grad_norm": 0.3370157465905625, "learning_rate": 6.601349269314188e-06, "loss": 0.6031, "step": 1695 }, { "epoch": 1.37, "grad_norm": 0.3466009533955682, "learning_rate": 6.5790486612643304e-06, "loss": 0.6003, "step": 1700 }, { "epoch": 1.38, "grad_norm": 0.33439769935238467, "learning_rate": 6.556713106880035e-06, "loss": 0.6007, "step": 1705 }, { "epoch": 1.38, "grad_norm": 0.33542478401950865, "learning_rate": 6.5343431004752375e-06, "loss": 0.5992, "step": 1710 }, { "epoch": 1.39, "grad_norm": 0.34654430471678094, "learning_rate": 6.511939137126335e-06, "loss": 0.6024, "step": 1715 }, { "epoch": 1.39, "grad_norm": 0.33569626194639846, "learning_rate": 6.489501712661239e-06, "loss": 0.5981, "step": 1720 }, { "epoch": 1.39, "grad_norm": 0.3575718273021351, "learning_rate": 6.467031323648391e-06, "loss": 0.5972, "step": 1725 }, { "epoch": 1.4, "grad_norm": 0.34619684699452774, "learning_rate": 6.444528467385789e-06, "loss": 0.6025, "step": 1730 }, { "epoch": 1.4, "grad_norm": 0.3410612255652966, "learning_rate": 6.421993641889961e-06, "loss": 0.6028, "step": 1735 }, { "epoch": 1.41, "grad_norm": 0.35877268014296454, "learning_rate": 6.399427345884964e-06, "loss": 0.6023, "step": 1740 }, { "epoch": 1.41, "grad_norm": 0.37535707628539366, "learning_rate": 6.376830078791331e-06, "loss": 0.5998, "step": 1745 }, { "epoch": 1.41, "grad_norm": 0.3363289290001292, "learning_rate": 6.354202340715027e-06, "loss": 0.5992, "step": 1750 }, { "epoch": 1.42, "grad_norm": 0.34773704444700376, "learning_rate": 6.331544632436376e-06, "loss": 0.6007, "step": 1755 }, { "epoch": 1.42, "grad_norm": 0.35446929647477704, "learning_rate": 6.308857455398983e-06, "loss": 0.6033, "step": 1760 }, { "epoch": 1.43, "grad_norm": 0.35078615586986006, "learning_rate": 6.286141311698634e-06, "loss": 0.5962, "step": 1765 }, { "epoch": 1.43, "grad_norm": 0.34856676736171777, "learning_rate": 6.263396704072177e-06, "loss": 0.5992, "step": 1770 }, { "epoch": 1.43, "grad_norm": 0.3367079984766349, "learning_rate": 6.240624135886414e-06, "loss": 0.6001, "step": 1775 }, { "epoch": 1.44, "grad_norm": 0.35329410589016047, "learning_rate": 6.217824111126938e-06, "loss": 0.6008, "step": 1780 }, { "epoch": 1.44, "grad_norm": 0.3346407243561402, "learning_rate": 6.1949971343869975e-06, "loss": 0.5913, "step": 1785 }, { "epoch": 1.45, "grad_norm": 0.3529122847287559, "learning_rate": 6.17214371085632e-06, "loss": 0.5957, "step": 1790 }, { "epoch": 1.45, "grad_norm": 0.3421124934391437, "learning_rate": 6.149264346309934e-06, "loss": 0.6053, "step": 1795 }, { "epoch": 1.46, "grad_norm": 0.34325662088719106, "learning_rate": 6.126359547096975e-06, "loss": 0.5997, "step": 1800 }, { "epoch": 1.46, "grad_norm": 0.33811597695086393, "learning_rate": 6.103429820129483e-06, "loss": 0.5938, "step": 1805 }, { "epoch": 1.46, "grad_norm": 0.3414726258155315, "learning_rate": 6.080475672871171e-06, "loss": 0.602, "step": 1810 }, { "epoch": 1.47, "grad_norm": 0.3411869755140543, "learning_rate": 6.057497613326218e-06, "loss": 0.5978, "step": 1815 }, { "epoch": 1.47, "grad_norm": 0.3636820044048291, "learning_rate": 6.034496150028e-06, "loss": 0.601, "step": 1820 }, { "epoch": 1.48, "grad_norm": 0.348966246737792, "learning_rate": 6.011471792027853e-06, "loss": 0.5997, "step": 1825 }, { "epoch": 1.48, "grad_norm": 0.3533893554245234, "learning_rate": 5.988425048883799e-06, "loss": 0.5998, "step": 1830 }, { "epoch": 1.48, "grad_norm": 0.3560922900035671, "learning_rate": 5.965356430649276e-06, "loss": 0.6044, "step": 1835 }, { "epoch": 1.49, "grad_norm": 0.3481565362560402, "learning_rate": 5.942266447861839e-06, "loss": 0.5981, "step": 1840 }, { "epoch": 1.49, "grad_norm": 0.3599905546327302, "learning_rate": 5.919155611531872e-06, "loss": 0.5987, "step": 1845 }, { "epoch": 1.5, "grad_norm": 0.34898494062471946, "learning_rate": 5.896024433131273e-06, "loss": 0.6027, "step": 1850 }, { "epoch": 1.5, "grad_norm": 0.3439407183016253, "learning_rate": 5.872873424582136e-06, "loss": 0.5995, "step": 1855 }, { "epoch": 1.5, "grad_norm": 0.3338788953505843, "learning_rate": 5.84970309824542e-06, "loss": 0.6017, "step": 1860 }, { "epoch": 1.51, "grad_norm": 0.3364081890371372, "learning_rate": 5.826513966909612e-06, "loss": 0.5988, "step": 1865 }, { "epoch": 1.51, "grad_norm": 0.35080557373455823, "learning_rate": 5.803306543779374e-06, "loss": 0.598, "step": 1870 }, { "epoch": 1.52, "grad_norm": 0.3503431865070868, "learning_rate": 5.780081342464195e-06, "loss": 0.5986, "step": 1875 }, { "epoch": 1.52, "grad_norm": 0.3554878434984722, "learning_rate": 5.756838876967012e-06, "loss": 0.6031, "step": 1880 }, { "epoch": 1.52, "grad_norm": 0.3408524661048734, "learning_rate": 5.733579661672843e-06, "loss": 0.5976, "step": 1885 }, { "epoch": 1.53, "grad_norm": 0.3364071465295599, "learning_rate": 5.7103042113373964e-06, "loss": 0.5985, "step": 1890 }, { "epoch": 1.53, "grad_norm": 0.3742115156687691, "learning_rate": 5.6870130410756884e-06, "loss": 0.5997, "step": 1895 }, { "epoch": 1.54, "grad_norm": 0.39876952981025326, "learning_rate": 5.663706666350633e-06, "loss": 0.6009, "step": 1900 }, { "epoch": 1.54, "grad_norm": 0.35866383861250645, "learning_rate": 5.640385602961634e-06, "loss": 0.6083, "step": 1905 }, { "epoch": 1.54, "grad_norm": 0.37092260965473806, "learning_rate": 5.617050367033188e-06, "loss": 0.6014, "step": 1910 }, { "epoch": 1.55, "grad_norm": 0.3362511436204934, "learning_rate": 5.5937014750034295e-06, "loss": 0.6039, "step": 1915 }, { "epoch": 1.55, "grad_norm": 0.34430106963105644, "learning_rate": 5.570339443612734e-06, "loss": 0.6002, "step": 1920 }, { "epoch": 1.56, "grad_norm": 0.3381790178167573, "learning_rate": 5.546964789892258e-06, "loss": 0.5995, "step": 1925 }, { "epoch": 1.56, "grad_norm": 0.35197174567280254, "learning_rate": 5.523578031152516e-06, "loss": 0.6009, "step": 1930 }, { "epoch": 1.56, "grad_norm": 0.3734665580418465, "learning_rate": 5.500179684971909e-06, "loss": 0.5981, "step": 1935 }, { "epoch": 1.57, "grad_norm": 0.3325019561528593, "learning_rate": 5.4767702691852955e-06, "loss": 0.6014, "step": 1940 }, { "epoch": 1.57, "grad_norm": 0.33454739947159967, "learning_rate": 5.45335030187251e-06, "loss": 0.6001, "step": 1945 }, { "epoch": 1.58, "grad_norm": 0.35844558799357823, "learning_rate": 5.429920301346907e-06, "loss": 0.5979, "step": 1950 }, { "epoch": 1.58, "grad_norm": 0.3424733721593427, "learning_rate": 5.406480786143892e-06, "loss": 0.5945, "step": 1955 }, { "epoch": 1.58, "grad_norm": 0.34965768732245783, "learning_rate": 5.383032275009439e-06, "loss": 0.5968, "step": 1960 }, { "epoch": 1.59, "grad_norm": 0.3450848124776679, "learning_rate": 5.359575286888613e-06, "loss": 0.5957, "step": 1965 }, { "epoch": 1.59, "grad_norm": 0.3416298161956714, "learning_rate": 5.336110340914088e-06, "loss": 0.5986, "step": 1970 }, { "epoch": 1.6, "grad_norm": 0.3392650338835017, "learning_rate": 5.312637956394654e-06, "loss": 0.6063, "step": 1975 }, { "epoch": 1.6, "grad_norm": 0.3414974465071299, "learning_rate": 5.2891586528037255e-06, "loss": 0.6005, "step": 1980 }, { "epoch": 1.6, "grad_norm": 0.33135127694129735, "learning_rate": 5.265672949767842e-06, "loss": 0.5999, "step": 1985 }, { "epoch": 1.61, "grad_norm": 0.3396269560076833, "learning_rate": 5.242181367055177e-06, "loss": 0.6014, "step": 1990 }, { "epoch": 1.61, "grad_norm": 0.3374681307081042, "learning_rate": 5.218684424564023e-06, "loss": 0.5988, "step": 1995 }, { "epoch": 1.62, "grad_norm": 0.34148318236772424, "learning_rate": 5.195182642311294e-06, "loss": 0.5907, "step": 2000 }, { "epoch": 1.62, "grad_norm": 0.36276530604156537, "learning_rate": 5.171676540421013e-06, "loss": 0.5953, "step": 2005 }, { "epoch": 1.62, "grad_norm": 0.3478336704840908, "learning_rate": 5.148166639112799e-06, "loss": 0.5993, "step": 2010 }, { "epoch": 1.63, "grad_norm": 0.35802290364481915, "learning_rate": 5.1246534586903655e-06, "loss": 0.5955, "step": 2015 }, { "epoch": 1.63, "grad_norm": 0.3473543519178185, "learning_rate": 5.101137519529988e-06, "loss": 0.5989, "step": 2020 }, { "epoch": 1.64, "grad_norm": 0.3417110831936279, "learning_rate": 5.077619342069e-06, "loss": 0.5951, "step": 2025 }, { "epoch": 1.64, "grad_norm": 0.3288283511423511, "learning_rate": 5.0540994467942725e-06, "loss": 0.5951, "step": 2030 }, { "epoch": 1.65, "grad_norm": 0.33462881020064067, "learning_rate": 5.030578354230692e-06, "loss": 0.5965, "step": 2035 }, { "epoch": 1.65, "grad_norm": 0.33703401057823823, "learning_rate": 5.007056584929642e-06, "loss": 0.5982, "step": 2040 }, { "epoch": 1.65, "grad_norm": 0.3381967613698992, "learning_rate": 4.983534659457489e-06, "loss": 0.6007, "step": 2045 }, { "epoch": 1.66, "grad_norm": 0.3508991053901829, "learning_rate": 4.960013098384044e-06, "loss": 0.5993, "step": 2050 }, { "epoch": 1.66, "grad_norm": 0.33650937388486635, "learning_rate": 4.936492422271064e-06, "loss": 0.603, "step": 2055 }, { "epoch": 1.67, "grad_norm": 0.33682387625856475, "learning_rate": 4.912973151660717e-06, "loss": 0.5956, "step": 2060 }, { "epoch": 1.67, "grad_norm": 0.33937036425410855, "learning_rate": 4.889455807064066e-06, "loss": 0.6042, "step": 2065 }, { "epoch": 1.67, "grad_norm": 0.3371779384428629, "learning_rate": 4.865940908949546e-06, "loss": 0.5953, "step": 2070 }, { "epoch": 1.68, "grad_norm": 0.3599031408456829, "learning_rate": 4.842428977731454e-06, "loss": 0.5958, "step": 2075 }, { "epoch": 1.68, "grad_norm": 0.3432901552903446, "learning_rate": 4.8189205337584185e-06, "loss": 0.6032, "step": 2080 }, { "epoch": 1.69, "grad_norm": 0.33708861470550494, "learning_rate": 4.7954160973019005e-06, "loss": 0.5966, "step": 2085 }, { "epoch": 1.69, "grad_norm": 0.38348467590834423, "learning_rate": 4.771916188544657e-06, "loss": 0.5971, "step": 2090 }, { "epoch": 1.69, "grad_norm": 0.3308799379900178, "learning_rate": 4.7484213275692545e-06, "loss": 0.5982, "step": 2095 }, { "epoch": 1.7, "grad_norm": 0.35405258955192936, "learning_rate": 4.724932034346536e-06, "loss": 0.6004, "step": 2100 }, { "epoch": 1.7, "grad_norm": 0.33719158332004834, "learning_rate": 4.701448828724128e-06, "loss": 0.6, "step": 2105 }, { "epoch": 1.71, "grad_norm": 0.343245225988606, "learning_rate": 4.67797223041493e-06, "loss": 0.6, "step": 2110 }, { "epoch": 1.71, "grad_norm": 0.3584370601980231, "learning_rate": 4.654502758985611e-06, "loss": 0.5969, "step": 2115 }, { "epoch": 1.71, "grad_norm": 0.3943091207910351, "learning_rate": 4.631040933845116e-06, "loss": 0.5957, "step": 2120 }, { "epoch": 1.72, "grad_norm": 0.33450114269890413, "learning_rate": 4.607587274233167e-06, "loss": 0.5995, "step": 2125 }, { "epoch": 1.72, "grad_norm": 0.34911539991946666, "learning_rate": 4.584142299208773e-06, "loss": 0.5966, "step": 2130 }, { "epoch": 1.73, "grad_norm": 0.33647709249187696, "learning_rate": 4.5607065276387415e-06, "loss": 0.5958, "step": 2135 }, { "epoch": 1.73, "grad_norm": 0.33330390035384855, "learning_rate": 4.537280478186194e-06, "loss": 0.5951, "step": 2140 }, { "epoch": 1.73, "grad_norm": 0.33893039233702754, "learning_rate": 4.513864669299094e-06, "loss": 0.5964, "step": 2145 }, { "epoch": 1.74, "grad_norm": 0.34110060511685447, "learning_rate": 4.490459619198766e-06, "loss": 0.5966, "step": 2150 }, { "epoch": 1.74, "grad_norm": 0.3362900397772751, "learning_rate": 4.467065845868426e-06, "loss": 0.5996, "step": 2155 }, { "epoch": 1.75, "grad_norm": 0.32928276996802947, "learning_rate": 4.443683867041727e-06, "loss": 0.5973, "step": 2160 }, { "epoch": 1.75, "grad_norm": 0.3322528250567359, "learning_rate": 4.42031420019129e-06, "loss": 0.6001, "step": 2165 }, { "epoch": 1.75, "grad_norm": 0.36319782997733985, "learning_rate": 4.39695736251726e-06, "loss": 0.601, "step": 2170 }, { "epoch": 1.76, "grad_norm": 0.4098453011240976, "learning_rate": 4.373613870935849e-06, "loss": 0.5979, "step": 2175 }, { "epoch": 1.76, "grad_norm": 0.379673657370104, "learning_rate": 4.350284242067913e-06, "loss": 0.5995, "step": 2180 }, { "epoch": 1.77, "grad_norm": 0.33323942385941574, "learning_rate": 4.326968992227503e-06, "loss": 0.5989, "step": 2185 }, { "epoch": 1.77, "grad_norm": 0.33525768857724975, "learning_rate": 4.303668637410444e-06, "loss": 0.5997, "step": 2190 }, { "epoch": 1.77, "grad_norm": 0.3439270965491725, "learning_rate": 4.280383693282919e-06, "loss": 0.5997, "step": 2195 }, { "epoch": 1.78, "grad_norm": 0.3455837838168634, "learning_rate": 4.257114675170048e-06, "loss": 0.5982, "step": 2200 }, { "epoch": 1.78, "grad_norm": 0.34131586361444494, "learning_rate": 4.23386209804449e-06, "loss": 0.596, "step": 2205 }, { "epoch": 1.79, "grad_norm": 0.33577042284150305, "learning_rate": 4.210626476515048e-06, "loss": 0.5951, "step": 2210 }, { "epoch": 1.79, "grad_norm": 0.33488001630881314, "learning_rate": 4.187408324815273e-06, "loss": 0.5962, "step": 2215 }, { "epoch": 1.79, "grad_norm": 0.3373568796791887, "learning_rate": 4.1642081567920845e-06, "loss": 0.5963, "step": 2220 }, { "epoch": 1.8, "grad_norm": 0.3823953191130246, "learning_rate": 4.141026485894403e-06, "loss": 0.5929, "step": 2225 }, { "epoch": 1.8, "grad_norm": 0.3260441813510579, "learning_rate": 4.117863825161788e-06, "loss": 0.5988, "step": 2230 }, { "epoch": 1.81, "grad_norm": 0.3322309520256302, "learning_rate": 4.094720687213075e-06, "loss": 0.602, "step": 2235 }, { "epoch": 1.81, "grad_norm": 0.34414351316976893, "learning_rate": 4.071597584235035e-06, "loss": 0.5967, "step": 2240 }, { "epoch": 1.81, "grad_norm": 0.3398838448595311, "learning_rate": 4.048495027971043e-06, "loss": 0.5969, "step": 2245 }, { "epoch": 1.82, "grad_norm": 0.3390000106371949, "learning_rate": 4.025413529709751e-06, "loss": 0.598, "step": 2250 }, { "epoch": 1.82, "grad_norm": 0.33886602801796384, "learning_rate": 4.002353600273767e-06, "loss": 0.5968, "step": 2255 }, { "epoch": 1.83, "grad_norm": 0.3297494849246786, "learning_rate": 3.979315750008354e-06, "loss": 0.5944, "step": 2260 }, { "epoch": 1.83, "grad_norm": 0.3334897130158222, "learning_rate": 3.956300488770137e-06, "loss": 0.6001, "step": 2265 }, { "epoch": 1.84, "grad_norm": 0.32637663018083596, "learning_rate": 3.933308325915818e-06, "loss": 0.59, "step": 2270 }, { "epoch": 1.84, "grad_norm": 0.360394709090503, "learning_rate": 3.910339770290898e-06, "loss": 0.5922, "step": 2275 }, { "epoch": 1.84, "grad_norm": 0.3346830449246537, "learning_rate": 3.887395330218429e-06, "loss": 0.5969, "step": 2280 }, { "epoch": 1.85, "grad_norm": 0.3335640943232286, "learning_rate": 3.864475513487746e-06, "loss": 0.5976, "step": 2285 }, { "epoch": 1.85, "grad_norm": 0.3315412280408475, "learning_rate": 3.841580827343243e-06, "loss": 0.5909, "step": 2290 }, { "epoch": 1.86, "grad_norm": 0.3437658656728132, "learning_rate": 3.818711778473142e-06, "loss": 0.5948, "step": 2295 }, { "epoch": 1.86, "grad_norm": 0.32655318821830454, "learning_rate": 3.795868872998284e-06, "loss": 0.5977, "step": 2300 }, { "epoch": 1.86, "grad_norm": 0.3269750625289047, "learning_rate": 3.773052616460915e-06, "loss": 0.5963, "step": 2305 }, { "epoch": 1.87, "grad_norm": 0.334102774387526, "learning_rate": 3.7502635138135125e-06, "loss": 0.5923, "step": 2310 }, { "epoch": 1.87, "grad_norm": 0.3367842235380487, "learning_rate": 3.727502069407602e-06, "loss": 0.5966, "step": 2315 }, { "epoch": 1.88, "grad_norm": 0.34224419808221596, "learning_rate": 3.7047687869826016e-06, "loss": 0.5909, "step": 2320 }, { "epoch": 1.88, "grad_norm": 0.32965840350078623, "learning_rate": 3.682064169654663e-06, "loss": 0.5935, "step": 2325 }, { "epoch": 1.88, "grad_norm": 0.3431765721460804, "learning_rate": 3.6593887199055455e-06, "loss": 0.5902, "step": 2330 }, { "epoch": 1.89, "grad_norm": 0.3191827944559076, "learning_rate": 3.6367429395714964e-06, "loss": 0.5907, "step": 2335 }, { "epoch": 1.89, "grad_norm": 0.3336174190628238, "learning_rate": 3.614127329832139e-06, "loss": 0.5984, "step": 2340 }, { "epoch": 1.9, "grad_norm": 0.33434376641377983, "learning_rate": 3.591542391199381e-06, "loss": 0.5953, "step": 2345 }, { "epoch": 1.9, "grad_norm": 0.34561228055475135, "learning_rate": 3.568988623506342e-06, "loss": 0.593, "step": 2350 }, { "epoch": 1.9, "grad_norm": 0.34456176092100554, "learning_rate": 3.5464665258962916e-06, "loss": 0.5947, "step": 2355 }, { "epoch": 1.91, "grad_norm": 0.3408734896179227, "learning_rate": 3.5239765968115976e-06, "loss": 0.5971, "step": 2360 }, { "epoch": 1.91, "grad_norm": 0.33761745288769013, "learning_rate": 3.5015193339827018e-06, "loss": 0.5906, "step": 2365 }, { "epoch": 1.92, "grad_norm": 0.34197728377274145, "learning_rate": 3.4790952344170927e-06, "loss": 0.5946, "step": 2370 }, { "epoch": 1.92, "grad_norm": 0.3398816966491146, "learning_rate": 3.456704794388322e-06, "loss": 0.5944, "step": 2375 }, { "epoch": 1.92, "grad_norm": 0.32885141101187576, "learning_rate": 3.4343485094250094e-06, "loss": 0.5914, "step": 2380 }, { "epoch": 1.93, "grad_norm": 0.3318487935106688, "learning_rate": 3.4120268742998852e-06, "loss": 0.5912, "step": 2385 }, { "epoch": 1.93, "grad_norm": 0.32818786780308357, "learning_rate": 3.389740383018825e-06, "loss": 0.5912, "step": 2390 }, { "epoch": 1.94, "grad_norm": 0.3400044683569689, "learning_rate": 3.3674895288099364e-06, "loss": 0.5978, "step": 2395 }, { "epoch": 1.94, "grad_norm": 0.35119666817568174, "learning_rate": 3.345274804112629e-06, "loss": 0.5959, "step": 2400 }, { "epoch": 1.94, "grad_norm": 0.3315558471194881, "learning_rate": 3.323096700566726e-06, "loss": 0.5958, "step": 2405 }, { "epoch": 1.95, "grad_norm": 0.3419843973614644, "learning_rate": 3.3009557090015704e-06, "loss": 0.5954, "step": 2410 }, { "epoch": 1.95, "grad_norm": 0.34089635465877866, "learning_rate": 3.278852319425176e-06, "loss": 0.5974, "step": 2415 }, { "epoch": 1.96, "grad_norm": 0.33537246161474865, "learning_rate": 3.2567870210133744e-06, "loss": 0.5988, "step": 2420 }, { "epoch": 1.96, "grad_norm": 0.32482705542610035, "learning_rate": 3.234760302098997e-06, "loss": 0.5937, "step": 2425 }, { "epoch": 1.96, "grad_norm": 0.32656937519388646, "learning_rate": 3.2127726501610558e-06, "loss": 0.5952, "step": 2430 }, { "epoch": 1.97, "grad_norm": 0.3314276363386486, "learning_rate": 3.1908245518139637e-06, "loss": 0.5959, "step": 2435 }, { "epoch": 1.97, "grad_norm": 0.33633898019426633, "learning_rate": 3.1689164927967636e-06, "loss": 0.5899, "step": 2440 }, { "epoch": 1.98, "grad_norm": 0.3344001796344765, "learning_rate": 3.14704895796238e-06, "loss": 0.5979, "step": 2445 }, { "epoch": 1.98, "grad_norm": 0.33957868524131296, "learning_rate": 3.1252224312668818e-06, "loss": 0.5922, "step": 2450 }, { "epoch": 1.98, "grad_norm": 0.3630766364635787, "learning_rate": 3.103437395758776e-06, "loss": 0.5918, "step": 2455 }, { "epoch": 1.99, "grad_norm": 0.3267666635639959, "learning_rate": 3.081694333568323e-06, "loss": 0.598, "step": 2460 }, { "epoch": 1.99, "grad_norm": 0.3287217171489748, "learning_rate": 3.0599937258968554e-06, "loss": 0.5888, "step": 2465 }, { "epoch": 2.0, "grad_norm": 0.3270151142477496, "learning_rate": 3.0383360530061368e-06, "loss": 0.5941, "step": 2470 }, { "epoch": 2.0, "eval_loss": 0.6244343519210815, "eval_runtime": 172.678, "eval_samples_per_second": 48.275, "eval_steps_per_second": 0.759, "step": 2474 }, { "epoch": 2.0, "grad_norm": 0.43424780643568117, "learning_rate": 3.0167217942077263e-06, "loss": 0.5871, "step": 2475 }, { "epoch": 2.0, "grad_norm": 0.378799825389046, "learning_rate": 2.99515142785238e-06, "loss": 0.5619, "step": 2480 }, { "epoch": 2.01, "grad_norm": 0.373667392121592, "learning_rate": 2.9736254313194545e-06, "loss": 0.552, "step": 2485 }, { "epoch": 2.01, "grad_norm": 0.3570439701346094, "learning_rate": 2.9521442810063516e-06, "loss": 0.5549, "step": 2490 }, { "epoch": 2.02, "grad_norm": 0.36090630324663603, "learning_rate": 2.9307084523179596e-06, "loss": 0.5558, "step": 2495 }, { "epoch": 2.02, "grad_norm": 0.33917159277963727, "learning_rate": 2.9093184196561543e-06, "loss": 0.5524, "step": 2500 }, { "epoch": 2.03, "grad_norm": 0.34398893452755497, "learning_rate": 2.8879746564092792e-06, "loss": 0.5574, "step": 2505 }, { "epoch": 2.03, "grad_norm": 0.33996505753613354, "learning_rate": 2.866677634941684e-06, "loss": 0.5546, "step": 2510 }, { "epoch": 2.03, "grad_norm": 0.33969667288586725, "learning_rate": 2.8454278265832587e-06, "loss": 0.5536, "step": 2515 }, { "epoch": 2.04, "grad_norm": 0.340540174533377, "learning_rate": 2.8242257016190065e-06, "loss": 0.5587, "step": 2520 }, { "epoch": 2.04, "grad_norm": 0.34513173676942427, "learning_rate": 2.8030717292786435e-06, "loss": 0.5588, "step": 2525 }, { "epoch": 2.05, "grad_norm": 0.33346709741782166, "learning_rate": 2.7819663777262042e-06, "loss": 0.5551, "step": 2530 }, { "epoch": 2.05, "grad_norm": 0.34086249689715303, "learning_rate": 2.7609101140496863e-06, "loss": 0.5564, "step": 2535 }, { "epoch": 2.05, "grad_norm": 0.34353425761779516, "learning_rate": 2.739903404250702e-06, "loss": 0.5594, "step": 2540 }, { "epoch": 2.06, "grad_norm": 0.33440611000907805, "learning_rate": 2.718946713234185e-06, "loss": 0.5531, "step": 2545 }, { "epoch": 2.06, "grad_norm": 0.3563095188693516, "learning_rate": 2.6980405047980853e-06, "loss": 0.5538, "step": 2550 }, { "epoch": 2.07, "grad_norm": 0.36314745716928015, "learning_rate": 2.6771852416231114e-06, "loss": 0.5551, "step": 2555 }, { "epoch": 2.07, "grad_norm": 0.36420824799301177, "learning_rate": 2.6563813852624877e-06, "loss": 0.5552, "step": 2560 }, { "epoch": 2.07, "grad_norm": 0.34479203745244424, "learning_rate": 2.635629396131738e-06, "loss": 0.5593, "step": 2565 }, { "epoch": 2.08, "grad_norm": 0.34497107864168164, "learning_rate": 2.614929733498506e-06, "loss": 0.5534, "step": 2570 }, { "epoch": 2.08, "grad_norm": 0.34525328530832866, "learning_rate": 2.594282855472381e-06, "loss": 0.555, "step": 2575 }, { "epoch": 2.09, "grad_norm": 0.342520656902943, "learning_rate": 2.573689218994761e-06, "loss": 0.5575, "step": 2580 }, { "epoch": 2.09, "grad_norm": 0.34662459423202513, "learning_rate": 2.5531492798287403e-06, "loss": 0.5595, "step": 2585 }, { "epoch": 2.09, "grad_norm": 0.3387544512521863, "learning_rate": 2.532663492549029e-06, "loss": 0.5557, "step": 2590 }, { "epoch": 2.1, "grad_norm": 0.3499541775771522, "learning_rate": 2.5122323105318867e-06, "loss": 0.5607, "step": 2595 }, { "epoch": 2.1, "grad_norm": 0.3383598994579805, "learning_rate": 2.4918561859450868e-06, "loss": 0.553, "step": 2600 }, { "epoch": 2.11, "grad_norm": 0.3432654591931444, "learning_rate": 2.471535569737912e-06, "loss": 0.5573, "step": 2605 }, { "epoch": 2.11, "grad_norm": 0.34383917460157043, "learning_rate": 2.451270911631178e-06, "loss": 0.5585, "step": 2610 }, { "epoch": 2.11, "grad_norm": 0.3450670971105364, "learning_rate": 2.431062660107278e-06, "loss": 0.5554, "step": 2615 }, { "epoch": 2.12, "grad_norm": 0.3419046104588635, "learning_rate": 2.4109112624002536e-06, "loss": 0.5558, "step": 2620 }, { "epoch": 2.12, "grad_norm": 0.3420426598530408, "learning_rate": 2.390817164485898e-06, "loss": 0.5566, "step": 2625 }, { "epoch": 2.13, "grad_norm": 0.33795143677174216, "learning_rate": 2.370780811071892e-06, "loss": 0.5537, "step": 2630 }, { "epoch": 2.13, "grad_norm": 0.35152836043325386, "learning_rate": 2.3508026455879584e-06, "loss": 0.5575, "step": 2635 }, { "epoch": 2.13, "grad_norm": 0.3415733674356157, "learning_rate": 2.330883110176049e-06, "loss": 0.5568, "step": 2640 }, { "epoch": 2.14, "grad_norm": 0.33945656599372276, "learning_rate": 2.3110226456805468e-06, "loss": 0.5592, "step": 2645 }, { "epoch": 2.14, "grad_norm": 0.34038285872514057, "learning_rate": 2.2912216916385343e-06, "loss": 0.5574, "step": 2650 }, { "epoch": 2.15, "grad_norm": 0.33686768239197035, "learning_rate": 2.271480686270048e-06, "loss": 0.5552, "step": 2655 }, { "epoch": 2.15, "grad_norm": 0.3667930417402405, "learning_rate": 2.251800066468387e-06, "loss": 0.5568, "step": 2660 }, { "epoch": 2.15, "grad_norm": 0.37224776295434164, "learning_rate": 2.232180267790437e-06, "loss": 0.5556, "step": 2665 }, { "epoch": 2.16, "grad_norm": 0.36024399712074406, "learning_rate": 2.212621724447037e-06, "loss": 0.5518, "step": 2670 }, { "epoch": 2.16, "grad_norm": 0.33906837045630817, "learning_rate": 2.193124869293372e-06, "loss": 0.5556, "step": 2675 }, { "epoch": 2.17, "grad_norm": 0.35085747489764113, "learning_rate": 2.173690133819389e-06, "loss": 0.5614, "step": 2680 }, { "epoch": 2.17, "grad_norm": 0.35721523211685124, "learning_rate": 2.1543179481402443e-06, "loss": 0.5516, "step": 2685 }, { "epoch": 2.17, "grad_norm": 0.35515538032652993, "learning_rate": 2.1350087409867894e-06, "loss": 0.5515, "step": 2690 }, { "epoch": 2.18, "grad_norm": 0.3422620562848989, "learning_rate": 2.115762939696085e-06, "loss": 0.5571, "step": 2695 }, { "epoch": 2.18, "grad_norm": 0.3424979677342437, "learning_rate": 2.096580970201941e-06, "loss": 0.554, "step": 2700 }, { "epoch": 2.19, "grad_norm": 0.34115765465968273, "learning_rate": 2.077463257025484e-06, "loss": 0.5525, "step": 2705 }, { "epoch": 2.19, "grad_norm": 0.34678996134066686, "learning_rate": 2.058410223265769e-06, "loss": 0.5543, "step": 2710 }, { "epoch": 2.19, "grad_norm": 0.343760449598779, "learning_rate": 2.0394222905904164e-06, "loss": 0.5534, "step": 2715 }, { "epoch": 2.2, "grad_norm": 0.3453513649573003, "learning_rate": 2.02049987922628e-06, "loss": 0.5604, "step": 2720 }, { "epoch": 2.2, "grad_norm": 0.34420412806565154, "learning_rate": 2.001643407950138e-06, "loss": 0.556, "step": 2725 }, { "epoch": 2.21, "grad_norm": 0.3533716475528306, "learning_rate": 1.9828532940794325e-06, "loss": 0.5539, "step": 2730 }, { "epoch": 2.21, "grad_norm": 0.34168747382741826, "learning_rate": 1.9641299534630374e-06, "loss": 0.5553, "step": 2735 }, { "epoch": 2.22, "grad_norm": 0.3445082695825234, "learning_rate": 1.94547380047205e-06, "loss": 0.5526, "step": 2740 }, { "epoch": 2.22, "grad_norm": 0.34090525171984093, "learning_rate": 1.926885247990615e-06, "loss": 0.5603, "step": 2745 }, { "epoch": 2.22, "grad_norm": 0.34431327607616363, "learning_rate": 1.908364707406796e-06, "loss": 0.5525, "step": 2750 }, { "epoch": 2.23, "grad_norm": 0.3388949901820858, "learning_rate": 1.889912588603469e-06, "loss": 0.5577, "step": 2755 }, { "epoch": 2.23, "grad_norm": 0.430154311605853, "learning_rate": 1.8715292999492502e-06, "loss": 0.5583, "step": 2760 }, { "epoch": 2.24, "grad_norm": 0.3446351097283205, "learning_rate": 1.8532152482894595e-06, "loss": 0.5571, "step": 2765 }, { "epoch": 2.24, "grad_norm": 0.3396049538144242, "learning_rate": 1.8349708389371046e-06, "loss": 0.5545, "step": 2770 }, { "epoch": 2.24, "grad_norm": 0.33788340564568603, "learning_rate": 1.8167964756639334e-06, "loss": 0.5596, "step": 2775 }, { "epoch": 2.25, "grad_norm": 0.34448789733899204, "learning_rate": 1.7986925606914806e-06, "loss": 0.5568, "step": 2780 }, { "epoch": 2.25, "grad_norm": 0.3634433196572614, "learning_rate": 1.780659494682175e-06, "loss": 0.5603, "step": 2785 }, { "epoch": 2.26, "grad_norm": 0.3416180300595656, "learning_rate": 1.762697676730462e-06, "loss": 0.5556, "step": 2790 }, { "epoch": 2.26, "grad_norm": 0.34173380544732385, "learning_rate": 1.7448075043539813e-06, "loss": 0.5577, "step": 2795 }, { "epoch": 2.26, "grad_norm": 0.3475479599386037, "learning_rate": 1.7269893734847682e-06, "loss": 0.557, "step": 2800 }, { "epoch": 2.27, "grad_norm": 0.33952460793931827, "learning_rate": 1.709243678460487e-06, "loss": 0.5585, "step": 2805 }, { "epoch": 2.27, "grad_norm": 0.3471196038854188, "learning_rate": 1.6915708120157042e-06, "loss": 0.5521, "step": 2810 }, { "epoch": 2.28, "grad_norm": 0.34051445742252234, "learning_rate": 1.6739711652731977e-06, "loss": 0.559, "step": 2815 }, { "epoch": 2.28, "grad_norm": 0.33990114977244745, "learning_rate": 1.6564451277353045e-06, "loss": 0.556, "step": 2820 }, { "epoch": 2.28, "grad_norm": 0.3459524143137594, "learning_rate": 1.6389930872752968e-06, "loss": 0.56, "step": 2825 }, { "epoch": 2.29, "grad_norm": 0.35047325208139163, "learning_rate": 1.621615430128795e-06, "loss": 0.557, "step": 2830 }, { "epoch": 2.29, "grad_norm": 0.34998166759796095, "learning_rate": 1.6043125408852244e-06, "loss": 0.5578, "step": 2835 }, { "epoch": 2.3, "grad_norm": 0.3452099804209579, "learning_rate": 1.587084802479305e-06, "loss": 0.5546, "step": 2840 }, { "epoch": 2.3, "grad_norm": 0.3361374059295131, "learning_rate": 1.569932596182573e-06, "loss": 0.5578, "step": 2845 }, { "epoch": 2.3, "grad_norm": 0.3394866630196838, "learning_rate": 1.5528563015949421e-06, "loss": 0.5558, "step": 2850 }, { "epoch": 2.31, "grad_norm": 0.3460288355379662, "learning_rate": 1.5358562966363043e-06, "loss": 0.5554, "step": 2855 }, { "epoch": 2.31, "grad_norm": 0.3430825632639992, "learning_rate": 1.518932957538169e-06, "loss": 0.5614, "step": 2860 }, { "epoch": 2.32, "grad_norm": 0.3480557119260175, "learning_rate": 1.5020866588353334e-06, "loss": 0.5574, "step": 2865 }, { "epoch": 2.32, "grad_norm": 0.3457647567827056, "learning_rate": 1.4853177733575891e-06, "loss": 0.557, "step": 2870 }, { "epoch": 2.32, "grad_norm": 0.3522059241658225, "learning_rate": 1.468626672221482e-06, "loss": 0.5559, "step": 2875 }, { "epoch": 2.33, "grad_norm": 0.34152623265158755, "learning_rate": 1.4520137248220866e-06, "loss": 0.5576, "step": 2880 }, { "epoch": 2.33, "grad_norm": 0.3444660292385707, "learning_rate": 1.4354792988248412e-06, "loss": 0.5592, "step": 2885 }, { "epoch": 2.34, "grad_norm": 0.3454474317923543, "learning_rate": 1.419023760157407e-06, "loss": 0.5571, "step": 2890 }, { "epoch": 2.34, "grad_norm": 0.35213099382912866, "learning_rate": 1.402647473001565e-06, "loss": 0.5521, "step": 2895 }, { "epoch": 2.34, "grad_norm": 0.3387093667181156, "learning_rate": 1.3863507997851622e-06, "loss": 0.5621, "step": 2900 }, { "epoch": 2.35, "grad_norm": 0.34122845579922717, "learning_rate": 1.370134101174091e-06, "loss": 0.556, "step": 2905 }, { "epoch": 2.35, "grad_norm": 0.34618486601065424, "learning_rate": 1.3539977360643054e-06, "loss": 0.5604, "step": 2910 }, { "epoch": 2.36, "grad_norm": 0.34158383012624666, "learning_rate": 1.3379420615738736e-06, "loss": 0.5577, "step": 2915 }, { "epoch": 2.36, "grad_norm": 0.3402815483357088, "learning_rate": 1.3219674330350817e-06, "loss": 0.5567, "step": 2920 }, { "epoch": 2.36, "grad_norm": 0.343534392142761, "learning_rate": 1.306074203986567e-06, "loss": 0.5578, "step": 2925 }, { "epoch": 2.37, "grad_norm": 0.3567313409934853, "learning_rate": 1.2902627261654955e-06, "loss": 0.5557, "step": 2930 }, { "epoch": 2.37, "grad_norm": 0.3418954586842408, "learning_rate": 1.2745333494997715e-06, "loss": 0.5522, "step": 2935 }, { "epoch": 2.38, "grad_norm": 0.34732506830017706, "learning_rate": 1.2588864221002978e-06, "loss": 0.5537, "step": 2940 }, { "epoch": 2.38, "grad_norm": 0.34208839621313564, "learning_rate": 1.2433222902532739e-06, "loss": 0.5592, "step": 2945 }, { "epoch": 2.38, "grad_norm": 0.35523668444592976, "learning_rate": 1.2278412984125305e-06, "loss": 0.5585, "step": 2950 }, { "epoch": 2.39, "grad_norm": 0.3545395086528268, "learning_rate": 1.2124437891918995e-06, "loss": 0.5539, "step": 2955 }, { "epoch": 2.39, "grad_norm": 0.34336060641077304, "learning_rate": 1.1971301033576444e-06, "loss": 0.5615, "step": 2960 }, { "epoch": 2.4, "grad_norm": 0.3463503749556936, "learning_rate": 1.1819005798209049e-06, "loss": 0.5596, "step": 2965 }, { "epoch": 2.4, "grad_norm": 0.34888831207357546, "learning_rate": 1.1667555556302097e-06, "loss": 0.5583, "step": 2970 }, { "epoch": 2.41, "grad_norm": 0.3580448101492307, "learning_rate": 1.1516953659640035e-06, "loss": 0.56, "step": 2975 }, { "epoch": 2.41, "grad_norm": 0.3486598552979662, "learning_rate": 1.1367203441232439e-06, "loss": 0.5571, "step": 2980 }, { "epoch": 2.41, "grad_norm": 0.3383834501725202, "learning_rate": 1.1218308215240098e-06, "loss": 0.5573, "step": 2985 }, { "epoch": 2.42, "grad_norm": 0.3398621794036884, "learning_rate": 1.10702712769018e-06, "loss": 0.5536, "step": 2990 }, { "epoch": 2.42, "grad_norm": 0.3353789991021529, "learning_rate": 1.0923095902461333e-06, "loss": 0.5528, "step": 2995 }, { "epoch": 2.43, "grad_norm": 0.3435682137227129, "learning_rate": 1.0776785349094975e-06, "loss": 0.5567, "step": 3000 }, { "epoch": 2.43, "grad_norm": 0.3394100726414769, "learning_rate": 1.0631342854839421e-06, "loss": 0.5568, "step": 3005 }, { "epoch": 2.43, "grad_norm": 0.3384215504986675, "learning_rate": 1.0486771638520148e-06, "loss": 0.5564, "step": 3010 }, { "epoch": 2.44, "grad_norm": 0.34694255326876117, "learning_rate": 1.034307489968016e-06, "loss": 0.5559, "step": 3015 }, { "epoch": 2.44, "grad_norm": 0.36355012117288316, "learning_rate": 1.0200255818509152e-06, "loss": 0.5579, "step": 3020 }, { "epoch": 2.45, "grad_norm": 0.3463139547303209, "learning_rate": 1.005831755577314e-06, "loss": 0.5545, "step": 3025 }, { "epoch": 2.45, "grad_norm": 0.3438729462720352, "learning_rate": 9.917263252744553e-07, "loss": 0.5633, "step": 3030 }, { "epoch": 2.45, "grad_norm": 0.344589211547542, "learning_rate": 9.777096031132671e-07, "loss": 0.5551, "step": 3035 }, { "epoch": 2.46, "grad_norm": 0.35013529843782554, "learning_rate": 9.63781899301452e-07, "loss": 0.5573, "step": 3040 }, { "epoch": 2.46, "grad_norm": 0.34916559574228423, "learning_rate": 9.499435220766284e-07, "loss": 0.5533, "step": 3045 }, { "epoch": 2.47, "grad_norm": 0.3371689136760618, "learning_rate": 9.361947776995001e-07, "loss": 0.5552, "step": 3050 }, { "epoch": 2.47, "grad_norm": 0.3358230675302765, "learning_rate": 9.225359704470888e-07, "loss": 0.5596, "step": 3055 }, { "epoch": 2.47, "grad_norm": 0.33846163130025825, "learning_rate": 9.08967402605988e-07, "loss": 0.5536, "step": 3060 }, { "epoch": 2.48, "grad_norm": 0.34811695999814146, "learning_rate": 8.954893744656862e-07, "loss": 0.5547, "step": 3065 }, { "epoch": 2.48, "grad_norm": 0.36870433147689896, "learning_rate": 8.821021843119087e-07, "loss": 0.5582, "step": 3070 }, { "epoch": 2.49, "grad_norm": 0.34579442267718713, "learning_rate": 8.688061284200266e-07, "loss": 0.5582, "step": 3075 }, { "epoch": 2.49, "grad_norm": 0.3494450357235282, "learning_rate": 8.556015010484892e-07, "loss": 0.5584, "step": 3080 }, { "epoch": 2.49, "grad_norm": 0.34040433321057334, "learning_rate": 8.424885944323247e-07, "loss": 0.558, "step": 3085 }, { "epoch": 2.5, "grad_norm": 0.34284925738168204, "learning_rate": 8.294676987766587e-07, "loss": 0.5532, "step": 3090 }, { "epoch": 2.5, "grad_norm": 0.34400861161854035, "learning_rate": 8.165391022503044e-07, "loss": 0.5611, "step": 3095 }, { "epoch": 2.51, "grad_norm": 0.34316621687703514, "learning_rate": 8.037030909793736e-07, "loss": 0.5592, "step": 3100 }, { "epoch": 2.51, "grad_norm": 0.34104103914983847, "learning_rate": 7.909599490409548e-07, "loss": 0.5584, "step": 3105 }, { "epoch": 2.51, "grad_norm": 0.33057985725640004, "learning_rate": 7.783099584568166e-07, "loss": 0.5545, "step": 3110 }, { "epoch": 2.52, "grad_norm": 0.356251865218884, "learning_rate": 7.657533991871752e-07, "loss": 0.5548, "step": 3115 }, { "epoch": 2.52, "grad_norm": 0.33574233812884285, "learning_rate": 7.532905491244924e-07, "loss": 0.5547, "step": 3120 }, { "epoch": 2.53, "grad_norm": 0.35686238014314037, "learning_rate": 7.409216840873257e-07, "loss": 0.5513, "step": 3125 }, { "epoch": 2.53, "grad_norm": 0.3395528665723872, "learning_rate": 7.286470778142285e-07, "loss": 0.5513, "step": 3130 }, { "epoch": 2.53, "grad_norm": 0.3405566374796781, "learning_rate": 7.164670019576869e-07, "loss": 0.5561, "step": 3135 }, { "epoch": 2.54, "grad_norm": 0.3384032629299572, "learning_rate": 7.043817260781117e-07, "loss": 0.5591, "step": 3140 }, { "epoch": 2.54, "grad_norm": 0.33664121000949465, "learning_rate": 6.923915176378687e-07, "loss": 0.5554, "step": 3145 }, { "epoch": 2.55, "grad_norm": 0.33587913980088074, "learning_rate": 6.804966419953641e-07, "loss": 0.5573, "step": 3150 }, { "epoch": 2.55, "grad_norm": 0.34556927462422465, "learning_rate": 6.686973623991666e-07, "loss": 0.557, "step": 3155 }, { "epoch": 2.55, "grad_norm": 0.3435632683862961, "learning_rate": 6.569939399821878e-07, "loss": 0.5532, "step": 3160 }, { "epoch": 2.56, "grad_norm": 0.34215618775071754, "learning_rate": 6.453866337558939e-07, "loss": 0.5554, "step": 3165 }, { "epoch": 2.56, "grad_norm": 0.3421385139751989, "learning_rate": 6.338757006045854e-07, "loss": 0.5536, "step": 3170 }, { "epoch": 2.57, "grad_norm": 0.3454745293035003, "learning_rate": 6.224613952796982e-07, "loss": 0.5581, "step": 3175 }, { "epoch": 2.57, "grad_norm": 0.3427552719692443, "learning_rate": 6.111439703941796e-07, "loss": 0.5594, "step": 3180 }, { "epoch": 2.57, "grad_norm": 0.34586667334685683, "learning_rate": 5.999236764168854e-07, "loss": 0.5554, "step": 3185 }, { "epoch": 2.58, "grad_norm": 0.34668553378422917, "learning_rate": 5.888007616670461e-07, "loss": 0.5529, "step": 3190 }, { "epoch": 2.58, "grad_norm": 0.343774445414697, "learning_rate": 5.777754723087642e-07, "loss": 0.5582, "step": 3195 }, { "epoch": 2.59, "grad_norm": 0.3432260118261018, "learning_rate": 5.668480523455721e-07, "loss": 0.5561, "step": 3200 }, { "epoch": 2.59, "grad_norm": 0.3515941855095276, "learning_rate": 5.560187436150266e-07, "loss": 0.5536, "step": 3205 }, { "epoch": 2.59, "grad_norm": 0.3407117980465443, "learning_rate": 5.452877857833611e-07, "loss": 0.556, "step": 3210 }, { "epoch": 2.6, "grad_norm": 0.34247668381990537, "learning_rate": 5.346554163401818e-07, "loss": 0.5575, "step": 3215 }, { "epoch": 2.6, "grad_norm": 0.344764240740865, "learning_rate": 5.241218705932056e-07, "loss": 0.5558, "step": 3220 }, { "epoch": 2.61, "grad_norm": 0.3408171505462731, "learning_rate": 5.136873816630572e-07, "loss": 0.5605, "step": 3225 }, { "epoch": 2.61, "grad_norm": 0.3392972680865765, "learning_rate": 5.033521804781127e-07, "loss": 0.5598, "step": 3230 }, { "epoch": 2.62, "grad_norm": 0.3381390765650064, "learning_rate": 4.931164957693834e-07, "loss": 0.556, "step": 3235 }, { "epoch": 2.62, "grad_norm": 0.3533527202556582, "learning_rate": 4.829805540654547e-07, "loss": 0.5604, "step": 3240 }, { "epoch": 2.62, "grad_norm": 0.333814210655634, "learning_rate": 4.7294457968747543e-07, "loss": 0.5532, "step": 3245 }, { "epoch": 2.63, "grad_norm": 0.33568380345102444, "learning_rate": 4.630087947441897e-07, "loss": 0.5579, "step": 3250 }, { "epoch": 2.63, "grad_norm": 0.3322414039450696, "learning_rate": 4.531734191270265e-07, "loss": 0.554, "step": 3255 }, { "epoch": 2.64, "grad_norm": 0.34231721505843576, "learning_rate": 4.434386705052257e-07, "loss": 0.5563, "step": 3260 }, { "epoch": 2.64, "grad_norm": 0.3404909318925554, "learning_rate": 4.3380476432103025e-07, "loss": 0.5581, "step": 3265 }, { "epoch": 2.64, "grad_norm": 0.34598970601245227, "learning_rate": 4.242719137849077e-07, "loss": 0.5549, "step": 3270 }, { "epoch": 2.65, "grad_norm": 0.34976631382216145, "learning_rate": 4.148403298708409e-07, "loss": 0.56, "step": 3275 }, { "epoch": 2.65, "grad_norm": 0.3427643150748611, "learning_rate": 4.0551022131165353e-07, "loss": 0.559, "step": 3280 }, { "epoch": 2.66, "grad_norm": 0.34358276195475335, "learning_rate": 3.9628179459439197e-07, "loss": 0.5539, "step": 3285 }, { "epoch": 2.66, "grad_norm": 0.35131908179397314, "learning_rate": 3.8715525395575436e-07, "loss": 0.5599, "step": 3290 }, { "epoch": 2.66, "grad_norm": 0.33820325846808746, "learning_rate": 3.781308013775731e-07, "loss": 0.5558, "step": 3295 }, { "epoch": 2.67, "grad_norm": 0.34492328573913705, "learning_rate": 3.6920863658234387e-07, "loss": 0.5579, "step": 3300 }, { "epoch": 2.67, "grad_norm": 0.35281788506279804, "learning_rate": 3.603889570288033e-07, "loss": 0.5556, "step": 3305 }, { "epoch": 2.68, "grad_norm": 0.3343498533553313, "learning_rate": 3.516719579075606e-07, "loss": 0.5534, "step": 3310 }, { "epoch": 2.68, "grad_norm": 0.3469976919703614, "learning_rate": 3.430578321367789e-07, "loss": 0.5546, "step": 3315 }, { "epoch": 2.68, "grad_norm": 0.33828468783453963, "learning_rate": 3.3454677035790527e-07, "loss": 0.5565, "step": 3320 }, { "epoch": 2.69, "grad_norm": 0.34099732440416464, "learning_rate": 3.2613896093144917e-07, "loss": 0.5553, "step": 3325 }, { "epoch": 2.69, "grad_norm": 0.3391614092323769, "learning_rate": 3.17834589932815e-07, "loss": 0.5525, "step": 3330 }, { "epoch": 2.7, "grad_norm": 0.33958333771527327, "learning_rate": 3.0963384114818694e-07, "loss": 0.5531, "step": 3335 }, { "epoch": 2.7, "grad_norm": 0.3398181519052286, "learning_rate": 3.015368960704584e-07, "loss": 0.5596, "step": 3340 }, { "epoch": 2.7, "grad_norm": 0.3386688536540067, "learning_rate": 2.935439338952151e-07, "loss": 0.5542, "step": 3345 }, { "epoch": 2.71, "grad_norm": 0.3382118834844372, "learning_rate": 2.8565513151676993e-07, "loss": 0.5578, "step": 3350 }, { "epoch": 2.71, "grad_norm": 0.3419421307893813, "learning_rate": 2.778706635242495e-07, "loss": 0.5574, "step": 3355 }, { "epoch": 2.72, "grad_norm": 0.3395829767909498, "learning_rate": 2.701907021977296e-07, "loss": 0.5569, "step": 3360 }, { "epoch": 2.72, "grad_norm": 0.3399096680727189, "learning_rate": 2.626154175044221e-07, "loss": 0.5494, "step": 3365 }, { "epoch": 2.72, "grad_norm": 0.3490275432220527, "learning_rate": 2.5514497709491046e-07, "loss": 0.5549, "step": 3370 }, { "epoch": 2.73, "grad_norm": 0.34251311305373416, "learning_rate": 2.477795462994448e-07, "loss": 0.5546, "step": 3375 }, { "epoch": 2.73, "grad_norm": 0.337453106969371, "learning_rate": 2.4051928812427905e-07, "loss": 0.557, "step": 3380 }, { "epoch": 2.74, "grad_norm": 0.34052922835414007, "learning_rate": 2.3336436324806643e-07, "loss": 0.5547, "step": 3385 }, { "epoch": 2.74, "grad_norm": 0.3425405185867766, "learning_rate": 2.2631493001829984e-07, "loss": 0.5582, "step": 3390 }, { "epoch": 2.74, "grad_norm": 0.3454134816055132, "learning_rate": 2.1937114444780862e-07, "loss": 0.5551, "step": 3395 }, { "epoch": 2.75, "grad_norm": 0.3343384917048216, "learning_rate": 2.1253316021130898e-07, "loss": 0.5534, "step": 3400 }, { "epoch": 2.75, "grad_norm": 0.35023496121275305, "learning_rate": 2.058011286419992e-07, "loss": 0.5583, "step": 3405 }, { "epoch": 2.76, "grad_norm": 0.36302594050761905, "learning_rate": 1.9917519872821144e-07, "loss": 0.5537, "step": 3410 }, { "epoch": 2.76, "grad_norm": 0.3346636625411215, "learning_rate": 1.9265551711011354e-07, "loss": 0.5517, "step": 3415 }, { "epoch": 2.76, "grad_norm": 0.34193638087841716, "learning_rate": 1.8624222807646753e-07, "loss": 0.5591, "step": 3420 }, { "epoch": 2.77, "grad_norm": 0.3476844067729131, "learning_rate": 1.799354735614317e-07, "loss": 0.5607, "step": 3425 }, { "epoch": 2.77, "grad_norm": 0.3434840820332503, "learning_rate": 1.7373539314142206e-07, "loss": 0.5589, "step": 3430 }, { "epoch": 2.78, "grad_norm": 0.35253197013029536, "learning_rate": 1.6764212403202196e-07, "loss": 0.555, "step": 3435 }, { "epoch": 2.78, "grad_norm": 0.34004696151114483, "learning_rate": 1.616558010849467e-07, "loss": 0.5615, "step": 3440 }, { "epoch": 2.78, "grad_norm": 0.3366256242285153, "learning_rate": 1.5577655678505775e-07, "loss": 0.5527, "step": 3445 }, { "epoch": 2.79, "grad_norm": 0.35173071671183204, "learning_rate": 1.5000452124743326e-07, "loss": 0.5583, "step": 3450 }, { "epoch": 2.79, "grad_norm": 0.3343098566181465, "learning_rate": 1.443398222144826e-07, "loss": 0.5584, "step": 3455 }, { "epoch": 2.8, "grad_norm": 0.34420049500378824, "learning_rate": 1.3878258505312713e-07, "loss": 0.5506, "step": 3460 }, { "epoch": 2.8, "grad_norm": 0.33484504552090993, "learning_rate": 1.3333293275201942e-07, "loss": 0.5548, "step": 3465 }, { "epoch": 2.81, "grad_norm": 0.3379423999263128, "learning_rate": 1.2799098591882608e-07, "loss": 0.5527, "step": 3470 }, { "epoch": 2.81, "grad_norm": 0.3389154778216677, "learning_rate": 1.2275686277755218e-07, "loss": 0.5562, "step": 3475 }, { "epoch": 2.81, "grad_norm": 0.340527497432122, "learning_rate": 1.1763067916593263e-07, "loss": 0.5535, "step": 3480 }, { "epoch": 2.82, "grad_norm": 0.34059297753385975, "learning_rate": 1.1261254853286163e-07, "loss": 0.5523, "step": 3485 }, { "epoch": 2.82, "grad_norm": 0.35042097684886725, "learning_rate": 1.0770258193588845e-07, "loss": 0.5537, "step": 3490 }, { "epoch": 2.83, "grad_norm": 0.3403970788880998, "learning_rate": 1.0290088803875331e-07, "loss": 0.5544, "step": 3495 }, { "epoch": 2.83, "grad_norm": 0.33484805592712125, "learning_rate": 9.820757310898487e-08, "loss": 0.5574, "step": 3500 }, { "epoch": 2.83, "grad_norm": 0.3385912304132044, "learning_rate": 9.362274101555158e-08, "loss": 0.5505, "step": 3505 }, { "epoch": 2.84, "grad_norm": 0.3343642719885355, "learning_rate": 8.914649322655844e-08, "loss": 0.5563, "step": 3510 }, { "epoch": 2.84, "grad_norm": 0.3358606879853761, "learning_rate": 8.477892880700222e-08, "loss": 0.557, "step": 3515 }, { "epoch": 2.85, "grad_norm": 0.3378625790646625, "learning_rate": 8.052014441658207e-08, "loss": 0.5567, "step": 3520 }, { "epoch": 2.85, "grad_norm": 0.3400058168012056, "learning_rate": 7.637023430755674e-08, "loss": 0.5525, "step": 3525 }, { "epoch": 2.85, "grad_norm": 0.34364288579946634, "learning_rate": 7.232929032266078e-08, "loss": 0.5505, "step": 3530 }, { "epoch": 2.86, "grad_norm": 0.33179866684951853, "learning_rate": 6.839740189307054e-08, "loss": 0.5557, "step": 3535 }, { "epoch": 2.86, "grad_norm": 0.3408174592385495, "learning_rate": 6.457465603642577e-08, "loss": 0.5541, "step": 3540 }, { "epoch": 2.87, "grad_norm": 0.3442799882786652, "learning_rate": 6.086113735490507e-08, "loss": 0.554, "step": 3545 }, { "epoch": 2.87, "grad_norm": 0.33745832060253145, "learning_rate": 5.725692803335015e-08, "loss": 0.5524, "step": 3550 }, { "epoch": 2.87, "grad_norm": 0.34048908508942255, "learning_rate": 5.3762107837450615e-08, "loss": 0.5562, "step": 3555 }, { "epoch": 2.88, "grad_norm": 0.3395168628265465, "learning_rate": 5.037675411197596e-08, "loss": 0.5484, "step": 3560 }, { "epoch": 2.88, "grad_norm": 0.34171835512514714, "learning_rate": 4.710094177906466e-08, "loss": 0.5529, "step": 3565 }, { "epoch": 2.89, "grad_norm": 0.34913815959202166, "learning_rate": 4.393474333656833e-08, "loss": 0.5533, "step": 3570 }, { "epoch": 2.89, "grad_norm": 0.33910739884596597, "learning_rate": 4.087822885644299e-08, "loss": 0.5603, "step": 3575 }, { "epoch": 2.89, "grad_norm": 0.33708441603957684, "learning_rate": 3.793146598320141e-08, "loss": 0.5584, "step": 3580 }, { "epoch": 2.9, "grad_norm": 0.3406104772777445, "learning_rate": 3.5094519932415417e-08, "loss": 0.5543, "step": 3585 }, { "epoch": 2.9, "grad_norm": 0.338980369241015, "learning_rate": 3.2367453489271506e-08, "loss": 0.5488, "step": 3590 }, { "epoch": 2.91, "grad_norm": 0.3366257356086689, "learning_rate": 2.9750327007183055e-08, "loss": 0.5527, "step": 3595 }, { "epoch": 2.91, "grad_norm": 0.3400293156935555, "learning_rate": 2.7243198406453062e-08, "loss": 0.5533, "step": 3600 }, { "epoch": 2.91, "grad_norm": 0.33666291954633937, "learning_rate": 2.4846123172992953e-08, "loss": 0.5584, "step": 3605 }, { "epoch": 2.92, "grad_norm": 0.33563746505268, "learning_rate": 2.2559154357095214e-08, "loss": 0.5544, "step": 3610 }, { "epoch": 2.92, "grad_norm": 0.3313188462749996, "learning_rate": 2.0382342572258794e-08, "loss": 0.556, "step": 3615 }, { "epoch": 2.93, "grad_norm": 0.349977599020757, "learning_rate": 1.831573599406833e-08, "loss": 0.5523, "step": 3620 }, { "epoch": 2.93, "grad_norm": 0.34807560788160136, "learning_rate": 1.6359380359127762e-08, "loss": 0.5597, "step": 3625 }, { "epoch": 2.93, "grad_norm": 0.34050148486843906, "learning_rate": 1.451331896405006e-08, "loss": 0.5538, "step": 3630 }, { "epoch": 2.94, "grad_norm": 0.34313816730753993, "learning_rate": 1.2777592664498518e-08, "loss": 0.5527, "step": 3635 }, { "epoch": 2.94, "grad_norm": 0.3589993075810099, "learning_rate": 1.1152239874280268e-08, "loss": 0.5562, "step": 3640 }, { "epoch": 2.95, "grad_norm": 0.3411178283487525, "learning_rate": 9.637296564498077e-09, "loss": 0.5578, "step": 3645 }, { "epoch": 2.95, "grad_norm": 0.3439244716473971, "learning_rate": 8.232796262754305e-09, "loss": 0.557, "step": 3650 }, { "epoch": 2.95, "grad_norm": 0.3469533846283362, "learning_rate": 6.938770052409283e-09, "loss": 0.5567, "step": 3655 }, { "epoch": 2.96, "grad_norm": 0.33946222773826845, "learning_rate": 5.755246571891304e-09, "loss": 0.5546, "step": 3660 }, { "epoch": 2.96, "grad_norm": 0.35727978537637317, "learning_rate": 4.682252014064359e-09, "loss": 0.5609, "step": 3665 }, { "epoch": 2.97, "grad_norm": 0.34048777277431036, "learning_rate": 3.719810125649148e-09, "loss": 0.5542, "step": 3670 }, { "epoch": 2.97, "grad_norm": 0.34901208333309325, "learning_rate": 2.8679422066957287e-09, "loss": 0.5537, "step": 3675 }, { "epoch": 2.97, "grad_norm": 0.3383103808244704, "learning_rate": 2.1266671101138893e-09, "loss": 0.5527, "step": 3680 }, { "epoch": 2.98, "grad_norm": 0.33418287902586197, "learning_rate": 1.4960012412540415e-09, "loss": 0.5514, "step": 3685 }, { "epoch": 2.98, "grad_norm": 0.34040365367011716, "learning_rate": 9.75958557545842e-10, "loss": 0.5539, "step": 3690 }, { "epoch": 2.99, "grad_norm": 0.348264157643715, "learning_rate": 5.665505681884397e-10, "loss": 0.5522, "step": 3695 }, { "epoch": 2.99, "grad_norm": 0.34829122060086015, "learning_rate": 2.677863338962361e-10, "loss": 0.556, "step": 3700 }, { "epoch": 3.0, "grad_norm": 0.34109321924016456, "learning_rate": 7.967246669737804e-11, "loss": 0.5583, "step": 3705 }, { "epoch": 3.0, "grad_norm": 0.3367102943680112, "learning_rate": 2.213129789430113e-12, "loss": 0.5584, "step": 3710 }, { "epoch": 3.0, "eval_loss": 0.6255540251731873, "eval_runtime": 172.7575, "eval_samples_per_second": 48.253, "eval_steps_per_second": 0.758, "step": 3711 }, { "epoch": 3.0, "step": 3711, "total_flos": 1554015804456960.0, "train_loss": 0.6186258538300636, "train_runtime": 37714.0323, "train_samples_per_second": 12.595, "train_steps_per_second": 0.098 } ], "logging_steps": 5, "max_steps": 3711, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "total_flos": 1554015804456960.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }