{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0005717552887363, "eval_steps": 500, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001429388221841052, "grad_norm": 8.302410125732422, "learning_rate": 9.999951636252794e-07, "loss": 1.0271, "step": 100 }, { "epoch": 0.002858776443682104, "grad_norm": 7.64557409286499, "learning_rate": 9.999802577787943e-07, "loss": 0.8182, "step": 200 }, { "epoch": 0.004288164665523156, "grad_norm": 5.914328098297119, "learning_rate": 9.999552807464755e-07, "loss": 0.7166, "step": 300 }, { "epoch": 0.005717552887364208, "grad_norm": 7.1736040115356445, "learning_rate": 9.99920233031439e-07, "loss": 0.5976, "step": 400 }, { "epoch": 0.00714694110920526, "grad_norm": 5.904966831207275, "learning_rate": 9.998751153396575e-07, "loss": 0.5194, "step": 500 }, { "epoch": 0.008576329331046312, "grad_norm": 10.023078918457031, "learning_rate": 9.998199285799438e-07, "loss": 0.4733, "step": 600 }, { "epoch": 0.010005717552887363, "grad_norm": 6.058465480804443, "learning_rate": 9.99754673863934e-07, "loss": 0.4894, "step": 700 }, { "epoch": 0.011435105774728416, "grad_norm": 7.328370571136475, "learning_rate": 9.99679352506065e-07, "loss": 0.4437, "step": 800 }, { "epoch": 0.012864493996569469, "grad_norm": 7.761216163635254, "learning_rate": 9.995939660235472e-07, "loss": 0.4611, "step": 900 }, { "epoch": 0.01429388221841052, "grad_norm": 6.76261568069458, "learning_rate": 9.994985161363347e-07, "loss": 0.4263, "step": 1000 }, { "epoch": 0.015723270440251572, "grad_norm": 6.114264488220215, "learning_rate": 9.993930047670912e-07, "loss": 0.442, "step": 1100 }, { "epoch": 0.017152658662092625, "grad_norm": 7.696556568145752, "learning_rate": 9.992774340411489e-07, "loss": 0.4384, "step": 1200 }, { "epoch": 0.018582046883933678, "grad_norm": 6.787875175476074, "learning_rate": 9.99151806286469e-07, "loss": 0.4433, "step": 1300 }, { "epoch": 0.020011435105774727, "grad_norm": 6.84804105758667, "learning_rate": 9.99016124033592e-07, "loss": 0.4036, "step": 1400 }, { "epoch": 0.02144082332761578, "grad_norm": 7.188107490539551, "learning_rate": 9.98870390015588e-07, "loss": 0.3959, "step": 1500 }, { "epoch": 0.022870211549456832, "grad_norm": 8.187899589538574, "learning_rate": 9.98714607168002e-07, "loss": 0.379, "step": 1600 }, { "epoch": 0.024299599771297885, "grad_norm": 7.0130720138549805, "learning_rate": 9.985487786287938e-07, "loss": 0.41, "step": 1700 }, { "epoch": 0.025728987993138937, "grad_norm": 8.572831153869629, "learning_rate": 9.983729077382755e-07, "loss": 0.4145, "step": 1800 }, { "epoch": 0.02715837621497999, "grad_norm": 7.464225769042969, "learning_rate": 9.981869980390434e-07, "loss": 0.4029, "step": 1900 }, { "epoch": 0.02858776443682104, "grad_norm": 8.06611442565918, "learning_rate": 9.979910532759081e-07, "loss": 0.4068, "step": 2000 }, { "epoch": 0.030017152658662092, "grad_norm": 6.385063648223877, "learning_rate": 9.977850773958174e-07, "loss": 0.4073, "step": 2100 }, { "epoch": 0.031446540880503145, "grad_norm": 10.634902954101562, "learning_rate": 9.97569074547778e-07, "loss": 0.3615, "step": 2200 }, { "epoch": 0.032875929102344194, "grad_norm": 7.179512977600098, "learning_rate": 9.973430490827712e-07, "loss": 0.3773, "step": 2300 }, { "epoch": 0.03430531732418525, "grad_norm": 8.605831146240234, "learning_rate": 9.971070055536657e-07, "loss": 0.3794, "step": 2400 }, { "epoch": 0.0357347055460263, "grad_norm": 7.761691570281982, "learning_rate": 9.96860948715126e-07, "loss": 0.388, "step": 2500 }, { "epoch": 0.037164093767867355, "grad_norm": 8.976919174194336, "learning_rate": 9.966048835235156e-07, "loss": 0.3989, "step": 2600 }, { "epoch": 0.038593481989708404, "grad_norm": 7.322809219360352, "learning_rate": 9.963388151367988e-07, "loss": 0.3864, "step": 2700 }, { "epoch": 0.040022870211549454, "grad_norm": 8.886445999145508, "learning_rate": 9.960627489144356e-07, "loss": 0.3789, "step": 2800 }, { "epoch": 0.04145225843339051, "grad_norm": 13.177335739135742, "learning_rate": 9.957766904172738e-07, "loss": 0.3852, "step": 2900 }, { "epoch": 0.04288164665523156, "grad_norm": 10.54977798461914, "learning_rate": 9.954806454074375e-07, "loss": 0.3928, "step": 3000 }, { "epoch": 0.044311034877072615, "grad_norm": 9.929242134094238, "learning_rate": 9.951746198482111e-07, "loss": 0.3845, "step": 3100 }, { "epoch": 0.045740423098913664, "grad_norm": 11.030978202819824, "learning_rate": 9.948586199039181e-07, "loss": 0.3872, "step": 3200 }, { "epoch": 0.04716981132075472, "grad_norm": 6.923922538757324, "learning_rate": 9.945326519397982e-07, "loss": 0.3954, "step": 3300 }, { "epoch": 0.04859919954259577, "grad_norm": 6.286141872406006, "learning_rate": 9.94196722521879e-07, "loss": 0.3654, "step": 3400 }, { "epoch": 0.05002858776443682, "grad_norm": 8.558923721313477, "learning_rate": 9.938508384168422e-07, "loss": 0.3664, "step": 3500 }, { "epoch": 0.051457975986277875, "grad_norm": 8.39925479888916, "learning_rate": 9.934950065918897e-07, "loss": 0.386, "step": 3600 }, { "epoch": 0.052887364208118924, "grad_norm": 7.465270519256592, "learning_rate": 9.93129234214601e-07, "loss": 0.3662, "step": 3700 }, { "epoch": 0.05431675242995998, "grad_norm": 7.879831790924072, "learning_rate": 9.927535286527903e-07, "loss": 0.3827, "step": 3800 }, { "epoch": 0.05574614065180103, "grad_norm": 11.445666313171387, "learning_rate": 9.923678974743576e-07, "loss": 0.385, "step": 3900 }, { "epoch": 0.05717552887364208, "grad_norm": 10.145365715026855, "learning_rate": 9.919723484471358e-07, "loss": 0.3854, "step": 4000 }, { "epoch": 0.058604917095483135, "grad_norm": 9.10260009765625, "learning_rate": 9.915668895387354e-07, "loss": 0.3861, "step": 4100 }, { "epoch": 0.060034305317324184, "grad_norm": 9.712854385375977, "learning_rate": 9.911515289163828e-07, "loss": 0.3807, "step": 4200 }, { "epoch": 0.06146369353916524, "grad_norm": 7.4025444984436035, "learning_rate": 9.907262749467562e-07, "loss": 0.3873, "step": 4300 }, { "epoch": 0.06289308176100629, "grad_norm": 12.814791679382324, "learning_rate": 9.902911361958174e-07, "loss": 0.3777, "step": 4400 }, { "epoch": 0.06432246998284734, "grad_norm": 8.49919605255127, "learning_rate": 9.89846121428639e-07, "loss": 0.3649, "step": 4500 }, { "epoch": 0.06575185820468839, "grad_norm": 11.82237720489502, "learning_rate": 9.89391239609228e-07, "loss": 0.3775, "step": 4600 }, { "epoch": 0.06718124642652945, "grad_norm": 10.297654151916504, "learning_rate": 9.889264999003442e-07, "loss": 0.3878, "step": 4700 }, { "epoch": 0.0686106346483705, "grad_norm": 7.117993354797363, "learning_rate": 9.88451911663318e-07, "loss": 0.3673, "step": 4800 }, { "epoch": 0.07004002287021155, "grad_norm": 8.604228019714355, "learning_rate": 9.879674844578587e-07, "loss": 0.3651, "step": 4900 }, { "epoch": 0.0714694110920526, "grad_norm": 10.55291748046875, "learning_rate": 9.874732280418652e-07, "loss": 0.3869, "step": 5000 }, { "epoch": 0.07289879931389365, "grad_norm": 9.544646263122559, "learning_rate": 9.869691523712265e-07, "loss": 0.3614, "step": 5100 }, { "epoch": 0.07432818753573471, "grad_norm": 8.484085083007812, "learning_rate": 9.864552675996231e-07, "loss": 0.384, "step": 5200 }, { "epoch": 0.07575757575757576, "grad_norm": 11.220099449157715, "learning_rate": 9.859315840783217e-07, "loss": 0.381, "step": 5300 }, { "epoch": 0.07718696397941681, "grad_norm": 8.623137474060059, "learning_rate": 9.853981123559673e-07, "loss": 0.3758, "step": 5400 }, { "epoch": 0.07861635220125786, "grad_norm": 14.061697959899902, "learning_rate": 9.848548631783698e-07, "loss": 0.3679, "step": 5500 }, { "epoch": 0.08004574042309891, "grad_norm": 8.865405082702637, "learning_rate": 9.843018474882879e-07, "loss": 0.3478, "step": 5600 }, { "epoch": 0.08147512864493997, "grad_norm": 11.747910499572754, "learning_rate": 9.83739076425209e-07, "loss": 0.3784, "step": 5700 }, { "epoch": 0.08290451686678102, "grad_norm": 11.385608673095703, "learning_rate": 9.831665613251246e-07, "loss": 0.3656, "step": 5800 }, { "epoch": 0.08433390508862207, "grad_norm": 10.687173843383789, "learning_rate": 9.825843137203023e-07, "loss": 0.3913, "step": 5900 }, { "epoch": 0.08576329331046312, "grad_norm": 9.918437004089355, "learning_rate": 9.819923453390524e-07, "loss": 0.349, "step": 6000 }, { "epoch": 0.08719268153230417, "grad_norm": 10.63182544708252, "learning_rate": 9.813906681054932e-07, "loss": 0.3642, "step": 6100 }, { "epoch": 0.08862206975414523, "grad_norm": 14.960465431213379, "learning_rate": 9.807792941393097e-07, "loss": 0.3558, "step": 6200 }, { "epoch": 0.09005145797598628, "grad_norm": 10.128252983093262, "learning_rate": 9.801582357555093e-07, "loss": 0.3721, "step": 6300 }, { "epoch": 0.09148084619782733, "grad_norm": 11.612500190734863, "learning_rate": 9.795275054641756e-07, "loss": 0.3485, "step": 6400 }, { "epoch": 0.09291023441966838, "grad_norm": 10.803449630737305, "learning_rate": 9.788871159702134e-07, "loss": 0.3394, "step": 6500 }, { "epoch": 0.09433962264150944, "grad_norm": 9.586999893188477, "learning_rate": 9.782370801730959e-07, "loss": 0.3744, "step": 6600 }, { "epoch": 0.09576901086335049, "grad_norm": 13.605101585388184, "learning_rate": 9.775774111666021e-07, "loss": 0.3668, "step": 6700 }, { "epoch": 0.09719839908519154, "grad_norm": 9.152237892150879, "learning_rate": 9.769081222385555e-07, "loss": 0.3572, "step": 6800 }, { "epoch": 0.09862778730703259, "grad_norm": 12.100299835205078, "learning_rate": 9.762292268705544e-07, "loss": 0.3528, "step": 6900 }, { "epoch": 0.10005717552887364, "grad_norm": 11.875809669494629, "learning_rate": 9.755407387377017e-07, "loss": 0.3375, "step": 7000 }, { "epoch": 0.1014865637507147, "grad_norm": 11.866809844970703, "learning_rate": 9.748426717083286e-07, "loss": 0.3797, "step": 7100 }, { "epoch": 0.10291595197255575, "grad_norm": 9.397913932800293, "learning_rate": 9.741350398437165e-07, "loss": 0.3698, "step": 7200 }, { "epoch": 0.1043453401943968, "grad_norm": 10.366521835327148, "learning_rate": 9.734178573978115e-07, "loss": 0.3367, "step": 7300 }, { "epoch": 0.10577472841623785, "grad_norm": 9.068425178527832, "learning_rate": 9.7269113881694e-07, "loss": 0.3704, "step": 7400 }, { "epoch": 0.1072041166380789, "grad_norm": 10.455464363098145, "learning_rate": 9.719548987395153e-07, "loss": 0.3431, "step": 7500 }, { "epoch": 0.10863350485991996, "grad_norm": 9.690875053405762, "learning_rate": 9.712091519957444e-07, "loss": 0.34, "step": 7600 }, { "epoch": 0.11006289308176101, "grad_norm": 10.648077011108398, "learning_rate": 9.704539136073284e-07, "loss": 0.344, "step": 7700 }, { "epoch": 0.11149228130360206, "grad_norm": 9.973433494567871, "learning_rate": 9.696891987871604e-07, "loss": 0.3276, "step": 7800 }, { "epoch": 0.11292166952544311, "grad_norm": 12.676753997802734, "learning_rate": 9.68915022939019e-07, "loss": 0.3452, "step": 7900 }, { "epoch": 0.11435105774728416, "grad_norm": 10.605828285217285, "learning_rate": 9.68131401657257e-07, "loss": 0.3478, "step": 8000 }, { "epoch": 0.11578044596912522, "grad_norm": 10.663880348205566, "learning_rate": 9.673383507264894e-07, "loss": 0.3465, "step": 8100 }, { "epoch": 0.11720983419096627, "grad_norm": 10.765434265136719, "learning_rate": 9.665358861212728e-07, "loss": 0.3666, "step": 8200 }, { "epoch": 0.11863922241280732, "grad_norm": 12.318456649780273, "learning_rate": 9.657240240057865e-07, "loss": 0.3401, "step": 8300 }, { "epoch": 0.12006861063464837, "grad_norm": 11.024144172668457, "learning_rate": 9.649027807335043e-07, "loss": 0.372, "step": 8400 }, { "epoch": 0.12149799885648942, "grad_norm": 12.339703559875488, "learning_rate": 9.640721728468666e-07, "loss": 0.3607, "step": 8500 }, { "epoch": 0.12292738707833048, "grad_norm": 11.234156608581543, "learning_rate": 9.632322170769468e-07, "loss": 0.3486, "step": 8600 }, { "epoch": 0.12435677530017153, "grad_norm": 12.952394485473633, "learning_rate": 9.623829303431144e-07, "loss": 0.3395, "step": 8700 }, { "epoch": 0.12578616352201258, "grad_norm": 10.056218147277832, "learning_rate": 9.615243297526939e-07, "loss": 0.3338, "step": 8800 }, { "epoch": 0.12721555174385363, "grad_norm": 14.559000015258789, "learning_rate": 9.606564326006202e-07, "loss": 0.3396, "step": 8900 }, { "epoch": 0.12864493996569468, "grad_norm": 11.926137924194336, "learning_rate": 9.597792563690906e-07, "loss": 0.3471, "step": 9000 }, { "epoch": 0.13007432818753573, "grad_norm": 10.330910682678223, "learning_rate": 9.588928187272126e-07, "loss": 0.3359, "step": 9100 }, { "epoch": 0.13150371640937678, "grad_norm": 12.393570899963379, "learning_rate": 9.579971375306476e-07, "loss": 0.3414, "step": 9200 }, { "epoch": 0.13293310463121785, "grad_norm": 15.440378189086914, "learning_rate": 9.570922308212513e-07, "loss": 0.3426, "step": 9300 }, { "epoch": 0.1343624928530589, "grad_norm": 11.083294868469238, "learning_rate": 9.561781168267108e-07, "loss": 0.3393, "step": 9400 }, { "epoch": 0.13579188107489995, "grad_norm": 10.22133731842041, "learning_rate": 9.552548139601774e-07, "loss": 0.3652, "step": 9500 }, { "epoch": 0.137221269296741, "grad_norm": 14.368943214416504, "learning_rate": 9.543223408198943e-07, "loss": 0.3251, "step": 9600 }, { "epoch": 0.13865065751858205, "grad_norm": 14.082287788391113, "learning_rate": 9.533807161888241e-07, "loss": 0.3319, "step": 9700 }, { "epoch": 0.1400800457404231, "grad_norm": 13.45727825164795, "learning_rate": 9.524299590342689e-07, "loss": 0.3418, "step": 9800 }, { "epoch": 0.14150943396226415, "grad_norm": 12.652352333068848, "learning_rate": 9.514700885074887e-07, "loss": 0.3363, "step": 9900 }, { "epoch": 0.1429388221841052, "grad_norm": 9.684586524963379, "learning_rate": 9.505011239433159e-07, "loss": 0.357, "step": 10000 }, { "epoch": 0.14436821040594625, "grad_norm": 11.357329368591309, "learning_rate": 9.495230848597657e-07, "loss": 0.3519, "step": 10100 }, { "epoch": 0.1457975986277873, "grad_norm": 11.410923957824707, "learning_rate": 9.485359909576426e-07, "loss": 0.3571, "step": 10200 }, { "epoch": 0.14722698684962837, "grad_norm": 9.20337200164795, "learning_rate": 9.475398621201439e-07, "loss": 0.3252, "step": 10300 }, { "epoch": 0.14865637507146942, "grad_norm": 11.669153213500977, "learning_rate": 9.465347184124592e-07, "loss": 0.3624, "step": 10400 }, { "epoch": 0.15008576329331047, "grad_norm": 11.799446105957031, "learning_rate": 9.455205800813659e-07, "loss": 0.3272, "step": 10500 }, { "epoch": 0.15151515151515152, "grad_norm": 8.722610473632812, "learning_rate": 9.444974675548221e-07, "loss": 0.3437, "step": 10600 }, { "epoch": 0.15294453973699257, "grad_norm": 9.727828979492188, "learning_rate": 9.434654014415539e-07, "loss": 0.3607, "step": 10700 }, { "epoch": 0.15437392795883362, "grad_norm": 9.897192001342773, "learning_rate": 9.424244025306417e-07, "loss": 0.3187, "step": 10800 }, { "epoch": 0.15580331618067467, "grad_norm": 12.308704376220703, "learning_rate": 9.413744917911002e-07, "loss": 0.3315, "step": 10900 }, { "epoch": 0.15723270440251572, "grad_norm": 9.758069038391113, "learning_rate": 9.40315690371457e-07, "loss": 0.3198, "step": 11000 }, { "epoch": 0.15866209262435677, "grad_norm": 17.11725425720215, "learning_rate": 9.392480195993256e-07, "loss": 0.3403, "step": 11100 }, { "epoch": 0.16009148084619781, "grad_norm": 11.781182289123535, "learning_rate": 9.381715009809769e-07, "loss": 0.3617, "step": 11200 }, { "epoch": 0.1615208690680389, "grad_norm": 10.246782302856445, "learning_rate": 9.370861562009054e-07, "loss": 0.3403, "step": 11300 }, { "epoch": 0.16295025728987994, "grad_norm": 11.310129165649414, "learning_rate": 9.359920071213919e-07, "loss": 0.3196, "step": 11400 }, { "epoch": 0.164379645511721, "grad_norm": 9.328283309936523, "learning_rate": 9.348890757820643e-07, "loss": 0.3488, "step": 11500 }, { "epoch": 0.16580903373356204, "grad_norm": 17.782825469970703, "learning_rate": 9.337773843994526e-07, "loss": 0.3187, "step": 11600 }, { "epoch": 0.1672384219554031, "grad_norm": 11.744281768798828, "learning_rate": 9.32656955366542e-07, "loss": 0.3426, "step": 11700 }, { "epoch": 0.16866781017724414, "grad_norm": 14.185038566589355, "learning_rate": 9.315278112523216e-07, "loss": 0.314, "step": 11800 }, { "epoch": 0.1700971983990852, "grad_norm": 8.075092315673828, "learning_rate": 9.303899748013301e-07, "loss": 0.3587, "step": 11900 }, { "epoch": 0.17152658662092624, "grad_norm": 14.384480476379395, "learning_rate": 9.292434689331969e-07, "loss": 0.3464, "step": 12000 }, { "epoch": 0.17295597484276728, "grad_norm": 11.065058708190918, "learning_rate": 9.280883167421813e-07, "loss": 0.3391, "step": 12100 }, { "epoch": 0.17438536306460833, "grad_norm": 2.7053768634796143, "learning_rate": 9.269245414967068e-07, "loss": 0.3479, "step": 12200 }, { "epoch": 0.1758147512864494, "grad_norm": 12.55746841430664, "learning_rate": 9.257521666388926e-07, "loss": 0.3551, "step": 12300 }, { "epoch": 0.17724413950829046, "grad_norm": 11.310877799987793, "learning_rate": 9.245712157840812e-07, "loss": 0.3401, "step": 12400 }, { "epoch": 0.1786735277301315, "grad_norm": 14.231292724609375, "learning_rate": 9.233817127203629e-07, "loss": 0.3231, "step": 12500 }, { "epoch": 0.18010291595197256, "grad_norm": 17.174856185913086, "learning_rate": 9.221836814080965e-07, "loss": 0.3378, "step": 12600 }, { "epoch": 0.1815323041738136, "grad_norm": 11.397006034851074, "learning_rate": 9.209771459794268e-07, "loss": 0.3484, "step": 12700 }, { "epoch": 0.18296169239565466, "grad_norm": 13.256636619567871, "learning_rate": 9.197621307377985e-07, "loss": 0.3631, "step": 12800 }, { "epoch": 0.1843910806174957, "grad_norm": 11.989117622375488, "learning_rate": 9.185386601574666e-07, "loss": 0.3277, "step": 12900 }, { "epoch": 0.18582046883933676, "grad_norm": 11.581043243408203, "learning_rate": 9.173067588830032e-07, "loss": 0.3418, "step": 13000 }, { "epoch": 0.1872498570611778, "grad_norm": 11.485856056213379, "learning_rate": 9.160664517288015e-07, "loss": 0.3284, "step": 13100 }, { "epoch": 0.18867924528301888, "grad_norm": 8.317375183105469, "learning_rate": 9.148177636785756e-07, "loss": 0.338, "step": 13200 }, { "epoch": 0.19010863350485993, "grad_norm": 12.842378616333008, "learning_rate": 9.135607198848576e-07, "loss": 0.3439, "step": 13300 }, { "epoch": 0.19153802172670098, "grad_norm": 15.531667709350586, "learning_rate": 9.122953456684904e-07, "loss": 0.3461, "step": 13400 }, { "epoch": 0.19296740994854203, "grad_norm": 15.904300689697266, "learning_rate": 9.110216665181181e-07, "loss": 0.3396, "step": 13500 }, { "epoch": 0.19439679817038308, "grad_norm": 14.308609008789062, "learning_rate": 9.097397080896726e-07, "loss": 0.3193, "step": 13600 }, { "epoch": 0.19582618639222413, "grad_norm": 13.688326835632324, "learning_rate": 9.084494962058567e-07, "loss": 0.3402, "step": 13700 }, { "epoch": 0.19725557461406518, "grad_norm": 11.917827606201172, "learning_rate": 9.07151056855624e-07, "loss": 0.3, "step": 13800 }, { "epoch": 0.19868496283590623, "grad_norm": 10.748669624328613, "learning_rate": 9.058444161936549e-07, "loss": 0.3292, "step": 13900 }, { "epoch": 0.20011435105774728, "grad_norm": 13.456997871398926, "learning_rate": 9.045296005398304e-07, "loss": 0.321, "step": 14000 }, { "epoch": 0.20154373927958832, "grad_norm": 15.05969524383545, "learning_rate": 9.032066363787022e-07, "loss": 0.3489, "step": 14100 }, { "epoch": 0.2029731275014294, "grad_norm": 8.975543022155762, "learning_rate": 9.018755503589581e-07, "loss": 0.3491, "step": 14200 }, { "epoch": 0.20440251572327045, "grad_norm": 11.852422714233398, "learning_rate": 9.005363692928861e-07, "loss": 0.3555, "step": 14300 }, { "epoch": 0.2058319039451115, "grad_norm": 11.369333267211914, "learning_rate": 8.991891201558342e-07, "loss": 0.3328, "step": 14400 }, { "epoch": 0.20726129216695255, "grad_norm": 15.280744552612305, "learning_rate": 8.978338300856665e-07, "loss": 0.3346, "step": 14500 }, { "epoch": 0.2086906803887936, "grad_norm": 14.367311477661133, "learning_rate": 8.964705263822174e-07, "loss": 0.3249, "step": 14600 }, { "epoch": 0.21012006861063465, "grad_norm": 12.512520790100098, "learning_rate": 8.950992365067412e-07, "loss": 0.3326, "step": 14700 }, { "epoch": 0.2115494568324757, "grad_norm": 13.751002311706543, "learning_rate": 8.937199880813588e-07, "loss": 0.3353, "step": 14800 }, { "epoch": 0.21297884505431675, "grad_norm": 13.4205904006958, "learning_rate": 8.923328088885014e-07, "loss": 0.3218, "step": 14900 }, { "epoch": 0.2144082332761578, "grad_norm": 18.001859664916992, "learning_rate": 8.909377268703513e-07, "loss": 0.3261, "step": 15000 }, { "epoch": 0.21583762149799884, "grad_norm": 13.708430290222168, "learning_rate": 8.895347701282787e-07, "loss": 0.3235, "step": 15100 }, { "epoch": 0.21726700971983992, "grad_norm": 10.588641166687012, "learning_rate": 8.881239669222753e-07, "loss": 0.3193, "step": 15200 }, { "epoch": 0.21869639794168097, "grad_norm": 12.683503150939941, "learning_rate": 8.867053456703861e-07, "loss": 0.3335, "step": 15300 }, { "epoch": 0.22012578616352202, "grad_norm": 9.490144729614258, "learning_rate": 8.852789349481354e-07, "loss": 0.3476, "step": 15400 }, { "epoch": 0.22155517438536307, "grad_norm": 8.324650764465332, "learning_rate": 8.838447634879529e-07, "loss": 0.3234, "step": 15500 }, { "epoch": 0.22298456260720412, "grad_norm": 18.675048828125, "learning_rate": 8.824028601785937e-07, "loss": 0.3208, "step": 15600 }, { "epoch": 0.22441395082904517, "grad_norm": 17.782501220703125, "learning_rate": 8.809532540645572e-07, "loss": 0.3137, "step": 15700 }, { "epoch": 0.22584333905088622, "grad_norm": 11.78647518157959, "learning_rate": 8.794959743455013e-07, "loss": 0.331, "step": 15800 }, { "epoch": 0.22727272727272727, "grad_norm": 10.095694541931152, "learning_rate": 8.780310503756546e-07, "loss": 0.3501, "step": 15900 }, { "epoch": 0.22870211549456831, "grad_norm": 8.243073463439941, "learning_rate": 8.765585116632256e-07, "loss": 0.3288, "step": 16000 }, { "epoch": 0.23013150371640936, "grad_norm": 13.941173553466797, "learning_rate": 8.750783878698074e-07, "loss": 0.3235, "step": 16100 }, { "epoch": 0.23156089193825044, "grad_norm": 11.812784194946289, "learning_rate": 8.735907088097805e-07, "loss": 0.3379, "step": 16200 }, { "epoch": 0.2329902801600915, "grad_norm": 11.740796089172363, "learning_rate": 8.720955044497131e-07, "loss": 0.3203, "step": 16300 }, { "epoch": 0.23441966838193254, "grad_norm": 11.343643188476562, "learning_rate": 8.70592804907756e-07, "loss": 0.3481, "step": 16400 }, { "epoch": 0.2358490566037736, "grad_norm": 12.909595489501953, "learning_rate": 8.690826404530373e-07, "loss": 0.3192, "step": 16500 }, { "epoch": 0.23727844482561464, "grad_norm": 10.144956588745117, "learning_rate": 8.675650415050515e-07, "loss": 0.3442, "step": 16600 }, { "epoch": 0.2387078330474557, "grad_norm": 11.924606323242188, "learning_rate": 8.66040038633048e-07, "loss": 0.3268, "step": 16700 }, { "epoch": 0.24013722126929674, "grad_norm": 12.561604499816895, "learning_rate": 8.645076625554142e-07, "loss": 0.3163, "step": 16800 }, { "epoch": 0.24156660949113778, "grad_norm": 17.487289428710938, "learning_rate": 8.629679441390573e-07, "loss": 0.3333, "step": 16900 }, { "epoch": 0.24299599771297883, "grad_norm": 12.031607627868652, "learning_rate": 8.614209143987826e-07, "loss": 0.3268, "step": 17000 }, { "epoch": 0.2444253859348199, "grad_norm": 11.001572608947754, "learning_rate": 8.598666044966682e-07, "loss": 0.3107, "step": 17100 }, { "epoch": 0.24585477415666096, "grad_norm": 17.776351928710938, "learning_rate": 8.583050457414387e-07, "loss": 0.347, "step": 17200 }, { "epoch": 0.247284162378502, "grad_norm": 10.700430870056152, "learning_rate": 8.567362695878324e-07, "loss": 0.3307, "step": 17300 }, { "epoch": 0.24871355060034306, "grad_norm": 9.782203674316406, "learning_rate": 8.551603076359695e-07, "loss": 0.3437, "step": 17400 }, { "epoch": 0.2501429388221841, "grad_norm": 15.822957038879395, "learning_rate": 8.535771916307146e-07, "loss": 0.3294, "step": 17500 }, { "epoch": 0.25157232704402516, "grad_norm": 10.07957649230957, "learning_rate": 8.519869534610382e-07, "loss": 0.355, "step": 17600 }, { "epoch": 0.25300171526586623, "grad_norm": 19.679927825927734, "learning_rate": 8.50389625159373e-07, "loss": 0.3308, "step": 17700 }, { "epoch": 0.25443110348770726, "grad_norm": 13.828248023986816, "learning_rate": 8.487852389009698e-07, "loss": 0.3336, "step": 17800 }, { "epoch": 0.25586049170954833, "grad_norm": 12.436111450195312, "learning_rate": 8.471738270032492e-07, "loss": 0.3236, "step": 17900 }, { "epoch": 0.25728987993138935, "grad_norm": 12.645123481750488, "learning_rate": 8.455554219251497e-07, "loss": 0.3282, "step": 18000 }, { "epoch": 0.25871926815323043, "grad_norm": 15.437549591064453, "learning_rate": 8.439300562664757e-07, "loss": 0.3162, "step": 18100 }, { "epoch": 0.26014865637507145, "grad_norm": 15.36680793762207, "learning_rate": 8.422977627672385e-07, "loss": 0.3302, "step": 18200 }, { "epoch": 0.26157804459691253, "grad_norm": 15.108634948730469, "learning_rate": 8.406585743069992e-07, "loss": 0.3338, "step": 18300 }, { "epoch": 0.26300743281875355, "grad_norm": 11.512022972106934, "learning_rate": 8.390125239042043e-07, "loss": 0.3386, "step": 18400 }, { "epoch": 0.2644368210405946, "grad_norm": 10.95230770111084, "learning_rate": 8.37359644715522e-07, "loss": 0.3279, "step": 18500 }, { "epoch": 0.2658662092624357, "grad_norm": 17.933120727539062, "learning_rate": 8.356999700351741e-07, "loss": 0.313, "step": 18600 }, { "epoch": 0.2672955974842767, "grad_norm": 14.285967826843262, "learning_rate": 8.340335332942647e-07, "loss": 0.3292, "step": 18700 }, { "epoch": 0.2687249857061178, "grad_norm": 15.529138565063477, "learning_rate": 8.323603680601075e-07, "loss": 0.3118, "step": 18800 }, { "epoch": 0.2701543739279588, "grad_norm": 17.855663299560547, "learning_rate": 8.306805080355491e-07, "loss": 0.3216, "step": 18900 }, { "epoch": 0.2715837621497999, "grad_norm": 12.237960815429688, "learning_rate": 8.289939870582907e-07, "loss": 0.3234, "step": 19000 }, { "epoch": 0.2730131503716409, "grad_norm": 16.071992874145508, "learning_rate": 8.273008391002057e-07, "loss": 0.3208, "step": 19100 }, { "epoch": 0.274442538593482, "grad_norm": 13.998347282409668, "learning_rate": 8.256010982666565e-07, "loss": 0.3319, "step": 19200 }, { "epoch": 0.275871926815323, "grad_norm": 8.947419166564941, "learning_rate": 8.238947987958064e-07, "loss": 0.3197, "step": 19300 }, { "epoch": 0.2773013150371641, "grad_norm": 17.14949607849121, "learning_rate": 8.221819750579309e-07, "loss": 0.356, "step": 19400 }, { "epoch": 0.2787307032590051, "grad_norm": 9.766751289367676, "learning_rate": 8.204626615547241e-07, "loss": 0.346, "step": 19500 }, { "epoch": 0.2801600914808462, "grad_norm": 12.12964153289795, "learning_rate": 8.187368929186056e-07, "loss": 0.3212, "step": 19600 }, { "epoch": 0.2815894797026873, "grad_norm": 16.39698028564453, "learning_rate": 8.170047039120208e-07, "loss": 0.3402, "step": 19700 }, { "epoch": 0.2830188679245283, "grad_norm": 18.98644256591797, "learning_rate": 8.152661294267422e-07, "loss": 0.3017, "step": 19800 }, { "epoch": 0.28444825614636937, "grad_norm": 18.11912727355957, "learning_rate": 8.135212044831665e-07, "loss": 0.3228, "step": 19900 }, { "epoch": 0.2858776443682104, "grad_norm": 17.011539459228516, "learning_rate": 8.117699642296078e-07, "loss": 0.3193, "step": 20000 }, { "epoch": 0.28730703259005147, "grad_norm": 12.597023010253906, "learning_rate": 8.10012443941591e-07, "loss": 0.3224, "step": 20100 }, { "epoch": 0.2887364208118925, "grad_norm": 11.657052040100098, "learning_rate": 8.082486790211409e-07, "loss": 0.3074, "step": 20200 }, { "epoch": 0.29016580903373357, "grad_norm": 17.029722213745117, "learning_rate": 8.064787049960691e-07, "loss": 0.3105, "step": 20300 }, { "epoch": 0.2915951972555746, "grad_norm": 17.851268768310547, "learning_rate": 8.047025575192574e-07, "loss": 0.3248, "step": 20400 }, { "epoch": 0.29302458547741567, "grad_norm": 17.955358505249023, "learning_rate": 8.029202723679416e-07, "loss": 0.3311, "step": 20500 }, { "epoch": 0.29445397369925674, "grad_norm": 12.829858779907227, "learning_rate": 8.011318854429889e-07, "loss": 0.328, "step": 20600 }, { "epoch": 0.29588336192109777, "grad_norm": 16.81481170654297, "learning_rate": 7.993374327681759e-07, "loss": 0.3107, "step": 20700 }, { "epoch": 0.29731275014293884, "grad_norm": 14.806863784790039, "learning_rate": 7.975369504894624e-07, "loss": 0.311, "step": 20800 }, { "epoch": 0.29874213836477986, "grad_norm": 11.845511436462402, "learning_rate": 7.95730474874264e-07, "loss": 0.3194, "step": 20900 }, { "epoch": 0.30017152658662094, "grad_norm": 13.77103042602539, "learning_rate": 7.939180423107201e-07, "loss": 0.3014, "step": 21000 }, { "epoch": 0.30160091480846196, "grad_norm": 7.9158854484558105, "learning_rate": 7.920996893069631e-07, "loss": 0.3256, "step": 21100 }, { "epoch": 0.30303030303030304, "grad_norm": 8.28150749206543, "learning_rate": 7.902754524903808e-07, "loss": 0.3094, "step": 21200 }, { "epoch": 0.30445969125214406, "grad_norm": 9.419288635253906, "learning_rate": 7.884453686068804e-07, "loss": 0.3265, "step": 21300 }, { "epoch": 0.30588907947398514, "grad_norm": 13.700448989868164, "learning_rate": 7.86609474520147e-07, "loss": 0.3323, "step": 21400 }, { "epoch": 0.3073184676958262, "grad_norm": 14.406478881835938, "learning_rate": 7.847678072109021e-07, "loss": 0.3203, "step": 21500 }, { "epoch": 0.30874785591766724, "grad_norm": 12.646419525146484, "learning_rate": 7.829204037761574e-07, "loss": 0.2957, "step": 21600 }, { "epoch": 0.3101772441395083, "grad_norm": 16.298370361328125, "learning_rate": 7.810673014284691e-07, "loss": 0.3216, "step": 21700 }, { "epoch": 0.31160663236134933, "grad_norm": 12.568595886230469, "learning_rate": 7.792085374951873e-07, "loss": 0.3123, "step": 21800 }, { "epoch": 0.3130360205831904, "grad_norm": 17.768095016479492, "learning_rate": 7.773441494177043e-07, "loss": 0.3309, "step": 21900 }, { "epoch": 0.31446540880503143, "grad_norm": 13.66750431060791, "learning_rate": 7.754741747507005e-07, "loss": 0.3246, "step": 22000 }, { "epoch": 0.3158947970268725, "grad_norm": 13.163860321044922, "learning_rate": 7.735986511613878e-07, "loss": 0.3323, "step": 22100 }, { "epoch": 0.31732418524871353, "grad_norm": 16.254423141479492, "learning_rate": 7.717176164287514e-07, "loss": 0.3311, "step": 22200 }, { "epoch": 0.3187535734705546, "grad_norm": 15.925135612487793, "learning_rate": 7.698311084427877e-07, "loss": 0.327, "step": 22300 }, { "epoch": 0.32018296169239563, "grad_norm": 15.258606910705566, "learning_rate": 7.679391652037423e-07, "loss": 0.319, "step": 22400 }, { "epoch": 0.3216123499142367, "grad_norm": 17.25618553161621, "learning_rate": 7.660418248213441e-07, "loss": 0.3256, "step": 22500 }, { "epoch": 0.3230417381360778, "grad_norm": 17.183578491210938, "learning_rate": 7.641391255140368e-07, "loss": 0.3255, "step": 22600 }, { "epoch": 0.3244711263579188, "grad_norm": 13.38002872467041, "learning_rate": 7.622311056082107e-07, "loss": 0.3187, "step": 22700 }, { "epoch": 0.3259005145797599, "grad_norm": 12.458002090454102, "learning_rate": 7.603178035374291e-07, "loss": 0.3061, "step": 22800 }, { "epoch": 0.3273299028016009, "grad_norm": 15.024831771850586, "learning_rate": 7.583992578416555e-07, "loss": 0.3138, "step": 22900 }, { "epoch": 0.328759291023442, "grad_norm": 19.279998779296875, "learning_rate": 7.564755071664761e-07, "loss": 0.3195, "step": 23000 }, { "epoch": 0.330188679245283, "grad_norm": 16.499897003173828, "learning_rate": 7.545465902623218e-07, "loss": 0.3481, "step": 23100 }, { "epoch": 0.3316180674671241, "grad_norm": 9.746506690979004, "learning_rate": 7.526125459836884e-07, "loss": 0.3294, "step": 23200 }, { "epoch": 0.3330474556889651, "grad_norm": 18.3165283203125, "learning_rate": 7.506734132883525e-07, "loss": 0.3358, "step": 23300 }, { "epoch": 0.3344768439108062, "grad_norm": 15.169964790344238, "learning_rate": 7.48729231236588e-07, "loss": 0.3205, "step": 23400 }, { "epoch": 0.33590623213264725, "grad_norm": 15.487222671508789, "learning_rate": 7.467800389903786e-07, "loss": 0.3241, "step": 23500 }, { "epoch": 0.3373356203544883, "grad_norm": 14.081758499145508, "learning_rate": 7.448258758126291e-07, "loss": 0.3145, "step": 23600 }, { "epoch": 0.33876500857632935, "grad_norm": 11.75092601776123, "learning_rate": 7.428667810663749e-07, "loss": 0.3269, "step": 23700 }, { "epoch": 0.3401943967981704, "grad_norm": 11.291512489318848, "learning_rate": 7.409027942139887e-07, "loss": 0.3093, "step": 23800 }, { "epoch": 0.34162378502001145, "grad_norm": 15.608858108520508, "learning_rate": 7.389339548163857e-07, "loss": 0.3132, "step": 23900 }, { "epoch": 0.34305317324185247, "grad_norm": 14.15320873260498, "learning_rate": 7.369603025322264e-07, "loss": 0.3225, "step": 24000 }, { "epoch": 0.34448256146369355, "grad_norm": 6.809738636016846, "learning_rate": 7.349818771171188e-07, "loss": 0.3392, "step": 24100 }, { "epoch": 0.34591194968553457, "grad_norm": 15.83595085144043, "learning_rate": 7.329987184228161e-07, "loss": 0.3307, "step": 24200 }, { "epoch": 0.34734133790737565, "grad_norm": 12.833749771118164, "learning_rate": 7.310108663964154e-07, "loss": 0.3179, "step": 24300 }, { "epoch": 0.34877072612921667, "grad_norm": 18.04795265197754, "learning_rate": 7.290183610795517e-07, "loss": 0.2901, "step": 24400 }, { "epoch": 0.35020011435105775, "grad_norm": 20.49600601196289, "learning_rate": 7.270212426075928e-07, "loss": 0.3161, "step": 24500 }, { "epoch": 0.3516295025728988, "grad_norm": 13.884822845458984, "learning_rate": 7.250195512088291e-07, "loss": 0.3178, "step": 24600 }, { "epoch": 0.35305889079473984, "grad_norm": 14.301074981689453, "learning_rate": 7.230133272036652e-07, "loss": 0.3329, "step": 24700 }, { "epoch": 0.3544882790165809, "grad_norm": 17.32472038269043, "learning_rate": 7.210026110038061e-07, "loss": 0.3133, "step": 24800 }, { "epoch": 0.35591766723842194, "grad_norm": 14.574315071105957, "learning_rate": 7.189874431114441e-07, "loss": 0.326, "step": 24900 }, { "epoch": 0.357347055460263, "grad_norm": 13.456582069396973, "learning_rate": 7.169678641184424e-07, "loss": 0.3384, "step": 25000 }, { "epoch": 0.35877644368210404, "grad_norm": 15.078902244567871, "learning_rate": 7.149439147055181e-07, "loss": 0.3097, "step": 25100 }, { "epoch": 0.3602058319039451, "grad_norm": 18.25004005432129, "learning_rate": 7.129156356414223e-07, "loss": 0.2905, "step": 25200 }, { "epoch": 0.36163522012578614, "grad_norm": 12.935120582580566, "learning_rate": 7.108830677821189e-07, "loss": 0.3113, "step": 25300 }, { "epoch": 0.3630646083476272, "grad_norm": 16.448625564575195, "learning_rate": 7.088462520699616e-07, "loss": 0.3087, "step": 25400 }, { "epoch": 0.3644939965694683, "grad_norm": 13.395794868469238, "learning_rate": 7.068052295328697e-07, "loss": 0.3144, "step": 25500 }, { "epoch": 0.3659233847913093, "grad_norm": 14.861013412475586, "learning_rate": 7.047600412835009e-07, "loss": 0.3199, "step": 25600 }, { "epoch": 0.3673527730131504, "grad_norm": 16.787778854370117, "learning_rate": 7.027107285184235e-07, "loss": 0.3086, "step": 25700 }, { "epoch": 0.3687821612349914, "grad_norm": 17.0662784576416, "learning_rate": 7.006573325172872e-07, "loss": 0.313, "step": 25800 }, { "epoch": 0.3702115494568325, "grad_norm": 16.2261962890625, "learning_rate": 6.985998946419902e-07, "loss": 0.3161, "step": 25900 }, { "epoch": 0.3716409376786735, "grad_norm": 16.310203552246094, "learning_rate": 6.965384563358474e-07, "loss": 0.3271, "step": 26000 }, { "epoch": 0.3730703259005146, "grad_norm": 14.0792818069458, "learning_rate": 6.944730591227549e-07, "loss": 0.3037, "step": 26100 }, { "epoch": 0.3744997141223556, "grad_norm": 15.81286334991455, "learning_rate": 6.924037446063537e-07, "loss": 0.3312, "step": 26200 }, { "epoch": 0.3759291023441967, "grad_norm": 4.341174602508545, "learning_rate": 6.903305544691915e-07, "loss": 0.3241, "step": 26300 }, { "epoch": 0.37735849056603776, "grad_norm": 10.123844146728516, "learning_rate": 6.882535304718837e-07, "loss": 0.3271, "step": 26400 }, { "epoch": 0.3787878787878788, "grad_norm": 13.688687324523926, "learning_rate": 6.861727144522716e-07, "loss": 0.3115, "step": 26500 }, { "epoch": 0.38021726700971986, "grad_norm": 16.41066551208496, "learning_rate": 6.840881483245796e-07, "loss": 0.3161, "step": 26600 }, { "epoch": 0.3816466552315609, "grad_norm": 14.114217758178711, "learning_rate": 6.819998740785714e-07, "loss": 0.3072, "step": 26700 }, { "epoch": 0.38307604345340196, "grad_norm": 12.668971061706543, "learning_rate": 6.799079337787041e-07, "loss": 0.302, "step": 26800 }, { "epoch": 0.384505431675243, "grad_norm": 13.211670875549316, "learning_rate": 6.778123695632803e-07, "loss": 0.3245, "step": 26900 }, { "epoch": 0.38593481989708406, "grad_norm": 10.632538795471191, "learning_rate": 6.757132236435999e-07, "loss": 0.3484, "step": 27000 }, { "epoch": 0.3873642081189251, "grad_norm": 20.58715057373047, "learning_rate": 6.736105383031101e-07, "loss": 0.3273, "step": 27100 }, { "epoch": 0.38879359634076616, "grad_norm": 13.834729194641113, "learning_rate": 6.715043558965527e-07, "loss": 0.329, "step": 27200 }, { "epoch": 0.3902229845626072, "grad_norm": 16.589298248291016, "learning_rate": 6.693947188491115e-07, "loss": 0.3067, "step": 27300 }, { "epoch": 0.39165237278444825, "grad_norm": 14.605484008789062, "learning_rate": 6.672816696555581e-07, "loss": 0.323, "step": 27400 }, { "epoch": 0.39308176100628933, "grad_norm": 10.18587589263916, "learning_rate": 6.651652508793952e-07, "loss": 0.3349, "step": 27500 }, { "epoch": 0.39451114922813035, "grad_norm": 19.2006893157959, "learning_rate": 6.630455051519997e-07, "loss": 0.3272, "step": 27600 }, { "epoch": 0.39594053744997143, "grad_norm": 18.699934005737305, "learning_rate": 6.609224751717636e-07, "loss": 0.3237, "step": 27700 }, { "epoch": 0.39736992567181245, "grad_norm": 11.459882736206055, "learning_rate": 6.587962037032347e-07, "loss": 0.3303, "step": 27800 }, { "epoch": 0.39879931389365353, "grad_norm": 16.175960540771484, "learning_rate": 6.566667335762541e-07, "loss": 0.3088, "step": 27900 }, { "epoch": 0.40022870211549455, "grad_norm": 16.32941436767578, "learning_rate": 6.545341076850944e-07, "loss": 0.3478, "step": 28000 }, { "epoch": 0.4016580903373356, "grad_norm": 18.530061721801758, "learning_rate": 6.523983689875949e-07, "loss": 0.3227, "step": 28100 }, { "epoch": 0.40308747855917665, "grad_norm": 18.284992218017578, "learning_rate": 6.502595605042969e-07, "loss": 0.2937, "step": 28200 }, { "epoch": 0.4045168667810177, "grad_norm": 23.14360237121582, "learning_rate": 6.481177253175769e-07, "loss": 0.3254, "step": 28300 }, { "epoch": 0.4059462550028588, "grad_norm": 12.258302688598633, "learning_rate": 6.459729065707788e-07, "loss": 0.3036, "step": 28400 }, { "epoch": 0.4073756432246998, "grad_norm": 10.581338882446289, "learning_rate": 6.438251474673449e-07, "loss": 0.2909, "step": 28500 }, { "epoch": 0.4088050314465409, "grad_norm": 18.170541763305664, "learning_rate": 6.416744912699452e-07, "loss": 0.3235, "step": 28600 }, { "epoch": 0.4102344196683819, "grad_norm": 15.704451560974121, "learning_rate": 6.395209812996068e-07, "loss": 0.3024, "step": 28700 }, { "epoch": 0.411663807890223, "grad_norm": 15.009072303771973, "learning_rate": 6.373646609348411e-07, "loss": 0.3223, "step": 28800 }, { "epoch": 0.413093196112064, "grad_norm": 11.04826831817627, "learning_rate": 6.352055736107687e-07, "loss": 0.3188, "step": 28900 }, { "epoch": 0.4145225843339051, "grad_norm": 15.899747848510742, "learning_rate": 6.330437628182467e-07, "loss": 0.3154, "step": 29000 }, { "epoch": 0.4159519725557461, "grad_norm": 12.113235473632812, "learning_rate": 6.308792721029907e-07, "loss": 0.3187, "step": 29100 }, { "epoch": 0.4173813607775872, "grad_norm": 17.46638298034668, "learning_rate": 6.287121450646984e-07, "loss": 0.2808, "step": 29200 }, { "epoch": 0.4188107489994282, "grad_norm": 15.754764556884766, "learning_rate": 6.265424253561722e-07, "loss": 0.3284, "step": 29300 }, { "epoch": 0.4202401372212693, "grad_norm": 13.17143726348877, "learning_rate": 6.243701566824381e-07, "loss": 0.3052, "step": 29400 }, { "epoch": 0.42166952544311037, "grad_norm": 13.660146713256836, "learning_rate": 6.22195382799867e-07, "loss": 0.3305, "step": 29500 }, { "epoch": 0.4230989136649514, "grad_norm": 10.232270240783691, "learning_rate": 6.200181475152924e-07, "loss": 0.3147, "step": 29600 }, { "epoch": 0.42452830188679247, "grad_norm": 11.83288860321045, "learning_rate": 6.178384946851282e-07, "loss": 0.3027, "step": 29700 }, { "epoch": 0.4259576901086335, "grad_norm": 20.159343719482422, "learning_rate": 6.156564682144855e-07, "loss": 0.3223, "step": 29800 }, { "epoch": 0.42738707833047457, "grad_norm": 13.013447761535645, "learning_rate": 6.134721120562878e-07, "loss": 0.3264, "step": 29900 }, { "epoch": 0.4288164665523156, "grad_norm": 13.751709938049316, "learning_rate": 6.11285470210386e-07, "loss": 0.2932, "step": 30000 }, { "epoch": 0.43024585477415667, "grad_norm": 14.115873336791992, "learning_rate": 6.090965867226721e-07, "loss": 0.3336, "step": 30100 }, { "epoch": 0.4316752429959977, "grad_norm": 13.346335411071777, "learning_rate": 6.069055056841917e-07, "loss": 0.3072, "step": 30200 }, { "epoch": 0.43310463121783876, "grad_norm": 18.071094512939453, "learning_rate": 6.04712271230256e-07, "loss": 0.3159, "step": 30300 }, { "epoch": 0.43453401943967984, "grad_norm": 15.872982025146484, "learning_rate": 6.02516927539553e-07, "loss": 0.3128, "step": 30400 }, { "epoch": 0.43596340766152086, "grad_norm": 16.49416160583496, "learning_rate": 6.003195188332575e-07, "loss": 0.3395, "step": 30500 }, { "epoch": 0.43739279588336194, "grad_norm": 11.139934539794922, "learning_rate": 5.981200893741396e-07, "loss": 0.3464, "step": 30600 }, { "epoch": 0.43882218410520296, "grad_norm": 16.102718353271484, "learning_rate": 5.959186834656746e-07, "loss": 0.2956, "step": 30700 }, { "epoch": 0.44025157232704404, "grad_norm": 16.206851959228516, "learning_rate": 5.93715345451149e-07, "loss": 0.3008, "step": 30800 }, { "epoch": 0.44168096054888506, "grad_norm": 16.85133934020996, "learning_rate": 5.915101197127688e-07, "loss": 0.3078, "step": 30900 }, { "epoch": 0.44311034877072614, "grad_norm": 14.434657096862793, "learning_rate": 5.893030506707641e-07, "loss": 0.3061, "step": 31000 }, { "epoch": 0.44453973699256716, "grad_norm": 11.269679069519043, "learning_rate": 5.870941827824953e-07, "loss": 0.2897, "step": 31100 }, { "epoch": 0.44596912521440824, "grad_norm": 18.077045440673828, "learning_rate": 5.848835605415573e-07, "loss": 0.3529, "step": 31200 }, { "epoch": 0.4473985134362493, "grad_norm": 16.942514419555664, "learning_rate": 5.826712284768829e-07, "loss": 0.2959, "step": 31300 }, { "epoch": 0.44882790165809033, "grad_norm": 12.557541847229004, "learning_rate": 5.804572311518463e-07, "loss": 0.3319, "step": 31400 }, { "epoch": 0.4502572898799314, "grad_norm": 14.807748794555664, "learning_rate": 5.782416131633658e-07, "loss": 0.3129, "step": 31500 }, { "epoch": 0.45168667810177243, "grad_norm": 2.660892963409424, "learning_rate": 5.76024419141004e-07, "loss": 0.3138, "step": 31600 }, { "epoch": 0.4531160663236135, "grad_norm": 15.423670768737793, "learning_rate": 5.738056937460706e-07, "loss": 0.3126, "step": 31700 }, { "epoch": 0.45454545454545453, "grad_norm": 16.488794326782227, "learning_rate": 5.71585481670722e-07, "loss": 0.3365, "step": 31800 }, { "epoch": 0.4559748427672956, "grad_norm": 14.807668685913086, "learning_rate": 5.693638276370605e-07, "loss": 0.293, "step": 31900 }, { "epoch": 0.45740423098913663, "grad_norm": 8.806946754455566, "learning_rate": 5.671407763962348e-07, "loss": 0.3233, "step": 32000 }, { "epoch": 0.4588336192109777, "grad_norm": 19.563190460205078, "learning_rate": 5.649163727275367e-07, "loss": 0.3109, "step": 32100 }, { "epoch": 0.4602630074328187, "grad_norm": 20.52071189880371, "learning_rate": 5.626906614375012e-07, "loss": 0.3198, "step": 32200 }, { "epoch": 0.4616923956546598, "grad_norm": 17.079336166381836, "learning_rate": 5.604636873590023e-07, "loss": 0.2874, "step": 32300 }, { "epoch": 0.4631217838765009, "grad_norm": 14.161290168762207, "learning_rate": 5.582354953503512e-07, "loss": 0.3146, "step": 32400 }, { "epoch": 0.4645511720983419, "grad_norm": 14.484636306762695, "learning_rate": 5.560061302943911e-07, "loss": 0.3164, "step": 32500 }, { "epoch": 0.465980560320183, "grad_norm": 14.380138397216797, "learning_rate": 5.537756370975953e-07, "loss": 0.289, "step": 32600 }, { "epoch": 0.467409948542024, "grad_norm": 8.581055641174316, "learning_rate": 5.515440606891601e-07, "loss": 0.292, "step": 32700 }, { "epoch": 0.4688393367638651, "grad_norm": 12.97655200958252, "learning_rate": 5.493114460201022e-07, "loss": 0.3071, "step": 32800 }, { "epoch": 0.4702687249857061, "grad_norm": 18.66696548461914, "learning_rate": 5.470778380623515e-07, "loss": 0.3241, "step": 32900 }, { "epoch": 0.4716981132075472, "grad_norm": 18.474149703979492, "learning_rate": 5.448432818078465e-07, "loss": 0.2949, "step": 33000 }, { "epoch": 0.4731275014293882, "grad_norm": 17.02419090270996, "learning_rate": 5.426078222676266e-07, "loss": 0.3358, "step": 33100 }, { "epoch": 0.4745568896512293, "grad_norm": 16.157123565673828, "learning_rate": 5.403715044709269e-07, "loss": 0.3236, "step": 33200 }, { "epoch": 0.47598627787307035, "grad_norm": 17.541637420654297, "learning_rate": 5.381343734642702e-07, "loss": 0.3017, "step": 33300 }, { "epoch": 0.4774156660949114, "grad_norm": 14.20406723022461, "learning_rate": 5.358964743105603e-07, "loss": 0.3301, "step": 33400 }, { "epoch": 0.47884505431675245, "grad_norm": 17.515470504760742, "learning_rate": 5.33657852088173e-07, "loss": 0.3214, "step": 33500 }, { "epoch": 0.48027444253859347, "grad_norm": 23.076448440551758, "learning_rate": 5.314185518900499e-07, "loss": 0.2972, "step": 33600 }, { "epoch": 0.48170383076043455, "grad_norm": 16.90967559814453, "learning_rate": 5.291786188227891e-07, "loss": 0.3188, "step": 33700 }, { "epoch": 0.48313321898227557, "grad_norm": 12.707032203674316, "learning_rate": 5.269380980057361e-07, "loss": 0.3217, "step": 33800 }, { "epoch": 0.48456260720411665, "grad_norm": 22.17814826965332, "learning_rate": 5.246970345700761e-07, "loss": 0.3082, "step": 33900 }, { "epoch": 0.48599199542595767, "grad_norm": 14.493890762329102, "learning_rate": 5.224554736579241e-07, "loss": 0.3007, "step": 34000 }, { "epoch": 0.48742138364779874, "grad_norm": 12.817108154296875, "learning_rate": 5.202134604214159e-07, "loss": 0.317, "step": 34100 }, { "epoch": 0.4888507718696398, "grad_norm": 17.926246643066406, "learning_rate": 5.179710400217985e-07, "loss": 0.3014, "step": 34200 }, { "epoch": 0.49028016009148084, "grad_norm": 5.114149570465088, "learning_rate": 5.157282576285209e-07, "loss": 0.2974, "step": 34300 }, { "epoch": 0.4917095483133219, "grad_norm": 15.244714736938477, "learning_rate": 5.13485158418323e-07, "loss": 0.3061, "step": 34400 }, { "epoch": 0.49313893653516294, "grad_norm": 16.27927589416504, "learning_rate": 5.112417875743269e-07, "loss": 0.3022, "step": 34500 }, { "epoch": 0.494568324757004, "grad_norm": 15.702018737792969, "learning_rate": 5.089981902851262e-07, "loss": 0.3121, "step": 34600 }, { "epoch": 0.49599771297884504, "grad_norm": 14.645057678222656, "learning_rate": 5.067544117438757e-07, "loss": 0.3281, "step": 34700 }, { "epoch": 0.4974271012006861, "grad_norm": 17.808927536010742, "learning_rate": 5.045104971473817e-07, "loss": 0.315, "step": 34800 }, { "epoch": 0.49885648942252714, "grad_norm": 14.701974868774414, "learning_rate": 5.0226649169519e-07, "loss": 0.3408, "step": 34900 }, { "epoch": 0.5002858776443682, "grad_norm": 10.982688903808594, "learning_rate": 5.000224405886777e-07, "loss": 0.3051, "step": 35000 }, { "epoch": 0.5017152658662093, "grad_norm": 16.63332176208496, "learning_rate": 4.97778389030141e-07, "loss": 0.3088, "step": 35100 }, { "epoch": 0.5031446540880503, "grad_norm": 12.840409278869629, "learning_rate": 4.955343822218849e-07, "loss": 0.2985, "step": 35200 }, { "epoch": 0.5045740423098913, "grad_norm": 17.793912887573242, "learning_rate": 4.932904653653137e-07, "loss": 0.3033, "step": 35300 }, { "epoch": 0.5060034305317325, "grad_norm": 19.91800880432129, "learning_rate": 4.91046683660019e-07, "loss": 0.3261, "step": 35400 }, { "epoch": 0.5074328187535735, "grad_norm": 17.29555320739746, "learning_rate": 4.888030823028709e-07, "loss": 0.2985, "step": 35500 }, { "epoch": 0.5088622069754145, "grad_norm": 23.83055877685547, "learning_rate": 4.865597064871056e-07, "loss": 0.31, "step": 35600 }, { "epoch": 0.5102915951972555, "grad_norm": 17.186630249023438, "learning_rate": 4.843166014014175e-07, "loss": 0.318, "step": 35700 }, { "epoch": 0.5117209834190967, "grad_norm": 10.865263938903809, "learning_rate": 4.820738122290465e-07, "loss": 0.31, "step": 35800 }, { "epoch": 0.5131503716409377, "grad_norm": 14.89427661895752, "learning_rate": 4.798313841468697e-07, "loss": 0.3004, "step": 35900 }, { "epoch": 0.5145797598627787, "grad_norm": 20.33976936340332, "learning_rate": 4.775893623244902e-07, "loss": 0.3178, "step": 36000 }, { "epoch": 0.5160091480846197, "grad_norm": 16.392250061035156, "learning_rate": 4.753477919233284e-07, "loss": 0.2968, "step": 36100 }, { "epoch": 0.5174385363064609, "grad_norm": 16.331335067749023, "learning_rate": 4.731067180957109e-07, "loss": 0.3156, "step": 36200 }, { "epoch": 0.5188679245283019, "grad_norm": 11.772985458374023, "learning_rate": 4.7086618598396235e-07, "loss": 0.3204, "step": 36300 }, { "epoch": 0.5202973127501429, "grad_norm": 14.681493759155273, "learning_rate": 4.68626240719495e-07, "loss": 0.3244, "step": 36400 }, { "epoch": 0.521726700971984, "grad_norm": 18.403217315673828, "learning_rate": 4.6638692742190045e-07, "loss": 0.3246, "step": 36500 }, { "epoch": 0.5231560891938251, "grad_norm": 14.127409934997559, "learning_rate": 4.641482911980408e-07, "loss": 0.3304, "step": 36600 }, { "epoch": 0.5245854774156661, "grad_norm": 13.635549545288086, "learning_rate": 4.6191037714113896e-07, "loss": 0.3128, "step": 36700 }, { "epoch": 0.5260148656375071, "grad_norm": 14.784650802612305, "learning_rate": 4.596732303298718e-07, "loss": 0.3112, "step": 36800 }, { "epoch": 0.5274442538593482, "grad_norm": 13.729008674621582, "learning_rate": 4.57436895827461e-07, "loss": 0.3111, "step": 36900 }, { "epoch": 0.5288736420811893, "grad_norm": 18.967445373535156, "learning_rate": 4.552014186807659e-07, "loss": 0.3074, "step": 37000 }, { "epoch": 0.5303030303030303, "grad_norm": 18.788753509521484, "learning_rate": 4.529668439193763e-07, "loss": 0.3001, "step": 37100 }, { "epoch": 0.5317324185248714, "grad_norm": 13.225625991821289, "learning_rate": 4.5073321655470444e-07, "loss": 0.3101, "step": 37200 }, { "epoch": 0.5331618067467124, "grad_norm": 12.608953475952148, "learning_rate": 4.4850058157907965e-07, "loss": 0.3242, "step": 37300 }, { "epoch": 0.5345911949685535, "grad_norm": 11.804121017456055, "learning_rate": 4.4626898396484094e-07, "loss": 0.3028, "step": 37400 }, { "epoch": 0.5360205831903945, "grad_norm": 12.934346199035645, "learning_rate": 4.4403846866343183e-07, "loss": 0.3197, "step": 37500 }, { "epoch": 0.5374499714122356, "grad_norm": 12.67959213256836, "learning_rate": 4.4180908060449444e-07, "loss": 0.3149, "step": 37600 }, { "epoch": 0.5388793596340766, "grad_norm": 15.788005828857422, "learning_rate": 4.3958086469496505e-07, "loss": 0.2921, "step": 37700 }, { "epoch": 0.5403087478559176, "grad_norm": 15.079042434692383, "learning_rate": 4.373538658181685e-07, "loss": 0.3104, "step": 37800 }, { "epoch": 0.5417381360777587, "grad_norm": 15.205528259277344, "learning_rate": 4.351281288329152e-07, "loss": 0.2855, "step": 37900 }, { "epoch": 0.5431675242995998, "grad_norm": 18.84633445739746, "learning_rate": 4.329036985725972e-07, "loss": 0.2908, "step": 38000 }, { "epoch": 0.5445969125214408, "grad_norm": 15.914019584655762, "learning_rate": 4.306806198442845e-07, "loss": 0.3141, "step": 38100 }, { "epoch": 0.5460263007432818, "grad_norm": 13.498083114624023, "learning_rate": 4.2845893742782334e-07, "loss": 0.3081, "step": 38200 }, { "epoch": 0.547455688965123, "grad_norm": 16.817855834960938, "learning_rate": 4.2623869607493353e-07, "loss": 0.3278, "step": 38300 }, { "epoch": 0.548885077186964, "grad_norm": 8.583946228027344, "learning_rate": 4.240199405083076e-07, "loss": 0.2989, "step": 38400 }, { "epoch": 0.550314465408805, "grad_norm": 18.395645141601562, "learning_rate": 4.2180271542070923e-07, "loss": 0.3191, "step": 38500 }, { "epoch": 0.551743853630646, "grad_norm": 13.686614036560059, "learning_rate": 4.19587065474074e-07, "loss": 0.298, "step": 38600 }, { "epoch": 0.5531732418524872, "grad_norm": 11.950490951538086, "learning_rate": 4.1737303529860853e-07, "loss": 0.2959, "step": 38700 }, { "epoch": 0.5546026300743282, "grad_norm": 17.453075408935547, "learning_rate": 4.151606694918926e-07, "loss": 0.3003, "step": 38800 }, { "epoch": 0.5560320182961692, "grad_norm": 14.262287139892578, "learning_rate": 4.1295001261798e-07, "loss": 0.3265, "step": 38900 }, { "epoch": 0.5574614065180102, "grad_norm": 17.048542022705078, "learning_rate": 4.107411092065015e-07, "loss": 0.3041, "step": 39000 }, { "epoch": 0.5588907947398514, "grad_norm": 15.934035301208496, "learning_rate": 4.085340037517672e-07, "loss": 0.2944, "step": 39100 }, { "epoch": 0.5603201829616924, "grad_norm": 15.25521183013916, "learning_rate": 4.0632874071187084e-07, "loss": 0.293, "step": 39200 }, { "epoch": 0.5617495711835334, "grad_norm": 13.348962783813477, "learning_rate": 4.041253645077944e-07, "loss": 0.3239, "step": 39300 }, { "epoch": 0.5631789594053745, "grad_norm": 17.120988845825195, "learning_rate": 4.019239195225129e-07, "loss": 0.3117, "step": 39400 }, { "epoch": 0.5646083476272156, "grad_norm": 18.583637237548828, "learning_rate": 3.997244501000998e-07, "loss": 0.2978, "step": 39500 }, { "epoch": 0.5660377358490566, "grad_norm": 16.680891036987305, "learning_rate": 3.9752700054483553e-07, "loss": 0.3049, "step": 39600 }, { "epoch": 0.5674671240708976, "grad_norm": 19.173202514648438, "learning_rate": 3.953316151203131e-07, "loss": 0.3078, "step": 39700 }, { "epoch": 0.5688965122927387, "grad_norm": 20.231666564941406, "learning_rate": 3.93138338048548e-07, "loss": 0.3094, "step": 39800 }, { "epoch": 0.5703259005145798, "grad_norm": 14.554839134216309, "learning_rate": 3.909472135090864e-07, "loss": 0.3196, "step": 39900 }, { "epoch": 0.5717552887364208, "grad_norm": 15.809792518615723, "learning_rate": 3.8875828563811594e-07, "loss": 0.2986, "step": 40000 }, { "epoch": 0.5731846769582619, "grad_norm": 8.43028450012207, "learning_rate": 3.865715985275763e-07, "loss": 0.2949, "step": 40100 }, { "epoch": 0.5746140651801029, "grad_norm": 14.66102123260498, "learning_rate": 3.843871962242712e-07, "loss": 0.2925, "step": 40200 }, { "epoch": 0.576043453401944, "grad_norm": 20.20011329650879, "learning_rate": 3.8220512272898087e-07, "loss": 0.3037, "step": 40300 }, { "epoch": 0.577472841623785, "grad_norm": 11.075307846069336, "learning_rate": 3.800254219955763e-07, "loss": 0.3033, "step": 40400 }, { "epoch": 0.5789022298456261, "grad_norm": 15.223038673400879, "learning_rate": 3.778481379301332e-07, "loss": 0.3267, "step": 40500 }, { "epoch": 0.5803316180674671, "grad_norm": 19.251392364501953, "learning_rate": 3.756733143900484e-07, "loss": 0.3058, "step": 40600 }, { "epoch": 0.5817610062893082, "grad_norm": 7.108490467071533, "learning_rate": 3.7350099518315564e-07, "loss": 0.2868, "step": 40700 }, { "epoch": 0.5831903945111492, "grad_norm": 14.538580894470215, "learning_rate": 3.7133122406684336e-07, "loss": 0.3025, "step": 40800 }, { "epoch": 0.5846197827329903, "grad_norm": 12.984477996826172, "learning_rate": 3.6916404474717365e-07, "loss": 0.3118, "step": 40900 }, { "epoch": 0.5860491709548313, "grad_norm": 22.763214111328125, "learning_rate": 3.6699950087800135e-07, "loss": 0.3034, "step": 41000 }, { "epoch": 0.5874785591766724, "grad_norm": 17.55906867980957, "learning_rate": 3.648376360600953e-07, "loss": 0.303, "step": 41100 }, { "epoch": 0.5889079473985135, "grad_norm": 16.349145889282227, "learning_rate": 3.626784938402594e-07, "loss": 0.3113, "step": 41200 }, { "epoch": 0.5903373356203545, "grad_norm": 15.086002349853516, "learning_rate": 3.6052211771045614e-07, "loss": 0.3077, "step": 41300 }, { "epoch": 0.5917667238421955, "grad_norm": 18.546539306640625, "learning_rate": 3.5836855110692985e-07, "loss": 0.3047, "step": 41400 }, { "epoch": 0.5931961120640366, "grad_norm": 17.640857696533203, "learning_rate": 3.5621783740933247e-07, "loss": 0.317, "step": 41500 }, { "epoch": 0.5946255002858777, "grad_norm": 14.347726821899414, "learning_rate": 3.540700199398491e-07, "loss": 0.3209, "step": 41600 }, { "epoch": 0.5960548885077187, "grad_norm": 17.817596435546875, "learning_rate": 3.5192514196232573e-07, "loss": 0.3101, "step": 41700 }, { "epoch": 0.5974842767295597, "grad_norm": 15.510146141052246, "learning_rate": 3.497832466813975e-07, "loss": 0.2928, "step": 41800 }, { "epoch": 0.5989136649514007, "grad_norm": 21.39759635925293, "learning_rate": 3.476443772416192e-07, "loss": 0.2732, "step": 41900 }, { "epoch": 0.6003430531732419, "grad_norm": 15.604277610778809, "learning_rate": 3.4550857672659497e-07, "loss": 0.3201, "step": 42000 }, { "epoch": 0.6017724413950829, "grad_norm": 18.335630416870117, "learning_rate": 3.4337588815811125e-07, "loss": 0.32, "step": 42100 }, { "epoch": 0.6032018296169239, "grad_norm": 13.139364242553711, "learning_rate": 3.4124635449527e-07, "loss": 0.3195, "step": 42200 }, { "epoch": 0.6046312178387651, "grad_norm": 2.8682501316070557, "learning_rate": 3.391200186336236e-07, "loss": 0.2983, "step": 42300 }, { "epoch": 0.6060606060606061, "grad_norm": 11.018492698669434, "learning_rate": 3.369969234043102e-07, "loss": 0.3084, "step": 42400 }, { "epoch": 0.6074899942824471, "grad_norm": 15.238458633422852, "learning_rate": 3.3487711157319185e-07, "loss": 0.301, "step": 42500 }, { "epoch": 0.6089193825042881, "grad_norm": 13.045623779296875, "learning_rate": 3.32760625839992e-07, "loss": 0.3123, "step": 42600 }, { "epoch": 0.6103487707261293, "grad_norm": 16.57142448425293, "learning_rate": 3.306475088374365e-07, "loss": 0.3027, "step": 42700 }, { "epoch": 0.6117781589479703, "grad_norm": 19.278446197509766, "learning_rate": 3.285378031303939e-07, "loss": 0.32, "step": 42800 }, { "epoch": 0.6132075471698113, "grad_norm": 18.457469940185547, "learning_rate": 3.2643155121501874e-07, "loss": 0.3105, "step": 42900 }, { "epoch": 0.6146369353916524, "grad_norm": 26.263347625732422, "learning_rate": 3.243287955178949e-07, "loss": 0.2808, "step": 43000 }, { "epoch": 0.6160663236134934, "grad_norm": 14.898819923400879, "learning_rate": 3.2222957839518163e-07, "loss": 0.3022, "step": 43100 }, { "epoch": 0.6174957118353345, "grad_norm": 14.23936939239502, "learning_rate": 3.201339421317602e-07, "loss": 0.304, "step": 43200 }, { "epoch": 0.6189251000571755, "grad_norm": 13.336498260498047, "learning_rate": 3.180419289403816e-07, "loss": 0.2864, "step": 43300 }, { "epoch": 0.6203544882790166, "grad_norm": 17.907373428344727, "learning_rate": 3.1595358096081725e-07, "loss": 0.3192, "step": 43400 }, { "epoch": 0.6217838765008576, "grad_norm": 19.81977653503418, "learning_rate": 3.1386894025900903e-07, "loss": 0.2965, "step": 43500 }, { "epoch": 0.6232132647226987, "grad_norm": 14.139717102050781, "learning_rate": 3.117880488262228e-07, "loss": 0.3052, "step": 43600 }, { "epoch": 0.6246426529445397, "grad_norm": 14.271943092346191, "learning_rate": 3.0971094857820214e-07, "loss": 0.305, "step": 43700 }, { "epoch": 0.6260720411663808, "grad_norm": 16.517471313476562, "learning_rate": 3.0763768135432444e-07, "loss": 0.298, "step": 43800 }, { "epoch": 0.6275014293882218, "grad_norm": 15.435643196105957, "learning_rate": 3.0556828891675746e-07, "loss": 0.282, "step": 43900 }, { "epoch": 0.6289308176100629, "grad_norm": 13.524007797241211, "learning_rate": 3.0350281294961877e-07, "loss": 0.318, "step": 44000 }, { "epoch": 0.630360205831904, "grad_norm": 17.173023223876953, "learning_rate": 3.014412950581357e-07, "loss": 0.2895, "step": 44100 }, { "epoch": 0.631789594053745, "grad_norm": 19.615671157836914, "learning_rate": 2.993837767678074e-07, "loss": 0.3104, "step": 44200 }, { "epoch": 0.633218982275586, "grad_norm": 9.908998489379883, "learning_rate": 2.973302995235686e-07, "loss": 0.3202, "step": 44300 }, { "epoch": 0.6346483704974271, "grad_norm": 14.276174545288086, "learning_rate": 2.95280904688954e-07, "loss": 0.306, "step": 44400 }, { "epoch": 0.6360777587192682, "grad_norm": 14.082444190979004, "learning_rate": 2.932356335452664e-07, "loss": 0.3106, "step": 44500 }, { "epoch": 0.6375071469411092, "grad_norm": 17.640113830566406, "learning_rate": 2.9119452729074386e-07, "loss": 0.2856, "step": 44600 }, { "epoch": 0.6389365351629502, "grad_norm": 16.667879104614258, "learning_rate": 2.8915762703973066e-07, "loss": 0.2948, "step": 44700 }, { "epoch": 0.6403659233847913, "grad_norm": 15.171881675720215, "learning_rate": 2.871249738218486e-07, "loss": 0.3096, "step": 44800 }, { "epoch": 0.6417953116066324, "grad_norm": 13.525574684143066, "learning_rate": 2.850966085811711e-07, "loss": 0.2743, "step": 44900 }, { "epoch": 0.6432246998284734, "grad_norm": 15.58211612701416, "learning_rate": 2.830725721753976e-07, "loss": 0.3227, "step": 45000 }, { "epoch": 0.6446540880503144, "grad_norm": 13.05323600769043, "learning_rate": 2.810529053750319e-07, "loss": 0.3017, "step": 45100 }, { "epoch": 0.6460834762721556, "grad_norm": 19.23615074157715, "learning_rate": 2.7903764886255936e-07, "loss": 0.2926, "step": 45200 }, { "epoch": 0.6475128644939966, "grad_norm": 23.290681838989258, "learning_rate": 2.770268432316286e-07, "loss": 0.3126, "step": 45300 }, { "epoch": 0.6489422527158376, "grad_norm": 19.66526222229004, "learning_rate": 2.750205289862331e-07, "loss": 0.3055, "step": 45400 }, { "epoch": 0.6503716409376786, "grad_norm": 13.931109428405762, "learning_rate": 2.730187465398961e-07, "loss": 0.2947, "step": 45500 }, { "epoch": 0.6518010291595198, "grad_norm": 21.40576934814453, "learning_rate": 2.7102153621485553e-07, "loss": 0.3257, "step": 45600 }, { "epoch": 0.6532304173813608, "grad_norm": 19.681676864624023, "learning_rate": 2.6902893824125233e-07, "loss": 0.3042, "step": 45700 }, { "epoch": 0.6546598056032018, "grad_norm": 17.58971405029297, "learning_rate": 2.670409927563207e-07, "loss": 0.2952, "step": 45800 }, { "epoch": 0.6560891938250428, "grad_norm": 17.782339096069336, "learning_rate": 2.650577398035781e-07, "loss": 0.3147, "step": 45900 }, { "epoch": 0.657518582046884, "grad_norm": 11.956877708435059, "learning_rate": 2.6307921933202037e-07, "loss": 0.312, "step": 46000 }, { "epoch": 0.658947970268725, "grad_norm": 9.567434310913086, "learning_rate": 2.6110547119531566e-07, "loss": 0.3264, "step": 46100 }, { "epoch": 0.660377358490566, "grad_norm": 14.589381217956543, "learning_rate": 2.591365351510023e-07, "loss": 0.2907, "step": 46200 }, { "epoch": 0.6618067467124071, "grad_norm": 18.533931732177734, "learning_rate": 2.5717245085968763e-07, "loss": 0.2753, "step": 46300 }, { "epoch": 0.6632361349342482, "grad_norm": 14.891212463378906, "learning_rate": 2.5521325788424996e-07, "loss": 0.3186, "step": 46400 }, { "epoch": 0.6646655231560892, "grad_norm": 14.884139060974121, "learning_rate": 2.532589956890405e-07, "loss": 0.3131, "step": 46500 }, { "epoch": 0.6660949113779302, "grad_norm": 16.716190338134766, "learning_rate": 2.5130970363908866e-07, "loss": 0.3048, "step": 46600 }, { "epoch": 0.6675242995997713, "grad_norm": 11.722126007080078, "learning_rate": 2.493654209993102e-07, "loss": 0.2993, "step": 46700 }, { "epoch": 0.6689536878216124, "grad_norm": 14.006104469299316, "learning_rate": 2.47426186933715e-07, "loss": 0.3128, "step": 46800 }, { "epoch": 0.6703830760434534, "grad_norm": 15.97590160369873, "learning_rate": 2.454920405046185e-07, "loss": 0.2967, "step": 46900 }, { "epoch": 0.6718124642652945, "grad_norm": 9.723312377929688, "learning_rate": 2.435630206718552e-07, "loss": 0.2879, "step": 47000 }, { "epoch": 0.6732418524871355, "grad_norm": 15.608738899230957, "learning_rate": 2.4163916629199383e-07, "loss": 0.3042, "step": 47100 }, { "epoch": 0.6746712407089765, "grad_norm": 14.241352081298828, "learning_rate": 2.397205161175547e-07, "loss": 0.3032, "step": 47200 }, { "epoch": 0.6761006289308176, "grad_norm": 14.38450813293457, "learning_rate": 2.3780710879622863e-07, "loss": 0.2877, "step": 47300 }, { "epoch": 0.6775300171526587, "grad_norm": 23.245580673217773, "learning_rate": 2.358989828700987e-07, "loss": 0.3215, "step": 47400 }, { "epoch": 0.6789594053744997, "grad_norm": 17.788522720336914, "learning_rate": 2.3399617677486394e-07, "loss": 0.3228, "step": 47500 }, { "epoch": 0.6803887935963407, "grad_norm": 21.889699935913086, "learning_rate": 2.320987288390658e-07, "loss": 0.2814, "step": 47600 }, { "epoch": 0.6818181818181818, "grad_norm": 20.845090866088867, "learning_rate": 2.302066772833146e-07, "loss": 0.3207, "step": 47700 }, { "epoch": 0.6832475700400229, "grad_norm": 16.08342742919922, "learning_rate": 2.2832006021952092e-07, "loss": 0.3051, "step": 47800 }, { "epoch": 0.6846769582618639, "grad_norm": 10.725104331970215, "learning_rate": 2.2643891565012724e-07, "loss": 0.309, "step": 47900 }, { "epoch": 0.6861063464837049, "grad_norm": 13.660962104797363, "learning_rate": 2.24563281467343e-07, "loss": 0.2955, "step": 48000 }, { "epoch": 0.6875357347055461, "grad_norm": 10.941113471984863, "learning_rate": 2.226931954523807e-07, "loss": 0.3069, "step": 48100 }, { "epoch": 0.6889651229273871, "grad_norm": 14.538285255432129, "learning_rate": 2.2082869527469522e-07, "loss": 0.3019, "step": 48200 }, { "epoch": 0.6903945111492281, "grad_norm": 14.886780738830566, "learning_rate": 2.1896981849122486e-07, "loss": 0.2759, "step": 48300 }, { "epoch": 0.6918238993710691, "grad_norm": 20.888874053955078, "learning_rate": 2.1711660254563534e-07, "loss": 0.3089, "step": 48400 }, { "epoch": 0.6932532875929103, "grad_norm": 8.578594207763672, "learning_rate": 2.1526908476756512e-07, "loss": 0.2925, "step": 48500 }, { "epoch": 0.6946826758147513, "grad_norm": 17.534717559814453, "learning_rate": 2.1342730237187312e-07, "loss": 0.304, "step": 48600 }, { "epoch": 0.6961120640365923, "grad_norm": 13.420719146728516, "learning_rate": 2.1159129245788966e-07, "loss": 0.306, "step": 48700 }, { "epoch": 0.6975414522584333, "grad_norm": 13.839432716369629, "learning_rate": 2.0976109200866931e-07, "loss": 0.3115, "step": 48800 }, { "epoch": 0.6989708404802745, "grad_norm": 18.911731719970703, "learning_rate": 2.0793673789024525e-07, "loss": 0.3088, "step": 48900 }, { "epoch": 0.7004002287021155, "grad_norm": 17.34248924255371, "learning_rate": 2.061182668508872e-07, "loss": 0.3161, "step": 49000 }, { "epoch": 0.7018296169239565, "grad_norm": 17.372520446777344, "learning_rate": 2.043057155203608e-07, "loss": 0.3041, "step": 49100 }, { "epoch": 0.7032590051457976, "grad_norm": 18.078920364379883, "learning_rate": 2.0249912040919053e-07, "loss": 0.3062, "step": 49200 }, { "epoch": 0.7046883933676387, "grad_norm": 17.021692276000977, "learning_rate": 2.0069851790792335e-07, "loss": 0.3096, "step": 49300 }, { "epoch": 0.7061177815894797, "grad_norm": 16.885961532592773, "learning_rate": 1.989039442863961e-07, "loss": 0.2972, "step": 49400 }, { "epoch": 0.7075471698113207, "grad_norm": 20.15297508239746, "learning_rate": 1.9711543569300476e-07, "loss": 0.3016, "step": 49500 }, { "epoch": 0.7089765580331618, "grad_norm": 13.195883750915527, "learning_rate": 1.95333028153977e-07, "loss": 0.2997, "step": 49600 }, { "epoch": 0.7104059462550029, "grad_norm": 22.616363525390625, "learning_rate": 1.9355675757264524e-07, "loss": 0.3007, "step": 49700 }, { "epoch": 0.7118353344768439, "grad_norm": 11.725162506103516, "learning_rate": 1.9178665972872476e-07, "loss": 0.3052, "step": 49800 }, { "epoch": 0.713264722698685, "grad_norm": 12.43447208404541, "learning_rate": 1.9002277027759183e-07, "loss": 0.2907, "step": 49900 }, { "epoch": 0.714694110920526, "grad_norm": 14.980326652526855, "learning_rate": 1.882651247495662e-07, "loss": 0.2869, "step": 50000 }, { "epoch": 0.7161234991423671, "grad_norm": 15.194437026977539, "learning_rate": 1.8651375854919554e-07, "loss": 0.3026, "step": 50100 }, { "epoch": 0.7175528873642081, "grad_norm": 16.40146827697754, "learning_rate": 1.8476870695454154e-07, "loss": 0.3213, "step": 50200 }, { "epoch": 0.7189822755860492, "grad_norm": 11.521158218383789, "learning_rate": 1.8303000511646998e-07, "loss": 0.3151, "step": 50300 }, { "epoch": 0.7204116638078902, "grad_norm": 14.188138961791992, "learning_rate": 1.8129768805794216e-07, "loss": 0.3278, "step": 50400 }, { "epoch": 0.7218410520297313, "grad_norm": 16.83079719543457, "learning_rate": 1.795717906733102e-07, "loss": 0.3117, "step": 50500 }, { "epoch": 0.7232704402515723, "grad_norm": 20.378862380981445, "learning_rate": 1.7785234772761325e-07, "loss": 0.3026, "step": 50600 }, { "epoch": 0.7246998284734134, "grad_norm": 5.174847602844238, "learning_rate": 1.7613939385587767e-07, "loss": 0.2872, "step": 50700 }, { "epoch": 0.7261292166952544, "grad_norm": 17.4864559173584, "learning_rate": 1.7443296356241932e-07, "loss": 0.3038, "step": 50800 }, { "epoch": 0.7275586049170955, "grad_norm": 16.40941047668457, "learning_rate": 1.727330912201488e-07, "loss": 0.2899, "step": 50900 }, { "epoch": 0.7289879931389366, "grad_norm": 14.583476066589355, "learning_rate": 1.7103981106987832e-07, "loss": 0.2797, "step": 51000 }, { "epoch": 0.7304173813607776, "grad_norm": 19.992061614990234, "learning_rate": 1.6935315721963306e-07, "loss": 0.2794, "step": 51100 }, { "epoch": 0.7318467695826186, "grad_norm": 13.148829460144043, "learning_rate": 1.676731636439629e-07, "loss": 0.3001, "step": 51200 }, { "epoch": 0.7332761578044596, "grad_norm": 21.785202026367188, "learning_rate": 1.659998641832593e-07, "loss": 0.3027, "step": 51300 }, { "epoch": 0.7347055460263008, "grad_norm": 18.78382110595703, "learning_rate": 1.643332925430726e-07, "loss": 0.2709, "step": 51400 }, { "epoch": 0.7361349342481418, "grad_norm": 11.865448951721191, "learning_rate": 1.6267348229343375e-07, "loss": 0.2933, "step": 51500 }, { "epoch": 0.7375643224699828, "grad_norm": 10.925949096679688, "learning_rate": 1.610204668681776e-07, "loss": 0.3232, "step": 51600 }, { "epoch": 0.7389937106918238, "grad_norm": 17.886837005615234, "learning_rate": 1.5937427956427018e-07, "loss": 0.3069, "step": 51700 }, { "epoch": 0.740423098913665, "grad_norm": 25.283750534057617, "learning_rate": 1.5773495354113726e-07, "loss": 0.296, "step": 51800 }, { "epoch": 0.741852487135506, "grad_norm": 15.996776580810547, "learning_rate": 1.5610252181999662e-07, "loss": 0.3013, "step": 51900 }, { "epoch": 0.743281875357347, "grad_norm": 25.283042907714844, "learning_rate": 1.544770172831929e-07, "loss": 0.3153, "step": 52000 }, { "epoch": 0.7447112635791882, "grad_norm": 19.91518211364746, "learning_rate": 1.528584726735358e-07, "loss": 0.3067, "step": 52100 }, { "epoch": 0.7461406518010292, "grad_norm": 18.83380126953125, "learning_rate": 1.5124692059363953e-07, "loss": 0.3223, "step": 52200 }, { "epoch": 0.7475700400228702, "grad_norm": 15.652589797973633, "learning_rate": 1.496423935052666e-07, "loss": 0.3143, "step": 52300 }, { "epoch": 0.7489994282447112, "grad_norm": 20.540328979492188, "learning_rate": 1.4804492372867445e-07, "loss": 0.3113, "step": 52400 }, { "epoch": 0.7504288164665524, "grad_norm": 18.228116989135742, "learning_rate": 1.4645454344196335e-07, "loss": 0.3023, "step": 52500 }, { "epoch": 0.7518582046883934, "grad_norm": 14.827437400817871, "learning_rate": 1.448712846804292e-07, "loss": 0.2828, "step": 52600 }, { "epoch": 0.7532875929102344, "grad_norm": 19.521427154541016, "learning_rate": 1.4329517933591757e-07, "loss": 0.3049, "step": 52700 }, { "epoch": 0.7547169811320755, "grad_norm": 8.15851879119873, "learning_rate": 1.4172625915618165e-07, "loss": 0.2879, "step": 52800 }, { "epoch": 0.7561463693539165, "grad_norm": 11.626235961914062, "learning_rate": 1.4016455574424247e-07, "loss": 0.3031, "step": 52900 }, { "epoch": 0.7575757575757576, "grad_norm": 8.977296829223633, "learning_rate": 1.3861010055775297e-07, "loss": 0.299, "step": 53000 }, { "epoch": 0.7590051457975986, "grad_norm": 18.390705108642578, "learning_rate": 1.3706292490836346e-07, "loss": 0.3019, "step": 53100 }, { "epoch": 0.7604345340194397, "grad_norm": 16.448415756225586, "learning_rate": 1.3552305996109138e-07, "loss": 0.3224, "step": 53200 }, { "epoch": 0.7618639222412807, "grad_norm": 18.946380615234375, "learning_rate": 1.339905367336935e-07, "loss": 0.299, "step": 53300 }, { "epoch": 0.7632933104631218, "grad_norm": 21.892333984375, "learning_rate": 1.324653860960413e-07, "loss": 0.2939, "step": 53400 }, { "epoch": 0.7647226986849628, "grad_norm": 12.889732360839844, "learning_rate": 1.3094763876949873e-07, "loss": 0.3155, "step": 53500 }, { "epoch": 0.7661520869068039, "grad_norm": 18.003572463989258, "learning_rate": 1.294373253263034e-07, "loss": 0.2984, "step": 53600 }, { "epoch": 0.7675814751286449, "grad_norm": 17.548511505126953, "learning_rate": 1.279344761889516e-07, "loss": 0.2851, "step": 53700 }, { "epoch": 0.769010863350486, "grad_norm": 13.130178451538086, "learning_rate": 1.2643912162958442e-07, "loss": 0.2939, "step": 53800 }, { "epoch": 0.7704402515723271, "grad_norm": 12.267966270446777, "learning_rate": 1.2495129176937846e-07, "loss": 0.2928, "step": 53900 }, { "epoch": 0.7718696397941681, "grad_norm": 11.627607345581055, "learning_rate": 1.2347101657793906e-07, "loss": 0.3161, "step": 54000 }, { "epoch": 0.7732990280160091, "grad_norm": 17.394630432128906, "learning_rate": 1.2199832587269642e-07, "loss": 0.325, "step": 54100 }, { "epoch": 0.7747284162378502, "grad_norm": 21.596054077148438, "learning_rate": 1.2053324931830573e-07, "loss": 0.2937, "step": 54200 }, { "epoch": 0.7761578044596913, "grad_norm": 13.607353210449219, "learning_rate": 1.1907581642604853e-07, "loss": 0.3146, "step": 54300 }, { "epoch": 0.7775871926815323, "grad_norm": 11.432480812072754, "learning_rate": 1.1762605655323899e-07, "loss": 0.3115, "step": 54400 }, { "epoch": 0.7790165809033733, "grad_norm": 18.864477157592773, "learning_rate": 1.1618399890263215e-07, "loss": 0.2886, "step": 54500 }, { "epoch": 0.7804459691252144, "grad_norm": 17.996967315673828, "learning_rate": 1.1474967252183648e-07, "loss": 0.325, "step": 54600 }, { "epoch": 0.7818753573470555, "grad_norm": 16.265716552734375, "learning_rate": 1.1332310630272757e-07, "loss": 0.2988, "step": 54700 }, { "epoch": 0.7833047455688965, "grad_norm": 13.267022132873535, "learning_rate": 1.119043289808671e-07, "loss": 0.3189, "step": 54800 }, { "epoch": 0.7847341337907375, "grad_norm": 19.030851364135742, "learning_rate": 1.1049336913492347e-07, "loss": 0.3216, "step": 54900 }, { "epoch": 0.7861635220125787, "grad_norm": 16.178241729736328, "learning_rate": 1.090902551860966e-07, "loss": 0.2841, "step": 55000 }, { "epoch": 0.7875929102344197, "grad_norm": 16.9733943939209, "learning_rate": 1.0769501539754528e-07, "loss": 0.3049, "step": 55100 }, { "epoch": 0.7890222984562607, "grad_norm": 17.60822105407715, "learning_rate": 1.063076778738174e-07, "loss": 0.3102, "step": 55200 }, { "epoch": 0.7904516866781017, "grad_norm": 18.490419387817383, "learning_rate": 1.0492827056028442e-07, "loss": 0.3107, "step": 55300 }, { "epoch": 0.7918810748999429, "grad_norm": 20.003347396850586, "learning_rate": 1.0355682124257809e-07, "loss": 0.3199, "step": 55400 }, { "epoch": 0.7933104631217839, "grad_norm": 23.92873191833496, "learning_rate": 1.0219335754603136e-07, "loss": 0.3058, "step": 55500 }, { "epoch": 0.7947398513436249, "grad_norm": 24.447021484375, "learning_rate": 1.0083790693512101e-07, "loss": 0.321, "step": 55600 }, { "epoch": 0.796169239565466, "grad_norm": 18.436662673950195, "learning_rate": 9.94904967129151e-08, "loss": 0.3102, "step": 55700 }, { "epoch": 0.7975986277873071, "grad_norm": 12.65139389038086, "learning_rate": 9.81511540205226e-08, "loss": 0.3098, "step": 55800 }, { "epoch": 0.7990280160091481, "grad_norm": 18.66716194152832, "learning_rate": 9.681990583654732e-08, "loss": 0.3038, "step": 55900 }, { "epoch": 0.8004574042309891, "grad_norm": 20.361345291137695, "learning_rate": 9.549677897654368e-08, "loss": 0.3262, "step": 56000 }, { "epoch": 0.8018867924528302, "grad_norm": 9.910906791687012, "learning_rate": 9.418180009247679e-08, "loss": 0.2901, "step": 56100 }, { "epoch": 0.8033161806746713, "grad_norm": 17.09386444091797, "learning_rate": 9.287499567218621e-08, "loss": 0.2875, "step": 56200 }, { "epoch": 0.8047455688965123, "grad_norm": 16.45813751220703, "learning_rate": 9.157639203885137e-08, "loss": 0.3165, "step": 56300 }, { "epoch": 0.8061749571183533, "grad_norm": 22.256296157836914, "learning_rate": 9.028601535046243e-08, "loss": 0.2998, "step": 56400 }, { "epoch": 0.8076043453401944, "grad_norm": 17.575212478637695, "learning_rate": 8.900389159929234e-08, "loss": 0.2999, "step": 56500 }, { "epoch": 0.8090337335620355, "grad_norm": 13.822104454040527, "learning_rate": 8.773004661137384e-08, "loss": 0.2943, "step": 56600 }, { "epoch": 0.8104631217838765, "grad_norm": 14.172625541687012, "learning_rate": 8.646450604597955e-08, "loss": 0.306, "step": 56700 }, { "epoch": 0.8118925100057176, "grad_norm": 15.35146713256836, "learning_rate": 8.520729539510424e-08, "loss": 0.2998, "step": 56800 }, { "epoch": 0.8133218982275586, "grad_norm": 20.570680618286133, "learning_rate": 8.39584399829521e-08, "loss": 0.2973, "step": 56900 }, { "epoch": 0.8147512864493996, "grad_norm": 14.586490631103516, "learning_rate": 8.271796496542616e-08, "loss": 0.3159, "step": 57000 }, { "epoch": 0.8161806746712407, "grad_norm": 14.62299633026123, "learning_rate": 8.148589532962197e-08, "loss": 0.2654, "step": 57100 }, { "epoch": 0.8176100628930818, "grad_norm": 13.184954643249512, "learning_rate": 8.026225589332397e-08, "loss": 0.297, "step": 57200 }, { "epoch": 0.8190394511149228, "grad_norm": 16.64432716369629, "learning_rate": 7.904707130450566e-08, "loss": 0.294, "step": 57300 }, { "epoch": 0.8204688393367638, "grad_norm": 16.969867706298828, "learning_rate": 7.784036604083305e-08, "loss": 0.2852, "step": 57400 }, { "epoch": 0.8218982275586049, "grad_norm": 15.365966796875, "learning_rate": 7.664216440917204e-08, "loss": 0.3187, "step": 57500 }, { "epoch": 0.823327615780446, "grad_norm": 18.290414810180664, "learning_rate": 7.545249054509789e-08, "loss": 0.3017, "step": 57600 }, { "epoch": 0.824757004002287, "grad_norm": 17.441865921020508, "learning_rate": 7.427136841241027e-08, "loss": 0.2949, "step": 57700 }, { "epoch": 0.826186392224128, "grad_norm": 11.121231079101562, "learning_rate": 7.309882180264937e-08, "loss": 0.3005, "step": 57800 }, { "epoch": 0.8276157804459692, "grad_norm": 8.872564315795898, "learning_rate": 7.193487433461732e-08, "loss": 0.2918, "step": 57900 }, { "epoch": 0.8290451686678102, "grad_norm": 17.529985427856445, "learning_rate": 7.077954945390252e-08, "loss": 0.3059, "step": 58000 }, { "epoch": 0.8304745568896512, "grad_norm": 10.752988815307617, "learning_rate": 6.963287043240695e-08, "loss": 0.308, "step": 58100 }, { "epoch": 0.8319039451114922, "grad_norm": 16.11565399169922, "learning_rate": 6.84948603678775e-08, "loss": 0.2898, "step": 58200 }, { "epoch": 0.8333333333333334, "grad_norm": 14.971604347229004, "learning_rate": 6.736554218344115e-08, "loss": 0.3107, "step": 58300 }, { "epoch": 0.8347627215551744, "grad_norm": 18.02588653564453, "learning_rate": 6.624493862714265e-08, "loss": 0.3144, "step": 58400 }, { "epoch": 0.8361921097770154, "grad_norm": 21.18177032470703, "learning_rate": 6.513307227148657e-08, "loss": 0.2813, "step": 58500 }, { "epoch": 0.8376214979988564, "grad_norm": 12.731508255004883, "learning_rate": 6.40299655129824e-08, "loss": 0.3202, "step": 58600 }, { "epoch": 0.8390508862206976, "grad_norm": 19.12579345703125, "learning_rate": 6.293564057169415e-08, "loss": 0.2872, "step": 58700 }, { "epoch": 0.8404802744425386, "grad_norm": 20.496795654296875, "learning_rate": 6.185011949079172e-08, "loss": 0.3117, "step": 58800 }, { "epoch": 0.8419096626643796, "grad_norm": 18.454668045043945, "learning_rate": 6.077342413610742e-08, "loss": 0.2905, "step": 58900 }, { "epoch": 0.8433390508862207, "grad_norm": 23.58623695373535, "learning_rate": 5.970557619569577e-08, "loss": 0.2814, "step": 59000 }, { "epoch": 0.8447684391080618, "grad_norm": 16.98134994506836, "learning_rate": 5.8646597179396075e-08, "loss": 0.3093, "step": 59100 }, { "epoch": 0.8461978273299028, "grad_norm": 14.197776794433594, "learning_rate": 5.759650841839964e-08, "loss": 0.2985, "step": 59200 }, { "epoch": 0.8476272155517438, "grad_norm": 19.599674224853516, "learning_rate": 5.6555331064819635e-08, "loss": 0.2888, "step": 59300 }, { "epoch": 0.8490566037735849, "grad_norm": 10.197669982910156, "learning_rate": 5.552308609126544e-08, "loss": 0.3214, "step": 59400 }, { "epoch": 0.850485991995426, "grad_norm": 14.60055160522461, "learning_rate": 5.449979429041984e-08, "loss": 0.2995, "step": 59500 }, { "epoch": 0.851915380217267, "grad_norm": 10.661852836608887, "learning_rate": 5.3485476274620686e-08, "loss": 0.2998, "step": 59600 }, { "epoch": 0.8533447684391081, "grad_norm": 19.17547035217285, "learning_rate": 5.248015247544502e-08, "loss": 0.3045, "step": 59700 }, { "epoch": 0.8547741566609491, "grad_norm": 12.473791122436523, "learning_rate": 5.148384314329801e-08, "loss": 0.2963, "step": 59800 }, { "epoch": 0.8562035448827902, "grad_norm": 15.893697738647461, "learning_rate": 5.049656834700494e-08, "loss": 0.3115, "step": 59900 }, { "epoch": 0.8576329331046312, "grad_norm": 28.1324462890625, "learning_rate": 4.9518347973407036e-08, "loss": 0.2987, "step": 60000 }, { "epoch": 0.8590623213264723, "grad_norm": 8.90252685546875, "learning_rate": 4.854920172696053e-08, "loss": 0.3036, "step": 60100 }, { "epoch": 0.8604917095483133, "grad_norm": 17.27793312072754, "learning_rate": 4.7589149129340146e-08, "loss": 0.3074, "step": 60200 }, { "epoch": 0.8619210977701544, "grad_norm": 21.032102584838867, "learning_rate": 4.663820951904574e-08, "loss": 0.2954, "step": 60300 }, { "epoch": 0.8633504859919954, "grad_norm": 16.893978118896484, "learning_rate": 4.569640205101261e-08, "loss": 0.2866, "step": 60400 }, { "epoch": 0.8647798742138365, "grad_norm": 18.381515502929688, "learning_rate": 4.476374569622599e-08, "loss": 0.3053, "step": 60500 }, { "epoch": 0.8662092624356775, "grad_norm": 14.010221481323242, "learning_rate": 4.3840259241338554e-08, "loss": 0.3004, "step": 60600 }, { "epoch": 0.8676386506575186, "grad_norm": 21.26782989501953, "learning_rate": 4.2925961288292065e-08, "loss": 0.308, "step": 60700 }, { "epoch": 0.8690680388793597, "grad_norm": 21.457979202270508, "learning_rate": 4.202087025394313e-08, "loss": 0.3102, "step": 60800 }, { "epoch": 0.8704974271012007, "grad_norm": 19.865032196044922, "learning_rate": 4.112500436969146e-08, "loss": 0.2831, "step": 60900 }, { "epoch": 0.8719268153230417, "grad_norm": 20.80860710144043, "learning_rate": 4.023838168111321e-08, "loss": 0.3058, "step": 61000 }, { "epoch": 0.8733562035448827, "grad_norm": 20.485183715820312, "learning_rate": 3.9361020047597174e-08, "loss": 0.2788, "step": 61100 }, { "epoch": 0.8747855917667239, "grad_norm": 20.498945236206055, "learning_rate": 3.849293714198548e-08, "loss": 0.2821, "step": 61200 }, { "epoch": 0.8762149799885649, "grad_norm": 18.507490158081055, "learning_rate": 3.76341504502169e-08, "loss": 0.3095, "step": 61300 }, { "epoch": 0.8776443682104059, "grad_norm": 15.786584854125977, "learning_rate": 3.6784677270975284e-08, "loss": 0.2772, "step": 61400 }, { "epoch": 0.8790737564322469, "grad_norm": 17.30613136291504, "learning_rate": 3.594453471534059e-08, "loss": 0.3102, "step": 61500 }, { "epoch": 0.8805031446540881, "grad_norm": 19.98883056640625, "learning_rate": 3.5113739706444714e-08, "loss": 0.3084, "step": 61600 }, { "epoch": 0.8819325328759291, "grad_norm": 17.401906967163086, "learning_rate": 3.4292308979130434e-08, "loss": 0.2959, "step": 61700 }, { "epoch": 0.8833619210977701, "grad_norm": 18.43778419494629, "learning_rate": 3.3480259079613936e-08, "loss": 0.3141, "step": 61800 }, { "epoch": 0.8847913093196113, "grad_norm": 17.81369972229004, "learning_rate": 3.267760636515199e-08, "loss": 0.2891, "step": 61900 }, { "epoch": 0.8862206975414523, "grad_norm": 17.828996658325195, "learning_rate": 3.1884367003712166e-08, "loss": 0.2943, "step": 62000 }, { "epoch": 0.8876500857632933, "grad_norm": 15.651758193969727, "learning_rate": 3.110055697364755e-08, "loss": 0.3214, "step": 62100 }, { "epoch": 0.8890794739851343, "grad_norm": 14.183752059936523, "learning_rate": 3.032619206337445e-08, "loss": 0.3032, "step": 62200 }, { "epoch": 0.8905088622069754, "grad_norm": 13.557807922363281, "learning_rate": 2.9561287871054495e-08, "loss": 0.3104, "step": 62300 }, { "epoch": 0.8919382504288165, "grad_norm": 14.538249969482422, "learning_rate": 2.880585980428063e-08, "loss": 0.2927, "step": 62400 }, { "epoch": 0.8933676386506575, "grad_norm": 17.610340118408203, "learning_rate": 2.8059923079766624e-08, "loss": 0.3133, "step": 62500 }, { "epoch": 0.8947970268724986, "grad_norm": 19.14527130126953, "learning_rate": 2.732349272304041e-08, "loss": 0.311, "step": 62600 }, { "epoch": 0.8962264150943396, "grad_norm": 20.89178466796875, "learning_rate": 2.659658356814176e-08, "loss": 0.3035, "step": 62700 }, { "epoch": 0.8976558033161807, "grad_norm": 9.735486030578613, "learning_rate": 2.5879210257322925e-08, "loss": 0.3109, "step": 62800 }, { "epoch": 0.8990851915380217, "grad_norm": 33.927120208740234, "learning_rate": 2.5171387240754628e-08, "loss": 0.3169, "step": 62900 }, { "epoch": 0.9005145797598628, "grad_norm": 19.379432678222656, "learning_rate": 2.447312877623403e-08, "loss": 0.3095, "step": 63000 }, { "epoch": 0.9019439679817038, "grad_norm": 14.171624183654785, "learning_rate": 2.3784448928897914e-08, "loss": 0.2854, "step": 63100 }, { "epoch": 0.9033733562035449, "grad_norm": 17.32662582397461, "learning_rate": 2.3105361570939398e-08, "loss": 0.2984, "step": 63200 }, { "epoch": 0.9048027444253859, "grad_norm": 17.91543197631836, "learning_rate": 2.2435880381328765e-08, "loss": 0.3016, "step": 63300 }, { "epoch": 0.906232132647227, "grad_norm": 19.051382064819336, "learning_rate": 2.1776018845537257e-08, "loss": 0.3017, "step": 63400 }, { "epoch": 0.907661520869068, "grad_norm": 19.910274505615234, "learning_rate": 2.1125790255266064e-08, "loss": 0.296, "step": 63500 }, { "epoch": 0.9090909090909091, "grad_norm": 17.08917236328125, "learning_rate": 2.048520770817819e-08, "loss": 0.306, "step": 63600 }, { "epoch": 0.9105202973127502, "grad_norm": 18.074064254760742, "learning_rate": 1.9854284107634956e-08, "loss": 0.3017, "step": 63700 }, { "epoch": 0.9119496855345912, "grad_norm": 21.050952911376953, "learning_rate": 1.9233032162435824e-08, "loss": 0.3063, "step": 63800 }, { "epoch": 0.9133790737564322, "grad_norm": 14.34924030303955, "learning_rate": 1.862146438656237e-08, "loss": 0.3152, "step": 63900 }, { "epoch": 0.9148084619782733, "grad_norm": 21.398719787597656, "learning_rate": 1.8019593098926534e-08, "loss": 0.3043, "step": 64000 }, { "epoch": 0.9162378502001144, "grad_norm": 15.73678970336914, "learning_rate": 1.7427430423122113e-08, "loss": 0.2963, "step": 64100 }, { "epoch": 0.9176672384219554, "grad_norm": 16.367048263549805, "learning_rate": 1.6844988287180895e-08, "loss": 0.2922, "step": 64200 }, { "epoch": 0.9190966266437964, "grad_norm": 12.501718521118164, "learning_rate": 1.6272278423332063e-08, "loss": 0.2943, "step": 64300 }, { "epoch": 0.9205260148656375, "grad_norm": 21.266305923461914, "learning_rate": 1.5709312367765958e-08, "loss": 0.3004, "step": 64400 }, { "epoch": 0.9219554030874786, "grad_norm": 26.346576690673828, "learning_rate": 1.5156101460401916e-08, "loss": 0.3057, "step": 64500 }, { "epoch": 0.9233847913093196, "grad_norm": 23.759737014770508, "learning_rate": 1.4612656844659676e-08, "loss": 0.3027, "step": 64600 }, { "epoch": 0.9248141795311606, "grad_norm": 20.683504104614258, "learning_rate": 1.4078989467234903e-08, "loss": 0.3132, "step": 64700 }, { "epoch": 0.9262435677530018, "grad_norm": 21.913318634033203, "learning_rate": 1.355511007787863e-08, "loss": 0.2894, "step": 64800 }, { "epoch": 0.9276729559748428, "grad_norm": 17.271446228027344, "learning_rate": 1.3041029229180823e-08, "loss": 0.3115, "step": 64900 }, { "epoch": 0.9291023441966838, "grad_norm": 26.936786651611328, "learning_rate": 1.2536757276358057e-08, "loss": 0.2985, "step": 65000 }, { "epoch": 0.9305317324185248, "grad_norm": 21.67516326904297, "learning_rate": 1.2042304377044455e-08, "loss": 0.3182, "step": 65100 }, { "epoch": 0.931961120640366, "grad_norm": 16.42747688293457, "learning_rate": 1.155768049108724e-08, "loss": 0.3072, "step": 65200 }, { "epoch": 0.933390508862207, "grad_norm": 21.463115692138672, "learning_rate": 1.1082895380346346e-08, "loss": 0.3053, "step": 65300 }, { "epoch": 0.934819897084048, "grad_norm": 27.757343292236328, "learning_rate": 1.0617958608497568e-08, "loss": 0.3027, "step": 65400 }, { "epoch": 0.9362492853058891, "grad_norm": 7.035409450531006, "learning_rate": 1.0162879540839887e-08, "loss": 0.2905, "step": 65500 }, { "epoch": 0.9376786735277302, "grad_norm": 15.803873062133789, "learning_rate": 9.717667344107116e-09, "loss": 0.3052, "step": 65600 }, { "epoch": 0.9391080617495712, "grad_norm": 20.86280632019043, "learning_rate": 9.282330986282672e-09, "loss": 0.3027, "step": 65700 }, { "epoch": 0.9405374499714122, "grad_norm": 17.12143898010254, "learning_rate": 8.856879236419701e-09, "loss": 0.2948, "step": 65800 }, { "epoch": 0.9419668381932533, "grad_norm": 16.723186492919922, "learning_rate": 8.441320664463792e-09, "loss": 0.2965, "step": 65900 }, { "epoch": 0.9433962264150944, "grad_norm": 14.78087043762207, "learning_rate": 8.035663641080658e-09, "loss": 0.3377, "step": 66000 }, { "epoch": 0.9448256146369354, "grad_norm": 26.990137100219727, "learning_rate": 7.639916337487396e-09, "loss": 0.3133, "step": 66100 }, { "epoch": 0.9462550028587764, "grad_norm": 18.1567325592041, "learning_rate": 7.254086725288156e-09, "loss": 0.3036, "step": 66200 }, { "epoch": 0.9476843910806175, "grad_norm": 20.834636688232422, "learning_rate": 6.87818257631323e-09, "loss": 0.2954, "step": 66300 }, { "epoch": 0.9491137793024585, "grad_norm": 18.86599349975586, "learning_rate": 6.5122114624626136e-09, "loss": 0.3075, "step": 66400 }, { "epoch": 0.9505431675242996, "grad_norm": 22.019954681396484, "learning_rate": 6.156180755553575e-09, "loss": 0.3053, "step": 66500 }, { "epoch": 0.9519725557461407, "grad_norm": 17.80645751953125, "learning_rate": 5.810097627172161e-09, "loss": 0.3211, "step": 66600 }, { "epoch": 0.9534019439679817, "grad_norm": 23.562326431274414, "learning_rate": 5.473969048528593e-09, "loss": 0.3259, "step": 66700 }, { "epoch": 0.9548313321898227, "grad_norm": 15.166871070861816, "learning_rate": 5.14780179031693e-09, "loss": 0.3177, "step": 66800 }, { "epoch": 0.9562607204116638, "grad_norm": 15.927968978881836, "learning_rate": 4.831602422578851e-09, "loss": 0.2979, "step": 66900 }, { "epoch": 0.9576901086335049, "grad_norm": 18.44903564453125, "learning_rate": 4.525377314570866e-09, "loss": 0.2984, "step": 67000 }, { "epoch": 0.9591194968553459, "grad_norm": 14.678949356079102, "learning_rate": 4.2291326346365895e-09, "loss": 0.2952, "step": 67100 }, { "epoch": 0.9605488850771869, "grad_norm": 16.67792510986328, "learning_rate": 3.942874350082059e-09, "loss": 0.2911, "step": 67200 }, { "epoch": 0.961978273299028, "grad_norm": 13.899800300598145, "learning_rate": 3.6666082270556674e-09, "loss": 0.3038, "step": 67300 }, { "epoch": 0.9634076615208691, "grad_norm": 16.707815170288086, "learning_rate": 3.400339830432086e-09, "loss": 0.2981, "step": 67400 }, { "epoch": 0.9648370497427101, "grad_norm": 20.098234176635742, "learning_rate": 3.14407452370008e-09, "loss": 0.3179, "step": 67500 }, { "epoch": 0.9662664379645511, "grad_norm": 17.343828201293945, "learning_rate": 2.897817468854591e-09, "loss": 0.2798, "step": 67600 }, { "epoch": 0.9676958261863923, "grad_norm": 10.933745384216309, "learning_rate": 2.6615736262926015e-09, "loss": 0.3104, "step": 67700 }, { "epoch": 0.9691252144082333, "grad_norm": 20.136072158813477, "learning_rate": 2.4353477547131572e-09, "loss": 0.3015, "step": 67800 }, { "epoch": 0.9705546026300743, "grad_norm": 9.679976463317871, "learning_rate": 2.2191444110218893e-09, "loss": 0.2847, "step": 67900 }, { "epoch": 0.9719839908519153, "grad_norm": 18.093830108642578, "learning_rate": 2.0129679502388642e-09, "loss": 0.2998, "step": 68000 }, { "epoch": 0.9734133790737565, "grad_norm": 16.950090408325195, "learning_rate": 1.816822525410877e-09, "loss": 0.3085, "step": 68100 }, { "epoch": 0.9748427672955975, "grad_norm": 16.35737419128418, "learning_rate": 1.630712087528019e-09, "loss": 0.3251, "step": 68200 }, { "epoch": 0.9762721555174385, "grad_norm": 19.01432991027832, "learning_rate": 1.454640385444017e-09, "loss": 0.3307, "step": 68300 }, { "epoch": 0.9777015437392796, "grad_norm": 22.79551887512207, "learning_rate": 1.2886109658004075e-09, "loss": 0.2819, "step": 68400 }, { "epoch": 0.9791309319611207, "grad_norm": 30.52099609375, "learning_rate": 1.1326271729556469e-09, "loss": 0.3163, "step": 68500 }, { "epoch": 0.9805603201829617, "grad_norm": 15.844212532043457, "learning_rate": 9.866921489172232e-10, "loss": 0.2924, "step": 68600 }, { "epoch": 0.9819897084048027, "grad_norm": 12.840483665466309, "learning_rate": 8.508088332788155e-10, "loss": 0.3141, "step": 68700 }, { "epoch": 0.9834190966266438, "grad_norm": 11.82694149017334, "learning_rate": 7.249799631606213e-10, "loss": 0.309, "step": 68800 }, { "epoch": 0.9848484848484849, "grad_norm": 20.686918258666992, "learning_rate": 6.092080731546767e-10, "loss": 0.2739, "step": 68900 }, { "epoch": 0.9862778730703259, "grad_norm": 15.944323539733887, "learning_rate": 5.034954952734538e-10, "loss": 0.3016, "step": 69000 }, { "epoch": 0.9877072612921669, "grad_norm": 15.672585487365723, "learning_rate": 4.0784435890312e-10, "loss": 0.2912, "step": 69100 }, { "epoch": 0.989136649514008, "grad_norm": 15.7535982131958, "learning_rate": 3.222565907604058e-10, "loss": 0.2871, "step": 69200 }, { "epoch": 0.9905660377358491, "grad_norm": 15.820429801940918, "learning_rate": 2.4673391485408033e-10, "loss": 0.2808, "step": 69300 }, { "epoch": 0.9919954259576901, "grad_norm": 14.639655113220215, "learning_rate": 1.8127785244992367e-10, "loss": 0.3105, "step": 69400 }, { "epoch": 0.9934248141795312, "grad_norm": 19.168243408203125, "learning_rate": 1.258897220404731e-10, "loss": 0.308, "step": 69500 }, { "epoch": 0.9948542024013722, "grad_norm": 15.535967826843262, "learning_rate": 8.057063931804497e-11, "loss": 0.3076, "step": 69600 }, { "epoch": 0.9962835906232133, "grad_norm": 14.585685729980469, "learning_rate": 4.5321517152419054e-11, "loss": 0.3523, "step": 69700 }, { "epoch": 0.9977129788450543, "grad_norm": 16.058551788330078, "learning_rate": 2.014306557257539e-11, "loss": 0.3078, "step": 69800 }, { "epoch": 0.9991423670668954, "grad_norm": 15.687799453735352, "learning_rate": 5.0357917523724534e-12, "loss": 0.2877, "step": 69900 }, { "epoch": 1.0005717552887363, "grad_norm": 18.27077865600586, "learning_rate": 0.0, "loss": 0.3007, "step": 70000 } ], "logging_steps": 100, "max_steps": 70000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8818510438785434e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }