{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.163721601153985, "eval_steps": 200, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006010337780983291, "grad_norm": 0.9436860084533691, "learning_rate": 3.0048076923076927e-06, "loss": 4.4875, "step": 5 }, { "epoch": 0.0012020675561966582, "grad_norm": 0.49743083119392395, "learning_rate": 6.0096153846153855e-06, "loss": 4.2438, "step": 10 }, { "epoch": 0.0018031013342949874, "grad_norm": 0.7305557131767273, "learning_rate": 9.014423076923076e-06, "loss": 4.7438, "step": 15 }, { "epoch": 0.0024041351123933164, "grad_norm": 1.2842742204666138, "learning_rate": 1.2019230769230771e-05, "loss": 4.3125, "step": 20 }, { "epoch": 0.0030051688904916456, "grad_norm": 1.0128940343856812, "learning_rate": 1.5024038461538462e-05, "loss": 4.2969, "step": 25 }, { "epoch": 0.003606202668589975, "grad_norm": 1.6097222566604614, "learning_rate": 1.8028846153846152e-05, "loss": 4.3625, "step": 30 }, { "epoch": 0.004207236446688304, "grad_norm": 0.7380394339561462, "learning_rate": 2.103365384615385e-05, "loss": 3.6562, "step": 35 }, { "epoch": 0.004808270224786633, "grad_norm": 2.499553918838501, "learning_rate": 2.4038461538461542e-05, "loss": 3.9656, "step": 40 }, { "epoch": 0.005409304002884962, "grad_norm": 0.9382426142692566, "learning_rate": 2.704326923076923e-05, "loss": 3.7906, "step": 45 }, { "epoch": 0.006010337780983291, "grad_norm": 0.4448552429676056, "learning_rate": 3.0048076923076925e-05, "loss": 3.4531, "step": 50 }, { "epoch": 0.00661137155908162, "grad_norm": 0.6187996864318848, "learning_rate": 3.3052884615384615e-05, "loss": 3.0406, "step": 55 }, { "epoch": 0.00721240533717995, "grad_norm": 0.4894959032535553, "learning_rate": 3.6057692307692304e-05, "loss": 2.9844, "step": 60 }, { "epoch": 0.007813439115278278, "grad_norm": 0.523160994052887, "learning_rate": 3.90625e-05, "loss": 2.825, "step": 65 }, { "epoch": 0.008414472893376608, "grad_norm": 0.41818058490753174, "learning_rate": 4.20673076923077e-05, "loss": 2.5125, "step": 70 }, { "epoch": 0.009015506671474938, "grad_norm": 0.37683457136154175, "learning_rate": 4.507211538461539e-05, "loss": 2.6844, "step": 75 }, { "epoch": 0.009616540449573266, "grad_norm": 0.428375780582428, "learning_rate": 4.8076923076923084e-05, "loss": 2.6719, "step": 80 }, { "epoch": 0.010217574227671595, "grad_norm": 0.3897765576839447, "learning_rate": 5.108173076923077e-05, "loss": 2.2531, "step": 85 }, { "epoch": 0.010818608005769925, "grad_norm": 0.2265370637178421, "learning_rate": 5.408653846153846e-05, "loss": 2.2844, "step": 90 }, { "epoch": 0.011419641783868253, "grad_norm": 0.2113611400127411, "learning_rate": 5.709134615384615e-05, "loss": 2.1922, "step": 95 }, { "epoch": 0.012020675561966582, "grad_norm": 0.1886824667453766, "learning_rate": 6.009615384615385e-05, "loss": 2.3516, "step": 100 }, { "epoch": 0.012621709340064912, "grad_norm": 0.25855502486228943, "learning_rate": 6.310096153846154e-05, "loss": 2.4, "step": 105 }, { "epoch": 0.01322274311816324, "grad_norm": 0.22833962738513947, "learning_rate": 6.610576923076923e-05, "loss": 2.2844, "step": 110 }, { "epoch": 0.01382377689626157, "grad_norm": 0.30784738063812256, "learning_rate": 6.911057692307693e-05, "loss": 2.2016, "step": 115 }, { "epoch": 0.0144248106743599, "grad_norm": 0.3998744487762451, "learning_rate": 7.211538461538461e-05, "loss": 2.4125, "step": 120 }, { "epoch": 0.015025844452458229, "grad_norm": 0.24773858487606049, "learning_rate": 7.512019230769231e-05, "loss": 2.4156, "step": 125 }, { "epoch": 0.015626878230556557, "grad_norm": 0.26020580530166626, "learning_rate": 7.8125e-05, "loss": 2.0094, "step": 130 }, { "epoch": 0.016227912008654886, "grad_norm": 0.25112366676330566, "learning_rate": 8.112980769230769e-05, "loss": 2.5969, "step": 135 }, { "epoch": 0.016828945786753216, "grad_norm": 0.3155271112918854, "learning_rate": 8.41346153846154e-05, "loss": 1.9844, "step": 140 }, { "epoch": 0.017429979564851546, "grad_norm": 0.2684473693370819, "learning_rate": 8.713942307692307e-05, "loss": 2.3594, "step": 145 }, { "epoch": 0.018031013342949875, "grad_norm": 0.19519321620464325, "learning_rate": 9.014423076923077e-05, "loss": 2.1906, "step": 150 }, { "epoch": 0.0186320471210482, "grad_norm": 0.29595857858657837, "learning_rate": 9.314903846153846e-05, "loss": 2.4844, "step": 155 }, { "epoch": 0.01923308089914653, "grad_norm": 0.21725840866565704, "learning_rate": 9.615384615384617e-05, "loss": 1.9969, "step": 160 }, { "epoch": 0.01983411467724486, "grad_norm": 0.250431627035141, "learning_rate": 9.915865384615384e-05, "loss": 2.1469, "step": 165 }, { "epoch": 0.02043514845534319, "grad_norm": 0.22979402542114258, "learning_rate": 0.00010216346153846153, "loss": 2.0891, "step": 170 }, { "epoch": 0.02103618223344152, "grad_norm": 0.29841649532318115, "learning_rate": 0.00010516826923076924, "loss": 2.0891, "step": 175 }, { "epoch": 0.02163721601153985, "grad_norm": 0.3121524155139923, "learning_rate": 0.00010817307692307693, "loss": 2.2938, "step": 180 }, { "epoch": 0.02223824978963818, "grad_norm": 0.25094497203826904, "learning_rate": 0.00011117788461538462, "loss": 2.0672, "step": 185 }, { "epoch": 0.022839283567736506, "grad_norm": 0.32229083776474, "learning_rate": 0.0001141826923076923, "loss": 1.9781, "step": 190 }, { "epoch": 0.023440317345834835, "grad_norm": 0.30247944593429565, "learning_rate": 0.0001171875, "loss": 2.4469, "step": 195 }, { "epoch": 0.024041351123933165, "grad_norm": 0.3992522358894348, "learning_rate": 0.0001201923076923077, "loss": 2.1609, "step": 200 }, { "epoch": 0.024041351123933165, "eval_loss": 2.7308592796325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.262, "eval_samples_per_second": 4.537, "eval_steps_per_second": 1.134, "step": 200 }, { "epoch": 0.024642384902031494, "grad_norm": 0.28425589203834534, "learning_rate": 0.0001231971153846154, "loss": 2.625, "step": 205 }, { "epoch": 0.025243418680129824, "grad_norm": 0.31964734196662903, "learning_rate": 0.00012620192307692308, "loss": 1.9328, "step": 210 }, { "epoch": 0.025844452458228154, "grad_norm": 0.37272173166275024, "learning_rate": 0.00012920673076923078, "loss": 2.1641, "step": 215 }, { "epoch": 0.02644548623632648, "grad_norm": 0.32725071907043457, "learning_rate": 0.00013221153846153846, "loss": 2.225, "step": 220 }, { "epoch": 0.02704652001442481, "grad_norm": 0.25303465127944946, "learning_rate": 0.00013521634615384616, "loss": 2.2375, "step": 225 }, { "epoch": 0.02764755379252314, "grad_norm": 0.4098326861858368, "learning_rate": 0.00013822115384615386, "loss": 2.0203, "step": 230 }, { "epoch": 0.02824858757062147, "grad_norm": 0.3435593545436859, "learning_rate": 0.00014122596153846154, "loss": 2.3016, "step": 235 }, { "epoch": 0.0288496213487198, "grad_norm": 0.4556426703929901, "learning_rate": 0.00014423076923076922, "loss": 2.2969, "step": 240 }, { "epoch": 0.029450655126818128, "grad_norm": 0.39692002534866333, "learning_rate": 0.00014723557692307692, "loss": 2.3031, "step": 245 }, { "epoch": 0.030051688904916458, "grad_norm": 0.31686538457870483, "learning_rate": 0.00015024038461538462, "loss": 2.2984, "step": 250 }, { "epoch": 0.030652722683014784, "grad_norm": 0.30815422534942627, "learning_rate": 0.00015324519230769233, "loss": 2.1734, "step": 255 }, { "epoch": 0.031253756461113114, "grad_norm": 0.3927950859069824, "learning_rate": 0.00015625, "loss": 2.0031, "step": 260 }, { "epoch": 0.03185479023921144, "grad_norm": 0.3010413944721222, "learning_rate": 0.00015925480769230768, "loss": 2.1875, "step": 265 }, { "epoch": 0.03245582401730977, "grad_norm": 0.39929866790771484, "learning_rate": 0.00016225961538461538, "loss": 2.2266, "step": 270 }, { "epoch": 0.0330568577954081, "grad_norm": 0.3709786832332611, "learning_rate": 0.00016526442307692309, "loss": 2.2344, "step": 275 }, { "epoch": 0.03365789157350643, "grad_norm": 0.38551804423332214, "learning_rate": 0.0001682692307692308, "loss": 2.0391, "step": 280 }, { "epoch": 0.03425892535160476, "grad_norm": 0.3497028350830078, "learning_rate": 0.00017127403846153847, "loss": 2.1328, "step": 285 }, { "epoch": 0.03485995912970309, "grad_norm": 0.22066070139408112, "learning_rate": 0.00017427884615384614, "loss": 1.9891, "step": 290 }, { "epoch": 0.03546099290780142, "grad_norm": 0.3861188590526581, "learning_rate": 0.00017728365384615385, "loss": 2.0266, "step": 295 }, { "epoch": 0.03606202668589975, "grad_norm": 0.43038997054100037, "learning_rate": 0.00018028846153846155, "loss": 2.2062, "step": 300 }, { "epoch": 0.03666306046399807, "grad_norm": 0.4089072644710541, "learning_rate": 0.00018329326923076922, "loss": 2.2016, "step": 305 }, { "epoch": 0.0372640942420964, "grad_norm": 0.40281516313552856, "learning_rate": 0.00018629807692307693, "loss": 2.2578, "step": 310 }, { "epoch": 0.03786512802019473, "grad_norm": 0.33316513895988464, "learning_rate": 0.0001893028846153846, "loss": 2.1844, "step": 315 }, { "epoch": 0.03846616179829306, "grad_norm": 0.4020228087902069, "learning_rate": 0.00019230769230769233, "loss": 2.2109, "step": 320 }, { "epoch": 0.03906719557639139, "grad_norm": 0.36403888463974, "learning_rate": 0.0001953125, "loss": 2.0063, "step": 325 }, { "epoch": 0.03966822935448972, "grad_norm": 0.4289080500602722, "learning_rate": 0.0001983173076923077, "loss": 2.1641, "step": 330 }, { "epoch": 0.04026926313258805, "grad_norm": 0.3827407658100128, "learning_rate": 0.0002013221153846154, "loss": 2.4125, "step": 335 }, { "epoch": 0.04087029691068638, "grad_norm": 0.28297996520996094, "learning_rate": 0.00020432692307692307, "loss": 2.2047, "step": 340 }, { "epoch": 0.04147133068878471, "grad_norm": 0.3654349744319916, "learning_rate": 0.0002073317307692308, "loss": 2.0344, "step": 345 }, { "epoch": 0.04207236446688304, "grad_norm": 0.44768983125686646, "learning_rate": 0.00021033653846153847, "loss": 2.0469, "step": 350 }, { "epoch": 0.04267339824498137, "grad_norm": 0.36050865054130554, "learning_rate": 0.00021334134615384615, "loss": 1.8203, "step": 355 }, { "epoch": 0.0432744320230797, "grad_norm": 0.41343504190444946, "learning_rate": 0.00021634615384615385, "loss": 1.9031, "step": 360 }, { "epoch": 0.04387546580117803, "grad_norm": 0.33549779653549194, "learning_rate": 0.00021935096153846153, "loss": 1.9859, "step": 365 }, { "epoch": 0.04447649957927636, "grad_norm": 0.39200559258461, "learning_rate": 0.00022235576923076923, "loss": 2.0672, "step": 370 }, { "epoch": 0.04507753335737468, "grad_norm": 0.5816010236740112, "learning_rate": 0.00022536057692307694, "loss": 2.1625, "step": 375 }, { "epoch": 0.04567856713547301, "grad_norm": 0.4004225432872772, "learning_rate": 0.0002283653846153846, "loss": 2.1297, "step": 380 }, { "epoch": 0.04627960091357134, "grad_norm": 0.3329584300518036, "learning_rate": 0.00023137019230769232, "loss": 1.8969, "step": 385 }, { "epoch": 0.04688063469166967, "grad_norm": 0.3800398111343384, "learning_rate": 0.000234375, "loss": 1.875, "step": 390 }, { "epoch": 0.047481668469768, "grad_norm": 0.5345351696014404, "learning_rate": 0.0002373798076923077, "loss": 2.0641, "step": 395 }, { "epoch": 0.04808270224786633, "grad_norm": 0.31537583470344543, "learning_rate": 0.0002403846153846154, "loss": 2.0828, "step": 400 }, { "epoch": 0.04808270224786633, "eval_loss": 2.657031297683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2197, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 400 }, { "epoch": 0.04868373602596466, "grad_norm": 0.3651765286922455, "learning_rate": 0.00024338942307692307, "loss": 2.2188, "step": 405 }, { "epoch": 0.04928476980406299, "grad_norm": 0.42044126987457275, "learning_rate": 0.0002463942307692308, "loss": 1.9625, "step": 410 }, { "epoch": 0.04988580358216132, "grad_norm": 0.3405047357082367, "learning_rate": 0.00024939903846153845, "loss": 2.1203, "step": 415 }, { "epoch": 0.05048683736025965, "grad_norm": 0.5022028088569641, "learning_rate": 0.00025240384615384616, "loss": 1.7672, "step": 420 }, { "epoch": 0.05108787113835798, "grad_norm": 0.31208300590515137, "learning_rate": 0.00025540865384615386, "loss": 1.9266, "step": 425 }, { "epoch": 0.05168890491645631, "grad_norm": 0.39399516582489014, "learning_rate": 0.00025841346153846156, "loss": 1.8828, "step": 430 }, { "epoch": 0.05228993869455464, "grad_norm": 0.42515093088150024, "learning_rate": 0.0002614182692307692, "loss": 1.7656, "step": 435 }, { "epoch": 0.05289097247265296, "grad_norm": 0.3947089910507202, "learning_rate": 0.0002644230769230769, "loss": 2.0484, "step": 440 }, { "epoch": 0.05349200625075129, "grad_norm": 0.6280628442764282, "learning_rate": 0.0002674278846153846, "loss": 2.1422, "step": 445 }, { "epoch": 0.05409304002884962, "grad_norm": 0.3639807105064392, "learning_rate": 0.0002704326923076923, "loss": 1.9781, "step": 450 }, { "epoch": 0.05469407380694795, "grad_norm": 0.3984295427799225, "learning_rate": 0.0002734375, "loss": 2.2359, "step": 455 }, { "epoch": 0.05529510758504628, "grad_norm": 0.33954715728759766, "learning_rate": 0.00027644230769230773, "loss": 2.3547, "step": 460 }, { "epoch": 0.05589614136314461, "grad_norm": 0.4361511468887329, "learning_rate": 0.0002794471153846154, "loss": 2.0859, "step": 465 }, { "epoch": 0.05649717514124294, "grad_norm": 0.471563458442688, "learning_rate": 0.0002824519230769231, "loss": 2.1703, "step": 470 }, { "epoch": 0.05709820891934127, "grad_norm": 0.2517772614955902, "learning_rate": 0.0002854567307692308, "loss": 2.0281, "step": 475 }, { "epoch": 0.0576992426974396, "grad_norm": 0.3190082907676697, "learning_rate": 0.00028846153846153843, "loss": 2.0234, "step": 480 }, { "epoch": 0.05830027647553793, "grad_norm": 0.37972012162208557, "learning_rate": 0.00029146634615384614, "loss": 2.15, "step": 485 }, { "epoch": 0.058901310253636256, "grad_norm": 0.37980136275291443, "learning_rate": 0.00029447115384615384, "loss": 2.1219, "step": 490 }, { "epoch": 0.059502344031734586, "grad_norm": 0.32648953795433044, "learning_rate": 0.00029747596153846154, "loss": 1.9703, "step": 495 }, { "epoch": 0.060103377809832916, "grad_norm": 0.28836116194725037, "learning_rate": 0.00030048076923076925, "loss": 2.0406, "step": 500 }, { "epoch": 0.060704411587931245, "grad_norm": 0.2953934967517853, "learning_rate": 0.00030348557692307695, "loss": 2.2156, "step": 505 }, { "epoch": 0.06130544536602957, "grad_norm": 0.4778139889240265, "learning_rate": 0.00030649038461538465, "loss": 2.0672, "step": 510 }, { "epoch": 0.0619064791441279, "grad_norm": 0.27339640259742737, "learning_rate": 0.0003094951923076923, "loss": 1.8953, "step": 515 }, { "epoch": 0.06250751292222623, "grad_norm": 0.3127667009830475, "learning_rate": 0.0003125, "loss": 2.0859, "step": 520 }, { "epoch": 0.06310854670032456, "grad_norm": 0.2676738500595093, "learning_rate": 0.0003155048076923077, "loss": 1.9656, "step": 525 }, { "epoch": 0.06370958047842289, "grad_norm": 0.3519584834575653, "learning_rate": 0.00031850961538461536, "loss": 2.0828, "step": 530 }, { "epoch": 0.06431061425652122, "grad_norm": 0.38000714778900146, "learning_rate": 0.00032151442307692306, "loss": 1.8656, "step": 535 }, { "epoch": 0.06491164803461955, "grad_norm": 0.5076779127120972, "learning_rate": 0.00032451923076923077, "loss": 1.8938, "step": 540 }, { "epoch": 0.06551268181271787, "grad_norm": 0.3919801414012909, "learning_rate": 0.00032752403846153847, "loss": 2.1203, "step": 545 }, { "epoch": 0.0661137155908162, "grad_norm": 0.3263305425643921, "learning_rate": 0.00033052884615384617, "loss": 2.0344, "step": 550 }, { "epoch": 0.06671474936891453, "grad_norm": 0.4196506440639496, "learning_rate": 0.0003335336538461539, "loss": 2.1078, "step": 555 }, { "epoch": 0.06731578314701286, "grad_norm": 0.3997637927532196, "learning_rate": 0.0003365384615384616, "loss": 1.8844, "step": 560 }, { "epoch": 0.06791681692511119, "grad_norm": 0.39547184109687805, "learning_rate": 0.00033954326923076923, "loss": 1.9406, "step": 565 }, { "epoch": 0.06851785070320952, "grad_norm": 0.36170271039009094, "learning_rate": 0.00034254807692307693, "loss": 2.2469, "step": 570 }, { "epoch": 0.06911888448130785, "grad_norm": 0.3041069507598877, "learning_rate": 0.00034555288461538463, "loss": 1.7734, "step": 575 }, { "epoch": 0.06971991825940618, "grad_norm": 0.31936579942703247, "learning_rate": 0.0003485576923076923, "loss": 2.1141, "step": 580 }, { "epoch": 0.0703209520375045, "grad_norm": 0.5643404722213745, "learning_rate": 0.0003515625, "loss": 1.9, "step": 585 }, { "epoch": 0.07092198581560284, "grad_norm": 0.43453335762023926, "learning_rate": 0.0003545673076923077, "loss": 1.5203, "step": 590 }, { "epoch": 0.07152301959370116, "grad_norm": 0.28918376564979553, "learning_rate": 0.0003575721153846154, "loss": 1.9859, "step": 595 }, { "epoch": 0.0721240533717995, "grad_norm": 0.4441574215888977, "learning_rate": 0.0003605769230769231, "loss": 1.7422, "step": 600 }, { "epoch": 0.0721240533717995, "eval_loss": 2.598828077316284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2182, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 600 }, { "epoch": 0.07272508714989782, "grad_norm": 0.47054970264434814, "learning_rate": 0.0003635817307692308, "loss": 1.975, "step": 605 }, { "epoch": 0.07332612092799615, "grad_norm": 0.4808221459388733, "learning_rate": 0.00036658653846153845, "loss": 2.0969, "step": 610 }, { "epoch": 0.07392715470609448, "grad_norm": 0.40260159969329834, "learning_rate": 0.00036959134615384615, "loss": 2.0781, "step": 615 }, { "epoch": 0.0745281884841928, "grad_norm": 0.3565881848335266, "learning_rate": 0.00037259615384615386, "loss": 1.8562, "step": 620 }, { "epoch": 0.07512922226229114, "grad_norm": 0.41623455286026, "learning_rate": 0.00037560096153846156, "loss": 1.9688, "step": 625 }, { "epoch": 0.07573025604038947, "grad_norm": 0.442056804895401, "learning_rate": 0.0003786057692307692, "loss": 2.0531, "step": 630 }, { "epoch": 0.0763312898184878, "grad_norm": 0.5474425554275513, "learning_rate": 0.0003816105769230769, "loss": 2.1391, "step": 635 }, { "epoch": 0.07693232359658612, "grad_norm": 0.29002273082733154, "learning_rate": 0.00038461538461538467, "loss": 1.6906, "step": 640 }, { "epoch": 0.07753335737468446, "grad_norm": 0.30469194054603577, "learning_rate": 0.0003876201923076923, "loss": 1.6859, "step": 645 }, { "epoch": 0.07813439115278278, "grad_norm": 0.3932645618915558, "learning_rate": 0.000390625, "loss": 1.8328, "step": 650 }, { "epoch": 0.07873542493088112, "grad_norm": 0.4049251079559326, "learning_rate": 0.0003936298076923077, "loss": 1.8672, "step": 655 }, { "epoch": 0.07933645870897944, "grad_norm": 0.4889291524887085, "learning_rate": 0.0003966346153846154, "loss": 2.0531, "step": 660 }, { "epoch": 0.07993749248707778, "grad_norm": 0.38475117087364197, "learning_rate": 0.0003996394230769231, "loss": 1.8422, "step": 665 }, { "epoch": 0.0805385262651761, "grad_norm": 0.34599217772483826, "learning_rate": 0.0004026442307692308, "loss": 1.8391, "step": 670 }, { "epoch": 0.08113956004327443, "grad_norm": 0.39600178599357605, "learning_rate": 0.00040564903846153843, "loss": 1.8484, "step": 675 }, { "epoch": 0.08174059382137276, "grad_norm": 0.3293285071849823, "learning_rate": 0.00040865384615384613, "loss": 1.6656, "step": 680 }, { "epoch": 0.08234162759947108, "grad_norm": 0.37310031056404114, "learning_rate": 0.00041165865384615384, "loss": 1.9609, "step": 685 }, { "epoch": 0.08294266137756942, "grad_norm": 0.41512343287467957, "learning_rate": 0.0004146634615384616, "loss": 1.9937, "step": 690 }, { "epoch": 0.08354369515566774, "grad_norm": 0.47950249910354614, "learning_rate": 0.00041766826923076924, "loss": 1.9109, "step": 695 }, { "epoch": 0.08414472893376608, "grad_norm": 0.4324653744697571, "learning_rate": 0.00042067307692307695, "loss": 1.9953, "step": 700 }, { "epoch": 0.0847457627118644, "grad_norm": 0.3693973422050476, "learning_rate": 0.00042367788461538465, "loss": 1.9016, "step": 705 }, { "epoch": 0.08534679648996274, "grad_norm": 0.33113107085227966, "learning_rate": 0.0004266826923076923, "loss": 2.2266, "step": 710 }, { "epoch": 0.08594783026806106, "grad_norm": 0.5808571577072144, "learning_rate": 0.0004296875, "loss": 1.5063, "step": 715 }, { "epoch": 0.0865488640461594, "grad_norm": 0.3792312443256378, "learning_rate": 0.0004326923076923077, "loss": 1.8016, "step": 720 }, { "epoch": 0.08714989782425772, "grad_norm": 0.43698450922966003, "learning_rate": 0.00043569711538461535, "loss": 1.7219, "step": 725 }, { "epoch": 0.08775093160235606, "grad_norm": 0.43264222145080566, "learning_rate": 0.00043870192307692306, "loss": 1.7234, "step": 730 }, { "epoch": 0.08835196538045438, "grad_norm": 0.5246540307998657, "learning_rate": 0.0004417067307692308, "loss": 1.7531, "step": 735 }, { "epoch": 0.08895299915855272, "grad_norm": 0.2953200936317444, "learning_rate": 0.00044471153846153846, "loss": 1.9438, "step": 740 }, { "epoch": 0.08955403293665104, "grad_norm": 0.39238616824150085, "learning_rate": 0.00044771634615384617, "loss": 1.7172, "step": 745 }, { "epoch": 0.09015506671474936, "grad_norm": 0.4887576401233673, "learning_rate": 0.00045072115384615387, "loss": 1.9594, "step": 750 }, { "epoch": 0.0907561004928477, "grad_norm": 0.391634076833725, "learning_rate": 0.0004537259615384616, "loss": 1.8406, "step": 755 }, { "epoch": 0.09135713427094602, "grad_norm": 0.4006985127925873, "learning_rate": 0.0004567307692307692, "loss": 1.7984, "step": 760 }, { "epoch": 0.09195816804904436, "grad_norm": 0.3601657748222351, "learning_rate": 0.0004597355769230769, "loss": 1.9844, "step": 765 }, { "epoch": 0.09255920182714268, "grad_norm": 0.5057326555252075, "learning_rate": 0.00046274038461538463, "loss": 1.6703, "step": 770 }, { "epoch": 0.09316023560524102, "grad_norm": 0.5787122845649719, "learning_rate": 0.0004657451923076923, "loss": 1.8984, "step": 775 }, { "epoch": 0.09376126938333934, "grad_norm": 0.4849441945552826, "learning_rate": 0.00046875, "loss": 1.8672, "step": 780 }, { "epoch": 0.09436230316143768, "grad_norm": 0.44167378544807434, "learning_rate": 0.00047175480769230774, "loss": 1.6422, "step": 785 }, { "epoch": 0.094963336939536, "grad_norm": 0.6295076608657837, "learning_rate": 0.0004747596153846154, "loss": 1.6875, "step": 790 }, { "epoch": 0.09556437071763434, "grad_norm": 0.4804101586341858, "learning_rate": 0.0004777644230769231, "loss": 1.8203, "step": 795 }, { "epoch": 0.09616540449573266, "grad_norm": 0.4898495674133301, "learning_rate": 0.0004807692307692308, "loss": 1.9891, "step": 800 }, { "epoch": 0.09616540449573266, "eval_loss": 2.535351514816284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1754, "eval_samples_per_second": 4.549, "eval_steps_per_second": 1.137, "step": 800 }, { "epoch": 0.096766438273831, "grad_norm": 0.43688085675239563, "learning_rate": 0.00048377403846153845, "loss": 1.7234, "step": 805 }, { "epoch": 0.09736747205192932, "grad_norm": 0.5891087651252747, "learning_rate": 0.00048677884615384615, "loss": 1.7969, "step": 810 }, { "epoch": 0.09796850583002764, "grad_norm": 0.5140319466590881, "learning_rate": 0.0004897836538461539, "loss": 2.0719, "step": 815 }, { "epoch": 0.09856953960812598, "grad_norm": 0.40886375308036804, "learning_rate": 0.0004927884615384616, "loss": 2.0891, "step": 820 }, { "epoch": 0.0991705733862243, "grad_norm": 0.3513309955596924, "learning_rate": 0.0004957932692307692, "loss": 1.8453, "step": 825 }, { "epoch": 0.09977160716432264, "grad_norm": 0.5530559420585632, "learning_rate": 0.0004987980769230769, "loss": 1.675, "step": 830 }, { "epoch": 0.10037264094242096, "grad_norm": 0.4348265528678894, "learning_rate": 0.0004999999983630302, "loss": 1.7891, "step": 835 }, { "epoch": 0.1009736747205193, "grad_norm": 0.5396342277526855, "learning_rate": 0.0004999999883593255, "loss": 1.9047, "step": 840 }, { "epoch": 0.10157470849861762, "grad_norm": 0.5154384970664978, "learning_rate": 0.0004999999692613442, "loss": 1.8844, "step": 845 }, { "epoch": 0.10217574227671596, "grad_norm": 0.29072120785713196, "learning_rate": 0.0004999999410690872, "loss": 1.6531, "step": 850 }, { "epoch": 0.10277677605481428, "grad_norm": 0.4125816822052002, "learning_rate": 0.0004999999037825552, "loss": 1.9031, "step": 855 }, { "epoch": 0.10337780983291261, "grad_norm": 0.34915369749069214, "learning_rate": 0.0004999998574017497, "loss": 1.8609, "step": 860 }, { "epoch": 0.10397884361101094, "grad_norm": 0.3622804284095764, "learning_rate": 0.0004999998019266724, "loss": 1.7484, "step": 865 }, { "epoch": 0.10457987738910927, "grad_norm": 0.36787149310112, "learning_rate": 0.0004999997373573254, "loss": 1.7812, "step": 870 }, { "epoch": 0.1051809111672076, "grad_norm": 0.4469545781612396, "learning_rate": 0.0004999996636937108, "loss": 1.5484, "step": 875 }, { "epoch": 0.10578194494530592, "grad_norm": 0.30026400089263916, "learning_rate": 0.0004999995809358316, "loss": 1.6703, "step": 880 }, { "epoch": 0.10638297872340426, "grad_norm": 0.4870736002922058, "learning_rate": 0.0004999994890836904, "loss": 1.7547, "step": 885 }, { "epoch": 0.10698401250150258, "grad_norm": 0.6516287326812744, "learning_rate": 0.000499999388137291, "loss": 1.7891, "step": 890 }, { "epoch": 0.10758504627960092, "grad_norm": 0.2974604368209839, "learning_rate": 0.0004999992780966368, "loss": 1.8359, "step": 895 }, { "epoch": 0.10818608005769924, "grad_norm": 0.3521243929862976, "learning_rate": 0.0004999991589617318, "loss": 1.9141, "step": 900 }, { "epoch": 0.10878711383579757, "grad_norm": 0.38353726267814636, "learning_rate": 0.0004999990307325803, "loss": 1.775, "step": 905 }, { "epoch": 0.1093881476138959, "grad_norm": 0.46048542857170105, "learning_rate": 0.0004999988934091872, "loss": 1.7297, "step": 910 }, { "epoch": 0.10998918139199423, "grad_norm": 0.4313719570636749, "learning_rate": 0.0004999987469915573, "loss": 1.2891, "step": 915 }, { "epoch": 0.11059021517009256, "grad_norm": 0.5933486223220825, "learning_rate": 0.0004999985914796961, "loss": 1.6938, "step": 920 }, { "epoch": 0.1111912489481909, "grad_norm": 0.5271236300468445, "learning_rate": 0.000499998426873609, "loss": 1.8, "step": 925 }, { "epoch": 0.11179228272628922, "grad_norm": 0.3807511031627655, "learning_rate": 0.0004999982531733022, "loss": 1.3086, "step": 930 }, { "epoch": 0.11239331650438755, "grad_norm": 0.4684934914112091, "learning_rate": 0.0004999980703787819, "loss": 1.4875, "step": 935 }, { "epoch": 0.11299435028248588, "grad_norm": 0.5648980140686035, "learning_rate": 0.0004999978784900549, "loss": 1.6578, "step": 940 }, { "epoch": 0.1135953840605842, "grad_norm": 0.4021349549293518, "learning_rate": 0.0004999976775071278, "loss": 1.8266, "step": 945 }, { "epoch": 0.11419641783868253, "grad_norm": 0.3722395598888397, "learning_rate": 0.0004999974674300084, "loss": 1.8969, "step": 950 }, { "epoch": 0.11479745161678086, "grad_norm": 0.407781720161438, "learning_rate": 0.000499997248258704, "loss": 1.6562, "step": 955 }, { "epoch": 0.1153984853948792, "grad_norm": 0.44156748056411743, "learning_rate": 0.0004999970199932229, "loss": 2.0688, "step": 960 }, { "epoch": 0.11599951917297752, "grad_norm": 0.40020808577537537, "learning_rate": 0.000499996782633573, "loss": 1.5047, "step": 965 }, { "epoch": 0.11660055295107585, "grad_norm": 0.38710176944732666, "learning_rate": 0.0004999965361797633, "loss": 1.7367, "step": 970 }, { "epoch": 0.11720158672917418, "grad_norm": 0.344836562871933, "learning_rate": 0.0004999962806318025, "loss": 1.7828, "step": 975 }, { "epoch": 0.11780262050727251, "grad_norm": 0.3811284899711609, "learning_rate": 0.0004999960159897, "loss": 1.7766, "step": 980 }, { "epoch": 0.11840365428537084, "grad_norm": 0.5141933560371399, "learning_rate": 0.0004999957422534654, "loss": 1.75, "step": 985 }, { "epoch": 0.11900468806346917, "grad_norm": 0.37530529499053955, "learning_rate": 0.0004999954594231088, "loss": 2.0922, "step": 990 }, { "epoch": 0.1196057218415675, "grad_norm": 0.41129302978515625, "learning_rate": 0.0004999951674986401, "loss": 1.5781, "step": 995 }, { "epoch": 0.12020675561966583, "grad_norm": 0.3869934380054474, "learning_rate": 0.0004999948664800704, "loss": 1.7422, "step": 1000 }, { "epoch": 0.12020675561966583, "eval_loss": 2.4908204078674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1997, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 1000 }, { "epoch": 0.12080778939776415, "grad_norm": 0.36643335223197937, "learning_rate": 0.0004999945563674105, "loss": 1.6797, "step": 1005 }, { "epoch": 0.12140882317586249, "grad_norm": 0.45910894870758057, "learning_rate": 0.0004999942371606714, "loss": 1.7063, "step": 1010 }, { "epoch": 0.12200985695396081, "grad_norm": 0.350729763507843, "learning_rate": 0.0004999939088598652, "loss": 1.6344, "step": 1015 }, { "epoch": 0.12261089073205914, "grad_norm": 0.46493440866470337, "learning_rate": 0.0004999935714650034, "loss": 1.9641, "step": 1020 }, { "epoch": 0.12321192451015747, "grad_norm": 0.42726650834083557, "learning_rate": 0.0004999932249760984, "loss": 1.7094, "step": 1025 }, { "epoch": 0.1238129582882558, "grad_norm": 0.28014904260635376, "learning_rate": 0.000499992869393163, "loss": 1.8516, "step": 1030 }, { "epoch": 0.12441399206635413, "grad_norm": 0.4522114098072052, "learning_rate": 0.0004999925047162099, "loss": 1.3961, "step": 1035 }, { "epoch": 0.12501502584445245, "grad_norm": 0.46475955843925476, "learning_rate": 0.0004999921309452526, "loss": 1.4062, "step": 1040 }, { "epoch": 0.1256160596225508, "grad_norm": 0.44490954279899597, "learning_rate": 0.0004999917480803044, "loss": 1.6719, "step": 1045 }, { "epoch": 0.12621709340064913, "grad_norm": 0.40904587507247925, "learning_rate": 0.0004999913561213793, "loss": 1.7734, "step": 1050 }, { "epoch": 0.12681812717874744, "grad_norm": 0.36412525177001953, "learning_rate": 0.0004999909550684918, "loss": 1.2594, "step": 1055 }, { "epoch": 0.12741916095684577, "grad_norm": 0.7560976147651672, "learning_rate": 0.0004999905449216563, "loss": 1.6047, "step": 1060 }, { "epoch": 0.1280201947349441, "grad_norm": 0.5383388996124268, "learning_rate": 0.0004999901256808878, "loss": 1.6016, "step": 1065 }, { "epoch": 0.12862122851304245, "grad_norm": 0.5255587100982666, "learning_rate": 0.0004999896973462012, "loss": 1.7828, "step": 1070 }, { "epoch": 0.12922226229114075, "grad_norm": 0.4830612242221832, "learning_rate": 0.0004999892599176127, "loss": 1.8781, "step": 1075 }, { "epoch": 0.1298232960692391, "grad_norm": 0.3687385618686676, "learning_rate": 0.0004999888133951377, "loss": 1.4797, "step": 1080 }, { "epoch": 0.13042432984733743, "grad_norm": 0.3518010675907135, "learning_rate": 0.0004999883577787927, "loss": 1.7234, "step": 1085 }, { "epoch": 0.13102536362543574, "grad_norm": 0.4522668719291687, "learning_rate": 0.0004999878930685943, "loss": 1.675, "step": 1090 }, { "epoch": 0.13162639740353407, "grad_norm": 0.3153088390827179, "learning_rate": 0.0004999874192645592, "loss": 1.7328, "step": 1095 }, { "epoch": 0.1322274311816324, "grad_norm": 0.4520825147628784, "learning_rate": 0.0004999869363667048, "loss": 1.925, "step": 1100 }, { "epoch": 0.13282846495973075, "grad_norm": 0.3040079176425934, "learning_rate": 0.0004999864443750486, "loss": 1.6922, "step": 1105 }, { "epoch": 0.13342949873782906, "grad_norm": 0.5198135375976562, "learning_rate": 0.0004999859432896084, "loss": 1.6562, "step": 1110 }, { "epoch": 0.1340305325159274, "grad_norm": 0.30772989988327026, "learning_rate": 0.0004999854331104028, "loss": 1.8078, "step": 1115 }, { "epoch": 0.13463156629402573, "grad_norm": 0.39027324318885803, "learning_rate": 0.0004999849138374498, "loss": 1.625, "step": 1120 }, { "epoch": 0.13523260007212407, "grad_norm": 0.4438004195690155, "learning_rate": 0.0004999843854707688, "loss": 1.5414, "step": 1125 }, { "epoch": 0.13583363385022237, "grad_norm": 0.4966782033443451, "learning_rate": 0.0004999838480103787, "loss": 1.4836, "step": 1130 }, { "epoch": 0.1364346676283207, "grad_norm": 0.5602577328681946, "learning_rate": 0.0004999833014562992, "loss": 1.3961, "step": 1135 }, { "epoch": 0.13703570140641905, "grad_norm": 0.5276179909706116, "learning_rate": 0.0004999827458085502, "loss": 1.8422, "step": 1140 }, { "epoch": 0.13763673518451738, "grad_norm": 0.4706065058708191, "learning_rate": 0.0004999821810671518, "loss": 1.7109, "step": 1145 }, { "epoch": 0.1382377689626157, "grad_norm": 0.38341307640075684, "learning_rate": 0.0004999816072321245, "loss": 1.8859, "step": 1150 }, { "epoch": 0.13883880274071403, "grad_norm": 0.5754373073577881, "learning_rate": 0.0004999810243034894, "loss": 1.8, "step": 1155 }, { "epoch": 0.13943983651881237, "grad_norm": 0.5003094673156738, "learning_rate": 0.0004999804322812676, "loss": 1.6766, "step": 1160 }, { "epoch": 0.14004087029691067, "grad_norm": 0.31239280104637146, "learning_rate": 0.0004999798311654805, "loss": 1.775, "step": 1165 }, { "epoch": 0.140641904075009, "grad_norm": 0.3998953700065613, "learning_rate": 0.0004999792209561501, "loss": 1.7516, "step": 1170 }, { "epoch": 0.14124293785310735, "grad_norm": 0.3099336624145508, "learning_rate": 0.0004999786016532986, "loss": 1.8422, "step": 1175 }, { "epoch": 0.14184397163120568, "grad_norm": 0.48160257935523987, "learning_rate": 0.0004999779732569485, "loss": 1.6062, "step": 1180 }, { "epoch": 0.142445005409304, "grad_norm": 0.5494711399078369, "learning_rate": 0.0004999773357671227, "loss": 1.5906, "step": 1185 }, { "epoch": 0.14304603918740233, "grad_norm": 0.7721512913703918, "learning_rate": 0.0004999766891838444, "loss": 1.7734, "step": 1190 }, { "epoch": 0.14364707296550067, "grad_norm": 0.5135265588760376, "learning_rate": 0.000499976033507137, "loss": 1.4812, "step": 1195 }, { "epoch": 0.144248106743599, "grad_norm": 0.7913392186164856, "learning_rate": 0.0004999753687370245, "loss": 1.5484, "step": 1200 }, { "epoch": 0.144248106743599, "eval_loss": 2.5082030296325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2256, "eval_samples_per_second": 4.542, "eval_steps_per_second": 1.136, "step": 1200 }, { "epoch": 0.1448491405216973, "grad_norm": 0.6069223880767822, "learning_rate": 0.0004999746948735308, "loss": 1.4484, "step": 1205 }, { "epoch": 0.14545017429979565, "grad_norm": 0.4137849807739258, "learning_rate": 0.0004999740119166809, "loss": 1.6719, "step": 1210 }, { "epoch": 0.14605120807789398, "grad_norm": 0.7047042846679688, "learning_rate": 0.0004999733198664992, "loss": 1.5312, "step": 1215 }, { "epoch": 0.1466522418559923, "grad_norm": 0.5389900207519531, "learning_rate": 0.0004999726187230111, "loss": 1.4297, "step": 1220 }, { "epoch": 0.14725327563409063, "grad_norm": 0.5395992994308472, "learning_rate": 0.0004999719084862421, "loss": 1.6328, "step": 1225 }, { "epoch": 0.14785430941218897, "grad_norm": 0.43566471338272095, "learning_rate": 0.0004999711891562179, "loss": 1.7094, "step": 1230 }, { "epoch": 0.1484553431902873, "grad_norm": 0.3409474194049835, "learning_rate": 0.0004999704607329648, "loss": 1.6656, "step": 1235 }, { "epoch": 0.1490563769683856, "grad_norm": 0.5498088002204895, "learning_rate": 0.0004999697232165092, "loss": 1.6016, "step": 1240 }, { "epoch": 0.14965741074648395, "grad_norm": 0.567551851272583, "learning_rate": 0.000499968976606878, "loss": 1.6828, "step": 1245 }, { "epoch": 0.15025844452458229, "grad_norm": 0.4866923987865448, "learning_rate": 0.0004999682209040983, "loss": 1.6547, "step": 1250 }, { "epoch": 0.15085947830268062, "grad_norm": 0.3780736029148102, "learning_rate": 0.0004999674561081977, "loss": 1.6719, "step": 1255 }, { "epoch": 0.15146051208077893, "grad_norm": 0.3219822347164154, "learning_rate": 0.0004999666822192039, "loss": 1.4195, "step": 1260 }, { "epoch": 0.15206154585887727, "grad_norm": 0.3056913912296295, "learning_rate": 0.0004999658992371451, "loss": 1.7484, "step": 1265 }, { "epoch": 0.1526625796369756, "grad_norm": 0.4860096573829651, "learning_rate": 0.0004999651071620499, "loss": 1.6516, "step": 1270 }, { "epoch": 0.15326361341507394, "grad_norm": 0.4047755002975464, "learning_rate": 0.0004999643059939469, "loss": 1.6984, "step": 1275 }, { "epoch": 0.15386464719317225, "grad_norm": 0.27880361676216125, "learning_rate": 0.0004999634957328652, "loss": 1.8078, "step": 1280 }, { "epoch": 0.15446568097127059, "grad_norm": 0.4087715148925781, "learning_rate": 0.0004999626763788346, "loss": 1.6422, "step": 1285 }, { "epoch": 0.15506671474936892, "grad_norm": 0.556612491607666, "learning_rate": 0.0004999618479318847, "loss": 1.5359, "step": 1290 }, { "epoch": 0.15566774852746723, "grad_norm": 0.5415599346160889, "learning_rate": 0.0004999610103920457, "loss": 1.5641, "step": 1295 }, { "epoch": 0.15626878230556557, "grad_norm": 0.48660141229629517, "learning_rate": 0.0004999601637593479, "loss": 1.5, "step": 1300 }, { "epoch": 0.1568698160836639, "grad_norm": 0.5874481797218323, "learning_rate": 0.0004999593080338224, "loss": 1.3844, "step": 1305 }, { "epoch": 0.15747084986176224, "grad_norm": 0.3727753460407257, "learning_rate": 0.0004999584432155, "loss": 1.8125, "step": 1310 }, { "epoch": 0.15807188363986055, "grad_norm": 0.35395169258117676, "learning_rate": 0.0004999575693044124, "loss": 1.3305, "step": 1315 }, { "epoch": 0.1586729174179589, "grad_norm": 0.7356476783752441, "learning_rate": 0.0004999566863005912, "loss": 1.7078, "step": 1320 }, { "epoch": 0.15927395119605722, "grad_norm": 0.4838991165161133, "learning_rate": 0.0004999557942040687, "loss": 1.5969, "step": 1325 }, { "epoch": 0.15987498497415556, "grad_norm": 0.4504292905330658, "learning_rate": 0.0004999548930148773, "loss": 1.4555, "step": 1330 }, { "epoch": 0.16047601875225387, "grad_norm": 0.5174041390419006, "learning_rate": 0.0004999539827330497, "loss": 1.5266, "step": 1335 }, { "epoch": 0.1610770525303522, "grad_norm": 0.42709511518478394, "learning_rate": 0.000499953063358619, "loss": 1.4344, "step": 1340 }, { "epoch": 0.16167808630845054, "grad_norm": 0.34575751423835754, "learning_rate": 0.0004999521348916189, "loss": 1.5219, "step": 1345 }, { "epoch": 0.16227912008654885, "grad_norm": 0.4409041404724121, "learning_rate": 0.0004999511973320829, "loss": 1.6172, "step": 1350 }, { "epoch": 0.1628801538646472, "grad_norm": 0.37874963879585266, "learning_rate": 0.0004999502506800452, "loss": 1.3156, "step": 1355 }, { "epoch": 0.16348118764274552, "grad_norm": 0.39675143361091614, "learning_rate": 0.0004999492949355401, "loss": 1.7672, "step": 1360 }, { "epoch": 0.16408222142084386, "grad_norm": 0.4887191951274872, "learning_rate": 0.0004999483300986027, "loss": 1.6578, "step": 1365 }, { "epoch": 0.16468325519894217, "grad_norm": 0.5052289366722107, "learning_rate": 0.000499947356169268, "loss": 1.5766, "step": 1370 }, { "epoch": 0.1652842889770405, "grad_norm": 0.3420865833759308, "learning_rate": 0.000499946373147571, "loss": 1.4281, "step": 1375 }, { "epoch": 0.16588532275513884, "grad_norm": 0.6112978458404541, "learning_rate": 0.0004999453810335479, "loss": 1.4234, "step": 1380 }, { "epoch": 0.16648635653323718, "grad_norm": 0.46144208312034607, "learning_rate": 0.0004999443798272348, "loss": 1.4609, "step": 1385 }, { "epoch": 0.1670873903113355, "grad_norm": 0.5132108926773071, "learning_rate": 0.000499943369528668, "loss": 1.5656, "step": 1390 }, { "epoch": 0.16768842408943382, "grad_norm": 0.5717546939849854, "learning_rate": 0.000499942350137884, "loss": 1.3617, "step": 1395 }, { "epoch": 0.16828945786753216, "grad_norm": 0.4766351580619812, "learning_rate": 0.0004999413216549203, "loss": 1.5016, "step": 1400 }, { "epoch": 0.16828945786753216, "eval_loss": 2.3990235328674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.19, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 1400 }, { "epoch": 0.1688904916456305, "grad_norm": 0.43601974844932556, "learning_rate": 0.0004999402840798142, "loss": 1.4156, "step": 1405 }, { "epoch": 0.1694915254237288, "grad_norm": 0.6105599403381348, "learning_rate": 0.0004999392374126034, "loss": 1.7, "step": 1410 }, { "epoch": 0.17009255920182714, "grad_norm": 0.4957026243209839, "learning_rate": 0.0004999381816533259, "loss": 1.7969, "step": 1415 }, { "epoch": 0.17069359297992548, "grad_norm": 0.44195666909217834, "learning_rate": 0.0004999371168020201, "loss": 1.4375, "step": 1420 }, { "epoch": 0.1712946267580238, "grad_norm": 0.45855048298835754, "learning_rate": 0.0004999360428587249, "loss": 1.6141, "step": 1425 }, { "epoch": 0.17189566053612212, "grad_norm": 0.6269901990890503, "learning_rate": 0.0004999349598234792, "loss": 1.3953, "step": 1430 }, { "epoch": 0.17249669431422046, "grad_norm": 0.3805680274963379, "learning_rate": 0.0004999338676963225, "loss": 1.6484, "step": 1435 }, { "epoch": 0.1730977280923188, "grad_norm": 0.6604627966880798, "learning_rate": 0.0004999327664772945, "loss": 1.5969, "step": 1440 }, { "epoch": 0.1736987618704171, "grad_norm": 0.4411623179912567, "learning_rate": 0.0004999316561664353, "loss": 1.2609, "step": 1445 }, { "epoch": 0.17429979564851544, "grad_norm": 0.5301747918128967, "learning_rate": 0.0004999305367637852, "loss": 1.6141, "step": 1450 }, { "epoch": 0.17490082942661378, "grad_norm": 0.5128594040870667, "learning_rate": 0.000499929408269385, "loss": 1.6187, "step": 1455 }, { "epoch": 0.17550186320471212, "grad_norm": 0.596217155456543, "learning_rate": 0.0004999282706832758, "loss": 1.4531, "step": 1460 }, { "epoch": 0.17610289698281043, "grad_norm": 0.45486292243003845, "learning_rate": 0.0004999271240054987, "loss": 1.4012, "step": 1465 }, { "epoch": 0.17670393076090876, "grad_norm": 0.6031058430671692, "learning_rate": 0.0004999259682360957, "loss": 1.6203, "step": 1470 }, { "epoch": 0.1773049645390071, "grad_norm": 0.4107096493244171, "learning_rate": 0.0004999248033751088, "loss": 1.7312, "step": 1475 }, { "epoch": 0.17790599831710543, "grad_norm": 0.46700888872146606, "learning_rate": 0.0004999236294225803, "loss": 1.5234, "step": 1480 }, { "epoch": 0.17850703209520374, "grad_norm": 0.7690737247467041, "learning_rate": 0.000499922446378553, "loss": 1.307, "step": 1485 }, { "epoch": 0.17910806587330208, "grad_norm": 0.5420579314231873, "learning_rate": 0.0004999212542430698, "loss": 1.6562, "step": 1490 }, { "epoch": 0.17970909965140042, "grad_norm": 0.4624311625957489, "learning_rate": 0.0004999200530161742, "loss": 1.3234, "step": 1495 }, { "epoch": 0.18031013342949873, "grad_norm": 0.4610016345977783, "learning_rate": 0.0004999188426979097, "loss": 1.5516, "step": 1500 }, { "epoch": 0.18091116720759706, "grad_norm": 0.5131213068962097, "learning_rate": 0.0004999176232883206, "loss": 1.5867, "step": 1505 }, { "epoch": 0.1815122009856954, "grad_norm": 0.5673689842224121, "learning_rate": 0.0004999163947874511, "loss": 1.5078, "step": 1510 }, { "epoch": 0.18211323476379374, "grad_norm": 0.7008316516876221, "learning_rate": 0.000499915157195346, "loss": 1.5562, "step": 1515 }, { "epoch": 0.18271426854189204, "grad_norm": 0.5652767419815063, "learning_rate": 0.00049991391051205, "loss": 1.4469, "step": 1520 }, { "epoch": 0.18331530231999038, "grad_norm": 0.5506184101104736, "learning_rate": 0.0004999126547376089, "loss": 1.4531, "step": 1525 }, { "epoch": 0.18391633609808872, "grad_norm": 0.5806117057800293, "learning_rate": 0.000499911389872068, "loss": 1.7594, "step": 1530 }, { "epoch": 0.18451736987618705, "grad_norm": 0.5860136151313782, "learning_rate": 0.0004999101159154736, "loss": 1.5562, "step": 1535 }, { "epoch": 0.18511840365428536, "grad_norm": 0.5575783252716064, "learning_rate": 0.000499908832867872, "loss": 1.6391, "step": 1540 }, { "epoch": 0.1857194374323837, "grad_norm": 0.3992920219898224, "learning_rate": 0.0004999075407293096, "loss": 1.3859, "step": 1545 }, { "epoch": 0.18632047121048204, "grad_norm": 0.8294938206672668, "learning_rate": 0.0004999062394998336, "loss": 1.25, "step": 1550 }, { "epoch": 0.18692150498858034, "grad_norm": 0.6560512185096741, "learning_rate": 0.0004999049291794915, "loss": 1.4453, "step": 1555 }, { "epoch": 0.18752253876667868, "grad_norm": 0.5583436489105225, "learning_rate": 0.0004999036097683307, "loss": 1.3969, "step": 1560 }, { "epoch": 0.18812357254477702, "grad_norm": 0.6256234645843506, "learning_rate": 0.0004999022812663993, "loss": 1.518, "step": 1565 }, { "epoch": 0.18872460632287535, "grad_norm": 0.5769176483154297, "learning_rate": 0.0004999009436737457, "loss": 1.6609, "step": 1570 }, { "epoch": 0.18932564010097366, "grad_norm": 0.6486324071884155, "learning_rate": 0.0004998995969904183, "loss": 1.3172, "step": 1575 }, { "epoch": 0.189926673879072, "grad_norm": 0.34935474395751953, "learning_rate": 0.0004998982412164663, "loss": 1.5562, "step": 1580 }, { "epoch": 0.19052770765717034, "grad_norm": 0.5806995630264282, "learning_rate": 0.000499896876351939, "loss": 1.6219, "step": 1585 }, { "epoch": 0.19112874143526867, "grad_norm": 0.6906558275222778, "learning_rate": 0.0004998955023968862, "loss": 1.5172, "step": 1590 }, { "epoch": 0.19172977521336698, "grad_norm": 0.49730750918388367, "learning_rate": 0.0004998941193513575, "loss": 1.6797, "step": 1595 }, { "epoch": 0.19233080899146532, "grad_norm": 0.5871158242225647, "learning_rate": 0.0004998927272154036, "loss": 1.6125, "step": 1600 }, { "epoch": 0.19233080899146532, "eval_loss": 2.360156297683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1647, "eval_samples_per_second": 4.55, "eval_steps_per_second": 1.138, "step": 1600 }, { "epoch": 0.19293184276956366, "grad_norm": 0.3994157910346985, "learning_rate": 0.000499891325989075, "loss": 1.3305, "step": 1605 }, { "epoch": 0.193532876547662, "grad_norm": 0.3497280180454254, "learning_rate": 0.0004998899156724224, "loss": 1.3531, "step": 1610 }, { "epoch": 0.1941339103257603, "grad_norm": 0.4835513234138489, "learning_rate": 0.0004998884962654976, "loss": 1.293, "step": 1615 }, { "epoch": 0.19473494410385864, "grad_norm": 0.4717245101928711, "learning_rate": 0.0004998870677683519, "loss": 1.3742, "step": 1620 }, { "epoch": 0.19533597788195697, "grad_norm": 0.3917827308177948, "learning_rate": 0.0004998856301810373, "loss": 1.5719, "step": 1625 }, { "epoch": 0.19593701166005528, "grad_norm": 0.4725429117679596, "learning_rate": 0.0004998841835036061, "loss": 1.3859, "step": 1630 }, { "epoch": 0.19653804543815362, "grad_norm": 0.4728795289993286, "learning_rate": 0.0004998827277361111, "loss": 1.4203, "step": 1635 }, { "epoch": 0.19713907921625196, "grad_norm": 0.6246328949928284, "learning_rate": 0.000499881262878605, "loss": 1.7719, "step": 1640 }, { "epoch": 0.1977401129943503, "grad_norm": 0.7019891738891602, "learning_rate": 0.0004998797889311413, "loss": 1.3781, "step": 1645 }, { "epoch": 0.1983411467724486, "grad_norm": 0.2940036654472351, "learning_rate": 0.0004998783058937735, "loss": 1.4148, "step": 1650 }, { "epoch": 0.19894218055054694, "grad_norm": 0.434410959482193, "learning_rate": 0.0004998768137665556, "loss": 1.6094, "step": 1655 }, { "epoch": 0.19954321432864527, "grad_norm": 0.5853382349014282, "learning_rate": 0.0004998753125495418, "loss": 1.4125, "step": 1660 }, { "epoch": 0.2001442481067436, "grad_norm": 0.5105974078178406, "learning_rate": 0.0004998738022427867, "loss": 1.3313, "step": 1665 }, { "epoch": 0.20074528188484192, "grad_norm": 0.4266336262226105, "learning_rate": 0.0004998722828463455, "loss": 1.5953, "step": 1670 }, { "epoch": 0.20134631566294026, "grad_norm": 0.4918626844882965, "learning_rate": 0.0004998707543602731, "loss": 1.8383, "step": 1675 }, { "epoch": 0.2019473494410386, "grad_norm": 0.4804850220680237, "learning_rate": 0.0004998692167846253, "loss": 1.1484, "step": 1680 }, { "epoch": 0.20254838321913693, "grad_norm": 0.5131824612617493, "learning_rate": 0.0004998676701194581, "loss": 1.7109, "step": 1685 }, { "epoch": 0.20314941699723524, "grad_norm": 0.4895535111427307, "learning_rate": 0.0004998661143648277, "loss": 1.7453, "step": 1690 }, { "epoch": 0.20375045077533357, "grad_norm": 0.4180288314819336, "learning_rate": 0.0004998645495207906, "loss": 1.0766, "step": 1695 }, { "epoch": 0.2043514845534319, "grad_norm": 0.4888496696949005, "learning_rate": 0.0004998629755874037, "loss": 1.5359, "step": 1700 }, { "epoch": 0.20495251833153022, "grad_norm": 0.666147768497467, "learning_rate": 0.0004998613925647245, "loss": 1.5609, "step": 1705 }, { "epoch": 0.20555355210962856, "grad_norm": 0.563382625579834, "learning_rate": 0.0004998598004528103, "loss": 1.4187, "step": 1710 }, { "epoch": 0.2061545858877269, "grad_norm": 0.619296669960022, "learning_rate": 0.0004998581992517192, "loss": 1.3367, "step": 1715 }, { "epoch": 0.20675561966582523, "grad_norm": 0.928014874458313, "learning_rate": 0.0004998565889615096, "loss": 1.4094, "step": 1720 }, { "epoch": 0.20735665344392354, "grad_norm": 0.4932372272014618, "learning_rate": 0.0004998549695822397, "loss": 1.3719, "step": 1725 }, { "epoch": 0.20795768722202188, "grad_norm": 0.6022034287452698, "learning_rate": 0.0004998533411139685, "loss": 1.5781, "step": 1730 }, { "epoch": 0.2085587210001202, "grad_norm": 0.41716283559799194, "learning_rate": 0.0004998517035567554, "loss": 1.1914, "step": 1735 }, { "epoch": 0.20915975477821855, "grad_norm": 0.4988159239292145, "learning_rate": 0.0004998500569106599, "loss": 1.475, "step": 1740 }, { "epoch": 0.20976078855631686, "grad_norm": 0.4242478907108307, "learning_rate": 0.0004998484011757419, "loss": 1.2859, "step": 1745 }, { "epoch": 0.2103618223344152, "grad_norm": 0.5382992625236511, "learning_rate": 0.0004998467363520617, "loss": 1.3687, "step": 1750 }, { "epoch": 0.21096285611251353, "grad_norm": 0.31303003430366516, "learning_rate": 0.0004998450624396797, "loss": 1.9281, "step": 1755 }, { "epoch": 0.21156388989061184, "grad_norm": 0.5793948173522949, "learning_rate": 0.0004998433794386569, "loss": 1.457, "step": 1760 }, { "epoch": 0.21216492366871018, "grad_norm": 0.48824676871299744, "learning_rate": 0.0004998416873490544, "loss": 1.5359, "step": 1765 }, { "epoch": 0.2127659574468085, "grad_norm": 0.5384695529937744, "learning_rate": 0.000499839986170934, "loss": 1.4461, "step": 1770 }, { "epoch": 0.21336699122490685, "grad_norm": 0.5212387442588806, "learning_rate": 0.0004998382759043574, "loss": 1.2844, "step": 1775 }, { "epoch": 0.21396802500300516, "grad_norm": 0.5552918910980225, "learning_rate": 0.0004998365565493868, "loss": 1.5516, "step": 1780 }, { "epoch": 0.2145690587811035, "grad_norm": 0.5672168135643005, "learning_rate": 0.0004998348281060848, "loss": 1.6297, "step": 1785 }, { "epoch": 0.21517009255920183, "grad_norm": 0.620464026927948, "learning_rate": 0.0004998330905745143, "loss": 1.5047, "step": 1790 }, { "epoch": 0.21577112633730017, "grad_norm": 0.5900077819824219, "learning_rate": 0.0004998313439547384, "loss": 1.2367, "step": 1795 }, { "epoch": 0.21637216011539848, "grad_norm": 0.5305217504501343, "learning_rate": 0.0004998295882468209, "loss": 1.5906, "step": 1800 }, { "epoch": 0.21637216011539848, "eval_loss": 2.288867235183716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2003, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 1800 }, { "epoch": 0.2169731938934968, "grad_norm": 0.5836020112037659, "learning_rate": 0.0004998278234508253, "loss": 1.4891, "step": 1805 }, { "epoch": 0.21757422767159515, "grad_norm": 0.3793884813785553, "learning_rate": 0.0004998260495668161, "loss": 1.3328, "step": 1810 }, { "epoch": 0.21817526144969349, "grad_norm": 0.5394117832183838, "learning_rate": 0.0004998242665948577, "loss": 1.368, "step": 1815 }, { "epoch": 0.2187762952277918, "grad_norm": 0.39613473415374756, "learning_rate": 0.0004998224745350148, "loss": 1.2285, "step": 1820 }, { "epoch": 0.21937732900589013, "grad_norm": 0.543116569519043, "learning_rate": 0.0004998206733873529, "loss": 1.5078, "step": 1825 }, { "epoch": 0.21997836278398847, "grad_norm": 0.4901551306247711, "learning_rate": 0.0004998188631519375, "loss": 1.4516, "step": 1830 }, { "epoch": 0.22057939656208678, "grad_norm": 0.5067916512489319, "learning_rate": 0.0004998170438288342, "loss": 1.5719, "step": 1835 }, { "epoch": 0.2211804303401851, "grad_norm": 0.4343029856681824, "learning_rate": 0.0004998152154181093, "loss": 1.3766, "step": 1840 }, { "epoch": 0.22178146411828345, "grad_norm": 0.5296164155006409, "learning_rate": 0.0004998133779198293, "loss": 1.3625, "step": 1845 }, { "epoch": 0.2223824978963818, "grad_norm": 0.4429774284362793, "learning_rate": 0.0004998115313340611, "loss": 1.3891, "step": 1850 }, { "epoch": 0.2229835316744801, "grad_norm": 0.5772582292556763, "learning_rate": 0.0004998096756608719, "loss": 1.5437, "step": 1855 }, { "epoch": 0.22358456545257843, "grad_norm": 0.5951064825057983, "learning_rate": 0.0004998078109003291, "loss": 1.4672, "step": 1860 }, { "epoch": 0.22418559923067677, "grad_norm": 0.3261686861515045, "learning_rate": 0.0004998059370525006, "loss": 1.5063, "step": 1865 }, { "epoch": 0.2247866330087751, "grad_norm": 0.3098331689834595, "learning_rate": 0.0004998040541174545, "loss": 1.5094, "step": 1870 }, { "epoch": 0.22538766678687341, "grad_norm": 0.8590214252471924, "learning_rate": 0.0004998021620952593, "loss": 1.3977, "step": 1875 }, { "epoch": 0.22598870056497175, "grad_norm": 0.5078855752944946, "learning_rate": 0.0004998002609859839, "loss": 1.2789, "step": 1880 }, { "epoch": 0.2265897343430701, "grad_norm": 0.4515461027622223, "learning_rate": 0.0004997983507896976, "loss": 1.368, "step": 1885 }, { "epoch": 0.2271907681211684, "grad_norm": 0.4937264025211334, "learning_rate": 0.0004997964315064695, "loss": 1.1953, "step": 1890 }, { "epoch": 0.22779180189926673, "grad_norm": 0.6028769612312317, "learning_rate": 0.0004997945031363697, "loss": 1.4859, "step": 1895 }, { "epoch": 0.22839283567736507, "grad_norm": 0.4746128022670746, "learning_rate": 0.0004997925656794683, "loss": 1.6016, "step": 1900 }, { "epoch": 0.2289938694554634, "grad_norm": 0.519091010093689, "learning_rate": 0.0004997906191358358, "loss": 1.3906, "step": 1905 }, { "epoch": 0.22959490323356171, "grad_norm": 0.4584903419017792, "learning_rate": 0.0004997886635055429, "loss": 1.3258, "step": 1910 }, { "epoch": 0.23019593701166005, "grad_norm": 0.7446622252464294, "learning_rate": 0.0004997866987886608, "loss": 1.2141, "step": 1915 }, { "epoch": 0.2307969707897584, "grad_norm": 0.5405495166778564, "learning_rate": 0.0004997847249852609, "loss": 1.4359, "step": 1920 }, { "epoch": 0.23139800456785672, "grad_norm": 0.38187775015830994, "learning_rate": 0.0004997827420954152, "loss": 1.7219, "step": 1925 }, { "epoch": 0.23199903834595503, "grad_norm": 0.503364622592926, "learning_rate": 0.0004997807501191957, "loss": 1.3586, "step": 1930 }, { "epoch": 0.23260007212405337, "grad_norm": 0.43855008482933044, "learning_rate": 0.0004997787490566749, "loss": 1.7625, "step": 1935 }, { "epoch": 0.2332011059021517, "grad_norm": 0.4955185651779175, "learning_rate": 0.0004997767389079255, "loss": 1.2281, "step": 1940 }, { "epoch": 0.23380213968025004, "grad_norm": 0.7726651430130005, "learning_rate": 0.0004997747196730206, "loss": 1.5445, "step": 1945 }, { "epoch": 0.23440317345834835, "grad_norm": 0.38199684023857117, "learning_rate": 0.000499772691352034, "loss": 1.4203, "step": 1950 }, { "epoch": 0.2350042072364467, "grad_norm": 0.4838792383670807, "learning_rate": 0.000499770653945039, "loss": 1.2484, "step": 1955 }, { "epoch": 0.23560524101454502, "grad_norm": 0.43874993920326233, "learning_rate": 0.00049976860745211, "loss": 1.3594, "step": 1960 }, { "epoch": 0.23620627479264333, "grad_norm": 0.4992177188396454, "learning_rate": 0.0004997665518733215, "loss": 1.1977, "step": 1965 }, { "epoch": 0.23680730857074167, "grad_norm": 0.526907742023468, "learning_rate": 0.000499764487208748, "loss": 1.1609, "step": 1970 }, { "epoch": 0.23740834234884, "grad_norm": 0.599902868270874, "learning_rate": 0.000499762413458465, "loss": 1.4203, "step": 1975 }, { "epoch": 0.23800937612693834, "grad_norm": 0.42601317167282104, "learning_rate": 0.0004997603306225475, "loss": 1.1516, "step": 1980 }, { "epoch": 0.23861040990503665, "grad_norm": 0.3787403404712677, "learning_rate": 0.0004997582387010715, "loss": 1.3391, "step": 1985 }, { "epoch": 0.239211443683135, "grad_norm": 0.5586139559745789, "learning_rate": 0.0004997561376941131, "loss": 1.6656, "step": 1990 }, { "epoch": 0.23981247746123333, "grad_norm": 0.44761109352111816, "learning_rate": 0.0004997540276017487, "loss": 1.5828, "step": 1995 }, { "epoch": 0.24041351123933166, "grad_norm": 0.4291538894176483, "learning_rate": 0.000499751908424055, "loss": 1.4539, "step": 2000 }, { "epoch": 0.24041351123933166, "eval_loss": 2.2626953125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2012, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 2000 }, { "epoch": 0.24101454501742997, "grad_norm": 0.46680477261543274, "learning_rate": 0.0004997497801611093, "loss": 1.2609, "step": 2005 }, { "epoch": 0.2416155787955283, "grad_norm": 0.42086416482925415, "learning_rate": 0.0004997476428129887, "loss": 1.1609, "step": 2010 }, { "epoch": 0.24221661257362664, "grad_norm": 0.7524279952049255, "learning_rate": 0.0004997454963797713, "loss": 1.0633, "step": 2015 }, { "epoch": 0.24281764635172498, "grad_norm": 0.43722498416900635, "learning_rate": 0.0004997433408615349, "loss": 1.2969, "step": 2020 }, { "epoch": 0.2434186801298233, "grad_norm": 0.2848932147026062, "learning_rate": 0.0004997411762583581, "loss": 1.2063, "step": 2025 }, { "epoch": 0.24401971390792163, "grad_norm": 0.4349381923675537, "learning_rate": 0.0004997390025703194, "loss": 1.3625, "step": 2030 }, { "epoch": 0.24462074768601996, "grad_norm": 0.4666562080383301, "learning_rate": 0.0004997368197974982, "loss": 1.4164, "step": 2035 }, { "epoch": 0.24522178146411827, "grad_norm": 0.5730391144752502, "learning_rate": 0.0004997346279399736, "loss": 1.1633, "step": 2040 }, { "epoch": 0.2458228152422166, "grad_norm": 0.5395126938819885, "learning_rate": 0.0004997324269978255, "loss": 1.2398, "step": 2045 }, { "epoch": 0.24642384902031494, "grad_norm": 0.3828608989715576, "learning_rate": 0.000499730216971134, "loss": 1.0594, "step": 2050 }, { "epoch": 0.24702488279841328, "grad_norm": 0.796903133392334, "learning_rate": 0.0004997279978599794, "loss": 1.3055, "step": 2055 }, { "epoch": 0.2476259165765116, "grad_norm": 0.35091638565063477, "learning_rate": 0.0004997257696644424, "loss": 1.2023, "step": 2060 }, { "epoch": 0.24822695035460993, "grad_norm": 0.46753543615341187, "learning_rate": 0.000499723532384604, "loss": 1.3203, "step": 2065 }, { "epoch": 0.24882798413270826, "grad_norm": 0.5231248736381531, "learning_rate": 0.0004997212860205459, "loss": 1.3438, "step": 2070 }, { "epoch": 0.2494290179108066, "grad_norm": 0.470639169216156, "learning_rate": 0.0004997190305723495, "loss": 1.3031, "step": 2075 }, { "epoch": 0.2500300516889049, "grad_norm": 0.4669177234172821, "learning_rate": 0.000499716766040097, "loss": 1.5188, "step": 2080 }, { "epoch": 0.25063108546700325, "grad_norm": 0.5113319754600525, "learning_rate": 0.0004997144924238706, "loss": 1.0992, "step": 2085 }, { "epoch": 0.2512321192451016, "grad_norm": 0.5395264625549316, "learning_rate": 0.0004997122097237533, "loss": 1.3281, "step": 2090 }, { "epoch": 0.2518331530231999, "grad_norm": 0.47676244378089905, "learning_rate": 0.0004997099179398279, "loss": 1.2898, "step": 2095 }, { "epoch": 0.25243418680129825, "grad_norm": 0.3385642468929291, "learning_rate": 0.0004997076170721778, "loss": 1.2078, "step": 2100 }, { "epoch": 0.25303522057939654, "grad_norm": 0.3868078887462616, "learning_rate": 0.0004997053071208868, "loss": 1.4563, "step": 2105 }, { "epoch": 0.2536362543574949, "grad_norm": 0.437321275472641, "learning_rate": 0.0004997029880860389, "loss": 1.3977, "step": 2110 }, { "epoch": 0.2542372881355932, "grad_norm": 0.6515981554985046, "learning_rate": 0.0004997006599677183, "loss": 1.2461, "step": 2115 }, { "epoch": 0.25483832191369155, "grad_norm": 0.3654949367046356, "learning_rate": 0.0004996983227660099, "loss": 1.4187, "step": 2120 }, { "epoch": 0.2554393556917899, "grad_norm": 0.5203860998153687, "learning_rate": 0.0004996959764809987, "loss": 1.4328, "step": 2125 }, { "epoch": 0.2560403894698882, "grad_norm": 0.5454062223434448, "learning_rate": 0.00049969362111277, "loss": 1.5125, "step": 2130 }, { "epoch": 0.25664142324798656, "grad_norm": 0.5460174679756165, "learning_rate": 0.0004996912566614094, "loss": 1.4344, "step": 2135 }, { "epoch": 0.2572424570260849, "grad_norm": 0.4798714816570282, "learning_rate": 0.000499688883127003, "loss": 1.1953, "step": 2140 }, { "epoch": 0.2578434908041832, "grad_norm": 0.679547905921936, "learning_rate": 0.0004996865005096372, "loss": 1.2688, "step": 2145 }, { "epoch": 0.2584445245822815, "grad_norm": 0.42334336042404175, "learning_rate": 0.0004996841088093985, "loss": 1.1516, "step": 2150 }, { "epoch": 0.25904555836037985, "grad_norm": 0.4171724021434784, "learning_rate": 0.000499681708026374, "loss": 1.0961, "step": 2155 }, { "epoch": 0.2596465921384782, "grad_norm": 0.6091195940971375, "learning_rate": 0.0004996792981606511, "loss": 1.0164, "step": 2160 }, { "epoch": 0.2602476259165765, "grad_norm": 0.7312507033348083, "learning_rate": 0.0004996768792123173, "loss": 1.3031, "step": 2165 }, { "epoch": 0.26084865969467486, "grad_norm": 0.8120207786560059, "learning_rate": 0.0004996744511814609, "loss": 1.1641, "step": 2170 }, { "epoch": 0.2614496934727732, "grad_norm": 0.4702399969100952, "learning_rate": 0.0004996720140681699, "loss": 1.2805, "step": 2175 }, { "epoch": 0.2620507272508715, "grad_norm": 0.45239925384521484, "learning_rate": 0.0004996695678725331, "loss": 1.5539, "step": 2180 }, { "epoch": 0.2626517610289698, "grad_norm": 0.6370692253112793, "learning_rate": 0.0004996671125946394, "loss": 1.2156, "step": 2185 }, { "epoch": 0.26325279480706815, "grad_norm": 0.6115698218345642, "learning_rate": 0.0004996646482345781, "loss": 1.2891, "step": 2190 }, { "epoch": 0.2638538285851665, "grad_norm": 0.611488401889801, "learning_rate": 0.0004996621747924391, "loss": 1.3023, "step": 2195 }, { "epoch": 0.2644548623632648, "grad_norm": 0.6977550983428955, "learning_rate": 0.0004996596922683122, "loss": 1.3555, "step": 2200 }, { "epoch": 0.2644548623632648, "eval_loss": 2.2613282203674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2156, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 2200 }, { "epoch": 0.26505589614136316, "grad_norm": 0.6270340085029602, "learning_rate": 0.0004996572006622876, "loss": 1.5938, "step": 2205 }, { "epoch": 0.2656569299194615, "grad_norm": 0.5670061707496643, "learning_rate": 0.0004996546999744561, "loss": 1.5016, "step": 2210 }, { "epoch": 0.26625796369755983, "grad_norm": 0.38163918256759644, "learning_rate": 0.0004996521902049086, "loss": 1.2812, "step": 2215 }, { "epoch": 0.2668589974756581, "grad_norm": 0.45828545093536377, "learning_rate": 0.0004996496713537365, "loss": 1.3023, "step": 2220 }, { "epoch": 0.26746003125375645, "grad_norm": 0.4318217933177948, "learning_rate": 0.0004996471434210312, "loss": 1.6039, "step": 2225 }, { "epoch": 0.2680610650318548, "grad_norm": 0.5099067091941833, "learning_rate": 0.0004996446064068848, "loss": 1.5562, "step": 2230 }, { "epoch": 0.2686620988099531, "grad_norm": 0.7253368496894836, "learning_rate": 0.0004996420603113897, "loss": 1.2523, "step": 2235 }, { "epoch": 0.26926313258805146, "grad_norm": 0.6101372838020325, "learning_rate": 0.0004996395051346384, "loss": 1.4125, "step": 2240 }, { "epoch": 0.2698641663661498, "grad_norm": 0.5073166489601135, "learning_rate": 0.0004996369408767238, "loss": 1.1109, "step": 2245 }, { "epoch": 0.27046520014424813, "grad_norm": 0.4978417456150055, "learning_rate": 0.0004996343675377393, "loss": 1.3438, "step": 2250 }, { "epoch": 0.2710662339223464, "grad_norm": 0.695686936378479, "learning_rate": 0.0004996317851177784, "loss": 1.0445, "step": 2255 }, { "epoch": 0.27166726770044475, "grad_norm": 0.5276048183441162, "learning_rate": 0.000499629193616935, "loss": 1.2703, "step": 2260 }, { "epoch": 0.2722683014785431, "grad_norm": 0.7686821222305298, "learning_rate": 0.0004996265930353036, "loss": 1.2656, "step": 2265 }, { "epoch": 0.2728693352566414, "grad_norm": 0.673497200012207, "learning_rate": 0.0004996239833729786, "loss": 1.4055, "step": 2270 }, { "epoch": 0.27347036903473976, "grad_norm": 0.4770069122314453, "learning_rate": 0.000499621364630055, "loss": 1.1227, "step": 2275 }, { "epoch": 0.2740714028128381, "grad_norm": 0.630565881729126, "learning_rate": 0.000499618736806628, "loss": 1.293, "step": 2280 }, { "epoch": 0.27467243659093643, "grad_norm": 0.5288265943527222, "learning_rate": 0.0004996160999027933, "loss": 1.5109, "step": 2285 }, { "epoch": 0.27527347036903477, "grad_norm": 0.35486194491386414, "learning_rate": 0.0004996134539186469, "loss": 1.5078, "step": 2290 }, { "epoch": 0.27587450414713305, "grad_norm": 0.5654587745666504, "learning_rate": 0.0004996107988542847, "loss": 1.625, "step": 2295 }, { "epoch": 0.2764755379252314, "grad_norm": 0.40694040060043335, "learning_rate": 0.0004996081347098037, "loss": 1.4531, "step": 2300 }, { "epoch": 0.2770765717033297, "grad_norm": 0.5765879154205322, "learning_rate": 0.0004996054614853005, "loss": 1.343, "step": 2305 }, { "epoch": 0.27767760548142806, "grad_norm": 0.49710384011268616, "learning_rate": 0.0004996027791808725, "loss": 1.3266, "step": 2310 }, { "epoch": 0.2782786392595264, "grad_norm": 0.5011634826660156, "learning_rate": 0.0004996000877966172, "loss": 1.3438, "step": 2315 }, { "epoch": 0.27887967303762473, "grad_norm": 0.6307665705680847, "learning_rate": 0.0004995973873326326, "loss": 1.5703, "step": 2320 }, { "epoch": 0.27948070681572307, "grad_norm": 0.46662095189094543, "learning_rate": 0.0004995946777890169, "loss": 1.4414, "step": 2325 }, { "epoch": 0.28008174059382135, "grad_norm": 0.49989181756973267, "learning_rate": 0.0004995919591658687, "loss": 1.3789, "step": 2330 }, { "epoch": 0.2806827743719197, "grad_norm": 0.4880094528198242, "learning_rate": 0.0004995892314632867, "loss": 1.2633, "step": 2335 }, { "epoch": 0.281283808150018, "grad_norm": 0.6314132213592529, "learning_rate": 0.0004995864946813703, "loss": 1.5539, "step": 2340 }, { "epoch": 0.28188484192811636, "grad_norm": 0.7073726654052734, "learning_rate": 0.0004995837488202191, "loss": 1.3766, "step": 2345 }, { "epoch": 0.2824858757062147, "grad_norm": 0.5559587478637695, "learning_rate": 0.0004995809938799329, "loss": 1.4875, "step": 2350 }, { "epoch": 0.28308690948431303, "grad_norm": 0.4955267906188965, "learning_rate": 0.0004995782298606119, "loss": 1.3156, "step": 2355 }, { "epoch": 0.28368794326241137, "grad_norm": 0.4989592432975769, "learning_rate": 0.0004995754567623567, "loss": 1.2484, "step": 2360 }, { "epoch": 0.28428897704050965, "grad_norm": 0.5886387228965759, "learning_rate": 0.0004995726745852681, "loss": 1.2344, "step": 2365 }, { "epoch": 0.284890010818608, "grad_norm": 0.5085893273353577, "learning_rate": 0.0004995698833294474, "loss": 1.407, "step": 2370 }, { "epoch": 0.2854910445967063, "grad_norm": 0.4706375002861023, "learning_rate": 0.000499567082994996, "loss": 1.3117, "step": 2375 }, { "epoch": 0.28609207837480466, "grad_norm": 0.5287367701530457, "learning_rate": 0.000499564273582016, "loss": 1.2594, "step": 2380 }, { "epoch": 0.286693112152903, "grad_norm": 0.5483081936836243, "learning_rate": 0.0004995614550906093, "loss": 1.1008, "step": 2385 }, { "epoch": 0.28729414593100133, "grad_norm": 0.8154200911521912, "learning_rate": 0.0004995586275208788, "loss": 1.5164, "step": 2390 }, { "epoch": 0.28789517970909967, "grad_norm": 0.837818443775177, "learning_rate": 0.000499555790872927, "loss": 1.2531, "step": 2395 }, { "epoch": 0.288496213487198, "grad_norm": 0.4989728033542633, "learning_rate": 0.0004995529451468574, "loss": 1.3719, "step": 2400 }, { "epoch": 0.288496213487198, "eval_loss": 2.189453125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.202, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 2400 }, { "epoch": 0.2890972472652963, "grad_norm": 0.5615506768226624, "learning_rate": 0.0004995500903427732, "loss": 1.1023, "step": 2405 }, { "epoch": 0.2896982810433946, "grad_norm": 0.7758134007453918, "learning_rate": 0.0004995472264607784, "loss": 1.2625, "step": 2410 }, { "epoch": 0.29029931482149296, "grad_norm": 0.6751444935798645, "learning_rate": 0.0004995443535009773, "loss": 1.5734, "step": 2415 }, { "epoch": 0.2909003485995913, "grad_norm": 0.5839786529541016, "learning_rate": 0.0004995414714634743, "loss": 1.3625, "step": 2420 }, { "epoch": 0.29150138237768963, "grad_norm": 0.5906524062156677, "learning_rate": 0.0004995385803483742, "loss": 1.0875, "step": 2425 }, { "epoch": 0.29210241615578797, "grad_norm": 0.7597156763076782, "learning_rate": 0.0004995356801557821, "loss": 1.4781, "step": 2430 }, { "epoch": 0.2927034499338863, "grad_norm": 0.5112520456314087, "learning_rate": 0.0004995327708858038, "loss": 1.2758, "step": 2435 }, { "epoch": 0.2933044837119846, "grad_norm": 0.44212523102760315, "learning_rate": 0.0004995298525385447, "loss": 1.5094, "step": 2440 }, { "epoch": 0.2939055174900829, "grad_norm": 0.43641284108161926, "learning_rate": 0.0004995269251141114, "loss": 1.1656, "step": 2445 }, { "epoch": 0.29450655126818126, "grad_norm": 0.4382478892803192, "learning_rate": 0.0004995239886126102, "loss": 1.5023, "step": 2450 }, { "epoch": 0.2951075850462796, "grad_norm": 0.6196442246437073, "learning_rate": 0.0004995210430341478, "loss": 1.2875, "step": 2455 }, { "epoch": 0.29570861882437793, "grad_norm": 0.6048389673233032, "learning_rate": 0.0004995180883788316, "loss": 0.9516, "step": 2460 }, { "epoch": 0.29630965260247627, "grad_norm": 0.5682608485221863, "learning_rate": 0.0004995151246467689, "loss": 1.3422, "step": 2465 }, { "epoch": 0.2969106863805746, "grad_norm": 0.5677405595779419, "learning_rate": 0.0004995121518380674, "loss": 1.5016, "step": 2470 }, { "epoch": 0.29751172015867294, "grad_norm": 0.48005715012550354, "learning_rate": 0.0004995091699528355, "loss": 1.3219, "step": 2475 }, { "epoch": 0.2981127539367712, "grad_norm": 0.48294246196746826, "learning_rate": 0.0004995061789911817, "loss": 1.2516, "step": 2480 }, { "epoch": 0.29871378771486956, "grad_norm": 0.7167287468910217, "learning_rate": 0.0004995031789532147, "loss": 1.3531, "step": 2485 }, { "epoch": 0.2993148214929679, "grad_norm": 0.5675193667411804, "learning_rate": 0.0004995001698390434, "loss": 1.3648, "step": 2490 }, { "epoch": 0.29991585527106623, "grad_norm": 0.5264390707015991, "learning_rate": 0.0004994971516487775, "loss": 1.1133, "step": 2495 }, { "epoch": 0.30051688904916457, "grad_norm": 0.5506901144981384, "learning_rate": 0.0004994941243825269, "loss": 1.1594, "step": 2500 }, { "epoch": 0.3011179228272629, "grad_norm": 0.9272066950798035, "learning_rate": 0.0004994910880404015, "loss": 1.4906, "step": 2505 }, { "epoch": 0.30171895660536124, "grad_norm": 0.5853176712989807, "learning_rate": 0.0004994880426225119, "loss": 1.3508, "step": 2510 }, { "epoch": 0.3023199903834595, "grad_norm": 0.4796172082424164, "learning_rate": 0.0004994849881289687, "loss": 1.3484, "step": 2515 }, { "epoch": 0.30292102416155786, "grad_norm": 0.6331420540809631, "learning_rate": 0.0004994819245598833, "loss": 1.2188, "step": 2520 }, { "epoch": 0.3035220579396562, "grad_norm": 0.6519079208374023, "learning_rate": 0.000499478851915367, "loss": 1.3531, "step": 2525 }, { "epoch": 0.30412309171775453, "grad_norm": 0.6366649866104126, "learning_rate": 0.0004994757701955314, "loss": 1.1703, "step": 2530 }, { "epoch": 0.30472412549585287, "grad_norm": 0.5621868371963501, "learning_rate": 0.0004994726794004888, "loss": 1.0441, "step": 2535 }, { "epoch": 0.3053251592739512, "grad_norm": 0.6726334095001221, "learning_rate": 0.0004994695795303517, "loss": 1.2984, "step": 2540 }, { "epoch": 0.30592619305204954, "grad_norm": 0.5448851585388184, "learning_rate": 0.0004994664705852326, "loss": 0.8781, "step": 2545 }, { "epoch": 0.3065272268301479, "grad_norm": 0.6853761076927185, "learning_rate": 0.0004994633525652448, "loss": 1.6891, "step": 2550 }, { "epoch": 0.30712826060824616, "grad_norm": 0.5627267956733704, "learning_rate": 0.0004994602254705017, "loss": 1.368, "step": 2555 }, { "epoch": 0.3077292943863445, "grad_norm": 0.38999640941619873, "learning_rate": 0.0004994570893011171, "loss": 1.3789, "step": 2560 }, { "epoch": 0.30833032816444284, "grad_norm": 0.6671114563941956, "learning_rate": 0.000499453944057205, "loss": 1.4078, "step": 2565 }, { "epoch": 0.30893136194254117, "grad_norm": 0.5521063208580017, "learning_rate": 0.0004994507897388798, "loss": 1.5859, "step": 2570 }, { "epoch": 0.3095323957206395, "grad_norm": 0.6885313391685486, "learning_rate": 0.0004994476263462563, "loss": 1.2578, "step": 2575 }, { "epoch": 0.31013342949873784, "grad_norm": 0.45498156547546387, "learning_rate": 0.0004994444538794495, "loss": 1.3914, "step": 2580 }, { "epoch": 0.3107344632768362, "grad_norm": 0.5482655167579651, "learning_rate": 0.0004994412723385749, "loss": 1.3391, "step": 2585 }, { "epoch": 0.31133549705493446, "grad_norm": 0.5240392684936523, "learning_rate": 0.0004994380817237482, "loss": 1.25, "step": 2590 }, { "epoch": 0.3119365308330328, "grad_norm": 0.5129856467247009, "learning_rate": 0.0004994348820350854, "loss": 1.4406, "step": 2595 }, { "epoch": 0.31253756461113114, "grad_norm": 0.5252668261528015, "learning_rate": 0.000499431673272703, "loss": 1.0773, "step": 2600 }, { "epoch": 0.31253756461113114, "eval_loss": 2.1500000953674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1975, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 2600 }, { "epoch": 0.31313859838922947, "grad_norm": 0.6648097634315491, "learning_rate": 0.0004994284554367176, "loss": 1.1133, "step": 2605 }, { "epoch": 0.3137396321673278, "grad_norm": 0.6218547224998474, "learning_rate": 0.0004994252285272465, "loss": 1.2937, "step": 2610 }, { "epoch": 0.31434066594542615, "grad_norm": 0.6880519390106201, "learning_rate": 0.0004994219925444068, "loss": 1.8039, "step": 2615 }, { "epoch": 0.3149416997235245, "grad_norm": 0.6464706063270569, "learning_rate": 0.0004994187474883164, "loss": 1.5594, "step": 2620 }, { "epoch": 0.3155427335016228, "grad_norm": 0.7200093865394592, "learning_rate": 0.0004994154933590932, "loss": 1.0945, "step": 2625 }, { "epoch": 0.3161437672797211, "grad_norm": 0.6853864789009094, "learning_rate": 0.0004994122301568557, "loss": 1.268, "step": 2630 }, { "epoch": 0.31674480105781944, "grad_norm": 0.5081961750984192, "learning_rate": 0.0004994089578817226, "loss": 1.4062, "step": 2635 }, { "epoch": 0.3173458348359178, "grad_norm": 0.4750553071498871, "learning_rate": 0.0004994056765338129, "loss": 1.2828, "step": 2640 }, { "epoch": 0.3179468686140161, "grad_norm": 0.5867997407913208, "learning_rate": 0.0004994023861132459, "loss": 1.2484, "step": 2645 }, { "epoch": 0.31854790239211445, "grad_norm": 0.7348740696907043, "learning_rate": 0.0004993990866201414, "loss": 1.2258, "step": 2650 }, { "epoch": 0.3191489361702128, "grad_norm": 0.5523998141288757, "learning_rate": 0.0004993957780546193, "loss": 1.2805, "step": 2655 }, { "epoch": 0.3197499699483111, "grad_norm": 0.5308116674423218, "learning_rate": 0.0004993924604168001, "loss": 1.3188, "step": 2660 }, { "epoch": 0.3203510037264094, "grad_norm": 0.40592867136001587, "learning_rate": 0.0004993891337068046, "loss": 1.2148, "step": 2665 }, { "epoch": 0.32095203750450774, "grad_norm": 0.6522583365440369, "learning_rate": 0.0004993857979247535, "loss": 1.175, "step": 2670 }, { "epoch": 0.3215530712826061, "grad_norm": 0.5981694459915161, "learning_rate": 0.0004993824530707682, "loss": 1.143, "step": 2675 }, { "epoch": 0.3221541050607044, "grad_norm": 0.6832042932510376, "learning_rate": 0.0004993790991449707, "loss": 1.2242, "step": 2680 }, { "epoch": 0.32275513883880275, "grad_norm": 0.6935708522796631, "learning_rate": 0.0004993757361474825, "loss": 0.9617, "step": 2685 }, { "epoch": 0.3233561726169011, "grad_norm": 0.5491186380386353, "learning_rate": 0.0004993723640784265, "loss": 1.3672, "step": 2690 }, { "epoch": 0.3239572063949994, "grad_norm": 0.4743538498878479, "learning_rate": 0.0004993689829379249, "loss": 1.1547, "step": 2695 }, { "epoch": 0.3245582401730977, "grad_norm": 0.641859769821167, "learning_rate": 0.0004993655927261008, "loss": 1.4078, "step": 2700 }, { "epoch": 0.32515927395119604, "grad_norm": 0.5002933144569397, "learning_rate": 0.0004993621934430778, "loss": 0.9492, "step": 2705 }, { "epoch": 0.3257603077292944, "grad_norm": 0.7241799831390381, "learning_rate": 0.0004993587850889793, "loss": 1.575, "step": 2710 }, { "epoch": 0.3263613415073927, "grad_norm": 0.5693483948707581, "learning_rate": 0.0004993553676639292, "loss": 0.9961, "step": 2715 }, { "epoch": 0.32696237528549105, "grad_norm": 0.43130815029144287, "learning_rate": 0.000499351941168052, "loss": 1.2984, "step": 2720 }, { "epoch": 0.3275634090635894, "grad_norm": 0.5054978728294373, "learning_rate": 0.0004993485056014724, "loss": 1.1375, "step": 2725 }, { "epoch": 0.3281644428416877, "grad_norm": 0.5581235289573669, "learning_rate": 0.0004993450609643152, "loss": 1.0164, "step": 2730 }, { "epoch": 0.32876547661978606, "grad_norm": 0.6733124256134033, "learning_rate": 0.0004993416072567059, "loss": 1.4078, "step": 2735 }, { "epoch": 0.32936651039788434, "grad_norm": 0.5003538727760315, "learning_rate": 0.0004993381444787699, "loss": 0.8742, "step": 2740 }, { "epoch": 0.3299675441759827, "grad_norm": 0.6292559504508972, "learning_rate": 0.0004993346726306333, "loss": 1.007, "step": 2745 }, { "epoch": 0.330568577954081, "grad_norm": 0.6760239005088806, "learning_rate": 0.0004993311917124224, "loss": 1.25, "step": 2750 }, { "epoch": 0.33116961173217935, "grad_norm": 0.6075654625892639, "learning_rate": 0.0004993277017242638, "loss": 1.5766, "step": 2755 }, { "epoch": 0.3317706455102777, "grad_norm": 0.5432557463645935, "learning_rate": 0.0004993242026662846, "loss": 1.0883, "step": 2760 }, { "epoch": 0.332371679288376, "grad_norm": 0.6972253918647766, "learning_rate": 0.0004993206945386118, "loss": 0.9992, "step": 2765 }, { "epoch": 0.33297271306647436, "grad_norm": 0.45837146043777466, "learning_rate": 0.0004993171773413731, "loss": 1.6766, "step": 2770 }, { "epoch": 0.33357374684457264, "grad_norm": 0.5207621455192566, "learning_rate": 0.0004993136510746966, "loss": 1.2578, "step": 2775 }, { "epoch": 0.334174780622671, "grad_norm": 0.7034028768539429, "learning_rate": 0.0004993101157387106, "loss": 1.3578, "step": 2780 }, { "epoch": 0.3347758144007693, "grad_norm": 0.544851541519165, "learning_rate": 0.0004993065713335434, "loss": 1.2836, "step": 2785 }, { "epoch": 0.33537684817886765, "grad_norm": 0.705143928527832, "learning_rate": 0.0004993030178593241, "loss": 1.4453, "step": 2790 }, { "epoch": 0.335977881956966, "grad_norm": 0.6619438529014587, "learning_rate": 0.0004992994553161823, "loss": 1.0547, "step": 2795 }, { "epoch": 0.3365789157350643, "grad_norm": 0.5982903242111206, "learning_rate": 0.000499295883704247, "loss": 1.4281, "step": 2800 }, { "epoch": 0.3365789157350643, "eval_loss": 2.189453125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1965, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 2800 }, { "epoch": 0.33717994951316266, "grad_norm": 0.589056670665741, "learning_rate": 0.0004992923030236485, "loss": 1.3727, "step": 2805 }, { "epoch": 0.337780983291261, "grad_norm": 0.39378607273101807, "learning_rate": 0.000499288713274517, "loss": 1.1789, "step": 2810 }, { "epoch": 0.3383820170693593, "grad_norm": 0.5460519790649414, "learning_rate": 0.000499285114456983, "loss": 1.1719, "step": 2815 }, { "epoch": 0.3389830508474576, "grad_norm": 0.4953864812850952, "learning_rate": 0.0004992815065711774, "loss": 1.1672, "step": 2820 }, { "epoch": 0.33958408462555595, "grad_norm": 0.5705846548080444, "learning_rate": 0.0004992778896172317, "loss": 1.5328, "step": 2825 }, { "epoch": 0.3401851184036543, "grad_norm": 0.5687447190284729, "learning_rate": 0.0004992742635952771, "loss": 1.0063, "step": 2830 }, { "epoch": 0.3407861521817526, "grad_norm": 0.516343891620636, "learning_rate": 0.0004992706285054458, "loss": 1.1492, "step": 2835 }, { "epoch": 0.34138718595985096, "grad_norm": 0.6128392815589905, "learning_rate": 0.0004992669843478699, "loss": 1.325, "step": 2840 }, { "epoch": 0.3419882197379493, "grad_norm": 0.5104270577430725, "learning_rate": 0.000499263331122682, "loss": 1.0195, "step": 2845 }, { "epoch": 0.3425892535160476, "grad_norm": 0.38332250714302063, "learning_rate": 0.0004992596688300149, "loss": 1.302, "step": 2850 }, { "epoch": 0.3431902872941459, "grad_norm": 0.5674039125442505, "learning_rate": 0.000499255997470002, "loss": 1.2094, "step": 2855 }, { "epoch": 0.34379132107224425, "grad_norm": 0.7987366914749146, "learning_rate": 0.0004992523170427766, "loss": 1.2047, "step": 2860 }, { "epoch": 0.3443923548503426, "grad_norm": 0.45501282811164856, "learning_rate": 0.0004992486275484729, "loss": 1.1539, "step": 2865 }, { "epoch": 0.3449933886284409, "grad_norm": 0.5390669703483582, "learning_rate": 0.0004992449289872249, "loss": 1.2102, "step": 2870 }, { "epoch": 0.34559442240653926, "grad_norm": 0.6710581183433533, "learning_rate": 0.0004992412213591672, "loss": 1.4297, "step": 2875 }, { "epoch": 0.3461954561846376, "grad_norm": 0.6371570825576782, "learning_rate": 0.0004992375046644347, "loss": 1.0164, "step": 2880 }, { "epoch": 0.34679648996273593, "grad_norm": 0.49934741854667664, "learning_rate": 0.0004992337789031625, "loss": 1.1313, "step": 2885 }, { "epoch": 0.3473975237408342, "grad_norm": 0.41756120324134827, "learning_rate": 0.0004992300440754862, "loss": 1.1969, "step": 2890 }, { "epoch": 0.34799855751893255, "grad_norm": 0.8102174997329712, "learning_rate": 0.0004992263001815418, "loss": 1.2719, "step": 2895 }, { "epoch": 0.3485995912970309, "grad_norm": 0.45573851466178894, "learning_rate": 0.0004992225472214653, "loss": 1.1375, "step": 2900 }, { "epoch": 0.3492006250751292, "grad_norm": 0.5512142777442932, "learning_rate": 0.0004992187851953932, "loss": 1.4781, "step": 2905 }, { "epoch": 0.34980165885322756, "grad_norm": 0.6429489850997925, "learning_rate": 0.0004992150141034624, "loss": 1.3453, "step": 2910 }, { "epoch": 0.3504026926313259, "grad_norm": 0.6230481266975403, "learning_rate": 0.0004992112339458103, "loss": 1.2766, "step": 2915 }, { "epoch": 0.35100372640942423, "grad_norm": 0.6134311556816101, "learning_rate": 0.0004992074447225741, "loss": 1.0664, "step": 2920 }, { "epoch": 0.3516047601875225, "grad_norm": 0.5894711017608643, "learning_rate": 0.0004992036464338918, "loss": 0.9, "step": 2925 }, { "epoch": 0.35220579396562085, "grad_norm": 0.525947630405426, "learning_rate": 0.0004991998390799016, "loss": 1.4844, "step": 2930 }, { "epoch": 0.3528068277437192, "grad_norm": 0.5282221436500549, "learning_rate": 0.0004991960226607418, "loss": 1.0398, "step": 2935 }, { "epoch": 0.3534078615218175, "grad_norm": 0.5808281302452087, "learning_rate": 0.0004991921971765514, "loss": 0.943, "step": 2940 }, { "epoch": 0.35400889529991586, "grad_norm": 0.6163946390151978, "learning_rate": 0.0004991883626274696, "loss": 1.1086, "step": 2945 }, { "epoch": 0.3546099290780142, "grad_norm": 0.4154224991798401, "learning_rate": 0.0004991845190136357, "loss": 1.2703, "step": 2950 }, { "epoch": 0.35521096285611253, "grad_norm": 0.4030189514160156, "learning_rate": 0.0004991806663351897, "loss": 1.1086, "step": 2955 }, { "epoch": 0.35581199663421087, "grad_norm": 0.5927292704582214, "learning_rate": 0.0004991768045922718, "loss": 0.9758, "step": 2960 }, { "epoch": 0.35641303041230915, "grad_norm": 0.476971834897995, "learning_rate": 0.0004991729337850223, "loss": 1.525, "step": 2965 }, { "epoch": 0.3570140641904075, "grad_norm": 0.5584660768508911, "learning_rate": 0.000499169053913582, "loss": 1.2125, "step": 2970 }, { "epoch": 0.3576150979685058, "grad_norm": 0.5617804527282715, "learning_rate": 0.0004991651649780922, "loss": 1.3102, "step": 2975 }, { "epoch": 0.35821613174660416, "grad_norm": 0.3463181257247925, "learning_rate": 0.0004991612669786942, "loss": 1.4227, "step": 2980 }, { "epoch": 0.3588171655247025, "grad_norm": 0.5156741142272949, "learning_rate": 0.0004991573599155299, "loss": 1.3828, "step": 2985 }, { "epoch": 0.35941819930280083, "grad_norm": 0.6080055832862854, "learning_rate": 0.0004991534437887414, "loss": 1.0102, "step": 2990 }, { "epoch": 0.36001923308089917, "grad_norm": 0.5142369866371155, "learning_rate": 0.0004991495185984711, "loss": 1.3469, "step": 2995 }, { "epoch": 0.36062026685899745, "grad_norm": 0.4148232638835907, "learning_rate": 0.000499145584344862, "loss": 1.3102, "step": 3000 }, { "epoch": 0.36062026685899745, "eval_loss": 2.1435546875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2124, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 3000 }, { "epoch": 0.3612213006370958, "grad_norm": 0.43302541971206665, "learning_rate": 0.000499141641028057, "loss": 1.1625, "step": 3005 }, { "epoch": 0.3618223344151941, "grad_norm": 0.5702094435691833, "learning_rate": 0.0004991376886481996, "loss": 1.4937, "step": 3010 }, { "epoch": 0.36242336819329246, "grad_norm": 0.602148175239563, "learning_rate": 0.0004991337272054336, "loss": 1.4453, "step": 3015 }, { "epoch": 0.3630244019713908, "grad_norm": 0.7094736099243164, "learning_rate": 0.0004991297566999031, "loss": 1.1242, "step": 3020 }, { "epoch": 0.36362543574948913, "grad_norm": 1.0491294860839844, "learning_rate": 0.0004991257771317525, "loss": 1.2945, "step": 3025 }, { "epoch": 0.36422646952758747, "grad_norm": 0.5383880734443665, "learning_rate": 0.0004991217885011266, "loss": 1.4398, "step": 3030 }, { "epoch": 0.36482750330568575, "grad_norm": 0.5303685665130615, "learning_rate": 0.0004991177908081706, "loss": 1.357, "step": 3035 }, { "epoch": 0.3654285370837841, "grad_norm": 0.5658953785896301, "learning_rate": 0.0004991137840530297, "loss": 1.625, "step": 3040 }, { "epoch": 0.3660295708618824, "grad_norm": 0.6413328051567078, "learning_rate": 0.0004991097682358498, "loss": 1.0664, "step": 3045 }, { "epoch": 0.36663060463998076, "grad_norm": 0.40573734045028687, "learning_rate": 0.000499105743356777, "loss": 1.4219, "step": 3050 }, { "epoch": 0.3672316384180791, "grad_norm": 0.4775443375110626, "learning_rate": 0.0004991017094159576, "loss": 1.1383, "step": 3055 }, { "epoch": 0.36783267219617743, "grad_norm": 0.4478199779987335, "learning_rate": 0.0004990976664135384, "loss": 1.1906, "step": 3060 }, { "epoch": 0.36843370597427577, "grad_norm": 0.5353224277496338, "learning_rate": 0.0004990936143496664, "loss": 1.0695, "step": 3065 }, { "epoch": 0.3690347397523741, "grad_norm": 0.6140496730804443, "learning_rate": 0.0004990895532244893, "loss": 1.45, "step": 3070 }, { "epoch": 0.3696357735304724, "grad_norm": 0.674697995185852, "learning_rate": 0.0004990854830381545, "loss": 1.2617, "step": 3075 }, { "epoch": 0.3702368073085707, "grad_norm": 0.793539822101593, "learning_rate": 0.0004990814037908102, "loss": 1.0797, "step": 3080 }, { "epoch": 0.37083784108666906, "grad_norm": 0.4356972873210907, "learning_rate": 0.0004990773154826048, "loss": 1.257, "step": 3085 }, { "epoch": 0.3714388748647674, "grad_norm": 0.4007517099380493, "learning_rate": 0.000499073218113687, "loss": 1.6695, "step": 3090 }, { "epoch": 0.37203990864286574, "grad_norm": 0.7213647961616516, "learning_rate": 0.0004990691116842058, "loss": 1.1039, "step": 3095 }, { "epoch": 0.37264094242096407, "grad_norm": 0.5188817977905273, "learning_rate": 0.0004990649961943105, "loss": 1.3844, "step": 3100 }, { "epoch": 0.3732419761990624, "grad_norm": 0.5494900345802307, "learning_rate": 0.0004990608716441511, "loss": 1.3094, "step": 3105 }, { "epoch": 0.3738430099771607, "grad_norm": 0.6364961266517639, "learning_rate": 0.0004990567380338774, "loss": 1.432, "step": 3110 }, { "epoch": 0.374444043755259, "grad_norm": 0.7705929279327393, "learning_rate": 0.0004990525953636399, "loss": 1.3594, "step": 3115 }, { "epoch": 0.37504507753335736, "grad_norm": 0.6256239414215088, "learning_rate": 0.0004990484436335892, "loss": 1.4969, "step": 3120 }, { "epoch": 0.3756461113114557, "grad_norm": 0.8040322661399841, "learning_rate": 0.0004990442828438764, "loss": 1.3664, "step": 3125 }, { "epoch": 0.37624714508955404, "grad_norm": 0.7349644303321838, "learning_rate": 0.0004990401129946528, "loss": 1.3016, "step": 3130 }, { "epoch": 0.3768481788676524, "grad_norm": 0.7406263947486877, "learning_rate": 0.0004990359340860701, "loss": 1.2672, "step": 3135 }, { "epoch": 0.3774492126457507, "grad_norm": 0.6488270163536072, "learning_rate": 0.0004990317461182803, "loss": 1.5125, "step": 3140 }, { "epoch": 0.37805024642384905, "grad_norm": 0.450980544090271, "learning_rate": 0.0004990275490914358, "loss": 0.9531, "step": 3145 }, { "epoch": 0.3786512802019473, "grad_norm": 0.5992457866668701, "learning_rate": 0.0004990233430056892, "loss": 1.2563, "step": 3150 }, { "epoch": 0.37925231398004566, "grad_norm": 0.4323638677597046, "learning_rate": 0.0004990191278611936, "loss": 1.1438, "step": 3155 }, { "epoch": 0.379853347758144, "grad_norm": 0.9859302639961243, "learning_rate": 0.0004990149036581023, "loss": 1.2555, "step": 3160 }, { "epoch": 0.38045438153624234, "grad_norm": 0.8136280179023743, "learning_rate": 0.0004990106703965689, "loss": 1.3172, "step": 3165 }, { "epoch": 0.3810554153143407, "grad_norm": 0.5532881021499634, "learning_rate": 0.0004990064280767475, "loss": 0.9656, "step": 3170 }, { "epoch": 0.381656449092439, "grad_norm": 0.5996264219284058, "learning_rate": 0.0004990021766987923, "loss": 1.2688, "step": 3175 }, { "epoch": 0.38225748287053735, "grad_norm": 0.6474243402481079, "learning_rate": 0.0004989979162628582, "loss": 0.9461, "step": 3180 }, { "epoch": 0.3828585166486356, "grad_norm": 0.5693522691726685, "learning_rate": 0.0004989936467690998, "loss": 1.0906, "step": 3185 }, { "epoch": 0.38345955042673396, "grad_norm": 0.5393794775009155, "learning_rate": 0.0004989893682176727, "loss": 1.2445, "step": 3190 }, { "epoch": 0.3840605842048323, "grad_norm": 0.6597670316696167, "learning_rate": 0.0004989850806087325, "loss": 1.4289, "step": 3195 }, { "epoch": 0.38466161798293064, "grad_norm": 0.6367236971855164, "learning_rate": 0.0004989807839424352, "loss": 1.2266, "step": 3200 }, { "epoch": 0.38466161798293064, "eval_loss": 2.1474609375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1942, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 3200 }, { "epoch": 0.385262651761029, "grad_norm": 0.5790736079216003, "learning_rate": 0.0004989764782189369, "loss": 1.2164, "step": 3205 }, { "epoch": 0.3858636855391273, "grad_norm": 0.502346932888031, "learning_rate": 0.0004989721634383943, "loss": 1.3391, "step": 3210 }, { "epoch": 0.38646471931722565, "grad_norm": 0.6391408443450928, "learning_rate": 0.0004989678396009645, "loss": 1.2711, "step": 3215 }, { "epoch": 0.387065753095324, "grad_norm": 0.40186697244644165, "learning_rate": 0.0004989635067068047, "loss": 1.1691, "step": 3220 }, { "epoch": 0.38766678687342226, "grad_norm": 0.5028790235519409, "learning_rate": 0.0004989591647560726, "loss": 1.1609, "step": 3225 }, { "epoch": 0.3882678206515206, "grad_norm": 0.4072652757167816, "learning_rate": 0.0004989548137489259, "loss": 1.5672, "step": 3230 }, { "epoch": 0.38886885442961894, "grad_norm": 0.4738612174987793, "learning_rate": 0.0004989504536855232, "loss": 1.3344, "step": 3235 }, { "epoch": 0.3894698882077173, "grad_norm": 0.9375470876693726, "learning_rate": 0.0004989460845660229, "loss": 1.0484, "step": 3240 }, { "epoch": 0.3900709219858156, "grad_norm": 0.4789920449256897, "learning_rate": 0.000498941706390584, "loss": 1.1109, "step": 3245 }, { "epoch": 0.39067195576391395, "grad_norm": 0.6214213967323303, "learning_rate": 0.0004989373191593658, "loss": 1.3516, "step": 3250 }, { "epoch": 0.3912729895420123, "grad_norm": 0.49041748046875, "learning_rate": 0.0004989329228725277, "loss": 1.2977, "step": 3255 }, { "epoch": 0.39187402332011056, "grad_norm": 0.5892441272735596, "learning_rate": 0.00049892851753023, "loss": 1.1109, "step": 3260 }, { "epoch": 0.3924750570982089, "grad_norm": 0.4488331973552704, "learning_rate": 0.0004989241031326326, "loss": 1.2734, "step": 3265 }, { "epoch": 0.39307609087630724, "grad_norm": 0.43196800351142883, "learning_rate": 0.0004989196796798963, "loss": 1.1059, "step": 3270 }, { "epoch": 0.3936771246544056, "grad_norm": 0.3739017844200134, "learning_rate": 0.0004989152471721819, "loss": 0.8492, "step": 3275 }, { "epoch": 0.3942781584325039, "grad_norm": 0.5097094178199768, "learning_rate": 0.0004989108056096505, "loss": 1.1836, "step": 3280 }, { "epoch": 0.39487919221060225, "grad_norm": 0.7751893401145935, "learning_rate": 0.000498906354992464, "loss": 1.443, "step": 3285 }, { "epoch": 0.3954802259887006, "grad_norm": 0.6941812634468079, "learning_rate": 0.0004989018953207841, "loss": 1.2094, "step": 3290 }, { "epoch": 0.3960812597667989, "grad_norm": 0.5261918902397156, "learning_rate": 0.0004988974265947731, "loss": 1.1297, "step": 3295 }, { "epoch": 0.3966822935448972, "grad_norm": 0.6817163228988647, "learning_rate": 0.0004988929488145934, "loss": 1.5297, "step": 3300 }, { "epoch": 0.39728332732299554, "grad_norm": 0.6343579888343811, "learning_rate": 0.0004988884619804082, "loss": 1.1766, "step": 3305 }, { "epoch": 0.3978843611010939, "grad_norm": 0.4660755693912506, "learning_rate": 0.0004988839660923805, "loss": 0.9953, "step": 3310 }, { "epoch": 0.3984853948791922, "grad_norm": 0.6850960850715637, "learning_rate": 0.0004988794611506738, "loss": 1.4023, "step": 3315 }, { "epoch": 0.39908642865729055, "grad_norm": 0.7170994877815247, "learning_rate": 0.0004988749471554521, "loss": 1.1391, "step": 3320 }, { "epoch": 0.3996874624353889, "grad_norm": 0.9181567430496216, "learning_rate": 0.0004988704241068795, "loss": 1.3047, "step": 3325 }, { "epoch": 0.4002884962134872, "grad_norm": 0.5317991971969604, "learning_rate": 0.0004988658920051207, "loss": 1.3273, "step": 3330 }, { "epoch": 0.4008895299915855, "grad_norm": 0.5464211702346802, "learning_rate": 0.0004988613508503405, "loss": 1.0758, "step": 3335 }, { "epoch": 0.40149056376968384, "grad_norm": 0.664720892906189, "learning_rate": 0.0004988568006427039, "loss": 1.1383, "step": 3340 }, { "epoch": 0.4020915975477822, "grad_norm": 0.5137344002723694, "learning_rate": 0.0004988522413823767, "loss": 0.8992, "step": 3345 }, { "epoch": 0.4026926313258805, "grad_norm": 0.6021727919578552, "learning_rate": 0.0004988476730695246, "loss": 1.0391, "step": 3350 }, { "epoch": 0.40329366510397885, "grad_norm": 0.5493384003639221, "learning_rate": 0.0004988430957043138, "loss": 1.2063, "step": 3355 }, { "epoch": 0.4038946988820772, "grad_norm": 0.44344305992126465, "learning_rate": 0.0004988385092869109, "loss": 1.1539, "step": 3360 }, { "epoch": 0.4044957326601755, "grad_norm": 0.5496264100074768, "learning_rate": 0.0004988339138174827, "loss": 1.0008, "step": 3365 }, { "epoch": 0.40509676643827386, "grad_norm": 0.5868191719055176, "learning_rate": 0.0004988293092961962, "loss": 1.0273, "step": 3370 }, { "epoch": 0.40569780021637214, "grad_norm": 0.47252243757247925, "learning_rate": 0.0004988246957232191, "loss": 1.1547, "step": 3375 }, { "epoch": 0.4062988339944705, "grad_norm": 0.5384260416030884, "learning_rate": 0.0004988200730987192, "loss": 0.9969, "step": 3380 }, { "epoch": 0.4068998677725688, "grad_norm": 0.5157658457756042, "learning_rate": 0.0004988154414228645, "loss": 1.218, "step": 3385 }, { "epoch": 0.40750090155066715, "grad_norm": 0.609667181968689, "learning_rate": 0.0004988108006958237, "loss": 1.2977, "step": 3390 }, { "epoch": 0.4081019353287655, "grad_norm": 0.6255223751068115, "learning_rate": 0.0004988061509177656, "loss": 1.0859, "step": 3395 }, { "epoch": 0.4087029691068638, "grad_norm": 0.48681220412254333, "learning_rate": 0.0004988014920888592, "loss": 1.1094, "step": 3400 }, { "epoch": 0.4087029691068638, "eval_loss": 2.1009764671325684, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1797, "eval_samples_per_second": 4.548, "eval_steps_per_second": 1.137, "step": 3400 }, { "epoch": 0.40930400288496216, "grad_norm": 0.7313075065612793, "learning_rate": 0.0004987968242092741, "loss": 1.4648, "step": 3405 }, { "epoch": 0.40990503666306044, "grad_norm": 0.3973584771156311, "learning_rate": 0.00049879214727918, "loss": 1.4195, "step": 3410 }, { "epoch": 0.4105060704411588, "grad_norm": 0.5780074596405029, "learning_rate": 0.0004987874612987471, "loss": 1.1383, "step": 3415 }, { "epoch": 0.4111071042192571, "grad_norm": 0.5280987620353699, "learning_rate": 0.0004987827662681459, "loss": 1.2125, "step": 3420 }, { "epoch": 0.41170813799735545, "grad_norm": 0.672178328037262, "learning_rate": 0.0004987780621875471, "loss": 1.2563, "step": 3425 }, { "epoch": 0.4123091717754538, "grad_norm": 0.5133812427520752, "learning_rate": 0.0004987733490571218, "loss": 1.2266, "step": 3430 }, { "epoch": 0.4129102055535521, "grad_norm": 0.7771773338317871, "learning_rate": 0.0004987686268770415, "loss": 1.2648, "step": 3435 }, { "epoch": 0.41351123933165046, "grad_norm": 0.555830180644989, "learning_rate": 0.0004987638956474781, "loss": 1.2445, "step": 3440 }, { "epoch": 0.41411227310974874, "grad_norm": 0.377913236618042, "learning_rate": 0.0004987591553686035, "loss": 1.0813, "step": 3445 }, { "epoch": 0.4147133068878471, "grad_norm": 0.6742091774940491, "learning_rate": 0.0004987544060405903, "loss": 1.1516, "step": 3450 }, { "epoch": 0.4153143406659454, "grad_norm": 0.6832530498504639, "learning_rate": 0.0004987496476636112, "loss": 0.9953, "step": 3455 }, { "epoch": 0.41591537444404375, "grad_norm": 0.7047104835510254, "learning_rate": 0.0004987448802378393, "loss": 1.2781, "step": 3460 }, { "epoch": 0.4165164082221421, "grad_norm": 0.5788468718528748, "learning_rate": 0.000498740103763448, "loss": 1.1266, "step": 3465 }, { "epoch": 0.4171174420002404, "grad_norm": 0.6551057696342468, "learning_rate": 0.0004987353182406111, "loss": 0.9711, "step": 3470 }, { "epoch": 0.41771847577833876, "grad_norm": 0.7851700782775879, "learning_rate": 0.0004987305236695027, "loss": 1.2258, "step": 3475 }, { "epoch": 0.4183195095564371, "grad_norm": 0.45742517709732056, "learning_rate": 0.000498725720050297, "loss": 1.1375, "step": 3480 }, { "epoch": 0.4189205433345354, "grad_norm": 0.5413776636123657, "learning_rate": 0.0004987209073831691, "loss": 1.3938, "step": 3485 }, { "epoch": 0.4195215771126337, "grad_norm": 0.5107463598251343, "learning_rate": 0.0004987160856682938, "loss": 1.0172, "step": 3490 }, { "epoch": 0.42012261089073205, "grad_norm": 0.6791651844978333, "learning_rate": 0.0004987112549058466, "loss": 1.2555, "step": 3495 }, { "epoch": 0.4207236446688304, "grad_norm": 0.71052086353302, "learning_rate": 0.0004987064150960033, "loss": 1.057, "step": 3500 }, { "epoch": 0.4213246784469287, "grad_norm": 0.5631623268127441, "learning_rate": 0.0004987015662389398, "loss": 1.143, "step": 3505 }, { "epoch": 0.42192571222502706, "grad_norm": 0.7245007753372192, "learning_rate": 0.0004986967083348325, "loss": 1.0203, "step": 3510 }, { "epoch": 0.4225267460031254, "grad_norm": 0.5436373353004456, "learning_rate": 0.0004986918413838583, "loss": 1.0805, "step": 3515 }, { "epoch": 0.4231277797812237, "grad_norm": 0.4831235110759735, "learning_rate": 0.0004986869653861941, "loss": 1.2867, "step": 3520 }, { "epoch": 0.423728813559322, "grad_norm": 0.4958447813987732, "learning_rate": 0.0004986820803420172, "loss": 1.143, "step": 3525 }, { "epoch": 0.42432984733742035, "grad_norm": 0.7917611002922058, "learning_rate": 0.0004986771862515055, "loss": 1.2012, "step": 3530 }, { "epoch": 0.4249308811155187, "grad_norm": 0.6926260590553284, "learning_rate": 0.0004986722831148369, "loss": 1.3297, "step": 3535 }, { "epoch": 0.425531914893617, "grad_norm": 0.5297942757606506, "learning_rate": 0.0004986673709321898, "loss": 1.0195, "step": 3540 }, { "epoch": 0.42613294867171536, "grad_norm": 0.6709704399108887, "learning_rate": 0.0004986624497037429, "loss": 1.307, "step": 3545 }, { "epoch": 0.4267339824498137, "grad_norm": 0.586955189704895, "learning_rate": 0.0004986575194296752, "loss": 0.9141, "step": 3550 }, { "epoch": 0.42733501622791203, "grad_norm": 0.573215663433075, "learning_rate": 0.000498652580110166, "loss": 1.0344, "step": 3555 }, { "epoch": 0.4279360500060103, "grad_norm": 0.4756597578525543, "learning_rate": 0.0004986476317453951, "loss": 1.1664, "step": 3560 }, { "epoch": 0.42853708378410865, "grad_norm": 0.5231835246086121, "learning_rate": 0.0004986426743355425, "loss": 1.1281, "step": 3565 }, { "epoch": 0.429138117562207, "grad_norm": 0.4669915735721588, "learning_rate": 0.0004986377078807884, "loss": 1.2641, "step": 3570 }, { "epoch": 0.4297391513403053, "grad_norm": 0.49261239171028137, "learning_rate": 0.0004986327323813135, "loss": 1.1812, "step": 3575 }, { "epoch": 0.43034018511840366, "grad_norm": 0.5701055526733398, "learning_rate": 0.0004986277478372989, "loss": 0.943, "step": 3580 }, { "epoch": 0.430941218896502, "grad_norm": 0.8740291595458984, "learning_rate": 0.0004986227542489259, "loss": 1.4688, "step": 3585 }, { "epoch": 0.43154225267460034, "grad_norm": 0.5957376956939697, "learning_rate": 0.000498617751616376, "loss": 1.0031, "step": 3590 }, { "epoch": 0.4321432864526986, "grad_norm": 0.5455657243728638, "learning_rate": 0.0004986127399398315, "loss": 1.1375, "step": 3595 }, { "epoch": 0.43274432023079695, "grad_norm": 0.7837244868278503, "learning_rate": 0.0004986077192194743, "loss": 1.1492, "step": 3600 }, { "epoch": 0.43274432023079695, "eval_loss": 2.097851514816284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2375, "eval_samples_per_second": 4.541, "eval_steps_per_second": 1.135, "step": 3600 }, { "epoch": 0.4333453540088953, "grad_norm": 0.6420156359672546, "learning_rate": 0.0004986026894554874, "loss": 1.6125, "step": 3605 }, { "epoch": 0.4339463877869936, "grad_norm": 0.37207821011543274, "learning_rate": 0.0004985976506480535, "loss": 1.0586, "step": 3610 }, { "epoch": 0.43454742156509196, "grad_norm": 0.4620397090911865, "learning_rate": 0.000498592602797356, "loss": 1.1328, "step": 3615 }, { "epoch": 0.4351484553431903, "grad_norm": 0.8707792162895203, "learning_rate": 0.0004985875459035786, "loss": 1.2578, "step": 3620 }, { "epoch": 0.43574948912128864, "grad_norm": 0.5785757303237915, "learning_rate": 0.0004985824799669052, "loss": 1.1313, "step": 3625 }, { "epoch": 0.43635052289938697, "grad_norm": 0.581447958946228, "learning_rate": 0.00049857740498752, "loss": 1.2219, "step": 3630 }, { "epoch": 0.43695155667748525, "grad_norm": 0.5232359170913696, "learning_rate": 0.0004985723209656078, "loss": 1.0891, "step": 3635 }, { "epoch": 0.4375525904555836, "grad_norm": 0.4474778175354004, "learning_rate": 0.0004985672279013534, "loss": 1.625, "step": 3640 }, { "epoch": 0.4381536242336819, "grad_norm": 0.7095141410827637, "learning_rate": 0.000498562125794942, "loss": 0.9273, "step": 3645 }, { "epoch": 0.43875465801178026, "grad_norm": 0.6456997394561768, "learning_rate": 0.0004985570146465593, "loss": 1.1453, "step": 3650 }, { "epoch": 0.4393556917898786, "grad_norm": 0.7230183482170105, "learning_rate": 0.0004985518944563914, "loss": 1.15, "step": 3655 }, { "epoch": 0.43995672556797694, "grad_norm": 0.6240507364273071, "learning_rate": 0.0004985467652246243, "loss": 1.0586, "step": 3660 }, { "epoch": 0.4405577593460753, "grad_norm": 1.0956453084945679, "learning_rate": 0.0004985416269514447, "loss": 1.3141, "step": 3665 }, { "epoch": 0.44115879312417355, "grad_norm": 0.4983053505420685, "learning_rate": 0.0004985364796370394, "loss": 1.0805, "step": 3670 }, { "epoch": 0.4417598269022719, "grad_norm": 0.5128179788589478, "learning_rate": 0.0004985313232815958, "loss": 0.9055, "step": 3675 }, { "epoch": 0.4423608606803702, "grad_norm": 0.535322368144989, "learning_rate": 0.0004985261578853014, "loss": 0.9563, "step": 3680 }, { "epoch": 0.44296189445846856, "grad_norm": 0.6636273264884949, "learning_rate": 0.000498520983448344, "loss": 1.2719, "step": 3685 }, { "epoch": 0.4435629282365669, "grad_norm": 0.7313540577888489, "learning_rate": 0.0004985157999709122, "loss": 1.5672, "step": 3690 }, { "epoch": 0.44416396201466524, "grad_norm": 0.45398956537246704, "learning_rate": 0.000498510607453194, "loss": 1.0688, "step": 3695 }, { "epoch": 0.4447649957927636, "grad_norm": 0.4940120577812195, "learning_rate": 0.0004985054058953788, "loss": 1.2211, "step": 3700 }, { "epoch": 0.4453660295708619, "grad_norm": 0.48061200976371765, "learning_rate": 0.0004985001952976556, "loss": 1.1328, "step": 3705 }, { "epoch": 0.4459670633489602, "grad_norm": 0.6256884336471558, "learning_rate": 0.0004984949756602139, "loss": 1.2969, "step": 3710 }, { "epoch": 0.4465680971270585, "grad_norm": 0.5207997560501099, "learning_rate": 0.0004984897469832437, "loss": 1.3641, "step": 3715 }, { "epoch": 0.44716913090515686, "grad_norm": 0.4868186414241791, "learning_rate": 0.000498484509266935, "loss": 0.8984, "step": 3720 }, { "epoch": 0.4477701646832552, "grad_norm": 0.5324142575263977, "learning_rate": 0.0004984792625114786, "loss": 1.1617, "step": 3725 }, { "epoch": 0.44837119846135354, "grad_norm": 0.5691832304000854, "learning_rate": 0.0004984740067170651, "loss": 1.2117, "step": 3730 }, { "epoch": 0.4489722322394519, "grad_norm": 0.419612318277359, "learning_rate": 0.0004984687418838859, "loss": 1.0063, "step": 3735 }, { "epoch": 0.4495732660175502, "grad_norm": 0.5638041496276855, "learning_rate": 0.0004984634680121325, "loss": 1.0805, "step": 3740 }, { "epoch": 0.4501742997956485, "grad_norm": 0.5611728429794312, "learning_rate": 0.0004984581851019966, "loss": 1.282, "step": 3745 }, { "epoch": 0.45077533357374683, "grad_norm": 0.6137074828147888, "learning_rate": 0.0004984528931536705, "loss": 1.2445, "step": 3750 }, { "epoch": 0.45137636735184516, "grad_norm": 0.4265820384025574, "learning_rate": 0.0004984475921673466, "loss": 0.9336, "step": 3755 }, { "epoch": 0.4519774011299435, "grad_norm": 0.6829708218574524, "learning_rate": 0.0004984422821432178, "loss": 1.4375, "step": 3760 }, { "epoch": 0.45257843490804184, "grad_norm": 0.41247573494911194, "learning_rate": 0.0004984369630814773, "loss": 1.3852, "step": 3765 }, { "epoch": 0.4531794686861402, "grad_norm": 0.5414575338363647, "learning_rate": 0.0004984316349823186, "loss": 1.0758, "step": 3770 }, { "epoch": 0.4537805024642385, "grad_norm": 0.5673078298568726, "learning_rate": 0.0004984262978459355, "loss": 0.9523, "step": 3775 }, { "epoch": 0.4543815362423368, "grad_norm": 0.5617050528526306, "learning_rate": 0.0004984209516725221, "loss": 1.0805, "step": 3780 }, { "epoch": 0.45498257002043513, "grad_norm": 0.5333116054534912, "learning_rate": 0.0004984155964622729, "loss": 1.0984, "step": 3785 }, { "epoch": 0.45558360379853347, "grad_norm": 0.6070541739463806, "learning_rate": 0.0004984102322153827, "loss": 1.2125, "step": 3790 }, { "epoch": 0.4561846375766318, "grad_norm": 0.6610611081123352, "learning_rate": 0.0004984048589320467, "loss": 1.1281, "step": 3795 }, { "epoch": 0.45678567135473014, "grad_norm": 0.6735252737998962, "learning_rate": 0.0004983994766124602, "loss": 1.0371, "step": 3800 }, { "epoch": 0.45678567135473014, "eval_loss": 2.081835985183716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1969, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 3800 }, { "epoch": 0.4573867051328285, "grad_norm": 0.48256927728652954, "learning_rate": 0.0004983940852568193, "loss": 1.0188, "step": 3805 }, { "epoch": 0.4579877389109268, "grad_norm": 0.4252387583255768, "learning_rate": 0.0004983886848653197, "loss": 1.1875, "step": 3810 }, { "epoch": 0.45858877268902515, "grad_norm": 0.5931745767593384, "learning_rate": 0.0004983832754381582, "loss": 1.0766, "step": 3815 }, { "epoch": 0.45918980646712343, "grad_norm": 0.5438306927680969, "learning_rate": 0.0004983778569755315, "loss": 1.4563, "step": 3820 }, { "epoch": 0.45979084024522177, "grad_norm": 0.4355262219905853, "learning_rate": 0.0004983724294776366, "loss": 1.0938, "step": 3825 }, { "epoch": 0.4603918740233201, "grad_norm": 0.44057753682136536, "learning_rate": 0.0004983669929446711, "loss": 1.0375, "step": 3830 }, { "epoch": 0.46099290780141844, "grad_norm": 0.5248241424560547, "learning_rate": 0.0004983615473768326, "loss": 1.1828, "step": 3835 }, { "epoch": 0.4615939415795168, "grad_norm": 0.6102302074432373, "learning_rate": 0.0004983560927743193, "loss": 1.2516, "step": 3840 }, { "epoch": 0.4621949753576151, "grad_norm": 0.6916151642799377, "learning_rate": 0.0004983506291373295, "loss": 1.6047, "step": 3845 }, { "epoch": 0.46279600913571345, "grad_norm": 0.5055292844772339, "learning_rate": 0.0004983451564660622, "loss": 1.0172, "step": 3850 }, { "epoch": 0.46339704291381173, "grad_norm": 0.6418437957763672, "learning_rate": 0.0004983396747607161, "loss": 1.1805, "step": 3855 }, { "epoch": 0.46399807669191007, "grad_norm": 0.5011945962905884, "learning_rate": 0.000498334184021491, "loss": 1.0055, "step": 3860 }, { "epoch": 0.4645991104700084, "grad_norm": 0.5504122376441956, "learning_rate": 0.0004983286842485864, "loss": 0.9742, "step": 3865 }, { "epoch": 0.46520014424810674, "grad_norm": 0.3638380467891693, "learning_rate": 0.0004983231754422024, "loss": 0.9906, "step": 3870 }, { "epoch": 0.4658011780262051, "grad_norm": 0.5893705487251282, "learning_rate": 0.0004983176576025394, "loss": 1.0367, "step": 3875 }, { "epoch": 0.4664022118043034, "grad_norm": 0.5191811323165894, "learning_rate": 0.0004983121307297983, "loss": 1.4234, "step": 3880 }, { "epoch": 0.46700324558240175, "grad_norm": 0.5002449750900269, "learning_rate": 0.0004983065948241799, "loss": 1.1242, "step": 3885 }, { "epoch": 0.4676042793605001, "grad_norm": 0.5473368167877197, "learning_rate": 0.0004983010498858857, "loss": 1.1219, "step": 3890 }, { "epoch": 0.46820531313859837, "grad_norm": 0.5295068621635437, "learning_rate": 0.0004982954959151174, "loss": 1.4508, "step": 3895 }, { "epoch": 0.4688063469166967, "grad_norm": 0.8546839356422424, "learning_rate": 0.000498289932912077, "loss": 0.9664, "step": 3900 }, { "epoch": 0.46940738069479504, "grad_norm": 0.6534102559089661, "learning_rate": 0.000498284360876967, "loss": 1.4344, "step": 3905 }, { "epoch": 0.4700084144728934, "grad_norm": 0.4570360779762268, "learning_rate": 0.0004982787798099898, "loss": 0.9531, "step": 3910 }, { "epoch": 0.4706094482509917, "grad_norm": 0.5392407178878784, "learning_rate": 0.0004982731897113488, "loss": 1.243, "step": 3915 }, { "epoch": 0.47121048202909005, "grad_norm": 0.7176635265350342, "learning_rate": 0.0004982675905812469, "loss": 0.968, "step": 3920 }, { "epoch": 0.4718115158071884, "grad_norm": 0.5123677253723145, "learning_rate": 0.0004982619824198882, "loss": 1.4969, "step": 3925 }, { "epoch": 0.47241254958528667, "grad_norm": 0.5915489196777344, "learning_rate": 0.0004982563652274766, "loss": 1.3805, "step": 3930 }, { "epoch": 0.473013583363385, "grad_norm": 0.5139390230178833, "learning_rate": 0.0004982507390042163, "loss": 1.057, "step": 3935 }, { "epoch": 0.47361461714148334, "grad_norm": 0.6480752229690552, "learning_rate": 0.0004982451037503121, "loss": 1.0063, "step": 3940 }, { "epoch": 0.4742156509195817, "grad_norm": 0.6889259219169617, "learning_rate": 0.0004982394594659689, "loss": 1.4828, "step": 3945 }, { "epoch": 0.47481668469768, "grad_norm": 0.5907976627349854, "learning_rate": 0.0004982338061513921, "loss": 1.2148, "step": 3950 }, { "epoch": 0.47541771847577835, "grad_norm": 0.6065306067466736, "learning_rate": 0.0004982281438067874, "loss": 1.3336, "step": 3955 }, { "epoch": 0.4760187522538767, "grad_norm": 0.5058939456939697, "learning_rate": 0.0004982224724323606, "loss": 1.2148, "step": 3960 }, { "epoch": 0.476619786031975, "grad_norm": 0.633465588092804, "learning_rate": 0.0004982167920283181, "loss": 1.1977, "step": 3965 }, { "epoch": 0.4772208198100733, "grad_norm": 0.48402997851371765, "learning_rate": 0.0004982111025948666, "loss": 1.0758, "step": 3970 }, { "epoch": 0.47782185358817164, "grad_norm": 0.4740035831928253, "learning_rate": 0.000498205404132213, "loss": 0.9914, "step": 3975 }, { "epoch": 0.47842288736627, "grad_norm": 0.6403579711914062, "learning_rate": 0.0004981996966405646, "loss": 0.9035, "step": 3980 }, { "epoch": 0.4790239211443683, "grad_norm": 0.6459212303161621, "learning_rate": 0.000498193980120129, "loss": 1.1273, "step": 3985 }, { "epoch": 0.47962495492246665, "grad_norm": 0.5674965977668762, "learning_rate": 0.0004981882545711142, "loss": 1.3828, "step": 3990 }, { "epoch": 0.480225988700565, "grad_norm": 0.6159428954124451, "learning_rate": 0.0004981825199937285, "loss": 1.2344, "step": 3995 }, { "epoch": 0.4808270224786633, "grad_norm": 0.6476476788520813, "learning_rate": 0.0004981767763881803, "loss": 1.4625, "step": 4000 }, { "epoch": 0.4808270224786633, "eval_loss": 2.0904297828674316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1956, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 4000 }, { "epoch": 0.4814280562567616, "grad_norm": 0.7665592432022095, "learning_rate": 0.0004981710237546789, "loss": 1.2703, "step": 4005 }, { "epoch": 0.48202909003485994, "grad_norm": 0.5057299733161926, "learning_rate": 0.0004981652620934333, "loss": 1.3039, "step": 4010 }, { "epoch": 0.4826301238129583, "grad_norm": 0.46735164523124695, "learning_rate": 0.0004981594914046532, "loss": 1.0281, "step": 4015 }, { "epoch": 0.4832311575910566, "grad_norm": 0.6298277378082275, "learning_rate": 0.0004981537116885484, "loss": 1.0656, "step": 4020 }, { "epoch": 0.48383219136915495, "grad_norm": 0.6472790837287903, "learning_rate": 0.0004981479229453292, "loss": 1.0418, "step": 4025 }, { "epoch": 0.4844332251472533, "grad_norm": 0.5558010339736938, "learning_rate": 0.0004981421251752063, "loss": 1.0992, "step": 4030 }, { "epoch": 0.4850342589253516, "grad_norm": 0.46242016553878784, "learning_rate": 0.0004981363183783903, "loss": 1.002, "step": 4035 }, { "epoch": 0.48563529270344996, "grad_norm": 0.48881420493125916, "learning_rate": 0.0004981305025550929, "loss": 1.1867, "step": 4040 }, { "epoch": 0.48623632648154824, "grad_norm": 0.4354248642921448, "learning_rate": 0.0004981246777055252, "loss": 1.4141, "step": 4045 }, { "epoch": 0.4868373602596466, "grad_norm": 0.5004878044128418, "learning_rate": 0.0004981188438298995, "loss": 0.9684, "step": 4050 }, { "epoch": 0.4874383940377449, "grad_norm": 0.5613626837730408, "learning_rate": 0.0004981130009284277, "loss": 1.1555, "step": 4055 }, { "epoch": 0.48803942781584325, "grad_norm": 0.4278530776500702, "learning_rate": 0.0004981071490013225, "loss": 1.1289, "step": 4060 }, { "epoch": 0.4886404615939416, "grad_norm": 0.6259580850601196, "learning_rate": 0.0004981012880487968, "loss": 0.9828, "step": 4065 }, { "epoch": 0.4892414953720399, "grad_norm": 0.5686140656471252, "learning_rate": 0.0004980954180710636, "loss": 1.1805, "step": 4070 }, { "epoch": 0.48984252915013826, "grad_norm": 1.0444506406784058, "learning_rate": 0.0004980895390683367, "loss": 0.9242, "step": 4075 }, { "epoch": 0.49044356292823654, "grad_norm": 0.49180054664611816, "learning_rate": 0.0004980836510408297, "loss": 0.9906, "step": 4080 }, { "epoch": 0.4910445967063349, "grad_norm": 0.39391833543777466, "learning_rate": 0.000498077753988757, "loss": 0.9578, "step": 4085 }, { "epoch": 0.4916456304844332, "grad_norm": 0.6872186660766602, "learning_rate": 0.0004980718479123332, "loss": 1.2711, "step": 4090 }, { "epoch": 0.49224666426253155, "grad_norm": 0.4728960692882538, "learning_rate": 0.0004980659328117728, "loss": 1.0828, "step": 4095 }, { "epoch": 0.4928476980406299, "grad_norm": 0.4187394380569458, "learning_rate": 0.0004980600086872913, "loss": 1.0727, "step": 4100 }, { "epoch": 0.4934487318187282, "grad_norm": 0.5191169381141663, "learning_rate": 0.000498054075539104, "loss": 0.9246, "step": 4105 }, { "epoch": 0.49404976559682656, "grad_norm": 0.5990204811096191, "learning_rate": 0.0004980481333674269, "loss": 1.2625, "step": 4110 }, { "epoch": 0.49465079937492484, "grad_norm": 0.5898841619491577, "learning_rate": 0.0004980421821724759, "loss": 0.9133, "step": 4115 }, { "epoch": 0.4952518331530232, "grad_norm": 0.576693594455719, "learning_rate": 0.0004980362219544677, "loss": 1.2969, "step": 4120 }, { "epoch": 0.4958528669311215, "grad_norm": 0.5620527267456055, "learning_rate": 0.000498030252713619, "loss": 0.9883, "step": 4125 }, { "epoch": 0.49645390070921985, "grad_norm": 0.46309694647789, "learning_rate": 0.0004980242744501472, "loss": 1.3211, "step": 4130 }, { "epoch": 0.4970549344873182, "grad_norm": 0.6207218170166016, "learning_rate": 0.0004980182871642694, "loss": 1.2563, "step": 4135 }, { "epoch": 0.4976559682654165, "grad_norm": 0.5662885904312134, "learning_rate": 0.0004980122908562036, "loss": 1.3328, "step": 4140 }, { "epoch": 0.49825700204351486, "grad_norm": 0.6134732961654663, "learning_rate": 0.000498006285526168, "loss": 1.132, "step": 4145 }, { "epoch": 0.4988580358216132, "grad_norm": 0.7582330107688904, "learning_rate": 0.0004980002711743809, "loss": 0.8836, "step": 4150 }, { "epoch": 0.4994590695997115, "grad_norm": 0.6320111155509949, "learning_rate": 0.0004979942478010612, "loss": 1.1727, "step": 4155 }, { "epoch": 0.5000601033778098, "grad_norm": 0.5162654519081116, "learning_rate": 0.0004979882154064279, "loss": 1.2016, "step": 4160 }, { "epoch": 0.5006611371559082, "grad_norm": 0.25934234261512756, "learning_rate": 0.0004979821739907005, "loss": 1.0988, "step": 4165 }, { "epoch": 0.5012621709340065, "grad_norm": 0.6046193838119507, "learning_rate": 0.0004979761235540988, "loss": 1.3953, "step": 4170 }, { "epoch": 0.5018632047121048, "grad_norm": 0.5944646596908569, "learning_rate": 0.0004979700640968429, "loss": 1.1305, "step": 4175 }, { "epoch": 0.5024642384902032, "grad_norm": 0.5635605454444885, "learning_rate": 0.0004979639956191531, "loss": 1.2656, "step": 4180 }, { "epoch": 0.5030652722683014, "grad_norm": 0.5029686689376831, "learning_rate": 0.0004979579181212504, "loss": 1.2305, "step": 4185 }, { "epoch": 0.5036663060463998, "grad_norm": 0.8044158816337585, "learning_rate": 0.0004979518316033556, "loss": 1.225, "step": 4190 }, { "epoch": 0.5042673398244981, "grad_norm": 0.6193830966949463, "learning_rate": 0.0004979457360656902, "loss": 0.9453, "step": 4195 }, { "epoch": 0.5048683736025965, "grad_norm": 0.5999032258987427, "learning_rate": 0.0004979396315084761, "loss": 1.2148, "step": 4200 }, { "epoch": 0.5048683736025965, "eval_loss": 2.069140672683716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1981, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 4200 }, { "epoch": 0.5054694073806948, "grad_norm": 0.6504601836204529, "learning_rate": 0.0004979335179319352, "loss": 1.082, "step": 4205 }, { "epoch": 0.5060704411587931, "grad_norm": 0.5235728025436401, "learning_rate": 0.00049792739533629, "loss": 0.9156, "step": 4210 }, { "epoch": 0.5066714749368915, "grad_norm": 0.46760815382003784, "learning_rate": 0.0004979212637217631, "loss": 1.007, "step": 4215 }, { "epoch": 0.5072725087149897, "grad_norm": 0.5325601696968079, "learning_rate": 0.0004979151230885776, "loss": 1.1094, "step": 4220 }, { "epoch": 0.5078735424930881, "grad_norm": 0.4098140299320221, "learning_rate": 0.0004979089734369568, "loss": 0.8449, "step": 4225 }, { "epoch": 0.5084745762711864, "grad_norm": 0.6042998433113098, "learning_rate": 0.0004979028147671246, "loss": 1.1672, "step": 4230 }, { "epoch": 0.5090756100492848, "grad_norm": 0.41027069091796875, "learning_rate": 0.0004978966470793049, "loss": 1.1992, "step": 4235 }, { "epoch": 0.5096766438273831, "grad_norm": 0.5344287753105164, "learning_rate": 0.0004978904703737221, "loss": 0.9934, "step": 4240 }, { "epoch": 0.5102776776054815, "grad_norm": 0.5234737992286682, "learning_rate": 0.000497884284650601, "loss": 1.0617, "step": 4245 }, { "epoch": 0.5108787113835798, "grad_norm": 0.4804365336894989, "learning_rate": 0.0004978780899101663, "loss": 1.0098, "step": 4250 }, { "epoch": 0.511479745161678, "grad_norm": 0.6976252198219299, "learning_rate": 0.0004978718861526438, "loss": 1.2852, "step": 4255 }, { "epoch": 0.5120807789397764, "grad_norm": 0.4136990010738373, "learning_rate": 0.0004978656733782588, "loss": 1.1062, "step": 4260 }, { "epoch": 0.5126818127178747, "grad_norm": 0.5344454646110535, "learning_rate": 0.0004978594515872373, "loss": 1.2984, "step": 4265 }, { "epoch": 0.5132828464959731, "grad_norm": 0.42136499285697937, "learning_rate": 0.0004978532207798059, "loss": 1.0766, "step": 4270 }, { "epoch": 0.5138838802740714, "grad_norm": 0.4809805750846863, "learning_rate": 0.0004978469809561911, "loss": 1.1469, "step": 4275 }, { "epoch": 0.5144849140521698, "grad_norm": 0.5629724264144897, "learning_rate": 0.0004978407321166199, "loss": 1.218, "step": 4280 }, { "epoch": 0.5150859478302681, "grad_norm": 0.34920796751976013, "learning_rate": 0.0004978344742613195, "loss": 0.8383, "step": 4285 }, { "epoch": 0.5156869816083663, "grad_norm": 0.5682929158210754, "learning_rate": 0.0004978282073905178, "loss": 1.198, "step": 4290 }, { "epoch": 0.5162880153864647, "grad_norm": 0.49201318621635437, "learning_rate": 0.0004978219315044426, "loss": 1.2352, "step": 4295 }, { "epoch": 0.516889049164563, "grad_norm": 0.721458911895752, "learning_rate": 0.0004978156466033222, "loss": 0.9203, "step": 4300 }, { "epoch": 0.5174900829426614, "grad_norm": 0.5688953995704651, "learning_rate": 0.0004978093526873853, "loss": 1.1977, "step": 4305 }, { "epoch": 0.5180911167207597, "grad_norm": 0.6028774976730347, "learning_rate": 0.0004978030497568607, "loss": 1.475, "step": 4310 }, { "epoch": 0.5186921504988581, "grad_norm": 0.6064693331718445, "learning_rate": 0.000497796737811978, "loss": 1.1285, "step": 4315 }, { "epoch": 0.5192931842769564, "grad_norm": 0.47966837882995605, "learning_rate": 0.0004977904168529664, "loss": 1.2102, "step": 4320 }, { "epoch": 0.5198942180550546, "grad_norm": 0.7327610850334167, "learning_rate": 0.0004977840868800561, "loss": 1.1367, "step": 4325 }, { "epoch": 0.520495251833153, "grad_norm": 0.5901091694831848, "learning_rate": 0.0004977777478934774, "loss": 1.3352, "step": 4330 }, { "epoch": 0.5210962856112513, "grad_norm": 0.5034139752388, "learning_rate": 0.0004977713998934607, "loss": 0.9094, "step": 4335 }, { "epoch": 0.5216973193893497, "grad_norm": 0.43822070956230164, "learning_rate": 0.0004977650428802371, "loss": 1.3539, "step": 4340 }, { "epoch": 0.522298353167448, "grad_norm": 0.5016989707946777, "learning_rate": 0.0004977586768540377, "loss": 1.2398, "step": 4345 }, { "epoch": 0.5228993869455464, "grad_norm": 0.4488658607006073, "learning_rate": 0.0004977523018150941, "loss": 1.0188, "step": 4350 }, { "epoch": 0.5235004207236447, "grad_norm": 0.5360990762710571, "learning_rate": 0.0004977459177636384, "loss": 1.1133, "step": 4355 }, { "epoch": 0.524101454501743, "grad_norm": 0.41790762543678284, "learning_rate": 0.0004977395246999026, "loss": 1.0016, "step": 4360 }, { "epoch": 0.5247024882798413, "grad_norm": 0.8560382723808289, "learning_rate": 0.0004977331226241194, "loss": 1.1953, "step": 4365 }, { "epoch": 0.5253035220579396, "grad_norm": 0.5320670008659363, "learning_rate": 0.0004977267115365216, "loss": 1.0711, "step": 4370 }, { "epoch": 0.525904555836038, "grad_norm": 0.5246152877807617, "learning_rate": 0.0004977202914373426, "loss": 1.2891, "step": 4375 }, { "epoch": 0.5265055896141363, "grad_norm": 0.6537752747535706, "learning_rate": 0.0004977138623268156, "loss": 1.2719, "step": 4380 }, { "epoch": 0.5271066233922347, "grad_norm": 0.5286105275154114, "learning_rate": 0.0004977074242051748, "loss": 1.1391, "step": 4385 }, { "epoch": 0.527707657170333, "grad_norm": 0.5731997489929199, "learning_rate": 0.0004977009770726541, "loss": 1.0484, "step": 4390 }, { "epoch": 0.5283086909484312, "grad_norm": 0.6958596110343933, "learning_rate": 0.0004976945209294884, "loss": 0.9648, "step": 4395 }, { "epoch": 0.5289097247265296, "grad_norm": 0.8171592950820923, "learning_rate": 0.0004976880557759124, "loss": 1.5773, "step": 4400 }, { "epoch": 0.5289097247265296, "eval_loss": 2.049023389816284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1838, "eval_samples_per_second": 4.548, "eval_steps_per_second": 1.137, "step": 4400 }, { "epoch": 0.5295107585046279, "grad_norm": 0.4283256530761719, "learning_rate": 0.000497681581612161, "loss": 0.9379, "step": 4405 }, { "epoch": 0.5301117922827263, "grad_norm": 0.6683317422866821, "learning_rate": 0.0004976750984384701, "loss": 1.3484, "step": 4410 }, { "epoch": 0.5307128260608246, "grad_norm": 0.7179862260818481, "learning_rate": 0.0004976686062550754, "loss": 0.7375, "step": 4415 }, { "epoch": 0.531313859838923, "grad_norm": 0.5547930002212524, "learning_rate": 0.000497662105062213, "loss": 0.932, "step": 4420 }, { "epoch": 0.5319148936170213, "grad_norm": 0.6749304533004761, "learning_rate": 0.0004976555948601194, "loss": 1.0969, "step": 4425 }, { "epoch": 0.5325159273951197, "grad_norm": 0.4598415493965149, "learning_rate": 0.0004976490756490316, "loss": 1.2563, "step": 4430 }, { "epoch": 0.5331169611732179, "grad_norm": 1.0315395593643188, "learning_rate": 0.0004976425474291866, "loss": 0.9234, "step": 4435 }, { "epoch": 0.5337179949513162, "grad_norm": 0.5202517509460449, "learning_rate": 0.0004976360102008219, "loss": 0.9219, "step": 4440 }, { "epoch": 0.5343190287294146, "grad_norm": 0.6117017269134521, "learning_rate": 0.0004976294639641753, "loss": 1.1805, "step": 4445 }, { "epoch": 0.5349200625075129, "grad_norm": 0.6244452595710754, "learning_rate": 0.000497622908719485, "loss": 1.1406, "step": 4450 }, { "epoch": 0.5355210962856113, "grad_norm": 0.6520891785621643, "learning_rate": 0.0004976163444669893, "loss": 1.2078, "step": 4455 }, { "epoch": 0.5361221300637096, "grad_norm": 0.6139251589775085, "learning_rate": 0.0004976097712069272, "loss": 1.0719, "step": 4460 }, { "epoch": 0.536723163841808, "grad_norm": 0.46594560146331787, "learning_rate": 0.0004976031889395376, "loss": 0.9531, "step": 4465 }, { "epoch": 0.5373241976199062, "grad_norm": 0.6443547606468201, "learning_rate": 0.0004975965976650601, "loss": 1.0945, "step": 4470 }, { "epoch": 0.5379252313980045, "grad_norm": 0.6175803542137146, "learning_rate": 0.0004975899973837344, "loss": 1.5594, "step": 4475 }, { "epoch": 0.5385262651761029, "grad_norm": 0.7328972220420837, "learning_rate": 0.0004975833880958006, "loss": 1.1953, "step": 4480 }, { "epoch": 0.5391272989542012, "grad_norm": 0.485140860080719, "learning_rate": 0.0004975767698014992, "loss": 1.1117, "step": 4485 }, { "epoch": 0.5397283327322996, "grad_norm": 0.5293512344360352, "learning_rate": 0.0004975701425010709, "loss": 1.025, "step": 4490 }, { "epoch": 0.5403293665103979, "grad_norm": 0.7554160952568054, "learning_rate": 0.0004975635061947568, "loss": 1.0391, "step": 4495 }, { "epoch": 0.5409304002884963, "grad_norm": 0.41734662652015686, "learning_rate": 0.0004975568608827982, "loss": 0.8695, "step": 4500 }, { "epoch": 0.5415314340665945, "grad_norm": 0.39808404445648193, "learning_rate": 0.0004975502065654371, "loss": 1.1461, "step": 4505 }, { "epoch": 0.5421324678446928, "grad_norm": 0.6031619906425476, "learning_rate": 0.0004975435432429153, "loss": 1.2563, "step": 4510 }, { "epoch": 0.5427335016227912, "grad_norm": 0.7665862441062927, "learning_rate": 0.0004975368709154753, "loss": 1.2812, "step": 4515 }, { "epoch": 0.5433345354008895, "grad_norm": 0.7608307003974915, "learning_rate": 0.0004975301895833598, "loss": 1.1766, "step": 4520 }, { "epoch": 0.5439355691789879, "grad_norm": 0.6584916114807129, "learning_rate": 0.0004975234992468118, "loss": 1.0078, "step": 4525 }, { "epoch": 0.5445366029570862, "grad_norm": 0.5199896097183228, "learning_rate": 0.0004975167999060748, "loss": 1.2406, "step": 4530 }, { "epoch": 0.5451376367351846, "grad_norm": 0.5209721922874451, "learning_rate": 0.0004975100915613925, "loss": 1.0367, "step": 4535 }, { "epoch": 0.5457386705132828, "grad_norm": 0.6140447854995728, "learning_rate": 0.0004975033742130087, "loss": 0.9734, "step": 4540 }, { "epoch": 0.5463397042913811, "grad_norm": 0.5982779860496521, "learning_rate": 0.0004974966478611681, "loss": 1.182, "step": 4545 }, { "epoch": 0.5469407380694795, "grad_norm": 0.48514458537101746, "learning_rate": 0.0004974899125061151, "loss": 1.2273, "step": 4550 }, { "epoch": 0.5475417718475778, "grad_norm": 0.5001315474510193, "learning_rate": 0.0004974831681480949, "loss": 1.2789, "step": 4555 }, { "epoch": 0.5481428056256762, "grad_norm": 0.4541556239128113, "learning_rate": 0.0004974764147873526, "loss": 0.9297, "step": 4560 }, { "epoch": 0.5487438394037745, "grad_norm": 0.5090661644935608, "learning_rate": 0.0004974696524241342, "loss": 0.984, "step": 4565 }, { "epoch": 0.5493448731818729, "grad_norm": 0.49955224990844727, "learning_rate": 0.0004974628810586854, "loss": 1.0625, "step": 4570 }, { "epoch": 0.5499459069599711, "grad_norm": 0.6976388692855835, "learning_rate": 0.0004974561006912527, "loss": 1.1914, "step": 4575 }, { "epoch": 0.5505469407380695, "grad_norm": 0.4535270929336548, "learning_rate": 0.0004974493113220827, "loss": 0.7617, "step": 4580 }, { "epoch": 0.5511479745161678, "grad_norm": 0.8714334964752197, "learning_rate": 0.0004974425129514224, "loss": 0.9938, "step": 4585 }, { "epoch": 0.5517490082942661, "grad_norm": 0.49312853813171387, "learning_rate": 0.000497435705579519, "loss": 1.2945, "step": 4590 }, { "epoch": 0.5523500420723645, "grad_norm": 0.5688885450363159, "learning_rate": 0.0004974288892066203, "loss": 1.0527, "step": 4595 }, { "epoch": 0.5529510758504628, "grad_norm": 0.5999502539634705, "learning_rate": 0.0004974220638329741, "loss": 0.9891, "step": 4600 }, { "epoch": 0.5529510758504628, "eval_loss": 2.064257860183716, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2145, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 4600 }, { "epoch": 0.5535521096285612, "grad_norm": 0.5289875268936157, "learning_rate": 0.0004974152294588289, "loss": 1.1328, "step": 4605 }, { "epoch": 0.5541531434066594, "grad_norm": 0.7003397941589355, "learning_rate": 0.000497408386084433, "loss": 0.8477, "step": 4610 }, { "epoch": 0.5547541771847578, "grad_norm": 0.5111108422279358, "learning_rate": 0.0004974015337100357, "loss": 1.168, "step": 4615 }, { "epoch": 0.5553552109628561, "grad_norm": 0.4160408675670624, "learning_rate": 0.0004973946723358858, "loss": 1.3211, "step": 4620 }, { "epoch": 0.5559562447409544, "grad_norm": 0.5501172542572021, "learning_rate": 0.0004973878019622335, "loss": 1.3945, "step": 4625 }, { "epoch": 0.5565572785190528, "grad_norm": 0.4727821350097656, "learning_rate": 0.0004973809225893282, "loss": 1.3078, "step": 4630 }, { "epoch": 0.5571583122971511, "grad_norm": 0.5417948365211487, "learning_rate": 0.0004973740342174204, "loss": 1.1062, "step": 4635 }, { "epoch": 0.5577593460752495, "grad_norm": 0.39009690284729004, "learning_rate": 0.0004973671368467607, "loss": 0.7863, "step": 4640 }, { "epoch": 0.5583603798533477, "grad_norm": 0.6143473386764526, "learning_rate": 0.0004973602304776, "loss": 1.0836, "step": 4645 }, { "epoch": 0.5589614136314461, "grad_norm": 0.46591776609420776, "learning_rate": 0.0004973533151101893, "loss": 1.0797, "step": 4650 }, { "epoch": 0.5595624474095444, "grad_norm": 0.636924684047699, "learning_rate": 0.0004973463907447804, "loss": 0.9961, "step": 4655 }, { "epoch": 0.5601634811876427, "grad_norm": 0.7269019484519958, "learning_rate": 0.0004973394573816252, "loss": 1.0727, "step": 4660 }, { "epoch": 0.5607645149657411, "grad_norm": 0.5581966638565063, "learning_rate": 0.0004973325150209758, "loss": 0.832, "step": 4665 }, { "epoch": 0.5613655487438394, "grad_norm": 0.586113691329956, "learning_rate": 0.0004973255636630847, "loss": 1.132, "step": 4670 }, { "epoch": 0.5619665825219378, "grad_norm": 0.7046157121658325, "learning_rate": 0.0004973186033082049, "loss": 1.1008, "step": 4675 }, { "epoch": 0.562567616300036, "grad_norm": 0.6651023626327515, "learning_rate": 0.0004973116339565897, "loss": 1.1445, "step": 4680 }, { "epoch": 0.5631686500781344, "grad_norm": 0.6627675890922546, "learning_rate": 0.0004973046556084923, "loss": 1.2563, "step": 4685 }, { "epoch": 0.5637696838562327, "grad_norm": 0.4929293692111969, "learning_rate": 0.0004972976682641668, "loss": 1.1836, "step": 4690 }, { "epoch": 0.564370717634331, "grad_norm": 0.500619649887085, "learning_rate": 0.0004972906719238673, "loss": 1.1789, "step": 4695 }, { "epoch": 0.5649717514124294, "grad_norm": 0.456321120262146, "learning_rate": 0.0004972836665878483, "loss": 1.1375, "step": 4700 }, { "epoch": 0.5655727851905277, "grad_norm": 0.43033653497695923, "learning_rate": 0.0004972766522563648, "loss": 0.7367, "step": 4705 }, { "epoch": 0.5661738189686261, "grad_norm": 0.5689796805381775, "learning_rate": 0.0004972696289296715, "loss": 0.7828, "step": 4710 }, { "epoch": 0.5667748527467243, "grad_norm": 0.6046550273895264, "learning_rate": 0.0004972625966080244, "loss": 1.082, "step": 4715 }, { "epoch": 0.5673758865248227, "grad_norm": 0.6092924475669861, "learning_rate": 0.0004972555552916791, "loss": 1.0945, "step": 4720 }, { "epoch": 0.567976920302921, "grad_norm": 0.6022126078605652, "learning_rate": 0.0004972485049808918, "loss": 1.0648, "step": 4725 }, { "epoch": 0.5685779540810193, "grad_norm": 0.6475672721862793, "learning_rate": 0.0004972414456759189, "loss": 1.4375, "step": 4730 }, { "epoch": 0.5691789878591177, "grad_norm": 0.5474737882614136, "learning_rate": 0.0004972343773770172, "loss": 1.2367, "step": 4735 }, { "epoch": 0.569780021637216, "grad_norm": 0.5624226331710815, "learning_rate": 0.0004972273000844439, "loss": 0.9305, "step": 4740 }, { "epoch": 0.5703810554153144, "grad_norm": 0.5779100656509399, "learning_rate": 0.0004972202137984564, "loss": 0.9156, "step": 4745 }, { "epoch": 0.5709820891934126, "grad_norm": 0.8340283036231995, "learning_rate": 0.0004972131185193123, "loss": 1.3273, "step": 4750 }, { "epoch": 0.571583122971511, "grad_norm": 0.5648449063301086, "learning_rate": 0.0004972060142472702, "loss": 1.1727, "step": 4755 }, { "epoch": 0.5721841567496093, "grad_norm": 0.6421749591827393, "learning_rate": 0.0004971989009825879, "loss": 1.0184, "step": 4760 }, { "epoch": 0.5727851905277077, "grad_norm": 0.5583392381668091, "learning_rate": 0.0004971917787255247, "loss": 1.3945, "step": 4765 }, { "epoch": 0.573386224305806, "grad_norm": 0.6780814528465271, "learning_rate": 0.0004971846474763394, "loss": 1.2617, "step": 4770 }, { "epoch": 0.5739872580839043, "grad_norm": 0.6801932454109192, "learning_rate": 0.0004971775072352914, "loss": 1.127, "step": 4775 }, { "epoch": 0.5745882918620027, "grad_norm": 0.7682610154151917, "learning_rate": 0.0004971703580026407, "loss": 1.093, "step": 4780 }, { "epoch": 0.575189325640101, "grad_norm": 0.5586374998092651, "learning_rate": 0.000497163199778647, "loss": 1.1852, "step": 4785 }, { "epoch": 0.5757903594181993, "grad_norm": 0.574213445186615, "learning_rate": 0.000497156032563571, "loss": 1.257, "step": 4790 }, { "epoch": 0.5763913931962976, "grad_norm": 0.4758005738258362, "learning_rate": 0.0004971488563576732, "loss": 0.7699, "step": 4795 }, { "epoch": 0.576992426974396, "grad_norm": 0.7012805938720703, "learning_rate": 0.0004971416711612149, "loss": 0.9437, "step": 4800 }, { "epoch": 0.576992426974396, "eval_loss": 2.10546875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1867, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 4800 }, { "epoch": 0.5775934607524943, "grad_norm": 0.5702618360519409, "learning_rate": 0.0004971344769744572, "loss": 1.4156, "step": 4805 }, { "epoch": 0.5781944945305926, "grad_norm": 0.5781665444374084, "learning_rate": 0.000497127273797662, "loss": 1.1836, "step": 4810 }, { "epoch": 0.578795528308691, "grad_norm": 0.6150456070899963, "learning_rate": 0.0004971200616310914, "loss": 1.0434, "step": 4815 }, { "epoch": 0.5793965620867892, "grad_norm": 0.49010351300239563, "learning_rate": 0.0004971128404750075, "loss": 1.0312, "step": 4820 }, { "epoch": 0.5799975958648876, "grad_norm": 0.5255641341209412, "learning_rate": 0.000497105610329673, "loss": 1.1898, "step": 4825 }, { "epoch": 0.5805986296429859, "grad_norm": 0.5797913670539856, "learning_rate": 0.0004970983711953512, "loss": 0.9797, "step": 4830 }, { "epoch": 0.5811996634210843, "grad_norm": 0.3937300741672516, "learning_rate": 0.0004970911230723052, "loss": 1.1031, "step": 4835 }, { "epoch": 0.5818006971991826, "grad_norm": 0.5963497161865234, "learning_rate": 0.0004970838659607987, "loss": 1.0164, "step": 4840 }, { "epoch": 0.5824017309772809, "grad_norm": 0.7100831866264343, "learning_rate": 0.0004970765998610957, "loss": 0.943, "step": 4845 }, { "epoch": 0.5830027647553793, "grad_norm": 0.5534042119979858, "learning_rate": 0.0004970693247734606, "loss": 1.143, "step": 4850 }, { "epoch": 0.5836037985334775, "grad_norm": 0.8651660084724426, "learning_rate": 0.000497062040698158, "loss": 0.8562, "step": 4855 }, { "epoch": 0.5842048323115759, "grad_norm": 0.5462768077850342, "learning_rate": 0.0004970547476354528, "loss": 1.118, "step": 4860 }, { "epoch": 0.5848058660896742, "grad_norm": 0.5735900402069092, "learning_rate": 0.0004970474455856103, "loss": 1.0475, "step": 4865 }, { "epoch": 0.5854068998677726, "grad_norm": 0.7304772138595581, "learning_rate": 0.0004970401345488962, "loss": 0.9406, "step": 4870 }, { "epoch": 0.5860079336458709, "grad_norm": 0.8543664813041687, "learning_rate": 0.0004970328145255767, "loss": 1.1461, "step": 4875 }, { "epoch": 0.5866089674239692, "grad_norm": 0.47991877794265747, "learning_rate": 0.0004970254855159176, "loss": 1.2852, "step": 4880 }, { "epoch": 0.5872100012020676, "grad_norm": 0.5912867188453674, "learning_rate": 0.0004970181475201857, "loss": 1.0023, "step": 4885 }, { "epoch": 0.5878110349801658, "grad_norm": 0.598426878452301, "learning_rate": 0.0004970108005386482, "loss": 1.3953, "step": 4890 }, { "epoch": 0.5884120687582642, "grad_norm": 0.40192949771881104, "learning_rate": 0.0004970034445715719, "loss": 1.3156, "step": 4895 }, { "epoch": 0.5890131025363625, "grad_norm": 0.663162112236023, "learning_rate": 0.0004969960796192246, "loss": 1.0555, "step": 4900 }, { "epoch": 0.5896141363144609, "grad_norm": 0.5530306696891785, "learning_rate": 0.0004969887056818743, "loss": 1.3445, "step": 4905 }, { "epoch": 0.5902151700925592, "grad_norm": 0.5497463941574097, "learning_rate": 0.0004969813227597892, "loss": 1.3188, "step": 4910 }, { "epoch": 0.5908162038706576, "grad_norm": 0.40855851769447327, "learning_rate": 0.0004969739308532379, "loss": 1.1141, "step": 4915 }, { "epoch": 0.5914172376487559, "grad_norm": 0.6550401449203491, "learning_rate": 0.0004969665299624891, "loss": 1.0875, "step": 4920 }, { "epoch": 0.5920182714268541, "grad_norm": 0.6358505487442017, "learning_rate": 0.0004969591200878122, "loss": 1.4852, "step": 4925 }, { "epoch": 0.5926193052049525, "grad_norm": 0.45583033561706543, "learning_rate": 0.0004969517012294768, "loss": 1.1062, "step": 4930 }, { "epoch": 0.5932203389830508, "grad_norm": 0.6163007020950317, "learning_rate": 0.0004969442733877526, "loss": 1.1102, "step": 4935 }, { "epoch": 0.5938213727611492, "grad_norm": 0.7980563640594482, "learning_rate": 0.00049693683656291, "loss": 1.1547, "step": 4940 }, { "epoch": 0.5944224065392475, "grad_norm": 0.371162086725235, "learning_rate": 0.0004969293907552193, "loss": 1.2648, "step": 4945 }, { "epoch": 0.5950234403173459, "grad_norm": 0.4477928578853607, "learning_rate": 0.0004969219359649516, "loss": 1.193, "step": 4950 }, { "epoch": 0.5956244740954442, "grad_norm": 0.5350117683410645, "learning_rate": 0.0004969144721923779, "loss": 0.7547, "step": 4955 }, { "epoch": 0.5962255078735424, "grad_norm": 0.7265620231628418, "learning_rate": 0.0004969069994377697, "loss": 1.1359, "step": 4960 }, { "epoch": 0.5968265416516408, "grad_norm": 0.4857397973537445, "learning_rate": 0.0004968995177013991, "loss": 1.2906, "step": 4965 }, { "epoch": 0.5974275754297391, "grad_norm": 0.43424108624458313, "learning_rate": 0.000496892026983538, "loss": 1.1633, "step": 4970 }, { "epoch": 0.5980286092078375, "grad_norm": 0.5249903202056885, "learning_rate": 0.0004968845272844589, "loss": 0.8766, "step": 4975 }, { "epoch": 0.5986296429859358, "grad_norm": 0.591312825679779, "learning_rate": 0.0004968770186044347, "loss": 1.107, "step": 4980 }, { "epoch": 0.5992306767640342, "grad_norm": 0.5988960266113281, "learning_rate": 0.0004968695009437385, "loss": 1.0281, "step": 4985 }, { "epoch": 0.5998317105421325, "grad_norm": 0.5821179151535034, "learning_rate": 0.0004968619743026439, "loss": 1.0852, "step": 4990 }, { "epoch": 0.6004327443202307, "grad_norm": 0.5867944359779358, "learning_rate": 0.0004968544386814245, "loss": 1.1008, "step": 4995 }, { "epoch": 0.6010337780983291, "grad_norm": 0.461322158575058, "learning_rate": 0.0004968468940803546, "loss": 0.9305, "step": 5000 }, { "epoch": 0.6010337780983291, "eval_loss": 2.072265625, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2054, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 5000 }, { "epoch": 0.6016348118764274, "grad_norm": 0.47805655002593994, "learning_rate": 0.0004968393404997085, "loss": 0.8844, "step": 5005 }, { "epoch": 0.6022358456545258, "grad_norm": 0.6164557933807373, "learning_rate": 0.0004968317779397611, "loss": 1.0875, "step": 5010 }, { "epoch": 0.6028368794326241, "grad_norm": 0.6458466053009033, "learning_rate": 0.0004968242064007875, "loss": 0.8664, "step": 5015 }, { "epoch": 0.6034379132107225, "grad_norm": 0.5833812355995178, "learning_rate": 0.000496816625883063, "loss": 0.7797, "step": 5020 }, { "epoch": 0.6040389469888208, "grad_norm": 0.6610152721405029, "learning_rate": 0.0004968090363868634, "loss": 0.8711, "step": 5025 }, { "epoch": 0.604639980766919, "grad_norm": 0.5444918274879456, "learning_rate": 0.0004968014379124649, "loss": 1.1195, "step": 5030 }, { "epoch": 0.6052410145450174, "grad_norm": 0.7095007300376892, "learning_rate": 0.0004967938304601438, "loss": 1.2203, "step": 5035 }, { "epoch": 0.6058420483231157, "grad_norm": 0.5106276869773865, "learning_rate": 0.000496786214030177, "loss": 0.9242, "step": 5040 }, { "epoch": 0.6064430821012141, "grad_norm": 0.7541503310203552, "learning_rate": 0.0004967785886228414, "loss": 1.1953, "step": 5045 }, { "epoch": 0.6070441158793124, "grad_norm": 0.6698989868164062, "learning_rate": 0.0004967709542384142, "loss": 1.0977, "step": 5050 }, { "epoch": 0.6076451496574108, "grad_norm": 0.65958172082901, "learning_rate": 0.0004967633108771735, "loss": 1.1953, "step": 5055 }, { "epoch": 0.6082461834355091, "grad_norm": 0.4623226225376129, "learning_rate": 0.0004967556585393972, "loss": 1.0875, "step": 5060 }, { "epoch": 0.6088472172136074, "grad_norm": 0.5534173846244812, "learning_rate": 0.0004967479972253637, "loss": 0.8086, "step": 5065 }, { "epoch": 0.6094482509917057, "grad_norm": 0.5159333944320679, "learning_rate": 0.0004967403269353516, "loss": 1.0781, "step": 5070 }, { "epoch": 0.610049284769804, "grad_norm": 0.9555249810218811, "learning_rate": 0.00049673264766964, "loss": 1.3039, "step": 5075 }, { "epoch": 0.6106503185479024, "grad_norm": 0.5126526951789856, "learning_rate": 0.0004967249594285081, "loss": 1.3594, "step": 5080 }, { "epoch": 0.6112513523260007, "grad_norm": 0.4960355758666992, "learning_rate": 0.0004967172622122358, "loss": 1.0219, "step": 5085 }, { "epoch": 0.6118523861040991, "grad_norm": 0.6622259020805359, "learning_rate": 0.000496709556021103, "loss": 0.9633, "step": 5090 }, { "epoch": 0.6124534198821974, "grad_norm": 0.5355902910232544, "learning_rate": 0.0004967018408553901, "loss": 1.2641, "step": 5095 }, { "epoch": 0.6130544536602958, "grad_norm": 0.5535646080970764, "learning_rate": 0.0004966941167153776, "loss": 1.125, "step": 5100 }, { "epoch": 0.613655487438394, "grad_norm": 0.40969082713127136, "learning_rate": 0.0004966863836013465, "loss": 1.5328, "step": 5105 }, { "epoch": 0.6142565212164923, "grad_norm": 0.7018454074859619, "learning_rate": 0.0004966786415135783, "loss": 1.009, "step": 5110 }, { "epoch": 0.6148575549945907, "grad_norm": 0.7394018769264221, "learning_rate": 0.0004966708904523546, "loss": 1.4625, "step": 5115 }, { "epoch": 0.615458588772689, "grad_norm": 0.43013033270835876, "learning_rate": 0.0004966631304179571, "loss": 0.7812, "step": 5120 }, { "epoch": 0.6160596225507874, "grad_norm": 0.6355203986167908, "learning_rate": 0.0004966553614106684, "loss": 0.8352, "step": 5125 }, { "epoch": 0.6166606563288857, "grad_norm": 0.4331319332122803, "learning_rate": 0.0004966475834307708, "loss": 1.0063, "step": 5130 }, { "epoch": 0.6172616901069841, "grad_norm": 0.47516930103302, "learning_rate": 0.0004966397964785475, "loss": 1.1465, "step": 5135 }, { "epoch": 0.6178627238850823, "grad_norm": 0.46409985423088074, "learning_rate": 0.0004966320005542817, "loss": 1.3172, "step": 5140 }, { "epoch": 0.6184637576631806, "grad_norm": 0.5062604546546936, "learning_rate": 0.000496624195658257, "loss": 1.0352, "step": 5145 }, { "epoch": 0.619064791441279, "grad_norm": 0.5797644257545471, "learning_rate": 0.0004966163817907573, "loss": 1.043, "step": 5150 }, { "epoch": 0.6196658252193773, "grad_norm": 0.6194161176681519, "learning_rate": 0.0004966085589520668, "loss": 0.9633, "step": 5155 }, { "epoch": 0.6202668589974757, "grad_norm": 0.570642352104187, "learning_rate": 0.0004966007271424701, "loss": 1.6047, "step": 5160 }, { "epoch": 0.620867892775574, "grad_norm": 0.6968045830726624, "learning_rate": 0.0004965928863622522, "loss": 1.2914, "step": 5165 }, { "epoch": 0.6214689265536724, "grad_norm": 0.4688586890697479, "learning_rate": 0.0004965850366116982, "loss": 1.4031, "step": 5170 }, { "epoch": 0.6220699603317706, "grad_norm": 0.6257545351982117, "learning_rate": 0.0004965771778910936, "loss": 1.1641, "step": 5175 }, { "epoch": 0.6226709941098689, "grad_norm": 0.5326806902885437, "learning_rate": 0.0004965693102007245, "loss": 0.973, "step": 5180 }, { "epoch": 0.6232720278879673, "grad_norm": 0.5571873188018799, "learning_rate": 0.0004965614335408769, "loss": 0.9023, "step": 5185 }, { "epoch": 0.6238730616660656, "grad_norm": 0.67780601978302, "learning_rate": 0.0004965535479118374, "loss": 1.1531, "step": 5190 }, { "epoch": 0.624474095444164, "grad_norm": 0.5032584071159363, "learning_rate": 0.0004965456533138928, "loss": 1.3109, "step": 5195 }, { "epoch": 0.6250751292222623, "grad_norm": 0.5984988808631897, "learning_rate": 0.0004965377497473304, "loss": 1.3664, "step": 5200 }, { "epoch": 0.6250751292222623, "eval_loss": 1.9845702648162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2079, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 5200 }, { "epoch": 0.6256761630003607, "grad_norm": 0.6601026654243469, "learning_rate": 0.0004965298372124376, "loss": 1.0969, "step": 5205 }, { "epoch": 0.6262771967784589, "grad_norm": 0.6069896817207336, "learning_rate": 0.0004965219157095024, "loss": 0.7707, "step": 5210 }, { "epoch": 0.6268782305565572, "grad_norm": 0.5757645964622498, "learning_rate": 0.0004965139852388127, "loss": 0.9836, "step": 5215 }, { "epoch": 0.6274792643346556, "grad_norm": 0.658743143081665, "learning_rate": 0.0004965060458006573, "loss": 0.7297, "step": 5220 }, { "epoch": 0.6280802981127539, "grad_norm": 0.6893699765205383, "learning_rate": 0.0004964980973953247, "loss": 1.0586, "step": 5225 }, { "epoch": 0.6286813318908523, "grad_norm": 0.6678071022033691, "learning_rate": 0.0004964901400231043, "loss": 0.9922, "step": 5230 }, { "epoch": 0.6292823656689506, "grad_norm": 0.8199008703231812, "learning_rate": 0.0004964821736842854, "loss": 1.0664, "step": 5235 }, { "epoch": 0.629883399447049, "grad_norm": 0.516659140586853, "learning_rate": 0.0004964741983791578, "loss": 1.0359, "step": 5240 }, { "epoch": 0.6304844332251472, "grad_norm": 0.5256388187408447, "learning_rate": 0.0004964662141080117, "loss": 0.8906, "step": 5245 }, { "epoch": 0.6310854670032456, "grad_norm": 0.6450999975204468, "learning_rate": 0.0004964582208711375, "loss": 0.8363, "step": 5250 }, { "epoch": 0.6316865007813439, "grad_norm": 0.7471349239349365, "learning_rate": 0.000496450218668826, "loss": 1.198, "step": 5255 }, { "epoch": 0.6322875345594422, "grad_norm": 0.5109617114067078, "learning_rate": 0.0004964422075013682, "loss": 0.9227, "step": 5260 }, { "epoch": 0.6328885683375406, "grad_norm": 0.48837974667549133, "learning_rate": 0.0004964341873690557, "loss": 0.9469, "step": 5265 }, { "epoch": 0.6334896021156389, "grad_norm": 0.7013422250747681, "learning_rate": 0.0004964261582721801, "loss": 1.1594, "step": 5270 }, { "epoch": 0.6340906358937373, "grad_norm": 0.3042369484901428, "learning_rate": 0.0004964181202110335, "loss": 1.3625, "step": 5275 }, { "epoch": 0.6346916696718355, "grad_norm": 0.8467724919319153, "learning_rate": 0.0004964100731859084, "loss": 0.9328, "step": 5280 }, { "epoch": 0.6352927034499339, "grad_norm": 0.7622195482254028, "learning_rate": 0.0004964020171970974, "loss": 1.0672, "step": 5285 }, { "epoch": 0.6358937372280322, "grad_norm": 0.5378862023353577, "learning_rate": 0.0004963939522448936, "loss": 1.0277, "step": 5290 }, { "epoch": 0.6364947710061305, "grad_norm": 0.5819463133811951, "learning_rate": 0.0004963858783295905, "loss": 1.1039, "step": 5295 }, { "epoch": 0.6370958047842289, "grad_norm": 0.5175995826721191, "learning_rate": 0.0004963777954514816, "loss": 1.3875, "step": 5300 }, { "epoch": 0.6376968385623272, "grad_norm": 0.6846615076065063, "learning_rate": 0.000496369703610861, "loss": 0.8723, "step": 5305 }, { "epoch": 0.6382978723404256, "grad_norm": 0.6325473189353943, "learning_rate": 0.000496361602808023, "loss": 1.2469, "step": 5310 }, { "epoch": 0.6388989061185238, "grad_norm": 0.49080929160118103, "learning_rate": 0.0004963534930432625, "loss": 1.2977, "step": 5315 }, { "epoch": 0.6394999398966222, "grad_norm": 0.4832301437854767, "learning_rate": 0.0004963453743168743, "loss": 1.1719, "step": 5320 }, { "epoch": 0.6401009736747205, "grad_norm": 0.7145053744316101, "learning_rate": 0.000496337246629154, "loss": 1.2859, "step": 5325 }, { "epoch": 0.6407020074528188, "grad_norm": 0.5795300602912903, "learning_rate": 0.0004963291099803969, "loss": 0.9617, "step": 5330 }, { "epoch": 0.6413030412309172, "grad_norm": 0.9390627145767212, "learning_rate": 0.0004963209643708991, "loss": 1.1703, "step": 5335 }, { "epoch": 0.6419040750090155, "grad_norm": 0.7437942624092102, "learning_rate": 0.000496312809800957, "loss": 1.0711, "step": 5340 }, { "epoch": 0.6425051087871139, "grad_norm": 0.5893829464912415, "learning_rate": 0.0004963046462708673, "loss": 1.2203, "step": 5345 }, { "epoch": 0.6431061425652121, "grad_norm": 0.430411696434021, "learning_rate": 0.0004962964737809268, "loss": 1.068, "step": 5350 }, { "epoch": 0.6437071763433105, "grad_norm": 0.4798862040042877, "learning_rate": 0.0004962882923314329, "loss": 1.0195, "step": 5355 }, { "epoch": 0.6443082101214088, "grad_norm": 0.7981201410293579, "learning_rate": 0.0004962801019226833, "loss": 0.9211, "step": 5360 }, { "epoch": 0.6449092438995071, "grad_norm": 0.4801620841026306, "learning_rate": 0.0004962719025549757, "loss": 1.2961, "step": 5365 }, { "epoch": 0.6455102776776055, "grad_norm": 0.4748445451259613, "learning_rate": 0.0004962636942286086, "loss": 1.207, "step": 5370 }, { "epoch": 0.6461113114557038, "grad_norm": 0.5984495282173157, "learning_rate": 0.0004962554769438802, "loss": 0.9734, "step": 5375 }, { "epoch": 0.6467123452338022, "grad_norm": 0.6702393889427185, "learning_rate": 0.0004962472507010901, "loss": 1.0977, "step": 5380 }, { "epoch": 0.6473133790119004, "grad_norm": 0.5429012775421143, "learning_rate": 0.0004962390155005369, "loss": 1.0641, "step": 5385 }, { "epoch": 0.6479144127899988, "grad_norm": 0.39091694355010986, "learning_rate": 0.0004962307713425206, "loss": 0.9906, "step": 5390 }, { "epoch": 0.6485154465680971, "grad_norm": 0.4490283131599426, "learning_rate": 0.0004962225182273409, "loss": 1.1297, "step": 5395 }, { "epoch": 0.6491164803461954, "grad_norm": 0.6250773072242737, "learning_rate": 0.0004962142561552981, "loss": 0.8102, "step": 5400 }, { "epoch": 0.6491164803461954, "eval_loss": 2.007031202316284, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2151, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 5400 }, { "epoch": 0.6497175141242938, "grad_norm": 0.4846701920032501, "learning_rate": 0.0004962059851266926, "loss": 1.4289, "step": 5405 }, { "epoch": 0.6503185479023921, "grad_norm": 0.5710358023643494, "learning_rate": 0.0004961977051418253, "loss": 1.0961, "step": 5410 }, { "epoch": 0.6509195816804905, "grad_norm": 0.46397438645362854, "learning_rate": 0.0004961894162009977, "loss": 0.9297, "step": 5415 }, { "epoch": 0.6515206154585887, "grad_norm": 0.5259897708892822, "learning_rate": 0.0004961811183045111, "loss": 1.0891, "step": 5420 }, { "epoch": 0.6521216492366871, "grad_norm": 0.44706469774246216, "learning_rate": 0.0004961728114526672, "loss": 1.175, "step": 5425 }, { "epoch": 0.6527226830147854, "grad_norm": 0.6101322174072266, "learning_rate": 0.0004961644956457685, "loss": 1.143, "step": 5430 }, { "epoch": 0.6533237167928838, "grad_norm": 0.544548511505127, "learning_rate": 0.0004961561708841173, "loss": 0.9246, "step": 5435 }, { "epoch": 0.6539247505709821, "grad_norm": 0.7637187242507935, "learning_rate": 0.0004961478371680165, "loss": 1.3031, "step": 5440 }, { "epoch": 0.6545257843490804, "grad_norm": 0.5239999294281006, "learning_rate": 0.0004961394944977692, "loss": 1.0242, "step": 5445 }, { "epoch": 0.6551268181271788, "grad_norm": 0.6028966307640076, "learning_rate": 0.000496131142873679, "loss": 1.2961, "step": 5450 }, { "epoch": 0.655727851905277, "grad_norm": 0.9021722674369812, "learning_rate": 0.0004961227822960495, "loss": 1.0543, "step": 5455 }, { "epoch": 0.6563288856833754, "grad_norm": 0.8960506916046143, "learning_rate": 0.0004961144127651851, "loss": 1.4555, "step": 5460 }, { "epoch": 0.6569299194614737, "grad_norm": 0.6960546374320984, "learning_rate": 0.0004961060342813901, "loss": 0.7801, "step": 5465 }, { "epoch": 0.6575309532395721, "grad_norm": 0.7192292213439941, "learning_rate": 0.0004960976468449692, "loss": 0.9641, "step": 5470 }, { "epoch": 0.6581319870176704, "grad_norm": 0.605060875415802, "learning_rate": 0.0004960892504562277, "loss": 1.6898, "step": 5475 }, { "epoch": 0.6587330207957687, "grad_norm": 0.5543851256370544, "learning_rate": 0.000496080845115471, "loss": 1.4141, "step": 5480 }, { "epoch": 0.6593340545738671, "grad_norm": 0.6614383459091187, "learning_rate": 0.0004960724308230047, "loss": 1.0016, "step": 5485 }, { "epoch": 0.6599350883519653, "grad_norm": 0.5792709589004517, "learning_rate": 0.0004960640075791351, "loss": 1.2906, "step": 5490 }, { "epoch": 0.6605361221300637, "grad_norm": 0.509619951248169, "learning_rate": 0.0004960555753841685, "loss": 0.9914, "step": 5495 }, { "epoch": 0.661137155908162, "grad_norm": 0.6596083045005798, "learning_rate": 0.0004960471342384116, "loss": 0.8969, "step": 5500 }, { "epoch": 0.6617381896862604, "grad_norm": 0.43772363662719727, "learning_rate": 0.0004960386841421716, "loss": 1.3281, "step": 5505 }, { "epoch": 0.6623392234643587, "grad_norm": 0.551668643951416, "learning_rate": 0.0004960302250957558, "loss": 0.9844, "step": 5510 }, { "epoch": 0.662940257242457, "grad_norm": 0.5039249658584595, "learning_rate": 0.0004960217570994719, "loss": 1.1273, "step": 5515 }, { "epoch": 0.6635412910205554, "grad_norm": 0.48881009221076965, "learning_rate": 0.0004960132801536281, "loss": 1.3953, "step": 5520 }, { "epoch": 0.6641423247986536, "grad_norm": 0.7355161905288696, "learning_rate": 0.0004960047942585324, "loss": 0.868, "step": 5525 }, { "epoch": 0.664743358576752, "grad_norm": 0.38211506605148315, "learning_rate": 0.0004959962994144939, "loss": 1.1547, "step": 5530 }, { "epoch": 0.6653443923548503, "grad_norm": 0.5978872776031494, "learning_rate": 0.0004959877956218213, "loss": 1.1145, "step": 5535 }, { "epoch": 0.6659454261329487, "grad_norm": 0.6254252791404724, "learning_rate": 0.0004959792828808241, "loss": 1.1023, "step": 5540 }, { "epoch": 0.666546459911047, "grad_norm": 0.5093739032745361, "learning_rate": 0.0004959707611918121, "loss": 1.1742, "step": 5545 }, { "epoch": 0.6671474936891453, "grad_norm": 0.5325815677642822, "learning_rate": 0.0004959622305550951, "loss": 1.1641, "step": 5550 }, { "epoch": 0.6677485274672437, "grad_norm": 0.5326328873634338, "learning_rate": 0.0004959536909709834, "loss": 0.8969, "step": 5555 }, { "epoch": 0.668349561245342, "grad_norm": 0.45378610491752625, "learning_rate": 0.0004959451424397879, "loss": 1.0051, "step": 5560 }, { "epoch": 0.6689505950234403, "grad_norm": 0.6395484209060669, "learning_rate": 0.0004959365849618192, "loss": 1.0594, "step": 5565 }, { "epoch": 0.6695516288015386, "grad_norm": 0.6629273891448975, "learning_rate": 0.0004959280185373888, "loss": 1.182, "step": 5570 }, { "epoch": 0.670152662579637, "grad_norm": 0.6001206040382385, "learning_rate": 0.0004959194431668084, "loss": 0.825, "step": 5575 }, { "epoch": 0.6707536963577353, "grad_norm": 0.6217541098594666, "learning_rate": 0.0004959108588503898, "loss": 1.0355, "step": 5580 }, { "epoch": 0.6713547301358337, "grad_norm": 0.5813589096069336, "learning_rate": 0.0004959022655884453, "loss": 0.9965, "step": 5585 }, { "epoch": 0.671955763913932, "grad_norm": 0.4870590269565582, "learning_rate": 0.0004958936633812876, "loss": 1.1906, "step": 5590 }, { "epoch": 0.6725567976920303, "grad_norm": 0.46450790762901306, "learning_rate": 0.0004958850522292295, "loss": 0.9266, "step": 5595 }, { "epoch": 0.6731578314701286, "grad_norm": 0.7803342938423157, "learning_rate": 0.0004958764321325843, "loss": 1.1867, "step": 5600 }, { "epoch": 0.6731578314701286, "eval_loss": 1.960351586341858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1953, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 5600 }, { "epoch": 0.6737588652482269, "grad_norm": 0.39002323150634766, "learning_rate": 0.0004958678030916655, "loss": 0.9242, "step": 5605 }, { "epoch": 0.6743598990263253, "grad_norm": 0.6863328814506531, "learning_rate": 0.0004958591651067872, "loss": 0.9203, "step": 5610 }, { "epoch": 0.6749609328044236, "grad_norm": 0.5728829503059387, "learning_rate": 0.0004958505181782635, "loss": 1.1797, "step": 5615 }, { "epoch": 0.675561966582522, "grad_norm": 0.5186361074447632, "learning_rate": 0.0004958418623064088, "loss": 0.7125, "step": 5620 }, { "epoch": 0.6761630003606203, "grad_norm": 0.5343161225318909, "learning_rate": 0.0004958331974915382, "loss": 1.0008, "step": 5625 }, { "epoch": 0.6767640341387186, "grad_norm": 0.6222845315933228, "learning_rate": 0.0004958245237339669, "loss": 1.0906, "step": 5630 }, { "epoch": 0.6773650679168169, "grad_norm": 0.5673485398292542, "learning_rate": 0.0004958158410340103, "loss": 1.0516, "step": 5635 }, { "epoch": 0.6779661016949152, "grad_norm": 0.6129093170166016, "learning_rate": 0.0004958071493919842, "loss": 0.932, "step": 5640 }, { "epoch": 0.6785671354730136, "grad_norm": 0.43740183115005493, "learning_rate": 0.0004957984488082049, "loss": 1.4109, "step": 5645 }, { "epoch": 0.6791681692511119, "grad_norm": 0.632262647151947, "learning_rate": 0.0004957897392829889, "loss": 1.3547, "step": 5650 }, { "epoch": 0.6797692030292103, "grad_norm": 0.5351587533950806, "learning_rate": 0.0004957810208166531, "loss": 1.1938, "step": 5655 }, { "epoch": 0.6803702368073086, "grad_norm": 0.6918035745620728, "learning_rate": 0.0004957722934095145, "loss": 0.9172, "step": 5660 }, { "epoch": 0.6809712705854069, "grad_norm": 0.5031920671463013, "learning_rate": 0.0004957635570618906, "loss": 0.9242, "step": 5665 }, { "epoch": 0.6815723043635052, "grad_norm": 0.8015581965446472, "learning_rate": 0.0004957548117740993, "loss": 0.884, "step": 5670 }, { "epoch": 0.6821733381416035, "grad_norm": 0.5075511336326599, "learning_rate": 0.0004957460575464586, "loss": 1.0656, "step": 5675 }, { "epoch": 0.6827743719197019, "grad_norm": 0.43553194403648376, "learning_rate": 0.000495737294379287, "loss": 1.1762, "step": 5680 }, { "epoch": 0.6833754056978002, "grad_norm": 0.48835834860801697, "learning_rate": 0.0004957285222729034, "loss": 1.0547, "step": 5685 }, { "epoch": 0.6839764394758986, "grad_norm": 0.5027529001235962, "learning_rate": 0.0004957197412276267, "loss": 1.1742, "step": 5690 }, { "epoch": 0.6845774732539969, "grad_norm": 0.581287682056427, "learning_rate": 0.0004957109512437766, "loss": 0.9242, "step": 5695 }, { "epoch": 0.6851785070320952, "grad_norm": 0.5454129576683044, "learning_rate": 0.0004957021523216725, "loss": 1.1492, "step": 5700 }, { "epoch": 0.6857795408101935, "grad_norm": 0.5323740839958191, "learning_rate": 0.0004956933444616347, "loss": 0.8586, "step": 5705 }, { "epoch": 0.6863805745882918, "grad_norm": 0.8149203658103943, "learning_rate": 0.0004956845276639836, "loss": 1.15, "step": 5710 }, { "epoch": 0.6869816083663902, "grad_norm": 0.6589764952659607, "learning_rate": 0.00049567570192904, "loss": 1.132, "step": 5715 }, { "epoch": 0.6875826421444885, "grad_norm": 0.5598886609077454, "learning_rate": 0.0004956668672571247, "loss": 0.9125, "step": 5720 }, { "epoch": 0.6881836759225869, "grad_norm": 0.4387410581111908, "learning_rate": 0.0004956580236485593, "loss": 0.8129, "step": 5725 }, { "epoch": 0.6887847097006852, "grad_norm": 1.346904993057251, "learning_rate": 0.0004956491711036654, "loss": 1.3328, "step": 5730 }, { "epoch": 0.6893857434787835, "grad_norm": 0.597332775592804, "learning_rate": 0.0004956403096227651, "loss": 1.2812, "step": 5735 }, { "epoch": 0.6899867772568818, "grad_norm": 0.4443131685256958, "learning_rate": 0.0004956314392061808, "loss": 0.9059, "step": 5740 }, { "epoch": 0.6905878110349801, "grad_norm": 0.547560453414917, "learning_rate": 0.000495622559854235, "loss": 1.1344, "step": 5745 }, { "epoch": 0.6911888448130785, "grad_norm": 0.5721062421798706, "learning_rate": 0.0004956136715672509, "loss": 1.1805, "step": 5750 }, { "epoch": 0.6917898785911768, "grad_norm": 0.5476487278938293, "learning_rate": 0.0004956047743455517, "loss": 1.2406, "step": 5755 }, { "epoch": 0.6923909123692752, "grad_norm": 0.5772802829742432, "learning_rate": 0.0004955958681894611, "loss": 1.1797, "step": 5760 }, { "epoch": 0.6929919461473735, "grad_norm": 0.5622345805168152, "learning_rate": 0.000495586953099303, "loss": 1.0258, "step": 5765 }, { "epoch": 0.6935929799254719, "grad_norm": 0.3974035680294037, "learning_rate": 0.0004955780290754018, "loss": 0.9348, "step": 5770 }, { "epoch": 0.6941940137035701, "grad_norm": 0.7566075921058655, "learning_rate": 0.0004955690961180822, "loss": 1.2188, "step": 5775 }, { "epoch": 0.6947950474816684, "grad_norm": 0.5623620748519897, "learning_rate": 0.0004955601542276691, "loss": 1.0195, "step": 5780 }, { "epoch": 0.6953960812597668, "grad_norm": 0.5877131819725037, "learning_rate": 0.0004955512034044876, "loss": 0.7508, "step": 5785 }, { "epoch": 0.6959971150378651, "grad_norm": 0.35780635476112366, "learning_rate": 0.0004955422436488635, "loss": 1.0418, "step": 5790 }, { "epoch": 0.6965981488159635, "grad_norm": 0.6338667869567871, "learning_rate": 0.0004955332749611227, "loss": 0.866, "step": 5795 }, { "epoch": 0.6971991825940618, "grad_norm": 0.6346995830535889, "learning_rate": 0.0004955242973415915, "loss": 0.9914, "step": 5800 }, { "epoch": 0.6971991825940618, "eval_loss": 1.979101538658142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1923, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 5800 }, { "epoch": 0.6978002163721602, "grad_norm": 0.6727893948554993, "learning_rate": 0.0004955153107905964, "loss": 1.1859, "step": 5805 }, { "epoch": 0.6984012501502584, "grad_norm": 0.4857117235660553, "learning_rate": 0.0004955063153084642, "loss": 1.5766, "step": 5810 }, { "epoch": 0.6990022839283567, "grad_norm": 0.6355534791946411, "learning_rate": 0.0004954973108955223, "loss": 1.1781, "step": 5815 }, { "epoch": 0.6996033177064551, "grad_norm": 0.5181488990783691, "learning_rate": 0.0004954882975520983, "loss": 1.2094, "step": 5820 }, { "epoch": 0.7002043514845534, "grad_norm": 0.3698882460594177, "learning_rate": 0.0004954792752785198, "loss": 1.0766, "step": 5825 }, { "epoch": 0.7008053852626518, "grad_norm": 0.6260622143745422, "learning_rate": 0.0004954702440751153, "loss": 0.9172, "step": 5830 }, { "epoch": 0.7014064190407501, "grad_norm": 0.5361654162406921, "learning_rate": 0.0004954612039422132, "loss": 1.0484, "step": 5835 }, { "epoch": 0.7020074528188485, "grad_norm": 0.4724903404712677, "learning_rate": 0.0004954521548801424, "loss": 1.225, "step": 5840 }, { "epoch": 0.7026084865969467, "grad_norm": 0.5695177912712097, "learning_rate": 0.000495443096889232, "loss": 0.9734, "step": 5845 }, { "epoch": 0.703209520375045, "grad_norm": 0.8220880627632141, "learning_rate": 0.0004954340299698116, "loss": 0.9305, "step": 5850 }, { "epoch": 0.7038105541531434, "grad_norm": 0.46409645676612854, "learning_rate": 0.000495424954122211, "loss": 1.5328, "step": 5855 }, { "epoch": 0.7044115879312417, "grad_norm": 0.5940005779266357, "learning_rate": 0.0004954158693467603, "loss": 0.9586, "step": 5860 }, { "epoch": 0.7050126217093401, "grad_norm": 0.6850906014442444, "learning_rate": 0.00049540677564379, "loss": 1.2406, "step": 5865 }, { "epoch": 0.7056136554874384, "grad_norm": 0.6025176048278809, "learning_rate": 0.0004953976730136309, "loss": 1.0621, "step": 5870 }, { "epoch": 0.7062146892655368, "grad_norm": 0.5903462767601013, "learning_rate": 0.0004953885614566142, "loss": 0.8078, "step": 5875 }, { "epoch": 0.706815723043635, "grad_norm": 0.8402755856513977, "learning_rate": 0.0004953794409730713, "loss": 0.8309, "step": 5880 }, { "epoch": 0.7074167568217333, "grad_norm": 0.525725245475769, "learning_rate": 0.0004953703115633339, "loss": 1.1781, "step": 5885 }, { "epoch": 0.7080177905998317, "grad_norm": 0.5770952701568604, "learning_rate": 0.0004953611732277342, "loss": 1.3102, "step": 5890 }, { "epoch": 0.70861882437793, "grad_norm": 0.5640763640403748, "learning_rate": 0.0004953520259666046, "loss": 1.1016, "step": 5895 }, { "epoch": 0.7092198581560284, "grad_norm": 0.6585139036178589, "learning_rate": 0.0004953428697802778, "loss": 0.9719, "step": 5900 }, { "epoch": 0.7098208919341267, "grad_norm": 0.8663554191589355, "learning_rate": 0.0004953337046690871, "loss": 0.9508, "step": 5905 }, { "epoch": 0.7104219257122251, "grad_norm": 0.7086400985717773, "learning_rate": 0.0004953245306333656, "loss": 0.8703, "step": 5910 }, { "epoch": 0.7110229594903233, "grad_norm": 0.36062702536582947, "learning_rate": 0.0004953153476734472, "loss": 0.8773, "step": 5915 }, { "epoch": 0.7116239932684217, "grad_norm": 0.5696423053741455, "learning_rate": 0.0004953061557896658, "loss": 1.15, "step": 5920 }, { "epoch": 0.71222502704652, "grad_norm": 0.3699018955230713, "learning_rate": 0.000495296954982356, "loss": 1.0984, "step": 5925 }, { "epoch": 0.7128260608246183, "grad_norm": 0.5031812787055969, "learning_rate": 0.0004952877452518523, "loss": 0.802, "step": 5930 }, { "epoch": 0.7134270946027167, "grad_norm": 0.5342919230461121, "learning_rate": 0.0004952785265984898, "loss": 1.0727, "step": 5935 }, { "epoch": 0.714028128380815, "grad_norm": 0.7044137120246887, "learning_rate": 0.0004952692990226039, "loss": 0.7984, "step": 5940 }, { "epoch": 0.7146291621589134, "grad_norm": 0.5580726861953735, "learning_rate": 0.0004952600625245301, "loss": 1.1875, "step": 5945 }, { "epoch": 0.7152301959370116, "grad_norm": 0.5382014513015747, "learning_rate": 0.0004952508171046046, "loss": 0.9633, "step": 5950 }, { "epoch": 0.71583122971511, "grad_norm": 0.6365616321563721, "learning_rate": 0.0004952415627631636, "loss": 1.0867, "step": 5955 }, { "epoch": 0.7164322634932083, "grad_norm": 0.572903573513031, "learning_rate": 0.0004952322995005438, "loss": 0.9461, "step": 5960 }, { "epoch": 0.7170332972713066, "grad_norm": 0.5480600595474243, "learning_rate": 0.0004952230273170822, "loss": 1.0547, "step": 5965 }, { "epoch": 0.717634331049405, "grad_norm": 0.6455062031745911, "learning_rate": 0.000495213746213116, "loss": 0.8516, "step": 5970 }, { "epoch": 0.7182353648275033, "grad_norm": 0.4712236225605011, "learning_rate": 0.0004952044561889829, "loss": 0.8742, "step": 5975 }, { "epoch": 0.7188363986056017, "grad_norm": 0.4829672873020172, "learning_rate": 0.0004951951572450207, "loss": 0.9465, "step": 5980 }, { "epoch": 0.7194374323837, "grad_norm": 0.5550187826156616, "learning_rate": 0.000495185849381568, "loss": 0.8234, "step": 5985 }, { "epoch": 0.7200384661617983, "grad_norm": 0.4478211998939514, "learning_rate": 0.000495176532598963, "loss": 1.3477, "step": 5990 }, { "epoch": 0.7206394999398966, "grad_norm": 0.5679304599761963, "learning_rate": 0.0004951672068975448, "loss": 1.0531, "step": 5995 }, { "epoch": 0.7212405337179949, "grad_norm": 0.4058151841163635, "learning_rate": 0.0004951578722776526, "loss": 0.9563, "step": 6000 }, { "epoch": 0.7212405337179949, "eval_loss": 1.9480469226837158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1671, "eval_samples_per_second": 4.55, "eval_steps_per_second": 1.137, "step": 6000 }, { "epoch": 0.7218415674960933, "grad_norm": 0.5920779705047607, "learning_rate": 0.0004951485287396261, "loss": 0.9551, "step": 6005 }, { "epoch": 0.7224426012741916, "grad_norm": 0.8503037095069885, "learning_rate": 0.0004951391762838051, "loss": 1.0383, "step": 6010 }, { "epoch": 0.72304363505229, "grad_norm": 0.6085087656974792, "learning_rate": 0.0004951298149105298, "loss": 0.9805, "step": 6015 }, { "epoch": 0.7236446688303882, "grad_norm": 0.5195494890213013, "learning_rate": 0.0004951204446201407, "loss": 1.0531, "step": 6020 }, { "epoch": 0.7242457026084866, "grad_norm": 0.4968985617160797, "learning_rate": 0.0004951110654129786, "loss": 1.2793, "step": 6025 }, { "epoch": 0.7248467363865849, "grad_norm": 0.63482666015625, "learning_rate": 0.000495101677289385, "loss": 0.9906, "step": 6030 }, { "epoch": 0.7254477701646832, "grad_norm": 0.9290293455123901, "learning_rate": 0.0004950922802497011, "loss": 1.2523, "step": 6035 }, { "epoch": 0.7260488039427816, "grad_norm": 0.7131595611572266, "learning_rate": 0.0004950828742942689, "loss": 0.8672, "step": 6040 }, { "epoch": 0.7266498377208799, "grad_norm": 0.7611229419708252, "learning_rate": 0.0004950734594234306, "loss": 0.9555, "step": 6045 }, { "epoch": 0.7272508714989783, "grad_norm": 0.5345115661621094, "learning_rate": 0.0004950640356375283, "loss": 1.0609, "step": 6050 }, { "epoch": 0.7278519052770765, "grad_norm": 0.41503068804740906, "learning_rate": 0.0004950546029369054, "loss": 1.2922, "step": 6055 }, { "epoch": 0.7284529390551749, "grad_norm": 0.539353609085083, "learning_rate": 0.0004950451613219046, "loss": 1.0805, "step": 6060 }, { "epoch": 0.7290539728332732, "grad_norm": 0.5110329389572144, "learning_rate": 0.0004950357107928696, "loss": 0.6949, "step": 6065 }, { "epoch": 0.7296550066113715, "grad_norm": 0.41400015354156494, "learning_rate": 0.0004950262513501439, "loss": 0.7719, "step": 6070 }, { "epoch": 0.7302560403894699, "grad_norm": 0.5604378581047058, "learning_rate": 0.000495016782994072, "loss": 0.7719, "step": 6075 }, { "epoch": 0.7308570741675682, "grad_norm": 0.6464505195617676, "learning_rate": 0.0004950073057249979, "loss": 0.8914, "step": 6080 }, { "epoch": 0.7314581079456666, "grad_norm": 0.7905951738357544, "learning_rate": 0.0004949978195432668, "loss": 0.8078, "step": 6085 }, { "epoch": 0.7320591417237649, "grad_norm": 0.4895305335521698, "learning_rate": 0.0004949883244492234, "loss": 1.0867, "step": 6090 }, { "epoch": 0.7326601755018632, "grad_norm": 0.8767439723014832, "learning_rate": 0.0004949788204432133, "loss": 1.118, "step": 6095 }, { "epoch": 0.7332612092799615, "grad_norm": 0.4090585708618164, "learning_rate": 0.0004949693075255822, "loss": 1.175, "step": 6100 }, { "epoch": 0.7338622430580599, "grad_norm": 0.6413434743881226, "learning_rate": 0.0004949597856966761, "loss": 0.9797, "step": 6105 }, { "epoch": 0.7344632768361582, "grad_norm": 0.4368354082107544, "learning_rate": 0.0004949502549568414, "loss": 1.2039, "step": 6110 }, { "epoch": 0.7350643106142565, "grad_norm": 0.5256956815719604, "learning_rate": 0.0004949407153064249, "loss": 1.2367, "step": 6115 }, { "epoch": 0.7356653443923549, "grad_norm": 0.5167532563209534, "learning_rate": 0.0004949311667457735, "loss": 1.243, "step": 6120 }, { "epoch": 0.7362663781704532, "grad_norm": 0.45178014039993286, "learning_rate": 0.0004949216092752347, "loss": 1.3469, "step": 6125 }, { "epoch": 0.7368674119485515, "grad_norm": 0.5148994326591492, "learning_rate": 0.0004949120428951559, "loss": 0.8805, "step": 6130 }, { "epoch": 0.7374684457266498, "grad_norm": 0.6306822896003723, "learning_rate": 0.0004949024676058853, "loss": 0.782, "step": 6135 }, { "epoch": 0.7380694795047482, "grad_norm": 0.3867519199848175, "learning_rate": 0.0004948928834077712, "loss": 1.3062, "step": 6140 }, { "epoch": 0.7386705132828465, "grad_norm": 0.46258437633514404, "learning_rate": 0.0004948832903011623, "loss": 1.0398, "step": 6145 }, { "epoch": 0.7392715470609448, "grad_norm": 0.5857958793640137, "learning_rate": 0.0004948736882864074, "loss": 0.9008, "step": 6150 }, { "epoch": 0.7398725808390432, "grad_norm": 0.6315101385116577, "learning_rate": 0.000494864077363856, "loss": 1.3992, "step": 6155 }, { "epoch": 0.7404736146171415, "grad_norm": 0.8710448741912842, "learning_rate": 0.0004948544575338575, "loss": 1.0953, "step": 6160 }, { "epoch": 0.7410746483952398, "grad_norm": 0.46991831064224243, "learning_rate": 0.0004948448287967619, "loss": 0.9688, "step": 6165 }, { "epoch": 0.7416756821733381, "grad_norm": 0.5042182207107544, "learning_rate": 0.0004948351911529198, "loss": 0.9371, "step": 6170 }, { "epoch": 0.7422767159514365, "grad_norm": 0.5035787224769592, "learning_rate": 0.0004948255446026813, "loss": 0.943, "step": 6175 }, { "epoch": 0.7428777497295348, "grad_norm": 0.7247022390365601, "learning_rate": 0.0004948158891463975, "loss": 1.1156, "step": 6180 }, { "epoch": 0.7434787835076331, "grad_norm": 0.6135059595108032, "learning_rate": 0.0004948062247844196, "loss": 1.4766, "step": 6185 }, { "epoch": 0.7440798172857315, "grad_norm": 0.6409685015678406, "learning_rate": 0.0004947965515170994, "loss": 1.2414, "step": 6190 }, { "epoch": 0.7446808510638298, "grad_norm": 0.5241739749908447, "learning_rate": 0.0004947868693447884, "loss": 0.9391, "step": 6195 }, { "epoch": 0.7452818848419281, "grad_norm": 0.6567567586898804, "learning_rate": 0.0004947771782678391, "loss": 0.7797, "step": 6200 }, { "epoch": 0.7452818848419281, "eval_loss": 1.9669921398162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1972, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 6200 }, { "epoch": 0.7458829186200264, "grad_norm": 0.7450057864189148, "learning_rate": 0.000494767478286604, "loss": 1.0414, "step": 6205 }, { "epoch": 0.7464839523981248, "grad_norm": 0.7323078513145447, "learning_rate": 0.000494757769401436, "loss": 1.0836, "step": 6210 }, { "epoch": 0.7470849861762231, "grad_norm": 0.6413726210594177, "learning_rate": 0.000494748051612688, "loss": 0.9234, "step": 6215 }, { "epoch": 0.7476860199543214, "grad_norm": 0.6522002816200256, "learning_rate": 0.0004947383249207138, "loss": 1.2234, "step": 6220 }, { "epoch": 0.7482870537324198, "grad_norm": 0.7493404746055603, "learning_rate": 0.000494728589325867, "loss": 1.1875, "step": 6225 }, { "epoch": 0.748888087510518, "grad_norm": 0.6021580100059509, "learning_rate": 0.0004947188448285021, "loss": 0.9313, "step": 6230 }, { "epoch": 0.7494891212886164, "grad_norm": 0.5463228225708008, "learning_rate": 0.0004947090914289731, "loss": 0.9648, "step": 6235 }, { "epoch": 0.7500901550667147, "grad_norm": 0.37141141295433044, "learning_rate": 0.0004946993291276351, "loss": 1.2734, "step": 6240 }, { "epoch": 0.7506911888448131, "grad_norm": 0.47802188992500305, "learning_rate": 0.0004946895579248432, "loss": 0.9426, "step": 6245 }, { "epoch": 0.7512922226229114, "grad_norm": 0.6864299178123474, "learning_rate": 0.0004946797778209529, "loss": 1.4195, "step": 6250 }, { "epoch": 0.7518932564010098, "grad_norm": 0.7433485388755798, "learning_rate": 0.0004946699888163198, "loss": 1.1281, "step": 6255 }, { "epoch": 0.7524942901791081, "grad_norm": 0.773834764957428, "learning_rate": 0.0004946601909113002, "loss": 1.1102, "step": 6260 }, { "epoch": 0.7530953239572064, "grad_norm": 0.5566007494926453, "learning_rate": 0.0004946503841062503, "loss": 1.4867, "step": 6265 }, { "epoch": 0.7536963577353047, "grad_norm": 0.6335606575012207, "learning_rate": 0.000494640568401527, "loss": 1.0359, "step": 6270 }, { "epoch": 0.754297391513403, "grad_norm": 0.524815559387207, "learning_rate": 0.0004946307437974873, "loss": 1.2031, "step": 6275 }, { "epoch": 0.7548984252915014, "grad_norm": 0.5556230545043945, "learning_rate": 0.0004946209102944886, "loss": 1.0094, "step": 6280 }, { "epoch": 0.7554994590695997, "grad_norm": 0.604891836643219, "learning_rate": 0.0004946110678928886, "loss": 1.0289, "step": 6285 }, { "epoch": 0.7561004928476981, "grad_norm": 0.5057227611541748, "learning_rate": 0.0004946012165930455, "loss": 1.2984, "step": 6290 }, { "epoch": 0.7567015266257964, "grad_norm": 0.5134652256965637, "learning_rate": 0.0004945913563953174, "loss": 0.7477, "step": 6295 }, { "epoch": 0.7573025604038947, "grad_norm": 0.5149896144866943, "learning_rate": 0.0004945814873000632, "loss": 1.1414, "step": 6300 }, { "epoch": 0.757903594181993, "grad_norm": 0.6168910264968872, "learning_rate": 0.0004945716093076416, "loss": 1.3906, "step": 6305 }, { "epoch": 0.7585046279600913, "grad_norm": 0.5330949425697327, "learning_rate": 0.0004945617224184123, "loss": 1.1125, "step": 6310 }, { "epoch": 0.7591056617381897, "grad_norm": 0.7349749803543091, "learning_rate": 0.0004945518266327349, "loss": 0.932, "step": 6315 }, { "epoch": 0.759706695516288, "grad_norm": 0.7780669927597046, "learning_rate": 0.0004945419219509692, "loss": 1.0938, "step": 6320 }, { "epoch": 0.7603077292943864, "grad_norm": 0.5112941265106201, "learning_rate": 0.0004945320083734755, "loss": 0.9844, "step": 6325 }, { "epoch": 0.7609087630724847, "grad_norm": 0.6444630026817322, "learning_rate": 0.0004945220859006147, "loss": 1.159, "step": 6330 }, { "epoch": 0.761509796850583, "grad_norm": 0.7215844988822937, "learning_rate": 0.0004945121545327474, "loss": 1.0555, "step": 6335 }, { "epoch": 0.7621108306286813, "grad_norm": 0.6337628960609436, "learning_rate": 0.0004945022142702352, "loss": 0.9406, "step": 6340 }, { "epoch": 0.7627118644067796, "grad_norm": 0.5415433645248413, "learning_rate": 0.0004944922651134395, "loss": 1.0148, "step": 6345 }, { "epoch": 0.763312898184878, "grad_norm": 0.4147307872772217, "learning_rate": 0.0004944823070627222, "loss": 0.8797, "step": 6350 }, { "epoch": 0.7639139319629763, "grad_norm": 0.4711264967918396, "learning_rate": 0.0004944723401184456, "loss": 1.0352, "step": 6355 }, { "epoch": 0.7645149657410747, "grad_norm": 0.6182102560997009, "learning_rate": 0.0004944623642809723, "loss": 0.9797, "step": 6360 }, { "epoch": 0.765115999519173, "grad_norm": 0.5168282389640808, "learning_rate": 0.0004944523795506652, "loss": 1.1133, "step": 6365 }, { "epoch": 0.7657170332972713, "grad_norm": 0.6675767302513123, "learning_rate": 0.0004944423859278875, "loss": 0.7422, "step": 6370 }, { "epoch": 0.7663180670753696, "grad_norm": 0.5700129270553589, "learning_rate": 0.0004944323834130027, "loss": 1.3008, "step": 6375 }, { "epoch": 0.7669191008534679, "grad_norm": 0.4999338686466217, "learning_rate": 0.0004944223720063746, "loss": 0.8391, "step": 6380 }, { "epoch": 0.7675201346315663, "grad_norm": 0.5673699378967285, "learning_rate": 0.0004944123517083676, "loss": 1.0242, "step": 6385 }, { "epoch": 0.7681211684096646, "grad_norm": 0.6721099019050598, "learning_rate": 0.000494402322519346, "loss": 1.5437, "step": 6390 }, { "epoch": 0.768722202187763, "grad_norm": 0.43901994824409485, "learning_rate": 0.0004943922844396748, "loss": 1.1172, "step": 6395 }, { "epoch": 0.7693232359658613, "grad_norm": 0.5509328246116638, "learning_rate": 0.000494382237469719, "loss": 0.7242, "step": 6400 }, { "epoch": 0.7693232359658613, "eval_loss": 1.9357421398162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1965, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 6400 }, { "epoch": 0.7699242697439596, "grad_norm": 0.5436112284660339, "learning_rate": 0.0004943721816098442, "loss": 0.8125, "step": 6405 }, { "epoch": 0.770525303522058, "grad_norm": 0.6365628838539124, "learning_rate": 0.0004943621168604161, "loss": 0.8051, "step": 6410 }, { "epoch": 0.7711263373001562, "grad_norm": 0.5547435879707336, "learning_rate": 0.0004943520432218009, "loss": 1.1773, "step": 6415 }, { "epoch": 0.7717273710782546, "grad_norm": 0.5300225615501404, "learning_rate": 0.0004943419606943651, "loss": 1.1414, "step": 6420 }, { "epoch": 0.7723284048563529, "grad_norm": 0.41618016362190247, "learning_rate": 0.0004943318692784752, "loss": 0.7164, "step": 6425 }, { "epoch": 0.7729294386344513, "grad_norm": 0.5764371752738953, "learning_rate": 0.0004943217689744988, "loss": 0.9301, "step": 6430 }, { "epoch": 0.7735304724125496, "grad_norm": 0.6483813524246216, "learning_rate": 0.0004943116597828028, "loss": 0.8297, "step": 6435 }, { "epoch": 0.774131506190648, "grad_norm": 0.8637044429779053, "learning_rate": 0.0004943015417037553, "loss": 1.0195, "step": 6440 }, { "epoch": 0.7747325399687462, "grad_norm": 0.5276628136634827, "learning_rate": 0.0004942914147377241, "loss": 1.0102, "step": 6445 }, { "epoch": 0.7753335737468445, "grad_norm": 0.6736142635345459, "learning_rate": 0.0004942812788850777, "loss": 1.0266, "step": 6450 }, { "epoch": 0.7759346075249429, "grad_norm": 0.6700810790061951, "learning_rate": 0.000494271134146185, "loss": 1.0906, "step": 6455 }, { "epoch": 0.7765356413030412, "grad_norm": 0.36965152621269226, "learning_rate": 0.0004942609805214148, "loss": 1.1977, "step": 6460 }, { "epoch": 0.7771366750811396, "grad_norm": 0.9631960391998291, "learning_rate": 0.0004942508180111365, "loss": 0.7117, "step": 6465 }, { "epoch": 0.7777377088592379, "grad_norm": 0.6460843682289124, "learning_rate": 0.0004942406466157198, "loss": 1.0977, "step": 6470 }, { "epoch": 0.7783387426373363, "grad_norm": 0.44630011916160583, "learning_rate": 0.0004942304663355347, "loss": 1.0008, "step": 6475 }, { "epoch": 0.7789397764154345, "grad_norm": 0.5840423107147217, "learning_rate": 0.0004942202771709516, "loss": 1.2016, "step": 6480 }, { "epoch": 0.7795408101935328, "grad_norm": 0.7917207479476929, "learning_rate": 0.0004942100791223411, "loss": 1.2242, "step": 6485 }, { "epoch": 0.7801418439716312, "grad_norm": 0.5441203117370605, "learning_rate": 0.0004941998721900741, "loss": 1.0992, "step": 6490 }, { "epoch": 0.7807428777497295, "grad_norm": 0.48659607768058777, "learning_rate": 0.000494189656374522, "loss": 0.9187, "step": 6495 }, { "epoch": 0.7813439115278279, "grad_norm": 0.5477159023284912, "learning_rate": 0.0004941794316760565, "loss": 1.3281, "step": 6500 }, { "epoch": 0.7819449453059262, "grad_norm": 0.7956238389015198, "learning_rate": 0.0004941691980950493, "loss": 1.0984, "step": 6505 }, { "epoch": 0.7825459790840246, "grad_norm": 0.5686649680137634, "learning_rate": 0.0004941589556318729, "loss": 1.0703, "step": 6510 }, { "epoch": 0.7831470128621228, "grad_norm": 0.34903407096862793, "learning_rate": 0.0004941487042868998, "loss": 0.7379, "step": 6515 }, { "epoch": 0.7837480466402211, "grad_norm": 0.7311768531799316, "learning_rate": 0.0004941384440605029, "loss": 0.9281, "step": 6520 }, { "epoch": 0.7843490804183195, "grad_norm": 0.6680164933204651, "learning_rate": 0.0004941281749530554, "loss": 0.8383, "step": 6525 }, { "epoch": 0.7849501141964178, "grad_norm": 0.4928357005119324, "learning_rate": 0.0004941178969649309, "loss": 1.0305, "step": 6530 }, { "epoch": 0.7855511479745162, "grad_norm": 0.5012532472610474, "learning_rate": 0.0004941076100965033, "loss": 0.9531, "step": 6535 }, { "epoch": 0.7861521817526145, "grad_norm": 0.31756752729415894, "learning_rate": 0.0004940973143481469, "loss": 1.1867, "step": 6540 }, { "epoch": 0.7867532155307129, "grad_norm": 0.5039989948272705, "learning_rate": 0.000494087009720236, "loss": 0.7156, "step": 6545 }, { "epoch": 0.7873542493088111, "grad_norm": 0.3640342652797699, "learning_rate": 0.0004940766962131457, "loss": 1.0105, "step": 6550 }, { "epoch": 0.7879552830869094, "grad_norm": 0.3968428075313568, "learning_rate": 0.000494066373827251, "loss": 0.9219, "step": 6555 }, { "epoch": 0.7885563168650078, "grad_norm": 0.44752684235572815, "learning_rate": 0.0004940560425629275, "loss": 1.1621, "step": 6560 }, { "epoch": 0.7891573506431061, "grad_norm": 0.46121954917907715, "learning_rate": 0.0004940457024205509, "loss": 0.8449, "step": 6565 }, { "epoch": 0.7897583844212045, "grad_norm": 0.6438422203063965, "learning_rate": 0.0004940353534004974, "loss": 1.0266, "step": 6570 }, { "epoch": 0.7903594181993028, "grad_norm": 0.6750705242156982, "learning_rate": 0.0004940249955031436, "loss": 0.8523, "step": 6575 }, { "epoch": 0.7909604519774012, "grad_norm": 0.6813221573829651, "learning_rate": 0.0004940146287288661, "loss": 0.9297, "step": 6580 }, { "epoch": 0.7915614857554994, "grad_norm": 0.774385929107666, "learning_rate": 0.0004940042530780422, "loss": 0.8969, "step": 6585 }, { "epoch": 0.7921625195335978, "grad_norm": 0.6112849712371826, "learning_rate": 0.0004939938685510491, "loss": 1.1547, "step": 6590 }, { "epoch": 0.7927635533116961, "grad_norm": 0.562152087688446, "learning_rate": 0.0004939834751482648, "loss": 1.075, "step": 6595 }, { "epoch": 0.7933645870897944, "grad_norm": 0.5512693524360657, "learning_rate": 0.0004939730728700672, "loss": 1.0562, "step": 6600 }, { "epoch": 0.7933645870897944, "eval_loss": 1.974218726158142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1913, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 6600 }, { "epoch": 0.7939656208678928, "grad_norm": 0.6438180208206177, "learning_rate": 0.0004939626617168348, "loss": 0.9742, "step": 6605 }, { "epoch": 0.7945666546459911, "grad_norm": 0.6227211952209473, "learning_rate": 0.0004939522416889463, "loss": 1.2336, "step": 6610 }, { "epoch": 0.7951676884240895, "grad_norm": 0.5621742010116577, "learning_rate": 0.0004939418127867807, "loss": 0.9906, "step": 6615 }, { "epoch": 0.7957687222021878, "grad_norm": 0.42686790227890015, "learning_rate": 0.0004939313750107175, "loss": 0.9117, "step": 6620 }, { "epoch": 0.7963697559802861, "grad_norm": 0.48856663703918457, "learning_rate": 0.0004939209283611364, "loss": 1.5156, "step": 6625 }, { "epoch": 0.7969707897583844, "grad_norm": 0.4700355529785156, "learning_rate": 0.0004939104728384171, "loss": 0.7562, "step": 6630 }, { "epoch": 0.7975718235364827, "grad_norm": 0.5643510818481445, "learning_rate": 0.0004939000084429404, "loss": 1.1695, "step": 6635 }, { "epoch": 0.7981728573145811, "grad_norm": 0.5543480515480042, "learning_rate": 0.0004938895351750867, "loss": 0.8992, "step": 6640 }, { "epoch": 0.7987738910926794, "grad_norm": 0.49549707770347595, "learning_rate": 0.000493879053035237, "loss": 0.9477, "step": 6645 }, { "epoch": 0.7993749248707778, "grad_norm": 0.5950228571891785, "learning_rate": 0.0004938685620237726, "loss": 1.2164, "step": 6650 }, { "epoch": 0.799975958648876, "grad_norm": 0.525283932685852, "learning_rate": 0.0004938580621410753, "loss": 1.1055, "step": 6655 }, { "epoch": 0.8005769924269744, "grad_norm": 0.6012600660324097, "learning_rate": 0.0004938475533875268, "loss": 0.9945, "step": 6660 }, { "epoch": 0.8011780262050727, "grad_norm": 0.4771900177001953, "learning_rate": 0.0004938370357635097, "loss": 0.9461, "step": 6665 }, { "epoch": 0.801779059983171, "grad_norm": 0.7490991950035095, "learning_rate": 0.0004938265092694062, "loss": 0.9125, "step": 6670 }, { "epoch": 0.8023800937612694, "grad_norm": 0.9589657187461853, "learning_rate": 0.0004938159739055996, "loss": 0.8398, "step": 6675 }, { "epoch": 0.8029811275393677, "grad_norm": 0.9547123312950134, "learning_rate": 0.0004938054296724729, "loss": 0.9523, "step": 6680 }, { "epoch": 0.8035821613174661, "grad_norm": 0.7088879942893982, "learning_rate": 0.0004937948765704098, "loss": 1.3687, "step": 6685 }, { "epoch": 0.8041831950955644, "grad_norm": 0.533961832523346, "learning_rate": 0.0004937843145997941, "loss": 0.7945, "step": 6690 }, { "epoch": 0.8047842288736627, "grad_norm": 0.43692246079444885, "learning_rate": 0.00049377374376101, "loss": 1.3234, "step": 6695 }, { "epoch": 0.805385262651761, "grad_norm": 0.7477303743362427, "learning_rate": 0.0004937631640544423, "loss": 0.8465, "step": 6700 }, { "epoch": 0.8059862964298593, "grad_norm": 0.48803970217704773, "learning_rate": 0.0004937525754804754, "loss": 1.2703, "step": 6705 }, { "epoch": 0.8065873302079577, "grad_norm": 0.7295065522193909, "learning_rate": 0.0004937419780394948, "loss": 0.9242, "step": 6710 }, { "epoch": 0.807188363986056, "grad_norm": 0.7123075723648071, "learning_rate": 0.000493731371731886, "loss": 0.9289, "step": 6715 }, { "epoch": 0.8077893977641544, "grad_norm": 0.7304734587669373, "learning_rate": 0.0004937207565580348, "loss": 0.9039, "step": 6720 }, { "epoch": 0.8083904315422527, "grad_norm": 0.9955569505691528, "learning_rate": 0.0004937101325183272, "loss": 1.007, "step": 6725 }, { "epoch": 0.808991465320351, "grad_norm": 0.8564704656600952, "learning_rate": 0.0004936994996131499, "loss": 0.9313, "step": 6730 }, { "epoch": 0.8095924990984493, "grad_norm": 0.5352545976638794, "learning_rate": 0.0004936888578428895, "loss": 1.057, "step": 6735 }, { "epoch": 0.8101935328765477, "grad_norm": 0.8784704804420471, "learning_rate": 0.0004936782072079332, "loss": 1.2234, "step": 6740 }, { "epoch": 0.810794566654646, "grad_norm": 0.5149188041687012, "learning_rate": 0.0004936675477086685, "loss": 1.0672, "step": 6745 }, { "epoch": 0.8113956004327443, "grad_norm": 0.6401064991950989, "learning_rate": 0.000493656879345483, "loss": 1.0156, "step": 6750 }, { "epoch": 0.8119966342108427, "grad_norm": 0.500579297542572, "learning_rate": 0.000493646202118765, "loss": 1.2781, "step": 6755 }, { "epoch": 0.812597667988941, "grad_norm": 0.6466786861419678, "learning_rate": 0.0004936355160289026, "loss": 0.9258, "step": 6760 }, { "epoch": 0.8131987017670393, "grad_norm": 0.7493544220924377, "learning_rate": 0.0004936248210762849, "loss": 0.868, "step": 6765 }, { "epoch": 0.8137997355451376, "grad_norm": 0.60451740026474, "learning_rate": 0.0004936141172613007, "loss": 1.1617, "step": 6770 }, { "epoch": 0.814400769323236, "grad_norm": 0.529296875, "learning_rate": 0.0004936034045843395, "loss": 1.1227, "step": 6775 }, { "epoch": 0.8150018031013343, "grad_norm": 0.643993616104126, "learning_rate": 0.0004935926830457909, "loss": 1.3633, "step": 6780 }, { "epoch": 0.8156028368794326, "grad_norm": 0.5737214684486389, "learning_rate": 0.0004935819526460449, "loss": 1.1219, "step": 6785 }, { "epoch": 0.816203870657531, "grad_norm": 0.3480701446533203, "learning_rate": 0.0004935712133854919, "loss": 0.9266, "step": 6790 }, { "epoch": 0.8168049044356293, "grad_norm": 0.8722716569900513, "learning_rate": 0.0004935604652645227, "loss": 0.9453, "step": 6795 }, { "epoch": 0.8174059382137276, "grad_norm": 0.6065110564231873, "learning_rate": 0.0004935497082835281, "loss": 0.9945, "step": 6800 }, { "epoch": 0.8174059382137276, "eval_loss": 1.945898413658142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2185, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 6800 }, { "epoch": 0.8180069719918259, "grad_norm": 0.6503706574440002, "learning_rate": 0.0004935389424428994, "loss": 0.9922, "step": 6805 }, { "epoch": 0.8186080057699243, "grad_norm": 0.6121295094490051, "learning_rate": 0.0004935281677430283, "loss": 1.3852, "step": 6810 }, { "epoch": 0.8192090395480226, "grad_norm": 0.4303320348262787, "learning_rate": 0.0004935173841843067, "loss": 1.1453, "step": 6815 }, { "epoch": 0.8198100733261209, "grad_norm": 0.7867786884307861, "learning_rate": 0.000493506591767127, "loss": 0.868, "step": 6820 }, { "epoch": 0.8204111071042193, "grad_norm": 0.4826887249946594, "learning_rate": 0.0004934957904918816, "loss": 1.0484, "step": 6825 }, { "epoch": 0.8210121408823176, "grad_norm": 0.7813975811004639, "learning_rate": 0.0004934849803589636, "loss": 0.7844, "step": 6830 }, { "epoch": 0.821613174660416, "grad_norm": 0.47828686237335205, "learning_rate": 0.0004934741613687662, "loss": 1.5148, "step": 6835 }, { "epoch": 0.8222142084385142, "grad_norm": 0.7920680046081543, "learning_rate": 0.0004934633335216828, "loss": 1.1469, "step": 6840 }, { "epoch": 0.8228152422166126, "grad_norm": 1.0974485874176025, "learning_rate": 0.0004934524968181075, "loss": 0.9727, "step": 6845 }, { "epoch": 0.8234162759947109, "grad_norm": 0.7024199366569519, "learning_rate": 0.0004934416512584345, "loss": 1.1938, "step": 6850 }, { "epoch": 0.8240173097728092, "grad_norm": 0.6169788837432861, "learning_rate": 0.0004934307968430582, "loss": 1.1281, "step": 6855 }, { "epoch": 0.8246183435509076, "grad_norm": 0.5251727104187012, "learning_rate": 0.0004934199335723734, "loss": 0.6023, "step": 6860 }, { "epoch": 0.8252193773290059, "grad_norm": 0.6100950837135315, "learning_rate": 0.0004934090614467755, "loss": 0.8187, "step": 6865 }, { "epoch": 0.8258204111071042, "grad_norm": 0.68628990650177, "learning_rate": 0.00049339818046666, "loss": 0.9266, "step": 6870 }, { "epoch": 0.8264214448852025, "grad_norm": 0.6150997281074524, "learning_rate": 0.0004933872906324224, "loss": 0.9812, "step": 6875 }, { "epoch": 0.8270224786633009, "grad_norm": 0.5543625950813293, "learning_rate": 0.0004933763919444591, "loss": 1.1148, "step": 6880 }, { "epoch": 0.8276235124413992, "grad_norm": 0.5212106108665466, "learning_rate": 0.0004933654844031665, "loss": 0.8719, "step": 6885 }, { "epoch": 0.8282245462194975, "grad_norm": 0.6294976472854614, "learning_rate": 0.0004933545680089414, "loss": 1.2664, "step": 6890 }, { "epoch": 0.8288255799975959, "grad_norm": 0.5645679235458374, "learning_rate": 0.0004933436427621809, "loss": 1.0367, "step": 6895 }, { "epoch": 0.8294266137756942, "grad_norm": 0.6843478679656982, "learning_rate": 0.0004933327086632824, "loss": 1.1313, "step": 6900 }, { "epoch": 0.8300276475537925, "grad_norm": 0.6626003384590149, "learning_rate": 0.0004933217657126438, "loss": 1.1641, "step": 6905 }, { "epoch": 0.8306286813318908, "grad_norm": 0.5868483781814575, "learning_rate": 0.0004933108139106629, "loss": 0.8938, "step": 6910 }, { "epoch": 0.8312297151099892, "grad_norm": 0.542267918586731, "learning_rate": 0.0004932998532577382, "loss": 0.9781, "step": 6915 }, { "epoch": 0.8318307488880875, "grad_norm": 0.909706711769104, "learning_rate": 0.0004932888837542686, "loss": 1.0219, "step": 6920 }, { "epoch": 0.8324317826661859, "grad_norm": 0.41403937339782715, "learning_rate": 0.000493277905400653, "loss": 1.1586, "step": 6925 }, { "epoch": 0.8330328164442842, "grad_norm": 0.5755292773246765, "learning_rate": 0.0004932669181972907, "loss": 0.9539, "step": 6930 }, { "epoch": 0.8336338502223825, "grad_norm": 0.6402360200881958, "learning_rate": 0.0004932559221445813, "loss": 1.2836, "step": 6935 }, { "epoch": 0.8342348840004808, "grad_norm": 0.6989447474479675, "learning_rate": 0.0004932449172429251, "loss": 0.8547, "step": 6940 }, { "epoch": 0.8348359177785791, "grad_norm": 0.7221708297729492, "learning_rate": 0.0004932339034927222, "loss": 0.8523, "step": 6945 }, { "epoch": 0.8354369515566775, "grad_norm": 0.5851629972457886, "learning_rate": 0.0004932228808943735, "loss": 1.3805, "step": 6950 }, { "epoch": 0.8360379853347758, "grad_norm": 0.6313065886497498, "learning_rate": 0.0004932118494482795, "loss": 1.0703, "step": 6955 }, { "epoch": 0.8366390191128742, "grad_norm": 0.7215156555175781, "learning_rate": 0.000493200809154842, "loss": 0.7953, "step": 6960 }, { "epoch": 0.8372400528909725, "grad_norm": 0.5459794998168945, "learning_rate": 0.0004931897600144624, "loss": 1.0133, "step": 6965 }, { "epoch": 0.8378410866690708, "grad_norm": 0.6738685965538025, "learning_rate": 0.0004931787020275426, "loss": 0.7379, "step": 6970 }, { "epoch": 0.8384421204471691, "grad_norm": 0.48789772391319275, "learning_rate": 0.0004931676351944848, "loss": 0.7891, "step": 6975 }, { "epoch": 0.8390431542252674, "grad_norm": 0.8837066292762756, "learning_rate": 0.0004931565595156917, "loss": 0.9375, "step": 6980 }, { "epoch": 0.8396441880033658, "grad_norm": 0.7249802350997925, "learning_rate": 0.0004931454749915662, "loss": 1.0891, "step": 6985 }, { "epoch": 0.8402452217814641, "grad_norm": 0.50520920753479, "learning_rate": 0.0004931343816225116, "loss": 0.9656, "step": 6990 }, { "epoch": 0.8408462555595625, "grad_norm": 0.4688724875450134, "learning_rate": 0.0004931232794089313, "loss": 1.1488, "step": 6995 }, { "epoch": 0.8414472893376608, "grad_norm": 0.49341389536857605, "learning_rate": 0.000493112168351229, "loss": 0.9695, "step": 7000 }, { "epoch": 0.8414472893376608, "eval_loss": 1.953125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2169, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 7000 }, { "epoch": 0.8420483231157591, "grad_norm": 0.6272909045219421, "learning_rate": 0.0004931010484498093, "loss": 1.1516, "step": 7005 }, { "epoch": 0.8426493568938574, "grad_norm": 0.7011892795562744, "learning_rate": 0.0004930899197050764, "loss": 0.8078, "step": 7010 }, { "epoch": 0.8432503906719557, "grad_norm": 0.6199313402175903, "learning_rate": 0.0004930787821174353, "loss": 0.9609, "step": 7015 }, { "epoch": 0.8438514244500541, "grad_norm": 0.6322630643844604, "learning_rate": 0.0004930676356872909, "loss": 1.1258, "step": 7020 }, { "epoch": 0.8444524582281524, "grad_norm": 0.6029014587402344, "learning_rate": 0.0004930564804150492, "loss": 1.018, "step": 7025 }, { "epoch": 0.8450534920062508, "grad_norm": 0.5072593092918396, "learning_rate": 0.0004930453163011154, "loss": 1.0633, "step": 7030 }, { "epoch": 0.8456545257843491, "grad_norm": 0.5095769762992859, "learning_rate": 0.000493034143345896, "loss": 0.9047, "step": 7035 }, { "epoch": 0.8462555595624474, "grad_norm": 0.45354166626930237, "learning_rate": 0.0004930229615497972, "loss": 0.8516, "step": 7040 }, { "epoch": 0.8468565933405457, "grad_norm": 0.4816553592681885, "learning_rate": 0.000493011770913226, "loss": 1.5977, "step": 7045 }, { "epoch": 0.847457627118644, "grad_norm": 0.5562716722488403, "learning_rate": 0.0004930005714365893, "loss": 0.9703, "step": 7050 }, { "epoch": 0.8480586608967424, "grad_norm": 0.6599060297012329, "learning_rate": 0.0004929893631202946, "loss": 1.1109, "step": 7055 }, { "epoch": 0.8486596946748407, "grad_norm": 0.6208095550537109, "learning_rate": 0.0004929781459647494, "loss": 0.7117, "step": 7060 }, { "epoch": 0.8492607284529391, "grad_norm": 0.6619543433189392, "learning_rate": 0.0004929669199703622, "loss": 0.9266, "step": 7065 }, { "epoch": 0.8498617622310374, "grad_norm": 0.5926734805107117, "learning_rate": 0.000492955685137541, "loss": 0.8957, "step": 7070 }, { "epoch": 0.8504627960091358, "grad_norm": 0.521955668926239, "learning_rate": 0.0004929444414666946, "loss": 0.9031, "step": 7075 }, { "epoch": 0.851063829787234, "grad_norm": 0.5727534294128418, "learning_rate": 0.0004929331889582321, "loss": 1.1406, "step": 7080 }, { "epoch": 0.8516648635653323, "grad_norm": 0.7056397795677185, "learning_rate": 0.0004929219276125626, "loss": 0.9, "step": 7085 }, { "epoch": 0.8522658973434307, "grad_norm": 0.5122407078742981, "learning_rate": 0.000492910657430096, "loss": 1.05, "step": 7090 }, { "epoch": 0.852866931121529, "grad_norm": 0.5585248470306396, "learning_rate": 0.0004928993784112421, "loss": 1.3078, "step": 7095 }, { "epoch": 0.8534679648996274, "grad_norm": 0.8451035618782043, "learning_rate": 0.0004928880905564114, "loss": 1.2547, "step": 7100 }, { "epoch": 0.8540689986777257, "grad_norm": 0.5938951373100281, "learning_rate": 0.0004928767938660142, "loss": 1.1281, "step": 7105 }, { "epoch": 0.8546700324558241, "grad_norm": 0.42980799078941345, "learning_rate": 0.0004928654883404618, "loss": 1.1625, "step": 7110 }, { "epoch": 0.8552710662339224, "grad_norm": 0.62227463722229, "learning_rate": 0.0004928541739801653, "loss": 1.2316, "step": 7115 }, { "epoch": 0.8558721000120206, "grad_norm": 0.5308201313018799, "learning_rate": 0.0004928428507855363, "loss": 1.1164, "step": 7120 }, { "epoch": 0.856473133790119, "grad_norm": 0.48991772532463074, "learning_rate": 0.0004928315187569865, "loss": 0.9305, "step": 7125 }, { "epoch": 0.8570741675682173, "grad_norm": 0.6767470836639404, "learning_rate": 0.0004928201778949285, "loss": 0.9047, "step": 7130 }, { "epoch": 0.8576752013463157, "grad_norm": 0.6267699003219604, "learning_rate": 0.0004928088281997746, "loss": 1.132, "step": 7135 }, { "epoch": 0.858276235124414, "grad_norm": 0.5477824211120605, "learning_rate": 0.0004927974696719377, "loss": 1.1012, "step": 7140 }, { "epoch": 0.8588772689025124, "grad_norm": 0.3394117057323456, "learning_rate": 0.000492786102311831, "loss": 0.668, "step": 7145 }, { "epoch": 0.8594783026806107, "grad_norm": 0.5851861834526062, "learning_rate": 0.0004927747261198681, "loss": 1.35, "step": 7150 }, { "epoch": 0.8600793364587089, "grad_norm": 0.5231513381004333, "learning_rate": 0.0004927633410964627, "loss": 1.0281, "step": 7155 }, { "epoch": 0.8606803702368073, "grad_norm": 0.532753586769104, "learning_rate": 0.0004927519472420291, "loss": 0.932, "step": 7160 }, { "epoch": 0.8612814040149056, "grad_norm": 0.7795905470848083, "learning_rate": 0.0004927405445569817, "loss": 1.3422, "step": 7165 }, { "epoch": 0.861882437793004, "grad_norm": 0.5528318881988525, "learning_rate": 0.0004927291330417352, "loss": 1.3664, "step": 7170 }, { "epoch": 0.8624834715711023, "grad_norm": 0.518151581287384, "learning_rate": 0.0004927177126967049, "loss": 0.925, "step": 7175 }, { "epoch": 0.8630845053492007, "grad_norm": 0.5630781054496765, "learning_rate": 0.0004927062835223061, "loss": 0.7414, "step": 7180 }, { "epoch": 0.863685539127299, "grad_norm": 0.667807936668396, "learning_rate": 0.0004926948455189546, "loss": 1.0164, "step": 7185 }, { "epoch": 0.8642865729053972, "grad_norm": 0.6349160075187683, "learning_rate": 0.0004926833986870665, "loss": 0.8102, "step": 7190 }, { "epoch": 0.8648876066834956, "grad_norm": 0.6104588508605957, "learning_rate": 0.0004926719430270582, "loss": 1.0063, "step": 7195 }, { "epoch": 0.8654886404615939, "grad_norm": 0.6458672881126404, "learning_rate": 0.0004926604785393464, "loss": 0.9891, "step": 7200 }, { "epoch": 0.8654886404615939, "eval_loss": 1.987207055091858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2186, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 7200 }, { "epoch": 0.8660896742396923, "grad_norm": 0.852249801158905, "learning_rate": 0.0004926490052243481, "loss": 1.1508, "step": 7205 }, { "epoch": 0.8666907080177906, "grad_norm": 0.6328768730163574, "learning_rate": 0.0004926375230824808, "loss": 0.8727, "step": 7210 }, { "epoch": 0.867291741795889, "grad_norm": 0.615470290184021, "learning_rate": 0.0004926260321141623, "loss": 1.2406, "step": 7215 }, { "epoch": 0.8678927755739873, "grad_norm": 0.5798203349113464, "learning_rate": 0.0004926145323198101, "loss": 1.0117, "step": 7220 }, { "epoch": 0.8684938093520855, "grad_norm": 0.4167349338531494, "learning_rate": 0.0004926030236998431, "loss": 1.0406, "step": 7225 }, { "epoch": 0.8690948431301839, "grad_norm": 0.6174272894859314, "learning_rate": 0.0004925915062546796, "loss": 1.2125, "step": 7230 }, { "epoch": 0.8696958769082822, "grad_norm": 0.7083294987678528, "learning_rate": 0.0004925799799847386, "loss": 0.9492, "step": 7235 }, { "epoch": 0.8702969106863806, "grad_norm": 0.6833418607711792, "learning_rate": 0.0004925684448904396, "loss": 1.1977, "step": 7240 }, { "epoch": 0.8708979444644789, "grad_norm": 0.6882427334785461, "learning_rate": 0.0004925569009722019, "loss": 1.0266, "step": 7245 }, { "epoch": 0.8714989782425773, "grad_norm": 0.4113350808620453, "learning_rate": 0.0004925453482304458, "loss": 0.9461, "step": 7250 }, { "epoch": 0.8721000120206756, "grad_norm": 0.7593322992324829, "learning_rate": 0.0004925337866655914, "loss": 0.8812, "step": 7255 }, { "epoch": 0.8727010457987739, "grad_norm": 0.6697939038276672, "learning_rate": 0.0004925222162780591, "loss": 0.8914, "step": 7260 }, { "epoch": 0.8733020795768722, "grad_norm": 0.4989682137966156, "learning_rate": 0.00049251063706827, "loss": 1.0234, "step": 7265 }, { "epoch": 0.8739031133549705, "grad_norm": 0.5618255734443665, "learning_rate": 0.0004924990490366453, "loss": 1.1336, "step": 7270 }, { "epoch": 0.8745041471330689, "grad_norm": 0.45250099897384644, "learning_rate": 0.0004924874521836063, "loss": 1.3758, "step": 7275 }, { "epoch": 0.8751051809111672, "grad_norm": 0.6784215569496155, "learning_rate": 0.0004924758465095752, "loss": 1.2719, "step": 7280 }, { "epoch": 0.8757062146892656, "grad_norm": 0.7080884575843811, "learning_rate": 0.0004924642320149741, "loss": 1.2031, "step": 7285 }, { "epoch": 0.8763072484673639, "grad_norm": 0.5413027405738831, "learning_rate": 0.0004924526087002254, "loss": 1.0156, "step": 7290 }, { "epoch": 0.8769082822454622, "grad_norm": 0.6434570550918579, "learning_rate": 0.000492440976565752, "loss": 1.1719, "step": 7295 }, { "epoch": 0.8775093160235605, "grad_norm": 0.4324966073036194, "learning_rate": 0.000492429335611977, "loss": 1.1352, "step": 7300 }, { "epoch": 0.8781103498016588, "grad_norm": 0.5412725210189819, "learning_rate": 0.0004924176858393238, "loss": 0.6863, "step": 7305 }, { "epoch": 0.8787113835797572, "grad_norm": 0.5473257303237915, "learning_rate": 0.0004924060272482163, "loss": 0.8305, "step": 7310 }, { "epoch": 0.8793124173578555, "grad_norm": 0.6203455924987793, "learning_rate": 0.0004923943598390785, "loss": 0.9453, "step": 7315 }, { "epoch": 0.8799134511359539, "grad_norm": 0.6368551254272461, "learning_rate": 0.0004923826836123351, "loss": 1.2023, "step": 7320 }, { "epoch": 0.8805144849140522, "grad_norm": 0.5397272109985352, "learning_rate": 0.0004923709985684104, "loss": 1.1375, "step": 7325 }, { "epoch": 0.8811155186921505, "grad_norm": 0.5577839016914368, "learning_rate": 0.0004923593047077298, "loss": 1.007, "step": 7330 }, { "epoch": 0.8817165524702488, "grad_norm": 0.6538661122322083, "learning_rate": 0.0004923476020307186, "loss": 1.0914, "step": 7335 }, { "epoch": 0.8823175862483471, "grad_norm": 0.719508945941925, "learning_rate": 0.0004923358905378025, "loss": 1.0852, "step": 7340 }, { "epoch": 0.8829186200264455, "grad_norm": 0.43969088792800903, "learning_rate": 0.0004923241702294075, "loss": 0.9992, "step": 7345 }, { "epoch": 0.8835196538045438, "grad_norm": 0.44342780113220215, "learning_rate": 0.0004923124411059601, "loss": 0.8969, "step": 7350 }, { "epoch": 0.8841206875826422, "grad_norm": 0.8577675819396973, "learning_rate": 0.0004923007031678868, "loss": 0.9219, "step": 7355 }, { "epoch": 0.8847217213607405, "grad_norm": 0.48573756217956543, "learning_rate": 0.0004922889564156146, "loss": 0.8047, "step": 7360 }, { "epoch": 0.8853227551388388, "grad_norm": 0.5752463936805725, "learning_rate": 0.0004922772008495709, "loss": 1.0852, "step": 7365 }, { "epoch": 0.8859237889169371, "grad_norm": 0.4537794291973114, "learning_rate": 0.0004922654364701832, "loss": 1.0305, "step": 7370 }, { "epoch": 0.8865248226950354, "grad_norm": 0.5247781276702881, "learning_rate": 0.0004922536632778797, "loss": 1.0562, "step": 7375 }, { "epoch": 0.8871258564731338, "grad_norm": 0.4838447868824005, "learning_rate": 0.0004922418812730884, "loss": 1.0039, "step": 7380 }, { "epoch": 0.8877268902512321, "grad_norm": 0.44885578751564026, "learning_rate": 0.0004922300904562382, "loss": 1.2289, "step": 7385 }, { "epoch": 0.8883279240293305, "grad_norm": 0.5934878587722778, "learning_rate": 0.0004922182908277576, "loss": 0.8531, "step": 7390 }, { "epoch": 0.8889289578074288, "grad_norm": 0.7617976069450378, "learning_rate": 0.0004922064823880762, "loss": 0.9938, "step": 7395 }, { "epoch": 0.8895299915855271, "grad_norm": 0.6728799343109131, "learning_rate": 0.0004921946651376234, "loss": 1.0352, "step": 7400 }, { "epoch": 0.8895299915855271, "eval_loss": 1.944238305091858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2018, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 7400 }, { "epoch": 0.8901310253636254, "grad_norm": 0.48037540912628174, "learning_rate": 0.0004921828390768291, "loss": 1.0437, "step": 7405 }, { "epoch": 0.8907320591417238, "grad_norm": 0.6825998425483704, "learning_rate": 0.0004921710042061235, "loss": 0.893, "step": 7410 }, { "epoch": 0.8913330929198221, "grad_norm": 0.743591845035553, "learning_rate": 0.0004921591605259372, "loss": 0.807, "step": 7415 }, { "epoch": 0.8919341266979204, "grad_norm": 0.90023273229599, "learning_rate": 0.0004921473080367009, "loss": 0.8555, "step": 7420 }, { "epoch": 0.8925351604760188, "grad_norm": 0.5256183743476868, "learning_rate": 0.0004921354467388458, "loss": 1.1672, "step": 7425 }, { "epoch": 0.893136194254117, "grad_norm": 0.5608157515525818, "learning_rate": 0.0004921235766328035, "loss": 0.8395, "step": 7430 }, { "epoch": 0.8937372280322154, "grad_norm": 0.5922031998634338, "learning_rate": 0.0004921116977190056, "loss": 1.1973, "step": 7435 }, { "epoch": 0.8943382618103137, "grad_norm": 0.3371446132659912, "learning_rate": 0.0004920998099978843, "loss": 1.175, "step": 7440 }, { "epoch": 0.8949392955884121, "grad_norm": 0.7467451095581055, "learning_rate": 0.0004920879134698721, "loss": 0.8684, "step": 7445 }, { "epoch": 0.8955403293665104, "grad_norm": 0.5667518377304077, "learning_rate": 0.0004920760081354018, "loss": 1.0547, "step": 7450 }, { "epoch": 0.8961413631446087, "grad_norm": 0.566737949848175, "learning_rate": 0.0004920640939949063, "loss": 0.8891, "step": 7455 }, { "epoch": 0.8967423969227071, "grad_norm": 0.7466226816177368, "learning_rate": 0.0004920521710488192, "loss": 1.1484, "step": 7460 }, { "epoch": 0.8973434307008054, "grad_norm": 0.9273832440376282, "learning_rate": 0.0004920402392975741, "loss": 1.2352, "step": 7465 }, { "epoch": 0.8979444644789037, "grad_norm": 0.8421907424926758, "learning_rate": 0.000492028298741605, "loss": 1.3086, "step": 7470 }, { "epoch": 0.898545498257002, "grad_norm": 0.6307106614112854, "learning_rate": 0.0004920163493813463, "loss": 0.8187, "step": 7475 }, { "epoch": 0.8991465320351004, "grad_norm": 0.5602814555168152, "learning_rate": 0.0004920043912172329, "loss": 1.375, "step": 7480 }, { "epoch": 0.8997475658131987, "grad_norm": 0.41203978657722473, "learning_rate": 0.0004919924242496995, "loss": 0.7098, "step": 7485 }, { "epoch": 0.900348599591297, "grad_norm": 0.605702817440033, "learning_rate": 0.0004919804484791816, "loss": 0.9227, "step": 7490 }, { "epoch": 0.9009496333693954, "grad_norm": 0.8872311115264893, "learning_rate": 0.0004919684639061148, "loss": 1.1559, "step": 7495 }, { "epoch": 0.9015506671474937, "grad_norm": 0.6117646098136902, "learning_rate": 0.0004919564705309348, "loss": 1.0891, "step": 7500 }, { "epoch": 0.902151700925592, "grad_norm": 0.5304883122444153, "learning_rate": 0.0004919444683540785, "loss": 0.9578, "step": 7505 }, { "epoch": 0.9027527347036903, "grad_norm": 1.0804104804992676, "learning_rate": 0.0004919324573759819, "loss": 1.3266, "step": 7510 }, { "epoch": 0.9033537684817887, "grad_norm": 0.6823940873146057, "learning_rate": 0.0004919204375970822, "loss": 1.1953, "step": 7515 }, { "epoch": 0.903954802259887, "grad_norm": 0.5779481530189514, "learning_rate": 0.0004919084090178168, "loss": 0.8297, "step": 7520 }, { "epoch": 0.9045558360379853, "grad_norm": 0.773202657699585, "learning_rate": 0.0004918963716386229, "loss": 0.9203, "step": 7525 }, { "epoch": 0.9051568698160837, "grad_norm": 0.6383361220359802, "learning_rate": 0.0004918843254599386, "loss": 0.8125, "step": 7530 }, { "epoch": 0.905757903594182, "grad_norm": 0.4668700098991394, "learning_rate": 0.000491872270482202, "loss": 1.2758, "step": 7535 }, { "epoch": 0.9063589373722803, "grad_norm": 0.5505266189575195, "learning_rate": 0.0004918602067058518, "loss": 1.018, "step": 7540 }, { "epoch": 0.9069599711503786, "grad_norm": 0.5363745093345642, "learning_rate": 0.0004918481341313266, "loss": 1.3641, "step": 7545 }, { "epoch": 0.907561004928477, "grad_norm": 0.5243716835975647, "learning_rate": 0.0004918360527590658, "loss": 0.9219, "step": 7550 }, { "epoch": 0.9081620387065753, "grad_norm": 0.5445330142974854, "learning_rate": 0.0004918239625895088, "loss": 0.9555, "step": 7555 }, { "epoch": 0.9087630724846736, "grad_norm": 0.4769819974899292, "learning_rate": 0.0004918118636230953, "loss": 0.9664, "step": 7560 }, { "epoch": 0.909364106262772, "grad_norm": 0.5455062389373779, "learning_rate": 0.0004917997558602657, "loss": 1.2281, "step": 7565 }, { "epoch": 0.9099651400408703, "grad_norm": 0.5463053584098816, "learning_rate": 0.0004917876393014602, "loss": 1.1203, "step": 7570 }, { "epoch": 0.9105661738189686, "grad_norm": 0.5657302737236023, "learning_rate": 0.0004917755139471195, "loss": 0.9156, "step": 7575 }, { "epoch": 0.9111672075970669, "grad_norm": 0.597939133644104, "learning_rate": 0.0004917633797976849, "loss": 0.9289, "step": 7580 }, { "epoch": 0.9117682413751653, "grad_norm": 0.5934147834777832, "learning_rate": 0.0004917512368535978, "loss": 1.3148, "step": 7585 }, { "epoch": 0.9123692751532636, "grad_norm": 0.4753275513648987, "learning_rate": 0.0004917390851152997, "loss": 1.0836, "step": 7590 }, { "epoch": 0.912970308931362, "grad_norm": 0.5918622016906738, "learning_rate": 0.0004917269245832328, "loss": 0.9219, "step": 7595 }, { "epoch": 0.9135713427094603, "grad_norm": 0.792513906955719, "learning_rate": 0.0004917147552578396, "loss": 1.0805, "step": 7600 }, { "epoch": 0.9135713427094603, "eval_loss": 1.9660155773162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1923, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 7600 }, { "epoch": 0.9141723764875586, "grad_norm": 0.5691604614257812, "learning_rate": 0.0004917025771395626, "loss": 1.1789, "step": 7605 }, { "epoch": 0.914773410265657, "grad_norm": 0.6456632018089294, "learning_rate": 0.0004916903902288447, "loss": 1.132, "step": 7610 }, { "epoch": 0.9153744440437552, "grad_norm": 0.48219436407089233, "learning_rate": 0.0004916781945261295, "loss": 1.1305, "step": 7615 }, { "epoch": 0.9159754778218536, "grad_norm": 0.41352659463882446, "learning_rate": 0.0004916659900318605, "loss": 0.8137, "step": 7620 }, { "epoch": 0.9165765115999519, "grad_norm": 0.6277100443840027, "learning_rate": 0.0004916537767464816, "loss": 0.7043, "step": 7625 }, { "epoch": 0.9171775453780503, "grad_norm": 0.4114673435688019, "learning_rate": 0.0004916415546704372, "loss": 1.0902, "step": 7630 }, { "epoch": 0.9177785791561486, "grad_norm": 0.7943468689918518, "learning_rate": 0.0004916293238041719, "loss": 1.0141, "step": 7635 }, { "epoch": 0.9183796129342469, "grad_norm": 0.6111139059066772, "learning_rate": 0.0004916170841481306, "loss": 0.9125, "step": 7640 }, { "epoch": 0.9189806467123453, "grad_norm": 0.5859405398368835, "learning_rate": 0.0004916048357027585, "loss": 0.6945, "step": 7645 }, { "epoch": 0.9195816804904435, "grad_norm": 0.6355502009391785, "learning_rate": 0.0004915925784685012, "loss": 0.7305, "step": 7650 }, { "epoch": 0.9201827142685419, "grad_norm": 0.7294882535934448, "learning_rate": 0.0004915803124458046, "loss": 0.8762, "step": 7655 }, { "epoch": 0.9207837480466402, "grad_norm": 0.622084379196167, "learning_rate": 0.0004915680376351148, "loss": 0.9, "step": 7660 }, { "epoch": 0.9213847818247386, "grad_norm": 0.6312769055366516, "learning_rate": 0.0004915557540368785, "loss": 1.0164, "step": 7665 }, { "epoch": 0.9219858156028369, "grad_norm": 0.5897203683853149, "learning_rate": 0.0004915434616515424, "loss": 0.8633, "step": 7670 }, { "epoch": 0.9225868493809352, "grad_norm": 0.6419596672058105, "learning_rate": 0.0004915311604795537, "loss": 1.1828, "step": 7675 }, { "epoch": 0.9231878831590336, "grad_norm": 0.7553310990333557, "learning_rate": 0.0004915188505213599, "loss": 0.8313, "step": 7680 }, { "epoch": 0.9237889169371318, "grad_norm": 0.6417075991630554, "learning_rate": 0.0004915065317774087, "loss": 1.1738, "step": 7685 }, { "epoch": 0.9243899507152302, "grad_norm": 0.6571639776229858, "learning_rate": 0.0004914942042481484, "loss": 1.1586, "step": 7690 }, { "epoch": 0.9249909844933285, "grad_norm": 0.4908573627471924, "learning_rate": 0.0004914818679340274, "loss": 1.0141, "step": 7695 }, { "epoch": 0.9255920182714269, "grad_norm": 0.5546748042106628, "learning_rate": 0.0004914695228354943, "loss": 0.9133, "step": 7700 }, { "epoch": 0.9261930520495252, "grad_norm": 0.7343969345092773, "learning_rate": 0.0004914571689529983, "loss": 0.893, "step": 7705 }, { "epoch": 0.9267940858276235, "grad_norm": 0.3860927224159241, "learning_rate": 0.0004914448062869888, "loss": 0.9008, "step": 7710 }, { "epoch": 0.9273951196057219, "grad_norm": 0.5206835269927979, "learning_rate": 0.0004914324348379155, "loss": 0.7336, "step": 7715 }, { "epoch": 0.9279961533838201, "grad_norm": 0.6308187246322632, "learning_rate": 0.0004914200546062285, "loss": 1.4469, "step": 7720 }, { "epoch": 0.9285971871619185, "grad_norm": 0.5945355296134949, "learning_rate": 0.0004914076655923781, "loss": 1.1281, "step": 7725 }, { "epoch": 0.9291982209400168, "grad_norm": 0.5737658143043518, "learning_rate": 0.0004913952677968148, "loss": 0.9914, "step": 7730 }, { "epoch": 0.9297992547181152, "grad_norm": 0.6509815454483032, "learning_rate": 0.00049138286121999, "loss": 1.3453, "step": 7735 }, { "epoch": 0.9304002884962135, "grad_norm": 0.6595441699028015, "learning_rate": 0.0004913704458623547, "loss": 0.8406, "step": 7740 }, { "epoch": 0.9310013222743119, "grad_norm": 0.8509159684181213, "learning_rate": 0.0004913580217243606, "loss": 1.3531, "step": 7745 }, { "epoch": 0.9316023560524102, "grad_norm": 0.6536901593208313, "learning_rate": 0.0004913455888064597, "loss": 1.1992, "step": 7750 }, { "epoch": 0.9322033898305084, "grad_norm": 0.6607285141944885, "learning_rate": 0.0004913331471091041, "loss": 1.0414, "step": 7755 }, { "epoch": 0.9328044236086068, "grad_norm": 0.6921988129615784, "learning_rate": 0.0004913206966327468, "loss": 0.8664, "step": 7760 }, { "epoch": 0.9334054573867051, "grad_norm": 0.5597063899040222, "learning_rate": 0.0004913082373778402, "loss": 0.802, "step": 7765 }, { "epoch": 0.9340064911648035, "grad_norm": 0.3904553949832916, "learning_rate": 0.000491295769344838, "loss": 1.1102, "step": 7770 }, { "epoch": 0.9346075249429018, "grad_norm": 0.6223692893981934, "learning_rate": 0.0004912832925341934, "loss": 0.7461, "step": 7775 }, { "epoch": 0.9352085587210002, "grad_norm": 0.6292567253112793, "learning_rate": 0.0004912708069463605, "loss": 1.0852, "step": 7780 }, { "epoch": 0.9358095924990985, "grad_norm": 0.4139750003814697, "learning_rate": 0.0004912583125817933, "loss": 0.7582, "step": 7785 }, { "epoch": 0.9364106262771967, "grad_norm": 0.5861048102378845, "learning_rate": 0.0004912458094409464, "loss": 1.0344, "step": 7790 }, { "epoch": 0.9370116600552951, "grad_norm": 0.712786078453064, "learning_rate": 0.0004912332975242747, "loss": 0.8656, "step": 7795 }, { "epoch": 0.9376126938333934, "grad_norm": 0.5423747897148132, "learning_rate": 0.0004912207768322333, "loss": 1.3156, "step": 7800 }, { "epoch": 0.9376126938333934, "eval_loss": 1.9586913585662842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1936, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 7800 }, { "epoch": 0.9382137276114918, "grad_norm": 0.4645959734916687, "learning_rate": 0.0004912082473652776, "loss": 0.6793, "step": 7805 }, { "epoch": 0.9388147613895901, "grad_norm": 0.5236163139343262, "learning_rate": 0.0004911957091238634, "loss": 0.7969, "step": 7810 }, { "epoch": 0.9394157951676885, "grad_norm": 0.6534927487373352, "learning_rate": 0.0004911831621084469, "loss": 1.0148, "step": 7815 }, { "epoch": 0.9400168289457868, "grad_norm": 0.5032287836074829, "learning_rate": 0.0004911706063194844, "loss": 1.0398, "step": 7820 }, { "epoch": 0.940617862723885, "grad_norm": 0.6310720443725586, "learning_rate": 0.0004911580417574327, "loss": 1.425, "step": 7825 }, { "epoch": 0.9412188965019834, "grad_norm": 0.41097697615623474, "learning_rate": 0.000491145468422749, "loss": 1.0453, "step": 7830 }, { "epoch": 0.9418199302800817, "grad_norm": 0.5711127519607544, "learning_rate": 0.0004911328863158904, "loss": 1.0195, "step": 7835 }, { "epoch": 0.9424209640581801, "grad_norm": 0.6290940046310425, "learning_rate": 0.0004911202954373147, "loss": 1.1555, "step": 7840 }, { "epoch": 0.9430219978362784, "grad_norm": 0.5807398557662964, "learning_rate": 0.0004911076957874801, "loss": 0.8434, "step": 7845 }, { "epoch": 0.9436230316143768, "grad_norm": 0.5235177874565125, "learning_rate": 0.0004910950873668449, "loss": 0.9969, "step": 7850 }, { "epoch": 0.944224065392475, "grad_norm": 0.6139911413192749, "learning_rate": 0.0004910824701758675, "loss": 0.9043, "step": 7855 }, { "epoch": 0.9448250991705733, "grad_norm": 0.4899279475212097, "learning_rate": 0.000491069844215007, "loss": 0.9156, "step": 7860 }, { "epoch": 0.9454261329486717, "grad_norm": 0.6272503137588501, "learning_rate": 0.0004910572094847229, "loss": 0.8703, "step": 7865 }, { "epoch": 0.94602716672677, "grad_norm": 0.7308134436607361, "learning_rate": 0.0004910445659854746, "loss": 0.784, "step": 7870 }, { "epoch": 0.9466282005048684, "grad_norm": 0.504143238067627, "learning_rate": 0.0004910319137177221, "loss": 0.9629, "step": 7875 }, { "epoch": 0.9472292342829667, "grad_norm": 0.6525449156761169, "learning_rate": 0.0004910192526819255, "loss": 1.218, "step": 7880 }, { "epoch": 0.9478302680610651, "grad_norm": 0.718891441822052, "learning_rate": 0.0004910065828785456, "loss": 1.2719, "step": 7885 }, { "epoch": 0.9484313018391634, "grad_norm": 0.7051753997802734, "learning_rate": 0.0004909939043080433, "loss": 0.8734, "step": 7890 }, { "epoch": 0.9490323356172616, "grad_norm": 0.5742840766906738, "learning_rate": 0.0004909812169708796, "loss": 1.1148, "step": 7895 }, { "epoch": 0.94963336939536, "grad_norm": 0.5299107432365417, "learning_rate": 0.0004909685208675162, "loss": 1.0551, "step": 7900 }, { "epoch": 0.9502344031734583, "grad_norm": 0.49730536341667175, "learning_rate": 0.0004909558159984147, "loss": 1.0406, "step": 7905 }, { "epoch": 0.9508354369515567, "grad_norm": 0.781014084815979, "learning_rate": 0.0004909431023640377, "loss": 1.0945, "step": 7910 }, { "epoch": 0.951436470729655, "grad_norm": 0.5173167586326599, "learning_rate": 0.0004909303799648473, "loss": 0.7383, "step": 7915 }, { "epoch": 0.9520375045077534, "grad_norm": 0.5512794852256775, "learning_rate": 0.0004909176488013065, "loss": 0.8844, "step": 7920 }, { "epoch": 0.9526385382858517, "grad_norm": 0.5370757579803467, "learning_rate": 0.0004909049088738784, "loss": 0.7469, "step": 7925 }, { "epoch": 0.95323957206395, "grad_norm": 0.5077857375144958, "learning_rate": 0.0004908921601830264, "loss": 1.1938, "step": 7930 }, { "epoch": 0.9538406058420483, "grad_norm": 0.46787217259407043, "learning_rate": 0.0004908794027292143, "loss": 0.8223, "step": 7935 }, { "epoch": 0.9544416396201466, "grad_norm": 0.9677898287773132, "learning_rate": 0.000490866636512906, "loss": 1.0117, "step": 7940 }, { "epoch": 0.955042673398245, "grad_norm": 0.4023076891899109, "learning_rate": 0.0004908538615345662, "loss": 0.8125, "step": 7945 }, { "epoch": 0.9556437071763433, "grad_norm": 0.48280128836631775, "learning_rate": 0.0004908410777946594, "loss": 0.8617, "step": 7950 }, { "epoch": 0.9562447409544417, "grad_norm": 0.4277903139591217, "learning_rate": 0.0004908282852936507, "loss": 1.2188, "step": 7955 }, { "epoch": 0.95684577473254, "grad_norm": 0.6882354617118835, "learning_rate": 0.0004908154840320055, "loss": 0.9437, "step": 7960 }, { "epoch": 0.9574468085106383, "grad_norm": 0.8440707325935364, "learning_rate": 0.0004908026740101894, "loss": 1.2594, "step": 7965 }, { "epoch": 0.9580478422887366, "grad_norm": 0.3909856677055359, "learning_rate": 0.0004907898552286684, "loss": 0.8418, "step": 7970 }, { "epoch": 0.9586488760668349, "grad_norm": 0.6478394865989685, "learning_rate": 0.0004907770276879088, "loss": 1.2016, "step": 7975 }, { "epoch": 0.9592499098449333, "grad_norm": 0.5579742789268494, "learning_rate": 0.0004907641913883772, "loss": 1.0414, "step": 7980 }, { "epoch": 0.9598509436230316, "grad_norm": 0.7744593620300293, "learning_rate": 0.0004907513463305407, "loss": 0.9383, "step": 7985 }, { "epoch": 0.96045197740113, "grad_norm": 0.6367405652999878, "learning_rate": 0.0004907384925148666, "loss": 1.0406, "step": 7990 }, { "epoch": 0.9610530111792283, "grad_norm": 0.8236774206161499, "learning_rate": 0.0004907256299418221, "loss": 1.2617, "step": 7995 }, { "epoch": 0.9616540449573266, "grad_norm": 0.8294329047203064, "learning_rate": 0.0004907127586118755, "loss": 1.1047, "step": 8000 }, { "epoch": 0.9616540449573266, "eval_loss": 1.9250977039337158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1901, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 8000 }, { "epoch": 0.9622550787354249, "grad_norm": 0.6141639947891235, "learning_rate": 0.0004906998785254949, "loss": 0.9727, "step": 8005 }, { "epoch": 0.9628561125135232, "grad_norm": 0.539454460144043, "learning_rate": 0.0004906869896831487, "loss": 1.2273, "step": 8010 }, { "epoch": 0.9634571462916216, "grad_norm": 0.4912640452384949, "learning_rate": 0.000490674092085306, "loss": 0.5742, "step": 8015 }, { "epoch": 0.9640581800697199, "grad_norm": 0.5446999669075012, "learning_rate": 0.0004906611857324358, "loss": 1.132, "step": 8020 }, { "epoch": 0.9646592138478183, "grad_norm": 0.5784686207771301, "learning_rate": 0.0004906482706250076, "loss": 0.9492, "step": 8025 }, { "epoch": 0.9652602476259166, "grad_norm": 0.6620561480522156, "learning_rate": 0.0004906353467634914, "loss": 0.5887, "step": 8030 }, { "epoch": 0.965861281404015, "grad_norm": 0.3918837904930115, "learning_rate": 0.0004906224141483571, "loss": 1.2422, "step": 8035 }, { "epoch": 0.9664623151821132, "grad_norm": 0.6284928917884827, "learning_rate": 0.0004906094727800753, "loss": 1.1391, "step": 8040 }, { "epoch": 0.9670633489602115, "grad_norm": 0.626060962677002, "learning_rate": 0.0004905965226591167, "loss": 0.968, "step": 8045 }, { "epoch": 0.9676643827383099, "grad_norm": 0.5235432386398315, "learning_rate": 0.0004905835637859523, "loss": 1.0656, "step": 8050 }, { "epoch": 0.9682654165164082, "grad_norm": 0.4649801254272461, "learning_rate": 0.0004905705961610539, "loss": 1.0437, "step": 8055 }, { "epoch": 0.9688664502945066, "grad_norm": 0.6150076389312744, "learning_rate": 0.0004905576197848927, "loss": 1.1094, "step": 8060 }, { "epoch": 0.9694674840726049, "grad_norm": 0.7004176378250122, "learning_rate": 0.000490544634657941, "loss": 0.9266, "step": 8065 }, { "epoch": 0.9700685178507032, "grad_norm": 0.5747764706611633, "learning_rate": 0.0004905316407806711, "loss": 0.875, "step": 8070 }, { "epoch": 0.9706695516288015, "grad_norm": 0.5066094398498535, "learning_rate": 0.0004905186381535559, "loss": 1.0219, "step": 8075 }, { "epoch": 0.9712705854068999, "grad_norm": 0.6182261109352112, "learning_rate": 0.0004905056267770681, "loss": 0.9867, "step": 8080 }, { "epoch": 0.9718716191849982, "grad_norm": 0.7428590655326843, "learning_rate": 0.0004904926066516812, "loss": 1.1789, "step": 8085 }, { "epoch": 0.9724726529630965, "grad_norm": 0.45018380880355835, "learning_rate": 0.0004904795777778687, "loss": 1.2758, "step": 8090 }, { "epoch": 0.9730736867411949, "grad_norm": 0.5391820073127747, "learning_rate": 0.0004904665401561047, "loss": 1.0672, "step": 8095 }, { "epoch": 0.9736747205192932, "grad_norm": 0.488711416721344, "learning_rate": 0.0004904534937868634, "loss": 0.8734, "step": 8100 }, { "epoch": 0.9742757542973915, "grad_norm": 0.7133549451828003, "learning_rate": 0.0004904404386706192, "loss": 1.1695, "step": 8105 }, { "epoch": 0.9748767880754898, "grad_norm": 0.7054321765899658, "learning_rate": 0.0004904273748078474, "loss": 1.1266, "step": 8110 }, { "epoch": 0.9754778218535882, "grad_norm": 0.6871386170387268, "learning_rate": 0.000490414302199023, "loss": 1.0734, "step": 8115 }, { "epoch": 0.9760788556316865, "grad_norm": 0.7475893497467041, "learning_rate": 0.0004904012208446215, "loss": 1.032, "step": 8120 }, { "epoch": 0.9766798894097848, "grad_norm": 0.3854070007801056, "learning_rate": 0.0004903881307451187, "loss": 0.8223, "step": 8125 }, { "epoch": 0.9772809231878832, "grad_norm": 0.7500688433647156, "learning_rate": 0.0004903750319009911, "loss": 1.1562, "step": 8130 }, { "epoch": 0.9778819569659815, "grad_norm": 0.4827736020088196, "learning_rate": 0.0004903619243127149, "loss": 1.0188, "step": 8135 }, { "epoch": 0.9784829907440799, "grad_norm": 0.630518913269043, "learning_rate": 0.0004903488079807672, "loss": 1.0227, "step": 8140 }, { "epoch": 0.9790840245221781, "grad_norm": 1.0363420248031616, "learning_rate": 0.0004903356829056247, "loss": 1.1977, "step": 8145 }, { "epoch": 0.9796850583002765, "grad_norm": 0.7192338109016418, "learning_rate": 0.0004903225490877652, "loss": 0.8012, "step": 8150 }, { "epoch": 0.9802860920783748, "grad_norm": 0.7177491188049316, "learning_rate": 0.0004903094065276664, "loss": 1.2859, "step": 8155 }, { "epoch": 0.9808871258564731, "grad_norm": 0.5712778568267822, "learning_rate": 0.0004902962552258063, "loss": 1.368, "step": 8160 }, { "epoch": 0.9814881596345715, "grad_norm": 0.5604929327964783, "learning_rate": 0.0004902830951826635, "loss": 1.0688, "step": 8165 }, { "epoch": 0.9820891934126698, "grad_norm": 0.5562752485275269, "learning_rate": 0.0004902699263987165, "loss": 0.8336, "step": 8170 }, { "epoch": 0.9826902271907682, "grad_norm": 0.4216817617416382, "learning_rate": 0.0004902567488744444, "loss": 0.8648, "step": 8175 }, { "epoch": 0.9832912609688664, "grad_norm": 0.6181256175041199, "learning_rate": 0.0004902435626103266, "loss": 0.9437, "step": 8180 }, { "epoch": 0.9838922947469648, "grad_norm": 0.7647708058357239, "learning_rate": 0.0004902303676068429, "loss": 0.7098, "step": 8185 }, { "epoch": 0.9844933285250631, "grad_norm": 0.8473737835884094, "learning_rate": 0.000490217163864473, "loss": 0.9621, "step": 8190 }, { "epoch": 0.9850943623031614, "grad_norm": 0.7344334721565247, "learning_rate": 0.0004902039513836976, "loss": 1.2563, "step": 8195 }, { "epoch": 0.9856953960812598, "grad_norm": 0.854670524597168, "learning_rate": 0.000490190730164997, "loss": 1.0609, "step": 8200 }, { "epoch": 0.9856953960812598, "eval_loss": 1.9345703125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2169, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 8200 }, { "epoch": 0.9862964298593581, "grad_norm": 0.6333422064781189, "learning_rate": 0.000490177500208852, "loss": 1.3516, "step": 8205 }, { "epoch": 0.9868974636374565, "grad_norm": 0.5873801708221436, "learning_rate": 0.0004901642615157445, "loss": 1.0105, "step": 8210 }, { "epoch": 0.9874984974155547, "grad_norm": 0.7199957370758057, "learning_rate": 0.0004901510140861556, "loss": 1.3672, "step": 8215 }, { "epoch": 0.9880995311936531, "grad_norm": 0.6357819437980652, "learning_rate": 0.0004901377579205673, "loss": 1.0727, "step": 8220 }, { "epoch": 0.9887005649717514, "grad_norm": 0.8580417633056641, "learning_rate": 0.0004901244930194618, "loss": 1.0586, "step": 8225 }, { "epoch": 0.9893015987498497, "grad_norm": 0.5398697257041931, "learning_rate": 0.0004901112193833218, "loss": 0.8965, "step": 8230 }, { "epoch": 0.9899026325279481, "grad_norm": 0.43280282616615295, "learning_rate": 0.00049009793701263, "loss": 1.0375, "step": 8235 }, { "epoch": 0.9905036663060464, "grad_norm": 0.6764346361160278, "learning_rate": 0.0004900846459078696, "loss": 0.9008, "step": 8240 }, { "epoch": 0.9911047000841448, "grad_norm": 0.4584319293498993, "learning_rate": 0.0004900713460695241, "loss": 1.1316, "step": 8245 }, { "epoch": 0.991705733862243, "grad_norm": 0.6319305300712585, "learning_rate": 0.0004900580374980773, "loss": 0.9242, "step": 8250 }, { "epoch": 0.9923067676403414, "grad_norm": 0.6038274168968201, "learning_rate": 0.0004900447201940134, "loss": 1.0281, "step": 8255 }, { "epoch": 0.9929078014184397, "grad_norm": 0.6040610671043396, "learning_rate": 0.0004900313941578167, "loss": 0.9094, "step": 8260 }, { "epoch": 0.9935088351965381, "grad_norm": 0.44790542125701904, "learning_rate": 0.000490018059389972, "loss": 0.7422, "step": 8265 }, { "epoch": 0.9941098689746364, "grad_norm": 0.8088905215263367, "learning_rate": 0.0004900047158909645, "loss": 0.932, "step": 8270 }, { "epoch": 0.9947109027527347, "grad_norm": 0.530339777469635, "learning_rate": 0.0004899913636612796, "loss": 1.0063, "step": 8275 }, { "epoch": 0.995311936530833, "grad_norm": 0.6597589254379272, "learning_rate": 0.0004899780027014029, "loss": 1.0379, "step": 8280 }, { "epoch": 0.9959129703089313, "grad_norm": 0.6081587672233582, "learning_rate": 0.0004899646330118203, "loss": 0.7465, "step": 8285 }, { "epoch": 0.9965140040870297, "grad_norm": 0.629805326461792, "learning_rate": 0.0004899512545930186, "loss": 0.6945, "step": 8290 }, { "epoch": 0.997115037865128, "grad_norm": 0.49113985896110535, "learning_rate": 0.0004899378674454841, "loss": 0.9498, "step": 8295 }, { "epoch": 0.9977160716432264, "grad_norm": 0.787839949131012, "learning_rate": 0.0004899244715697037, "loss": 0.7812, "step": 8300 }, { "epoch": 0.9983171054213247, "grad_norm": 0.6006747484207153, "learning_rate": 0.0004899110669661651, "loss": 0.9688, "step": 8305 }, { "epoch": 0.998918139199423, "grad_norm": 0.6008657813072205, "learning_rate": 0.0004898976536353557, "loss": 0.9273, "step": 8310 }, { "epoch": 0.9995191729775214, "grad_norm": 0.723429262638092, "learning_rate": 0.0004898842315777634, "loss": 0.8125, "step": 8315 }, { "epoch": 1.0001202067556196, "grad_norm": 0.5185750722885132, "learning_rate": 0.0004898708007938765, "loss": 0.9336, "step": 8320 }, { "epoch": 1.000721240533718, "grad_norm": 0.5277417898178101, "learning_rate": 0.0004898573612841835, "loss": 1.0055, "step": 8325 }, { "epoch": 1.0013222743118164, "grad_norm": 0.5660562515258789, "learning_rate": 0.0004898439130491734, "loss": 1.0242, "step": 8330 }, { "epoch": 1.0019233080899146, "grad_norm": 0.6881816983222961, "learning_rate": 0.0004898304560893354, "loss": 0.8938, "step": 8335 }, { "epoch": 1.002524341868013, "grad_norm": 0.598729133605957, "learning_rate": 0.0004898169904051589, "loss": 0.6754, "step": 8340 }, { "epoch": 1.0031253756461114, "grad_norm": 0.4618369936943054, "learning_rate": 0.0004898035159971339, "loss": 0.7711, "step": 8345 }, { "epoch": 1.0037264094242095, "grad_norm": 0.4155135154724121, "learning_rate": 0.0004897900328657505, "loss": 0.902, "step": 8350 }, { "epoch": 1.004327443202308, "grad_norm": 0.41335535049438477, "learning_rate": 0.0004897765410114992, "loss": 0.7145, "step": 8355 }, { "epoch": 1.0049284769804063, "grad_norm": 0.47209951281547546, "learning_rate": 0.0004897630404348708, "loss": 0.9953, "step": 8360 }, { "epoch": 1.0055295107585047, "grad_norm": 0.4678365886211395, "learning_rate": 0.0004897495311363563, "loss": 0.6926, "step": 8365 }, { "epoch": 1.0061305445366029, "grad_norm": 0.5993611812591553, "learning_rate": 0.0004897360131164472, "loss": 0.9891, "step": 8370 }, { "epoch": 1.0067315783147013, "grad_norm": 0.5312657952308655, "learning_rate": 0.0004897224863756353, "loss": 0.5711, "step": 8375 }, { "epoch": 1.0073326120927997, "grad_norm": 0.6508992314338684, "learning_rate": 0.0004897089509144127, "loss": 0.6695, "step": 8380 }, { "epoch": 1.0079336458708978, "grad_norm": 0.5376356840133667, "learning_rate": 0.0004896954067332715, "loss": 0.6527, "step": 8385 }, { "epoch": 1.0085346796489962, "grad_norm": 0.6709601283073425, "learning_rate": 0.0004896818538327048, "loss": 0.9539, "step": 8390 }, { "epoch": 1.0091357134270946, "grad_norm": 0.7939410209655762, "learning_rate": 0.0004896682922132053, "loss": 0.8016, "step": 8395 }, { "epoch": 1.009736747205193, "grad_norm": 0.6639977097511292, "learning_rate": 0.0004896547218752664, "loss": 0.6945, "step": 8400 }, { "epoch": 1.009736747205193, "eval_loss": 2.0069336891174316, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2077, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 8400 }, { "epoch": 1.0103377809832912, "grad_norm": 0.5364493727684021, "learning_rate": 0.0004896411428193819, "loss": 0.8273, "step": 8405 }, { "epoch": 1.0109388147613896, "grad_norm": 0.6952767968177795, "learning_rate": 0.0004896275550460457, "loss": 0.9453, "step": 8410 }, { "epoch": 1.011539848539488, "grad_norm": 0.581611692905426, "learning_rate": 0.000489613958555752, "loss": 0.5684, "step": 8415 }, { "epoch": 1.0121408823175861, "grad_norm": 0.6203157901763916, "learning_rate": 0.0004896003533489954, "loss": 0.9207, "step": 8420 }, { "epoch": 1.0127419160956845, "grad_norm": 0.6239716410636902, "learning_rate": 0.0004895867394262708, "loss": 0.566, "step": 8425 }, { "epoch": 1.013342949873783, "grad_norm": 0.4263458251953125, "learning_rate": 0.0004895731167880737, "loss": 1.2348, "step": 8430 }, { "epoch": 1.0139439836518813, "grad_norm": 0.6249896883964539, "learning_rate": 0.0004895594854348993, "loss": 0.759, "step": 8435 }, { "epoch": 1.0145450174299795, "grad_norm": 0.5187356472015381, "learning_rate": 0.0004895458453672436, "loss": 0.7691, "step": 8440 }, { "epoch": 1.0151460512080779, "grad_norm": 0.6687159538269043, "learning_rate": 0.000489532196585603, "loss": 0.7211, "step": 8445 }, { "epoch": 1.0157470849861763, "grad_norm": 0.5391019582748413, "learning_rate": 0.0004895185390904736, "loss": 0.7238, "step": 8450 }, { "epoch": 1.0163481187642744, "grad_norm": 0.6238473653793335, "learning_rate": 0.0004895048728823525, "loss": 0.8141, "step": 8455 }, { "epoch": 1.0169491525423728, "grad_norm": 0.5949072241783142, "learning_rate": 0.000489491197961737, "loss": 1.0371, "step": 8460 }, { "epoch": 1.0175501863204712, "grad_norm": 0.49681204557418823, "learning_rate": 0.000489477514329124, "loss": 0.9836, "step": 8465 }, { "epoch": 1.0181512200985696, "grad_norm": 0.6939390897750854, "learning_rate": 0.0004894638219850117, "loss": 0.7605, "step": 8470 }, { "epoch": 1.0187522538766678, "grad_norm": 0.8151843547821045, "learning_rate": 0.0004894501209298981, "loss": 0.9187, "step": 8475 }, { "epoch": 1.0193532876547662, "grad_norm": 0.6012240648269653, "learning_rate": 0.0004894364111642817, "loss": 0.6211, "step": 8480 }, { "epoch": 1.0199543214328646, "grad_norm": 0.5730084776878357, "learning_rate": 0.0004894226926886609, "loss": 0.9836, "step": 8485 }, { "epoch": 1.0205553552109627, "grad_norm": 0.49402958154678345, "learning_rate": 0.0004894089655035351, "loss": 0.8344, "step": 8490 }, { "epoch": 1.0211563889890611, "grad_norm": 0.5203851461410522, "learning_rate": 0.0004893952296094034, "loss": 0.7824, "step": 8495 }, { "epoch": 1.0217574227671595, "grad_norm": 0.5643207430839539, "learning_rate": 0.0004893814850067656, "loss": 0.7672, "step": 8500 }, { "epoch": 1.022358456545258, "grad_norm": 0.5308443307876587, "learning_rate": 0.0004893677316961217, "loss": 0.7809, "step": 8505 }, { "epoch": 1.022959490323356, "grad_norm": 0.8188532590866089, "learning_rate": 0.0004893539696779719, "loss": 1.3758, "step": 8510 }, { "epoch": 1.0235605241014545, "grad_norm": 0.5664224624633789, "learning_rate": 0.000489340198952817, "loss": 1.0063, "step": 8515 }, { "epoch": 1.0241615578795529, "grad_norm": 0.8532259464263916, "learning_rate": 0.0004893264195211577, "loss": 1.0594, "step": 8520 }, { "epoch": 1.0247625916576513, "grad_norm": 0.6089533567428589, "learning_rate": 0.0004893126313834955, "loss": 0.8941, "step": 8525 }, { "epoch": 1.0253636254357494, "grad_norm": 1.0394479036331177, "learning_rate": 0.0004892988345403319, "loss": 1.0789, "step": 8530 }, { "epoch": 1.0259646592138478, "grad_norm": 0.6304917931556702, "learning_rate": 0.0004892850289921687, "loss": 0.7742, "step": 8535 }, { "epoch": 1.0265656929919462, "grad_norm": 0.504525363445282, "learning_rate": 0.0004892712147395081, "loss": 0.7102, "step": 8540 }, { "epoch": 1.0271667267700444, "grad_norm": 0.6600879430770874, "learning_rate": 0.0004892573917828527, "loss": 0.6367, "step": 8545 }, { "epoch": 1.0277677605481428, "grad_norm": 0.5845037698745728, "learning_rate": 0.0004892435601227053, "loss": 1.1102, "step": 8550 }, { "epoch": 1.0283687943262412, "grad_norm": 0.4483342170715332, "learning_rate": 0.0004892297197595691, "loss": 0.7086, "step": 8555 }, { "epoch": 1.0289698281043396, "grad_norm": 0.5517778396606445, "learning_rate": 0.0004892158706939475, "loss": 0.5734, "step": 8560 }, { "epoch": 1.0295708618824377, "grad_norm": 0.6166327595710754, "learning_rate": 0.0004892020129263443, "loss": 0.8305, "step": 8565 }, { "epoch": 1.0301718956605361, "grad_norm": 0.7943052649497986, "learning_rate": 0.0004891881464572637, "loss": 0.7328, "step": 8570 }, { "epoch": 1.0307729294386345, "grad_norm": 0.6075527667999268, "learning_rate": 0.0004891742712872101, "loss": 0.9672, "step": 8575 }, { "epoch": 1.0313739632167327, "grad_norm": 0.5800756216049194, "learning_rate": 0.0004891603874166882, "loss": 0.9324, "step": 8580 }, { "epoch": 1.031974996994831, "grad_norm": 0.6770453453063965, "learning_rate": 0.000489146494846203, "loss": 0.7641, "step": 8585 }, { "epoch": 1.0325760307729295, "grad_norm": 0.6018184423446655, "learning_rate": 0.00048913259357626, "loss": 0.7164, "step": 8590 }, { "epoch": 1.0331770645510279, "grad_norm": 0.6010670065879822, "learning_rate": 0.0004891186836073647, "loss": 0.8781, "step": 8595 }, { "epoch": 1.033778098329126, "grad_norm": 0.6388862729072571, "learning_rate": 0.0004891047649400233, "loss": 0.8672, "step": 8600 }, { "epoch": 1.033778098329126, "eval_loss": 1.959375023841858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1931, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 8600 }, { "epoch": 1.0343791321072244, "grad_norm": 0.4996091425418854, "learning_rate": 0.0004890908375747421, "loss": 0.7734, "step": 8605 }, { "epoch": 1.0349801658853228, "grad_norm": 0.4541950225830078, "learning_rate": 0.0004890769015120275, "loss": 0.9844, "step": 8610 }, { "epoch": 1.035581199663421, "grad_norm": 0.6546748876571655, "learning_rate": 0.0004890629567523868, "loss": 0.6719, "step": 8615 }, { "epoch": 1.0361822334415194, "grad_norm": 0.6568920016288757, "learning_rate": 0.0004890490032963269, "loss": 0.8898, "step": 8620 }, { "epoch": 1.0367832672196178, "grad_norm": 0.6672289371490479, "learning_rate": 0.0004890350411443558, "loss": 1.2523, "step": 8625 }, { "epoch": 1.0373843009977162, "grad_norm": 1.0342859029769897, "learning_rate": 0.0004890210702969811, "loss": 1.132, "step": 8630 }, { "epoch": 1.0379853347758143, "grad_norm": 0.4899213910102844, "learning_rate": 0.0004890070907547111, "loss": 0.8371, "step": 8635 }, { "epoch": 1.0385863685539127, "grad_norm": 0.7239575386047363, "learning_rate": 0.0004889931025180543, "loss": 0.6107, "step": 8640 }, { "epoch": 1.0391874023320111, "grad_norm": 0.5722755193710327, "learning_rate": 0.0004889791055875197, "loss": 0.6227, "step": 8645 }, { "epoch": 1.0397884361101093, "grad_norm": 0.44977515935897827, "learning_rate": 0.0004889650999636163, "loss": 0.8754, "step": 8650 }, { "epoch": 1.0403894698882077, "grad_norm": 0.644370973110199, "learning_rate": 0.0004889510856468536, "loss": 1.1984, "step": 8655 }, { "epoch": 1.040990503666306, "grad_norm": 0.5723581314086914, "learning_rate": 0.0004889370626377416, "loss": 0.9086, "step": 8660 }, { "epoch": 1.0415915374444045, "grad_norm": 0.6146963238716125, "learning_rate": 0.0004889230309367902, "loss": 0.7223, "step": 8665 }, { "epoch": 1.0421925712225026, "grad_norm": 0.4661894738674164, "learning_rate": 0.0004889089905445098, "loss": 0.934, "step": 8670 }, { "epoch": 1.042793605000601, "grad_norm": 0.5599977970123291, "learning_rate": 0.0004888949414614114, "loss": 0.7234, "step": 8675 }, { "epoch": 1.0433946387786994, "grad_norm": 0.7143124938011169, "learning_rate": 0.0004888808836880057, "loss": 0.7266, "step": 8680 }, { "epoch": 1.0439956725567976, "grad_norm": 0.757452130317688, "learning_rate": 0.0004888668172248044, "loss": 0.8688, "step": 8685 }, { "epoch": 1.044596706334896, "grad_norm": 0.7577751278877258, "learning_rate": 0.0004888527420723192, "loss": 0.8537, "step": 8690 }, { "epoch": 1.0451977401129944, "grad_norm": 0.49657824635505676, "learning_rate": 0.0004888386582310619, "loss": 1.1113, "step": 8695 }, { "epoch": 1.0457987738910928, "grad_norm": 0.5334681272506714, "learning_rate": 0.000488824565701545, "loss": 1.1789, "step": 8700 }, { "epoch": 1.046399807669191, "grad_norm": 0.566023051738739, "learning_rate": 0.000488810464484281, "loss": 1.125, "step": 8705 }, { "epoch": 1.0470008414472893, "grad_norm": 0.7300113439559937, "learning_rate": 0.0004887963545797829, "loss": 0.9492, "step": 8710 }, { "epoch": 1.0476018752253877, "grad_norm": 0.6816569566726685, "learning_rate": 0.0004887822359885641, "loss": 0.9258, "step": 8715 }, { "epoch": 1.048202909003486, "grad_norm": 0.6783853769302368, "learning_rate": 0.0004887681087111381, "loss": 0.882, "step": 8720 }, { "epoch": 1.0488039427815843, "grad_norm": 0.6293913125991821, "learning_rate": 0.0004887539727480188, "loss": 0.8359, "step": 8725 }, { "epoch": 1.0494049765596827, "grad_norm": 0.5151919722557068, "learning_rate": 0.0004887398280997204, "loss": 0.6813, "step": 8730 }, { "epoch": 1.050006010337781, "grad_norm": 0.7799313068389893, "learning_rate": 0.0004887256747667575, "loss": 0.998, "step": 8735 }, { "epoch": 1.0506070441158792, "grad_norm": 0.7907360792160034, "learning_rate": 0.000488711512749645, "loss": 0.8133, "step": 8740 }, { "epoch": 1.0512080778939776, "grad_norm": 0.6638164520263672, "learning_rate": 0.0004886973420488981, "loss": 1.0699, "step": 8745 }, { "epoch": 1.051809111672076, "grad_norm": 0.5922715663909912, "learning_rate": 0.000488683162665032, "loss": 0.6781, "step": 8750 }, { "epoch": 1.0524101454501742, "grad_norm": 0.5023715496063232, "learning_rate": 0.0004886689745985628, "loss": 1.1219, "step": 8755 }, { "epoch": 1.0530111792282726, "grad_norm": 0.5563696026802063, "learning_rate": 0.0004886547778500066, "loss": 0.7641, "step": 8760 }, { "epoch": 1.053612213006371, "grad_norm": 0.544734001159668, "learning_rate": 0.0004886405724198797, "loss": 0.7512, "step": 8765 }, { "epoch": 1.0542132467844694, "grad_norm": 0.7026242017745972, "learning_rate": 0.0004886263583086989, "loss": 1.0484, "step": 8770 }, { "epoch": 1.0548142805625675, "grad_norm": 0.4334627389907837, "learning_rate": 0.0004886121355169813, "loss": 0.7449, "step": 8775 }, { "epoch": 1.055415314340666, "grad_norm": 0.5260578989982605, "learning_rate": 0.0004885979040452444, "loss": 0.816, "step": 8780 }, { "epoch": 1.0560163481187643, "grad_norm": 0.48516446352005005, "learning_rate": 0.0004885836638940056, "loss": 0.7617, "step": 8785 }, { "epoch": 1.0566173818968627, "grad_norm": 0.40738537907600403, "learning_rate": 0.0004885694150637832, "loss": 1.0562, "step": 8790 }, { "epoch": 1.0572184156749609, "grad_norm": 0.6780609488487244, "learning_rate": 0.0004885551575550954, "loss": 0.8414, "step": 8795 }, { "epoch": 1.0578194494530593, "grad_norm": 0.7620958089828491, "learning_rate": 0.0004885408913684608, "loss": 0.8609, "step": 8800 }, { "epoch": 1.0578194494530593, "eval_loss": 1.9694335460662842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2116, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 8800 }, { "epoch": 1.0584204832311577, "grad_norm": 0.7030707001686096, "learning_rate": 0.0004885266165043984, "loss": 0.6246, "step": 8805 }, { "epoch": 1.0590215170092558, "grad_norm": 0.29418814182281494, "learning_rate": 0.0004885123329634276, "loss": 0.5852, "step": 8810 }, { "epoch": 1.0596225507873542, "grad_norm": 0.6476624011993408, "learning_rate": 0.000488498040746068, "loss": 0.7898, "step": 8815 }, { "epoch": 1.0602235845654526, "grad_norm": 0.5112327337265015, "learning_rate": 0.0004884837398528392, "loss": 0.9133, "step": 8820 }, { "epoch": 1.0608246183435508, "grad_norm": 0.5061384439468384, "learning_rate": 0.0004884694302842617, "loss": 0.5664, "step": 8825 }, { "epoch": 1.0614256521216492, "grad_norm": 0.5925976037979126, "learning_rate": 0.000488455112040856, "loss": 1.0883, "step": 8830 }, { "epoch": 1.0620266858997476, "grad_norm": 0.4409756660461426, "learning_rate": 0.000488440785123143, "loss": 0.8326, "step": 8835 }, { "epoch": 1.062627719677846, "grad_norm": 0.6364176869392395, "learning_rate": 0.0004884264495316437, "loss": 0.8469, "step": 8840 }, { "epoch": 1.0632287534559441, "grad_norm": 0.7234607338905334, "learning_rate": 0.0004884121052668798, "loss": 1.0078, "step": 8845 }, { "epoch": 1.0638297872340425, "grad_norm": 0.5747237205505371, "learning_rate": 0.0004883977523293729, "loss": 0.9168, "step": 8850 }, { "epoch": 1.064430821012141, "grad_norm": 0.6856431365013123, "learning_rate": 0.0004883833907196452, "loss": 1.0883, "step": 8855 }, { "epoch": 1.0650318547902393, "grad_norm": 0.6114706993103027, "learning_rate": 0.0004883690204382193, "loss": 0.9855, "step": 8860 }, { "epoch": 1.0656328885683375, "grad_norm": 0.5267056226730347, "learning_rate": 0.0004883546414856176, "loss": 0.8688, "step": 8865 }, { "epoch": 1.0662339223464359, "grad_norm": 0.7107062935829163, "learning_rate": 0.0004883402538623635, "loss": 0.9836, "step": 8870 }, { "epoch": 1.0668349561245343, "grad_norm": 0.5223785042762756, "learning_rate": 0.0004883258575689802, "loss": 1.3586, "step": 8875 }, { "epoch": 1.0674359899026324, "grad_norm": 0.6106747388839722, "learning_rate": 0.0004883114526059914, "loss": 0.7469, "step": 8880 }, { "epoch": 1.0680370236807308, "grad_norm": 0.7474291920661926, "learning_rate": 0.0004882970389739212, "loss": 0.8523, "step": 8885 }, { "epoch": 1.0686380574588292, "grad_norm": 0.671882688999176, "learning_rate": 0.0004882826166732937, "loss": 0.7371, "step": 8890 }, { "epoch": 1.0692390912369276, "grad_norm": 0.7095271944999695, "learning_rate": 0.00048826818570463394, "loss": 0.9039, "step": 8895 }, { "epoch": 1.0698401250150258, "grad_norm": 0.5591096878051758, "learning_rate": 0.0004882537460684666, "loss": 1.0383, "step": 8900 }, { "epoch": 1.0704411587931242, "grad_norm": 0.5546512007713318, "learning_rate": 0.00048823929776531703, "loss": 1.2125, "step": 8905 }, { "epoch": 1.0710421925712226, "grad_norm": 0.6155726313591003, "learning_rate": 0.0004882248407957107, "loss": 0.6395, "step": 8910 }, { "epoch": 1.0716432263493207, "grad_norm": 0.49040448665618896, "learning_rate": 0.00048821037516017364, "loss": 0.6961, "step": 8915 }, { "epoch": 1.0722442601274191, "grad_norm": 0.8472973704338074, "learning_rate": 0.00048819590085923207, "loss": 1.0492, "step": 8920 }, { "epoch": 1.0728452939055175, "grad_norm": 0.668751060962677, "learning_rate": 0.00048818141789341244, "loss": 1.0004, "step": 8925 }, { "epoch": 1.073446327683616, "grad_norm": 0.6818864345550537, "learning_rate": 0.0004881669262632417, "loss": 1.2766, "step": 8930 }, { "epoch": 1.074047361461714, "grad_norm": 0.8253021836280823, "learning_rate": 0.0004881524259692469, "loss": 0.8078, "step": 8935 }, { "epoch": 1.0746483952398125, "grad_norm": 0.7862038016319275, "learning_rate": 0.0004881379170119556, "loss": 1.1773, "step": 8940 }, { "epoch": 1.0752494290179109, "grad_norm": 0.6154944896697998, "learning_rate": 0.00048812339939189556, "loss": 0.634, "step": 8945 }, { "epoch": 1.075850462796009, "grad_norm": 0.5454443097114563, "learning_rate": 0.00048810887310959494, "loss": 0.8578, "step": 8950 }, { "epoch": 1.0764514965741074, "grad_norm": 0.9031620025634766, "learning_rate": 0.0004880943381655821, "loss": 0.7477, "step": 8955 }, { "epoch": 1.0770525303522058, "grad_norm": 0.8679643273353577, "learning_rate": 0.00048807979456038573, "loss": 1.0605, "step": 8960 }, { "epoch": 1.0776535641303042, "grad_norm": 0.7441648840904236, "learning_rate": 0.00048806524229453506, "loss": 1.0195, "step": 8965 }, { "epoch": 1.0782545979084024, "grad_norm": 0.7008485794067383, "learning_rate": 0.00048805068136855936, "loss": 0.9508, "step": 8970 }, { "epoch": 1.0788556316865008, "grad_norm": 0.5873783230781555, "learning_rate": 0.0004880361117829882, "loss": 1.0383, "step": 8975 }, { "epoch": 1.0794566654645992, "grad_norm": 0.6851660013198853, "learning_rate": 0.0004880215335383518, "loss": 0.8875, "step": 8980 }, { "epoch": 1.0800576992426973, "grad_norm": 0.509058952331543, "learning_rate": 0.0004880069466351803, "loss": 0.6211, "step": 8985 }, { "epoch": 1.0806587330207957, "grad_norm": 0.6431728601455688, "learning_rate": 0.0004879923510740044, "loss": 0.8668, "step": 8990 }, { "epoch": 1.0812597667988941, "grad_norm": 0.7993625402450562, "learning_rate": 0.00048797774685535513, "loss": 0.957, "step": 8995 }, { "epoch": 1.0818608005769925, "grad_norm": 0.4389391243457794, "learning_rate": 0.00048796313397976354, "loss": 0.7219, "step": 9000 }, { "epoch": 1.0818608005769925, "eval_loss": 1.9000976085662842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2031, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 9000 }, { "epoch": 1.0824618343550907, "grad_norm": 0.5477054715156555, "learning_rate": 0.0004879485124477614, "loss": 0.8793, "step": 9005 }, { "epoch": 1.083062868133189, "grad_norm": 0.7327670454978943, "learning_rate": 0.0004879338822598805, "loss": 0.7883, "step": 9010 }, { "epoch": 1.0836639019112875, "grad_norm": 0.5200571417808533, "learning_rate": 0.00048791924341665306, "loss": 0.8223, "step": 9015 }, { "epoch": 1.0842649356893856, "grad_norm": 0.667089581489563, "learning_rate": 0.0004879045959186116, "loss": 0.9582, "step": 9020 }, { "epoch": 1.084865969467484, "grad_norm": 0.5208351016044617, "learning_rate": 0.0004878899397662889, "loss": 0.7625, "step": 9025 }, { "epoch": 1.0854670032455824, "grad_norm": 0.8315151929855347, "learning_rate": 0.00048787527496021825, "loss": 1.043, "step": 9030 }, { "epoch": 1.0860680370236808, "grad_norm": 0.4226670265197754, "learning_rate": 0.00048786060150093305, "loss": 1.0336, "step": 9035 }, { "epoch": 1.086669070801779, "grad_norm": 0.6351620554924011, "learning_rate": 0.000487845919388967, "loss": 0.5309, "step": 9040 }, { "epoch": 1.0872701045798774, "grad_norm": 0.7362414002418518, "learning_rate": 0.0004878312286248543, "loss": 1.0016, "step": 9045 }, { "epoch": 1.0878711383579758, "grad_norm": 0.6137573719024658, "learning_rate": 0.00048781652920912923, "loss": 1.1012, "step": 9050 }, { "epoch": 1.088472172136074, "grad_norm": 0.4512140452861786, "learning_rate": 0.0004878018211423266, "loss": 0.8469, "step": 9055 }, { "epoch": 1.0890732059141723, "grad_norm": 0.5843665599822998, "learning_rate": 0.0004877871044249815, "loss": 0.6816, "step": 9060 }, { "epoch": 1.0896742396922707, "grad_norm": 0.6020638346672058, "learning_rate": 0.0004877723790576292, "loss": 0.7195, "step": 9065 }, { "epoch": 1.0902752734703691, "grad_norm": 0.9359005093574524, "learning_rate": 0.00048775764504080534, "loss": 0.8461, "step": 9070 }, { "epoch": 1.0908763072484673, "grad_norm": 0.6977394819259644, "learning_rate": 0.000487742902375046, "loss": 0.8891, "step": 9075 }, { "epoch": 1.0914773410265657, "grad_norm": 0.7778275012969971, "learning_rate": 0.0004877281510608874, "loss": 0.825, "step": 9080 }, { "epoch": 1.092078374804664, "grad_norm": 0.7757279276847839, "learning_rate": 0.00048771339109886615, "loss": 0.8391, "step": 9085 }, { "epoch": 1.0926794085827622, "grad_norm": 0.7895723581314087, "learning_rate": 0.00048769862248951925, "loss": 1.1707, "step": 9090 }, { "epoch": 1.0932804423608606, "grad_norm": 0.5030030012130737, "learning_rate": 0.0004876838452333838, "loss": 0.8715, "step": 9095 }, { "epoch": 1.093881476138959, "grad_norm": 0.5875306725502014, "learning_rate": 0.00048766905933099745, "loss": 0.7648, "step": 9100 }, { "epoch": 1.0944825099170574, "grad_norm": 0.661494791507721, "learning_rate": 0.00048765426478289815, "loss": 1.0094, "step": 9105 }, { "epoch": 1.0950835436951556, "grad_norm": 0.4800059497356415, "learning_rate": 0.0004876394615896239, "loss": 0.8906, "step": 9110 }, { "epoch": 1.095684577473254, "grad_norm": 0.6578857898712158, "learning_rate": 0.0004876246497517134, "loss": 0.7836, "step": 9115 }, { "epoch": 1.0962856112513524, "grad_norm": 0.6155128479003906, "learning_rate": 0.0004876098292697052, "loss": 0.9078, "step": 9120 }, { "epoch": 1.0968866450294508, "grad_norm": 0.680698573589325, "learning_rate": 0.0004875950001441387, "loss": 0.8172, "step": 9125 }, { "epoch": 1.097487678807549, "grad_norm": 0.6632817387580872, "learning_rate": 0.00048758016237555315, "loss": 0.7355, "step": 9130 }, { "epoch": 1.0980887125856473, "grad_norm": 0.824616551399231, "learning_rate": 0.0004875653159644884, "loss": 0.8727, "step": 9135 }, { "epoch": 1.0986897463637457, "grad_norm": 0.612898051738739, "learning_rate": 0.0004875504609114845, "loss": 0.5258, "step": 9140 }, { "epoch": 1.099290780141844, "grad_norm": 0.7447797060012817, "learning_rate": 0.00048753559721708176, "loss": 0.9887, "step": 9145 }, { "epoch": 1.0998918139199423, "grad_norm": 0.506405770778656, "learning_rate": 0.000487520724881821, "loss": 0.5691, "step": 9150 }, { "epoch": 1.1004928476980407, "grad_norm": 0.5486372113227844, "learning_rate": 0.0004875058439062431, "loss": 0.6941, "step": 9155 }, { "epoch": 1.1010938814761388, "grad_norm": 0.6493369936943054, "learning_rate": 0.00048749095429088957, "loss": 1.2594, "step": 9160 }, { "epoch": 1.1016949152542372, "grad_norm": 0.7150719165802002, "learning_rate": 0.0004874760560363018, "loss": 1.0977, "step": 9165 }, { "epoch": 1.1022959490323356, "grad_norm": 0.5288280844688416, "learning_rate": 0.0004874611491430221, "loss": 0.5332, "step": 9170 }, { "epoch": 1.102896982810434, "grad_norm": 0.6431601047515869, "learning_rate": 0.00048744623361159236, "loss": 0.7551, "step": 9175 }, { "epoch": 1.1034980165885322, "grad_norm": 0.6951828598976135, "learning_rate": 0.00048743130944255543, "loss": 0.9383, "step": 9180 }, { "epoch": 1.1040990503666306, "grad_norm": 0.6183615922927856, "learning_rate": 0.0004874163766364541, "loss": 0.9977, "step": 9185 }, { "epoch": 1.104700084144729, "grad_norm": 0.3825269341468811, "learning_rate": 0.0004874014351938315, "loss": 0.8074, "step": 9190 }, { "epoch": 1.1053011179228274, "grad_norm": 0.6892231106758118, "learning_rate": 0.0004873864851152313, "loss": 0.8059, "step": 9195 }, { "epoch": 1.1059021517009255, "grad_norm": 0.9281521439552307, "learning_rate": 0.0004873715264011973, "loss": 1.2414, "step": 9200 }, { "epoch": 1.1059021517009255, "eval_loss": 1.947265625, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1963, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 9200 }, { "epoch": 1.106503185479024, "grad_norm": 0.5714129209518433, "learning_rate": 0.00048735655905227365, "loss": 1.3344, "step": 9205 }, { "epoch": 1.1071042192571223, "grad_norm": 0.5282446146011353, "learning_rate": 0.00048734158306900483, "loss": 0.9742, "step": 9210 }, { "epoch": 1.1077052530352205, "grad_norm": 0.5285758972167969, "learning_rate": 0.00048732659845193563, "loss": 0.7145, "step": 9215 }, { "epoch": 1.1083062868133189, "grad_norm": 0.510167121887207, "learning_rate": 0.00048731160520161103, "loss": 1.1641, "step": 9220 }, { "epoch": 1.1089073205914173, "grad_norm": 0.597683846950531, "learning_rate": 0.0004872966033185766, "loss": 1.0164, "step": 9225 }, { "epoch": 1.1095083543695157, "grad_norm": 0.7455722689628601, "learning_rate": 0.00048728159280337794, "loss": 0.6961, "step": 9230 }, { "epoch": 1.1101093881476138, "grad_norm": 0.5456376671791077, "learning_rate": 0.00048726657365656125, "loss": 0.7891, "step": 9235 }, { "epoch": 1.1107104219257122, "grad_norm": 0.7263533473014832, "learning_rate": 0.0004872515458786727, "loss": 0.8496, "step": 9240 }, { "epoch": 1.1113114557038106, "grad_norm": 0.499738872051239, "learning_rate": 0.0004872365094702591, "loss": 0.543, "step": 9245 }, { "epoch": 1.1119124894819088, "grad_norm": 0.5579853057861328, "learning_rate": 0.00048722146443186734, "loss": 0.8957, "step": 9250 }, { "epoch": 1.1125135232600072, "grad_norm": 20.049766540527344, "learning_rate": 0.0004872064107640447, "loss": 0.9352, "step": 9255 }, { "epoch": 1.1131145570381056, "grad_norm": 0.7877547740936279, "learning_rate": 0.0004871913484673389, "loss": 1.1938, "step": 9260 }, { "epoch": 1.113715590816204, "grad_norm": 0.6161669492721558, "learning_rate": 0.00048717627754229776, "loss": 0.932, "step": 9265 }, { "epoch": 1.1143166245943021, "grad_norm": 0.7744840979576111, "learning_rate": 0.00048716119798946955, "loss": 0.666, "step": 9270 }, { "epoch": 1.1149176583724005, "grad_norm": 0.4891236424446106, "learning_rate": 0.0004871461098094029, "loss": 0.9016, "step": 9275 }, { "epoch": 1.115518692150499, "grad_norm": 0.47612643241882324, "learning_rate": 0.0004871310130026465, "loss": 0.9203, "step": 9280 }, { "epoch": 1.116119725928597, "grad_norm": 0.7224286794662476, "learning_rate": 0.00048711590756974965, "loss": 0.7977, "step": 9285 }, { "epoch": 1.1167207597066955, "grad_norm": 0.6598291397094727, "learning_rate": 0.0004871007935112619, "loss": 1.0461, "step": 9290 }, { "epoch": 1.1173217934847939, "grad_norm": 0.6916563510894775, "learning_rate": 0.0004870856708277329, "loss": 0.9453, "step": 9295 }, { "epoch": 1.1179228272628923, "grad_norm": 0.6471813321113586, "learning_rate": 0.0004870705395197128, "loss": 0.9906, "step": 9300 }, { "epoch": 1.1185238610409904, "grad_norm": 0.6290755271911621, "learning_rate": 0.0004870553995877521, "loss": 0.975, "step": 9305 }, { "epoch": 1.1191248948190888, "grad_norm": 0.4361618757247925, "learning_rate": 0.00048704025103240157, "loss": 0.7344, "step": 9310 }, { "epoch": 1.1197259285971872, "grad_norm": 0.5164428949356079, "learning_rate": 0.00048702509385421225, "loss": 0.9688, "step": 9315 }, { "epoch": 1.1203269623752854, "grad_norm": 1.074345350265503, "learning_rate": 0.0004870099280537355, "loss": 0.8879, "step": 9320 }, { "epoch": 1.1209279961533838, "grad_norm": 0.661244809627533, "learning_rate": 0.0004869947536315229, "loss": 1.3227, "step": 9325 }, { "epoch": 1.1215290299314822, "grad_norm": 0.6688597202301025, "learning_rate": 0.00048697957058812657, "loss": 1.2328, "step": 9330 }, { "epoch": 1.1221300637095806, "grad_norm": 0.6121726036071777, "learning_rate": 0.00048696437892409885, "loss": 0.8344, "step": 9335 }, { "epoch": 1.1227310974876787, "grad_norm": 0.49119481444358826, "learning_rate": 0.0004869491786399923, "loss": 0.8789, "step": 9340 }, { "epoch": 1.1233321312657771, "grad_norm": 0.6267352104187012, "learning_rate": 0.0004869339697363599, "loss": 0.9133, "step": 9345 }, { "epoch": 1.1239331650438755, "grad_norm": 0.4927639961242676, "learning_rate": 0.00048691875221375486, "loss": 1.1469, "step": 9350 }, { "epoch": 1.1245341988219737, "grad_norm": 0.6534408330917358, "learning_rate": 0.0004869035260727308, "loss": 0.8258, "step": 9355 }, { "epoch": 1.125135232600072, "grad_norm": 0.49710723757743835, "learning_rate": 0.0004868882913138416, "loss": 1.2547, "step": 9360 }, { "epoch": 1.1257362663781705, "grad_norm": 0.5398971438407898, "learning_rate": 0.0004868730479376414, "loss": 0.9437, "step": 9365 }, { "epoch": 1.1263373001562689, "grad_norm": 0.65297532081604, "learning_rate": 0.0004868577959446848, "loss": 0.9176, "step": 9370 }, { "epoch": 1.126938333934367, "grad_norm": 0.6034179329872131, "learning_rate": 0.00048684253533552653, "loss": 0.9344, "step": 9375 }, { "epoch": 1.1275393677124654, "grad_norm": 0.6172097325325012, "learning_rate": 0.0004868272661107218, "loss": 0.6539, "step": 9380 }, { "epoch": 1.1281404014905638, "grad_norm": 0.6426337361335754, "learning_rate": 0.000486811988270826, "loss": 0.7199, "step": 9385 }, { "epoch": 1.1287414352686622, "grad_norm": 0.8425966501235962, "learning_rate": 0.0004867967018163949, "loss": 0.743, "step": 9390 }, { "epoch": 1.1293424690467604, "grad_norm": 0.7397813200950623, "learning_rate": 0.00048678140674798467, "loss": 0.8094, "step": 9395 }, { "epoch": 1.1299435028248588, "grad_norm": 0.4290348291397095, "learning_rate": 0.0004867661030661516, "loss": 0.8008, "step": 9400 }, { "epoch": 1.1299435028248588, "eval_loss": 1.9289062023162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1919, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 9400 }, { "epoch": 1.1305445366029572, "grad_norm": 0.5975921750068665, "learning_rate": 0.0004867507907714525, "loss": 0.891, "step": 9405 }, { "epoch": 1.1311455703810553, "grad_norm": 0.7869811654090881, "learning_rate": 0.0004867354698644442, "loss": 0.8297, "step": 9410 }, { "epoch": 1.1317466041591537, "grad_norm": 0.6262176632881165, "learning_rate": 0.00048672014034568413, "loss": 0.8273, "step": 9415 }, { "epoch": 1.1323476379372521, "grad_norm": 0.822317361831665, "learning_rate": 0.00048670480221573, "loss": 1.0664, "step": 9420 }, { "epoch": 1.1329486717153503, "grad_norm": 0.6900310516357422, "learning_rate": 0.00048668945547513976, "loss": 0.9531, "step": 9425 }, { "epoch": 1.1335497054934487, "grad_norm": 0.6054418087005615, "learning_rate": 0.0004866741001244716, "loss": 0.8953, "step": 9430 }, { "epoch": 1.134150739271547, "grad_norm": 0.3946428596973419, "learning_rate": 0.0004866587361642841, "loss": 0.6176, "step": 9435 }, { "epoch": 1.1347517730496455, "grad_norm": 0.3694043755531311, "learning_rate": 0.0004866433635951362, "loss": 0.966, "step": 9440 }, { "epoch": 1.1353528068277436, "grad_norm": 0.7611523270606995, "learning_rate": 0.0004866279824175871, "loss": 0.8445, "step": 9445 }, { "epoch": 1.135953840605842, "grad_norm": 0.6459202170372009, "learning_rate": 0.0004866125926321964, "loss": 0.8695, "step": 9450 }, { "epoch": 1.1365548743839404, "grad_norm": 0.586692214012146, "learning_rate": 0.00048659719423952373, "loss": 0.9332, "step": 9455 }, { "epoch": 1.1371559081620388, "grad_norm": 0.5236200094223022, "learning_rate": 0.00048658178724012946, "loss": 0.5746, "step": 9460 }, { "epoch": 1.137756941940137, "grad_norm": 0.5764670968055725, "learning_rate": 0.00048656637163457393, "loss": 0.7379, "step": 9465 }, { "epoch": 1.1383579757182354, "grad_norm": 0.5531455874443054, "learning_rate": 0.0004865509474234179, "loss": 0.9414, "step": 9470 }, { "epoch": 1.1389590094963338, "grad_norm": 0.6139127016067505, "learning_rate": 0.00048653551460722263, "loss": 0.548, "step": 9475 }, { "epoch": 1.139560043274432, "grad_norm": 0.9836716651916504, "learning_rate": 0.0004865200731865493, "loss": 1.1297, "step": 9480 }, { "epoch": 1.1401610770525303, "grad_norm": 0.4908493161201477, "learning_rate": 0.00048650462316195977, "loss": 0.7613, "step": 9485 }, { "epoch": 1.1407621108306287, "grad_norm": 0.6248270869255066, "learning_rate": 0.00048648916453401597, "loss": 0.6023, "step": 9490 }, { "epoch": 1.141363144608727, "grad_norm": 0.61272794008255, "learning_rate": 0.0004864736973032803, "loss": 1.007, "step": 9495 }, { "epoch": 1.1419641783868253, "grad_norm": 0.5007345080375671, "learning_rate": 0.00048645822147031545, "loss": 0.902, "step": 9500 }, { "epoch": 1.1425652121649237, "grad_norm": 0.7278372645378113, "learning_rate": 0.0004864427370356843, "loss": 0.809, "step": 9505 }, { "epoch": 1.143166245943022, "grad_norm": 0.5138121247291565, "learning_rate": 0.0004864272439999501, "loss": 1.0941, "step": 9510 }, { "epoch": 1.1437672797211202, "grad_norm": 0.544341504573822, "learning_rate": 0.0004864117423636766, "loss": 0.7891, "step": 9515 }, { "epoch": 1.1443683134992186, "grad_norm": 0.39205268025398254, "learning_rate": 0.00048639623212742763, "loss": 0.7164, "step": 9520 }, { "epoch": 1.144969347277317, "grad_norm": 0.5244433283805847, "learning_rate": 0.00048638071329176736, "loss": 0.6438, "step": 9525 }, { "epoch": 1.1455703810554154, "grad_norm": 0.7287188172340393, "learning_rate": 0.0004863651858572603, "loss": 0.9984, "step": 9530 }, { "epoch": 1.1461714148335136, "grad_norm": 0.5485352277755737, "learning_rate": 0.0004863496498244714, "loss": 1.0992, "step": 9535 }, { "epoch": 1.146772448611612, "grad_norm": 0.5940278768539429, "learning_rate": 0.00048633410519396573, "loss": 0.9422, "step": 9540 }, { "epoch": 1.1473734823897104, "grad_norm": 0.6309543251991272, "learning_rate": 0.00048631855196630883, "loss": 0.743, "step": 9545 }, { "epoch": 1.1479745161678085, "grad_norm": 0.4247833490371704, "learning_rate": 0.00048630299014206635, "loss": 0.7414, "step": 9550 }, { "epoch": 1.148575549945907, "grad_norm": 0.5805197954177856, "learning_rate": 0.0004862874197218046, "loss": 1.0941, "step": 9555 }, { "epoch": 1.1491765837240053, "grad_norm": 0.42623281478881836, "learning_rate": 0.00048627184070608974, "loss": 0.7973, "step": 9560 }, { "epoch": 1.1497776175021035, "grad_norm": 0.4706101715564728, "learning_rate": 0.0004862562530954887, "loss": 0.9137, "step": 9565 }, { "epoch": 1.150378651280202, "grad_norm": 0.5413990616798401, "learning_rate": 0.0004862406568905684, "loss": 0.7375, "step": 9570 }, { "epoch": 1.1509796850583003, "grad_norm": 0.46900811791419983, "learning_rate": 0.00048622505209189613, "loss": 0.5926, "step": 9575 }, { "epoch": 1.1515807188363987, "grad_norm": 0.5464361906051636, "learning_rate": 0.00048620943870003976, "loss": 0.8793, "step": 9580 }, { "epoch": 1.1521817526144968, "grad_norm": 0.5873452425003052, "learning_rate": 0.0004861938167155671, "loss": 1.1105, "step": 9585 }, { "epoch": 1.1527827863925952, "grad_norm": 0.5033503770828247, "learning_rate": 0.0004861781861390463, "loss": 0.634, "step": 9590 }, { "epoch": 1.1533838201706936, "grad_norm": 0.5663416385650635, "learning_rate": 0.00048616254697104625, "loss": 0.5824, "step": 9595 }, { "epoch": 1.153984853948792, "grad_norm": 0.7580047845840454, "learning_rate": 0.0004861468992121357, "loss": 0.8844, "step": 9600 }, { "epoch": 1.153984853948792, "eval_loss": 1.928613305091858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1882, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 9600 }, { "epoch": 1.1545858877268902, "grad_norm": 0.5153626203536987, "learning_rate": 0.0004861312428628839, "loss": 1.0789, "step": 9605 }, { "epoch": 1.1551869215049886, "grad_norm": 0.4683619737625122, "learning_rate": 0.00048611557792386037, "loss": 0.7207, "step": 9610 }, { "epoch": 1.155787955283087, "grad_norm": 0.7193109393119812, "learning_rate": 0.0004860999043956349, "loss": 1.0297, "step": 9615 }, { "epoch": 1.1563889890611851, "grad_norm": 0.6638111472129822, "learning_rate": 0.0004860842222787778, "loss": 0.8805, "step": 9620 }, { "epoch": 1.1569900228392835, "grad_norm": 0.571563184261322, "learning_rate": 0.00048606853157385935, "loss": 1.1223, "step": 9625 }, { "epoch": 1.157591056617382, "grad_norm": 0.696118950843811, "learning_rate": 0.00048605283228145046, "loss": 0.9039, "step": 9630 }, { "epoch": 1.1581920903954803, "grad_norm": 0.614761471748352, "learning_rate": 0.0004860371244021222, "loss": 0.7539, "step": 9635 }, { "epoch": 1.1587931241735785, "grad_norm": 0.5698915719985962, "learning_rate": 0.000486021407936446, "loss": 0.9117, "step": 9640 }, { "epoch": 1.1593941579516769, "grad_norm": 0.6086310148239136, "learning_rate": 0.0004860056828849936, "loss": 0.9313, "step": 9645 }, { "epoch": 1.1599951917297753, "grad_norm": 0.7498596906661987, "learning_rate": 0.0004859899492483368, "loss": 0.7418, "step": 9650 }, { "epoch": 1.1605962255078734, "grad_norm": 0.5920203328132629, "learning_rate": 0.0004859742070270483, "loss": 1.0391, "step": 9655 }, { "epoch": 1.1611972592859718, "grad_norm": 0.8326849341392517, "learning_rate": 0.00048595845622170054, "loss": 0.8324, "step": 9660 }, { "epoch": 1.1617982930640702, "grad_norm": 0.5928247570991516, "learning_rate": 0.00048594269683286653, "loss": 0.9938, "step": 9665 }, { "epoch": 1.1623993268421686, "grad_norm": 0.653587281703949, "learning_rate": 0.0004859269288611195, "loss": 0.7414, "step": 9670 }, { "epoch": 1.1630003606202668, "grad_norm": 0.7033923864364624, "learning_rate": 0.00048591115230703315, "loss": 0.6594, "step": 9675 }, { "epoch": 1.1636013943983652, "grad_norm": 0.8698201179504395, "learning_rate": 0.00048589536717118134, "loss": 0.8766, "step": 9680 }, { "epoch": 1.1642024281764636, "grad_norm": 0.5177077651023865, "learning_rate": 0.0004858795734541382, "loss": 0.7887, "step": 9685 }, { "epoch": 1.1648034619545617, "grad_norm": 0.4896091818809509, "learning_rate": 0.0004858637711564784, "loss": 0.8078, "step": 9690 }, { "epoch": 1.1654044957326601, "grad_norm": 0.665745735168457, "learning_rate": 0.0004858479602787767, "loss": 0.8969, "step": 9695 }, { "epoch": 1.1660055295107585, "grad_norm": 0.5258559584617615, "learning_rate": 0.00048583214082160834, "loss": 0.5965, "step": 9700 }, { "epoch": 1.166606563288857, "grad_norm": 0.4834305942058563, "learning_rate": 0.0004858163127855486, "loss": 1.0508, "step": 9705 }, { "epoch": 1.167207597066955, "grad_norm": 0.7334483861923218, "learning_rate": 0.00048580047617117345, "loss": 1.1414, "step": 9710 }, { "epoch": 1.1678086308450535, "grad_norm": 0.5815469026565552, "learning_rate": 0.0004857846309790589, "loss": 1.1867, "step": 9715 }, { "epoch": 1.1684096646231519, "grad_norm": 0.5477116703987122, "learning_rate": 0.0004857687772097814, "loss": 0.5773, "step": 9720 }, { "epoch": 1.1690106984012503, "grad_norm": 0.35547226667404175, "learning_rate": 0.0004857529148639176, "loss": 0.9297, "step": 9725 }, { "epoch": 1.1696117321793484, "grad_norm": 0.6704192161560059, "learning_rate": 0.00048573704394204453, "loss": 1.2063, "step": 9730 }, { "epoch": 1.1702127659574468, "grad_norm": 0.7087374329566956, "learning_rate": 0.00048572116444473956, "loss": 0.5887, "step": 9735 }, { "epoch": 1.1708137997355452, "grad_norm": 0.735370397567749, "learning_rate": 0.0004857052763725803, "loss": 0.8164, "step": 9740 }, { "epoch": 1.1714148335136434, "grad_norm": 0.7930375337600708, "learning_rate": 0.0004856893797261448, "loss": 0.8031, "step": 9745 }, { "epoch": 1.1720158672917418, "grad_norm": 0.6161326169967651, "learning_rate": 0.00048567347450601125, "loss": 0.666, "step": 9750 }, { "epoch": 1.1726169010698402, "grad_norm": 0.6328686475753784, "learning_rate": 0.00048565756071275825, "loss": 0.9383, "step": 9755 }, { "epoch": 1.1732179348479383, "grad_norm": 0.6329209804534912, "learning_rate": 0.00048564163834696473, "loss": 0.9047, "step": 9760 }, { "epoch": 1.1738189686260367, "grad_norm": 1.1697479486465454, "learning_rate": 0.0004856257074092099, "loss": 0.8203, "step": 9765 }, { "epoch": 1.1744200024041351, "grad_norm": 0.4956655502319336, "learning_rate": 0.00048560976790007314, "loss": 0.9742, "step": 9770 }, { "epoch": 1.1750210361822335, "grad_norm": 0.5287923216819763, "learning_rate": 0.0004855938198201345, "loss": 0.7641, "step": 9775 }, { "epoch": 1.1756220699603317, "grad_norm": 0.5474404692649841, "learning_rate": 0.0004855778631699741, "loss": 0.9648, "step": 9780 }, { "epoch": 1.17622310373843, "grad_norm": 0.5423691272735596, "learning_rate": 0.00048556189795017225, "loss": 0.7352, "step": 9785 }, { "epoch": 1.1768241375165285, "grad_norm": 0.4579305052757263, "learning_rate": 0.00048554592416130975, "loss": 0.8719, "step": 9790 }, { "epoch": 1.1774251712946269, "grad_norm": 0.5062451362609863, "learning_rate": 0.00048552994180396783, "loss": 0.818, "step": 9795 }, { "epoch": 1.178026205072725, "grad_norm": 0.47659769654273987, "learning_rate": 0.0004855139508787276, "loss": 0.8531, "step": 9800 }, { "epoch": 1.178026205072725, "eval_loss": 1.951171875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1762, "eval_samples_per_second": 4.549, "eval_steps_per_second": 1.137, "step": 9800 }, { "epoch": 1.1786272388508234, "grad_norm": 0.5294814705848694, "learning_rate": 0.00048549795138617117, "loss": 1.1641, "step": 9805 }, { "epoch": 1.1792282726289218, "grad_norm": 0.6930984258651733, "learning_rate": 0.00048548194332688016, "loss": 0.7887, "step": 9810 }, { "epoch": 1.17982930640702, "grad_norm": 0.5947431921958923, "learning_rate": 0.00048546592670143713, "loss": 1.0766, "step": 9815 }, { "epoch": 1.1804303401851184, "grad_norm": 0.47059887647628784, "learning_rate": 0.00048544990151042463, "loss": 0.941, "step": 9820 }, { "epoch": 1.1810313739632168, "grad_norm": 0.6924253702163696, "learning_rate": 0.0004854338677544257, "loss": 1.0492, "step": 9825 }, { "epoch": 1.181632407741315, "grad_norm": 0.7252630591392517, "learning_rate": 0.0004854178254340234, "loss": 0.8555, "step": 9830 }, { "epoch": 1.1822334415194133, "grad_norm": 0.7214177250862122, "learning_rate": 0.0004854017745498015, "loss": 0.9492, "step": 9835 }, { "epoch": 1.1828344752975117, "grad_norm": 0.6109257340431213, "learning_rate": 0.00048538571510234384, "loss": 0.7238, "step": 9840 }, { "epoch": 1.1834355090756101, "grad_norm": 0.635126531124115, "learning_rate": 0.0004853696470922346, "loss": 0.6645, "step": 9845 }, { "epoch": 1.1840365428537083, "grad_norm": 0.5315448641777039, "learning_rate": 0.0004853535705200582, "loss": 0.9516, "step": 9850 }, { "epoch": 1.1846375766318067, "grad_norm": 0.49046024680137634, "learning_rate": 0.00048533748538639966, "loss": 0.5828, "step": 9855 }, { "epoch": 1.185238610409905, "grad_norm": 0.5722445249557495, "learning_rate": 0.000485321391691844, "loss": 1.0918, "step": 9860 }, { "epoch": 1.1858396441880035, "grad_norm": 0.7618870735168457, "learning_rate": 0.0004853052894369766, "loss": 0.9594, "step": 9865 }, { "epoch": 1.1864406779661016, "grad_norm": 0.6157470345497131, "learning_rate": 0.0004852891786223832, "loss": 1.3977, "step": 9870 }, { "epoch": 1.1870417117442, "grad_norm": 0.6484712958335876, "learning_rate": 0.0004852730592486501, "loss": 0.6188, "step": 9875 }, { "epoch": 1.1876427455222984, "grad_norm": 0.5407760143280029, "learning_rate": 0.0004852569313163634, "loss": 0.9453, "step": 9880 }, { "epoch": 1.1882437793003966, "grad_norm": 0.8038312792778015, "learning_rate": 0.0004852407948261099, "loss": 0.8867, "step": 9885 }, { "epoch": 1.188844813078495, "grad_norm": 0.6100638508796692, "learning_rate": 0.0004852246497784766, "loss": 0.8906, "step": 9890 }, { "epoch": 1.1894458468565934, "grad_norm": 0.6055903434753418, "learning_rate": 0.00048520849617405084, "loss": 0.9688, "step": 9895 }, { "epoch": 1.1900468806346916, "grad_norm": 0.6793960928916931, "learning_rate": 0.00048519233401342023, "loss": 1.1234, "step": 9900 }, { "epoch": 1.19064791441279, "grad_norm": 0.5068621635437012, "learning_rate": 0.00048517616329717264, "loss": 0.8438, "step": 9905 }, { "epoch": 1.1912489481908883, "grad_norm": 0.7873875498771667, "learning_rate": 0.00048515998402589634, "loss": 0.9785, "step": 9910 }, { "epoch": 1.1918499819689867, "grad_norm": 0.6555593609809875, "learning_rate": 0.0004851437962001799, "loss": 1.1812, "step": 9915 }, { "epoch": 1.192451015747085, "grad_norm": 0.804202675819397, "learning_rate": 0.0004851275998206122, "loss": 0.9246, "step": 9920 }, { "epoch": 1.1930520495251833, "grad_norm": 0.6544862985610962, "learning_rate": 0.00048511139488778245, "loss": 0.698, "step": 9925 }, { "epoch": 1.1936530833032817, "grad_norm": 0.46155181527137756, "learning_rate": 0.00048509518140228003, "loss": 0.5973, "step": 9930 }, { "epoch": 1.19425411708138, "grad_norm": 0.5717451572418213, "learning_rate": 0.00048507895936469484, "loss": 1.0312, "step": 9935 }, { "epoch": 1.1948551508594782, "grad_norm": 0.6018429398536682, "learning_rate": 0.000485062728775617, "loss": 0.8332, "step": 9940 }, { "epoch": 1.1954561846375766, "grad_norm": 0.5063443183898926, "learning_rate": 0.0004850464896356368, "loss": 0.8344, "step": 9945 }, { "epoch": 1.196057218415675, "grad_norm": 0.5430408716201782, "learning_rate": 0.00048503024194534506, "loss": 0.7766, "step": 9950 }, { "epoch": 1.1966582521937732, "grad_norm": 0.41651999950408936, "learning_rate": 0.00048501398570533285, "loss": 1.2418, "step": 9955 }, { "epoch": 1.1972592859718716, "grad_norm": 0.6759993433952332, "learning_rate": 0.00048499772091619147, "loss": 1.1461, "step": 9960 }, { "epoch": 1.19786031974997, "grad_norm": 1.0700994729995728, "learning_rate": 0.0004849814475785127, "loss": 1.2102, "step": 9965 }, { "epoch": 1.1984613535280684, "grad_norm": 0.5895488858222961, "learning_rate": 0.00048496516569288835, "loss": 1.0016, "step": 9970 }, { "epoch": 1.1990623873061665, "grad_norm": 0.475639671087265, "learning_rate": 0.00048494887525991085, "loss": 1.0008, "step": 9975 }, { "epoch": 1.199663421084265, "grad_norm": 0.6905430555343628, "learning_rate": 0.0004849325762801726, "loss": 0.7523, "step": 9980 }, { "epoch": 1.2002644548623633, "grad_norm": 0.5461141467094421, "learning_rate": 0.00048491626875426673, "loss": 1.5016, "step": 9985 }, { "epoch": 1.2008654886404615, "grad_norm": 0.6916874647140503, "learning_rate": 0.0004848999526827864, "loss": 0.9352, "step": 9990 }, { "epoch": 1.20146652241856, "grad_norm": 0.6212778687477112, "learning_rate": 0.00048488362806632516, "loss": 1.1289, "step": 9995 }, { "epoch": 1.2020675561966583, "grad_norm": 0.5597301125526428, "learning_rate": 0.00048486729490547677, "loss": 0.7359, "step": 10000 }, { "epoch": 1.2020675561966583, "eval_loss": 1.904687523841858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1776, "eval_samples_per_second": 4.548, "eval_steps_per_second": 1.137, "step": 10000 }, { "epoch": 1.2026685899747567, "grad_norm": 0.6090041995048523, "learning_rate": 0.00048485095320083537, "loss": 0.8406, "step": 10005 }, { "epoch": 1.2032696237528548, "grad_norm": 0.3706042170524597, "learning_rate": 0.0004848346029529956, "loss": 0.5691, "step": 10010 }, { "epoch": 1.2038706575309532, "grad_norm": 0.6342514753341675, "learning_rate": 0.000484818244162552, "loss": 0.7844, "step": 10015 }, { "epoch": 1.2044716913090516, "grad_norm": 0.7507076263427734, "learning_rate": 0.0004848018768300998, "loss": 0.6617, "step": 10020 }, { "epoch": 1.2050727250871498, "grad_norm": 0.902039647102356, "learning_rate": 0.0004847855009562344, "loss": 0.9461, "step": 10025 }, { "epoch": 1.2056737588652482, "grad_norm": 0.6845393180847168, "learning_rate": 0.00048476911654155144, "loss": 0.8125, "step": 10030 }, { "epoch": 1.2062747926433466, "grad_norm": 1.0045607089996338, "learning_rate": 0.00048475272358664695, "loss": 1.0703, "step": 10035 }, { "epoch": 1.206875826421445, "grad_norm": 0.5999272465705872, "learning_rate": 0.00048473632209211734, "loss": 0.8648, "step": 10040 }, { "epoch": 1.2074768601995431, "grad_norm": 0.6186231970787048, "learning_rate": 0.00048471991205855915, "loss": 1.1133, "step": 10045 }, { "epoch": 1.2080778939776415, "grad_norm": 0.5896947383880615, "learning_rate": 0.00048470349348656933, "loss": 0.7109, "step": 10050 }, { "epoch": 1.20867892775574, "grad_norm": 0.7153265476226807, "learning_rate": 0.0004846870663767452, "loss": 1.2367, "step": 10055 }, { "epoch": 1.2092799615338383, "grad_norm": 0.554258406162262, "learning_rate": 0.00048467063072968433, "loss": 0.9734, "step": 10060 }, { "epoch": 1.2098809953119365, "grad_norm": 0.6269187331199646, "learning_rate": 0.0004846541865459846, "loss": 1.0437, "step": 10065 }, { "epoch": 1.2104820290900349, "grad_norm": 0.6897535920143127, "learning_rate": 0.0004846377338262441, "loss": 1.0359, "step": 10070 }, { "epoch": 1.2110830628681333, "grad_norm": 0.613079309463501, "learning_rate": 0.0004846212725710615, "loss": 0.9234, "step": 10075 }, { "epoch": 1.2116840966462314, "grad_norm": 0.5149432420730591, "learning_rate": 0.00048460480278103546, "loss": 0.8484, "step": 10080 }, { "epoch": 1.2122851304243298, "grad_norm": 0.7728766798973083, "learning_rate": 0.0004845883244567652, "loss": 0.7676, "step": 10085 }, { "epoch": 1.2128861642024282, "grad_norm": 0.3584342896938324, "learning_rate": 0.00048457183759885013, "loss": 0.6715, "step": 10090 }, { "epoch": 1.2134871979805264, "grad_norm": 0.6881641745567322, "learning_rate": 0.00048455534220788997, "loss": 1.1766, "step": 10095 }, { "epoch": 1.2140882317586248, "grad_norm": 0.6357237100601196, "learning_rate": 0.00048453883828448476, "loss": 0.6047, "step": 10100 }, { "epoch": 1.2146892655367232, "grad_norm": 0.6761031746864319, "learning_rate": 0.00048452232582923494, "loss": 0.9258, "step": 10105 }, { "epoch": 1.2152902993148216, "grad_norm": 0.5906206965446472, "learning_rate": 0.00048450580484274116, "loss": 0.5672, "step": 10110 }, { "epoch": 1.2158913330929197, "grad_norm": 0.7813739776611328, "learning_rate": 0.0004844892753256044, "loss": 1.3391, "step": 10115 }, { "epoch": 1.2164923668710181, "grad_norm": 1.0761452913284302, "learning_rate": 0.00048447273727842587, "loss": 0.9508, "step": 10120 }, { "epoch": 1.2170934006491165, "grad_norm": 0.5351290702819824, "learning_rate": 0.00048445619070180724, "loss": 0.7332, "step": 10125 }, { "epoch": 1.217694434427215, "grad_norm": 0.5338313579559326, "learning_rate": 0.00048443963559635054, "loss": 1.2047, "step": 10130 }, { "epoch": 1.218295468205313, "grad_norm": 0.7655940651893616, "learning_rate": 0.0004844230719626578, "loss": 0.968, "step": 10135 }, { "epoch": 1.2188965019834115, "grad_norm": 0.6944729089736938, "learning_rate": 0.00048440649980133167, "loss": 0.8883, "step": 10140 }, { "epoch": 1.2194975357615099, "grad_norm": 0.8016156554222107, "learning_rate": 0.000484389919112975, "loss": 1.0898, "step": 10145 }, { "epoch": 1.220098569539608, "grad_norm": 0.7745673656463623, "learning_rate": 0.0004843733298981909, "loss": 0.9094, "step": 10150 }, { "epoch": 1.2206996033177064, "grad_norm": 0.975824773311615, "learning_rate": 0.0004843567321575829, "loss": 0.7086, "step": 10155 }, { "epoch": 1.2213006370958048, "grad_norm": 0.5339833498001099, "learning_rate": 0.00048434012589175474, "loss": 0.7812, "step": 10160 }, { "epoch": 1.221901670873903, "grad_norm": 0.5753735303878784, "learning_rate": 0.0004843235111013105, "loss": 0.5359, "step": 10165 }, { "epoch": 1.2225027046520014, "grad_norm": 0.4889537990093231, "learning_rate": 0.0004843068877868546, "loss": 0.6418, "step": 10170 }, { "epoch": 1.2231037384300998, "grad_norm": 0.6564207077026367, "learning_rate": 0.00048429025594899167, "loss": 0.6727, "step": 10175 }, { "epoch": 1.2237047722081982, "grad_norm": 0.5597406029701233, "learning_rate": 0.00048427361558832686, "loss": 0.9297, "step": 10180 }, { "epoch": 1.2243058059862963, "grad_norm": 0.6608861684799194, "learning_rate": 0.00048425696670546537, "loss": 0.8266, "step": 10185 }, { "epoch": 1.2249068397643947, "grad_norm": 0.5560731887817383, "learning_rate": 0.00048424030930101296, "loss": 0.9406, "step": 10190 }, { "epoch": 1.2255078735424931, "grad_norm": 0.7094256281852722, "learning_rate": 0.0004842236433755755, "loss": 0.8523, "step": 10195 }, { "epoch": 1.2261089073205915, "grad_norm": 0.5122326016426086, "learning_rate": 0.0004842069689297592, "loss": 0.7648, "step": 10200 }, { "epoch": 1.2261089073205915, "eval_loss": 1.9375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.195, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 10200 }, { "epoch": 1.2267099410986897, "grad_norm": 0.7169754505157471, "learning_rate": 0.0004841902859641708, "loss": 0.9453, "step": 10205 }, { "epoch": 1.227310974876788, "grad_norm": 0.9155317544937134, "learning_rate": 0.00048417359447941704, "loss": 1.1164, "step": 10210 }, { "epoch": 1.2279120086548865, "grad_norm": 0.4512961506843567, "learning_rate": 0.0004841568944761051, "loss": 1.0234, "step": 10215 }, { "epoch": 1.2285130424329846, "grad_norm": 0.8214110732078552, "learning_rate": 0.00048414018595484254, "loss": 0.8758, "step": 10220 }, { "epoch": 1.229114076211083, "grad_norm": 0.821488618850708, "learning_rate": 0.00048412346891623715, "loss": 0.998, "step": 10225 }, { "epoch": 1.2297151099891814, "grad_norm": 0.6312018036842346, "learning_rate": 0.00048410674336089713, "loss": 0.7047, "step": 10230 }, { "epoch": 1.2303161437672796, "grad_norm": 0.7122650146484375, "learning_rate": 0.0004840900092894307, "loss": 1.1125, "step": 10235 }, { "epoch": 1.230917177545378, "grad_norm": 0.6051816940307617, "learning_rate": 0.0004840732667024467, "loss": 0.7672, "step": 10240 }, { "epoch": 1.2315182113234764, "grad_norm": 0.7672595977783203, "learning_rate": 0.0004840565156005543, "loss": 0.8195, "step": 10245 }, { "epoch": 1.2321192451015748, "grad_norm": 0.6409992575645447, "learning_rate": 0.0004840397559843627, "loss": 0.8656, "step": 10250 }, { "epoch": 1.232720278879673, "grad_norm": 0.45830869674682617, "learning_rate": 0.00048402298785448156, "loss": 0.7516, "step": 10255 }, { "epoch": 1.2333213126577713, "grad_norm": 0.876666247844696, "learning_rate": 0.00048400621121152097, "loss": 0.959, "step": 10260 }, { "epoch": 1.2339223464358697, "grad_norm": 0.5949916243553162, "learning_rate": 0.0004839894260560912, "loss": 0.5703, "step": 10265 }, { "epoch": 1.2345233802139681, "grad_norm": 0.7354668974876404, "learning_rate": 0.0004839726323888027, "loss": 1.0609, "step": 10270 }, { "epoch": 1.2351244139920663, "grad_norm": 0.5362606048583984, "learning_rate": 0.0004839558302102666, "loss": 1.3273, "step": 10275 }, { "epoch": 1.2357254477701647, "grad_norm": 0.7297337651252747, "learning_rate": 0.00048393901952109385, "loss": 0.7195, "step": 10280 }, { "epoch": 1.236326481548263, "grad_norm": 0.6569810509681702, "learning_rate": 0.00048392220032189624, "loss": 0.9305, "step": 10285 }, { "epoch": 1.2369275153263612, "grad_norm": 0.5746892094612122, "learning_rate": 0.0004839053726132854, "loss": 0.6113, "step": 10290 }, { "epoch": 1.2375285491044596, "grad_norm": 0.6257365345954895, "learning_rate": 0.00048388853639587365, "loss": 0.8148, "step": 10295 }, { "epoch": 1.238129582882558, "grad_norm": 0.7595178484916687, "learning_rate": 0.0004838716916702733, "loss": 0.9664, "step": 10300 }, { "epoch": 1.2387306166606564, "grad_norm": 0.6051932573318481, "learning_rate": 0.0004838548384370972, "loss": 0.8414, "step": 10305 }, { "epoch": 1.2393316504387546, "grad_norm": 0.7046768665313721, "learning_rate": 0.00048383797669695826, "loss": 0.8695, "step": 10310 }, { "epoch": 1.239932684216853, "grad_norm": 0.45860451459884644, "learning_rate": 0.0004838211064504701, "loss": 0.8789, "step": 10315 }, { "epoch": 1.2405337179949514, "grad_norm": 0.7311157584190369, "learning_rate": 0.0004838042276982463, "loss": 0.8516, "step": 10320 }, { "epoch": 1.2411347517730495, "grad_norm": 0.5888791084289551, "learning_rate": 0.0004837873404409008, "loss": 0.9828, "step": 10325 }, { "epoch": 1.241735785551148, "grad_norm": 0.5652986168861389, "learning_rate": 0.00048377044467904794, "loss": 0.6574, "step": 10330 }, { "epoch": 1.2423368193292463, "grad_norm": 0.5203643441200256, "learning_rate": 0.00048375354041330245, "loss": 0.6844, "step": 10335 }, { "epoch": 1.2429378531073447, "grad_norm": 0.6786804795265198, "learning_rate": 0.00048373662764427907, "loss": 0.6109, "step": 10340 }, { "epoch": 1.243538886885443, "grad_norm": 0.6723741888999939, "learning_rate": 0.0004837197063725932, "loss": 0.9234, "step": 10345 }, { "epoch": 1.2441399206635413, "grad_norm": 0.7140915393829346, "learning_rate": 0.00048370277659886033, "loss": 1.0063, "step": 10350 }, { "epoch": 1.2447409544416397, "grad_norm": 0.7569808959960938, "learning_rate": 0.0004836858383236963, "loss": 0.6195, "step": 10355 }, { "epoch": 1.2453419882197378, "grad_norm": 0.5529028177261353, "learning_rate": 0.0004836688915477173, "loss": 0.9773, "step": 10360 }, { "epoch": 1.2459430219978362, "grad_norm": 0.9198957085609436, "learning_rate": 0.0004836519362715398, "loss": 1.0242, "step": 10365 }, { "epoch": 1.2465440557759346, "grad_norm": 0.6676874160766602, "learning_rate": 0.0004836349724957806, "loss": 0.8078, "step": 10370 }, { "epoch": 1.247145089554033, "grad_norm": 0.5963857769966125, "learning_rate": 0.00048361800022105674, "loss": 1.1363, "step": 10375 }, { "epoch": 1.2477461233321312, "grad_norm": 0.5967381000518799, "learning_rate": 0.0004836010194479856, "loss": 0.9086, "step": 10380 }, { "epoch": 1.2483471571102296, "grad_norm": 0.5581762194633484, "learning_rate": 0.000483584030177185, "loss": 0.7625, "step": 10385 }, { "epoch": 1.248948190888328, "grad_norm": 0.5977279543876648, "learning_rate": 0.0004835670324092729, "loss": 0.7891, "step": 10390 }, { "epoch": 1.2495492246664264, "grad_norm": 0.5720333456993103, "learning_rate": 0.0004835500261448676, "loss": 0.8578, "step": 10395 }, { "epoch": 1.2501502584445245, "grad_norm": 0.5132938623428345, "learning_rate": 0.00048353301138458785, "loss": 0.8215, "step": 10400 }, { "epoch": 1.2501502584445245, "eval_loss": 1.930273413658142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1653, "eval_samples_per_second": 4.55, "eval_steps_per_second": 1.137, "step": 10400 }, { "epoch": 1.250751292222623, "grad_norm": 0.6224148273468018, "learning_rate": 0.0004835159881290524, "loss": 0.9672, "step": 10405 }, { "epoch": 1.2513523260007213, "grad_norm": 0.5987078547477722, "learning_rate": 0.0004834989563788807, "loss": 0.8496, "step": 10410 }, { "epoch": 1.2519533597788195, "grad_norm": 0.8181197643280029, "learning_rate": 0.00048348191613469227, "loss": 0.8148, "step": 10415 }, { "epoch": 1.2525543935569179, "grad_norm": 0.49654489755630493, "learning_rate": 0.00048346486739710696, "loss": 1.0664, "step": 10420 }, { "epoch": 1.2531554273350163, "grad_norm": 0.7340275645256042, "learning_rate": 0.00048344781016674486, "loss": 0.7961, "step": 10425 }, { "epoch": 1.2537564611131145, "grad_norm": 0.4701334834098816, "learning_rate": 0.0004834307444442266, "loss": 0.9113, "step": 10430 }, { "epoch": 1.2543574948912128, "grad_norm": 0.7004857063293457, "learning_rate": 0.00048341367023017304, "loss": 0.7937, "step": 10435 }, { "epoch": 1.2549585286693112, "grad_norm": 0.6275933980941772, "learning_rate": 0.00048339658752520506, "loss": 0.6195, "step": 10440 }, { "epoch": 1.2555595624474096, "grad_norm": 0.8820484280586243, "learning_rate": 0.0004833794963299442, "loss": 0.9828, "step": 10445 }, { "epoch": 1.2561605962255078, "grad_norm": 0.6559478640556335, "learning_rate": 0.00048336239664501223, "loss": 1.3375, "step": 10450 }, { "epoch": 1.2567616300036062, "grad_norm": 0.644068717956543, "learning_rate": 0.00048334528847103123, "loss": 0.8469, "step": 10455 }, { "epoch": 1.2573626637817046, "grad_norm": 0.7082321047782898, "learning_rate": 0.0004833281718086233, "loss": 0.9148, "step": 10460 }, { "epoch": 1.257963697559803, "grad_norm": 0.8087785243988037, "learning_rate": 0.0004833110466584114, "loss": 0.5, "step": 10465 }, { "epoch": 1.2585647313379011, "grad_norm": 0.5967214107513428, "learning_rate": 0.0004832939130210183, "loss": 0.8352, "step": 10470 }, { "epoch": 1.2591657651159995, "grad_norm": 0.950689971446991, "learning_rate": 0.00048327677089706736, "loss": 1.05, "step": 10475 }, { "epoch": 1.259766798894098, "grad_norm": 0.63368159532547, "learning_rate": 0.00048325962028718207, "loss": 0.7961, "step": 10480 }, { "epoch": 1.260367832672196, "grad_norm": 0.5194833874702454, "learning_rate": 0.00048324246119198645, "loss": 0.6227, "step": 10485 }, { "epoch": 1.2609688664502945, "grad_norm": 0.5865866541862488, "learning_rate": 0.00048322529361210457, "loss": 0.9512, "step": 10490 }, { "epoch": 1.2615699002283929, "grad_norm": 0.7601271271705627, "learning_rate": 0.00048320811754816104, "loss": 0.9625, "step": 10495 }, { "epoch": 1.262170934006491, "grad_norm": 0.6120428442955017, "learning_rate": 0.0004831909330007805, "loss": 1.0117, "step": 10500 }, { "epoch": 1.2627719677845894, "grad_norm": 0.870328962802887, "learning_rate": 0.0004831737399705884, "loss": 0.9016, "step": 10505 }, { "epoch": 1.2633730015626878, "grad_norm": 0.49900364875793457, "learning_rate": 0.0004831565384582098, "loss": 0.8125, "step": 10510 }, { "epoch": 1.2639740353407862, "grad_norm": 0.770746111869812, "learning_rate": 0.00048313932846427075, "loss": 1.5344, "step": 10515 }, { "epoch": 1.2645750691188844, "grad_norm": 0.7275922894477844, "learning_rate": 0.0004831221099893971, "loss": 0.9895, "step": 10520 }, { "epoch": 1.2651761028969828, "grad_norm": 0.4460059404373169, "learning_rate": 0.0004831048830342153, "loss": 0.8, "step": 10525 }, { "epoch": 1.2657771366750812, "grad_norm": 0.5242088437080383, "learning_rate": 0.00048308764759935196, "loss": 0.7379, "step": 10530 }, { "epoch": 1.2663781704531796, "grad_norm": 0.7097766995429993, "learning_rate": 0.00048307040368543414, "loss": 0.8555, "step": 10535 }, { "epoch": 1.2669792042312777, "grad_norm": 0.9315968155860901, "learning_rate": 0.000483053151293089, "loss": 0.9016, "step": 10540 }, { "epoch": 1.2675802380093761, "grad_norm": 0.6095036864280701, "learning_rate": 0.0004830358904229443, "loss": 1.0625, "step": 10545 }, { "epoch": 1.2681812717874745, "grad_norm": 0.827241063117981, "learning_rate": 0.00048301862107562776, "loss": 1.243, "step": 10550 }, { "epoch": 1.2687823055655727, "grad_norm": 0.6420729756355286, "learning_rate": 0.0004830013432517677, "loss": 0.7918, "step": 10555 }, { "epoch": 1.269383339343671, "grad_norm": 0.7661915421485901, "learning_rate": 0.00048298405695199254, "loss": 0.9766, "step": 10560 }, { "epoch": 1.2699843731217695, "grad_norm": 0.5343917012214661, "learning_rate": 0.00048296676217693125, "loss": 0.8191, "step": 10565 }, { "epoch": 1.2705854068998677, "grad_norm": 0.6964761018753052, "learning_rate": 0.00048294945892721295, "loss": 0.7746, "step": 10570 }, { "epoch": 1.271186440677966, "grad_norm": 0.5846682190895081, "learning_rate": 0.0004829321472034669, "loss": 0.8102, "step": 10575 }, { "epoch": 1.2717874744560644, "grad_norm": 0.7645305395126343, "learning_rate": 0.00048291482700632305, "loss": 0.9766, "step": 10580 }, { "epoch": 1.2723885082341628, "grad_norm": 0.5629719495773315, "learning_rate": 0.00048289749833641134, "loss": 0.6555, "step": 10585 }, { "epoch": 1.2729895420122612, "grad_norm": 0.6958023905754089, "learning_rate": 0.00048288016119436215, "loss": 1.2594, "step": 10590 }, { "epoch": 1.2735905757903594, "grad_norm": 0.6174116730690002, "learning_rate": 0.00048286281558080626, "loss": 0.7344, "step": 10595 }, { "epoch": 1.2741916095684578, "grad_norm": 0.6529636383056641, "learning_rate": 0.00048284546149637457, "loss": 0.8273, "step": 10600 }, { "epoch": 1.2741916095684578, "eval_loss": 1.9465820789337158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2056, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 10600 }, { "epoch": 1.2747926433465562, "grad_norm": 0.6496317982673645, "learning_rate": 0.00048282809894169835, "loss": 0.8922, "step": 10605 }, { "epoch": 1.2753936771246543, "grad_norm": 0.6146672368049622, "learning_rate": 0.00048281072791740915, "loss": 0.557, "step": 10610 }, { "epoch": 1.2759947109027527, "grad_norm": 0.6771122217178345, "learning_rate": 0.000482793348424139, "loss": 0.6547, "step": 10615 }, { "epoch": 1.2765957446808511, "grad_norm": 0.6106471419334412, "learning_rate": 0.00048277596046252013, "loss": 0.9133, "step": 10620 }, { "epoch": 1.2771967784589493, "grad_norm": 0.6613155603408813, "learning_rate": 0.000482758564033185, "loss": 1.025, "step": 10625 }, { "epoch": 1.2777978122370477, "grad_norm": 1.1877671480178833, "learning_rate": 0.00048274115913676634, "loss": 1.0191, "step": 10630 }, { "epoch": 1.278398846015146, "grad_norm": 0.811174213886261, "learning_rate": 0.0004827237457738975, "loss": 1.2445, "step": 10635 }, { "epoch": 1.2789998797932443, "grad_norm": 0.5439468622207642, "learning_rate": 0.00048270632394521176, "loss": 0.8379, "step": 10640 }, { "epoch": 1.2796009135713426, "grad_norm": 0.5261412858963013, "learning_rate": 0.000482688893651343, "loss": 0.725, "step": 10645 }, { "epoch": 1.280201947349441, "grad_norm": 0.6249575614929199, "learning_rate": 0.0004826714548929252, "loss": 0.7688, "step": 10650 }, { "epoch": 1.2808029811275394, "grad_norm": 0.8901901841163635, "learning_rate": 0.00048265400767059273, "loss": 1.2398, "step": 10655 }, { "epoch": 1.2814040149056378, "grad_norm": 0.6960535049438477, "learning_rate": 0.0004826365519849803, "loss": 0.7609, "step": 10660 }, { "epoch": 1.282005048683736, "grad_norm": 0.6364520192146301, "learning_rate": 0.0004826190878367229, "loss": 1.1098, "step": 10665 }, { "epoch": 1.2826060824618344, "grad_norm": 0.8415795564651489, "learning_rate": 0.00048260161522645586, "loss": 1.1313, "step": 10670 }, { "epoch": 1.2832071162399328, "grad_norm": 0.5826691389083862, "learning_rate": 0.00048258413415481463, "loss": 0.607, "step": 10675 }, { "epoch": 1.283808150018031, "grad_norm": 0.3610847592353821, "learning_rate": 0.00048256664462243527, "loss": 0.7699, "step": 10680 }, { "epoch": 1.2844091837961293, "grad_norm": 0.7859867811203003, "learning_rate": 0.00048254914662995403, "loss": 1.1211, "step": 10685 }, { "epoch": 1.2850102175742277, "grad_norm": 0.4006088674068451, "learning_rate": 0.0004825316401780073, "loss": 0.7266, "step": 10690 }, { "epoch": 1.285611251352326, "grad_norm": 0.6036331653594971, "learning_rate": 0.000482514125267232, "loss": 0.6441, "step": 10695 }, { "epoch": 1.2862122851304243, "grad_norm": 0.5787040591239929, "learning_rate": 0.0004824966018982653, "loss": 0.8008, "step": 10700 }, { "epoch": 1.2868133189085227, "grad_norm": 0.4928837716579437, "learning_rate": 0.00048247907007174454, "loss": 0.9367, "step": 10705 }, { "epoch": 1.2874143526866209, "grad_norm": 0.7626219391822815, "learning_rate": 0.00048246152978830746, "loss": 0.8125, "step": 10710 }, { "epoch": 1.2880153864647192, "grad_norm": 0.6299381256103516, "learning_rate": 0.00048244398104859234, "loss": 0.932, "step": 10715 }, { "epoch": 1.2886164202428176, "grad_norm": 0.8862885236740112, "learning_rate": 0.00048242642385323733, "loss": 1.0891, "step": 10720 }, { "epoch": 1.289217454020916, "grad_norm": 0.6062028408050537, "learning_rate": 0.00048240885820288124, "loss": 0.9187, "step": 10725 }, { "epoch": 1.2898184877990144, "grad_norm": 0.4260680377483368, "learning_rate": 0.000482391284098163, "loss": 0.9898, "step": 10730 }, { "epoch": 1.2904195215771126, "grad_norm": 0.5453752875328064, "learning_rate": 0.0004823737015397219, "loss": 1.0461, "step": 10735 }, { "epoch": 1.291020555355211, "grad_norm": 0.5417711734771729, "learning_rate": 0.00048235611052819757, "loss": 1.1953, "step": 10740 }, { "epoch": 1.2916215891333094, "grad_norm": 0.5425593256950378, "learning_rate": 0.0004823385110642299, "loss": 0.6766, "step": 10745 }, { "epoch": 1.2922226229114075, "grad_norm": 0.5413578152656555, "learning_rate": 0.0004823209031484591, "loss": 0.8602, "step": 10750 }, { "epoch": 1.292823656689506, "grad_norm": 0.8350200653076172, "learning_rate": 0.0004823032867815258, "loss": 1.1719, "step": 10755 }, { "epoch": 1.2934246904676043, "grad_norm": 0.6589514017105103, "learning_rate": 0.00048228566196407063, "loss": 0.6258, "step": 10760 }, { "epoch": 1.2940257242457025, "grad_norm": 0.7652323246002197, "learning_rate": 0.00048226802869673493, "loss": 1.0586, "step": 10765 }, { "epoch": 1.294626758023801, "grad_norm": 0.5493776798248291, "learning_rate": 0.00048225038698016003, "loss": 1.3562, "step": 10770 }, { "epoch": 1.2952277918018993, "grad_norm": 0.8401371836662292, "learning_rate": 0.0004822327368149877, "loss": 1.0914, "step": 10775 }, { "epoch": 1.2958288255799977, "grad_norm": 0.7293179035186768, "learning_rate": 0.00048221507820186006, "loss": 1.0289, "step": 10780 }, { "epoch": 1.2964298593580958, "grad_norm": 0.5851186513900757, "learning_rate": 0.0004821974111414195, "loss": 1.1938, "step": 10785 }, { "epoch": 1.2970308931361942, "grad_norm": 0.6673911809921265, "learning_rate": 0.0004821797356343085, "loss": 0.9082, "step": 10790 }, { "epoch": 1.2976319269142926, "grad_norm": 0.7832614779472351, "learning_rate": 0.0004821620516811702, "loss": 0.7309, "step": 10795 }, { "epoch": 1.298232960692391, "grad_norm": 0.5708516836166382, "learning_rate": 0.0004821443592826479, "loss": 1.0652, "step": 10800 }, { "epoch": 1.298232960692391, "eval_loss": 1.95263671875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2085, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 10800 }, { "epoch": 1.2988339944704892, "grad_norm": 0.7534372806549072, "learning_rate": 0.0004821266584393852, "loss": 0.8, "step": 10805 }, { "epoch": 1.2994350282485876, "grad_norm": 0.6106999516487122, "learning_rate": 0.00048210894915202594, "loss": 0.7984, "step": 10810 }, { "epoch": 1.300036062026686, "grad_norm": 0.5053595304489136, "learning_rate": 0.00048209123142121436, "loss": 0.6844, "step": 10815 }, { "epoch": 1.3006370958047841, "grad_norm": 0.5652788281440735, "learning_rate": 0.000482073505247595, "loss": 1.0305, "step": 10820 }, { "epoch": 1.3012381295828825, "grad_norm": 0.7114886045455933, "learning_rate": 0.0004820557706318126, "loss": 0.8094, "step": 10825 }, { "epoch": 1.301839163360981, "grad_norm": 0.7203825116157532, "learning_rate": 0.0004820380275745125, "loss": 1.2555, "step": 10830 }, { "epoch": 1.302440197139079, "grad_norm": 0.5713114142417908, "learning_rate": 0.0004820202760763399, "loss": 1.0297, "step": 10835 }, { "epoch": 1.3030412309171775, "grad_norm": 0.7799856066703796, "learning_rate": 0.0004820025161379407, "loss": 0.8664, "step": 10840 }, { "epoch": 1.3036422646952759, "grad_norm": 0.5092629790306091, "learning_rate": 0.0004819847477599609, "loss": 1.0586, "step": 10845 }, { "epoch": 1.3042432984733743, "grad_norm": 0.5138017535209656, "learning_rate": 0.00048196697094304684, "loss": 1.2937, "step": 10850 }, { "epoch": 1.3048443322514724, "grad_norm": 0.4723987877368927, "learning_rate": 0.00048194918568784527, "loss": 1.175, "step": 10855 }, { "epoch": 1.3054453660295708, "grad_norm": 0.7525123357772827, "learning_rate": 0.00048193139199500314, "loss": 1.0688, "step": 10860 }, { "epoch": 1.3060463998076692, "grad_norm": 0.597766101360321, "learning_rate": 0.0004819135898651677, "loss": 0.9477, "step": 10865 }, { "epoch": 1.3066474335857676, "grad_norm": 0.6497039794921875, "learning_rate": 0.00048189577929898654, "loss": 0.698, "step": 10870 }, { "epoch": 1.3072484673638658, "grad_norm": 0.6981368660926819, "learning_rate": 0.0004818779602971076, "loss": 0.7797, "step": 10875 }, { "epoch": 1.3078495011419642, "grad_norm": 0.5832539200782776, "learning_rate": 0.000481860132860179, "loss": 1.1074, "step": 10880 }, { "epoch": 1.3084505349200626, "grad_norm": 0.603484570980072, "learning_rate": 0.00048184229698884927, "loss": 0.7102, "step": 10885 }, { "epoch": 1.3090515686981608, "grad_norm": 0.5644847750663757, "learning_rate": 0.00048182445268376737, "loss": 1.1367, "step": 10890 }, { "epoch": 1.3096526024762591, "grad_norm": 0.7086564898490906, "learning_rate": 0.00048180659994558224, "loss": 0.7521, "step": 10895 }, { "epoch": 1.3102536362543575, "grad_norm": 0.7100790739059448, "learning_rate": 0.0004817887387749434, "loss": 1.2031, "step": 10900 }, { "epoch": 1.3108546700324557, "grad_norm": 0.6330604553222656, "learning_rate": 0.0004817708691725006, "loss": 0.7137, "step": 10905 }, { "epoch": 1.311455703810554, "grad_norm": 0.6586036682128906, "learning_rate": 0.0004817529911389038, "loss": 0.9848, "step": 10910 }, { "epoch": 1.3120567375886525, "grad_norm": 0.6988177299499512, "learning_rate": 0.0004817351046748034, "loss": 0.8984, "step": 10915 }, { "epoch": 1.3126577713667509, "grad_norm": 0.7171809077262878, "learning_rate": 0.00048171720978085013, "loss": 0.8906, "step": 10920 }, { "epoch": 1.3132588051448493, "grad_norm": 0.7116740942001343, "learning_rate": 0.00048169930645769487, "loss": 0.8242, "step": 10925 }, { "epoch": 1.3138598389229474, "grad_norm": 0.8414427042007446, "learning_rate": 0.00048168139470598895, "loss": 1.3813, "step": 10930 }, { "epoch": 1.3144608727010458, "grad_norm": 0.5719891786575317, "learning_rate": 0.00048166347452638386, "loss": 0.9492, "step": 10935 }, { "epoch": 1.3150619064791442, "grad_norm": 0.5521625280380249, "learning_rate": 0.0004816455459195316, "loss": 0.6379, "step": 10940 }, { "epoch": 1.3156629402572424, "grad_norm": 0.523263692855835, "learning_rate": 0.00048162760888608423, "loss": 0.9348, "step": 10945 }, { "epoch": 1.3162639740353408, "grad_norm": 0.6334478259086609, "learning_rate": 0.0004816096634266943, "loss": 0.4564, "step": 10950 }, { "epoch": 1.3168650078134392, "grad_norm": 0.6602448225021362, "learning_rate": 0.0004815917095420146, "loss": 0.7812, "step": 10955 }, { "epoch": 1.3174660415915374, "grad_norm": 0.6234865188598633, "learning_rate": 0.00048157374723269836, "loss": 0.7004, "step": 10960 }, { "epoch": 1.3180670753696357, "grad_norm": 0.48519912362098694, "learning_rate": 0.00048155577649939884, "loss": 0.8031, "step": 10965 }, { "epoch": 1.3186681091477341, "grad_norm": 0.692662239074707, "learning_rate": 0.00048153779734276986, "loss": 0.7137, "step": 10970 }, { "epoch": 1.3192691429258323, "grad_norm": 0.516724169254303, "learning_rate": 0.00048151980976346545, "loss": 0.9898, "step": 10975 }, { "epoch": 1.3198701767039307, "grad_norm": 0.5575066208839417, "learning_rate": 0.0004815018137621399, "loss": 0.609, "step": 10980 }, { "epoch": 1.320471210482029, "grad_norm": 0.7598167657852173, "learning_rate": 0.0004814838093394478, "loss": 0.6816, "step": 10985 }, { "epoch": 1.3210722442601275, "grad_norm": 0.4681103825569153, "learning_rate": 0.0004814657964960442, "loss": 0.8688, "step": 10990 }, { "epoch": 1.3216732780382259, "grad_norm": 0.5804292559623718, "learning_rate": 0.0004814477752325843, "loss": 0.7047, "step": 10995 }, { "epoch": 1.322274311816324, "grad_norm": 0.7653179168701172, "learning_rate": 0.00048142974554972375, "loss": 1.293, "step": 11000 }, { "epoch": 1.322274311816324, "eval_loss": 1.9064452648162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1995, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 11000 }, { "epoch": 1.3228753455944224, "grad_norm": 0.6465706825256348, "learning_rate": 0.00048141170744811826, "loss": 0.6529, "step": 11005 }, { "epoch": 1.3234763793725208, "grad_norm": 0.39915701746940613, "learning_rate": 0.0004813936609284241, "loss": 1.0059, "step": 11010 }, { "epoch": 1.324077413150619, "grad_norm": 0.6334801316261292, "learning_rate": 0.0004813756059912978, "loss": 0.9812, "step": 11015 }, { "epoch": 1.3246784469287174, "grad_norm": 0.5835946798324585, "learning_rate": 0.0004813575426373961, "loss": 1.1961, "step": 11020 }, { "epoch": 1.3252794807068158, "grad_norm": 0.5502042174339294, "learning_rate": 0.000481339470867376, "loss": 1.091, "step": 11025 }, { "epoch": 1.325880514484914, "grad_norm": 0.520435631275177, "learning_rate": 0.0004813213906818951, "loss": 0.8797, "step": 11030 }, { "epoch": 1.3264815482630123, "grad_norm": 0.7997671365737915, "learning_rate": 0.0004813033020816109, "loss": 1.1031, "step": 11035 }, { "epoch": 1.3270825820411107, "grad_norm": 0.5193153619766235, "learning_rate": 0.0004812852050671814, "loss": 1.2359, "step": 11040 }, { "epoch": 1.327683615819209, "grad_norm": 0.6436191201210022, "learning_rate": 0.00048126709963926517, "loss": 0.8531, "step": 11045 }, { "epoch": 1.3282846495973073, "grad_norm": 0.9058936238288879, "learning_rate": 0.0004812489857985206, "loss": 0.8492, "step": 11050 }, { "epoch": 1.3288856833754057, "grad_norm": 0.42691755294799805, "learning_rate": 0.00048123086354560674, "loss": 0.4992, "step": 11055 }, { "epoch": 1.329486717153504, "grad_norm": 0.610244631767273, "learning_rate": 0.00048121273288118284, "loss": 0.7793, "step": 11060 }, { "epoch": 1.3300877509316025, "grad_norm": 0.6124532222747803, "learning_rate": 0.00048119459380590825, "loss": 0.7594, "step": 11065 }, { "epoch": 1.3306887847097006, "grad_norm": 0.5388164520263672, "learning_rate": 0.000481176446320443, "loss": 0.9398, "step": 11070 }, { "epoch": 1.331289818487799, "grad_norm": 0.7066982388496399, "learning_rate": 0.00048115829042544723, "loss": 0.7375, "step": 11075 }, { "epoch": 1.3318908522658974, "grad_norm": 0.6047337055206299, "learning_rate": 0.0004811401261215813, "loss": 0.8539, "step": 11080 }, { "epoch": 1.3324918860439956, "grad_norm": 0.8070830702781677, "learning_rate": 0.0004811219534095061, "loss": 0.7016, "step": 11085 }, { "epoch": 1.333092919822094, "grad_norm": 0.5883705019950867, "learning_rate": 0.0004811037722898826, "loss": 0.5852, "step": 11090 }, { "epoch": 1.3336939536001924, "grad_norm": 0.7175491452217102, "learning_rate": 0.00048108558276337225, "loss": 1.0422, "step": 11095 }, { "epoch": 1.3342949873782906, "grad_norm": 0.6159098148345947, "learning_rate": 0.0004810673848306366, "loss": 0.7754, "step": 11100 }, { "epoch": 1.334896021156389, "grad_norm": 0.67514967918396, "learning_rate": 0.0004810491784923378, "loss": 0.9625, "step": 11105 }, { "epoch": 1.3354970549344873, "grad_norm": 0.5794950127601624, "learning_rate": 0.00048103096374913813, "loss": 0.8359, "step": 11110 }, { "epoch": 1.3360980887125857, "grad_norm": 0.5884886384010315, "learning_rate": 0.0004810127406017001, "loss": 0.5582, "step": 11115 }, { "epoch": 1.336699122490684, "grad_norm": 0.6307050585746765, "learning_rate": 0.0004809945090506866, "loss": 0.8379, "step": 11120 }, { "epoch": 1.3373001562687823, "grad_norm": 0.6032964587211609, "learning_rate": 0.00048097626909676097, "loss": 0.7539, "step": 11125 }, { "epoch": 1.3379011900468807, "grad_norm": 0.40143483877182007, "learning_rate": 0.0004809580207405865, "loss": 1.3516, "step": 11130 }, { "epoch": 1.338502223824979, "grad_norm": 0.6323394179344177, "learning_rate": 0.00048093976398282736, "loss": 1.2367, "step": 11135 }, { "epoch": 1.3391032576030772, "grad_norm": 0.5782902836799622, "learning_rate": 0.0004809214988241474, "loss": 0.8242, "step": 11140 }, { "epoch": 1.3397042913811756, "grad_norm": 0.536906361579895, "learning_rate": 0.0004809032252652111, "loss": 1.1953, "step": 11145 }, { "epoch": 1.340305325159274, "grad_norm": 0.74793541431427, "learning_rate": 0.00048088494330668325, "loss": 1.2469, "step": 11150 }, { "epoch": 1.3409063589373722, "grad_norm": 0.677535891532898, "learning_rate": 0.00048086665294922885, "loss": 0.7531, "step": 11155 }, { "epoch": 1.3415073927154706, "grad_norm": 0.47314557433128357, "learning_rate": 0.00048084835419351336, "loss": 0.5516, "step": 11160 }, { "epoch": 1.342108426493569, "grad_norm": 0.5068851709365845, "learning_rate": 0.00048083004704020226, "loss": 0.8977, "step": 11165 }, { "epoch": 1.3427094602716672, "grad_norm": 0.6457291841506958, "learning_rate": 0.0004808117314899617, "loss": 0.6992, "step": 11170 }, { "epoch": 1.3433104940497655, "grad_norm": 0.7612230777740479, "learning_rate": 0.0004807934075434578, "loss": 0.8531, "step": 11175 }, { "epoch": 1.343911527827864, "grad_norm": 0.7496997714042664, "learning_rate": 0.0004807750752013572, "loss": 0.932, "step": 11180 }, { "epoch": 1.3445125616059623, "grad_norm": 0.5454360246658325, "learning_rate": 0.00048075673446432675, "loss": 0.8922, "step": 11185 }, { "epoch": 1.3451135953840607, "grad_norm": 0.6435137987136841, "learning_rate": 0.0004807383853330337, "loss": 0.8984, "step": 11190 }, { "epoch": 1.345714629162159, "grad_norm": 0.6177284717559814, "learning_rate": 0.00048072002780814545, "loss": 0.7527, "step": 11195 }, { "epoch": 1.3463156629402573, "grad_norm": 0.599716305732727, "learning_rate": 0.0004807016618903298, "loss": 0.8016, "step": 11200 }, { "epoch": 1.3463156629402573, "eval_loss": 1.9163086414337158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1978, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 11200 }, { "epoch": 1.3469166967183557, "grad_norm": 0.5576217770576477, "learning_rate": 0.00048068328758025494, "loss": 1.0055, "step": 11205 }, { "epoch": 1.3475177304964538, "grad_norm": 0.6056281328201294, "learning_rate": 0.00048066490487858916, "loss": 0.9594, "step": 11210 }, { "epoch": 1.3481187642745522, "grad_norm": 0.7149866223335266, "learning_rate": 0.0004806465137860012, "loss": 0.5891, "step": 11215 }, { "epoch": 1.3487197980526506, "grad_norm": 0.7981170415878296, "learning_rate": 0.0004806281143031601, "loss": 0.9141, "step": 11220 }, { "epoch": 1.3493208318307488, "grad_norm": 0.5131962895393372, "learning_rate": 0.0004806097064307352, "loss": 1.5766, "step": 11225 }, { "epoch": 1.3499218656088472, "grad_norm": 0.6224086880683899, "learning_rate": 0.0004805912901693961, "loss": 0.6008, "step": 11230 }, { "epoch": 1.3505228993869456, "grad_norm": 0.6797940135002136, "learning_rate": 0.00048057286551981274, "loss": 1.1223, "step": 11235 }, { "epoch": 1.3511239331650438, "grad_norm": 0.643601655960083, "learning_rate": 0.0004805544324826553, "loss": 0.782, "step": 11240 }, { "epoch": 1.3517249669431421, "grad_norm": 0.5339431166648865, "learning_rate": 0.0004805359910585945, "loss": 0.7926, "step": 11245 }, { "epoch": 1.3523260007212405, "grad_norm": 0.6211333870887756, "learning_rate": 0.0004805175412483009, "loss": 0.7117, "step": 11250 }, { "epoch": 1.352927034499339, "grad_norm": 0.49709734320640564, "learning_rate": 0.00048049908305244584, "loss": 0.8684, "step": 11255 }, { "epoch": 1.3535280682774373, "grad_norm": 0.5717650651931763, "learning_rate": 0.00048048061647170076, "loss": 0.7297, "step": 11260 }, { "epoch": 1.3541291020555355, "grad_norm": 0.6539830565452576, "learning_rate": 0.0004804621415067374, "loss": 0.8367, "step": 11265 }, { "epoch": 1.3547301358336339, "grad_norm": 0.5457570552825928, "learning_rate": 0.00048044365815822784, "loss": 0.8203, "step": 11270 }, { "epoch": 1.3553311696117323, "grad_norm": 0.4412502646446228, "learning_rate": 0.0004804251664268444, "loss": 0.893, "step": 11275 }, { "epoch": 1.3559322033898304, "grad_norm": 0.9668557643890381, "learning_rate": 0.0004804066663132598, "loss": 0.8961, "step": 11280 }, { "epoch": 1.3565332371679288, "grad_norm": 0.5155379772186279, "learning_rate": 0.00048038815781814706, "loss": 1.1148, "step": 11285 }, { "epoch": 1.3571342709460272, "grad_norm": 0.8544705510139465, "learning_rate": 0.0004803696409421794, "loss": 0.9555, "step": 11290 }, { "epoch": 1.3577353047241254, "grad_norm": 0.8742035627365112, "learning_rate": 0.0004803511156860304, "loss": 0.9926, "step": 11295 }, { "epoch": 1.3583363385022238, "grad_norm": 0.4879516363143921, "learning_rate": 0.000480332582050374, "loss": 0.9863, "step": 11300 }, { "epoch": 1.3589373722803222, "grad_norm": 0.466981440782547, "learning_rate": 0.0004803140400358843, "loss": 1.132, "step": 11305 }, { "epoch": 1.3595384060584204, "grad_norm": 0.8294722437858582, "learning_rate": 0.000480295489643236, "loss": 0.7312, "step": 11310 }, { "epoch": 1.3601394398365187, "grad_norm": 0.5230211615562439, "learning_rate": 0.00048027693087310377, "loss": 0.6676, "step": 11315 }, { "epoch": 1.3607404736146171, "grad_norm": 0.5742996335029602, "learning_rate": 0.0004802583637261627, "loss": 1.2273, "step": 11320 }, { "epoch": 1.3613415073927155, "grad_norm": 0.7959683537483215, "learning_rate": 0.00048023978820308834, "loss": 1.193, "step": 11325 }, { "epoch": 1.361942541170814, "grad_norm": 0.6174236536026001, "learning_rate": 0.0004802212043045563, "loss": 0.6727, "step": 11330 }, { "epoch": 1.362543574948912, "grad_norm": 0.5452308654785156, "learning_rate": 0.00048020261203124274, "loss": 0.7625, "step": 11335 }, { "epoch": 1.3631446087270105, "grad_norm": 0.57860267162323, "learning_rate": 0.0004801840113838237, "loss": 0.8266, "step": 11340 }, { "epoch": 1.3637456425051089, "grad_norm": 0.7537270784378052, "learning_rate": 0.0004801654023629761, "loss": 0.8496, "step": 11345 }, { "epoch": 1.364346676283207, "grad_norm": 0.6378467082977295, "learning_rate": 0.0004801467849693769, "loss": 0.7359, "step": 11350 }, { "epoch": 1.3649477100613054, "grad_norm": 0.6698923110961914, "learning_rate": 0.00048012815920370314, "loss": 0.5863, "step": 11355 }, { "epoch": 1.3655487438394038, "grad_norm": 0.585909903049469, "learning_rate": 0.00048010952506663253, "loss": 0.7445, "step": 11360 }, { "epoch": 1.366149777617502, "grad_norm": 0.5491454005241394, "learning_rate": 0.0004800908825588428, "loss": 0.9254, "step": 11365 }, { "epoch": 1.3667508113956004, "grad_norm": 0.7444734573364258, "learning_rate": 0.0004800722316810122, "loss": 1.0828, "step": 11370 }, { "epoch": 1.3673518451736988, "grad_norm": 0.781043529510498, "learning_rate": 0.00048005357243381924, "loss": 0.6766, "step": 11375 }, { "epoch": 1.367952878951797, "grad_norm": 0.6379426717758179, "learning_rate": 0.0004800349048179426, "loss": 0.8406, "step": 11380 }, { "epoch": 1.3685539127298953, "grad_norm": 0.8509328961372375, "learning_rate": 0.0004800162288340614, "loss": 0.9719, "step": 11385 }, { "epoch": 1.3691549465079937, "grad_norm": 0.5497897863388062, "learning_rate": 0.0004799975444828549, "loss": 0.9617, "step": 11390 }, { "epoch": 1.3697559802860921, "grad_norm": 0.8729432821273804, "learning_rate": 0.000479978851765003, "loss": 1.0242, "step": 11395 }, { "epoch": 1.3703570140641905, "grad_norm": 0.6886745691299438, "learning_rate": 0.0004799601506811855, "loss": 1.0766, "step": 11400 }, { "epoch": 1.3703570140641905, "eval_loss": 1.90771484375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2222, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 11400 }, { "epoch": 1.3709580478422887, "grad_norm": 0.5805025100708008, "learning_rate": 0.0004799414412320828, "loss": 1.1969, "step": 11405 }, { "epoch": 1.371559081620387, "grad_norm": 0.9585275053977966, "learning_rate": 0.0004799227234183754, "loss": 1.1859, "step": 11410 }, { "epoch": 1.3721601153984855, "grad_norm": 0.49869146943092346, "learning_rate": 0.0004799039972407443, "loss": 0.9605, "step": 11415 }, { "epoch": 1.3727611491765837, "grad_norm": 0.5365501642227173, "learning_rate": 0.00047988526269987064, "loss": 1.0359, "step": 11420 }, { "epoch": 1.373362182954682, "grad_norm": 0.5086499452590942, "learning_rate": 0.00047986651979643595, "loss": 0.7688, "step": 11425 }, { "epoch": 1.3739632167327804, "grad_norm": 0.5709533095359802, "learning_rate": 0.00047984776853112203, "loss": 0.9395, "step": 11430 }, { "epoch": 1.3745642505108786, "grad_norm": 0.7177227139472961, "learning_rate": 0.000479829008904611, "loss": 0.7867, "step": 11435 }, { "epoch": 1.375165284288977, "grad_norm": 0.5393221974372864, "learning_rate": 0.00047981024091758536, "loss": 0.7066, "step": 11440 }, { "epoch": 1.3757663180670754, "grad_norm": 0.4818534553050995, "learning_rate": 0.00047979146457072773, "loss": 0.7496, "step": 11445 }, { "epoch": 1.3763673518451738, "grad_norm": 0.6378899812698364, "learning_rate": 0.00047977267986472116, "loss": 0.677, "step": 11450 }, { "epoch": 1.376968385623272, "grad_norm": 0.7033771276473999, "learning_rate": 0.000479753886800249, "loss": 0.6977, "step": 11455 }, { "epoch": 1.3775694194013703, "grad_norm": 0.7331205010414124, "learning_rate": 0.0004797350853779949, "loss": 0.9195, "step": 11460 }, { "epoch": 1.3781704531794687, "grad_norm": 0.6814472675323486, "learning_rate": 0.00047971627559864274, "loss": 1.1328, "step": 11465 }, { "epoch": 1.3787714869575671, "grad_norm": 0.586630642414093, "learning_rate": 0.00047969745746287684, "loss": 0.8621, "step": 11470 }, { "epoch": 1.3793725207356653, "grad_norm": 0.5795077085494995, "learning_rate": 0.0004796786309713817, "loss": 0.9133, "step": 11475 }, { "epoch": 1.3799735545137637, "grad_norm": 0.6244852542877197, "learning_rate": 0.0004796597961248422, "loss": 1.0539, "step": 11480 }, { "epoch": 1.380574588291862, "grad_norm": 0.6225668787956238, "learning_rate": 0.0004796409529239435, "loss": 0.8906, "step": 11485 }, { "epoch": 1.3811756220699603, "grad_norm": 0.6311030983924866, "learning_rate": 0.0004796221013693711, "loss": 1.0258, "step": 11490 }, { "epoch": 1.3817766558480586, "grad_norm": 0.6278150081634521, "learning_rate": 0.0004796032414618106, "loss": 0.8395, "step": 11495 }, { "epoch": 1.382377689626157, "grad_norm": 0.614443838596344, "learning_rate": 0.00047958437320194826, "loss": 0.6719, "step": 11500 }, { "epoch": 1.3829787234042552, "grad_norm": 0.6069492697715759, "learning_rate": 0.0004795654965904704, "loss": 0.9508, "step": 11505 }, { "epoch": 1.3835797571823536, "grad_norm": 0.969376802444458, "learning_rate": 0.00047954661162806365, "loss": 1.1504, "step": 11510 }, { "epoch": 1.384180790960452, "grad_norm": 0.5342624187469482, "learning_rate": 0.00047952771831541495, "loss": 0.9109, "step": 11515 }, { "epoch": 1.3847818247385504, "grad_norm": 0.7148461937904358, "learning_rate": 0.00047950881665321176, "loss": 0.8922, "step": 11520 }, { "epoch": 1.3853828585166488, "grad_norm": 0.6755735278129578, "learning_rate": 0.0004794899066421415, "loss": 0.7969, "step": 11525 }, { "epoch": 1.385983892294747, "grad_norm": 0.6074589490890503, "learning_rate": 0.0004794709882828921, "loss": 0.9855, "step": 11530 }, { "epoch": 1.3865849260728453, "grad_norm": 0.7329553365707397, "learning_rate": 0.00047945206157615184, "loss": 1.1672, "step": 11535 }, { "epoch": 1.3871859598509437, "grad_norm": 0.7074795365333557, "learning_rate": 0.00047943312652260904, "loss": 0.7902, "step": 11540 }, { "epoch": 1.387786993629042, "grad_norm": 0.699269711971283, "learning_rate": 0.00047941418312295273, "loss": 0.9234, "step": 11545 }, { "epoch": 1.3883880274071403, "grad_norm": 0.724504828453064, "learning_rate": 0.00047939523137787186, "loss": 0.9938, "step": 11550 }, { "epoch": 1.3889890611852387, "grad_norm": 0.6939505338668823, "learning_rate": 0.0004793762712880558, "loss": 0.8531, "step": 11555 }, { "epoch": 1.3895900949633369, "grad_norm": 0.5704456567764282, "learning_rate": 0.0004793573028541945, "loss": 0.7332, "step": 11560 }, { "epoch": 1.3901911287414352, "grad_norm": 1.0623825788497925, "learning_rate": 0.0004793383260769777, "loss": 1.1156, "step": 11565 }, { "epoch": 1.3907921625195336, "grad_norm": 0.4707493185997009, "learning_rate": 0.0004793193409570959, "loss": 0.5855, "step": 11570 }, { "epoch": 1.3913931962976318, "grad_norm": 0.7300910353660583, "learning_rate": 0.00047930034749523966, "loss": 0.941, "step": 11575 }, { "epoch": 1.3919942300757302, "grad_norm": 0.7034488320350647, "learning_rate": 0.0004792813456920999, "loss": 0.7309, "step": 11580 }, { "epoch": 1.3925952638538286, "grad_norm": 0.8144317269325256, "learning_rate": 0.00047926233554836784, "loss": 0.8031, "step": 11585 }, { "epoch": 1.393196297631927, "grad_norm": 0.6201553344726562, "learning_rate": 0.00047924331706473505, "loss": 0.7188, "step": 11590 }, { "epoch": 1.3937973314100254, "grad_norm": 0.5154601335525513, "learning_rate": 0.00047922429024189334, "loss": 0.7484, "step": 11595 }, { "epoch": 1.3943983651881235, "grad_norm": 0.5169423818588257, "learning_rate": 0.00047920525508053495, "loss": 0.7375, "step": 11600 }, { "epoch": 1.3943983651881235, "eval_loss": 1.92626953125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.197, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 11600 }, { "epoch": 1.394999398966222, "grad_norm": 0.8038589358329773, "learning_rate": 0.0004791862115813521, "loss": 0.8148, "step": 11605 }, { "epoch": 1.3956004327443203, "grad_norm": 0.6422277688980103, "learning_rate": 0.0004791671597450378, "loss": 0.9367, "step": 11610 }, { "epoch": 1.3962014665224185, "grad_norm": 0.76006680727005, "learning_rate": 0.00047914809957228487, "loss": 0.7887, "step": 11615 }, { "epoch": 1.396802500300517, "grad_norm": 0.37485867738723755, "learning_rate": 0.00047912903106378686, "loss": 0.9402, "step": 11620 }, { "epoch": 1.3974035340786153, "grad_norm": 0.4976300895214081, "learning_rate": 0.0004791099542202373, "loss": 0.6328, "step": 11625 }, { "epoch": 1.3980045678567135, "grad_norm": 0.7349663376808167, "learning_rate": 0.00047909086904233017, "loss": 0.8508, "step": 11630 }, { "epoch": 1.3986056016348118, "grad_norm": 0.8288495540618896, "learning_rate": 0.00047907177553075975, "loss": 0.8781, "step": 11635 }, { "epoch": 1.3992066354129102, "grad_norm": 0.6059455275535583, "learning_rate": 0.0004790526736862207, "loss": 0.9578, "step": 11640 }, { "epoch": 1.3998076691910084, "grad_norm": 0.51099693775177, "learning_rate": 0.0004790335635094077, "loss": 0.7285, "step": 11645 }, { "epoch": 1.4004087029691068, "grad_norm": 0.6307785511016846, "learning_rate": 0.0004790144450010161, "loss": 0.9633, "step": 11650 }, { "epoch": 1.4010097367472052, "grad_norm": 0.5239074230194092, "learning_rate": 0.0004789953181617412, "loss": 0.8578, "step": 11655 }, { "epoch": 1.4016107705253036, "grad_norm": 0.815410315990448, "learning_rate": 0.000478976182992279, "loss": 0.8086, "step": 11660 }, { "epoch": 1.402211804303402, "grad_norm": 0.48938998579978943, "learning_rate": 0.00047895703949332535, "loss": 1.1637, "step": 11665 }, { "epoch": 1.4028128380815001, "grad_norm": 0.9560267925262451, "learning_rate": 0.00047893788766557676, "loss": 0.791, "step": 11670 }, { "epoch": 1.4034138718595985, "grad_norm": 0.533134400844574, "learning_rate": 0.0004789187275097299, "loss": 0.8922, "step": 11675 }, { "epoch": 1.404014905637697, "grad_norm": 0.6384151577949524, "learning_rate": 0.00047889955902648187, "loss": 0.7984, "step": 11680 }, { "epoch": 1.404615939415795, "grad_norm": 0.5299431681632996, "learning_rate": 0.00047888038221652974, "loss": 0.7051, "step": 11685 }, { "epoch": 1.4052169731938935, "grad_norm": 0.9126712679862976, "learning_rate": 0.00047886119708057123, "loss": 0.925, "step": 11690 }, { "epoch": 1.4058180069719919, "grad_norm": 0.6048250794410706, "learning_rate": 0.0004788420036193043, "loss": 0.8289, "step": 11695 }, { "epoch": 1.40641904075009, "grad_norm": 0.4162563979625702, "learning_rate": 0.000478822801833427, "loss": 0.8832, "step": 11700 }, { "epoch": 1.4070200745281884, "grad_norm": 0.5358899831771851, "learning_rate": 0.00047880359172363795, "loss": 0.832, "step": 11705 }, { "epoch": 1.4076211083062868, "grad_norm": 0.6457653641700745, "learning_rate": 0.000478784373290636, "loss": 1.018, "step": 11710 }, { "epoch": 1.408222142084385, "grad_norm": 0.6539694666862488, "learning_rate": 0.00047876514653512017, "loss": 1.1281, "step": 11715 }, { "epoch": 1.4088231758624834, "grad_norm": 0.6113206148147583, "learning_rate": 0.0004787459114577898, "loss": 0.909, "step": 11720 }, { "epoch": 1.4094242096405818, "grad_norm": 0.6877265572547913, "learning_rate": 0.00047872666805934483, "loss": 1.1672, "step": 11725 }, { "epoch": 1.4100252434186802, "grad_norm": 0.315181702375412, "learning_rate": 0.0004787074163404851, "loss": 0.975, "step": 11730 }, { "epoch": 1.4106262771967786, "grad_norm": 0.4645707607269287, "learning_rate": 0.00047868815630191093, "loss": 0.8203, "step": 11735 }, { "epoch": 1.4112273109748767, "grad_norm": 0.6153178811073303, "learning_rate": 0.00047866888794432307, "loss": 0.868, "step": 11740 }, { "epoch": 1.4118283447529751, "grad_norm": 0.4136320650577545, "learning_rate": 0.0004786496112684223, "loss": 0.7406, "step": 11745 }, { "epoch": 1.4124293785310735, "grad_norm": 0.6058844327926636, "learning_rate": 0.00047863032627491, "loss": 0.8234, "step": 11750 }, { "epoch": 1.4130304123091717, "grad_norm": 0.5758222341537476, "learning_rate": 0.00047861103296448763, "loss": 0.9852, "step": 11755 }, { "epoch": 1.41363144608727, "grad_norm": 0.7107464075088501, "learning_rate": 0.0004785917313378569, "loss": 0.8414, "step": 11760 }, { "epoch": 1.4142324798653685, "grad_norm": 0.6578184366226196, "learning_rate": 0.0004785724213957202, "loss": 1.1812, "step": 11765 }, { "epoch": 1.4148335136434667, "grad_norm": 0.586203932762146, "learning_rate": 0.00047855310313877974, "loss": 0.6344, "step": 11770 }, { "epoch": 1.415434547421565, "grad_norm": 0.43385422229766846, "learning_rate": 0.00047853377656773845, "loss": 0.5637, "step": 11775 }, { "epoch": 1.4160355811996634, "grad_norm": 0.9953305721282959, "learning_rate": 0.0004785144416832993, "loss": 1.0328, "step": 11780 }, { "epoch": 1.4166366149777618, "grad_norm": 0.6671276092529297, "learning_rate": 0.0004784950984861655, "loss": 0.7875, "step": 11785 }, { "epoch": 1.41723764875586, "grad_norm": 0.5496247410774231, "learning_rate": 0.0004784757469770409, "loss": 1.007, "step": 11790 }, { "epoch": 1.4178386825339584, "grad_norm": 0.849506676197052, "learning_rate": 0.0004784563871566294, "loss": 1.0562, "step": 11795 }, { "epoch": 1.4184397163120568, "grad_norm": 0.7524552941322327, "learning_rate": 0.0004784370190256352, "loss": 0.7793, "step": 11800 }, { "epoch": 1.4184397163120568, "eval_loss": 1.9072265625, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.198, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 11800 }, { "epoch": 1.4190407500901552, "grad_norm": 0.5351150035858154, "learning_rate": 0.0004784176425847628, "loss": 0.6422, "step": 11805 }, { "epoch": 1.4196417838682533, "grad_norm": 0.647532045841217, "learning_rate": 0.0004783982578347172, "loss": 0.8836, "step": 11810 }, { "epoch": 1.4202428176463517, "grad_norm": 0.44134610891342163, "learning_rate": 0.00047837886477620356, "loss": 0.9523, "step": 11815 }, { "epoch": 1.4208438514244501, "grad_norm": 0.500471293926239, "learning_rate": 0.0004783594634099273, "loss": 0.8375, "step": 11820 }, { "epoch": 1.4214448852025483, "grad_norm": 0.6604463458061218, "learning_rate": 0.00047834005373659406, "loss": 0.6324, "step": 11825 }, { "epoch": 1.4220459189806467, "grad_norm": 0.6263642907142639, "learning_rate": 0.0004783206357569101, "loss": 0.9242, "step": 11830 }, { "epoch": 1.422646952758745, "grad_norm": 0.6608700752258301, "learning_rate": 0.0004783012094715817, "loss": 0.9629, "step": 11835 }, { "epoch": 1.4232479865368433, "grad_norm": 0.6739986538887024, "learning_rate": 0.0004782817748813156, "loss": 0.9539, "step": 11840 }, { "epoch": 1.4238490203149416, "grad_norm": 0.7138675451278687, "learning_rate": 0.00047826233198681865, "loss": 0.8711, "step": 11845 }, { "epoch": 1.42445005409304, "grad_norm": 0.37352198362350464, "learning_rate": 0.00047824288078879816, "loss": 0.8984, "step": 11850 }, { "epoch": 1.4250510878711384, "grad_norm": 0.6453259587287903, "learning_rate": 0.0004782234212879618, "loss": 1.1148, "step": 11855 }, { "epoch": 1.4256521216492368, "grad_norm": 0.5219030380249023, "learning_rate": 0.00047820395348501746, "loss": 0.766, "step": 11860 }, { "epoch": 1.426253155427335, "grad_norm": 0.660132110118866, "learning_rate": 0.0004781844773806731, "loss": 1.1359, "step": 11865 }, { "epoch": 1.4268541892054334, "grad_norm": 0.676121711730957, "learning_rate": 0.0004781649929756374, "loss": 0.8164, "step": 11870 }, { "epoch": 1.4274552229835318, "grad_norm": 0.6689792275428772, "learning_rate": 0.0004781455002706192, "loss": 0.7781, "step": 11875 }, { "epoch": 1.42805625676163, "grad_norm": 0.6554180979728699, "learning_rate": 0.00047812599926632743, "loss": 0.8844, "step": 11880 }, { "epoch": 1.4286572905397283, "grad_norm": 0.8487147092819214, "learning_rate": 0.00047810648996347155, "loss": 0.9789, "step": 11885 }, { "epoch": 1.4292583243178267, "grad_norm": 0.5553125739097595, "learning_rate": 0.00047808697236276123, "loss": 1.1766, "step": 11890 }, { "epoch": 1.429859358095925, "grad_norm": 0.6369397044181824, "learning_rate": 0.00047806744646490654, "loss": 1.1664, "step": 11895 }, { "epoch": 1.4304603918740233, "grad_norm": 0.6198737025260925, "learning_rate": 0.0004780479122706176, "loss": 0.8113, "step": 11900 }, { "epoch": 1.4310614256521217, "grad_norm": 0.39903023838996887, "learning_rate": 0.00047802836978060525, "loss": 0.8598, "step": 11905 }, { "epoch": 1.4316624594302199, "grad_norm": 0.6504912972450256, "learning_rate": 0.0004780088189955802, "loss": 0.6898, "step": 11910 }, { "epoch": 1.4322634932083183, "grad_norm": 0.6548739075660706, "learning_rate": 0.00047798925991625373, "loss": 0.7398, "step": 11915 }, { "epoch": 1.4328645269864166, "grad_norm": 0.6616823673248291, "learning_rate": 0.0004779696925433373, "loss": 0.6602, "step": 11920 }, { "epoch": 1.433465560764515, "grad_norm": 0.6308808922767639, "learning_rate": 0.00047795011687754274, "loss": 0.7422, "step": 11925 }, { "epoch": 1.4340665945426134, "grad_norm": 0.7235855460166931, "learning_rate": 0.0004779305329195822, "loss": 0.9176, "step": 11930 }, { "epoch": 1.4346676283207116, "grad_norm": 0.4720229208469391, "learning_rate": 0.00047791094067016807, "loss": 0.7555, "step": 11935 }, { "epoch": 1.43526866209881, "grad_norm": 0.7362262606620789, "learning_rate": 0.0004778913401300129, "loss": 1.1086, "step": 11940 }, { "epoch": 1.4358696958769084, "grad_norm": 0.5105779767036438, "learning_rate": 0.00047787173129983, "loss": 0.8168, "step": 11945 }, { "epoch": 1.4364707296550066, "grad_norm": 0.5137690901756287, "learning_rate": 0.00047785211418033236, "loss": 0.7445, "step": 11950 }, { "epoch": 1.437071763433105, "grad_norm": 0.47566288709640503, "learning_rate": 0.0004778324887722338, "loss": 0.8789, "step": 11955 }, { "epoch": 1.4376727972112033, "grad_norm": 0.5992048382759094, "learning_rate": 0.0004778128550762483, "loss": 1.3156, "step": 11960 }, { "epoch": 1.4382738309893015, "grad_norm": 0.7317075729370117, "learning_rate": 0.0004777932130930899, "loss": 0.7258, "step": 11965 }, { "epoch": 1.4388748647674, "grad_norm": 0.5241214632987976, "learning_rate": 0.0004777735628234731, "loss": 0.9426, "step": 11970 }, { "epoch": 1.4394758985454983, "grad_norm": 0.3349893093109131, "learning_rate": 0.0004777539042681129, "loss": 0.5996, "step": 11975 }, { "epoch": 1.4400769323235965, "grad_norm": 0.7063739895820618, "learning_rate": 0.00047773423742772424, "loss": 0.9258, "step": 11980 }, { "epoch": 1.4406779661016949, "grad_norm": 0.6058081984519958, "learning_rate": 0.0004777145623030227, "loss": 0.6523, "step": 11985 }, { "epoch": 1.4412789998797932, "grad_norm": 0.6340814232826233, "learning_rate": 0.0004776948788947239, "loss": 0.7992, "step": 11990 }, { "epoch": 1.4418800336578916, "grad_norm": 0.6422274112701416, "learning_rate": 0.0004776751872035439, "loss": 0.8121, "step": 11995 }, { "epoch": 1.44248106743599, "grad_norm": 0.6687328815460205, "learning_rate": 0.000477655487230199, "loss": 0.7352, "step": 12000 }, { "epoch": 1.44248106743599, "eval_loss": 1.8953125476837158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2026, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 12000 }, { "epoch": 1.4430821012140882, "grad_norm": 0.5076608657836914, "learning_rate": 0.0004776357789754059, "loss": 0.7865, "step": 12005 }, { "epoch": 1.4436831349921866, "grad_norm": 0.5138949155807495, "learning_rate": 0.00047761606243988143, "loss": 0.7227, "step": 12010 }, { "epoch": 1.444284168770285, "grad_norm": 0.7489267587661743, "learning_rate": 0.000477596337624343, "loss": 0.918, "step": 12015 }, { "epoch": 1.4448852025483832, "grad_norm": 0.6753156185150146, "learning_rate": 0.00047757660452950784, "loss": 0.8609, "step": 12020 }, { "epoch": 1.4454862363264815, "grad_norm": 0.5410712361335754, "learning_rate": 0.00047755686315609404, "loss": 1.1211, "step": 12025 }, { "epoch": 1.44608727010458, "grad_norm": 0.6748936772346497, "learning_rate": 0.00047753711350481963, "loss": 0.918, "step": 12030 }, { "epoch": 1.446688303882678, "grad_norm": 0.5443545579910278, "learning_rate": 0.00047751735557640307, "loss": 0.7074, "step": 12035 }, { "epoch": 1.4472893376607765, "grad_norm": 1.1086053848266602, "learning_rate": 0.00047749758937156307, "loss": 1.2141, "step": 12040 }, { "epoch": 1.447890371438875, "grad_norm": 0.5091587901115417, "learning_rate": 0.0004774778148910187, "loss": 1.118, "step": 12045 }, { "epoch": 1.448491405216973, "grad_norm": 0.6335846781730652, "learning_rate": 0.0004774580321354893, "loss": 1.0063, "step": 12050 }, { "epoch": 1.4490924389950715, "grad_norm": 0.5791338086128235, "learning_rate": 0.00047743824110569455, "loss": 0.9953, "step": 12055 }, { "epoch": 1.4496934727731698, "grad_norm": 0.5278282761573792, "learning_rate": 0.00047741844180235426, "loss": 0.7348, "step": 12060 }, { "epoch": 1.4502945065512682, "grad_norm": 0.7375497817993164, "learning_rate": 0.0004773986342261888, "loss": 1.0227, "step": 12065 }, { "epoch": 1.4508955403293666, "grad_norm": 0.6930285692214966, "learning_rate": 0.00047737881837791864, "loss": 0.9164, "step": 12070 }, { "epoch": 1.4514965741074648, "grad_norm": 0.5249225497245789, "learning_rate": 0.00047735899425826463, "loss": 0.7281, "step": 12075 }, { "epoch": 1.4520976078855632, "grad_norm": 0.6055876612663269, "learning_rate": 0.0004773391618679479, "loss": 0.9016, "step": 12080 }, { "epoch": 1.4526986416636616, "grad_norm": 0.686714231967926, "learning_rate": 0.0004773193212076899, "loss": 0.7613, "step": 12085 }, { "epoch": 1.4532996754417598, "grad_norm": 0.5571845173835754, "learning_rate": 0.00047729947227821256, "loss": 0.9586, "step": 12090 }, { "epoch": 1.4539007092198581, "grad_norm": 0.727156937122345, "learning_rate": 0.00047727961508023766, "loss": 1.0578, "step": 12095 }, { "epoch": 1.4545017429979565, "grad_norm": 0.47391200065612793, "learning_rate": 0.0004772597496144876, "loss": 1.0984, "step": 12100 }, { "epoch": 1.4551027767760547, "grad_norm": 0.7556493282318115, "learning_rate": 0.0004772398758816852, "loss": 0.9313, "step": 12105 }, { "epoch": 1.455703810554153, "grad_norm": 0.486554890871048, "learning_rate": 0.0004772199938825532, "loss": 0.8477, "step": 12110 }, { "epoch": 1.4563048443322515, "grad_norm": 0.8512095212936401, "learning_rate": 0.00047720010361781496, "loss": 0.7195, "step": 12115 }, { "epoch": 1.4569058781103499, "grad_norm": 0.6865861415863037, "learning_rate": 0.00047718020508819405, "loss": 0.8781, "step": 12120 }, { "epoch": 1.457506911888448, "grad_norm": 0.5790640115737915, "learning_rate": 0.0004771602982944142, "loss": 0.8102, "step": 12125 }, { "epoch": 1.4581079456665464, "grad_norm": 0.44529563188552856, "learning_rate": 0.0004771403832371997, "loss": 0.9488, "step": 12130 }, { "epoch": 1.4587089794446448, "grad_norm": 0.7128551602363586, "learning_rate": 0.00047712045991727493, "loss": 0.7906, "step": 12135 }, { "epoch": 1.4593100132227432, "grad_norm": 0.5855141878128052, "learning_rate": 0.00047710052833536466, "loss": 0.7473, "step": 12140 }, { "epoch": 1.4599110470008414, "grad_norm": 0.7266665697097778, "learning_rate": 0.00047708058849219395, "loss": 0.8297, "step": 12145 }, { "epoch": 1.4605120807789398, "grad_norm": 0.6347297430038452, "learning_rate": 0.0004770606403884882, "loss": 1.1242, "step": 12150 }, { "epoch": 1.4611131145570382, "grad_norm": 0.5679700374603271, "learning_rate": 0.0004770406840249729, "loss": 0.8238, "step": 12155 }, { "epoch": 1.4617141483351364, "grad_norm": 0.6876038312911987, "learning_rate": 0.0004770207194023742, "loss": 0.6891, "step": 12160 }, { "epoch": 1.4623151821132347, "grad_norm": 0.5750330686569214, "learning_rate": 0.00047700074652141824, "loss": 0.8281, "step": 12165 }, { "epoch": 1.4629162158913331, "grad_norm": 0.6052514910697937, "learning_rate": 0.0004769807653828316, "loss": 1.0781, "step": 12170 }, { "epoch": 1.4635172496694313, "grad_norm": 0.5823677778244019, "learning_rate": 0.00047696077598734113, "loss": 0.9879, "step": 12175 }, { "epoch": 1.4641182834475297, "grad_norm": 0.59844970703125, "learning_rate": 0.00047694077833567393, "loss": 0.8234, "step": 12180 }, { "epoch": 1.464719317225628, "grad_norm": 1.1165310144424438, "learning_rate": 0.0004769207724285577, "loss": 1.2922, "step": 12185 }, { "epoch": 1.4653203510037265, "grad_norm": 0.6903029084205627, "learning_rate": 0.00047690075826671986, "loss": 1.2063, "step": 12190 }, { "epoch": 1.4659213847818249, "grad_norm": 0.7730051279067993, "learning_rate": 0.0004768807358508887, "loss": 0.8281, "step": 12195 }, { "epoch": 1.466522418559923, "grad_norm": 0.6965062618255615, "learning_rate": 0.0004768607051817925, "loss": 0.9531, "step": 12200 }, { "epoch": 1.466522418559923, "eval_loss": 1.91943359375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2378, "eval_samples_per_second": 4.541, "eval_steps_per_second": 1.135, "step": 12200 }, { "epoch": 1.4671234523380214, "grad_norm": 0.6067870855331421, "learning_rate": 0.0004768406662601599, "loss": 0.6766, "step": 12205 }, { "epoch": 1.4677244861161198, "grad_norm": 0.6868239641189575, "learning_rate": 0.0004768206190867199, "loss": 0.8977, "step": 12210 }, { "epoch": 1.468325519894218, "grad_norm": 0.5714823603630066, "learning_rate": 0.00047680056366220175, "loss": 1.1648, "step": 12215 }, { "epoch": 1.4689265536723164, "grad_norm": 0.6335210204124451, "learning_rate": 0.00047678049998733495, "loss": 1.118, "step": 12220 }, { "epoch": 1.4695275874504148, "grad_norm": 0.6525861620903015, "learning_rate": 0.00047676042806284944, "loss": 0.9152, "step": 12225 }, { "epoch": 1.470128621228513, "grad_norm": 0.7297841906547546, "learning_rate": 0.0004767403478894754, "loss": 0.9125, "step": 12230 }, { "epoch": 1.4707296550066113, "grad_norm": 0.6787566542625427, "learning_rate": 0.0004767202594679432, "loss": 1.1695, "step": 12235 }, { "epoch": 1.4713306887847097, "grad_norm": 0.6729519367218018, "learning_rate": 0.0004767001627989836, "loss": 0.8758, "step": 12240 }, { "epoch": 1.471931722562808, "grad_norm": 0.6045839786529541, "learning_rate": 0.0004766800578833277, "loss": 0.8633, "step": 12245 }, { "epoch": 1.4725327563409063, "grad_norm": 0.4813026189804077, "learning_rate": 0.00047665994472170683, "loss": 0.7582, "step": 12250 }, { "epoch": 1.4731337901190047, "grad_norm": 0.6443421244621277, "learning_rate": 0.0004766398233148527, "loss": 0.7891, "step": 12255 }, { "epoch": 1.473734823897103, "grad_norm": 0.7598147988319397, "learning_rate": 0.0004766196936634972, "loss": 0.7594, "step": 12260 }, { "epoch": 1.4743358576752015, "grad_norm": 0.5582093000411987, "learning_rate": 0.0004765995557683727, "loss": 0.875, "step": 12265 }, { "epoch": 1.4749368914532996, "grad_norm": 0.5228426456451416, "learning_rate": 0.0004765794096302117, "loss": 0.9266, "step": 12270 }, { "epoch": 1.475537925231398, "grad_norm": 0.8562288284301758, "learning_rate": 0.00047655925524974695, "loss": 0.8562, "step": 12275 }, { "epoch": 1.4761389590094964, "grad_norm": 0.7450162172317505, "learning_rate": 0.00047653909262771176, "loss": 0.8398, "step": 12280 }, { "epoch": 1.4767399927875946, "grad_norm": 0.6747555732727051, "learning_rate": 0.00047651892176483947, "loss": 1.3336, "step": 12285 }, { "epoch": 1.477341026565693, "grad_norm": 0.5333715677261353, "learning_rate": 0.000476498742661864, "loss": 1.0578, "step": 12290 }, { "epoch": 1.4779420603437914, "grad_norm": 0.9174572825431824, "learning_rate": 0.0004764785553195192, "loss": 0.8672, "step": 12295 }, { "epoch": 1.4785430941218896, "grad_norm": 0.5746082663536072, "learning_rate": 0.00047645835973853955, "loss": 0.9289, "step": 12300 }, { "epoch": 1.479144127899988, "grad_norm": 0.4555107653141022, "learning_rate": 0.00047643815591965965, "loss": 0.9156, "step": 12305 }, { "epoch": 1.4797451616780863, "grad_norm": 0.7347114682197571, "learning_rate": 0.0004764179438636146, "loss": 0.9094, "step": 12310 }, { "epoch": 1.4803461954561845, "grad_norm": 0.7148030996322632, "learning_rate": 0.0004763977235711395, "loss": 1.0395, "step": 12315 }, { "epoch": 1.480947229234283, "grad_norm": 0.5407163500785828, "learning_rate": 0.00047637749504296994, "loss": 0.7078, "step": 12320 }, { "epoch": 1.4815482630123813, "grad_norm": 0.6483573913574219, "learning_rate": 0.00047635725827984186, "loss": 0.9375, "step": 12325 }, { "epoch": 1.4821492967904797, "grad_norm": 0.6162995100021362, "learning_rate": 0.0004763370132824912, "loss": 0.6305, "step": 12330 }, { "epoch": 1.482750330568578, "grad_norm": 0.5880223512649536, "learning_rate": 0.0004763167600516547, "loss": 0.677, "step": 12335 }, { "epoch": 1.4833513643466762, "grad_norm": 0.7350181937217712, "learning_rate": 0.0004762964985880689, "loss": 1.0395, "step": 12340 }, { "epoch": 1.4839523981247746, "grad_norm": 0.5672032237052917, "learning_rate": 0.000476276228892471, "loss": 1.1992, "step": 12345 }, { "epoch": 1.484553431902873, "grad_norm": 0.5772983431816101, "learning_rate": 0.00047625595096559823, "loss": 0.7867, "step": 12350 }, { "epoch": 1.4851544656809712, "grad_norm": 0.5597769021987915, "learning_rate": 0.00047623566480818825, "loss": 0.8156, "step": 12355 }, { "epoch": 1.4857554994590696, "grad_norm": 0.4812193810939789, "learning_rate": 0.00047621537042097915, "loss": 0.9371, "step": 12360 }, { "epoch": 1.486356533237168, "grad_norm": 0.6280331611633301, "learning_rate": 0.000476195067804709, "loss": 1.0867, "step": 12365 }, { "epoch": 1.4869575670152662, "grad_norm": 0.5098450779914856, "learning_rate": 0.0004761747569601165, "loss": 0.8812, "step": 12370 }, { "epoch": 1.4875586007933645, "grad_norm": 0.3558202385902405, "learning_rate": 0.00047615443788794043, "loss": 0.877, "step": 12375 }, { "epoch": 1.488159634571463, "grad_norm": 0.8843599557876587, "learning_rate": 0.0004761341105889199, "loss": 0.8195, "step": 12380 }, { "epoch": 1.4887606683495611, "grad_norm": 0.5288358926773071, "learning_rate": 0.00047611377506379446, "loss": 0.6906, "step": 12385 }, { "epoch": 1.4893617021276595, "grad_norm": 0.5971837639808655, "learning_rate": 0.00047609343131330373, "loss": 1.0445, "step": 12390 }, { "epoch": 1.489962735905758, "grad_norm": 0.7089256048202515, "learning_rate": 0.0004760730793381879, "loss": 0.8844, "step": 12395 }, { "epoch": 1.4905637696838563, "grad_norm": 0.5668146014213562, "learning_rate": 0.0004760527191391872, "loss": 1.3359, "step": 12400 }, { "epoch": 1.4905637696838563, "eval_loss": 1.902929663658142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2029, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 12400 }, { "epoch": 1.4911648034619547, "grad_norm": 0.5594661235809326, "learning_rate": 0.0004760323507170423, "loss": 0.7047, "step": 12405 }, { "epoch": 1.4917658372400528, "grad_norm": 0.6605058312416077, "learning_rate": 0.0004760119740724942, "loss": 0.707, "step": 12410 }, { "epoch": 1.4923668710181512, "grad_norm": 0.5722910165786743, "learning_rate": 0.0004759915892062842, "loss": 0.7408, "step": 12415 }, { "epoch": 1.4929679047962496, "grad_norm": 0.5285496711730957, "learning_rate": 0.0004759711961191536, "loss": 1.0297, "step": 12420 }, { "epoch": 1.4935689385743478, "grad_norm": 0.6956138610839844, "learning_rate": 0.00047595079481184443, "loss": 0.7648, "step": 12425 }, { "epoch": 1.4941699723524462, "grad_norm": 0.7472872138023376, "learning_rate": 0.0004759303852850988, "loss": 0.8766, "step": 12430 }, { "epoch": 1.4947710061305446, "grad_norm": 0.5937708020210266, "learning_rate": 0.0004759099675396592, "loss": 0.7461, "step": 12435 }, { "epoch": 1.4953720399086428, "grad_norm": 0.7011404633522034, "learning_rate": 0.00047588954157626816, "loss": 0.907, "step": 12440 }, { "epoch": 1.4959730736867412, "grad_norm": 0.6832110285758972, "learning_rate": 0.00047586910739566904, "loss": 1.1207, "step": 12445 }, { "epoch": 1.4965741074648395, "grad_norm": 0.5902302861213684, "learning_rate": 0.0004758486649986049, "loss": 1.0039, "step": 12450 }, { "epoch": 1.497175141242938, "grad_norm": 0.5513035655021667, "learning_rate": 0.00047582821438581945, "loss": 0.932, "step": 12455 }, { "epoch": 1.497776175021036, "grad_norm": 0.646411657333374, "learning_rate": 0.0004758077555580568, "loss": 0.9703, "step": 12460 }, { "epoch": 1.4983772087991345, "grad_norm": 0.7877782583236694, "learning_rate": 0.0004757872885160609, "loss": 1.0773, "step": 12465 }, { "epoch": 1.4989782425772329, "grad_norm": 0.6936209797859192, "learning_rate": 0.0004757668132605765, "loss": 0.7789, "step": 12470 }, { "epoch": 1.4995792763553313, "grad_norm": 0.745646595954895, "learning_rate": 0.00047574632979234825, "loss": 0.9727, "step": 12475 }, { "epoch": 1.5001803101334295, "grad_norm": 0.7798460125923157, "learning_rate": 0.00047572583811212146, "loss": 0.8359, "step": 12480 }, { "epoch": 1.5007813439115278, "grad_norm": 0.47348013520240784, "learning_rate": 0.0004757053382206414, "loss": 0.8695, "step": 12485 }, { "epoch": 1.5013823776896262, "grad_norm": 0.7061572670936584, "learning_rate": 0.00047568483011865397, "loss": 0.7414, "step": 12490 }, { "epoch": 1.5019834114677244, "grad_norm": 0.659461498260498, "learning_rate": 0.00047566431380690504, "loss": 0.8414, "step": 12495 }, { "epoch": 1.5025844452458228, "grad_norm": 0.7134028077125549, "learning_rate": 0.00047564378928614104, "loss": 0.8648, "step": 12500 }, { "epoch": 1.5031854790239212, "grad_norm": 0.6794770956039429, "learning_rate": 0.00047562325655710845, "loss": 0.9016, "step": 12505 }, { "epoch": 1.5037865128020194, "grad_norm": 0.7075036764144897, "learning_rate": 0.0004756027156205544, "loss": 0.8375, "step": 12510 }, { "epoch": 1.5043875465801178, "grad_norm": 0.6950543522834778, "learning_rate": 0.00047558216647722594, "loss": 0.9398, "step": 12515 }, { "epoch": 1.5049885803582161, "grad_norm": 0.76094651222229, "learning_rate": 0.00047556160912787064, "loss": 0.9906, "step": 12520 }, { "epoch": 1.5055896141363143, "grad_norm": 0.5852873921394348, "learning_rate": 0.00047554104357323633, "loss": 1.1922, "step": 12525 }, { "epoch": 1.506190647914413, "grad_norm": 0.8085603713989258, "learning_rate": 0.00047552046981407117, "loss": 0.9578, "step": 12530 }, { "epoch": 1.506791681692511, "grad_norm": 0.7374091148376465, "learning_rate": 0.0004754998878511234, "loss": 0.823, "step": 12535 }, { "epoch": 1.5073927154706095, "grad_norm": 0.6830416917800903, "learning_rate": 0.00047547929768514195, "loss": 0.916, "step": 12540 }, { "epoch": 1.5079937492487079, "grad_norm": 0.7086138725280762, "learning_rate": 0.0004754586993168757, "loss": 1.1336, "step": 12545 }, { "epoch": 1.508594783026806, "grad_norm": 0.9915233254432678, "learning_rate": 0.0004754380927470739, "loss": 0.8898, "step": 12550 }, { "epoch": 1.5091958168049044, "grad_norm": 0.7767045497894287, "learning_rate": 0.0004754174779764864, "loss": 0.8094, "step": 12555 }, { "epoch": 1.5097968505830028, "grad_norm": 0.6242465376853943, "learning_rate": 0.00047539685500586293, "loss": 0.7875, "step": 12560 }, { "epoch": 1.510397884361101, "grad_norm": 0.7357817888259888, "learning_rate": 0.0004753762238359537, "loss": 0.9445, "step": 12565 }, { "epoch": 1.5109989181391994, "grad_norm": 0.6769397258758545, "learning_rate": 0.0004753555844675092, "loss": 1.1406, "step": 12570 }, { "epoch": 1.5115999519172978, "grad_norm": 0.545312225818634, "learning_rate": 0.00047533493690128024, "loss": 0.9641, "step": 12575 }, { "epoch": 1.512200985695396, "grad_norm": 0.3835344612598419, "learning_rate": 0.00047531428113801804, "loss": 0.9316, "step": 12580 }, { "epoch": 1.5128020194734946, "grad_norm": 0.5439361333847046, "learning_rate": 0.00047529361717847386, "loss": 0.7875, "step": 12585 }, { "epoch": 1.5134030532515927, "grad_norm": 0.8007640838623047, "learning_rate": 0.00047527294502339943, "loss": 0.807, "step": 12590 }, { "epoch": 1.514004087029691, "grad_norm": 0.552044153213501, "learning_rate": 0.0004752522646735467, "loss": 0.5547, "step": 12595 }, { "epoch": 1.5146051208077895, "grad_norm": 0.7725129723548889, "learning_rate": 0.00047523157612966807, "loss": 0.9617, "step": 12600 }, { "epoch": 1.5146051208077895, "eval_loss": 1.926660180091858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1957, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 12600 }, { "epoch": 1.5152061545858877, "grad_norm": 0.6693068146705627, "learning_rate": 0.00047521087939251605, "loss": 0.9527, "step": 12605 }, { "epoch": 1.515807188363986, "grad_norm": 0.8175145983695984, "learning_rate": 0.00047519017446284354, "loss": 0.6984, "step": 12610 }, { "epoch": 1.5164082221420845, "grad_norm": 0.5854190587997437, "learning_rate": 0.0004751694613414037, "loss": 0.7672, "step": 12615 }, { "epoch": 1.5170092559201827, "grad_norm": 0.5882565975189209, "learning_rate": 0.00047514874002895, "loss": 0.9641, "step": 12620 }, { "epoch": 1.517610289698281, "grad_norm": 0.6106027960777283, "learning_rate": 0.0004751280105262364, "loss": 0.7898, "step": 12625 }, { "epoch": 1.5182113234763794, "grad_norm": 0.47339731454849243, "learning_rate": 0.0004751072728340168, "loss": 1.1195, "step": 12630 }, { "epoch": 1.5188123572544776, "grad_norm": 0.6933841109275818, "learning_rate": 0.00047508652695304554, "loss": 0.9797, "step": 12635 }, { "epoch": 1.519413391032576, "grad_norm": 0.5944886803627014, "learning_rate": 0.0004750657728840775, "loss": 0.6145, "step": 12640 }, { "epoch": 1.5200144248106744, "grad_norm": 0.6986210346221924, "learning_rate": 0.0004750450106278674, "loss": 0.807, "step": 12645 }, { "epoch": 1.5206154585887726, "grad_norm": 0.6916808485984802, "learning_rate": 0.00047502424018517074, "loss": 1.0805, "step": 12650 }, { "epoch": 1.5212164923668712, "grad_norm": 0.6346192955970764, "learning_rate": 0.0004750034615567429, "loss": 1.3367, "step": 12655 }, { "epoch": 1.5218175261449693, "grad_norm": 0.5075841546058655, "learning_rate": 0.0004749826747433399, "loss": 0.8266, "step": 12660 }, { "epoch": 1.5224185599230675, "grad_norm": 0.7573110461235046, "learning_rate": 0.0004749618797457179, "loss": 0.8707, "step": 12665 }, { "epoch": 1.5230195937011661, "grad_norm": 0.5818377733230591, "learning_rate": 0.0004749410765646332, "loss": 0.8266, "step": 12670 }, { "epoch": 1.5236206274792643, "grad_norm": 0.6120326519012451, "learning_rate": 0.0004749202652008427, "loss": 0.9109, "step": 12675 }, { "epoch": 1.5242216612573627, "grad_norm": 0.6501766443252563, "learning_rate": 0.0004748994456551034, "loss": 1.15, "step": 12680 }, { "epoch": 1.524822695035461, "grad_norm": 0.6754354238510132, "learning_rate": 0.00047487861792817264, "loss": 1.2117, "step": 12685 }, { "epoch": 1.5254237288135593, "grad_norm": 0.6575729846954346, "learning_rate": 0.0004748577820208082, "loss": 0.8852, "step": 12690 }, { "epoch": 1.5260247625916576, "grad_norm": 0.8039092421531677, "learning_rate": 0.00047483693793376794, "loss": 1.1687, "step": 12695 }, { "epoch": 1.526625796369756, "grad_norm": 1.008545160293579, "learning_rate": 0.00047481608566781, "loss": 0.8289, "step": 12700 }, { "epoch": 1.5272268301478542, "grad_norm": 0.4198199212551117, "learning_rate": 0.0004747952252236931, "loss": 0.8187, "step": 12705 }, { "epoch": 1.5278278639259526, "grad_norm": 0.6067907214164734, "learning_rate": 0.0004747743566021759, "loss": 0.9016, "step": 12710 }, { "epoch": 1.528428897704051, "grad_norm": 0.5127436518669128, "learning_rate": 0.0004747534798040178, "loss": 0.9113, "step": 12715 }, { "epoch": 1.5290299314821492, "grad_norm": 0.7138319611549377, "learning_rate": 0.000474732594829978, "loss": 0.9062, "step": 12720 }, { "epoch": 1.5296309652602478, "grad_norm": 0.8510314226150513, "learning_rate": 0.0004747117016808164, "loss": 0.7484, "step": 12725 }, { "epoch": 1.530231999038346, "grad_norm": 0.6589043736457825, "learning_rate": 0.0004746908003572929, "loss": 0.7535, "step": 12730 }, { "epoch": 1.5308330328164443, "grad_norm": 0.5036451816558838, "learning_rate": 0.00047466989086016785, "loss": 0.6293, "step": 12735 }, { "epoch": 1.5314340665945427, "grad_norm": 0.6466298699378967, "learning_rate": 0.000474648973190202, "loss": 0.7484, "step": 12740 }, { "epoch": 1.532035100372641, "grad_norm": 0.6602294445037842, "learning_rate": 0.0004746280473481561, "loss": 1.0195, "step": 12745 }, { "epoch": 1.5326361341507393, "grad_norm": 0.6785079836845398, "learning_rate": 0.0004746071133347916, "loss": 0.9656, "step": 12750 }, { "epoch": 1.5332371679288377, "grad_norm": 0.7140780687332153, "learning_rate": 0.0004745861711508698, "loss": 1.0234, "step": 12755 }, { "epoch": 1.5338382017069359, "grad_norm": 0.8196379542350769, "learning_rate": 0.0004745652207971525, "loss": 1.2656, "step": 12760 }, { "epoch": 1.5344392354850342, "grad_norm": 0.682145893573761, "learning_rate": 0.000474544262274402, "loss": 0.875, "step": 12765 }, { "epoch": 1.5350402692631326, "grad_norm": 0.47468167543411255, "learning_rate": 0.0004745232955833806, "loss": 0.9867, "step": 12770 }, { "epoch": 1.5356413030412308, "grad_norm": 0.5364683866500854, "learning_rate": 0.000474502320724851, "loss": 0.9523, "step": 12775 }, { "epoch": 1.5362423368193292, "grad_norm": 0.7835503220558167, "learning_rate": 0.0004744813376995763, "loss": 1.1492, "step": 12780 }, { "epoch": 1.5368433705974276, "grad_norm": 0.5231155157089233, "learning_rate": 0.0004744603465083197, "loss": 0.609, "step": 12785 }, { "epoch": 1.5374444043755258, "grad_norm": 0.9372231960296631, "learning_rate": 0.00047443934715184476, "loss": 1.2703, "step": 12790 }, { "epoch": 1.5380454381536244, "grad_norm": 0.4742888808250427, "learning_rate": 0.0004744183396309155, "loss": 0.6832, "step": 12795 }, { "epoch": 1.5386464719317225, "grad_norm": 0.26846635341644287, "learning_rate": 0.0004743973239462961, "loss": 0.8227, "step": 12800 }, { "epoch": 1.5386464719317225, "eval_loss": 1.8889648914337158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2182, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 12800 }, { "epoch": 1.539247505709821, "grad_norm": 0.5749069452285767, "learning_rate": 0.000474376300098751, "loss": 0.8934, "step": 12805 }, { "epoch": 1.5398485394879193, "grad_norm": 0.5642238259315491, "learning_rate": 0.0004743552680890449, "loss": 0.9945, "step": 12810 }, { "epoch": 1.5404495732660175, "grad_norm": 0.5721663236618042, "learning_rate": 0.0004743342279179431, "loss": 0.8516, "step": 12815 }, { "epoch": 1.541050607044116, "grad_norm": 0.46253702044487, "learning_rate": 0.0004743131795862108, "loss": 0.9207, "step": 12820 }, { "epoch": 1.5416516408222143, "grad_norm": 0.7346284985542297, "learning_rate": 0.0004742921230946138, "loss": 1.1168, "step": 12825 }, { "epoch": 1.5422526746003125, "grad_norm": 0.5650750994682312, "learning_rate": 0.00047427105844391796, "loss": 0.8742, "step": 12830 }, { "epoch": 1.5428537083784108, "grad_norm": 0.4906032383441925, "learning_rate": 0.00047424998563488956, "loss": 0.7164, "step": 12835 }, { "epoch": 1.5434547421565092, "grad_norm": 0.44781020283699036, "learning_rate": 0.0004742289046682953, "loss": 0.8965, "step": 12840 }, { "epoch": 1.5440557759346074, "grad_norm": 0.5351720452308655, "learning_rate": 0.0004742078155449019, "loss": 0.5508, "step": 12845 }, { "epoch": 1.5446568097127058, "grad_norm": 0.7030283808708191, "learning_rate": 0.00047418671826547653, "loss": 0.8133, "step": 12850 }, { "epoch": 1.5452578434908042, "grad_norm": 0.7418191432952881, "learning_rate": 0.00047416561283078674, "loss": 0.6844, "step": 12855 }, { "epoch": 1.5458588772689024, "grad_norm": 0.5547045469284058, "learning_rate": 0.0004741444992416003, "loss": 1.3477, "step": 12860 }, { "epoch": 1.546459911047001, "grad_norm": 0.5346720814704895, "learning_rate": 0.0004741233774986851, "loss": 0.6816, "step": 12865 }, { "epoch": 1.5470609448250991, "grad_norm": 0.5552061796188354, "learning_rate": 0.00047410224760280963, "loss": 0.9297, "step": 12870 }, { "epoch": 1.5476619786031975, "grad_norm": 0.6268603801727295, "learning_rate": 0.00047408110955474256, "loss": 0.8762, "step": 12875 }, { "epoch": 1.548263012381296, "grad_norm": 0.6950808763504028, "learning_rate": 0.0004740599633552527, "loss": 0.8707, "step": 12880 }, { "epoch": 1.548864046159394, "grad_norm": 0.6177887916564941, "learning_rate": 0.0004740388090051093, "loss": 0.7508, "step": 12885 }, { "epoch": 1.5494650799374925, "grad_norm": 0.7422083616256714, "learning_rate": 0.0004740176465050821, "loss": 1.3242, "step": 12890 }, { "epoch": 1.5500661137155909, "grad_norm": 0.43865928053855896, "learning_rate": 0.0004739964758559406, "loss": 1.0961, "step": 12895 }, { "epoch": 1.550667147493689, "grad_norm": 0.5872899889945984, "learning_rate": 0.0004739752970584552, "loss": 0.8086, "step": 12900 }, { "epoch": 1.5512681812717874, "grad_norm": 0.536278486251831, "learning_rate": 0.00047395411011339617, "loss": 1.1078, "step": 12905 }, { "epoch": 1.5518692150498858, "grad_norm": 0.5660296082496643, "learning_rate": 0.00047393291502153433, "loss": 0.7609, "step": 12910 }, { "epoch": 1.552470248827984, "grad_norm": 0.7178237438201904, "learning_rate": 0.0004739117117836407, "loss": 0.9914, "step": 12915 }, { "epoch": 1.5530712826060826, "grad_norm": 0.5997413396835327, "learning_rate": 0.0004738905004004865, "loss": 0.8508, "step": 12920 }, { "epoch": 1.5536723163841808, "grad_norm": 0.6288643479347229, "learning_rate": 0.0004738692808728433, "loss": 0.7898, "step": 12925 }, { "epoch": 1.554273350162279, "grad_norm": 0.4826321601867676, "learning_rate": 0.0004738480532014832, "loss": 1.4414, "step": 12930 }, { "epoch": 1.5548743839403776, "grad_norm": 0.4631876051425934, "learning_rate": 0.00047382681738717823, "loss": 0.5984, "step": 12935 }, { "epoch": 1.5554754177184758, "grad_norm": 0.7025954723358154, "learning_rate": 0.00047380557343070103, "loss": 0.9555, "step": 12940 }, { "epoch": 1.5560764514965741, "grad_norm": 0.797414243221283, "learning_rate": 0.0004737843213328242, "loss": 1.0742, "step": 12945 }, { "epoch": 1.5566774852746725, "grad_norm": 0.5380562543869019, "learning_rate": 0.000473763061094321, "loss": 1.1594, "step": 12950 }, { "epoch": 1.5572785190527707, "grad_norm": 0.858157753944397, "learning_rate": 0.00047374179271596477, "loss": 0.884, "step": 12955 }, { "epoch": 1.557879552830869, "grad_norm": 0.30107933282852173, "learning_rate": 0.0004737205161985292, "loss": 0.8143, "step": 12960 }, { "epoch": 1.5584805866089675, "grad_norm": 0.4761585593223572, "learning_rate": 0.00047369923154278815, "loss": 0.7547, "step": 12965 }, { "epoch": 1.5590816203870657, "grad_norm": 0.7264707684516907, "learning_rate": 0.0004736779387495161, "loss": 0.6852, "step": 12970 }, { "epoch": 1.559682654165164, "grad_norm": 0.870913028717041, "learning_rate": 0.0004736566378194874, "loss": 1.0285, "step": 12975 }, { "epoch": 1.5602836879432624, "grad_norm": 0.8177791237831116, "learning_rate": 0.00047363532875347716, "loss": 0.7477, "step": 12980 }, { "epoch": 1.5608847217213606, "grad_norm": 0.5882327556610107, "learning_rate": 0.00047361401155226036, "loss": 0.9398, "step": 12985 }, { "epoch": 1.5614857554994592, "grad_norm": 0.37226995825767517, "learning_rate": 0.00047359268621661245, "loss": 0.6699, "step": 12990 }, { "epoch": 1.5620867892775574, "grad_norm": 0.8597920536994934, "learning_rate": 0.00047357135274730933, "loss": 0.6965, "step": 12995 }, { "epoch": 1.5626878230556556, "grad_norm": 0.7695097923278809, "learning_rate": 0.00047355001114512686, "loss": 0.9211, "step": 13000 }, { "epoch": 1.5626878230556556, "eval_loss": 1.91943359375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2061, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 13000 }, { "epoch": 1.5632888568337542, "grad_norm": 0.6630449295043945, "learning_rate": 0.0004735286614108416, "loss": 1.2695, "step": 13005 }, { "epoch": 1.5638898906118524, "grad_norm": 0.5374677181243896, "learning_rate": 0.00047350730354523, "loss": 0.7586, "step": 13010 }, { "epoch": 1.5644909243899507, "grad_norm": 0.5258702039718628, "learning_rate": 0.0004734859375490691, "loss": 1.0273, "step": 13015 }, { "epoch": 1.5650919581680491, "grad_norm": 0.4924733340740204, "learning_rate": 0.0004734645634231361, "loss": 0.6672, "step": 13020 }, { "epoch": 1.5656929919461473, "grad_norm": 0.6051394939422607, "learning_rate": 0.00047344318116820857, "loss": 0.5875, "step": 13025 }, { "epoch": 1.5662940257242457, "grad_norm": 0.6795390248298645, "learning_rate": 0.0004734217907850643, "loss": 1.282, "step": 13030 }, { "epoch": 1.566895059502344, "grad_norm": 0.8399996161460876, "learning_rate": 0.0004734003922744814, "loss": 1.2484, "step": 13035 }, { "epoch": 1.5674960932804423, "grad_norm": 0.6017289757728577, "learning_rate": 0.00047337898563723827, "loss": 0.5152, "step": 13040 }, { "epoch": 1.5680971270585407, "grad_norm": 0.4516098201274872, "learning_rate": 0.0004733575708741137, "loss": 0.9859, "step": 13045 }, { "epoch": 1.568698160836639, "grad_norm": 0.6040613651275635, "learning_rate": 0.00047333614798588666, "loss": 0.7672, "step": 13050 }, { "epoch": 1.5692991946147372, "grad_norm": 0.6769964098930359, "learning_rate": 0.0004733147169733364, "loss": 0.9414, "step": 13055 }, { "epoch": 1.5699002283928358, "grad_norm": 0.4154891073703766, "learning_rate": 0.00047329327783724253, "loss": 1.1516, "step": 13060 }, { "epoch": 1.570501262170934, "grad_norm": 0.5522969961166382, "learning_rate": 0.00047327183057838503, "loss": 0.8078, "step": 13065 }, { "epoch": 1.5711022959490324, "grad_norm": 0.7137904763221741, "learning_rate": 0.00047325037519754397, "loss": 0.7008, "step": 13070 }, { "epoch": 1.5717033297271308, "grad_norm": 0.6501874923706055, "learning_rate": 0.0004732289116955, "loss": 0.9523, "step": 13075 }, { "epoch": 1.572304363505229, "grad_norm": 0.7527011036872864, "learning_rate": 0.00047320744007303374, "loss": 0.816, "step": 13080 }, { "epoch": 1.5729053972833273, "grad_norm": 0.8173070549964905, "learning_rate": 0.0004731859603309263, "loss": 0.7727, "step": 13085 }, { "epoch": 1.5735064310614257, "grad_norm": 0.46920570731163025, "learning_rate": 0.00047316447246995914, "loss": 0.8043, "step": 13090 }, { "epoch": 1.574107464839524, "grad_norm": 0.913514256477356, "learning_rate": 0.0004731429764909138, "loss": 1.1477, "step": 13095 }, { "epoch": 1.5747084986176223, "grad_norm": 0.49969181418418884, "learning_rate": 0.00047312147239457237, "loss": 0.932, "step": 13100 }, { "epoch": 1.5753095323957207, "grad_norm": 0.5864337086677551, "learning_rate": 0.00047309996018171695, "loss": 1.0594, "step": 13105 }, { "epoch": 1.5759105661738189, "grad_norm": 0.7593932151794434, "learning_rate": 0.00047307843985313025, "loss": 0.8641, "step": 13110 }, { "epoch": 1.5765115999519173, "grad_norm": 0.6334415674209595, "learning_rate": 0.000473056911409595, "loss": 0.9227, "step": 13115 }, { "epoch": 1.5771126337300156, "grad_norm": 0.7549822330474854, "learning_rate": 0.0004730353748518944, "loss": 1.1523, "step": 13120 }, { "epoch": 1.5777136675081138, "grad_norm": 0.6202312111854553, "learning_rate": 0.0004730138301808119, "loss": 1.1391, "step": 13125 }, { "epoch": 1.5783147012862124, "grad_norm": 0.8628902435302734, "learning_rate": 0.0004729922773971312, "loss": 0.9117, "step": 13130 }, { "epoch": 1.5789157350643106, "grad_norm": 0.34794557094573975, "learning_rate": 0.00047297071650163636, "loss": 0.682, "step": 13135 }, { "epoch": 1.579516768842409, "grad_norm": 0.6737480163574219, "learning_rate": 0.0004729491474951116, "loss": 0.9336, "step": 13140 }, { "epoch": 1.5801178026205074, "grad_norm": 0.604663610458374, "learning_rate": 0.0004729275703783417, "loss": 0.7871, "step": 13145 }, { "epoch": 1.5807188363986056, "grad_norm": 0.8120701313018799, "learning_rate": 0.0004729059851521115, "loss": 0.7238, "step": 13150 }, { "epoch": 1.581319870176704, "grad_norm": 0.6737380027770996, "learning_rate": 0.0004728843918172062, "loss": 0.9117, "step": 13155 }, { "epoch": 1.5819209039548023, "grad_norm": 0.7524706721305847, "learning_rate": 0.00047286279037441126, "loss": 0.9703, "step": 13160 }, { "epoch": 1.5825219377329005, "grad_norm": 0.9580506086349487, "learning_rate": 0.00047284118082451253, "loss": 0.7156, "step": 13165 }, { "epoch": 1.583122971510999, "grad_norm": 0.5387940406799316, "learning_rate": 0.0004728195631682961, "loss": 1.0227, "step": 13170 }, { "epoch": 1.5837240052890973, "grad_norm": 0.7800549268722534, "learning_rate": 0.00047279793740654844, "loss": 0.9008, "step": 13175 }, { "epoch": 1.5843250390671955, "grad_norm": 0.4982510209083557, "learning_rate": 0.000472776303540056, "loss": 0.6348, "step": 13180 }, { "epoch": 1.5849260728452939, "grad_norm": 0.7826941013336182, "learning_rate": 0.000472754661569606, "loss": 1.077, "step": 13185 }, { "epoch": 1.5855271066233922, "grad_norm": 0.5226656794548035, "learning_rate": 0.00047273301149598556, "loss": 0.9508, "step": 13190 }, { "epoch": 1.5861281404014904, "grad_norm": 0.7028262615203857, "learning_rate": 0.0004727113533199824, "loss": 1.2539, "step": 13195 }, { "epoch": 1.586729174179589, "grad_norm": 0.5015523433685303, "learning_rate": 0.0004726896870423841, "loss": 0.9898, "step": 13200 }, { "epoch": 1.586729174179589, "eval_loss": 1.899511694908142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.212, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 13200 }, { "epoch": 1.5873302079576872, "grad_norm": 0.6886287331581116, "learning_rate": 0.00047266801266397917, "loss": 0.8383, "step": 13205 }, { "epoch": 1.5879312417357856, "grad_norm": 0.5920904874801636, "learning_rate": 0.0004726463301855559, "loss": 1.0719, "step": 13210 }, { "epoch": 1.588532275513884, "grad_norm": 0.5949668288230896, "learning_rate": 0.00047262463960790293, "loss": 0.7406, "step": 13215 }, { "epoch": 1.5891333092919822, "grad_norm": 0.6875108480453491, "learning_rate": 0.00047260294093180944, "loss": 1.275, "step": 13220 }, { "epoch": 1.5897343430700805, "grad_norm": 0.5957589149475098, "learning_rate": 0.0004725812341580647, "loss": 1.0746, "step": 13225 }, { "epoch": 1.590335376848179, "grad_norm": 0.47029590606689453, "learning_rate": 0.00047255951928745836, "loss": 0.7422, "step": 13230 }, { "epoch": 1.590936410626277, "grad_norm": 0.6401347517967224, "learning_rate": 0.0004725377963207804, "loss": 0.8789, "step": 13235 }, { "epoch": 1.5915374444043755, "grad_norm": 0.6766828298568726, "learning_rate": 0.00047251606525882094, "loss": 0.7273, "step": 13240 }, { "epoch": 1.592138478182474, "grad_norm": 0.6120871901512146, "learning_rate": 0.0004724943261023706, "loss": 0.8082, "step": 13245 }, { "epoch": 1.592739511960572, "grad_norm": 0.902204155921936, "learning_rate": 0.00047247257885222005, "loss": 1.0723, "step": 13250 }, { "epoch": 1.5933405457386707, "grad_norm": 0.8353217840194702, "learning_rate": 0.00047245082350916046, "loss": 0.9531, "step": 13255 }, { "epoch": 1.5939415795167688, "grad_norm": 0.6984806060791016, "learning_rate": 0.00047242906007398334, "loss": 0.9539, "step": 13260 }, { "epoch": 1.594542613294867, "grad_norm": 0.8646644353866577, "learning_rate": 0.00047240728854748017, "loss": 0.6598, "step": 13265 }, { "epoch": 1.5951436470729656, "grad_norm": 0.6523377299308777, "learning_rate": 0.00047238550893044306, "loss": 0.7105, "step": 13270 }, { "epoch": 1.5957446808510638, "grad_norm": 0.6353850364685059, "learning_rate": 0.00047236372122366434, "loss": 0.9184, "step": 13275 }, { "epoch": 1.5963457146291622, "grad_norm": 0.5403639674186707, "learning_rate": 0.0004723419254279365, "loss": 0.8566, "step": 13280 }, { "epoch": 1.5969467484072606, "grad_norm": 0.5633966326713562, "learning_rate": 0.00047232012154405236, "loss": 0.8883, "step": 13285 }, { "epoch": 1.5975477821853588, "grad_norm": 0.6972243785858154, "learning_rate": 0.00047229830957280517, "loss": 0.8219, "step": 13290 }, { "epoch": 1.5981488159634571, "grad_norm": 0.5912173390388489, "learning_rate": 0.00047227648951498843, "loss": 0.5832, "step": 13295 }, { "epoch": 1.5987498497415555, "grad_norm": 0.4919407069683075, "learning_rate": 0.0004722546613713957, "loss": 0.9812, "step": 13300 }, { "epoch": 1.5993508835196537, "grad_norm": 0.503868579864502, "learning_rate": 0.0004722328251428213, "loss": 1.0328, "step": 13305 }, { "epoch": 1.599951917297752, "grad_norm": 0.38204383850097656, "learning_rate": 0.0004722109808300593, "loss": 0.7312, "step": 13310 }, { "epoch": 1.6005529510758505, "grad_norm": 0.6426239609718323, "learning_rate": 0.0004721891284339045, "loss": 1.0813, "step": 13315 }, { "epoch": 1.6011539848539487, "grad_norm": 0.5860337615013123, "learning_rate": 0.00047216726795515173, "loss": 0.6613, "step": 13320 }, { "epoch": 1.6017550186320473, "grad_norm": 0.7284621000289917, "learning_rate": 0.00047214539939459627, "loss": 0.9883, "step": 13325 }, { "epoch": 1.6023560524101454, "grad_norm": 0.6601481437683105, "learning_rate": 0.0004721235227530336, "loss": 0.666, "step": 13330 }, { "epoch": 1.6029570861882436, "grad_norm": 0.5403615236282349, "learning_rate": 0.0004721016380312596, "loss": 0.9195, "step": 13335 }, { "epoch": 1.6035581199663422, "grad_norm": 0.6052476763725281, "learning_rate": 0.0004720797452300703, "loss": 0.9688, "step": 13340 }, { "epoch": 1.6041591537444404, "grad_norm": 0.6085507869720459, "learning_rate": 0.00047205784435026214, "loss": 1.0223, "step": 13345 }, { "epoch": 1.6047601875225388, "grad_norm": 0.6806771755218506, "learning_rate": 0.0004720359353926318, "loss": 1.0437, "step": 13350 }, { "epoch": 1.6053612213006372, "grad_norm": 0.45934852957725525, "learning_rate": 0.0004720140183579762, "loss": 0.835, "step": 13355 }, { "epoch": 1.6059622550787354, "grad_norm": 0.6096804141998291, "learning_rate": 0.0004719920932470927, "loss": 0.8969, "step": 13360 }, { "epoch": 1.6065632888568337, "grad_norm": 0.6965910196304321, "learning_rate": 0.00047197016006077886, "loss": 1.0227, "step": 13365 }, { "epoch": 1.6071643226349321, "grad_norm": 0.6849845051765442, "learning_rate": 0.0004719482187998325, "loss": 0.8328, "step": 13370 }, { "epoch": 1.6077653564130303, "grad_norm": 0.6051676869392395, "learning_rate": 0.00047192626946505184, "loss": 0.8223, "step": 13375 }, { "epoch": 1.6083663901911287, "grad_norm": 0.4394454061985016, "learning_rate": 0.0004719043120572353, "loss": 0.7312, "step": 13380 }, { "epoch": 1.608967423969227, "grad_norm": 0.5389827489852905, "learning_rate": 0.00047188234657718163, "loss": 0.682, "step": 13385 }, { "epoch": 1.6095684577473253, "grad_norm": 0.6120681166648865, "learning_rate": 0.0004718603730256899, "loss": 1.0109, "step": 13390 }, { "epoch": 1.6101694915254239, "grad_norm": 0.7860174179077148, "learning_rate": 0.0004718383914035594, "loss": 0.932, "step": 13395 }, { "epoch": 1.610770525303522, "grad_norm": 0.6418444514274597, "learning_rate": 0.00047181640171158973, "loss": 0.7098, "step": 13400 }, { "epoch": 1.610770525303522, "eval_loss": 1.9319336414337158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1892, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 13400 }, { "epoch": 1.6113715590816204, "grad_norm": 0.4149583578109741, "learning_rate": 0.00047179440395058093, "loss": 0.6969, "step": 13405 }, { "epoch": 1.6119725928597188, "grad_norm": 0.522482693195343, "learning_rate": 0.0004717723981213331, "loss": 0.8766, "step": 13410 }, { "epoch": 1.612573626637817, "grad_norm": 0.5530385375022888, "learning_rate": 0.0004717503842246468, "loss": 1.1016, "step": 13415 }, { "epoch": 1.6131746604159154, "grad_norm": 0.6051827669143677, "learning_rate": 0.00047172836226132275, "loss": 1.0633, "step": 13420 }, { "epoch": 1.6137756941940138, "grad_norm": 0.5502833724021912, "learning_rate": 0.0004717063322321622, "loss": 0.7273, "step": 13425 }, { "epoch": 1.614376727972112, "grad_norm": 0.5018969178199768, "learning_rate": 0.0004716842941379664, "loss": 0.8586, "step": 13430 }, { "epoch": 1.6149777617502103, "grad_norm": 0.8088052868843079, "learning_rate": 0.00047166224797953716, "loss": 1.0797, "step": 13435 }, { "epoch": 1.6155787955283087, "grad_norm": 0.6862267255783081, "learning_rate": 0.00047164019375767636, "loss": 0.9535, "step": 13440 }, { "epoch": 1.616179829306407, "grad_norm": 0.462275892496109, "learning_rate": 0.00047161813147318627, "loss": 1.0398, "step": 13445 }, { "epoch": 1.6167808630845053, "grad_norm": 0.4753485918045044, "learning_rate": 0.0004715960611268695, "loss": 1.0063, "step": 13450 }, { "epoch": 1.6173818968626037, "grad_norm": 0.6200545430183411, "learning_rate": 0.00047157398271952883, "loss": 0.9398, "step": 13455 }, { "epoch": 1.6179829306407019, "grad_norm": 0.6016744375228882, "learning_rate": 0.00047155189625196746, "loss": 0.8484, "step": 13460 }, { "epoch": 1.6185839644188005, "grad_norm": 0.5368013978004456, "learning_rate": 0.0004715298017249889, "loss": 1.0219, "step": 13465 }, { "epoch": 1.6191849981968987, "grad_norm": 0.40214380621910095, "learning_rate": 0.00047150769913939674, "loss": 0.8152, "step": 13470 }, { "epoch": 1.619786031974997, "grad_norm": 0.5101467967033386, "learning_rate": 0.0004714855884959951, "loss": 1.1031, "step": 13475 }, { "epoch": 1.6203870657530954, "grad_norm": 0.5352076292037964, "learning_rate": 0.0004714634697955883, "loss": 0.7145, "step": 13480 }, { "epoch": 1.6209880995311936, "grad_norm": 0.6837683320045471, "learning_rate": 0.00047144134303898097, "loss": 1.0117, "step": 13485 }, { "epoch": 1.621589133309292, "grad_norm": 0.731714129447937, "learning_rate": 0.0004714192082269779, "loss": 1.0695, "step": 13490 }, { "epoch": 1.6221901670873904, "grad_norm": 0.36670467257499695, "learning_rate": 0.00047139706536038445, "loss": 0.8551, "step": 13495 }, { "epoch": 1.6227912008654886, "grad_norm": 0.5586612224578857, "learning_rate": 0.000471374914440006, "loss": 1.2531, "step": 13500 }, { "epoch": 1.623392234643587, "grad_norm": 0.7070698738098145, "learning_rate": 0.00047135275546664846, "loss": 1.0195, "step": 13505 }, { "epoch": 1.6239932684216853, "grad_norm": 0.6747303009033203, "learning_rate": 0.0004713305884411178, "loss": 1.0312, "step": 13510 }, { "epoch": 1.6245943021997835, "grad_norm": 0.5010889172554016, "learning_rate": 0.00047130841336422033, "loss": 1.3148, "step": 13515 }, { "epoch": 1.625195335977882, "grad_norm": 0.7816142439842224, "learning_rate": 0.0004712862302367629, "loss": 1.2004, "step": 13520 }, { "epoch": 1.6257963697559803, "grad_norm": 0.8569761514663696, "learning_rate": 0.0004712640390595523, "loss": 0.6508, "step": 13525 }, { "epoch": 1.6263974035340785, "grad_norm": 0.5181674361228943, "learning_rate": 0.00047124183983339587, "loss": 0.9523, "step": 13530 }, { "epoch": 1.626998437312177, "grad_norm": 0.675092339515686, "learning_rate": 0.00047121963255910106, "loss": 0.6867, "step": 13535 }, { "epoch": 1.6275994710902753, "grad_norm": 0.6229550838470459, "learning_rate": 0.0004711974172374759, "loss": 0.6875, "step": 13540 }, { "epoch": 1.6282005048683736, "grad_norm": 0.7590916156768799, "learning_rate": 0.0004711751938693283, "loss": 0.8627, "step": 13545 }, { "epoch": 1.628801538646472, "grad_norm": 0.9761258959770203, "learning_rate": 0.0004711529624554669, "loss": 1.0078, "step": 13550 }, { "epoch": 1.6294025724245702, "grad_norm": 0.6167377829551697, "learning_rate": 0.0004711307229967002, "loss": 0.8816, "step": 13555 }, { "epoch": 1.6300036062026686, "grad_norm": 0.5394764542579651, "learning_rate": 0.0004711084754938374, "loss": 0.8773, "step": 13560 }, { "epoch": 1.630604639980767, "grad_norm": 0.5855737924575806, "learning_rate": 0.0004710862199476876, "loss": 0.7562, "step": 13565 }, { "epoch": 1.6312056737588652, "grad_norm": 0.7766919136047363, "learning_rate": 0.0004710639563590606, "loss": 1.2234, "step": 13570 }, { "epoch": 1.6318067075369636, "grad_norm": 0.7161315083503723, "learning_rate": 0.0004710416847287661, "loss": 0.8684, "step": 13575 }, { "epoch": 1.632407741315062, "grad_norm": 0.6269826889038086, "learning_rate": 0.00047101940505761444, "loss": 1.1945, "step": 13580 }, { "epoch": 1.6330087750931601, "grad_norm": 0.5967350006103516, "learning_rate": 0.0004709971173464159, "loss": 0.8801, "step": 13585 }, { "epoch": 1.6336098088712587, "grad_norm": 0.8685805201530457, "learning_rate": 0.0004709748215959814, "loss": 0.6074, "step": 13590 }, { "epoch": 1.634210842649357, "grad_norm": 0.5150066614151001, "learning_rate": 0.000470952517807122, "loss": 0.7012, "step": 13595 }, { "epoch": 1.634811876427455, "grad_norm": 0.5718885660171509, "learning_rate": 0.0004709302059806489, "loss": 0.7156, "step": 13600 }, { "epoch": 1.634811876427455, "eval_loss": 1.9001953601837158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2059, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 13600 }, { "epoch": 1.6354129102055537, "grad_norm": 0.7326075434684753, "learning_rate": 0.0004709078861173739, "loss": 0.7973, "step": 13605 }, { "epoch": 1.6360139439836519, "grad_norm": 0.6694378852844238, "learning_rate": 0.0004708855582181089, "loss": 0.6234, "step": 13610 }, { "epoch": 1.6366149777617502, "grad_norm": 0.5784134864807129, "learning_rate": 0.00047086322228366594, "loss": 1.0867, "step": 13615 }, { "epoch": 1.6372160115398486, "grad_norm": 0.7724462747573853, "learning_rate": 0.00047084087831485784, "loss": 0.7953, "step": 13620 }, { "epoch": 1.6378170453179468, "grad_norm": 0.7929957509040833, "learning_rate": 0.00047081852631249724, "loss": 0.7656, "step": 13625 }, { "epoch": 1.6384180790960452, "grad_norm": 0.6591349840164185, "learning_rate": 0.0004707961662773972, "loss": 0.7031, "step": 13630 }, { "epoch": 1.6390191128741436, "grad_norm": 0.857776403427124, "learning_rate": 0.00047077379821037113, "loss": 0.7086, "step": 13635 }, { "epoch": 1.6396201466522418, "grad_norm": 0.4660142958164215, "learning_rate": 0.00047075142211223286, "loss": 1.0121, "step": 13640 }, { "epoch": 1.6402211804303402, "grad_norm": 0.6814168691635132, "learning_rate": 0.0004707290379837962, "loss": 0.9797, "step": 13645 }, { "epoch": 1.6408222142084385, "grad_norm": 0.7814996838569641, "learning_rate": 0.0004707066458258755, "loss": 0.7926, "step": 13650 }, { "epoch": 1.6414232479865367, "grad_norm": 0.7591158747673035, "learning_rate": 0.00047068424563928536, "loss": 1.0859, "step": 13655 }, { "epoch": 1.6420242817646353, "grad_norm": 0.6530153751373291, "learning_rate": 0.0004706618374248405, "loss": 1.093, "step": 13660 }, { "epoch": 1.6426253155427335, "grad_norm": 0.43621137738227844, "learning_rate": 0.00047063942118335617, "loss": 0.8422, "step": 13665 }, { "epoch": 1.6432263493208317, "grad_norm": 0.481831818819046, "learning_rate": 0.0004706169969156478, "loss": 0.734, "step": 13670 }, { "epoch": 1.6438273830989303, "grad_norm": 0.5244174003601074, "learning_rate": 0.0004705945646225311, "loss": 0.7852, "step": 13675 }, { "epoch": 1.6444284168770285, "grad_norm": 0.6995603442192078, "learning_rate": 0.0004705721243048221, "loss": 1.2656, "step": 13680 }, { "epoch": 1.6450294506551268, "grad_norm": 0.6107094883918762, "learning_rate": 0.0004705496759633371, "loss": 0.9039, "step": 13685 }, { "epoch": 1.6456304844332252, "grad_norm": 0.39400115609169006, "learning_rate": 0.00047052721959889266, "loss": 0.5824, "step": 13690 }, { "epoch": 1.6462315182113234, "grad_norm": 0.566294252872467, "learning_rate": 0.00047050475521230584, "loss": 0.8297, "step": 13695 }, { "epoch": 1.6468325519894218, "grad_norm": 0.7385695576667786, "learning_rate": 0.00047048228280439366, "loss": 0.7141, "step": 13700 }, { "epoch": 1.6474335857675202, "grad_norm": 0.45049768686294556, "learning_rate": 0.0004704598023759736, "loss": 0.7352, "step": 13705 }, { "epoch": 1.6480346195456184, "grad_norm": 0.9299842119216919, "learning_rate": 0.0004704373139278636, "loss": 0.7707, "step": 13710 }, { "epoch": 1.6486356533237168, "grad_norm": 0.6763330698013306, "learning_rate": 0.0004704148174608816, "loss": 1.2531, "step": 13715 }, { "epoch": 1.6492366871018151, "grad_norm": 0.5432426333427429, "learning_rate": 0.00047039231297584596, "loss": 0.6883, "step": 13720 }, { "epoch": 1.6498377208799133, "grad_norm": 0.561834990978241, "learning_rate": 0.0004703698004735754, "loss": 0.9125, "step": 13725 }, { "epoch": 1.650438754658012, "grad_norm": 0.5149598121643066, "learning_rate": 0.0004703472799548887, "loss": 1.0031, "step": 13730 }, { "epoch": 1.65103978843611, "grad_norm": 0.4916873276233673, "learning_rate": 0.00047032475142060525, "loss": 1.0918, "step": 13735 }, { "epoch": 1.6516408222142085, "grad_norm": 0.7627952098846436, "learning_rate": 0.00047030221487154457, "loss": 1.0125, "step": 13740 }, { "epoch": 1.6522418559923069, "grad_norm": 0.6240417957305908, "learning_rate": 0.00047027967030852634, "loss": 1.0051, "step": 13745 }, { "epoch": 1.652842889770405, "grad_norm": 0.7563932538032532, "learning_rate": 0.0004702571177323708, "loss": 0.9375, "step": 13750 }, { "epoch": 1.6534439235485034, "grad_norm": 0.5754905939102173, "learning_rate": 0.0004702345571438983, "loss": 0.807, "step": 13755 }, { "epoch": 1.6540449573266018, "grad_norm": 0.8103287816047668, "learning_rate": 0.00047021198854392955, "loss": 0.6414, "step": 13760 }, { "epoch": 1.6546459911047, "grad_norm": 0.6837884187698364, "learning_rate": 0.0004701894119332855, "loss": 0.7787, "step": 13765 }, { "epoch": 1.6552470248827984, "grad_norm": 0.273602694272995, "learning_rate": 0.00047016682731278734, "loss": 0.7396, "step": 13770 }, { "epoch": 1.6558480586608968, "grad_norm": 0.7548685073852539, "learning_rate": 0.00047014423468325675, "loss": 1.0555, "step": 13775 }, { "epoch": 1.656449092438995, "grad_norm": 0.5320366621017456, "learning_rate": 0.00047012163404551563, "loss": 0.8461, "step": 13780 }, { "epoch": 1.6570501262170934, "grad_norm": 0.6091782450675964, "learning_rate": 0.000470099025400386, "loss": 0.7914, "step": 13785 }, { "epoch": 1.6576511599951917, "grad_norm": 0.6404203176498413, "learning_rate": 0.0004700764087486903, "loss": 0.852, "step": 13790 }, { "epoch": 1.65825219377329, "grad_norm": 0.7298011183738708, "learning_rate": 0.00047005378409125146, "loss": 1.3109, "step": 13795 }, { "epoch": 1.6588532275513885, "grad_norm": 0.8533394932746887, "learning_rate": 0.0004700311514288922, "loss": 0.7773, "step": 13800 }, { "epoch": 1.6588532275513885, "eval_loss": 1.916015625, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1981, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 13800 }, { "epoch": 1.6594542613294867, "grad_norm": 0.7845478057861328, "learning_rate": 0.00047000851076243604, "loss": 0.8238, "step": 13805 }, { "epoch": 1.660055295107585, "grad_norm": 0.45315414667129517, "learning_rate": 0.00046998586209270654, "loss": 0.8812, "step": 13810 }, { "epoch": 1.6606563288856835, "grad_norm": 0.7870513200759888, "learning_rate": 0.0004699632054205275, "loss": 0.8938, "step": 13815 }, { "epoch": 1.6612573626637817, "grad_norm": 0.9680578112602234, "learning_rate": 0.00046994054074672324, "loss": 1.2602, "step": 13820 }, { "epoch": 1.66185839644188, "grad_norm": 0.5297554135322571, "learning_rate": 0.0004699178680721182, "loss": 1.0422, "step": 13825 }, { "epoch": 1.6624594302199784, "grad_norm": 0.4886226952075958, "learning_rate": 0.000469895187397537, "loss": 0.7812, "step": 13830 }, { "epoch": 1.6630604639980766, "grad_norm": 0.5143516063690186, "learning_rate": 0.00046987249872380496, "loss": 0.8023, "step": 13835 }, { "epoch": 1.663661497776175, "grad_norm": 0.6982690095901489, "learning_rate": 0.0004698498020517472, "loss": 0.7461, "step": 13840 }, { "epoch": 1.6642625315542734, "grad_norm": 1.0345759391784668, "learning_rate": 0.0004698270973821894, "loss": 1.1344, "step": 13845 }, { "epoch": 1.6648635653323716, "grad_norm": 0.39363521337509155, "learning_rate": 0.00046980438471595765, "loss": 0.909, "step": 13850 }, { "epoch": 1.66546459911047, "grad_norm": 0.6059823036193848, "learning_rate": 0.00046978166405387803, "loss": 0.884, "step": 13855 }, { "epoch": 1.6660656328885683, "grad_norm": 0.5618536472320557, "learning_rate": 0.00046975893539677704, "loss": 0.9148, "step": 13860 }, { "epoch": 1.6666666666666665, "grad_norm": 0.826676070690155, "learning_rate": 0.0004697361987454815, "loss": 1.0312, "step": 13865 }, { "epoch": 1.6672677004447651, "grad_norm": 0.8051931262016296, "learning_rate": 0.00046971345410081855, "loss": 0.7625, "step": 13870 }, { "epoch": 1.6678687342228633, "grad_norm": 0.9830324053764343, "learning_rate": 0.0004696907014636156, "loss": 0.9648, "step": 13875 }, { "epoch": 1.6684697680009617, "grad_norm": 0.8053030967712402, "learning_rate": 0.0004696679408347002, "loss": 0.8168, "step": 13880 }, { "epoch": 1.66907080177906, "grad_norm": 0.9662067294120789, "learning_rate": 0.00046964517221490036, "loss": 0.6715, "step": 13885 }, { "epoch": 1.6696718355571583, "grad_norm": 0.7224244475364685, "learning_rate": 0.0004696223956050445, "loss": 0.993, "step": 13890 }, { "epoch": 1.6702728693352566, "grad_norm": 0.7713427543640137, "learning_rate": 0.00046959961100596096, "loss": 1.025, "step": 13895 }, { "epoch": 1.670873903113355, "grad_norm": 0.6482025980949402, "learning_rate": 0.0004695768184184786, "loss": 1.1289, "step": 13900 }, { "epoch": 1.6714749368914532, "grad_norm": 4.499296188354492, "learning_rate": 0.00046955401784342667, "loss": 1.1297, "step": 13905 }, { "epoch": 1.6720759706695516, "grad_norm": 1.7099672555923462, "learning_rate": 0.00046953120928163453, "loss": 0.9883, "step": 13910 }, { "epoch": 1.67267700444765, "grad_norm": 0.5057311058044434, "learning_rate": 0.00046950839273393174, "loss": 0.7602, "step": 13915 }, { "epoch": 1.6732780382257482, "grad_norm": 0.6874944567680359, "learning_rate": 0.0004694855682011486, "loss": 0.7297, "step": 13920 }, { "epoch": 1.6738790720038468, "grad_norm": 0.49933019280433655, "learning_rate": 0.0004694627356841151, "loss": 0.8266, "step": 13925 }, { "epoch": 1.674480105781945, "grad_norm": 0.4433287978172302, "learning_rate": 0.0004694398951836621, "loss": 1.1004, "step": 13930 }, { "epoch": 1.6750811395600431, "grad_norm": 0.8989080190658569, "learning_rate": 0.0004694170467006202, "loss": 1.0156, "step": 13935 }, { "epoch": 1.6756821733381417, "grad_norm": 0.7371564507484436, "learning_rate": 0.0004693941902358208, "loss": 0.6746, "step": 13940 }, { "epoch": 1.67628320711624, "grad_norm": 0.7021960020065308, "learning_rate": 0.0004693713257900952, "loss": 0.9047, "step": 13945 }, { "epoch": 1.6768842408943383, "grad_norm": 0.49400001764297485, "learning_rate": 0.00046934845336427523, "loss": 0.6816, "step": 13950 }, { "epoch": 1.6774852746724367, "grad_norm": 0.5325484275817871, "learning_rate": 0.0004693255729591928, "loss": 0.925, "step": 13955 }, { "epoch": 1.6780863084505349, "grad_norm": 0.7615514397621155, "learning_rate": 0.0004693026845756804, "loss": 0.6191, "step": 13960 }, { "epoch": 1.6786873422286333, "grad_norm": 0.5583820343017578, "learning_rate": 0.00046927978821457045, "loss": 0.8297, "step": 13965 }, { "epoch": 1.6792883760067316, "grad_norm": 0.7001646757125854, "learning_rate": 0.000469256883876696, "loss": 0.9953, "step": 13970 }, { "epoch": 1.6798894097848298, "grad_norm": 0.620525598526001, "learning_rate": 0.00046923397156289025, "loss": 0.743, "step": 13975 }, { "epoch": 1.6804904435629282, "grad_norm": 0.571262776851654, "learning_rate": 0.0004692110512739866, "loss": 0.7945, "step": 13980 }, { "epoch": 1.6810914773410266, "grad_norm": 0.6807316541671753, "learning_rate": 0.00046918812301081886, "loss": 0.8965, "step": 13985 }, { "epoch": 1.6816925111191248, "grad_norm": 0.48465320467948914, "learning_rate": 0.0004691651867742211, "loss": 0.8441, "step": 13990 }, { "epoch": 1.6822935448972234, "grad_norm": 0.8721078634262085, "learning_rate": 0.00046914224256502766, "loss": 1.1859, "step": 13995 }, { "epoch": 1.6828945786753216, "grad_norm": 0.6022788286209106, "learning_rate": 0.00046911929038407317, "loss": 1.2852, "step": 14000 }, { "epoch": 1.6828945786753216, "eval_loss": 1.88818359375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.191, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 14000 }, { "epoch": 1.6834956124534197, "grad_norm": 0.7357308864593506, "learning_rate": 0.0004690963302321927, "loss": 0.6275, "step": 14005 }, { "epoch": 1.6840966462315183, "grad_norm": 0.5796063542366028, "learning_rate": 0.00046907336211022123, "loss": 0.8156, "step": 14010 }, { "epoch": 1.6846976800096165, "grad_norm": 0.660999059677124, "learning_rate": 0.0004690503860189945, "loss": 0.8492, "step": 14015 }, { "epoch": 1.685298713787715, "grad_norm": 0.8831363916397095, "learning_rate": 0.00046902740195934814, "loss": 1.0922, "step": 14020 }, { "epoch": 1.6858997475658133, "grad_norm": 0.6950260400772095, "learning_rate": 0.00046900440993211833, "loss": 1.2188, "step": 14025 }, { "epoch": 1.6865007813439115, "grad_norm": 0.8405808806419373, "learning_rate": 0.00046898140993814147, "loss": 0.7633, "step": 14030 }, { "epoch": 1.6871018151220099, "grad_norm": 0.6292724609375, "learning_rate": 0.00046895840197825424, "loss": 0.9727, "step": 14035 }, { "epoch": 1.6877028489001082, "grad_norm": 0.5269624590873718, "learning_rate": 0.0004689353860532935, "loss": 1.5391, "step": 14040 }, { "epoch": 1.6883038826782064, "grad_norm": 0.6983106136322021, "learning_rate": 0.0004689123621640966, "loss": 0.9477, "step": 14045 }, { "epoch": 1.6889049164563048, "grad_norm": 0.8299253582954407, "learning_rate": 0.00046888933031150103, "loss": 0.9773, "step": 14050 }, { "epoch": 1.6895059502344032, "grad_norm": 0.6337663531303406, "learning_rate": 0.00046886629049634466, "loss": 0.7812, "step": 14055 }, { "epoch": 1.6901069840125014, "grad_norm": 0.4387208819389343, "learning_rate": 0.00046884324271946566, "loss": 0.8047, "step": 14060 }, { "epoch": 1.6907080177906, "grad_norm": 0.4457261562347412, "learning_rate": 0.00046882018698170237, "loss": 0.6129, "step": 14065 }, { "epoch": 1.6913090515686982, "grad_norm": 0.4971809983253479, "learning_rate": 0.0004687971232838935, "loss": 0.75, "step": 14070 }, { "epoch": 1.6919100853467965, "grad_norm": 6.326786518096924, "learning_rate": 0.000468774051626878, "loss": 0.8187, "step": 14075 }, { "epoch": 1.692511119124895, "grad_norm": 0.6099910736083984, "learning_rate": 0.0004687509720114952, "loss": 1.218, "step": 14080 }, { "epoch": 1.693112152902993, "grad_norm": 0.8206839561462402, "learning_rate": 0.0004687278844385846, "loss": 0.7766, "step": 14085 }, { "epoch": 1.6937131866810915, "grad_norm": 0.6266629695892334, "learning_rate": 0.0004687047889089863, "loss": 0.6633, "step": 14090 }, { "epoch": 1.69431422045919, "grad_norm": 0.6055701971054077, "learning_rate": 0.0004686816854235401, "loss": 1.2812, "step": 14095 }, { "epoch": 1.694915254237288, "grad_norm": 0.5827081799507141, "learning_rate": 0.0004686585739830867, "loss": 1.0781, "step": 14100 }, { "epoch": 1.6955162880153865, "grad_norm": 0.6133692860603333, "learning_rate": 0.00046863545458846666, "loss": 0.6965, "step": 14105 }, { "epoch": 1.6961173217934848, "grad_norm": 0.6290988326072693, "learning_rate": 0.00046861232724052117, "loss": 0.8176, "step": 14110 }, { "epoch": 1.696718355571583, "grad_norm": 0.5541148781776428, "learning_rate": 0.0004685891919400914, "loss": 0.8508, "step": 14115 }, { "epoch": 1.6973193893496814, "grad_norm": 0.6243453621864319, "learning_rate": 0.000468566048688019, "loss": 0.8648, "step": 14120 }, { "epoch": 1.6979204231277798, "grad_norm": 0.41570451855659485, "learning_rate": 0.0004685428974851458, "loss": 0.7383, "step": 14125 }, { "epoch": 1.698521456905878, "grad_norm": 0.5674639344215393, "learning_rate": 0.00046851973833231405, "loss": 0.9578, "step": 14130 }, { "epoch": 1.6991224906839766, "grad_norm": 0.7244648933410645, "learning_rate": 0.00046849657123036617, "loss": 0.925, "step": 14135 }, { "epoch": 1.6997235244620748, "grad_norm": 0.635011613368988, "learning_rate": 0.00046847339618014486, "loss": 0.7227, "step": 14140 }, { "epoch": 1.7003245582401731, "grad_norm": 0.5580353736877441, "learning_rate": 0.0004684502131824933, "loss": 0.8387, "step": 14145 }, { "epoch": 1.7009255920182715, "grad_norm": 0.5320613980293274, "learning_rate": 0.0004684270222382547, "loss": 0.8656, "step": 14150 }, { "epoch": 1.7015266257963697, "grad_norm": 0.45074930787086487, "learning_rate": 0.0004684038233482727, "loss": 0.6988, "step": 14155 }, { "epoch": 1.702127659574468, "grad_norm": 0.7159966230392456, "learning_rate": 0.00046838061651339124, "loss": 0.7195, "step": 14160 }, { "epoch": 1.7027286933525665, "grad_norm": 0.7549470067024231, "learning_rate": 0.0004683574017344545, "loss": 0.7258, "step": 14165 }, { "epoch": 1.7033297271306647, "grad_norm": 0.7577493190765381, "learning_rate": 0.00046833417901230703, "loss": 1.0453, "step": 14170 }, { "epoch": 1.703930760908763, "grad_norm": 0.9762184619903564, "learning_rate": 0.0004683109483477935, "loss": 0.8031, "step": 14175 }, { "epoch": 1.7045317946868614, "grad_norm": 0.7323477268218994, "learning_rate": 0.00046828770974175903, "loss": 1.4648, "step": 14180 }, { "epoch": 1.7051328284649596, "grad_norm": 0.47101300954818726, "learning_rate": 0.00046826446319504897, "loss": 1.0633, "step": 14185 }, { "epoch": 1.705733862243058, "grad_norm": 0.6455997824668884, "learning_rate": 0.00046824120870850896, "loss": 0.8336, "step": 14190 }, { "epoch": 1.7063348960211564, "grad_norm": 0.5867834091186523, "learning_rate": 0.00046821794628298485, "loss": 0.8914, "step": 14195 }, { "epoch": 1.7069359297992546, "grad_norm": 0.686712384223938, "learning_rate": 0.000468194675919323, "loss": 0.6324, "step": 14200 }, { "epoch": 1.7069359297992546, "eval_loss": 1.9455077648162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2074, "eval_samples_per_second": 4.544, "eval_steps_per_second": 1.136, "step": 14200 }, { "epoch": 1.7075369635773532, "grad_norm": 0.6903026700019836, "learning_rate": 0.0004681713976183699, "loss": 0.8844, "step": 14205 }, { "epoch": 1.7081379973554514, "grad_norm": 0.5660929083824158, "learning_rate": 0.00046814811138097225, "loss": 0.6027, "step": 14210 }, { "epoch": 1.7087390311335497, "grad_norm": 0.7346572279930115, "learning_rate": 0.0004681248172079772, "loss": 1.0898, "step": 14215 }, { "epoch": 1.7093400649116481, "grad_norm": 0.8147523999214172, "learning_rate": 0.0004681015151002321, "loss": 0.9172, "step": 14220 }, { "epoch": 1.7099410986897463, "grad_norm": 0.7744752764701843, "learning_rate": 0.00046807820505858467, "loss": 1.0477, "step": 14225 }, { "epoch": 1.7105421324678447, "grad_norm": 0.5552868843078613, "learning_rate": 0.0004680548870838828, "loss": 1.0758, "step": 14230 }, { "epoch": 1.711143166245943, "grad_norm": 0.5608605146408081, "learning_rate": 0.0004680315611769748, "loss": 0.9187, "step": 14235 }, { "epoch": 1.7117442000240413, "grad_norm": 2.3658225536346436, "learning_rate": 0.00046800822733870906, "loss": 1.0734, "step": 14240 }, { "epoch": 1.7123452338021397, "grad_norm": 0.7731983065605164, "learning_rate": 0.0004679848855699345, "loss": 0.768, "step": 14245 }, { "epoch": 1.712946267580238, "grad_norm": 0.5495609641075134, "learning_rate": 0.00046796153587150025, "loss": 0.9344, "step": 14250 }, { "epoch": 1.7135473013583362, "grad_norm": 0.9082116484642029, "learning_rate": 0.00046793817824425566, "loss": 1.3, "step": 14255 }, { "epoch": 1.7141483351364348, "grad_norm": 10.029204368591309, "learning_rate": 0.0004679148126890503, "loss": 0.723, "step": 14260 }, { "epoch": 1.714749368914533, "grad_norm": 0.5258381366729736, "learning_rate": 0.0004678914392067344, "loss": 1.1266, "step": 14265 }, { "epoch": 1.7153504026926312, "grad_norm": 0.6977257132530212, "learning_rate": 0.0004678680577981581, "loss": 1.0969, "step": 14270 }, { "epoch": 1.7159514364707298, "grad_norm": 0.6840816736221313, "learning_rate": 0.0004678446684641719, "loss": 0.9004, "step": 14275 }, { "epoch": 1.716552470248828, "grad_norm": 0.7627301812171936, "learning_rate": 0.00046782127120562663, "loss": 0.9383, "step": 14280 }, { "epoch": 1.7171535040269263, "grad_norm": 2.099540948867798, "learning_rate": 0.0004677978660233735, "loss": 0.6945, "step": 14285 }, { "epoch": 1.7177545378050247, "grad_norm": 0.8298578262329102, "learning_rate": 0.00046777445291826383, "loss": 0.7266, "step": 14290 }, { "epoch": 1.718355571583123, "grad_norm": 0.8102193474769592, "learning_rate": 0.0004677510318911493, "loss": 0.9484, "step": 14295 }, { "epoch": 1.7189566053612213, "grad_norm": 0.5687063336372375, "learning_rate": 0.00046772760294288207, "loss": 1.257, "step": 14300 }, { "epoch": 1.7195576391393197, "grad_norm": 0.6808454394340515, "learning_rate": 0.00046770416607431434, "loss": 1.0383, "step": 14305 }, { "epoch": 1.7201586729174179, "grad_norm": 0.5212880969047546, "learning_rate": 0.00046768072128629853, "loss": 0.9945, "step": 14310 }, { "epoch": 1.7207597066955163, "grad_norm": 0.4844411313533783, "learning_rate": 0.00046765726857968764, "loss": 0.8609, "step": 14315 }, { "epoch": 1.7213607404736146, "grad_norm": 0.7734144330024719, "learning_rate": 0.0004676338079553348, "loss": 0.802, "step": 14320 }, { "epoch": 1.7219617742517128, "grad_norm": 0.5943062901496887, "learning_rate": 0.0004676103394140935, "loss": 0.9465, "step": 14325 }, { "epoch": 1.7225628080298114, "grad_norm": 0.6196196675300598, "learning_rate": 0.00046758686295681726, "loss": 0.643, "step": 14330 }, { "epoch": 1.7231638418079096, "grad_norm": 0.8153932690620422, "learning_rate": 0.0004675633785843602, "loss": 0.7316, "step": 14335 }, { "epoch": 1.7237648755860078, "grad_norm": 0.5955705046653748, "learning_rate": 0.0004675398862975767, "loss": 0.5957, "step": 14340 }, { "epoch": 1.7243659093641064, "grad_norm": 0.8560868501663208, "learning_rate": 0.0004675163860973212, "loss": 1.3484, "step": 14345 }, { "epoch": 1.7249669431422046, "grad_norm": 0.9683178663253784, "learning_rate": 0.0004674928779844486, "loss": 1.0398, "step": 14350 }, { "epoch": 1.725567976920303, "grad_norm": 0.3556564152240753, "learning_rate": 0.00046746936195981415, "loss": 0.9938, "step": 14355 }, { "epoch": 1.7261690106984013, "grad_norm": 0.6307397484779358, "learning_rate": 0.0004674458380242733, "loss": 0.7758, "step": 14360 }, { "epoch": 1.7267700444764995, "grad_norm": 0.6326887011528015, "learning_rate": 0.0004674223061786816, "loss": 0.6723, "step": 14365 }, { "epoch": 1.727371078254598, "grad_norm": 0.7777372598648071, "learning_rate": 0.0004673987664238952, "loss": 0.7316, "step": 14370 }, { "epoch": 1.7279721120326963, "grad_norm": 0.9758979082107544, "learning_rate": 0.00046737521876077037, "loss": 1.0273, "step": 14375 }, { "epoch": 1.7285731458107945, "grad_norm": 0.556367039680481, "learning_rate": 0.0004673516631901638, "loss": 0.9609, "step": 14380 }, { "epoch": 1.7291741795888929, "grad_norm": 0.706124484539032, "learning_rate": 0.00046732809971293223, "loss": 0.6875, "step": 14385 }, { "epoch": 1.7297752133669912, "grad_norm": 0.4858115613460541, "learning_rate": 0.00046730452832993295, "loss": 1.0453, "step": 14390 }, { "epoch": 1.7303762471450894, "grad_norm": 0.6397810578346252, "learning_rate": 0.00046728094904202334, "loss": 0.8773, "step": 14395 }, { "epoch": 1.730977280923188, "grad_norm": 0.594719409942627, "learning_rate": 0.0004672573618500612, "loss": 1.2039, "step": 14400 }, { "epoch": 1.730977280923188, "eval_loss": 1.8694336414337158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1913, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 14400 }, { "epoch": 1.7315783147012862, "grad_norm": 0.7139551639556885, "learning_rate": 0.0004672337667549046, "loss": 1.1117, "step": 14405 }, { "epoch": 1.7321793484793846, "grad_norm": 0.8633884191513062, "learning_rate": 0.00046721016375741173, "loss": 0.7621, "step": 14410 }, { "epoch": 1.732780382257483, "grad_norm": 0.6140379309654236, "learning_rate": 0.00046718655285844123, "loss": 1.1867, "step": 14415 }, { "epoch": 1.7333814160355812, "grad_norm": 0.5713362693786621, "learning_rate": 0.00046716293405885215, "loss": 1.2, "step": 14420 }, { "epoch": 1.7339824498136795, "grad_norm": 0.8366125822067261, "learning_rate": 0.0004671393073595035, "loss": 1.132, "step": 14425 }, { "epoch": 1.734583483591778, "grad_norm": 0.5650630593299866, "learning_rate": 0.0004671156727612548, "loss": 0.7625, "step": 14430 }, { "epoch": 1.7351845173698761, "grad_norm": 0.43902072310447693, "learning_rate": 0.00046709203026496583, "loss": 0.8293, "step": 14435 }, { "epoch": 1.7357855511479745, "grad_norm": 0.7776477336883545, "learning_rate": 0.0004670683798714966, "loss": 1.175, "step": 14440 }, { "epoch": 1.736386584926073, "grad_norm": 0.5803810954093933, "learning_rate": 0.00046704472158170754, "loss": 0.907, "step": 14445 }, { "epoch": 1.736987618704171, "grad_norm": 0.6228445768356323, "learning_rate": 0.0004670210553964591, "loss": 1.1492, "step": 14450 }, { "epoch": 1.7375886524822695, "grad_norm": 0.5363442897796631, "learning_rate": 0.0004669973813166123, "loss": 1.0336, "step": 14455 }, { "epoch": 1.7381896862603678, "grad_norm": 0.8779881000518799, "learning_rate": 0.00046697369934302845, "loss": 1.0477, "step": 14460 }, { "epoch": 1.738790720038466, "grad_norm": 0.9747870564460754, "learning_rate": 0.0004669500094765688, "loss": 0.6488, "step": 14465 }, { "epoch": 1.7393917538165646, "grad_norm": 0.7010635733604431, "learning_rate": 0.00046692631171809523, "loss": 1.1141, "step": 14470 }, { "epoch": 1.7399927875946628, "grad_norm": 0.5419831871986389, "learning_rate": 0.00046690260606846977, "loss": 1.0398, "step": 14475 }, { "epoch": 1.7405938213727612, "grad_norm": 0.5878553986549377, "learning_rate": 0.00046687889252855486, "loss": 0.7367, "step": 14480 }, { "epoch": 1.7411948551508596, "grad_norm": 0.6187124252319336, "learning_rate": 0.000466855171099213, "loss": 0.8836, "step": 14485 }, { "epoch": 1.7417958889289578, "grad_norm": 0.8833189606666565, "learning_rate": 0.00046683144178130714, "loss": 0.9539, "step": 14490 }, { "epoch": 1.7423969227070562, "grad_norm": 0.5022724270820618, "learning_rate": 0.00046680770457570054, "loss": 0.627, "step": 14495 }, { "epoch": 1.7429979564851545, "grad_norm": 0.7679694294929504, "learning_rate": 0.0004667839594832566, "loss": 1.1199, "step": 14500 }, { "epoch": 1.7435989902632527, "grad_norm": 0.7132576704025269, "learning_rate": 0.0004667602065048392, "loss": 1.143, "step": 14505 }, { "epoch": 1.744200024041351, "grad_norm": 0.6355376243591309, "learning_rate": 0.00046673644564131236, "loss": 0.8969, "step": 14510 }, { "epoch": 1.7448010578194495, "grad_norm": 0.7761083245277405, "learning_rate": 0.00046671267689354034, "loss": 0.9945, "step": 14515 }, { "epoch": 1.7454020915975477, "grad_norm": 0.8872345685958862, "learning_rate": 0.00046668890026238795, "loss": 1.1633, "step": 14520 }, { "epoch": 1.746003125375646, "grad_norm": 0.5762779116630554, "learning_rate": 0.0004666651157487201, "loss": 0.7742, "step": 14525 }, { "epoch": 1.7466041591537445, "grad_norm": 0.8239423632621765, "learning_rate": 0.00046664132335340184, "loss": 0.6023, "step": 14530 }, { "epoch": 1.7472051929318426, "grad_norm": 0.6539376974105835, "learning_rate": 0.0004666175230772987, "loss": 0.7094, "step": 14535 }, { "epoch": 1.7478062267099412, "grad_norm": 0.6567628979682922, "learning_rate": 0.00046659371492127654, "loss": 0.9242, "step": 14540 }, { "epoch": 1.7484072604880394, "grad_norm": 0.80846107006073, "learning_rate": 0.0004665698988862015, "loss": 1.0164, "step": 14545 }, { "epoch": 1.7490082942661378, "grad_norm": 0.5849335193634033, "learning_rate": 0.00046654607497293983, "loss": 0.6699, "step": 14550 }, { "epoch": 1.7496093280442362, "grad_norm": 0.6208394169807434, "learning_rate": 0.0004665222431823582, "loss": 0.9234, "step": 14555 }, { "epoch": 1.7502103618223344, "grad_norm": 0.8051161766052246, "learning_rate": 0.0004664984035153235, "loss": 0.7693, "step": 14560 }, { "epoch": 1.7508113956004328, "grad_norm": 0.6577956676483154, "learning_rate": 0.0004664745559727031, "loss": 0.9578, "step": 14565 }, { "epoch": 1.7514124293785311, "grad_norm": 0.4393552839756012, "learning_rate": 0.0004664507005553643, "loss": 0.7617, "step": 14570 }, { "epoch": 1.7520134631566293, "grad_norm": 0.4610913395881653, "learning_rate": 0.000466426837264175, "loss": 1.0383, "step": 14575 }, { "epoch": 1.7526144969347277, "grad_norm": 0.5156323313713074, "learning_rate": 0.0004664029661000032, "loss": 0.7055, "step": 14580 }, { "epoch": 1.753215530712826, "grad_norm": 0.524409830570221, "learning_rate": 0.00046637908706371746, "loss": 0.9246, "step": 14585 }, { "epoch": 1.7538165644909243, "grad_norm": 0.6617944836616516, "learning_rate": 0.0004663552001561862, "loss": 0.8344, "step": 14590 }, { "epoch": 1.7544175982690229, "grad_norm": 0.855524480342865, "learning_rate": 0.0004663313053782785, "loss": 1.0516, "step": 14595 }, { "epoch": 1.755018632047121, "grad_norm": 0.751544713973999, "learning_rate": 0.0004663074027308636, "loss": 1.0258, "step": 14600 }, { "epoch": 1.755018632047121, "eval_loss": 1.901757836341858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.22, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 14600 }, { "epoch": 1.7556196658252192, "grad_norm": 0.5535914897918701, "learning_rate": 0.0004662834922148108, "loss": 0.9187, "step": 14605 }, { "epoch": 1.7562206996033178, "grad_norm": 0.4686012268066406, "learning_rate": 0.00046625957383099017, "loss": 1.0359, "step": 14610 }, { "epoch": 1.756821733381416, "grad_norm": 0.6937959790229797, "learning_rate": 0.0004662356475802716, "loss": 1.1, "step": 14615 }, { "epoch": 1.7574227671595144, "grad_norm": 0.6707364916801453, "learning_rate": 0.0004662117134635255, "loss": 0.8777, "step": 14620 }, { "epoch": 1.7580238009376128, "grad_norm": 0.5960907936096191, "learning_rate": 0.00046618777148162263, "loss": 0.5996, "step": 14625 }, { "epoch": 1.758624834715711, "grad_norm": 1.0070306062698364, "learning_rate": 0.0004661638216354338, "loss": 0.8371, "step": 14630 }, { "epoch": 1.7592258684938094, "grad_norm": 0.9170734286308289, "learning_rate": 0.00046613986392583025, "loss": 0.7422, "step": 14635 }, { "epoch": 1.7598269022719077, "grad_norm": 0.7560063004493713, "learning_rate": 0.00046611589835368356, "loss": 0.7414, "step": 14640 }, { "epoch": 1.760427936050006, "grad_norm": 0.5892887711524963, "learning_rate": 0.00046609192491986554, "loss": 0.8523, "step": 14645 }, { "epoch": 1.7610289698281043, "grad_norm": 0.8650941252708435, "learning_rate": 0.00046606794362524806, "loss": 1.2141, "step": 14650 }, { "epoch": 1.7616300036062027, "grad_norm": 0.6460860967636108, "learning_rate": 0.0004660439544707038, "loss": 0.7453, "step": 14655 }, { "epoch": 1.7622310373843009, "grad_norm": 0.5424789786338806, "learning_rate": 0.0004660199574571052, "loss": 0.9875, "step": 14660 }, { "epoch": 1.7628320711623995, "grad_norm": 0.5063350796699524, "learning_rate": 0.00046599595258532537, "loss": 1.1051, "step": 14665 }, { "epoch": 1.7634331049404977, "grad_norm": 0.6863090395927429, "learning_rate": 0.00046597193985623735, "loss": 0.7371, "step": 14670 }, { "epoch": 1.7640341387185958, "grad_norm": 0.799013078212738, "learning_rate": 0.00046594791927071476, "loss": 0.9055, "step": 14675 }, { "epoch": 1.7646351724966944, "grad_norm": 1.3016997575759888, "learning_rate": 0.0004659238908296315, "loss": 0.8016, "step": 14680 }, { "epoch": 1.7652362062747926, "grad_norm": 0.6678380966186523, "learning_rate": 0.0004658998545338614, "loss": 0.8445, "step": 14685 }, { "epoch": 1.765837240052891, "grad_norm": 0.6553308963775635, "learning_rate": 0.0004658758103842791, "loss": 1.0453, "step": 14690 }, { "epoch": 1.7664382738309894, "grad_norm": 0.6554756164550781, "learning_rate": 0.00046585175838175896, "loss": 0.8805, "step": 14695 }, { "epoch": 1.7670393076090876, "grad_norm": 0.5864089131355286, "learning_rate": 0.0004658276985271762, "loss": 0.6094, "step": 14700 }, { "epoch": 1.767640341387186, "grad_norm": 0.5038054585456848, "learning_rate": 0.00046580363082140595, "loss": 0.9602, "step": 14705 }, { "epoch": 1.7682413751652843, "grad_norm": 0.6401656270027161, "learning_rate": 0.00046577955526532367, "loss": 0.9539, "step": 14710 }, { "epoch": 1.7688424089433825, "grad_norm": 0.7997369766235352, "learning_rate": 0.00046575547185980523, "loss": 0.7359, "step": 14715 }, { "epoch": 1.769443442721481, "grad_norm": 0.7938977479934692, "learning_rate": 0.0004657313806057266, "loss": 0.8242, "step": 14720 }, { "epoch": 1.7700444764995793, "grad_norm": 0.7641409635543823, "learning_rate": 0.00046570728150396434, "loss": 0.7977, "step": 14725 }, { "epoch": 1.7706455102776775, "grad_norm": 0.48218366503715515, "learning_rate": 0.000465683174555395, "loss": 0.6922, "step": 14730 }, { "epoch": 1.771246544055776, "grad_norm": 0.6959134340286255, "learning_rate": 0.00046565905976089545, "loss": 0.652, "step": 14735 }, { "epoch": 1.7718475778338743, "grad_norm": 0.6846743226051331, "learning_rate": 0.000465634937121343, "loss": 0.7445, "step": 14740 }, { "epoch": 1.7724486116119726, "grad_norm": 0.6193838119506836, "learning_rate": 0.00046561080663761525, "loss": 0.9313, "step": 14745 }, { "epoch": 1.773049645390071, "grad_norm": 0.6893641352653503, "learning_rate": 0.00046558666831058985, "loss": 0.8648, "step": 14750 }, { "epoch": 1.7736506791681692, "grad_norm": 0.4324619770050049, "learning_rate": 0.0004655625221411448, "loss": 0.7871, "step": 14755 }, { "epoch": 1.7742517129462676, "grad_norm": 0.6585713624954224, "learning_rate": 0.0004655383681301587, "loss": 0.8477, "step": 14760 }, { "epoch": 1.774852746724366, "grad_norm": 0.6840612292289734, "learning_rate": 0.00046551420627851016, "loss": 0.8469, "step": 14765 }, { "epoch": 1.7754537805024642, "grad_norm": 0.7830227613449097, "learning_rate": 0.00046549003658707803, "loss": 1.1734, "step": 14770 }, { "epoch": 1.7760548142805626, "grad_norm": 0.6449682712554932, "learning_rate": 0.00046546585905674157, "loss": 0.8211, "step": 14775 }, { "epoch": 1.776655848058661, "grad_norm": 0.5970798134803772, "learning_rate": 0.0004654416736883802, "loss": 0.8352, "step": 14780 }, { "epoch": 1.7772568818367591, "grad_norm": 0.5850330591201782, "learning_rate": 0.00046541748048287384, "loss": 0.8941, "step": 14785 }, { "epoch": 1.7778579156148575, "grad_norm": 0.9840527772903442, "learning_rate": 0.00046539327944110256, "loss": 1.1844, "step": 14790 }, { "epoch": 1.778458949392956, "grad_norm": 0.6113297343254089, "learning_rate": 0.0004653690705639466, "loss": 0.6355, "step": 14795 }, { "epoch": 1.779059983171054, "grad_norm": 0.5765538811683655, "learning_rate": 0.00046534485385228675, "loss": 0.6301, "step": 14800 }, { "epoch": 1.779059983171054, "eval_loss": 1.8923828601837158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.204, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 14800 }, { "epoch": 1.7796610169491527, "grad_norm": 0.32295092940330505, "learning_rate": 0.00046532062930700394, "loss": 0.6115, "step": 14805 }, { "epoch": 1.7802620507272509, "grad_norm": 0.5577372908592224, "learning_rate": 0.0004652963969289793, "loss": 0.7641, "step": 14810 }, { "epoch": 1.7808630845053492, "grad_norm": 0.682712197303772, "learning_rate": 0.0004652721567190944, "loss": 1.0406, "step": 14815 }, { "epoch": 1.7814641182834476, "grad_norm": 0.7370647192001343, "learning_rate": 0.000465247908678231, "loss": 0.7297, "step": 14820 }, { "epoch": 1.7820651520615458, "grad_norm": 0.7873982191085815, "learning_rate": 0.00046522365280727115, "loss": 0.8984, "step": 14825 }, { "epoch": 1.7826661858396442, "grad_norm": 0.533419668674469, "learning_rate": 0.00046519938910709725, "loss": 0.6441, "step": 14830 }, { "epoch": 1.7832672196177426, "grad_norm": 0.6736013293266296, "learning_rate": 0.0004651751175785919, "loss": 0.9078, "step": 14835 }, { "epoch": 1.7838682533958408, "grad_norm": 0.7657696604728699, "learning_rate": 0.00046515083822263806, "loss": 1.0555, "step": 14840 }, { "epoch": 1.7844692871739392, "grad_norm": 0.6417455673217773, "learning_rate": 0.0004651265510401189, "loss": 0.7266, "step": 14845 }, { "epoch": 1.7850703209520375, "grad_norm": 0.5859986543655396, "learning_rate": 0.000465102256031918, "loss": 0.5254, "step": 14850 }, { "epoch": 1.7856713547301357, "grad_norm": 0.5953994989395142, "learning_rate": 0.00046507795319891915, "loss": 1.0316, "step": 14855 }, { "epoch": 1.786272388508234, "grad_norm": 0.542062520980835, "learning_rate": 0.0004650536425420063, "loss": 0.657, "step": 14860 }, { "epoch": 1.7868734222863325, "grad_norm": 0.5656332969665527, "learning_rate": 0.0004650293240620639, "loss": 0.8547, "step": 14865 }, { "epoch": 1.7874744560644307, "grad_norm": 0.6022223830223083, "learning_rate": 0.00046500499775997656, "loss": 0.9875, "step": 14870 }, { "epoch": 1.7880754898425293, "grad_norm": 0.6173596382141113, "learning_rate": 0.00046498066363662914, "loss": 1.3766, "step": 14875 }, { "epoch": 1.7886765236206275, "grad_norm": 0.6819121837615967, "learning_rate": 0.00046495632169290694, "loss": 0.8633, "step": 14880 }, { "epoch": 1.7892775573987258, "grad_norm": 0.7490895986557007, "learning_rate": 0.0004649319719296954, "loss": 0.6531, "step": 14885 }, { "epoch": 1.7898785911768242, "grad_norm": 0.6200850009918213, "learning_rate": 0.0004649076143478803, "loss": 0.8449, "step": 14890 }, { "epoch": 1.7904796249549224, "grad_norm": 0.7366893291473389, "learning_rate": 0.00046488324894834775, "loss": 0.8906, "step": 14895 }, { "epoch": 1.7910806587330208, "grad_norm": 0.6687186360359192, "learning_rate": 0.000464858875731984, "loss": 0.9031, "step": 14900 }, { "epoch": 1.7916816925111192, "grad_norm": 0.562707245349884, "learning_rate": 0.0004648344946996757, "loss": 0.8344, "step": 14905 }, { "epoch": 1.7922827262892174, "grad_norm": 0.7979212403297424, "learning_rate": 0.00046481010585230974, "loss": 0.8148, "step": 14910 }, { "epoch": 1.7928837600673158, "grad_norm": 0.4303218722343445, "learning_rate": 0.00046478570919077344, "loss": 0.9242, "step": 14915 }, { "epoch": 1.7934847938454141, "grad_norm": 0.5044393539428711, "learning_rate": 0.00046476130471595424, "loss": 1.0, "step": 14920 }, { "epoch": 1.7940858276235123, "grad_norm": 0.8743340373039246, "learning_rate": 0.00046473689242873973, "loss": 0.7199, "step": 14925 }, { "epoch": 1.794686861401611, "grad_norm": 0.6428083777427673, "learning_rate": 0.0004647124723300182, "loss": 0.6094, "step": 14930 }, { "epoch": 1.795287895179709, "grad_norm": 0.514148473739624, "learning_rate": 0.0004646880444206778, "loss": 1.0352, "step": 14935 }, { "epoch": 1.7958889289578073, "grad_norm": 0.8994453549385071, "learning_rate": 0.00046466360870160727, "loss": 0.723, "step": 14940 }, { "epoch": 1.7964899627359059, "grad_norm": 0.4371896982192993, "learning_rate": 0.00046463916517369545, "loss": 0.9723, "step": 14945 }, { "epoch": 1.797090996514004, "grad_norm": 0.5805209875106812, "learning_rate": 0.0004646147138378315, "loss": 1.3289, "step": 14950 }, { "epoch": 1.7976920302921024, "grad_norm": 0.5409806966781616, "learning_rate": 0.0004645902546949049, "loss": 0.7914, "step": 14955 }, { "epoch": 1.7982930640702008, "grad_norm": 0.6429707407951355, "learning_rate": 0.0004645657877458055, "loss": 0.6945, "step": 14960 }, { "epoch": 1.798894097848299, "grad_norm": 0.49241867661476135, "learning_rate": 0.0004645413129914232, "loss": 0.7336, "step": 14965 }, { "epoch": 1.7994951316263974, "grad_norm": 0.7267056107521057, "learning_rate": 0.0004645168304326485, "loss": 1.1109, "step": 14970 }, { "epoch": 1.8000961654044958, "grad_norm": 0.7958641648292542, "learning_rate": 0.00046449234007037176, "loss": 0.7434, "step": 14975 }, { "epoch": 1.800697199182594, "grad_norm": 0.6455166935920715, "learning_rate": 0.000464467841905484, "loss": 1.0578, "step": 14980 }, { "epoch": 1.8012982329606924, "grad_norm": 0.5513315200805664, "learning_rate": 0.0004644433359388764, "loss": 0.8227, "step": 14985 }, { "epoch": 1.8018992667387908, "grad_norm": 0.7794502377510071, "learning_rate": 0.0004644188221714405, "loss": 0.6512, "step": 14990 }, { "epoch": 1.802500300516889, "grad_norm": 0.5551174283027649, "learning_rate": 0.0004643943006040679, "loss": 0.8187, "step": 14995 }, { "epoch": 1.8031013342949875, "grad_norm": 0.7093446254730225, "learning_rate": 0.0004643697712376507, "loss": 0.7445, "step": 15000 }, { "epoch": 1.8031013342949875, "eval_loss": 1.8564453125, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2193, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 15000 }, { "epoch": 1.8037023680730857, "grad_norm": 0.44965025782585144, "learning_rate": 0.00046434523407308103, "loss": 1.175, "step": 15005 }, { "epoch": 1.8043034018511839, "grad_norm": 0.448920339345932, "learning_rate": 0.0004643206891112518, "loss": 0.6039, "step": 15010 }, { "epoch": 1.8049044356292825, "grad_norm": 0.651673436164856, "learning_rate": 0.0004642961363530556, "loss": 1.0094, "step": 15015 }, { "epoch": 1.8055054694073807, "grad_norm": 0.7590422630310059, "learning_rate": 0.00046427157579938575, "loss": 0.7008, "step": 15020 }, { "epoch": 1.806106503185479, "grad_norm": 0.7723820805549622, "learning_rate": 0.0004642470074511356, "loss": 0.9641, "step": 15025 }, { "epoch": 1.8067075369635774, "grad_norm": 0.608374834060669, "learning_rate": 0.000464222431309199, "loss": 0.9832, "step": 15030 }, { "epoch": 1.8073085707416756, "grad_norm": 0.6677886843681335, "learning_rate": 0.0004641978473744698, "loss": 0.9055, "step": 15035 }, { "epoch": 1.807909604519774, "grad_norm": 0.5482303500175476, "learning_rate": 0.00046417325564784243, "loss": 0.5465, "step": 15040 }, { "epoch": 1.8085106382978724, "grad_norm": 0.823980450630188, "learning_rate": 0.0004641486561302114, "loss": 0.975, "step": 15045 }, { "epoch": 1.8091116720759706, "grad_norm": 0.7872509956359863, "learning_rate": 0.0004641240488224715, "loss": 0.875, "step": 15050 }, { "epoch": 1.809712705854069, "grad_norm": 0.5708063840866089, "learning_rate": 0.000464099433725518, "loss": 1.0328, "step": 15055 }, { "epoch": 1.8103137396321674, "grad_norm": 0.6019933819770813, "learning_rate": 0.00046407481084024627, "loss": 0.8781, "step": 15060 }, { "epoch": 1.8109147734102655, "grad_norm": 0.6818620562553406, "learning_rate": 0.0004640501801675521, "loss": 1.0953, "step": 15065 }, { "epoch": 1.8115158071883641, "grad_norm": 0.5760321617126465, "learning_rate": 0.0004640255417083313, "loss": 0.9102, "step": 15070 }, { "epoch": 1.8121168409664623, "grad_norm": 0.8366863131523132, "learning_rate": 0.0004640008954634803, "loss": 1.0813, "step": 15075 }, { "epoch": 1.8127178747445607, "grad_norm": 0.6316083073616028, "learning_rate": 0.0004639762414338956, "loss": 0.7621, "step": 15080 }, { "epoch": 1.813318908522659, "grad_norm": 0.608749508857727, "learning_rate": 0.000463951579620474, "loss": 1.1727, "step": 15085 }, { "epoch": 1.8139199423007573, "grad_norm": 0.40747588872909546, "learning_rate": 0.00046392691002411274, "loss": 0.918, "step": 15090 }, { "epoch": 1.8145209760788557, "grad_norm": 0.6815487146377563, "learning_rate": 0.0004639022326457092, "loss": 0.8031, "step": 15095 }, { "epoch": 1.815122009856954, "grad_norm": 0.6751261949539185, "learning_rate": 0.00046387754748616097, "loss": 0.982, "step": 15100 }, { "epoch": 1.8157230436350522, "grad_norm": 0.6685701608657837, "learning_rate": 0.0004638528545463661, "loss": 0.8207, "step": 15105 }, { "epoch": 1.8163240774131506, "grad_norm": 0.7334131598472595, "learning_rate": 0.0004638281538272229, "loss": 0.9441, "step": 15110 }, { "epoch": 1.816925111191249, "grad_norm": 0.8678345680236816, "learning_rate": 0.0004638034453296298, "loss": 1.0312, "step": 15115 }, { "epoch": 1.8175261449693472, "grad_norm": 0.5021523833274841, "learning_rate": 0.0004637787290544857, "loss": 0.9926, "step": 15120 }, { "epoch": 1.8181271787474456, "grad_norm": 0.6790867447853088, "learning_rate": 0.0004637540050026896, "loss": 0.7688, "step": 15125 }, { "epoch": 1.818728212525544, "grad_norm": 0.530465304851532, "learning_rate": 0.00046372927317514105, "loss": 0.8477, "step": 15130 }, { "epoch": 1.8193292463036421, "grad_norm": 0.8125158548355103, "learning_rate": 0.00046370453357273965, "loss": 0.9418, "step": 15135 }, { "epoch": 1.8199302800817407, "grad_norm": 0.5161015391349792, "learning_rate": 0.00046367978619638533, "loss": 0.6984, "step": 15140 }, { "epoch": 1.820531313859839, "grad_norm": 0.6889507174491882, "learning_rate": 0.00046365503104697835, "loss": 1.2734, "step": 15145 }, { "epoch": 1.8211323476379373, "grad_norm": 0.6118544936180115, "learning_rate": 0.00046363026812541925, "loss": 0.7605, "step": 15150 }, { "epoch": 1.8217333814160357, "grad_norm": 0.5412024259567261, "learning_rate": 0.00046360549743260885, "loss": 0.5813, "step": 15155 }, { "epoch": 1.8223344151941339, "grad_norm": 0.6936192512512207, "learning_rate": 0.0004635807189694481, "loss": 1.1195, "step": 15160 }, { "epoch": 1.8229354489722323, "grad_norm": 0.7266600131988525, "learning_rate": 0.0004635559327368385, "loss": 0.707, "step": 15165 }, { "epoch": 1.8235364827503306, "grad_norm": 0.5251690149307251, "learning_rate": 0.0004635311387356816, "loss": 0.7961, "step": 15170 }, { "epoch": 1.8241375165284288, "grad_norm": 0.7028011083602905, "learning_rate": 0.00046350633696687956, "loss": 0.7148, "step": 15175 }, { "epoch": 1.8247385503065272, "grad_norm": 0.874918520450592, "learning_rate": 0.0004634815274313343, "loss": 0.8164, "step": 15180 }, { "epoch": 1.8253395840846256, "grad_norm": 0.7211055159568787, "learning_rate": 0.00046345671012994855, "loss": 0.8629, "step": 15185 }, { "epoch": 1.8259406178627238, "grad_norm": 0.4633733034133911, "learning_rate": 0.00046343188506362497, "loss": 1.3609, "step": 15190 }, { "epoch": 1.8265416516408222, "grad_norm": 1.096551775932312, "learning_rate": 0.00046340705223326664, "loss": 0.882, "step": 15195 }, { "epoch": 1.8271426854189206, "grad_norm": 0.49717584252357483, "learning_rate": 0.0004633822116397769, "loss": 0.9109, "step": 15200 }, { "epoch": 1.8271426854189206, "eval_loss": 1.839453101158142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1957, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 15200 }, { "epoch": 1.8277437191970187, "grad_norm": 0.40456676483154297, "learning_rate": 0.0004633573632840594, "loss": 0.8133, "step": 15205 }, { "epoch": 1.8283447529751173, "grad_norm": 0.4645957946777344, "learning_rate": 0.00046333250716701813, "loss": 1.0016, "step": 15210 }, { "epoch": 1.8289457867532155, "grad_norm": 0.789980411529541, "learning_rate": 0.00046330764328955714, "loss": 0.6188, "step": 15215 }, { "epoch": 1.829546820531314, "grad_norm": 0.6263753175735474, "learning_rate": 0.000463282771652581, "loss": 1.1344, "step": 15220 }, { "epoch": 1.8301478543094123, "grad_norm": 0.48008212447166443, "learning_rate": 0.00046325789225699443, "loss": 1.2578, "step": 15225 }, { "epoch": 1.8307488880875105, "grad_norm": 0.8974840044975281, "learning_rate": 0.00046323300510370247, "loss": 0.7625, "step": 15230 }, { "epoch": 1.8313499218656089, "grad_norm": 0.5595062375068665, "learning_rate": 0.0004632081101936104, "loss": 0.823, "step": 15235 }, { "epoch": 1.8319509556437072, "grad_norm": 0.8355674147605896, "learning_rate": 0.0004631832075276239, "loss": 1.1266, "step": 15240 }, { "epoch": 1.8325519894218054, "grad_norm": 0.5733554363250732, "learning_rate": 0.00046315829710664893, "loss": 1.0969, "step": 15245 }, { "epoch": 1.8331530231999038, "grad_norm": 0.773493766784668, "learning_rate": 0.00046313337893159147, "loss": 1.0891, "step": 15250 }, { "epoch": 1.8337540569780022, "grad_norm": 0.424111008644104, "learning_rate": 0.00046310845300335813, "loss": 0.4793, "step": 15255 }, { "epoch": 1.8343550907561004, "grad_norm": 0.6091646552085876, "learning_rate": 0.00046308351932285554, "loss": 1.1414, "step": 15260 }, { "epoch": 1.834956124534199, "grad_norm": 0.6316931247711182, "learning_rate": 0.00046305857789099073, "loss": 1.0602, "step": 15265 }, { "epoch": 1.8355571583122972, "grad_norm": 0.6684390902519226, "learning_rate": 0.000463033628708671, "loss": 0.6312, "step": 15270 }, { "epoch": 1.8361581920903953, "grad_norm": 0.5945658087730408, "learning_rate": 0.00046300867177680394, "loss": 0.7152, "step": 15275 }, { "epoch": 1.836759225868494, "grad_norm": 0.6461968421936035, "learning_rate": 0.00046298370709629746, "loss": 1.0531, "step": 15280 }, { "epoch": 1.837360259646592, "grad_norm": 0.6458706855773926, "learning_rate": 0.0004629587346680597, "loss": 1.1484, "step": 15285 }, { "epoch": 1.8379612934246905, "grad_norm": 0.5554401874542236, "learning_rate": 0.000462933754492999, "loss": 0.8863, "step": 15290 }, { "epoch": 1.838562327202789, "grad_norm": 0.7965975403785706, "learning_rate": 0.0004629087665720241, "loss": 1.0609, "step": 15295 }, { "epoch": 1.839163360980887, "grad_norm": 0.6352299451828003, "learning_rate": 0.00046288377090604406, "loss": 0.7703, "step": 15300 }, { "epoch": 1.8397643947589855, "grad_norm": 0.6565055251121521, "learning_rate": 0.000462858767495968, "loss": 0.8191, "step": 15305 }, { "epoch": 1.8403654285370838, "grad_norm": 0.5926379561424255, "learning_rate": 0.00046283375634270565, "loss": 0.8453, "step": 15310 }, { "epoch": 1.840966462315182, "grad_norm": 0.44651028513908386, "learning_rate": 0.00046280873744716675, "loss": 0.7406, "step": 15315 }, { "epoch": 1.8415674960932804, "grad_norm": 0.7312407493591309, "learning_rate": 0.0004627837108102614, "loss": 0.7689, "step": 15320 }, { "epoch": 1.8421685298713788, "grad_norm": 0.7699540257453918, "learning_rate": 0.00046275867643290005, "loss": 1.1734, "step": 15325 }, { "epoch": 1.842769563649477, "grad_norm": 0.6732514500617981, "learning_rate": 0.00046273363431599323, "loss": 0.9617, "step": 15330 }, { "epoch": 1.8433705974275756, "grad_norm": 0.612540066242218, "learning_rate": 0.00046270858446045214, "loss": 0.7217, "step": 15335 }, { "epoch": 1.8439716312056738, "grad_norm": 0.564551591873169, "learning_rate": 0.0004626835268671878, "loss": 1.0188, "step": 15340 }, { "epoch": 1.844572664983772, "grad_norm": 0.4973868727684021, "learning_rate": 0.0004626584615371119, "loss": 0.9781, "step": 15345 }, { "epoch": 1.8451736987618705, "grad_norm": 0.6533011198043823, "learning_rate": 0.0004626333884711362, "loss": 0.9254, "step": 15350 }, { "epoch": 1.8457747325399687, "grad_norm": 0.5783156752586365, "learning_rate": 0.0004626083076701727, "loss": 1.2137, "step": 15355 }, { "epoch": 1.846375766318067, "grad_norm": 0.9182881116867065, "learning_rate": 0.0004625832191351339, "loss": 0.8559, "step": 15360 }, { "epoch": 1.8469768000961655, "grad_norm": 0.7529346942901611, "learning_rate": 0.0004625581228669323, "loss": 0.5879, "step": 15365 }, { "epoch": 1.8475778338742637, "grad_norm": 0.4365798234939575, "learning_rate": 0.0004625330188664809, "loss": 1.0352, "step": 15370 }, { "epoch": 1.848178867652362, "grad_norm": 0.6213639378547668, "learning_rate": 0.0004625079071346929, "loss": 0.9043, "step": 15375 }, { "epoch": 1.8487799014304604, "grad_norm": 0.7529160976409912, "learning_rate": 0.00046248278767248193, "loss": 0.7621, "step": 15380 }, { "epoch": 1.8493809352085586, "grad_norm": 0.5664300322532654, "learning_rate": 0.00046245766048076155, "loss": 0.959, "step": 15385 }, { "epoch": 1.849981968986657, "grad_norm": 0.8210234045982361, "learning_rate": 0.0004624325255604459, "loss": 0.8559, "step": 15390 }, { "epoch": 1.8505830027647554, "grad_norm": 0.6288379430770874, "learning_rate": 0.0004624073829124494, "loss": 0.7066, "step": 15395 }, { "epoch": 1.8511840365428536, "grad_norm": 0.742318868637085, "learning_rate": 0.00046238223253768654, "loss": 0.7141, "step": 15400 }, { "epoch": 1.8511840365428536, "eval_loss": 1.8668944835662842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1988, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 15400 }, { "epoch": 1.8517850703209522, "grad_norm": 0.8229348063468933, "learning_rate": 0.0004623570744370722, "loss": 1.1922, "step": 15405 }, { "epoch": 1.8523861040990504, "grad_norm": 0.8095618486404419, "learning_rate": 0.00046233190861152176, "loss": 0.9258, "step": 15410 }, { "epoch": 1.8529871378771487, "grad_norm": 0.7435784935951233, "learning_rate": 0.00046230673506195044, "loss": 1.0133, "step": 15415 }, { "epoch": 1.8535881716552471, "grad_norm": 0.6222808361053467, "learning_rate": 0.00046228155378927414, "loss": 0.8117, "step": 15420 }, { "epoch": 1.8541892054333453, "grad_norm": 0.7926135063171387, "learning_rate": 0.0004622563647944088, "loss": 1.1617, "step": 15425 }, { "epoch": 1.8547902392114437, "grad_norm": 0.5542756915092468, "learning_rate": 0.0004622311680782707, "loss": 0.8539, "step": 15430 }, { "epoch": 1.855391272989542, "grad_norm": 0.6877511143684387, "learning_rate": 0.0004622059636417766, "loss": 0.7574, "step": 15435 }, { "epoch": 1.8559923067676403, "grad_norm": 0.4971630275249481, "learning_rate": 0.00046218075148584314, "loss": 0.5406, "step": 15440 }, { "epoch": 1.8565933405457387, "grad_norm": 0.6721771955490112, "learning_rate": 0.0004621555316113876, "loss": 0.8328, "step": 15445 }, { "epoch": 1.857194374323837, "grad_norm": 0.6412745118141174, "learning_rate": 0.00046213030401932745, "loss": 0.7625, "step": 15450 }, { "epoch": 1.8577954081019352, "grad_norm": 0.5967793464660645, "learning_rate": 0.0004621050687105802, "loss": 0.8234, "step": 15455 }, { "epoch": 1.8583964418800336, "grad_norm": 0.6425501108169556, "learning_rate": 0.000462079825686064, "loss": 0.8844, "step": 15460 }, { "epoch": 1.858997475658132, "grad_norm": 0.6047863364219666, "learning_rate": 0.00046205457494669707, "loss": 0.5707, "step": 15465 }, { "epoch": 1.8595985094362302, "grad_norm": 0.8482145071029663, "learning_rate": 0.000462029316493398, "loss": 0.8898, "step": 15470 }, { "epoch": 1.8601995432143288, "grad_norm": 0.3807300627231598, "learning_rate": 0.00046200405032708545, "loss": 0.9367, "step": 15475 }, { "epoch": 1.860800576992427, "grad_norm": 0.364632248878479, "learning_rate": 0.00046197877644867883, "loss": 1.0391, "step": 15480 }, { "epoch": 1.8614016107705253, "grad_norm": 0.8206885457038879, "learning_rate": 0.0004619534948590972, "loss": 1.082, "step": 15485 }, { "epoch": 1.8620026445486237, "grad_norm": 0.6448713541030884, "learning_rate": 0.0004619282055592605, "loss": 0.7688, "step": 15490 }, { "epoch": 1.862603678326722, "grad_norm": 0.5562570691108704, "learning_rate": 0.00046190290855008855, "loss": 0.8344, "step": 15495 }, { "epoch": 1.8632047121048203, "grad_norm": 0.520274817943573, "learning_rate": 0.00046187760383250166, "loss": 1.1945, "step": 15500 }, { "epoch": 1.8638057458829187, "grad_norm": 0.7087087631225586, "learning_rate": 0.00046185229140742023, "loss": 0.8195, "step": 15505 }, { "epoch": 1.8644067796610169, "grad_norm": 0.7038390040397644, "learning_rate": 0.00046182697127576514, "loss": 0.9727, "step": 15510 }, { "epoch": 1.8650078134391153, "grad_norm": 0.6353370547294617, "learning_rate": 0.00046180164343845743, "loss": 1.0258, "step": 15515 }, { "epoch": 1.8656088472172137, "grad_norm": 0.8494519591331482, "learning_rate": 0.00046177630789641844, "loss": 0.8539, "step": 15520 }, { "epoch": 1.8662098809953118, "grad_norm": 0.7935754656791687, "learning_rate": 0.0004617509646505698, "loss": 0.6391, "step": 15525 }, { "epoch": 1.8668109147734102, "grad_norm": 0.8831447958946228, "learning_rate": 0.0004617256137018335, "loss": 0.732, "step": 15530 }, { "epoch": 1.8674119485515086, "grad_norm": 0.6574364900588989, "learning_rate": 0.0004617002550511317, "loss": 1.0406, "step": 15535 }, { "epoch": 1.8680129823296068, "grad_norm": 0.7329922318458557, "learning_rate": 0.0004616748886993868, "loss": 0.9664, "step": 15540 }, { "epoch": 1.8686140161077054, "grad_norm": 0.6389554142951965, "learning_rate": 0.00046164951464752166, "loss": 0.9352, "step": 15545 }, { "epoch": 1.8692150498858036, "grad_norm": 0.6787768602371216, "learning_rate": 0.0004616241328964592, "loss": 0.9648, "step": 15550 }, { "epoch": 1.869816083663902, "grad_norm": 0.47150591015815735, "learning_rate": 0.00046159874344712284, "loss": 0.7305, "step": 15555 }, { "epoch": 1.8704171174420003, "grad_norm": 0.587776243686676, "learning_rate": 0.00046157334630043614, "loss": 0.9416, "step": 15560 }, { "epoch": 1.8710181512200985, "grad_norm": 0.6806148290634155, "learning_rate": 0.00046154794145732294, "loss": 1.0219, "step": 15565 }, { "epoch": 1.871619184998197, "grad_norm": 0.48794689774513245, "learning_rate": 0.00046152252891870746, "loss": 0.8578, "step": 15570 }, { "epoch": 1.8722202187762953, "grad_norm": 0.4087463319301605, "learning_rate": 0.0004614971086855141, "loss": 1.0727, "step": 15575 }, { "epoch": 1.8728212525543935, "grad_norm": 0.6228848695755005, "learning_rate": 0.0004614716807586675, "loss": 0.6562, "step": 15580 }, { "epoch": 1.8734222863324919, "grad_norm": 0.7155863642692566, "learning_rate": 0.00046144624513909285, "loss": 0.7797, "step": 15585 }, { "epoch": 1.8740233201105903, "grad_norm": 0.8531315326690674, "learning_rate": 0.00046142080182771516, "loss": 0.8227, "step": 15590 }, { "epoch": 1.8746243538886884, "grad_norm": 0.6068195700645447, "learning_rate": 0.00046139535082546023, "loss": 0.9516, "step": 15595 }, { "epoch": 1.875225387666787, "grad_norm": 0.7528138756752014, "learning_rate": 0.00046136989213325377, "loss": 0.8586, "step": 15600 }, { "epoch": 1.875225387666787, "eval_loss": 1.85009765625, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2009, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 15600 }, { "epoch": 1.8758264214448852, "grad_norm": 0.822066605091095, "learning_rate": 0.00046134442575202183, "loss": 0.7758, "step": 15605 }, { "epoch": 1.8764274552229834, "grad_norm": 0.5799782872200012, "learning_rate": 0.000461318951682691, "loss": 0.8324, "step": 15610 }, { "epoch": 1.877028489001082, "grad_norm": 0.6150916218757629, "learning_rate": 0.0004612934699261878, "loss": 1.0238, "step": 15615 }, { "epoch": 1.8776295227791802, "grad_norm": 0.6659246683120728, "learning_rate": 0.0004612679804834392, "loss": 0.9031, "step": 15620 }, { "epoch": 1.8782305565572786, "grad_norm": 0.7410773038864136, "learning_rate": 0.0004612424833553725, "loss": 0.7906, "step": 15625 }, { "epoch": 1.878831590335377, "grad_norm": 0.502784788608551, "learning_rate": 0.0004612169785429151, "loss": 1.3734, "step": 15630 }, { "epoch": 1.8794326241134751, "grad_norm": 0.6301351189613342, "learning_rate": 0.0004611914660469949, "loss": 0.627, "step": 15635 }, { "epoch": 1.8800336578915735, "grad_norm": 0.6845929026603699, "learning_rate": 0.00046116594586854, "loss": 0.7234, "step": 15640 }, { "epoch": 1.880634691669672, "grad_norm": 0.4773063063621521, "learning_rate": 0.00046114041800847855, "loss": 0.8633, "step": 15645 }, { "epoch": 1.88123572544777, "grad_norm": 0.7948882579803467, "learning_rate": 0.00046111488246773933, "loss": 0.7387, "step": 15650 }, { "epoch": 1.8818367592258685, "grad_norm": 0.6024848818778992, "learning_rate": 0.00046108933924725125, "loss": 0.9027, "step": 15655 }, { "epoch": 1.8824377930039669, "grad_norm": 0.5325372219085693, "learning_rate": 0.00046106378834794357, "loss": 0.6184, "step": 15660 }, { "epoch": 1.883038826782065, "grad_norm": 0.671114981174469, "learning_rate": 0.00046103822977074555, "loss": 0.7105, "step": 15665 }, { "epoch": 1.8836398605601636, "grad_norm": 0.5747897028923035, "learning_rate": 0.0004610126635165871, "loss": 0.8516, "step": 15670 }, { "epoch": 1.8842408943382618, "grad_norm": 0.6841700673103333, "learning_rate": 0.00046098708958639814, "loss": 0.7992, "step": 15675 }, { "epoch": 1.88484192811636, "grad_norm": 0.8215524554252625, "learning_rate": 0.0004609615079811091, "loss": 0.9266, "step": 15680 }, { "epoch": 1.8854429618944586, "grad_norm": 0.7524236440658569, "learning_rate": 0.0004609359187016504, "loss": 1.1977, "step": 15685 }, { "epoch": 1.8860439956725568, "grad_norm": 0.6952587962150574, "learning_rate": 0.0004609103217489531, "loss": 1.2164, "step": 15690 }, { "epoch": 1.8866450294506552, "grad_norm": 0.5708470940589905, "learning_rate": 0.00046088471712394817, "loss": 1.2328, "step": 15695 }, { "epoch": 1.8872460632287535, "grad_norm": 0.779551088809967, "learning_rate": 0.0004608591048275672, "loss": 0.8492, "step": 15700 }, { "epoch": 1.8878470970068517, "grad_norm": 0.8021473288536072, "learning_rate": 0.00046083348486074174, "loss": 0.8625, "step": 15705 }, { "epoch": 1.88844813078495, "grad_norm": 0.6541423797607422, "learning_rate": 0.00046080785722440385, "loss": 1.0109, "step": 15710 }, { "epoch": 1.8890491645630485, "grad_norm": 0.549266517162323, "learning_rate": 0.0004607822219194858, "loss": 0.8125, "step": 15715 }, { "epoch": 1.8896501983411467, "grad_norm": 0.565075159072876, "learning_rate": 0.00046075657894692, "loss": 0.509, "step": 15720 }, { "epoch": 1.890251232119245, "grad_norm": 0.8590371608734131, "learning_rate": 0.00046073092830763943, "loss": 0.8475, "step": 15725 }, { "epoch": 1.8908522658973435, "grad_norm": 0.5703751444816589, "learning_rate": 0.0004607052700025771, "loss": 1.0836, "step": 15730 }, { "epoch": 1.8914532996754416, "grad_norm": 0.702496349811554, "learning_rate": 0.0004606796040326664, "loss": 0.8734, "step": 15735 }, { "epoch": 1.8920543334535402, "grad_norm": 0.8717988729476929, "learning_rate": 0.00046065393039884103, "loss": 0.7898, "step": 15740 }, { "epoch": 1.8926553672316384, "grad_norm": 1.0466914176940918, "learning_rate": 0.0004606282491020348, "loss": 0.9219, "step": 15745 }, { "epoch": 1.8932564010097368, "grad_norm": 0.5622535347938538, "learning_rate": 0.000460602560143182, "loss": 1.2086, "step": 15750 }, { "epoch": 1.8938574347878352, "grad_norm": 0.7398979067802429, "learning_rate": 0.00046057686352321726, "loss": 0.832, "step": 15755 }, { "epoch": 1.8944584685659334, "grad_norm": 0.6693282127380371, "learning_rate": 0.0004605511592430751, "loss": 0.8773, "step": 15760 }, { "epoch": 1.8950595023440318, "grad_norm": 0.6905662417411804, "learning_rate": 0.0004605254473036907, "loss": 1.1047, "step": 15765 }, { "epoch": 1.8956605361221301, "grad_norm": 0.7456283569335938, "learning_rate": 0.00046049972770599937, "loss": 0.8566, "step": 15770 }, { "epoch": 1.8962615699002283, "grad_norm": 0.790802001953125, "learning_rate": 0.0004604740004509367, "loss": 0.9727, "step": 15775 }, { "epoch": 1.8968626036783267, "grad_norm": 0.5572713613510132, "learning_rate": 0.00046044826553943855, "loss": 0.9875, "step": 15780 }, { "epoch": 1.897463637456425, "grad_norm": 0.7856184840202332, "learning_rate": 0.0004604225229724411, "loss": 0.6012, "step": 15785 }, { "epoch": 1.8980646712345233, "grad_norm": 0.41269636154174805, "learning_rate": 0.00046039677275088093, "loss": 0.7609, "step": 15790 }, { "epoch": 1.8986657050126217, "grad_norm": 0.7941540479660034, "learning_rate": 0.00046037101487569454, "loss": 1.0762, "step": 15795 }, { "epoch": 1.89926673879072, "grad_norm": 0.5511928200721741, "learning_rate": 0.000460345249347819, "loss": 0.9469, "step": 15800 }, { "epoch": 1.89926673879072, "eval_loss": 1.82958984375, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1913, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 15800 }, { "epoch": 1.8998677725688182, "grad_norm": 0.6881498098373413, "learning_rate": 0.0004603194761681917, "loss": 0.7824, "step": 15805 }, { "epoch": 1.9004688063469168, "grad_norm": 0.7449555993080139, "learning_rate": 0.00046029369533775, "loss": 1.1203, "step": 15810 }, { "epoch": 1.901069840125015, "grad_norm": 0.49304434657096863, "learning_rate": 0.00046026790685743186, "loss": 0.8383, "step": 15815 }, { "epoch": 1.9016708739031134, "grad_norm": 0.45290544629096985, "learning_rate": 0.00046024211072817536, "loss": 0.8273, "step": 15820 }, { "epoch": 1.9022719076812118, "grad_norm": 0.8016815185546875, "learning_rate": 0.0004602163069509189, "loss": 0.9418, "step": 15825 }, { "epoch": 1.90287294145931, "grad_norm": 0.5830326080322266, "learning_rate": 0.0004601904955266011, "loss": 0.7055, "step": 15830 }, { "epoch": 1.9034739752374084, "grad_norm": 0.8227860331535339, "learning_rate": 0.000460164676456161, "loss": 0.9711, "step": 15835 }, { "epoch": 1.9040750090155067, "grad_norm": 0.41357049345970154, "learning_rate": 0.00046013884974053777, "loss": 0.8301, "step": 15840 }, { "epoch": 1.904676042793605, "grad_norm": 0.7187300324440002, "learning_rate": 0.00046011301538067086, "loss": 0.9305, "step": 15845 }, { "epoch": 1.9052770765717033, "grad_norm": 0.6024027466773987, "learning_rate": 0.00046008717337750005, "loss": 0.9102, "step": 15850 }, { "epoch": 1.9058781103498017, "grad_norm": 0.44877317547798157, "learning_rate": 0.0004600613237319655, "loss": 0.8203, "step": 15855 }, { "epoch": 1.9064791441278999, "grad_norm": 0.6754428148269653, "learning_rate": 0.0004600354664450075, "loss": 0.9723, "step": 15860 }, { "epoch": 1.9070801779059983, "grad_norm": 0.6167812347412109, "learning_rate": 0.00046000960151756665, "loss": 0.8836, "step": 15865 }, { "epoch": 1.9076812116840967, "grad_norm": 0.5798412561416626, "learning_rate": 0.00045998372895058383, "loss": 1.032, "step": 15870 }, { "epoch": 1.9082822454621948, "grad_norm": 0.7080972194671631, "learning_rate": 0.00045995784874500027, "loss": 0.8758, "step": 15875 }, { "epoch": 1.9088832792402934, "grad_norm": 0.6686116456985474, "learning_rate": 0.00045993196090175726, "loss": 0.9258, "step": 15880 }, { "epoch": 1.9094843130183916, "grad_norm": 0.6391817331314087, "learning_rate": 0.00045990606542179667, "loss": 0.977, "step": 15885 }, { "epoch": 1.91008534679649, "grad_norm": 0.5385151505470276, "learning_rate": 0.0004598801623060605, "loss": 0.7238, "step": 15890 }, { "epoch": 1.9106863805745884, "grad_norm": 0.7799992561340332, "learning_rate": 0.000459854251555491, "loss": 1.0289, "step": 15895 }, { "epoch": 1.9112874143526866, "grad_norm": 0.6737104058265686, "learning_rate": 0.00045982833317103066, "loss": 0.8758, "step": 15900 }, { "epoch": 1.911888448130785, "grad_norm": 0.5996498465538025, "learning_rate": 0.00045980240715362244, "loss": 1.1383, "step": 15905 }, { "epoch": 1.9124894819088833, "grad_norm": 0.7501261234283447, "learning_rate": 0.0004597764735042094, "loss": 0.9027, "step": 15910 }, { "epoch": 1.9130905156869815, "grad_norm": 0.6438601613044739, "learning_rate": 0.00045975053222373485, "loss": 0.8875, "step": 15915 }, { "epoch": 1.91369154946508, "grad_norm": 0.4240054190158844, "learning_rate": 0.0004597245833131426, "loss": 0.6937, "step": 15920 }, { "epoch": 1.9142925832431783, "grad_norm": 0.7788777351379395, "learning_rate": 0.00045969862677337656, "loss": 0.8012, "step": 15925 }, { "epoch": 1.9148936170212765, "grad_norm": 0.6803364157676697, "learning_rate": 0.0004596726626053809, "loss": 0.7508, "step": 15930 }, { "epoch": 1.915494650799375, "grad_norm": 0.4940835237503052, "learning_rate": 0.00045964669081010007, "loss": 0.741, "step": 15935 }, { "epoch": 1.9160956845774733, "grad_norm": 0.9183889627456665, "learning_rate": 0.000459620711388479, "loss": 0.859, "step": 15940 }, { "epoch": 1.9166967183555714, "grad_norm": 0.7395238876342773, "learning_rate": 0.00045959472434146264, "loss": 0.7258, "step": 15945 }, { "epoch": 1.91729775213367, "grad_norm": 0.5668870210647583, "learning_rate": 0.0004595687296699964, "loss": 0.7707, "step": 15950 }, { "epoch": 1.9178987859117682, "grad_norm": 0.6207901239395142, "learning_rate": 0.0004595427273750258, "loss": 0.8766, "step": 15955 }, { "epoch": 1.9184998196898666, "grad_norm": 0.5633701682090759, "learning_rate": 0.00045951671745749677, "loss": 0.8629, "step": 15960 }, { "epoch": 1.919100853467965, "grad_norm": 0.4973164498806, "learning_rate": 0.0004594906999183555, "loss": 0.7621, "step": 15965 }, { "epoch": 1.9197018872460632, "grad_norm": 0.47049078345298767, "learning_rate": 0.0004594646747585484, "loss": 1.443, "step": 15970 }, { "epoch": 1.9203029210241616, "grad_norm": 0.583698570728302, "learning_rate": 0.0004594386419790222, "loss": 0.8309, "step": 15975 }, { "epoch": 1.92090395480226, "grad_norm": 0.5502817034721375, "learning_rate": 0.00045941260158072385, "loss": 0.8168, "step": 15980 }, { "epoch": 1.9215049885803581, "grad_norm": 0.6599786877632141, "learning_rate": 0.00045938655356460073, "loss": 0.9445, "step": 15985 }, { "epoch": 1.9221060223584565, "grad_norm": 0.4884117543697357, "learning_rate": 0.0004593604979316003, "loss": 0.9109, "step": 15990 }, { "epoch": 1.922707056136555, "grad_norm": 0.5936402678489685, "learning_rate": 0.0004593344346826705, "loss": 0.7719, "step": 15995 }, { "epoch": 1.923308089914653, "grad_norm": 0.5205080509185791, "learning_rate": 0.0004593083638187593, "loss": 0.9703, "step": 16000 }, { "epoch": 1.923308089914653, "eval_loss": 1.841796875, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1904, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 16000 }, { "epoch": 1.9239091236927517, "grad_norm": 0.8271058797836304, "learning_rate": 0.0004592822853408152, "loss": 1.0641, "step": 16005 }, { "epoch": 1.9245101574708499, "grad_norm": 0.8249663710594177, "learning_rate": 0.00045925619924978665, "loss": 0.8219, "step": 16010 }, { "epoch": 1.925111191248948, "grad_norm": 0.49755972623825073, "learning_rate": 0.0004592301055466229, "loss": 1.1812, "step": 16015 }, { "epoch": 1.9257122250270466, "grad_norm": 0.6515794396400452, "learning_rate": 0.00045920400423227296, "loss": 0.6527, "step": 16020 }, { "epoch": 1.9263132588051448, "grad_norm": 0.6021793484687805, "learning_rate": 0.00045917789530768634, "loss": 0.9773, "step": 16025 }, { "epoch": 1.9269142925832432, "grad_norm": 0.7682889103889465, "learning_rate": 0.00045915177877381283, "loss": 1.0414, "step": 16030 }, { "epoch": 1.9275153263613416, "grad_norm": 0.5919394493103027, "learning_rate": 0.0004591256546316025, "loss": 1.1367, "step": 16035 }, { "epoch": 1.9281163601394398, "grad_norm": 0.37970903515815735, "learning_rate": 0.0004590995228820056, "loss": 0.727, "step": 16040 }, { "epoch": 1.9287173939175382, "grad_norm": 0.6846290230751038, "learning_rate": 0.00045907338352597275, "loss": 0.9121, "step": 16045 }, { "epoch": 1.9293184276956366, "grad_norm": 0.6051719188690186, "learning_rate": 0.0004590472365644549, "loss": 1.0766, "step": 16050 }, { "epoch": 1.9299194614737347, "grad_norm": 0.3410149812698364, "learning_rate": 0.0004590210819984031, "loss": 0.8555, "step": 16055 }, { "epoch": 1.9305204952518331, "grad_norm": 0.7222450971603394, "learning_rate": 0.0004589949198287688, "loss": 0.9676, "step": 16060 }, { "epoch": 1.9311215290299315, "grad_norm": 0.9534603953361511, "learning_rate": 0.00045896875005650386, "loss": 1.368, "step": 16065 }, { "epoch": 1.9317225628080297, "grad_norm": 0.5912963151931763, "learning_rate": 0.00045894257268256, "loss": 0.8703, "step": 16070 }, { "epoch": 1.9323235965861283, "grad_norm": 0.4612202048301697, "learning_rate": 0.0004589163877078897, "loss": 0.7086, "step": 16075 }, { "epoch": 1.9329246303642265, "grad_norm": 0.6640035510063171, "learning_rate": 0.0004588901951334453, "loss": 1.0031, "step": 16080 }, { "epoch": 1.9335256641423249, "grad_norm": 1.1239770650863647, "learning_rate": 0.00045886399496017984, "loss": 0.9598, "step": 16085 }, { "epoch": 1.9341266979204232, "grad_norm": 0.5875447392463684, "learning_rate": 0.0004588377871890462, "loss": 0.9898, "step": 16090 }, { "epoch": 1.9347277316985214, "grad_norm": 1.2783493995666504, "learning_rate": 0.0004588115718209978, "loss": 1.084, "step": 16095 }, { "epoch": 1.9353287654766198, "grad_norm": 0.5066685676574707, "learning_rate": 0.0004587853488569883, "loss": 0.7211, "step": 16100 }, { "epoch": 1.9359297992547182, "grad_norm": 0.6114668846130371, "learning_rate": 0.00045875911829797156, "loss": 1.3625, "step": 16105 }, { "epoch": 1.9365308330328164, "grad_norm": 0.5979018211364746, "learning_rate": 0.00045873288014490195, "loss": 0.8961, "step": 16110 }, { "epoch": 1.9371318668109148, "grad_norm": 0.868495523929596, "learning_rate": 0.0004587066343987337, "loss": 0.7992, "step": 16115 }, { "epoch": 1.9377329005890132, "grad_norm": 0.47293373942375183, "learning_rate": 0.00045868038106042176, "loss": 0.7039, "step": 16120 }, { "epoch": 1.9383339343671113, "grad_norm": 0.785038948059082, "learning_rate": 0.000458654120130921, "loss": 1.3227, "step": 16125 }, { "epoch": 1.9389349681452097, "grad_norm": 0.546787440776825, "learning_rate": 0.0004586278516111868, "loss": 0.4719, "step": 16130 }, { "epoch": 1.939536001923308, "grad_norm": 0.44721922278404236, "learning_rate": 0.00045860157550217467, "loss": 0.7047, "step": 16135 }, { "epoch": 1.9401370357014063, "grad_norm": 0.6451538801193237, "learning_rate": 0.00045857529180484056, "loss": 0.9637, "step": 16140 }, { "epoch": 1.940738069479505, "grad_norm": 0.5918285846710205, "learning_rate": 0.0004585490005201405, "loss": 1.1523, "step": 16145 }, { "epoch": 1.941339103257603, "grad_norm": 0.5370919704437256, "learning_rate": 0.00045852270164903087, "loss": 1.0141, "step": 16150 }, { "epoch": 1.9419401370357015, "grad_norm": 0.8413140773773193, "learning_rate": 0.0004584963951924685, "loss": 0.9102, "step": 16155 }, { "epoch": 1.9425411708137998, "grad_norm": 0.7827994227409363, "learning_rate": 0.0004584700811514102, "loss": 1.1617, "step": 16160 }, { "epoch": 1.943142204591898, "grad_norm": 0.5252236723899841, "learning_rate": 0.00045844375952681324, "loss": 1.0453, "step": 16165 }, { "epoch": 1.9437432383699964, "grad_norm": 0.5354886651039124, "learning_rate": 0.0004584174303196351, "loss": 1.0852, "step": 16170 }, { "epoch": 1.9443442721480948, "grad_norm": 0.5166362524032593, "learning_rate": 0.0004583910935308336, "loss": 0.7516, "step": 16175 }, { "epoch": 1.944945305926193, "grad_norm": 0.6396731734275818, "learning_rate": 0.00045836474916136674, "loss": 0.6801, "step": 16180 }, { "epoch": 1.9455463397042914, "grad_norm": 0.7472630739212036, "learning_rate": 0.00045833839721219296, "loss": 0.8313, "step": 16185 }, { "epoch": 1.9461473734823898, "grad_norm": 0.4584224820137024, "learning_rate": 0.00045831203768427074, "loss": 0.8445, "step": 16190 }, { "epoch": 1.946748407260488, "grad_norm": 0.5246545672416687, "learning_rate": 0.00045828567057855905, "loss": 0.7848, "step": 16195 }, { "epoch": 1.9473494410385863, "grad_norm": 0.5090553164482117, "learning_rate": 0.00045825929589601714, "loss": 1.0273, "step": 16200 }, { "epoch": 1.9473494410385863, "eval_loss": 1.8635742664337158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2476, "eval_samples_per_second": 4.539, "eval_steps_per_second": 1.135, "step": 16200 }, { "epoch": 1.9479504748166847, "grad_norm": 0.5777876973152161, "learning_rate": 0.00045823291363760414, "loss": 0.7125, "step": 16205 }, { "epoch": 1.9485515085947829, "grad_norm": 0.596541166305542, "learning_rate": 0.00045820652380428007, "loss": 0.6934, "step": 16210 }, { "epoch": 1.9491525423728815, "grad_norm": 0.6587649583816528, "learning_rate": 0.0004581801263970047, "loss": 1.2547, "step": 16215 }, { "epoch": 1.9497535761509797, "grad_norm": 0.5737314820289612, "learning_rate": 0.0004581537214167385, "loss": 0.793, "step": 16220 }, { "epoch": 1.950354609929078, "grad_norm": 0.6597981452941895, "learning_rate": 0.00045812730886444185, "loss": 0.8008, "step": 16225 }, { "epoch": 1.9509556437071764, "grad_norm": 0.6425457000732422, "learning_rate": 0.0004581008887410756, "loss": 0.7875, "step": 16230 }, { "epoch": 1.9515566774852746, "grad_norm": 0.4491041600704193, "learning_rate": 0.0004580744610476008, "loss": 1.0414, "step": 16235 }, { "epoch": 1.952157711263373, "grad_norm": 0.7524226307868958, "learning_rate": 0.00045804802578497893, "loss": 0.7375, "step": 16240 }, { "epoch": 1.9527587450414714, "grad_norm": 0.486409455537796, "learning_rate": 0.0004580215829541715, "loss": 0.582, "step": 16245 }, { "epoch": 1.9533597788195696, "grad_norm": 0.4699746072292328, "learning_rate": 0.00045799513255614043, "loss": 0.8316, "step": 16250 }, { "epoch": 1.953960812597668, "grad_norm": 0.41282719373703003, "learning_rate": 0.0004579686745918481, "loss": 0.7004, "step": 16255 }, { "epoch": 1.9545618463757664, "grad_norm": 0.5473438501358032, "learning_rate": 0.0004579422090622567, "loss": 1.0039, "step": 16260 }, { "epoch": 1.9551628801538645, "grad_norm": 0.4732927083969116, "learning_rate": 0.0004579157359683291, "loss": 0.9215, "step": 16265 }, { "epoch": 1.9557639139319631, "grad_norm": 0.6017240881919861, "learning_rate": 0.00045788925531102846, "loss": 1.1148, "step": 16270 }, { "epoch": 1.9563649477100613, "grad_norm": 0.7589472532272339, "learning_rate": 0.0004578627670913178, "loss": 0.582, "step": 16275 }, { "epoch": 1.9569659814881595, "grad_norm": 1.0253260135650635, "learning_rate": 0.0004578362713101608, "loss": 0.9477, "step": 16280 }, { "epoch": 1.957567015266258, "grad_norm": 0.5631656050682068, "learning_rate": 0.00045780976796852134, "loss": 0.8414, "step": 16285 }, { "epoch": 1.9581680490443563, "grad_norm": 0.4838091731071472, "learning_rate": 0.00045778325706736356, "loss": 0.9289, "step": 16290 }, { "epoch": 1.9587690828224547, "grad_norm": 0.6642307043075562, "learning_rate": 0.0004577567386076518, "loss": 0.8387, "step": 16295 }, { "epoch": 1.959370116600553, "grad_norm": 0.7522189617156982, "learning_rate": 0.00045773021259035065, "loss": 1.0305, "step": 16300 }, { "epoch": 1.9599711503786512, "grad_norm": 0.688406229019165, "learning_rate": 0.0004577036790164251, "loss": 1.0785, "step": 16305 }, { "epoch": 1.9605721841567496, "grad_norm": 0.5037306547164917, "learning_rate": 0.0004576771378868404, "loss": 0.8273, "step": 16310 }, { "epoch": 1.961173217934848, "grad_norm": 0.5705709457397461, "learning_rate": 0.0004576505892025621, "loss": 0.593, "step": 16315 }, { "epoch": 1.9617742517129462, "grad_norm": 0.3605443835258484, "learning_rate": 0.0004576240329645558, "loss": 1.1641, "step": 16320 }, { "epoch": 1.9623752854910446, "grad_norm": 0.7139511108398438, "learning_rate": 0.0004575974691737876, "loss": 1.2164, "step": 16325 }, { "epoch": 1.962976319269143, "grad_norm": 0.5320140719413757, "learning_rate": 0.00045757089783122394, "loss": 1.066, "step": 16330 }, { "epoch": 1.9635773530472411, "grad_norm": 0.5788782835006714, "learning_rate": 0.00045754431893783133, "loss": 1.2133, "step": 16335 }, { "epoch": 1.9641783868253397, "grad_norm": 0.6410330533981323, "learning_rate": 0.0004575177324945765, "loss": 0.8367, "step": 16340 }, { "epoch": 1.964779420603438, "grad_norm": 0.8639314770698547, "learning_rate": 0.0004574911385024268, "loss": 0.9367, "step": 16345 }, { "epoch": 1.965380454381536, "grad_norm": 0.703011691570282, "learning_rate": 0.00045746453696234946, "loss": 0.7672, "step": 16350 }, { "epoch": 1.9659814881596347, "grad_norm": 0.9743753671646118, "learning_rate": 0.0004574379278753123, "loss": 1.2141, "step": 16355 }, { "epoch": 1.9665825219377329, "grad_norm": 0.6580327153205872, "learning_rate": 0.0004574113112422832, "loss": 1.0539, "step": 16360 }, { "epoch": 1.9671835557158313, "grad_norm": 0.600508451461792, "learning_rate": 0.0004573846870642304, "loss": 0.9016, "step": 16365 }, { "epoch": 1.9677845894939296, "grad_norm": 0.8149005770683289, "learning_rate": 0.00045735805534212253, "loss": 1.1367, "step": 16370 }, { "epoch": 1.9683856232720278, "grad_norm": 0.600533664226532, "learning_rate": 0.00045733141607692823, "loss": 0.734, "step": 16375 }, { "epoch": 1.9689866570501262, "grad_norm": 0.5731871128082275, "learning_rate": 0.0004573047692696166, "loss": 1.0016, "step": 16380 }, { "epoch": 1.9695876908282246, "grad_norm": 0.4646255373954773, "learning_rate": 0.000457278114921157, "loss": 1.0492, "step": 16385 }, { "epoch": 1.9701887246063228, "grad_norm": 0.8313210010528564, "learning_rate": 0.0004572514530325191, "loss": 1.0297, "step": 16390 }, { "epoch": 1.9707897583844212, "grad_norm": 0.36958426237106323, "learning_rate": 0.0004572247836046726, "loss": 0.9191, "step": 16395 }, { "epoch": 1.9713907921625196, "grad_norm": 0.48828598856925964, "learning_rate": 0.00045719810663858785, "loss": 0.7652, "step": 16400 }, { "epoch": 1.9713907921625196, "eval_loss": 1.841699242591858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1692, "eval_samples_per_second": 4.549, "eval_steps_per_second": 1.137, "step": 16400 }, { "epoch": 1.9719918259406177, "grad_norm": 0.8144629001617432, "learning_rate": 0.0004571714221352351, "loss": 0.9262, "step": 16405 }, { "epoch": 1.9725928597187163, "grad_norm": 0.6654730439186096, "learning_rate": 0.0004571447300955852, "loss": 0.8715, "step": 16410 }, { "epoch": 1.9731938934968145, "grad_norm": 0.6317954659461975, "learning_rate": 0.00045711803052060905, "loss": 0.7508, "step": 16415 }, { "epoch": 1.973794927274913, "grad_norm": 0.5145111083984375, "learning_rate": 0.00045709132341127803, "loss": 0.9332, "step": 16420 }, { "epoch": 1.9743959610530113, "grad_norm": 0.6377562284469604, "learning_rate": 0.00045706460876856347, "loss": 1.1824, "step": 16425 }, { "epoch": 1.9749969948311095, "grad_norm": 0.5247347950935364, "learning_rate": 0.00045703788659343727, "loss": 0.8875, "step": 16430 }, { "epoch": 1.9755980286092079, "grad_norm": 0.5502212643623352, "learning_rate": 0.00045701115688687156, "loss": 0.7725, "step": 16435 }, { "epoch": 1.9761990623873062, "grad_norm": 0.6441766023635864, "learning_rate": 0.0004569844196498386, "loss": 0.8445, "step": 16440 }, { "epoch": 1.9768000961654044, "grad_norm": 0.6154478192329407, "learning_rate": 0.0004569576748833111, "loss": 0.7738, "step": 16445 }, { "epoch": 1.9774011299435028, "grad_norm": 0.6955156326293945, "learning_rate": 0.00045693092258826184, "loss": 0.8293, "step": 16450 }, { "epoch": 1.9780021637216012, "grad_norm": 0.6052159070968628, "learning_rate": 0.0004569041627656641, "loss": 1.0238, "step": 16455 }, { "epoch": 1.9786031974996994, "grad_norm": 0.6839408874511719, "learning_rate": 0.0004568773954164913, "loss": 1.0125, "step": 16460 }, { "epoch": 1.9792042312777978, "grad_norm": 0.6588771343231201, "learning_rate": 0.00045685062054171704, "loss": 0.8223, "step": 16465 }, { "epoch": 1.9798052650558962, "grad_norm": 0.7799257636070251, "learning_rate": 0.00045682383814231556, "loss": 0.7348, "step": 16470 }, { "epoch": 1.9804062988339943, "grad_norm": 0.6425604224205017, "learning_rate": 0.00045679704821926083, "loss": 0.8941, "step": 16475 }, { "epoch": 1.981007332612093, "grad_norm": 0.7071772217750549, "learning_rate": 0.00045677025077352765, "loss": 0.5846, "step": 16480 }, { "epoch": 1.9816083663901911, "grad_norm": 0.5076990127563477, "learning_rate": 0.0004567434458060907, "loss": 0.6984, "step": 16485 }, { "epoch": 1.9822094001682895, "grad_norm": 0.524396538734436, "learning_rate": 0.00045671663331792504, "loss": 0.6871, "step": 16490 }, { "epoch": 1.982810433946388, "grad_norm": 0.47433626651763916, "learning_rate": 0.0004566898133100061, "loss": 0.9824, "step": 16495 }, { "epoch": 1.983411467724486, "grad_norm": 0.7505754232406616, "learning_rate": 0.00045666298578330957, "loss": 1.0453, "step": 16500 }, { "epoch": 1.9840125015025845, "grad_norm": 0.2976857125759125, "learning_rate": 0.0004566361507388112, "loss": 0.7518, "step": 16505 }, { "epoch": 1.9846135352806829, "grad_norm": 0.7059286832809448, "learning_rate": 0.0004566093081774873, "loss": 0.743, "step": 16510 }, { "epoch": 1.985214569058781, "grad_norm": 0.44888272881507874, "learning_rate": 0.0004565824581003143, "loss": 0.5859, "step": 16515 }, { "epoch": 1.9858156028368794, "grad_norm": 0.6957870721817017, "learning_rate": 0.00045655560050826885, "loss": 0.8805, "step": 16520 }, { "epoch": 1.9864166366149778, "grad_norm": 0.6294426918029785, "learning_rate": 0.000456528735402328, "loss": 1.1461, "step": 16525 }, { "epoch": 1.987017670393076, "grad_norm": 0.5553144216537476, "learning_rate": 0.00045650186278346907, "loss": 0.7027, "step": 16530 }, { "epoch": 1.9876187041711744, "grad_norm": 0.5432278513908386, "learning_rate": 0.0004564749826526696, "loss": 1.1594, "step": 16535 }, { "epoch": 1.9882197379492728, "grad_norm": 0.6596419811248779, "learning_rate": 0.0004564480950109073, "loss": 1.0203, "step": 16540 }, { "epoch": 1.988820771727371, "grad_norm": 0.72481369972229, "learning_rate": 0.0004564211998591604, "loss": 1.3977, "step": 16545 }, { "epoch": 1.9894218055054695, "grad_norm": 0.5712243318557739, "learning_rate": 0.0004563942971984073, "loss": 0.791, "step": 16550 }, { "epoch": 1.9900228392835677, "grad_norm": 0.678567111492157, "learning_rate": 0.00045636738702962645, "loss": 0.8633, "step": 16555 }, { "epoch": 1.990623873061666, "grad_norm": 0.5263369083404541, "learning_rate": 0.00045634046935379694, "loss": 1.0648, "step": 16560 }, { "epoch": 1.9912249068397645, "grad_norm": 0.5460792779922485, "learning_rate": 0.0004563135441718978, "loss": 0.5568, "step": 16565 }, { "epoch": 1.9918259406178627, "grad_norm": 0.803355872631073, "learning_rate": 0.00045628661148490864, "loss": 1.0352, "step": 16570 }, { "epoch": 1.992426974395961, "grad_norm": 0.6674015522003174, "learning_rate": 0.0004562596712938091, "loss": 1.1313, "step": 16575 }, { "epoch": 1.9930280081740595, "grad_norm": 0.8867233991622925, "learning_rate": 0.00045623272359957923, "loss": 0.7734, "step": 16580 }, { "epoch": 1.9936290419521576, "grad_norm": 0.605686604976654, "learning_rate": 0.0004562057684031993, "loss": 0.7906, "step": 16585 }, { "epoch": 1.994230075730256, "grad_norm": 0.512194037437439, "learning_rate": 0.0004561788057056499, "loss": 0.7535, "step": 16590 }, { "epoch": 1.9948311095083544, "grad_norm": 0.9268355369567871, "learning_rate": 0.0004561518355079118, "loss": 0.8102, "step": 16595 }, { "epoch": 1.9954321432864526, "grad_norm": 0.552725613117218, "learning_rate": 0.00045612485781096606, "loss": 0.4604, "step": 16600 }, { "epoch": 1.9954321432864526, "eval_loss": 1.8494141101837158, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2065, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 16600 }, { "epoch": 1.9960331770645512, "grad_norm": 0.7407888770103455, "learning_rate": 0.00045609787261579416, "loss": 0.8367, "step": 16605 }, { "epoch": 1.9966342108426494, "grad_norm": 0.5645660758018494, "learning_rate": 0.00045607087992337764, "loss": 0.9859, "step": 16610 }, { "epoch": 1.9972352446207475, "grad_norm": 0.5735164284706116, "learning_rate": 0.0004560438797346985, "loss": 0.9969, "step": 16615 }, { "epoch": 1.9978362783988461, "grad_norm": 0.5844913721084595, "learning_rate": 0.00045601687205073886, "loss": 0.718, "step": 16620 }, { "epoch": 1.9984373121769443, "grad_norm": 0.7222350239753723, "learning_rate": 0.0004559898568724813, "loss": 0.9086, "step": 16625 }, { "epoch": 1.9990383459550427, "grad_norm": 0.5324108600616455, "learning_rate": 0.00045596283420090835, "loss": 0.866, "step": 16630 }, { "epoch": 1.999639379733141, "grad_norm": 1.0802171230316162, "learning_rate": 0.0004559358040370032, "loss": 0.7438, "step": 16635 }, { "epoch": 2.0002404135112393, "grad_norm": 0.6004274487495422, "learning_rate": 0.00045590876638174906, "loss": 0.7594, "step": 16640 }, { "epoch": 2.000841447289338, "grad_norm": 0.6771324872970581, "learning_rate": 0.0004558817212361294, "loss": 0.7867, "step": 16645 }, { "epoch": 2.001442481067436, "grad_norm": 0.400155633687973, "learning_rate": 0.00045585466860112824, "loss": 0.4176, "step": 16650 }, { "epoch": 2.0020435148455342, "grad_norm": 0.8131656050682068, "learning_rate": 0.00045582760847772953, "loss": 0.973, "step": 16655 }, { "epoch": 2.002644548623633, "grad_norm": 0.5619331002235413, "learning_rate": 0.0004558005408669177, "loss": 0.9746, "step": 16660 }, { "epoch": 2.003245582401731, "grad_norm": 0.7898737192153931, "learning_rate": 0.0004557734657696773, "loss": 0.6988, "step": 16665 }, { "epoch": 2.003846616179829, "grad_norm": 0.9394904971122742, "learning_rate": 0.00045574638318699335, "loss": 0.5539, "step": 16670 }, { "epoch": 2.004447649957928, "grad_norm": 0.5570420026779175, "learning_rate": 0.000455719293119851, "loss": 0.8906, "step": 16675 }, { "epoch": 2.005048683736026, "grad_norm": 0.6177582740783691, "learning_rate": 0.00045569219556923567, "loss": 0.8406, "step": 16680 }, { "epoch": 2.005649717514124, "grad_norm": 0.7129695415496826, "learning_rate": 0.0004556650905361331, "loss": 0.5414, "step": 16685 }, { "epoch": 2.0062507512922227, "grad_norm": 0.7557781338691711, "learning_rate": 0.0004556379780215294, "loss": 0.741, "step": 16690 }, { "epoch": 2.006851785070321, "grad_norm": 0.6572217345237732, "learning_rate": 0.0004556108580264108, "loss": 0.6289, "step": 16695 }, { "epoch": 2.007452818848419, "grad_norm": 0.5258102416992188, "learning_rate": 0.00045558373055176374, "loss": 0.5613, "step": 16700 }, { "epoch": 2.0080538526265177, "grad_norm": 0.605327844619751, "learning_rate": 0.00045555659559857504, "loss": 0.7812, "step": 16705 }, { "epoch": 2.008654886404616, "grad_norm": 0.8384151458740234, "learning_rate": 0.0004555294531678319, "loss": 0.6602, "step": 16710 }, { "epoch": 2.0092559201827145, "grad_norm": 0.6054327487945557, "learning_rate": 0.00045550230326052167, "loss": 1.2688, "step": 16715 }, { "epoch": 2.0098569539608127, "grad_norm": 0.4540344774723053, "learning_rate": 0.00045547514587763197, "loss": 0.8289, "step": 16720 }, { "epoch": 2.010457987738911, "grad_norm": 0.7510804533958435, "learning_rate": 0.0004554479810201507, "loss": 0.7359, "step": 16725 }, { "epoch": 2.0110590215170094, "grad_norm": 0.438589483499527, "learning_rate": 0.000455420808689066, "loss": 0.602, "step": 16730 }, { "epoch": 2.0116600552951076, "grad_norm": 0.6101544499397278, "learning_rate": 0.0004553936288853663, "loss": 0.6504, "step": 16735 }, { "epoch": 2.0122610890732058, "grad_norm": 0.837252676486969, "learning_rate": 0.0004553664416100405, "loss": 0.6066, "step": 16740 }, { "epoch": 2.0128621228513044, "grad_norm": 0.5817500352859497, "learning_rate": 0.0004553392468640774, "loss": 0.9563, "step": 16745 }, { "epoch": 2.0134631566294026, "grad_norm": 0.6677179336547852, "learning_rate": 0.0004553120446484664, "loss": 0.675, "step": 16750 }, { "epoch": 2.0140641904075007, "grad_norm": 0.4963759481906891, "learning_rate": 0.00045528483496419693, "loss": 0.5809, "step": 16755 }, { "epoch": 2.0146652241855993, "grad_norm": 0.5529560446739197, "learning_rate": 0.00045525761781225885, "loss": 0.9437, "step": 16760 }, { "epoch": 2.0152662579636975, "grad_norm": 0.7732462882995605, "learning_rate": 0.0004552303931936422, "loss": 0.6137, "step": 16765 }, { "epoch": 2.0158672917417957, "grad_norm": 0.6000874042510986, "learning_rate": 0.00045520316110933735, "loss": 0.6309, "step": 16770 }, { "epoch": 2.0164683255198943, "grad_norm": 0.45318618416786194, "learning_rate": 0.00045517592156033503, "loss": 0.7961, "step": 16775 }, { "epoch": 2.0170693592979925, "grad_norm": 0.6987054347991943, "learning_rate": 0.000455148674547626, "loss": 0.5908, "step": 16780 }, { "epoch": 2.017670393076091, "grad_norm": 0.7891154289245605, "learning_rate": 0.0004551214200722015, "loss": 0.7176, "step": 16785 }, { "epoch": 2.0182714268541893, "grad_norm": 0.7376841306686401, "learning_rate": 0.000455094158135053, "loss": 0.8984, "step": 16790 }, { "epoch": 2.0188724606322874, "grad_norm": 0.5121150016784668, "learning_rate": 0.00045506688873717203, "loss": 0.7641, "step": 16795 }, { "epoch": 2.019473494410386, "grad_norm": 0.585405170917511, "learning_rate": 0.00045503961187955073, "loss": 1.0121, "step": 16800 }, { "epoch": 2.019473494410386, "eval_loss": 1.8629882335662842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2162, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 16800 }, { "epoch": 2.020074528188484, "grad_norm": 0.7785583734512329, "learning_rate": 0.00045501232756318136, "loss": 0.5211, "step": 16805 }, { "epoch": 2.0206755619665824, "grad_norm": 0.5569724440574646, "learning_rate": 0.0004549850357890564, "loss": 0.9281, "step": 16810 }, { "epoch": 2.021276595744681, "grad_norm": 0.5937827229499817, "learning_rate": 0.00045495773655816865, "loss": 0.7738, "step": 16815 }, { "epoch": 2.021877629522779, "grad_norm": 0.6806617975234985, "learning_rate": 0.0004549304298715112, "loss": 0.5266, "step": 16820 }, { "epoch": 2.0224786633008773, "grad_norm": 0.5024230480194092, "learning_rate": 0.0004549031157300773, "loss": 0.4988, "step": 16825 }, { "epoch": 2.023079697078976, "grad_norm": 0.7852580547332764, "learning_rate": 0.00045487579413486067, "loss": 1.0, "step": 16830 }, { "epoch": 2.023680730857074, "grad_norm": 0.9144189357757568, "learning_rate": 0.0004548484650868552, "loss": 0.9641, "step": 16835 }, { "epoch": 2.0242817646351723, "grad_norm": 0.5884906053543091, "learning_rate": 0.00045482112858705496, "loss": 0.7359, "step": 16840 }, { "epoch": 2.024882798413271, "grad_norm": 0.7806922197341919, "learning_rate": 0.00045479378463645433, "loss": 0.7574, "step": 16845 }, { "epoch": 2.025483832191369, "grad_norm": 0.9033584594726562, "learning_rate": 0.00045476643323604817, "loss": 0.7258, "step": 16850 }, { "epoch": 2.0260848659694677, "grad_norm": 0.6801016926765442, "learning_rate": 0.0004547390743868313, "loss": 0.6699, "step": 16855 }, { "epoch": 2.026685899747566, "grad_norm": 0.6402775645256042, "learning_rate": 0.000454711708089799, "loss": 0.7891, "step": 16860 }, { "epoch": 2.027286933525664, "grad_norm": 0.7287570834159851, "learning_rate": 0.00045468433434594683, "loss": 1.0602, "step": 16865 }, { "epoch": 2.0278879673037626, "grad_norm": 0.7366101145744324, "learning_rate": 0.00045465695315627044, "loss": 0.6277, "step": 16870 }, { "epoch": 2.028489001081861, "grad_norm": 0.710874617099762, "learning_rate": 0.000454629564521766, "loss": 0.452, "step": 16875 }, { "epoch": 2.029090034859959, "grad_norm": 0.6218833923339844, "learning_rate": 0.00045460216844342985, "loss": 0.6941, "step": 16880 }, { "epoch": 2.0296910686380576, "grad_norm": 0.769995391368866, "learning_rate": 0.00045457476492225847, "loss": 0.9078, "step": 16885 }, { "epoch": 2.0302921024161558, "grad_norm": 0.7024398446083069, "learning_rate": 0.0004545473539592487, "loss": 0.6605, "step": 16890 }, { "epoch": 2.030893136194254, "grad_norm": 0.6514043807983398, "learning_rate": 0.0004545199355553978, "loss": 0.7937, "step": 16895 }, { "epoch": 2.0314941699723525, "grad_norm": 0.9204562902450562, "learning_rate": 0.00045449250971170315, "loss": 0.6477, "step": 16900 }, { "epoch": 2.0320952037504507, "grad_norm": 0.5101679563522339, "learning_rate": 0.00045446507642916235, "loss": 0.7277, "step": 16905 }, { "epoch": 2.032696237528549, "grad_norm": 0.48708030581474304, "learning_rate": 0.0004544376357087734, "loss": 0.5152, "step": 16910 }, { "epoch": 2.0332972713066475, "grad_norm": 0.6405378580093384, "learning_rate": 0.0004544101875515345, "loss": 0.6273, "step": 16915 }, { "epoch": 2.0338983050847457, "grad_norm": 0.7723744511604309, "learning_rate": 0.0004543827319584441, "loss": 0.9156, "step": 16920 }, { "epoch": 2.0344993388628443, "grad_norm": 0.7031096816062927, "learning_rate": 0.00045435526893050095, "loss": 0.8066, "step": 16925 }, { "epoch": 2.0351003726409425, "grad_norm": 0.762577474117279, "learning_rate": 0.0004543277984687042, "loss": 0.6828, "step": 16930 }, { "epoch": 2.0357014064190406, "grad_norm": 0.9583767652511597, "learning_rate": 0.000454300320574053, "loss": 0.7316, "step": 16935 }, { "epoch": 2.0363024401971392, "grad_norm": 0.6798631548881531, "learning_rate": 0.000454272835247547, "loss": 0.7809, "step": 16940 }, { "epoch": 2.0369034739752374, "grad_norm": 0.7449055910110474, "learning_rate": 0.00045424534249018594, "loss": 0.4066, "step": 16945 }, { "epoch": 2.0375045077533356, "grad_norm": 0.5920630693435669, "learning_rate": 0.00045421784230297, "loss": 0.8461, "step": 16950 }, { "epoch": 2.038105541531434, "grad_norm": 0.6102299690246582, "learning_rate": 0.00045419033468689965, "loss": 0.5344, "step": 16955 }, { "epoch": 2.0387065753095324, "grad_norm": 0.6877725124359131, "learning_rate": 0.0004541628196429754, "loss": 0.6898, "step": 16960 }, { "epoch": 2.0393076090876305, "grad_norm": 0.8014640212059021, "learning_rate": 0.00045413529717219813, "loss": 0.9359, "step": 16965 }, { "epoch": 2.039908642865729, "grad_norm": 0.544452428817749, "learning_rate": 0.00045410776727556923, "loss": 0.4379, "step": 16970 }, { "epoch": 2.0405096766438273, "grad_norm": 0.47163134813308716, "learning_rate": 0.00045408022995408994, "loss": 0.6457, "step": 16975 }, { "epoch": 2.0411107104219255, "grad_norm": 0.5041592717170715, "learning_rate": 0.00045405268520876216, "loss": 0.5918, "step": 16980 }, { "epoch": 2.041711744200024, "grad_norm": 0.9241482615470886, "learning_rate": 0.00045402513304058776, "loss": 0.6109, "step": 16985 }, { "epoch": 2.0423127779781223, "grad_norm": 0.9017901420593262, "learning_rate": 0.000453997573450569, "loss": 0.7812, "step": 16990 }, { "epoch": 2.042913811756221, "grad_norm": 0.5215926170349121, "learning_rate": 0.0004539700064397085, "loss": 0.6207, "step": 16995 }, { "epoch": 2.043514845534319, "grad_norm": 0.8023661375045776, "learning_rate": 0.0004539424320090091, "loss": 0.6945, "step": 17000 }, { "epoch": 2.043514845534319, "eval_loss": 1.858496069908142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1921, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 17000 }, { "epoch": 2.0441158793124172, "grad_norm": 0.6612244248390198, "learning_rate": 0.0004539148501594739, "loss": 0.782, "step": 17005 }, { "epoch": 2.044716913090516, "grad_norm": 0.5667388439178467, "learning_rate": 0.00045388726089210603, "loss": 0.7711, "step": 17010 }, { "epoch": 2.045317946868614, "grad_norm": 0.7826765179634094, "learning_rate": 0.0004538596642079093, "loss": 0.5684, "step": 17015 }, { "epoch": 2.045918980646712, "grad_norm": 0.6680711507797241, "learning_rate": 0.0004538320601078875, "loss": 0.8781, "step": 17020 }, { "epoch": 2.046520014424811, "grad_norm": 0.9267891049385071, "learning_rate": 0.0004538044485930449, "loss": 0.8773, "step": 17025 }, { "epoch": 2.047121048202909, "grad_norm": 0.5493230819702148, "learning_rate": 0.00045377682966438583, "loss": 0.5775, "step": 17030 }, { "epoch": 2.047722081981007, "grad_norm": 0.7469697594642639, "learning_rate": 0.000453749203322915, "loss": 0.9074, "step": 17035 }, { "epoch": 2.0483231157591058, "grad_norm": 0.627752959728241, "learning_rate": 0.0004537215695696374, "loss": 0.6449, "step": 17040 }, { "epoch": 2.048924149537204, "grad_norm": 0.7816077470779419, "learning_rate": 0.00045369392840555834, "loss": 0.9039, "step": 17045 }, { "epoch": 2.0495251833153025, "grad_norm": 0.6993010640144348, "learning_rate": 0.0004536662798316832, "loss": 0.7586, "step": 17050 }, { "epoch": 2.0501262170934007, "grad_norm": 0.9948700666427612, "learning_rate": 0.0004536386238490177, "loss": 0.8039, "step": 17055 }, { "epoch": 2.050727250871499, "grad_norm": 0.476158082485199, "learning_rate": 0.0004536109604585681, "loss": 0.8164, "step": 17060 }, { "epoch": 2.0513282846495975, "grad_norm": 0.901404857635498, "learning_rate": 0.00045358328966134056, "loss": 0.6344, "step": 17065 }, { "epoch": 2.0519293184276957, "grad_norm": 0.49783027172088623, "learning_rate": 0.00045355561145834166, "loss": 0.7258, "step": 17070 }, { "epoch": 2.052530352205794, "grad_norm": 0.6956377029418945, "learning_rate": 0.00045352792585057834, "loss": 0.4779, "step": 17075 }, { "epoch": 2.0531313859838924, "grad_norm": 0.9687783718109131, "learning_rate": 0.00045350023283905773, "loss": 0.6008, "step": 17080 }, { "epoch": 2.0537324197619906, "grad_norm": 0.4525292217731476, "learning_rate": 0.0004534725324247871, "loss": 0.5566, "step": 17085 }, { "epoch": 2.054333453540089, "grad_norm": 0.7543963193893433, "learning_rate": 0.00045344482460877417, "loss": 0.8473, "step": 17090 }, { "epoch": 2.0549344873181874, "grad_norm": 0.6699605584144592, "learning_rate": 0.0004534171093920269, "loss": 0.9203, "step": 17095 }, { "epoch": 2.0555355210962856, "grad_norm": 0.7298281192779541, "learning_rate": 0.0004533893867755535, "loss": 0.9375, "step": 17100 }, { "epoch": 2.0561365548743837, "grad_norm": 0.7011690735816956, "learning_rate": 0.00045336165676036235, "loss": 0.5965, "step": 17105 }, { "epoch": 2.0567375886524824, "grad_norm": 0.4685646891593933, "learning_rate": 0.00045333391934746224, "loss": 0.7605, "step": 17110 }, { "epoch": 2.0573386224305805, "grad_norm": 0.6965895295143127, "learning_rate": 0.00045330617453786224, "loss": 0.6877, "step": 17115 }, { "epoch": 2.057939656208679, "grad_norm": 0.5974926948547363, "learning_rate": 0.0004532784223325716, "loss": 0.857, "step": 17120 }, { "epoch": 2.0585406899867773, "grad_norm": 0.7837967872619629, "learning_rate": 0.00045325066273259974, "loss": 0.5879, "step": 17125 }, { "epoch": 2.0591417237648755, "grad_norm": 0.6797435879707336, "learning_rate": 0.0004532228957389566, "loss": 0.7086, "step": 17130 }, { "epoch": 2.059742757542974, "grad_norm": 0.6532315015792847, "learning_rate": 0.0004531951213526523, "loss": 0.5465, "step": 17135 }, { "epoch": 2.0603437913210723, "grad_norm": 0.5805181264877319, "learning_rate": 0.000453167339574697, "loss": 0.6316, "step": 17140 }, { "epoch": 2.0609448250991704, "grad_norm": 0.5898436307907104, "learning_rate": 0.00045313955040610153, "loss": 1.1227, "step": 17145 }, { "epoch": 2.061545858877269, "grad_norm": 0.6501821279525757, "learning_rate": 0.0004531117538478767, "loss": 0.8016, "step": 17150 }, { "epoch": 2.062146892655367, "grad_norm": 0.7484675645828247, "learning_rate": 0.00045308394990103366, "loss": 0.7309, "step": 17155 }, { "epoch": 2.0627479264334654, "grad_norm": 0.7221248149871826, "learning_rate": 0.0004530561385665838, "loss": 0.7215, "step": 17160 }, { "epoch": 2.063348960211564, "grad_norm": 0.8504783511161804, "learning_rate": 0.0004530283198455389, "loss": 0.807, "step": 17165 }, { "epoch": 2.063949993989662, "grad_norm": 0.6153644919395447, "learning_rate": 0.0004530004937389108, "loss": 0.7141, "step": 17170 }, { "epoch": 2.0645510277677603, "grad_norm": 0.3961271345615387, "learning_rate": 0.00045297266024771186, "loss": 0.5598, "step": 17175 }, { "epoch": 2.065152061545859, "grad_norm": 0.8818097710609436, "learning_rate": 0.00045294481937295457, "loss": 0.5008, "step": 17180 }, { "epoch": 2.065753095323957, "grad_norm": 0.7384099364280701, "learning_rate": 0.0004529169711156517, "loss": 0.6398, "step": 17185 }, { "epoch": 2.0663541291020557, "grad_norm": 0.7998079061508179, "learning_rate": 0.0004528891154768161, "loss": 1.0223, "step": 17190 }, { "epoch": 2.066955162880154, "grad_norm": 0.6071758270263672, "learning_rate": 0.00045286125245746134, "loss": 0.6395, "step": 17195 }, { "epoch": 2.067556196658252, "grad_norm": 0.7403247356414795, "learning_rate": 0.0004528333820586009, "loss": 0.5687, "step": 17200 }, { "epoch": 2.067556196658252, "eval_loss": 1.8771483898162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.2027, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 17200 }, { "epoch": 2.0681572304363507, "grad_norm": 0.8686665892601013, "learning_rate": 0.00045280550428124857, "loss": 1.0352, "step": 17205 }, { "epoch": 2.068758264214449, "grad_norm": 0.616020679473877, "learning_rate": 0.00045277761912641845, "loss": 0.7422, "step": 17210 }, { "epoch": 2.069359297992547, "grad_norm": 0.7111455798149109, "learning_rate": 0.000452749726595125, "loss": 0.6477, "step": 17215 }, { "epoch": 2.0699603317706456, "grad_norm": 0.590523898601532, "learning_rate": 0.0004527218266883829, "loss": 0.6486, "step": 17220 }, { "epoch": 2.070561365548744, "grad_norm": 1.0232583284378052, "learning_rate": 0.00045269391940720705, "loss": 0.6562, "step": 17225 }, { "epoch": 2.071162399326842, "grad_norm": 0.6734606623649597, "learning_rate": 0.00045266600475261253, "loss": 0.9355, "step": 17230 }, { "epoch": 2.0717634331049406, "grad_norm": 0.6992089748382568, "learning_rate": 0.00045263808272561493, "loss": 0.7695, "step": 17235 }, { "epoch": 2.0723644668830388, "grad_norm": 0.6774486899375916, "learning_rate": 0.0004526101533272298, "loss": 0.5449, "step": 17240 }, { "epoch": 2.0729655006611374, "grad_norm": 0.680239200592041, "learning_rate": 0.00045258221655847334, "loss": 0.7352, "step": 17245 }, { "epoch": 2.0735665344392356, "grad_norm": 0.6663438081741333, "learning_rate": 0.00045255427242036164, "loss": 0.6836, "step": 17250 }, { "epoch": 2.0741675682173337, "grad_norm": 0.6518068909645081, "learning_rate": 0.0004525263209139113, "loss": 0.9281, "step": 17255 }, { "epoch": 2.0747686019954323, "grad_norm": 0.7786570191383362, "learning_rate": 0.00045249836204013915, "loss": 0.7063, "step": 17260 }, { "epoch": 2.0753696357735305, "grad_norm": 0.570971667766571, "learning_rate": 0.0004524703958000622, "loss": 0.9047, "step": 17265 }, { "epoch": 2.0759706695516287, "grad_norm": 0.6106430888175964, "learning_rate": 0.00045244242219469777, "loss": 0.6094, "step": 17270 }, { "epoch": 2.0765717033297273, "grad_norm": 0.6374453902244568, "learning_rate": 0.00045241444122506353, "loss": 0.5941, "step": 17275 }, { "epoch": 2.0771727371078255, "grad_norm": 0.767215371131897, "learning_rate": 0.00045238645289217725, "loss": 0.5746, "step": 17280 }, { "epoch": 2.0777737708859236, "grad_norm": 0.6309446692466736, "learning_rate": 0.00045235845719705716, "loss": 0.7789, "step": 17285 }, { "epoch": 2.0783748046640222, "grad_norm": 0.6179651618003845, "learning_rate": 0.00045233045414072153, "loss": 0.7844, "step": 17290 }, { "epoch": 2.0789758384421204, "grad_norm": 0.6599948406219482, "learning_rate": 0.00045230244372418914, "loss": 0.7367, "step": 17295 }, { "epoch": 2.0795768722202186, "grad_norm": 0.6810296773910522, "learning_rate": 0.00045227442594847903, "loss": 0.6168, "step": 17300 }, { "epoch": 2.080177905998317, "grad_norm": 0.6597148776054382, "learning_rate": 0.0004522464008146101, "loss": 0.5434, "step": 17305 }, { "epoch": 2.0807789397764154, "grad_norm": 0.705898642539978, "learning_rate": 0.0004522183683236021, "loss": 0.8578, "step": 17310 }, { "epoch": 2.0813799735545135, "grad_norm": 0.7407541871070862, "learning_rate": 0.0004521903284764747, "loss": 0.9508, "step": 17315 }, { "epoch": 2.081981007332612, "grad_norm": 0.7318904399871826, "learning_rate": 0.00045216228127424775, "loss": 0.8387, "step": 17320 }, { "epoch": 2.0825820411107103, "grad_norm": 0.7759041786193848, "learning_rate": 0.0004521342267179418, "loss": 0.6785, "step": 17325 }, { "epoch": 2.083183074888809, "grad_norm": 0.6000215411186218, "learning_rate": 0.00045210616480857713, "loss": 0.6395, "step": 17330 }, { "epoch": 2.083784108666907, "grad_norm": 0.7510565519332886, "learning_rate": 0.0004520780955471748, "loss": 0.6348, "step": 17335 }, { "epoch": 2.0843851424450053, "grad_norm": 0.6598749756813049, "learning_rate": 0.0004520500189347557, "loss": 0.7203, "step": 17340 }, { "epoch": 2.084986176223104, "grad_norm": 0.7145467400550842, "learning_rate": 0.00045202193497234124, "loss": 0.8512, "step": 17345 }, { "epoch": 2.085587210001202, "grad_norm": 0.7418386340141296, "learning_rate": 0.000451993843660953, "loss": 0.4508, "step": 17350 }, { "epoch": 2.0861882437793002, "grad_norm": 0.6311342716217041, "learning_rate": 0.0004519657450016129, "loss": 0.6426, "step": 17355 }, { "epoch": 2.086789277557399, "grad_norm": 0.7941176891326904, "learning_rate": 0.00045193763899534315, "loss": 0.7762, "step": 17360 }, { "epoch": 2.087390311335497, "grad_norm": 0.8457861542701721, "learning_rate": 0.000451909525643166, "loss": 0.9812, "step": 17365 }, { "epoch": 2.087991345113595, "grad_norm": 0.5690633058547974, "learning_rate": 0.0004518814049461043, "loss": 0.6332, "step": 17370 }, { "epoch": 2.088592378891694, "grad_norm": 0.6167107224464417, "learning_rate": 0.00045185327690518086, "loss": 0.6973, "step": 17375 }, { "epoch": 2.089193412669792, "grad_norm": 0.7467222213745117, "learning_rate": 0.00045182514152141903, "loss": 0.9738, "step": 17380 }, { "epoch": 2.0897944464478906, "grad_norm": 0.8636981844902039, "learning_rate": 0.00045179699879584217, "loss": 0.7328, "step": 17385 }, { "epoch": 2.0903954802259888, "grad_norm": 0.7264381051063538, "learning_rate": 0.0004517688487294741, "loss": 0.7211, "step": 17390 }, { "epoch": 2.090996514004087, "grad_norm": 1.3584240674972534, "learning_rate": 0.00045174069132333886, "loss": 0.7707, "step": 17395 }, { "epoch": 2.0915975477821855, "grad_norm": 0.684133768081665, "learning_rate": 0.0004517125265784607, "loss": 0.8984, "step": 17400 }, { "epoch": 2.0915975477821855, "eval_loss": 1.895117163658142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1959, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 17400 }, { "epoch": 2.0921985815602837, "grad_norm": 0.7006523013114929, "learning_rate": 0.0004516843544958641, "loss": 0.7937, "step": 17405 }, { "epoch": 2.092799615338382, "grad_norm": 0.7982677221298218, "learning_rate": 0.0004516561750765741, "loss": 0.5328, "step": 17410 }, { "epoch": 2.0934006491164805, "grad_norm": 0.516133725643158, "learning_rate": 0.0004516279883216155, "loss": 0.5805, "step": 17415 }, { "epoch": 2.0940016828945787, "grad_norm": 0.6930840611457825, "learning_rate": 0.0004515997942320138, "loss": 0.8434, "step": 17420 }, { "epoch": 2.094602716672677, "grad_norm": 0.7119755744934082, "learning_rate": 0.0004515715928087947, "loss": 0.9844, "step": 17425 }, { "epoch": 2.0952037504507754, "grad_norm": 0.6991851925849915, "learning_rate": 0.0004515433840529839, "loss": 0.7012, "step": 17430 }, { "epoch": 2.0958047842288736, "grad_norm": 0.7983843684196472, "learning_rate": 0.00045151516796560767, "loss": 0.6113, "step": 17435 }, { "epoch": 2.096405818006972, "grad_norm": 0.7745969891548157, "learning_rate": 0.00045148694454769247, "loss": 0.6867, "step": 17440 }, { "epoch": 2.0970068517850704, "grad_norm": 0.6228597164154053, "learning_rate": 0.0004514587138002649, "loss": 0.4977, "step": 17445 }, { "epoch": 2.0976078855631686, "grad_norm": 0.9158568382263184, "learning_rate": 0.0004514304757243519, "loss": 0.7461, "step": 17450 }, { "epoch": 2.098208919341267, "grad_norm": 0.8394906520843506, "learning_rate": 0.0004514022303209807, "loss": 0.8773, "step": 17455 }, { "epoch": 2.0988099531193654, "grad_norm": 0.6034619212150574, "learning_rate": 0.00045137397759117894, "loss": 0.448, "step": 17460 }, { "epoch": 2.0994109868974635, "grad_norm": 0.846920907497406, "learning_rate": 0.00045134571753597413, "loss": 0.5891, "step": 17465 }, { "epoch": 2.100012020675562, "grad_norm": 0.6311703324317932, "learning_rate": 0.0004513174501563945, "loss": 0.7531, "step": 17470 }, { "epoch": 2.1006130544536603, "grad_norm": 0.8502973914146423, "learning_rate": 0.00045128917545346817, "loss": 0.8566, "step": 17475 }, { "epoch": 2.1012140882317585, "grad_norm": 0.969598650932312, "learning_rate": 0.00045126089342822375, "loss": 0.8512, "step": 17480 }, { "epoch": 2.101815122009857, "grad_norm": 1.1035313606262207, "learning_rate": 0.0004512326040816901, "loss": 0.8555, "step": 17485 }, { "epoch": 2.1024161557879553, "grad_norm": 0.4838935136795044, "learning_rate": 0.00045120430741489624, "loss": 0.9016, "step": 17490 }, { "epoch": 2.1030171895660534, "grad_norm": 0.5388343930244446, "learning_rate": 0.0004511760034288716, "loss": 0.7521, "step": 17495 }, { "epoch": 2.103618223344152, "grad_norm": 1.0079002380371094, "learning_rate": 0.00045114769212464573, "loss": 0.8016, "step": 17500 }, { "epoch": 2.10421925712225, "grad_norm": 0.43126386404037476, "learning_rate": 0.0004511193735032486, "loss": 0.7273, "step": 17505 }, { "epoch": 2.1048202909003484, "grad_norm": 0.5944538712501526, "learning_rate": 0.0004510910475657102, "loss": 0.4594, "step": 17510 }, { "epoch": 2.105421324678447, "grad_norm": 0.5367337465286255, "learning_rate": 0.00045106271431306113, "loss": 0.7672, "step": 17515 }, { "epoch": 2.106022358456545, "grad_norm": 0.5563839077949524, "learning_rate": 0.00045103437374633196, "loss": 0.8406, "step": 17520 }, { "epoch": 2.106623392234644, "grad_norm": 0.7489462494850159, "learning_rate": 0.0004510060258665536, "loss": 0.6223, "step": 17525 }, { "epoch": 2.107224426012742, "grad_norm": 0.4015835225582123, "learning_rate": 0.00045097767067475735, "loss": 0.5094, "step": 17530 }, { "epoch": 2.10782545979084, "grad_norm": 0.7061188817024231, "learning_rate": 0.0004509493081719747, "loss": 0.6773, "step": 17535 }, { "epoch": 2.1084264935689387, "grad_norm": 0.7431478500366211, "learning_rate": 0.00045092093835923737, "loss": 0.4648, "step": 17540 }, { "epoch": 2.109027527347037, "grad_norm": 0.9528364539146423, "learning_rate": 0.00045089256123757735, "loss": 0.6734, "step": 17545 }, { "epoch": 2.109628561125135, "grad_norm": 0.4772130846977234, "learning_rate": 0.0004508641768080269, "loss": 0.5215, "step": 17550 }, { "epoch": 2.1102295949032337, "grad_norm": 0.7699978351593018, "learning_rate": 0.0004508357850716186, "loss": 0.7137, "step": 17555 }, { "epoch": 2.110830628681332, "grad_norm": 0.3766671419143677, "learning_rate": 0.00045080738602938533, "loss": 0.5818, "step": 17560 }, { "epoch": 2.11143166245943, "grad_norm": 0.657067060470581, "learning_rate": 0.00045077897968236, "loss": 0.5984, "step": 17565 }, { "epoch": 2.1120326962375287, "grad_norm": 0.6125964522361755, "learning_rate": 0.000450750566031576, "loss": 0.8344, "step": 17570 }, { "epoch": 2.112633730015627, "grad_norm": 0.6069907546043396, "learning_rate": 0.0004507221450780671, "loss": 1.025, "step": 17575 }, { "epoch": 2.1132347637937254, "grad_norm": 0.7666451930999756, "learning_rate": 0.00045069371682286696, "loss": 0.8016, "step": 17580 }, { "epoch": 2.1138357975718236, "grad_norm": 0.6099645495414734, "learning_rate": 0.0004506652812670099, "loss": 0.7809, "step": 17585 }, { "epoch": 2.1144368313499218, "grad_norm": 0.7072697877883911, "learning_rate": 0.0004506368384115301, "loss": 0.5977, "step": 17590 }, { "epoch": 2.1150378651280204, "grad_norm": 0.5507746934890747, "learning_rate": 0.00045060838825746255, "loss": 0.6613, "step": 17595 }, { "epoch": 2.1156388989061186, "grad_norm": 0.488972932100296, "learning_rate": 0.0004505799308058418, "loss": 0.7414, "step": 17600 }, { "epoch": 2.1156388989061186, "eval_loss": 1.935644507408142, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.268, "eval_samples_per_second": 4.537, "eval_steps_per_second": 1.134, "step": 17600 }, { "epoch": 2.1162399326842167, "grad_norm": 1.0738722085952759, "learning_rate": 0.0004505514660577033, "loss": 0.6578, "step": 17605 }, { "epoch": 2.1168409664623153, "grad_norm": 0.5866246819496155, "learning_rate": 0.0004505229940140825, "loss": 0.7664, "step": 17610 }, { "epoch": 2.1174420002404135, "grad_norm": 0.7349161505699158, "learning_rate": 0.000450494514676015, "loss": 0.793, "step": 17615 }, { "epoch": 2.1180430340185117, "grad_norm": 0.8021662831306458, "learning_rate": 0.00045046602804453697, "loss": 0.782, "step": 17620 }, { "epoch": 2.1186440677966103, "grad_norm": 0.5936036705970764, "learning_rate": 0.00045043753412068457, "loss": 0.6574, "step": 17625 }, { "epoch": 2.1192451015747085, "grad_norm": 0.7160750031471252, "learning_rate": 0.0004504090329054943, "loss": 0.8656, "step": 17630 }, { "epoch": 2.1198461353528066, "grad_norm": 0.6168115735054016, "learning_rate": 0.000450380524400003, "loss": 0.6082, "step": 17635 }, { "epoch": 2.1204471691309053, "grad_norm": 0.6771405339241028, "learning_rate": 0.0004503520086052477, "loss": 0.9531, "step": 17640 }, { "epoch": 2.1210482029090034, "grad_norm": 0.6585192680358887, "learning_rate": 0.00045032348552226587, "loss": 0.5445, "step": 17645 }, { "epoch": 2.1216492366871016, "grad_norm": 0.492551326751709, "learning_rate": 0.00045029495515209485, "loss": 0.6473, "step": 17650 }, { "epoch": 2.1222502704652, "grad_norm": 0.7444902658462524, "learning_rate": 0.00045026641749577266, "loss": 0.6602, "step": 17655 }, { "epoch": 2.1228513042432984, "grad_norm": 0.6974149942398071, "learning_rate": 0.00045023787255433736, "loss": 0.7086, "step": 17660 }, { "epoch": 2.123452338021397, "grad_norm": 0.6058925986289978, "learning_rate": 0.00045020932032882735, "loss": 0.4457, "step": 17665 }, { "epoch": 2.124053371799495, "grad_norm": 0.5222938060760498, "learning_rate": 0.00045018076082028124, "loss": 0.9113, "step": 17670 }, { "epoch": 2.1246544055775933, "grad_norm": 0.40989217162132263, "learning_rate": 0.00045015219402973796, "loss": 0.7293, "step": 17675 }, { "epoch": 2.125255439355692, "grad_norm": 0.8943347930908203, "learning_rate": 0.0004501236199582367, "loss": 0.7934, "step": 17680 }, { "epoch": 2.12585647313379, "grad_norm": 0.8208712339401245, "learning_rate": 0.00045009503860681693, "loss": 0.9922, "step": 17685 }, { "epoch": 2.1264575069118883, "grad_norm": 0.8035229444503784, "learning_rate": 0.00045006644997651834, "loss": 0.5477, "step": 17690 }, { "epoch": 2.127058540689987, "grad_norm": 0.5654513239860535, "learning_rate": 0.0004500378540683808, "loss": 0.7027, "step": 17695 }, { "epoch": 2.127659574468085, "grad_norm": 0.7384824156761169, "learning_rate": 0.0004500092508834448, "loss": 1.0051, "step": 17700 }, { "epoch": 2.1282606082461832, "grad_norm": 0.621644914150238, "learning_rate": 0.00044998064042275053, "loss": 0.6094, "step": 17705 }, { "epoch": 2.128861642024282, "grad_norm": 0.9057394862174988, "learning_rate": 0.000449952022687339, "loss": 0.8773, "step": 17710 }, { "epoch": 2.12946267580238, "grad_norm": 0.7892702221870422, "learning_rate": 0.0004499233976782511, "loss": 1.0785, "step": 17715 }, { "epoch": 2.1300637095804786, "grad_norm": 0.6460103988647461, "learning_rate": 0.00044989476539652817, "loss": 0.6828, "step": 17720 }, { "epoch": 2.130664743358577, "grad_norm": 0.5196257829666138, "learning_rate": 0.0004498661258432117, "loss": 0.7059, "step": 17725 }, { "epoch": 2.131265777136675, "grad_norm": 0.5224001407623291, "learning_rate": 0.0004498374790193437, "loss": 0.8344, "step": 17730 }, { "epoch": 2.1318668109147736, "grad_norm": 0.6787461638450623, "learning_rate": 0.0004498088249259661, "loss": 0.8812, "step": 17735 }, { "epoch": 2.1324678446928718, "grad_norm": 0.557407796382904, "learning_rate": 0.0004497801635641213, "loss": 0.9895, "step": 17740 }, { "epoch": 2.13306887847097, "grad_norm": 0.5969130992889404, "learning_rate": 0.00044975149493485186, "loss": 0.9305, "step": 17745 }, { "epoch": 2.1336699122490685, "grad_norm": 0.5112702250480652, "learning_rate": 0.0004497228190392008, "loss": 0.684, "step": 17750 }, { "epoch": 2.1342709460271667, "grad_norm": 0.5920119285583496, "learning_rate": 0.0004496941358782111, "loss": 0.5422, "step": 17755 }, { "epoch": 2.134871979805265, "grad_norm": 0.38535431027412415, "learning_rate": 0.0004496654454529262, "loss": 0.4613, "step": 17760 }, { "epoch": 2.1354730135833635, "grad_norm": 0.5178935527801514, "learning_rate": 0.00044963674776438996, "loss": 0.5852, "step": 17765 }, { "epoch": 2.1360740473614617, "grad_norm": 0.7553673386573792, "learning_rate": 0.00044960804281364607, "loss": 0.5805, "step": 17770 }, { "epoch": 2.13667508113956, "grad_norm": 0.6306725144386292, "learning_rate": 0.00044957933060173894, "loss": 0.5637, "step": 17775 }, { "epoch": 2.1372761149176585, "grad_norm": 0.5551248788833618, "learning_rate": 0.0004495506111297129, "loss": 1.1945, "step": 17780 }, { "epoch": 2.1378771486957566, "grad_norm": 0.9314557909965515, "learning_rate": 0.00044952188439861276, "loss": 0.6871, "step": 17785 }, { "epoch": 2.1384781824738552, "grad_norm": 0.7166127562522888, "learning_rate": 0.00044949315040948345, "loss": 0.8969, "step": 17790 }, { "epoch": 2.1390792162519534, "grad_norm": 0.5654996633529663, "learning_rate": 0.00044946440916337024, "loss": 0.7562, "step": 17795 }, { "epoch": 2.1396802500300516, "grad_norm": 0.6562860608100891, "learning_rate": 0.0004494356606613187, "loss": 0.9078, "step": 17800 }, { "epoch": 2.1396802500300516, "eval_loss": 1.8380858898162842, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1963, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.136, "step": 17800 }, { "epoch": 2.14028128380815, "grad_norm": 0.8220098614692688, "learning_rate": 0.0004494069049043746, "loss": 0.9766, "step": 17805 }, { "epoch": 2.1408823175862484, "grad_norm": 0.5194075703620911, "learning_rate": 0.00044937814189358397, "loss": 0.6887, "step": 17810 }, { "epoch": 2.1414833513643465, "grad_norm": 0.6570320129394531, "learning_rate": 0.00044934937162999315, "loss": 0.607, "step": 17815 }, { "epoch": 2.142084385142445, "grad_norm": 0.5360496640205383, "learning_rate": 0.00044932059411464866, "loss": 0.9953, "step": 17820 }, { "epoch": 2.1426854189205433, "grad_norm": 0.7278378009796143, "learning_rate": 0.00044929180934859736, "loss": 0.6391, "step": 17825 }, { "epoch": 2.1432864526986415, "grad_norm": 0.631291925907135, "learning_rate": 0.0004492630173328864, "loss": 1.0164, "step": 17830 }, { "epoch": 2.14388748647674, "grad_norm": 0.6542544364929199, "learning_rate": 0.0004492342180685632, "loss": 0.8535, "step": 17835 }, { "epoch": 2.1444885202548383, "grad_norm": 0.7265483736991882, "learning_rate": 0.0004492054115566754, "loss": 0.5465, "step": 17840 }, { "epoch": 2.1450895540329364, "grad_norm": 0.8160646557807922, "learning_rate": 0.0004491765977982707, "loss": 0.8758, "step": 17845 }, { "epoch": 2.145690587811035, "grad_norm": 0.5731572508811951, "learning_rate": 0.00044914777679439747, "loss": 0.6859, "step": 17850 }, { "epoch": 2.1462916215891332, "grad_norm": 0.5523707270622253, "learning_rate": 0.0004491189485461039, "loss": 0.752, "step": 17855 }, { "epoch": 2.146892655367232, "grad_norm": 0.700449526309967, "learning_rate": 0.000449090113054439, "loss": 0.7633, "step": 17860 }, { "epoch": 2.14749368914533, "grad_norm": 0.6764151453971863, "learning_rate": 0.00044906127032045146, "loss": 0.8273, "step": 17865 }, { "epoch": 2.148094722923428, "grad_norm": 0.5220698714256287, "learning_rate": 0.00044903242034519065, "loss": 0.6039, "step": 17870 }, { "epoch": 2.148695756701527, "grad_norm": 0.572157621383667, "learning_rate": 0.0004490035631297059, "loss": 0.8023, "step": 17875 }, { "epoch": 2.149296790479625, "grad_norm": 0.6647509336471558, "learning_rate": 0.00044897469867504717, "loss": 0.7305, "step": 17880 }, { "epoch": 2.149897824257723, "grad_norm": 0.4489690065383911, "learning_rate": 0.0004489458269822642, "loss": 0.6717, "step": 17885 }, { "epoch": 2.1504988580358217, "grad_norm": 0.8110430240631104, "learning_rate": 0.00044891694805240746, "loss": 0.7969, "step": 17890 }, { "epoch": 2.15109989181392, "grad_norm": 0.7674276828765869, "learning_rate": 0.0004488880618865274, "loss": 0.5857, "step": 17895 }, { "epoch": 2.151700925592018, "grad_norm": 0.5642954707145691, "learning_rate": 0.00044885916848567487, "loss": 0.9984, "step": 17900 }, { "epoch": 2.1523019593701167, "grad_norm": 0.7679926156997681, "learning_rate": 0.00044883026785090085, "loss": 0.6477, "step": 17905 }, { "epoch": 2.152902993148215, "grad_norm": 0.7056578397750854, "learning_rate": 0.00044880135998325667, "loss": 0.9785, "step": 17910 }, { "epoch": 2.1535040269263135, "grad_norm": 0.5287848114967346, "learning_rate": 0.000448772444883794, "loss": 0.7316, "step": 17915 }, { "epoch": 2.1541050607044117, "grad_norm": 0.8994506597518921, "learning_rate": 0.0004487435225535646, "loss": 1.3047, "step": 17920 }, { "epoch": 2.15470609448251, "grad_norm": 0.4346869885921478, "learning_rate": 0.0004487145929936206, "loss": 0.5166, "step": 17925 }, { "epoch": 2.1553071282606084, "grad_norm": 0.8611705303192139, "learning_rate": 0.00044868565620501447, "loss": 0.6352, "step": 17930 }, { "epoch": 2.1559081620387066, "grad_norm": 0.6104060411453247, "learning_rate": 0.0004486567121887987, "loss": 0.6422, "step": 17935 }, { "epoch": 2.156509195816805, "grad_norm": 0.6659860610961914, "learning_rate": 0.0004486277609460262, "loss": 0.7652, "step": 17940 }, { "epoch": 2.1571102295949034, "grad_norm": 0.7950757145881653, "learning_rate": 0.0004485988024777503, "loss": 0.6023, "step": 17945 }, { "epoch": 2.1577112633730016, "grad_norm": 0.7772306203842163, "learning_rate": 0.00044856983678502425, "loss": 0.9652, "step": 17950 }, { "epoch": 2.1583122971510997, "grad_norm": 0.7486100792884827, "learning_rate": 0.0004485408638689018, "loss": 0.7199, "step": 17955 }, { "epoch": 2.1589133309291983, "grad_norm": 0.5416676998138428, "learning_rate": 0.0004485118837304369, "loss": 0.6129, "step": 17960 }, { "epoch": 2.1595143647072965, "grad_norm": 0.9193729162216187, "learning_rate": 0.0004484828963706837, "loss": 0.552, "step": 17965 }, { "epoch": 2.1601153984853947, "grad_norm": 0.5687127709388733, "learning_rate": 0.0004484539017906969, "loss": 0.423, "step": 17970 }, { "epoch": 2.1607164322634933, "grad_norm": 0.8954154253005981, "learning_rate": 0.00044842489999153095, "loss": 0.8242, "step": 17975 }, { "epoch": 2.1613174660415915, "grad_norm": 0.7648013830184937, "learning_rate": 0.00044839589097424103, "loss": 0.8352, "step": 17980 }, { "epoch": 2.1619184998196896, "grad_norm": 0.32162073254585266, "learning_rate": 0.00044836687473988237, "loss": 0.6967, "step": 17985 }, { "epoch": 2.1625195335977883, "grad_norm": 0.6281862854957581, "learning_rate": 0.00044833785128951044, "loss": 0.7418, "step": 17990 }, { "epoch": 2.1631205673758864, "grad_norm": 0.5281660556793213, "learning_rate": 0.0004483088206241811, "loss": 0.5414, "step": 17995 }, { "epoch": 2.163721601153985, "grad_norm": 0.8859165906906128, "learning_rate": 0.00044827978274495034, "loss": 0.7602, "step": 18000 }, { "epoch": 2.163721601153985, "eval_loss": 1.845605492591858, "eval_model_preparation_time": 0.0053, "eval_runtime": 35.1998, "eval_samples_per_second": 4.545, "eval_steps_per_second": 1.136, "step": 18000 } ], "logging_steps": 5, "max_steps": 83190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.923062453960704e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }