{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.98185117967332, "eval_steps": 500, "global_step": 2750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003629764065335753, "grad_norm": 516.0, "learning_rate": 7.272727272727273e-07, "loss": 51.5982, "step": 1 }, { "epoch": 0.018148820326678767, "grad_norm": 454.0, "learning_rate": 3.636363636363636e-06, "loss": 52.3483, "step": 5 }, { "epoch": 0.036297640653357534, "grad_norm": 536.0, "learning_rate": 7.272727272727272e-06, "loss": 53.1669, "step": 10 }, { "epoch": 0.0544464609800363, "grad_norm": 440.0, "learning_rate": 1.0909090909090909e-05, "loss": 46.8624, "step": 15 }, { "epoch": 0.07259528130671507, "grad_norm": 306.0, "learning_rate": 1.4545454545454545e-05, "loss": 35.5942, "step": 20 }, { "epoch": 0.09074410163339383, "grad_norm": 41.5, "learning_rate": 1.8181818181818182e-05, "loss": 25.8916, "step": 25 }, { "epoch": 0.1088929219600726, "grad_norm": 44.0, "learning_rate": 2.1818181818181818e-05, "loss": 24.4549, "step": 30 }, { "epoch": 0.12704174228675136, "grad_norm": 24.375, "learning_rate": 2.5454545454545454e-05, "loss": 22.3596, "step": 35 }, { "epoch": 0.14519056261343014, "grad_norm": 20.0, "learning_rate": 2.909090909090909e-05, "loss": 21.008, "step": 40 }, { "epoch": 0.16333938294010888, "grad_norm": 7.4375, "learning_rate": 3.272727272727273e-05, "loss": 20.0885, "step": 45 }, { "epoch": 0.18148820326678766, "grad_norm": 7.28125, "learning_rate": 3.6363636363636364e-05, "loss": 19.1877, "step": 50 }, { "epoch": 0.1996370235934664, "grad_norm": 10.25, "learning_rate": 4e-05, "loss": 18.5964, "step": 55 }, { "epoch": 0.2177858439201452, "grad_norm": 14.5, "learning_rate": 4.3636363636363636e-05, "loss": 17.6581, "step": 60 }, { "epoch": 0.23593466424682397, "grad_norm": 26.5, "learning_rate": 4.7272727272727275e-05, "loss": 16.1181, "step": 65 }, { "epoch": 0.2540834845735027, "grad_norm": 49.75, "learning_rate": 5.090909090909091e-05, "loss": 13.6493, "step": 70 }, { "epoch": 0.27223230490018147, "grad_norm": 50.75, "learning_rate": 5.4545454545454546e-05, "loss": 7.9128, "step": 75 }, { "epoch": 0.29038112522686027, "grad_norm": 6.625, "learning_rate": 5.818181818181818e-05, "loss": 2.7543, "step": 80 }, { "epoch": 0.308529945553539, "grad_norm": 12.375, "learning_rate": 6.181818181818182e-05, "loss": 2.0995, "step": 85 }, { "epoch": 0.32667876588021777, "grad_norm": 2.53125, "learning_rate": 6.545454545454546e-05, "loss": 1.8007, "step": 90 }, { "epoch": 0.3448275862068966, "grad_norm": 1.984375, "learning_rate": 6.90909090909091e-05, "loss": 1.6358, "step": 95 }, { "epoch": 0.3629764065335753, "grad_norm": 2.828125, "learning_rate": 7.272727272727273e-05, "loss": 1.509, "step": 100 }, { "epoch": 0.3811252268602541, "grad_norm": 4.0, "learning_rate": 7.636363636363637e-05, "loss": 1.4215, "step": 105 }, { "epoch": 0.3992740471869328, "grad_norm": 3.21875, "learning_rate": 8e-05, "loss": 1.3173, "step": 110 }, { "epoch": 0.41742286751361163, "grad_norm": 2.625, "learning_rate": 8.363636363636364e-05, "loss": 1.2786, "step": 115 }, { "epoch": 0.4355716878402904, "grad_norm": 2.8125, "learning_rate": 8.727272727272727e-05, "loss": 1.2548, "step": 120 }, { "epoch": 0.4537205081669691, "grad_norm": 2.96875, "learning_rate": 9.090909090909092e-05, "loss": 1.1995, "step": 125 }, { "epoch": 0.47186932849364793, "grad_norm": 1.390625, "learning_rate": 9.454545454545455e-05, "loss": 1.1818, "step": 130 }, { "epoch": 0.4900181488203267, "grad_norm": 5.78125, "learning_rate": 9.818181818181818e-05, "loss": 1.1444, "step": 135 }, { "epoch": 0.5081669691470054, "grad_norm": 7.34375, "learning_rate": 0.00010181818181818181, "loss": 1.0977, "step": 140 }, { "epoch": 0.5263157894736842, "grad_norm": 5.1875, "learning_rate": 0.00010545454545454545, "loss": 1.0818, "step": 145 }, { "epoch": 0.5444646098003629, "grad_norm": 1.984375, "learning_rate": 0.00010909090909090909, "loss": 1.0524, "step": 150 }, { "epoch": 0.5626134301270418, "grad_norm": 20.5, "learning_rate": 0.00011272727272727272, "loss": 1.0397, "step": 155 }, { "epoch": 0.5807622504537205, "grad_norm": 7.71875, "learning_rate": 0.00011636363636363636, "loss": 1.0695, "step": 160 }, { "epoch": 0.5989110707803993, "grad_norm": 5.28125, "learning_rate": 0.00012, "loss": 1.0426, "step": 165 }, { "epoch": 0.617059891107078, "grad_norm": 1.453125, "learning_rate": 0.00012363636363636364, "loss": 1.0295, "step": 170 }, { "epoch": 0.6352087114337568, "grad_norm": 17.75, "learning_rate": 0.00012727272727272728, "loss": 0.9987, "step": 175 }, { "epoch": 0.6533575317604355, "grad_norm": 1.3515625, "learning_rate": 0.00013090909090909093, "loss": 0.9895, "step": 180 }, { "epoch": 0.6715063520871143, "grad_norm": 1.2421875, "learning_rate": 0.00013454545454545455, "loss": 1.0001, "step": 185 }, { "epoch": 0.6896551724137931, "grad_norm": 1.3828125, "learning_rate": 0.0001381818181818182, "loss": 0.9797, "step": 190 }, { "epoch": 0.7078039927404719, "grad_norm": 0.84375, "learning_rate": 0.00014181818181818184, "loss": 0.9846, "step": 195 }, { "epoch": 0.7259528130671506, "grad_norm": 1.8671875, "learning_rate": 0.00014545454545454546, "loss": 0.9978, "step": 200 }, { "epoch": 0.7441016333938294, "grad_norm": 1.1640625, "learning_rate": 0.0001490909090909091, "loss": 0.9639, "step": 205 }, { "epoch": 0.7622504537205081, "grad_norm": 0.78125, "learning_rate": 0.00015272727272727275, "loss": 0.9302, "step": 210 }, { "epoch": 0.7803992740471869, "grad_norm": 2.390625, "learning_rate": 0.00015636363636363637, "loss": 0.9798, "step": 215 }, { "epoch": 0.7985480943738656, "grad_norm": 2.390625, "learning_rate": 0.00016, "loss": 0.9477, "step": 220 }, { "epoch": 0.8166969147005445, "grad_norm": 3.265625, "learning_rate": 0.00016363636363636366, "loss": 0.9237, "step": 225 }, { "epoch": 0.8348457350272233, "grad_norm": 4.34375, "learning_rate": 0.00016727272727272728, "loss": 0.9335, "step": 230 }, { "epoch": 0.852994555353902, "grad_norm": 3.640625, "learning_rate": 0.0001709090909090909, "loss": 0.9452, "step": 235 }, { "epoch": 0.8711433756805808, "grad_norm": 3.265625, "learning_rate": 0.00017454545454545454, "loss": 0.9112, "step": 240 }, { "epoch": 0.8892921960072595, "grad_norm": 3.796875, "learning_rate": 0.0001781818181818182, "loss": 0.9211, "step": 245 }, { "epoch": 0.9074410163339383, "grad_norm": 6.125, "learning_rate": 0.00018181818181818183, "loss": 0.9188, "step": 250 }, { "epoch": 0.925589836660617, "grad_norm": 2.609375, "learning_rate": 0.00018545454545454545, "loss": 0.937, "step": 255 }, { "epoch": 0.9437386569872959, "grad_norm": 3.03125, "learning_rate": 0.0001890909090909091, "loss": 0.9138, "step": 260 }, { "epoch": 0.9618874773139746, "grad_norm": 1.875, "learning_rate": 0.00019272727272727274, "loss": 0.9222, "step": 265 }, { "epoch": 0.9800362976406534, "grad_norm": 3.84375, "learning_rate": 0.00019636363636363636, "loss": 0.9238, "step": 270 }, { "epoch": 0.9981851179673321, "grad_norm": 2.6875, "learning_rate": 0.0002, "loss": 0.9152, "step": 275 }, { "epoch": 0.9981851179673321, "eval_loss": 2.195019245147705, "eval_runtime": 1.0238, "eval_samples_per_second": 4.884, "eval_steps_per_second": 1.953, "step": 275 }, { "epoch": 1.0163339382940109, "grad_norm": 1.875, "learning_rate": 0.00019999798600729064, "loss": 0.8799, "step": 280 }, { "epoch": 1.0344827586206897, "grad_norm": 4.28125, "learning_rate": 0.00019999194411028594, "loss": 0.87, "step": 285 }, { "epoch": 1.0526315789473684, "grad_norm": 3.28125, "learning_rate": 0.0001999818745523526, "loss": 0.8781, "step": 290 }, { "epoch": 1.0707803992740472, "grad_norm": 0.87109375, "learning_rate": 0.00019996777773909093, "loss": 0.8886, "step": 295 }, { "epoch": 1.0889292196007259, "grad_norm": 4.375, "learning_rate": 0.00019994965423831854, "loss": 0.9042, "step": 300 }, { "epoch": 1.1070780399274047, "grad_norm": 1.2265625, "learning_rate": 0.00019992750478004738, "loss": 0.8569, "step": 305 }, { "epoch": 1.1252268602540836, "grad_norm": 1.828125, "learning_rate": 0.0001999013302564544, "loss": 0.8641, "step": 310 }, { "epoch": 1.1433756805807622, "grad_norm": 1.03125, "learning_rate": 0.00019987113172184563, "loss": 0.8521, "step": 315 }, { "epoch": 1.161524500907441, "grad_norm": 1.15625, "learning_rate": 0.00019983691039261357, "loss": 0.8658, "step": 320 }, { "epoch": 1.1796733212341197, "grad_norm": 12.5, "learning_rate": 0.00019979866764718843, "loss": 0.8702, "step": 325 }, { "epoch": 1.1978221415607986, "grad_norm": 1.8203125, "learning_rate": 0.00019975640502598244, "loss": 0.874, "step": 330 }, { "epoch": 1.2159709618874772, "grad_norm": 2.109375, "learning_rate": 0.00019971012423132775, "loss": 0.8552, "step": 335 }, { "epoch": 1.234119782214156, "grad_norm": 4.5625, "learning_rate": 0.00019965982712740808, "loss": 0.8459, "step": 340 }, { "epoch": 1.252268602540835, "grad_norm": 2.046875, "learning_rate": 0.0001996055157401834, "loss": 0.8528, "step": 345 }, { "epoch": 1.2704174228675136, "grad_norm": 1.3203125, "learning_rate": 0.00019954719225730847, "loss": 0.8687, "step": 350 }, { "epoch": 1.2885662431941924, "grad_norm": 1.3828125, "learning_rate": 0.0001994848590280447, "loss": 0.8539, "step": 355 }, { "epoch": 1.306715063520871, "grad_norm": 1.515625, "learning_rate": 0.00019941851856316548, "loss": 0.8335, "step": 360 }, { "epoch": 1.32486388384755, "grad_norm": 12.75, "learning_rate": 0.00019934817353485501, "loss": 0.8529, "step": 365 }, { "epoch": 1.3430127041742286, "grad_norm": 0.94140625, "learning_rate": 0.00019927382677660088, "loss": 0.861, "step": 370 }, { "epoch": 1.3611615245009074, "grad_norm": 2.4375, "learning_rate": 0.00019919548128307954, "loss": 0.8304, "step": 375 }, { "epoch": 1.3793103448275863, "grad_norm": 0.93359375, "learning_rate": 0.00019911314021003613, "loss": 0.8179, "step": 380 }, { "epoch": 1.397459165154265, "grad_norm": 1.5234375, "learning_rate": 0.00019902680687415705, "loss": 0.8295, "step": 385 }, { "epoch": 1.4156079854809438, "grad_norm": 1.703125, "learning_rate": 0.00019893648475293648, "loss": 0.8607, "step": 390 }, { "epoch": 1.4337568058076224, "grad_norm": 1.1015625, "learning_rate": 0.00019884217748453623, "loss": 0.8223, "step": 395 }, { "epoch": 1.4519056261343013, "grad_norm": 0.796875, "learning_rate": 0.00019874388886763944, "loss": 0.8603, "step": 400 }, { "epoch": 1.47005444646098, "grad_norm": 0.640625, "learning_rate": 0.0001986416228612972, "loss": 0.8393, "step": 405 }, { "epoch": 1.4882032667876588, "grad_norm": 1.5390625, "learning_rate": 0.00019853538358476932, "loss": 0.8363, "step": 410 }, { "epoch": 1.5063520871143377, "grad_norm": 1.09375, "learning_rate": 0.00019842517531735838, "loss": 0.8403, "step": 415 }, { "epoch": 1.5245009074410163, "grad_norm": 1.296875, "learning_rate": 0.00019831100249823733, "loss": 0.8198, "step": 420 }, { "epoch": 1.542649727767695, "grad_norm": 17.5, "learning_rate": 0.00019819286972627066, "loss": 0.8203, "step": 425 }, { "epoch": 1.560798548094374, "grad_norm": 4.3125, "learning_rate": 0.00019807078175982924, "loss": 0.8595, "step": 430 }, { "epoch": 1.5789473684210527, "grad_norm": 1.890625, "learning_rate": 0.00019794474351659852, "loss": 0.8302, "step": 435 }, { "epoch": 1.5970961887477313, "grad_norm": 1.125, "learning_rate": 0.00019781476007338058, "loss": 0.8248, "step": 440 }, { "epoch": 1.6152450090744102, "grad_norm": 1.0234375, "learning_rate": 0.00019768083666588953, "loss": 0.8099, "step": 445 }, { "epoch": 1.633393829401089, "grad_norm": 1.4921875, "learning_rate": 0.00019754297868854073, "loss": 0.8127, "step": 450 }, { "epoch": 1.6515426497277677, "grad_norm": 1.5078125, "learning_rate": 0.00019740119169423337, "loss": 0.8199, "step": 455 }, { "epoch": 1.6696914700544465, "grad_norm": 0.6640625, "learning_rate": 0.00019725548139412692, "loss": 0.8185, "step": 460 }, { "epoch": 1.6878402903811254, "grad_norm": 0.71875, "learning_rate": 0.00019710585365741103, "loss": 0.8347, "step": 465 }, { "epoch": 1.705989110707804, "grad_norm": 1.046875, "learning_rate": 0.00019695231451106912, "loss": 0.8217, "step": 470 }, { "epoch": 1.7241379310344827, "grad_norm": 0.87109375, "learning_rate": 0.00019679487013963564, "loss": 0.8093, "step": 475 }, { "epoch": 1.7422867513611615, "grad_norm": 0.6953125, "learning_rate": 0.00019663352688494684, "loss": 0.8139, "step": 480 }, { "epoch": 1.7604355716878404, "grad_norm": 0.6796875, "learning_rate": 0.0001964682912458856, "loss": 0.8232, "step": 485 }, { "epoch": 1.778584392014519, "grad_norm": 0.75, "learning_rate": 0.00019629916987811926, "loss": 0.8143, "step": 490 }, { "epoch": 1.7967332123411979, "grad_norm": 1.046875, "learning_rate": 0.0001961261695938319, "loss": 0.8022, "step": 495 }, { "epoch": 1.8148820326678767, "grad_norm": 0.88671875, "learning_rate": 0.00019594929736144976, "loss": 0.8233, "step": 500 }, { "epoch": 1.8330308529945554, "grad_norm": 0.71875, "learning_rate": 0.00019576856030536054, "loss": 0.8185, "step": 505 }, { "epoch": 1.851179673321234, "grad_norm": 1.0234375, "learning_rate": 0.0001955839657056265, "loss": 0.8208, "step": 510 }, { "epoch": 1.8693284936479129, "grad_norm": 1.546875, "learning_rate": 0.00019539552099769126, "loss": 0.8178, "step": 515 }, { "epoch": 1.8874773139745917, "grad_norm": 2.5625, "learning_rate": 0.00019520323377208017, "loss": 0.8086, "step": 520 }, { "epoch": 1.9056261343012704, "grad_norm": 0.9140625, "learning_rate": 0.00019500711177409454, "loss": 0.8057, "step": 525 }, { "epoch": 1.9237749546279492, "grad_norm": 2.15625, "learning_rate": 0.00019480716290349995, "loss": 0.8139, "step": 530 }, { "epoch": 1.941923774954628, "grad_norm": 0.86328125, "learning_rate": 0.00019460339521420772, "loss": 0.8028, "step": 535 }, { "epoch": 1.9600725952813067, "grad_norm": 1.5546875, "learning_rate": 0.00019439581691395067, "loss": 0.8212, "step": 540 }, { "epoch": 1.9782214156079854, "grad_norm": 0.984375, "learning_rate": 0.00019418443636395248, "loss": 0.8107, "step": 545 }, { "epoch": 1.9963702359346642, "grad_norm": 0.546875, "learning_rate": 0.00019396926207859084, "loss": 0.8104, "step": 550 }, { "epoch": 2.0, "eval_loss": 2.1405208110809326, "eval_runtime": 1.0236, "eval_samples_per_second": 4.885, "eval_steps_per_second": 1.954, "step": 551 }, { "epoch": 2.014519056261343, "grad_norm": 0.85546875, "learning_rate": 0.00019375030272505463, "loss": 0.7501, "step": 555 }, { "epoch": 2.0326678765880217, "grad_norm": 0.69921875, "learning_rate": 0.00019352756712299468, "loss": 0.7576, "step": 560 }, { "epoch": 2.0508166969147004, "grad_norm": 1.8828125, "learning_rate": 0.00019330106424416852, "loss": 0.7446, "step": 565 }, { "epoch": 2.0689655172413794, "grad_norm": 5.9375, "learning_rate": 0.00019307080321207912, "loss": 0.7617, "step": 570 }, { "epoch": 2.087114337568058, "grad_norm": 0.734375, "learning_rate": 0.00019283679330160726, "loss": 0.7408, "step": 575 }, { "epoch": 2.1052631578947367, "grad_norm": 1.2734375, "learning_rate": 0.00019259904393863802, "loss": 0.7426, "step": 580 }, { "epoch": 2.123411978221416, "grad_norm": 1.25, "learning_rate": 0.0001923575646996811, "loss": 0.7396, "step": 585 }, { "epoch": 2.1415607985480944, "grad_norm": 2.03125, "learning_rate": 0.000192112365311485, "loss": 0.7581, "step": 590 }, { "epoch": 2.159709618874773, "grad_norm": 0.921875, "learning_rate": 0.00019186345565064535, "loss": 0.7494, "step": 595 }, { "epoch": 2.1778584392014517, "grad_norm": 0.73046875, "learning_rate": 0.00019161084574320696, "loss": 0.7356, "step": 600 }, { "epoch": 2.196007259528131, "grad_norm": 1.3125, "learning_rate": 0.0001913545457642601, "loss": 0.7502, "step": 605 }, { "epoch": 2.2141560798548094, "grad_norm": 2.078125, "learning_rate": 0.0001910945660375305, "loss": 0.7566, "step": 610 }, { "epoch": 2.232304900181488, "grad_norm": 1.109375, "learning_rate": 0.0001908309170349637, "loss": 0.7429, "step": 615 }, { "epoch": 2.250453720508167, "grad_norm": 0.87109375, "learning_rate": 0.0001905636093763031, "loss": 0.7449, "step": 620 }, { "epoch": 2.268602540834846, "grad_norm": 0.78515625, "learning_rate": 0.00019029265382866214, "loss": 0.735, "step": 625 }, { "epoch": 2.2867513611615244, "grad_norm": 1.5546875, "learning_rate": 0.0001900180613060908, "loss": 0.7585, "step": 630 }, { "epoch": 2.304900181488203, "grad_norm": 2.015625, "learning_rate": 0.00018973984286913584, "loss": 0.7429, "step": 635 }, { "epoch": 2.323049001814882, "grad_norm": 0.7890625, "learning_rate": 0.00018945800972439538, "loss": 0.758, "step": 640 }, { "epoch": 2.341197822141561, "grad_norm": 1.1015625, "learning_rate": 0.00018917257322406734, "loss": 0.757, "step": 645 }, { "epoch": 2.3593466424682394, "grad_norm": 0.9296875, "learning_rate": 0.00018888354486549237, "loss": 0.7506, "step": 650 }, { "epoch": 2.3774954627949185, "grad_norm": 1.0625, "learning_rate": 0.00018859093629069058, "loss": 0.756, "step": 655 }, { "epoch": 2.395644283121597, "grad_norm": 0.609375, "learning_rate": 0.00018829475928589271, "loss": 0.7411, "step": 660 }, { "epoch": 2.413793103448276, "grad_norm": 0.58203125, "learning_rate": 0.00018799502578106534, "loss": 0.7426, "step": 665 }, { "epoch": 2.4319419237749544, "grad_norm": 0.8359375, "learning_rate": 0.0001876917478494303, "loss": 0.7359, "step": 670 }, { "epoch": 2.4500907441016335, "grad_norm": 1.1640625, "learning_rate": 0.00018738493770697852, "loss": 0.7516, "step": 675 }, { "epoch": 2.468239564428312, "grad_norm": 0.82421875, "learning_rate": 0.00018707460771197774, "loss": 0.7524, "step": 680 }, { "epoch": 2.486388384754991, "grad_norm": 1.0078125, "learning_rate": 0.00018676077036447494, "loss": 0.7655, "step": 685 }, { "epoch": 2.50453720508167, "grad_norm": 0.546875, "learning_rate": 0.0001864434383057927, "loss": 0.7457, "step": 690 }, { "epoch": 2.5226860254083485, "grad_norm": 0.921875, "learning_rate": 0.00018612262431802007, "loss": 0.7608, "step": 695 }, { "epoch": 2.540834845735027, "grad_norm": 2.390625, "learning_rate": 0.00018579834132349772, "loss": 0.7437, "step": 700 }, { "epoch": 2.558983666061706, "grad_norm": 1.7890625, "learning_rate": 0.00018547060238429736, "loss": 0.7586, "step": 705 }, { "epoch": 2.577132486388385, "grad_norm": 3.3125, "learning_rate": 0.0001851394207016957, "loss": 0.752, "step": 710 }, { "epoch": 2.5952813067150635, "grad_norm": 0.58984375, "learning_rate": 0.0001848048096156426, "loss": 0.7484, "step": 715 }, { "epoch": 2.613430127041742, "grad_norm": 6.3125, "learning_rate": 0.00018446678260422385, "loss": 0.7554, "step": 720 }, { "epoch": 2.6315789473684212, "grad_norm": 0.62109375, "learning_rate": 0.00018412535328311814, "loss": 0.7442, "step": 725 }, { "epoch": 2.6497277676951, "grad_norm": 0.8828125, "learning_rate": 0.00018378053540504873, "loss": 0.7581, "step": 730 }, { "epoch": 2.6678765880217785, "grad_norm": 1.7265625, "learning_rate": 0.00018343234285922953, "loss": 0.7727, "step": 735 }, { "epoch": 2.686025408348457, "grad_norm": 0.58203125, "learning_rate": 0.00018308078967080546, "loss": 0.7525, "step": 740 }, { "epoch": 2.7041742286751362, "grad_norm": 1.0703125, "learning_rate": 0.00018272589000028772, "loss": 0.737, "step": 745 }, { "epoch": 2.722323049001815, "grad_norm": 0.7578125, "learning_rate": 0.0001823676581429833, "loss": 0.7724, "step": 750 }, { "epoch": 2.7404718693284935, "grad_norm": 1.640625, "learning_rate": 0.00018200610852841913, "loss": 0.746, "step": 755 }, { "epoch": 2.7586206896551726, "grad_norm": 0.7265625, "learning_rate": 0.00018164125571976098, "loss": 0.7609, "step": 760 }, { "epoch": 2.7767695099818512, "grad_norm": 1.1640625, "learning_rate": 0.0001812731144132268, "loss": 0.7623, "step": 765 }, { "epoch": 2.79491833030853, "grad_norm": 0.6328125, "learning_rate": 0.00018090169943749476, "loss": 0.7668, "step": 770 }, { "epoch": 2.8130671506352085, "grad_norm": 0.546875, "learning_rate": 0.00018052702575310588, "loss": 0.747, "step": 775 }, { "epoch": 2.8312159709618876, "grad_norm": 1.09375, "learning_rate": 0.00018014910845186153, "loss": 0.7533, "step": 780 }, { "epoch": 2.8493647912885662, "grad_norm": 0.83984375, "learning_rate": 0.00017976796275621555, "loss": 0.7435, "step": 785 }, { "epoch": 2.867513611615245, "grad_norm": 0.73046875, "learning_rate": 0.00017938360401866093, "loss": 0.7424, "step": 790 }, { "epoch": 2.885662431941924, "grad_norm": 0.75, "learning_rate": 0.00017899604772111163, "loss": 0.7321, "step": 795 }, { "epoch": 2.9038112522686026, "grad_norm": 0.703125, "learning_rate": 0.00017860530947427875, "loss": 0.7474, "step": 800 }, { "epoch": 2.9219600725952812, "grad_norm": 0.63671875, "learning_rate": 0.00017821140501704194, "loss": 0.7502, "step": 805 }, { "epoch": 2.94010889292196, "grad_norm": 1.1328125, "learning_rate": 0.00017781435021581527, "loss": 0.7432, "step": 810 }, { "epoch": 2.958257713248639, "grad_norm": 1.046875, "learning_rate": 0.00017741416106390826, "loss": 0.7529, "step": 815 }, { "epoch": 2.9764065335753176, "grad_norm": 0.921875, "learning_rate": 0.00017701085368088156, "loss": 0.7633, "step": 820 }, { "epoch": 2.9945553539019962, "grad_norm": 0.5859375, "learning_rate": 0.0001766044443118978, "loss": 0.7914, "step": 825 }, { "epoch": 2.9981851179673322, "eval_loss": 2.1592297554016113, "eval_runtime": 1.0245, "eval_samples_per_second": 4.881, "eval_steps_per_second": 1.952, "step": 826 }, { "epoch": 3.0127041742286753, "grad_norm": 0.53515625, "learning_rate": 0.0001761949493270671, "loss": 0.6962, "step": 830 }, { "epoch": 3.030852994555354, "grad_norm": 0.61328125, "learning_rate": 0.0001757823852207877, "loss": 0.6788, "step": 835 }, { "epoch": 3.0490018148820326, "grad_norm": 0.6953125, "learning_rate": 0.00017536676861108164, "loss": 0.6785, "step": 840 }, { "epoch": 3.0671506352087112, "grad_norm": 0.796875, "learning_rate": 0.0001749481162389254, "loss": 0.6784, "step": 845 }, { "epoch": 3.0852994555353903, "grad_norm": 1.3515625, "learning_rate": 0.0001745264449675755, "loss": 0.6796, "step": 850 }, { "epoch": 3.103448275862069, "grad_norm": 1.7109375, "learning_rate": 0.00017410177178188918, "loss": 0.6682, "step": 855 }, { "epoch": 3.1215970961887476, "grad_norm": 0.76953125, "learning_rate": 0.0001736741137876405, "loss": 0.6941, "step": 860 }, { "epoch": 3.1397459165154267, "grad_norm": 1.1015625, "learning_rate": 0.0001732434882108311, "loss": 0.6757, "step": 865 }, { "epoch": 3.1578947368421053, "grad_norm": 1.0859375, "learning_rate": 0.00017280991239699642, "loss": 0.6772, "step": 870 }, { "epoch": 3.176043557168784, "grad_norm": 0.79296875, "learning_rate": 0.00017237340381050703, "loss": 0.6951, "step": 875 }, { "epoch": 3.1941923774954626, "grad_norm": 1.390625, "learning_rate": 0.0001719339800338651, "loss": 0.7005, "step": 880 }, { "epoch": 3.2123411978221417, "grad_norm": 1.46875, "learning_rate": 0.00017149165876699635, "loss": 0.6948, "step": 885 }, { "epoch": 3.2304900181488203, "grad_norm": 0.80078125, "learning_rate": 0.0001710464578265369, "loss": 0.6809, "step": 890 }, { "epoch": 3.248638838475499, "grad_norm": 0.79296875, "learning_rate": 0.00017059839514511565, "loss": 0.6848, "step": 895 }, { "epoch": 3.266787658802178, "grad_norm": 0.66015625, "learning_rate": 0.00017014748877063214, "loss": 0.6695, "step": 900 }, { "epoch": 3.2849364791288567, "grad_norm": 0.5859375, "learning_rate": 0.00016969375686552937, "loss": 0.6939, "step": 905 }, { "epoch": 3.3030852994555353, "grad_norm": 0.83984375, "learning_rate": 0.00016923721770606228, "loss": 0.6792, "step": 910 }, { "epoch": 3.321234119782214, "grad_norm": 0.81640625, "learning_rate": 0.0001687778896815617, "loss": 0.7015, "step": 915 }, { "epoch": 3.339382940108893, "grad_norm": 0.64453125, "learning_rate": 0.00016831579129369346, "loss": 0.6828, "step": 920 }, { "epoch": 3.3575317604355717, "grad_norm": 0.6015625, "learning_rate": 0.00016785094115571322, "loss": 0.6704, "step": 925 }, { "epoch": 3.3756805807622503, "grad_norm": 2.15625, "learning_rate": 0.00016738335799171682, "loss": 0.6796, "step": 930 }, { "epoch": 3.3938294010889294, "grad_norm": 0.671875, "learning_rate": 0.00016691306063588583, "loss": 0.6866, "step": 935 }, { "epoch": 3.411978221415608, "grad_norm": 1.609375, "learning_rate": 0.00016644006803172924, "loss": 0.6788, "step": 940 }, { "epoch": 3.4301270417422867, "grad_norm": 0.6484375, "learning_rate": 0.00016596439923132017, "loss": 0.7052, "step": 945 }, { "epoch": 3.4482758620689653, "grad_norm": 0.859375, "learning_rate": 0.00016548607339452853, "loss": 0.672, "step": 950 }, { "epoch": 3.4664246823956444, "grad_norm": 0.56640625, "learning_rate": 0.00016500510978824926, "loss": 0.6981, "step": 955 }, { "epoch": 3.484573502722323, "grad_norm": 0.609375, "learning_rate": 0.0001645215277856263, "loss": 0.6958, "step": 960 }, { "epoch": 3.5027223230490017, "grad_norm": 0.7890625, "learning_rate": 0.00016403534686527225, "loss": 0.6779, "step": 965 }, { "epoch": 3.5208711433756807, "grad_norm": 0.5703125, "learning_rate": 0.00016354658661048364, "loss": 0.6773, "step": 970 }, { "epoch": 3.5390199637023594, "grad_norm": 0.703125, "learning_rate": 0.00016305526670845226, "loss": 0.6883, "step": 975 }, { "epoch": 3.557168784029038, "grad_norm": 0.765625, "learning_rate": 0.00016256140694947217, "loss": 0.6962, "step": 980 }, { "epoch": 3.5753176043557167, "grad_norm": 0.703125, "learning_rate": 0.00016206502722614238, "loss": 0.6783, "step": 985 }, { "epoch": 3.5934664246823957, "grad_norm": 0.53515625, "learning_rate": 0.0001615661475325658, "loss": 0.6933, "step": 990 }, { "epoch": 3.6116152450090744, "grad_norm": 0.67578125, "learning_rate": 0.00016106478796354382, "loss": 0.6791, "step": 995 }, { "epoch": 3.629764065335753, "grad_norm": 0.859375, "learning_rate": 0.00016056096871376667, "loss": 0.7038, "step": 1000 }, { "epoch": 3.647912885662432, "grad_norm": 0.6796875, "learning_rate": 0.00016005471007700031, "loss": 0.6966, "step": 1005 }, { "epoch": 3.6660617059891107, "grad_norm": 1.1875, "learning_rate": 0.0001595460324452688, "loss": 0.6856, "step": 1010 }, { "epoch": 3.6842105263157894, "grad_norm": 0.9609375, "learning_rate": 0.000159034956308033, "loss": 0.6949, "step": 1015 }, { "epoch": 3.702359346642468, "grad_norm": 0.6171875, "learning_rate": 0.00015852150225136518, "loss": 0.6962, "step": 1020 }, { "epoch": 3.720508166969147, "grad_norm": 0.50390625, "learning_rate": 0.00015800569095711982, "loss": 0.6948, "step": 1025 }, { "epoch": 3.7386569872958257, "grad_norm": 0.6328125, "learning_rate": 0.00015748754320210072, "loss": 0.695, "step": 1030 }, { "epoch": 3.7568058076225044, "grad_norm": 0.71484375, "learning_rate": 0.0001569670798572239, "loss": 0.6778, "step": 1035 }, { "epoch": 3.7749546279491835, "grad_norm": 0.87109375, "learning_rate": 0.00015644432188667695, "loss": 0.6932, "step": 1040 }, { "epoch": 3.793103448275862, "grad_norm": 1.0703125, "learning_rate": 0.0001559192903470747, "loss": 0.7006, "step": 1045 }, { "epoch": 3.8112522686025407, "grad_norm": 0.796875, "learning_rate": 0.00015539200638661104, "loss": 0.6905, "step": 1050 }, { "epoch": 3.8294010889292194, "grad_norm": 0.6328125, "learning_rate": 0.000154862491244207, "loss": 0.684, "step": 1055 }, { "epoch": 3.8475499092558985, "grad_norm": 0.9765625, "learning_rate": 0.00015433076624865531, "loss": 0.6838, "step": 1060 }, { "epoch": 3.865698729582577, "grad_norm": 0.8046875, "learning_rate": 0.00015379685281776125, "loss": 0.6807, "step": 1065 }, { "epoch": 3.8838475499092557, "grad_norm": 0.59375, "learning_rate": 0.00015326077245747999, "loss": 0.6966, "step": 1070 }, { "epoch": 3.901996370235935, "grad_norm": 0.6875, "learning_rate": 0.00015272254676105025, "loss": 0.6782, "step": 1075 }, { "epoch": 3.9201451905626135, "grad_norm": 0.8984375, "learning_rate": 0.0001521821974081246, "loss": 0.6908, "step": 1080 }, { "epoch": 3.938294010889292, "grad_norm": 0.58203125, "learning_rate": 0.0001516397461638962, "loss": 0.7119, "step": 1085 }, { "epoch": 3.9564428312159707, "grad_norm": 0.86328125, "learning_rate": 0.00015109521487822206, "loss": 0.6926, "step": 1090 }, { "epoch": 3.97459165154265, "grad_norm": 0.62109375, "learning_rate": 0.000150548625484743, "loss": 0.6885, "step": 1095 }, { "epoch": 3.9927404718693285, "grad_norm": 0.640625, "learning_rate": 0.00015000000000000001, "loss": 0.6978, "step": 1100 }, { "epoch": 4.0, "eval_loss": 2.2176449298858643, "eval_runtime": 1.0239, "eval_samples_per_second": 4.883, "eval_steps_per_second": 1.953, "step": 1102 }, { "epoch": 4.0108892921960075, "grad_norm": 0.6015625, "learning_rate": 0.0001494493605225477, "loss": 0.6515, "step": 1105 }, { "epoch": 4.029038112522686, "grad_norm": 0.7421875, "learning_rate": 0.0001488967292320639, "loss": 0.6052, "step": 1110 }, { "epoch": 4.047186932849365, "grad_norm": 0.640625, "learning_rate": 0.00014834212838845637, "loss": 0.6075, "step": 1115 }, { "epoch": 4.0653357531760435, "grad_norm": 0.65234375, "learning_rate": 0.00014778558033096633, "loss": 0.6193, "step": 1120 }, { "epoch": 4.083484573502722, "grad_norm": 1.125, "learning_rate": 0.0001472271074772683, "loss": 0.6051, "step": 1125 }, { "epoch": 4.101633393829401, "grad_norm": 0.9765625, "learning_rate": 0.00014666673232256738, "loss": 0.6112, "step": 1130 }, { "epoch": 4.11978221415608, "grad_norm": 0.68359375, "learning_rate": 0.00014610447743869314, "loss": 0.6208, "step": 1135 }, { "epoch": 4.137931034482759, "grad_norm": 0.89453125, "learning_rate": 0.00014554036547319033, "loss": 0.6185, "step": 1140 }, { "epoch": 4.1560798548094375, "grad_norm": 1.09375, "learning_rate": 0.0001449744191484066, "loss": 0.6209, "step": 1145 }, { "epoch": 4.174228675136116, "grad_norm": 0.87109375, "learning_rate": 0.00014440666126057744, "loss": 0.6134, "step": 1150 }, { "epoch": 4.192377495462795, "grad_norm": 0.76171875, "learning_rate": 0.00014383711467890774, "loss": 0.6234, "step": 1155 }, { "epoch": 4.2105263157894735, "grad_norm": 0.640625, "learning_rate": 0.00014326580234465085, "loss": 0.6104, "step": 1160 }, { "epoch": 4.228675136116152, "grad_norm": 0.9296875, "learning_rate": 0.0001426927472701842, "loss": 0.6183, "step": 1165 }, { "epoch": 4.246823956442832, "grad_norm": 0.88671875, "learning_rate": 0.00014211797253808268, "loss": 0.6201, "step": 1170 }, { "epoch": 4.26497277676951, "grad_norm": 0.89453125, "learning_rate": 0.00014154150130018866, "loss": 0.6291, "step": 1175 }, { "epoch": 4.283121597096189, "grad_norm": 1.046875, "learning_rate": 0.00014096335677667954, "loss": 0.6248, "step": 1180 }, { "epoch": 4.3012704174228675, "grad_norm": 0.8984375, "learning_rate": 0.00014038356225513248, "loss": 0.6321, "step": 1185 }, { "epoch": 4.319419237749546, "grad_norm": 0.61328125, "learning_rate": 0.00013980214108958624, "loss": 0.6284, "step": 1190 }, { "epoch": 4.337568058076225, "grad_norm": 0.79296875, "learning_rate": 0.00013921911669960055, "loss": 0.6294, "step": 1195 }, { "epoch": 4.3557168784029034, "grad_norm": 0.80859375, "learning_rate": 0.00013863451256931287, "loss": 0.6166, "step": 1200 }, { "epoch": 4.373865698729583, "grad_norm": 0.60546875, "learning_rate": 0.0001380483522464923, "loss": 0.6254, "step": 1205 }, { "epoch": 4.392014519056262, "grad_norm": 0.875, "learning_rate": 0.00013746065934159123, "loss": 0.6194, "step": 1210 }, { "epoch": 4.41016333938294, "grad_norm": 0.69921875, "learning_rate": 0.0001368714575267941, "loss": 0.6287, "step": 1215 }, { "epoch": 4.428312159709619, "grad_norm": 0.55859375, "learning_rate": 0.0001362807705350641, "loss": 0.6306, "step": 1220 }, { "epoch": 4.4464609800362975, "grad_norm": 0.7421875, "learning_rate": 0.00013568862215918717, "loss": 0.6298, "step": 1225 }, { "epoch": 4.464609800362976, "grad_norm": 0.62109375, "learning_rate": 0.00013509503625081358, "loss": 0.6162, "step": 1230 }, { "epoch": 4.482758620689655, "grad_norm": 0.6484375, "learning_rate": 0.00013450003671949706, "loss": 0.6235, "step": 1235 }, { "epoch": 4.500907441016334, "grad_norm": 0.7578125, "learning_rate": 0.00013390364753173206, "loss": 0.6217, "step": 1240 }, { "epoch": 4.519056261343013, "grad_norm": 0.69921875, "learning_rate": 0.00013330589270998808, "loss": 0.6119, "step": 1245 }, { "epoch": 4.537205081669692, "grad_norm": 0.6484375, "learning_rate": 0.00013270679633174218, "loss": 0.6272, "step": 1250 }, { "epoch": 4.55535390199637, "grad_norm": 0.60546875, "learning_rate": 0.00013210638252850908, "loss": 0.6244, "step": 1255 }, { "epoch": 4.573502722323049, "grad_norm": 0.609375, "learning_rate": 0.0001315046754848693, "loss": 0.6335, "step": 1260 }, { "epoch": 4.5916515426497275, "grad_norm": 0.59375, "learning_rate": 0.00013090169943749476, "loss": 0.6282, "step": 1265 }, { "epoch": 4.609800362976406, "grad_norm": 0.59375, "learning_rate": 0.00013029747867417276, "loss": 0.628, "step": 1270 }, { "epoch": 4.627949183303086, "grad_norm": 0.6171875, "learning_rate": 0.0001296920375328275, "loss": 0.6257, "step": 1275 }, { "epoch": 4.646098003629764, "grad_norm": 0.57421875, "learning_rate": 0.0001290854004005399, "loss": 0.6229, "step": 1280 }, { "epoch": 4.664246823956443, "grad_norm": 0.640625, "learning_rate": 0.00012847759171256523, "loss": 0.6401, "step": 1285 }, { "epoch": 4.682395644283122, "grad_norm": 0.5859375, "learning_rate": 0.0001278686359513488, "loss": 0.627, "step": 1290 }, { "epoch": 4.7005444646098, "grad_norm": 0.6171875, "learning_rate": 0.0001272585576455398, "loss": 0.6368, "step": 1295 }, { "epoch": 4.718693284936479, "grad_norm": 0.65234375, "learning_rate": 0.00012664738136900348, "loss": 0.6378, "step": 1300 }, { "epoch": 4.7368421052631575, "grad_norm": 0.609375, "learning_rate": 0.0001260351317398312, "loss": 0.6318, "step": 1305 }, { "epoch": 4.754990925589837, "grad_norm": 0.59375, "learning_rate": 0.00012542183341934872, "loss": 0.6277, "step": 1310 }, { "epoch": 4.773139745916516, "grad_norm": 0.59375, "learning_rate": 0.0001248075111111229, "loss": 0.6407, "step": 1315 }, { "epoch": 4.791288566243194, "grad_norm": 0.73828125, "learning_rate": 0.00012419218955996676, "loss": 0.6421, "step": 1320 }, { "epoch": 4.809437386569873, "grad_norm": 0.80859375, "learning_rate": 0.00012357589355094275, "loss": 0.6323, "step": 1325 }, { "epoch": 4.827586206896552, "grad_norm": 0.59765625, "learning_rate": 0.0001229586479083641, "loss": 0.6437, "step": 1330 }, { "epoch": 4.84573502722323, "grad_norm": 0.5625, "learning_rate": 0.00012234047749479544, "loss": 0.6415, "step": 1335 }, { "epoch": 4.863883847549909, "grad_norm": 0.59375, "learning_rate": 0.00012172140721005079, "loss": 0.6293, "step": 1340 }, { "epoch": 4.882032667876588, "grad_norm": 0.609375, "learning_rate": 0.000121101461990191, "loss": 0.6245, "step": 1345 }, { "epoch": 4.900181488203267, "grad_norm": 0.5703125, "learning_rate": 0.00012048066680651908, "loss": 0.6146, "step": 1350 }, { "epoch": 4.918330308529946, "grad_norm": 0.65625, "learning_rate": 0.00011985904666457455, "loss": 0.6425, "step": 1355 }, { "epoch": 4.936479128856624, "grad_norm": 0.80078125, "learning_rate": 0.00011923662660312611, "loss": 0.64, "step": 1360 }, { "epoch": 4.954627949183303, "grad_norm": 0.54296875, "learning_rate": 0.00011861343169316301, "loss": 0.6316, "step": 1365 }, { "epoch": 4.972776769509982, "grad_norm": 0.76171875, "learning_rate": 0.00011798948703688539, "loss": 0.629, "step": 1370 }, { "epoch": 4.99092558983666, "grad_norm": 0.625, "learning_rate": 0.00011736481776669306, "loss": 0.6386, "step": 1375 }, { "epoch": 4.998185117967332, "eval_loss": 2.3272275924682617, "eval_runtime": 1.0255, "eval_samples_per_second": 4.876, "eval_steps_per_second": 1.95, "step": 1377 }, { "epoch": 5.00907441016334, "grad_norm": 0.77734375, "learning_rate": 0.00011673944904417308, "loss": 0.5952, "step": 1380 }, { "epoch": 5.027223230490018, "grad_norm": 0.79296875, "learning_rate": 0.00011611340605908642, "loss": 0.5449, "step": 1385 }, { "epoch": 5.045372050816697, "grad_norm": 0.65234375, "learning_rate": 0.00011548671402835325, "loss": 0.5476, "step": 1390 }, { "epoch": 5.063520871143376, "grad_norm": 0.72265625, "learning_rate": 0.00011485939819503717, "loss": 0.5626, "step": 1395 }, { "epoch": 5.081669691470054, "grad_norm": 0.5703125, "learning_rate": 0.00011423148382732853, "loss": 0.5616, "step": 1400 }, { "epoch": 5.099818511796733, "grad_norm": 0.61328125, "learning_rate": 0.00011360299621752644, "loss": 0.569, "step": 1405 }, { "epoch": 5.117967332123412, "grad_norm": 0.6953125, "learning_rate": 0.00011297396068102017, "loss": 0.5644, "step": 1410 }, { "epoch": 5.136116152450091, "grad_norm": 0.64453125, "learning_rate": 0.00011234440255526948, "loss": 0.5577, "step": 1415 }, { "epoch": 5.15426497277677, "grad_norm": 0.66015625, "learning_rate": 0.00011171434719878384, "loss": 0.5484, "step": 1420 }, { "epoch": 5.172413793103448, "grad_norm": 0.671875, "learning_rate": 0.00011108381999010111, "loss": 0.5599, "step": 1425 }, { "epoch": 5.190562613430127, "grad_norm": 0.62109375, "learning_rate": 0.00011045284632676536, "loss": 0.5588, "step": 1430 }, { "epoch": 5.208711433756806, "grad_norm": 0.625, "learning_rate": 0.00010982145162430373, "loss": 0.5562, "step": 1435 }, { "epoch": 5.226860254083484, "grad_norm": 0.59375, "learning_rate": 0.00010918966131520277, "loss": 0.549, "step": 1440 }, { "epoch": 5.245009074410163, "grad_norm": 0.640625, "learning_rate": 0.00010855750084788398, "loss": 0.5667, "step": 1445 }, { "epoch": 5.2631578947368425, "grad_norm": 0.64453125, "learning_rate": 0.00010792499568567884, "loss": 0.5611, "step": 1450 }, { "epoch": 5.281306715063521, "grad_norm": 0.7421875, "learning_rate": 0.0001072921713058031, "loss": 0.5577, "step": 1455 }, { "epoch": 5.2994555353902, "grad_norm": 0.65234375, "learning_rate": 0.00010665905319833041, "loss": 0.559, "step": 1460 }, { "epoch": 5.317604355716878, "grad_norm": 0.6953125, "learning_rate": 0.00010602566686516586, "loss": 0.552, "step": 1465 }, { "epoch": 5.335753176043557, "grad_norm": 1.0546875, "learning_rate": 0.00010539203781901861, "loss": 0.5516, "step": 1470 }, { "epoch": 5.353901996370236, "grad_norm": 0.88671875, "learning_rate": 0.00010475819158237425, "loss": 0.5646, "step": 1475 }, { "epoch": 5.372050816696914, "grad_norm": 0.73046875, "learning_rate": 0.00010412415368646673, "loss": 0.5614, "step": 1480 }, { "epoch": 5.390199637023594, "grad_norm": 0.75, "learning_rate": 0.00010348994967025012, "loss": 0.5532, "step": 1485 }, { "epoch": 5.4083484573502725, "grad_norm": 0.91796875, "learning_rate": 0.00010285560507936961, "loss": 0.5631, "step": 1490 }, { "epoch": 5.426497277676951, "grad_norm": 0.80078125, "learning_rate": 0.00010222114546513295, "loss": 0.581, "step": 1495 }, { "epoch": 5.44464609800363, "grad_norm": 0.61328125, "learning_rate": 0.00010158659638348081, "loss": 0.5503, "step": 1500 }, { "epoch": 5.462794918330308, "grad_norm": 0.69921875, "learning_rate": 0.00010095198339395769, "loss": 0.5645, "step": 1505 }, { "epoch": 5.480943738656987, "grad_norm": 0.65234375, "learning_rate": 0.00010031733205868224, "loss": 0.565, "step": 1510 }, { "epoch": 5.499092558983666, "grad_norm": 0.64453125, "learning_rate": 9.968266794131777e-05, "loss": 0.5616, "step": 1515 }, { "epoch": 5.517241379310345, "grad_norm": 0.640625, "learning_rate": 9.904801660604234e-05, "loss": 0.5601, "step": 1520 }, { "epoch": 5.535390199637024, "grad_norm": 0.7109375, "learning_rate": 9.84134036165192e-05, "loss": 0.5661, "step": 1525 }, { "epoch": 5.5535390199637025, "grad_norm": 0.65234375, "learning_rate": 9.777885453486706e-05, "loss": 0.5756, "step": 1530 }, { "epoch": 5.571687840290381, "grad_norm": 0.625, "learning_rate": 9.71443949206304e-05, "loss": 0.5702, "step": 1535 }, { "epoch": 5.58983666061706, "grad_norm": 0.5859375, "learning_rate": 9.651005032974994e-05, "loss": 0.561, "step": 1540 }, { "epoch": 5.607985480943738, "grad_norm": 0.6953125, "learning_rate": 9.587584631353329e-05, "loss": 0.5747, "step": 1545 }, { "epoch": 5.626134301270417, "grad_norm": 0.6875, "learning_rate": 9.524180841762577e-05, "loss": 0.562, "step": 1550 }, { "epoch": 5.6442831215970966, "grad_norm": 0.76953125, "learning_rate": 9.460796218098143e-05, "loss": 0.5608, "step": 1555 }, { "epoch": 5.662431941923775, "grad_norm": 1.03125, "learning_rate": 9.397433313483416e-05, "loss": 0.5677, "step": 1560 }, { "epoch": 5.680580762250454, "grad_norm": 0.734375, "learning_rate": 9.334094680166962e-05, "loss": 0.575, "step": 1565 }, { "epoch": 5.6987295825771325, "grad_norm": 0.9296875, "learning_rate": 9.270782869419694e-05, "loss": 0.5709, "step": 1570 }, { "epoch": 5.716878402903811, "grad_norm": 0.86328125, "learning_rate": 9.207500431432115e-05, "loss": 0.5556, "step": 1575 }, { "epoch": 5.73502722323049, "grad_norm": 0.8828125, "learning_rate": 9.144249915211605e-05, "loss": 0.5694, "step": 1580 }, { "epoch": 5.753176043557168, "grad_norm": 0.65625, "learning_rate": 9.081033868479727e-05, "loss": 0.5652, "step": 1585 }, { "epoch": 5.771324863883848, "grad_norm": 0.8125, "learning_rate": 9.01785483756963e-05, "loss": 0.568, "step": 1590 }, { "epoch": 5.7894736842105265, "grad_norm": 0.74609375, "learning_rate": 8.954715367323468e-05, "loss": 0.5638, "step": 1595 }, { "epoch": 5.807622504537205, "grad_norm": 0.671875, "learning_rate": 8.891618000989891e-05, "loss": 0.5668, "step": 1600 }, { "epoch": 5.825771324863884, "grad_norm": 0.640625, "learning_rate": 8.828565280121617e-05, "loss": 0.5714, "step": 1605 }, { "epoch": 5.8439201451905625, "grad_norm": 0.69921875, "learning_rate": 8.765559744473053e-05, "loss": 0.5709, "step": 1610 }, { "epoch": 5.862068965517241, "grad_norm": 0.69140625, "learning_rate": 8.702603931897982e-05, "loss": 0.5834, "step": 1615 }, { "epoch": 5.88021778584392, "grad_norm": 0.64453125, "learning_rate": 8.639700378247361e-05, "loss": 0.5673, "step": 1620 }, { "epoch": 5.898366606170599, "grad_norm": 0.84765625, "learning_rate": 8.57685161726715e-05, "loss": 0.5702, "step": 1625 }, { "epoch": 5.916515426497278, "grad_norm": 0.6328125, "learning_rate": 8.514060180496285e-05, "loss": 0.5671, "step": 1630 }, { "epoch": 5.9346642468239565, "grad_norm": 0.640625, "learning_rate": 8.451328597164679e-05, "loss": 0.5686, "step": 1635 }, { "epoch": 5.952813067150635, "grad_norm": 0.70703125, "learning_rate": 8.38865939409136e-05, "loss": 0.5686, "step": 1640 }, { "epoch": 5.970961887477314, "grad_norm": 0.6796875, "learning_rate": 8.326055095582694e-05, "loss": 0.5685, "step": 1645 }, { "epoch": 5.9891107078039925, "grad_norm": 0.625, "learning_rate": 8.263518223330697e-05, "loss": 0.5725, "step": 1650 }, { "epoch": 6.0, "eval_loss": 2.4713406562805176, "eval_runtime": 1.0255, "eval_samples_per_second": 4.876, "eval_steps_per_second": 1.95, "step": 1653 }, { "epoch": 6.007259528130671, "grad_norm": 0.640625, "learning_rate": 8.201051296311462e-05, "loss": 0.5327, "step": 1655 }, { "epoch": 6.025408348457351, "grad_norm": 0.7265625, "learning_rate": 8.1386568306837e-05, "loss": 0.5063, "step": 1660 }, { "epoch": 6.043557168784029, "grad_norm": 0.703125, "learning_rate": 8.076337339687394e-05, "loss": 0.4988, "step": 1665 }, { "epoch": 6.061705989110708, "grad_norm": 0.7421875, "learning_rate": 8.014095333542548e-05, "loss": 0.4931, "step": 1670 }, { "epoch": 6.0798548094373865, "grad_norm": 0.62890625, "learning_rate": 7.951933319348095e-05, "loss": 0.4928, "step": 1675 }, { "epoch": 6.098003629764065, "grad_norm": 0.67578125, "learning_rate": 7.889853800980904e-05, "loss": 0.4987, "step": 1680 }, { "epoch": 6.116152450090744, "grad_norm": 0.734375, "learning_rate": 7.827859278994925e-05, "loss": 0.502, "step": 1685 }, { "epoch": 6.1343012704174225, "grad_norm": 0.73828125, "learning_rate": 7.765952250520459e-05, "loss": 0.494, "step": 1690 }, { "epoch": 6.152450090744102, "grad_norm": 0.71875, "learning_rate": 7.704135209163589e-05, "loss": 0.4876, "step": 1695 }, { "epoch": 6.170598911070781, "grad_norm": 0.6875, "learning_rate": 7.642410644905726e-05, "loss": 0.51, "step": 1700 }, { "epoch": 6.188747731397459, "grad_norm": 0.64453125, "learning_rate": 7.580781044003324e-05, "loss": 0.4999, "step": 1705 }, { "epoch": 6.206896551724138, "grad_norm": 0.69140625, "learning_rate": 7.519248888887716e-05, "loss": 0.5019, "step": 1710 }, { "epoch": 6.2250453720508165, "grad_norm": 0.65234375, "learning_rate": 7.457816658065134e-05, "loss": 0.4964, "step": 1715 }, { "epoch": 6.243194192377495, "grad_norm": 0.6953125, "learning_rate": 7.39648682601688e-05, "loss": 0.4976, "step": 1720 }, { "epoch": 6.261343012704174, "grad_norm": 0.62890625, "learning_rate": 7.335261863099651e-05, "loss": 0.5046, "step": 1725 }, { "epoch": 6.279491833030853, "grad_norm": 0.69140625, "learning_rate": 7.274144235446023e-05, "loss": 0.4968, "step": 1730 }, { "epoch": 6.297640653357532, "grad_norm": 0.703125, "learning_rate": 7.213136404865124e-05, "loss": 0.508, "step": 1735 }, { "epoch": 6.315789473684211, "grad_norm": 0.68359375, "learning_rate": 7.152240828743477e-05, "loss": 0.4968, "step": 1740 }, { "epoch": 6.333938294010889, "grad_norm": 0.83984375, "learning_rate": 7.09145995994601e-05, "loss": 0.5009, "step": 1745 }, { "epoch": 6.352087114337568, "grad_norm": 0.671875, "learning_rate": 7.030796246717255e-05, "loss": 0.4925, "step": 1750 }, { "epoch": 6.3702359346642465, "grad_norm": 0.6953125, "learning_rate": 6.970252132582728e-05, "loss": 0.5057, "step": 1755 }, { "epoch": 6.388384754990925, "grad_norm": 0.6875, "learning_rate": 6.909830056250527e-05, "loss": 0.505, "step": 1760 }, { "epoch": 6.406533575317605, "grad_norm": 0.69140625, "learning_rate": 6.849532451513074e-05, "loss": 0.4962, "step": 1765 }, { "epoch": 6.424682395644283, "grad_norm": 0.66796875, "learning_rate": 6.789361747149093e-05, "loss": 0.5082, "step": 1770 }, { "epoch": 6.442831215970962, "grad_norm": 0.640625, "learning_rate": 6.729320366825784e-05, "loss": 0.5006, "step": 1775 }, { "epoch": 6.460980036297641, "grad_norm": 0.78515625, "learning_rate": 6.669410729001193e-05, "loss": 0.5089, "step": 1780 }, { "epoch": 6.479128856624319, "grad_norm": 0.76171875, "learning_rate": 6.609635246826794e-05, "loss": 0.5005, "step": 1785 }, { "epoch": 6.497277676950998, "grad_norm": 0.7890625, "learning_rate": 6.549996328050296e-05, "loss": 0.5045, "step": 1790 }, { "epoch": 6.5154264972776765, "grad_norm": 0.78125, "learning_rate": 6.490496374918647e-05, "loss": 0.5117, "step": 1795 }, { "epoch": 6.533575317604356, "grad_norm": 0.69921875, "learning_rate": 6.431137784081282e-05, "loss": 0.5074, "step": 1800 }, { "epoch": 6.551724137931035, "grad_norm": 0.65625, "learning_rate": 6.371922946493591e-05, "loss": 0.5078, "step": 1805 }, { "epoch": 6.569872958257713, "grad_norm": 0.69140625, "learning_rate": 6.312854247320595e-05, "loss": 0.5032, "step": 1810 }, { "epoch": 6.588021778584392, "grad_norm": 0.7109375, "learning_rate": 6.25393406584088e-05, "loss": 0.507, "step": 1815 }, { "epoch": 6.606170598911071, "grad_norm": 0.71875, "learning_rate": 6.19516477535077e-05, "loss": 0.5106, "step": 1820 }, { "epoch": 6.624319419237749, "grad_norm": 0.66796875, "learning_rate": 6.136548743068713e-05, "loss": 0.4985, "step": 1825 }, { "epoch": 6.642468239564428, "grad_norm": 0.66796875, "learning_rate": 6.078088330039945e-05, "loss": 0.5001, "step": 1830 }, { "epoch": 6.660617059891107, "grad_norm": 0.68359375, "learning_rate": 6.019785891041381e-05, "loss": 0.5002, "step": 1835 }, { "epoch": 6.678765880217786, "grad_norm": 0.66015625, "learning_rate": 5.9616437744867535e-05, "loss": 0.5107, "step": 1840 }, { "epoch": 6.696914700544465, "grad_norm": 0.7890625, "learning_rate": 5.9036643223320475e-05, "loss": 0.4967, "step": 1845 }, { "epoch": 6.715063520871143, "grad_norm": 0.765625, "learning_rate": 5.845849869981137e-05, "loss": 0.4988, "step": 1850 }, { "epoch": 6.733212341197822, "grad_norm": 0.7265625, "learning_rate": 5.788202746191734e-05, "loss": 0.5003, "step": 1855 }, { "epoch": 6.751361161524501, "grad_norm": 0.76171875, "learning_rate": 5.7307252729815833e-05, "loss": 0.5062, "step": 1860 }, { "epoch": 6.769509981851179, "grad_norm": 0.80859375, "learning_rate": 5.6734197655349156e-05, "loss": 0.5056, "step": 1865 }, { "epoch": 6.787658802177859, "grad_norm": 0.72265625, "learning_rate": 5.616288532109225e-05, "loss": 0.5081, "step": 1870 }, { "epoch": 6.805807622504537, "grad_norm": 0.6640625, "learning_rate": 5.559333873942259e-05, "loss": 0.502, "step": 1875 }, { "epoch": 6.823956442831216, "grad_norm": 0.76171875, "learning_rate": 5.5025580851593436e-05, "loss": 0.5091, "step": 1880 }, { "epoch": 6.842105263157895, "grad_norm": 0.7265625, "learning_rate": 5.445963452680973e-05, "loss": 0.5017, "step": 1885 }, { "epoch": 6.860254083484573, "grad_norm": 0.7890625, "learning_rate": 5.38955225613069e-05, "loss": 0.5148, "step": 1890 }, { "epoch": 6.878402903811252, "grad_norm": 0.6484375, "learning_rate": 5.333326767743263e-05, "loss": 0.5044, "step": 1895 }, { "epoch": 6.896551724137931, "grad_norm": 0.66015625, "learning_rate": 5.277289252273174e-05, "loss": 0.5094, "step": 1900 }, { "epoch": 6.91470054446461, "grad_norm": 0.69140625, "learning_rate": 5.221441966903371e-05, "loss": 0.5056, "step": 1905 }, { "epoch": 6.932849364791289, "grad_norm": 0.78515625, "learning_rate": 5.1657871611543605e-05, "loss": 0.5108, "step": 1910 }, { "epoch": 6.950998185117967, "grad_norm": 0.796875, "learning_rate": 5.110327076793613e-05, "loss": 0.5068, "step": 1915 }, { "epoch": 6.969147005444646, "grad_norm": 0.72265625, "learning_rate": 5.055063947745233e-05, "loss": 0.5038, "step": 1920 }, { "epoch": 6.987295825771325, "grad_norm": 0.67578125, "learning_rate": 5.000000000000002e-05, "loss": 0.5089, "step": 1925 }, { "epoch": 6.998185117967332, "eval_loss": 2.6490731239318848, "eval_runtime": 1.0248, "eval_samples_per_second": 4.879, "eval_steps_per_second": 1.952, "step": 1928 }, { "epoch": 7.005444646098003, "grad_norm": 0.65625, "learning_rate": 4.945137451525707e-05, "loss": 0.4914, "step": 1930 }, { "epoch": 7.023593466424682, "grad_norm": 0.87109375, "learning_rate": 4.890478512177795e-05, "loss": 0.4567, "step": 1935 }, { "epoch": 7.0417422867513615, "grad_norm": 0.7109375, "learning_rate": 4.836025383610382e-05, "loss": 0.4459, "step": 1940 }, { "epoch": 7.05989110707804, "grad_norm": 0.6796875, "learning_rate": 4.7817802591875426e-05, "loss": 0.4523, "step": 1945 }, { "epoch": 7.078039927404719, "grad_norm": 0.71484375, "learning_rate": 4.727745323894976e-05, "loss": 0.4551, "step": 1950 }, { "epoch": 7.096188747731397, "grad_norm": 0.72265625, "learning_rate": 4.673922754252002e-05, "loss": 0.4435, "step": 1955 }, { "epoch": 7.114337568058076, "grad_norm": 0.74609375, "learning_rate": 4.620314718223876e-05, "loss": 0.4572, "step": 1960 }, { "epoch": 7.132486388384755, "grad_norm": 0.78515625, "learning_rate": 4.566923375134472e-05, "loss": 0.4564, "step": 1965 }, { "epoch": 7.150635208711433, "grad_norm": 0.73828125, "learning_rate": 4.513750875579303e-05, "loss": 0.4482, "step": 1970 }, { "epoch": 7.168784029038113, "grad_norm": 0.6875, "learning_rate": 4.4607993613388976e-05, "loss": 0.4536, "step": 1975 }, { "epoch": 7.1869328493647915, "grad_norm": 0.7578125, "learning_rate": 4.4080709652925336e-05, "loss": 0.4571, "step": 1980 }, { "epoch": 7.20508166969147, "grad_norm": 0.69921875, "learning_rate": 4.355567811332311e-05, "loss": 0.4548, "step": 1985 }, { "epoch": 7.223230490018149, "grad_norm": 0.6796875, "learning_rate": 4.3032920142776125e-05, "loss": 0.4538, "step": 1990 }, { "epoch": 7.241379310344827, "grad_norm": 0.71875, "learning_rate": 4.251245679789928e-05, "loss": 0.4497, "step": 1995 }, { "epoch": 7.259528130671506, "grad_norm": 0.74609375, "learning_rate": 4.19943090428802e-05, "loss": 0.4546, "step": 2000 }, { "epoch": 7.277676950998185, "grad_norm": 0.73828125, "learning_rate": 4.147849774863488e-05, "loss": 0.4612, "step": 2005 }, { "epoch": 7.295825771324864, "grad_norm": 0.7265625, "learning_rate": 4.096504369196704e-05, "loss": 0.4437, "step": 2010 }, { "epoch": 7.313974591651543, "grad_norm": 0.765625, "learning_rate": 4.045396755473121e-05, "loss": 0.4468, "step": 2015 }, { "epoch": 7.3321234119782215, "grad_norm": 0.73828125, "learning_rate": 3.994528992299971e-05, "loss": 0.4456, "step": 2020 }, { "epoch": 7.3502722323049, "grad_norm": 0.671875, "learning_rate": 3.943903128623335e-05, "loss": 0.4458, "step": 2025 }, { "epoch": 7.368421052631579, "grad_norm": 0.7265625, "learning_rate": 3.893521203645618e-05, "loss": 0.4524, "step": 2030 }, { "epoch": 7.386569872958257, "grad_norm": 0.73828125, "learning_rate": 3.843385246743417e-05, "loss": 0.4534, "step": 2035 }, { "epoch": 7.404718693284936, "grad_norm": 0.76171875, "learning_rate": 3.7934972773857634e-05, "loss": 0.4577, "step": 2040 }, { "epoch": 7.422867513611616, "grad_norm": 0.78515625, "learning_rate": 3.7438593050527845e-05, "loss": 0.455, "step": 2045 }, { "epoch": 7.441016333938294, "grad_norm": 0.703125, "learning_rate": 3.694473329154778e-05, "loss": 0.462, "step": 2050 }, { "epoch": 7.459165154264973, "grad_norm": 0.6796875, "learning_rate": 3.645341338951639e-05, "loss": 0.4538, "step": 2055 }, { "epoch": 7.4773139745916515, "grad_norm": 0.72265625, "learning_rate": 3.5964653134727776e-05, "loss": 0.455, "step": 2060 }, { "epoch": 7.49546279491833, "grad_norm": 0.7265625, "learning_rate": 3.547847221437372e-05, "loss": 0.459, "step": 2065 }, { "epoch": 7.513611615245009, "grad_norm": 0.765625, "learning_rate": 3.4994890211750754e-05, "loss": 0.4491, "step": 2070 }, { "epoch": 7.531760435571687, "grad_norm": 0.7421875, "learning_rate": 3.45139266054715e-05, "loss": 0.4535, "step": 2075 }, { "epoch": 7.549909255898367, "grad_norm": 0.79296875, "learning_rate": 3.4035600768679855e-05, "loss": 0.4517, "step": 2080 }, { "epoch": 7.568058076225046, "grad_norm": 0.67578125, "learning_rate": 3.3559931968270753e-05, "loss": 0.4535, "step": 2085 }, { "epoch": 7.586206896551724, "grad_norm": 0.6875, "learning_rate": 3.308693936411421e-05, "loss": 0.4486, "step": 2090 }, { "epoch": 7.604355716878403, "grad_norm": 0.7265625, "learning_rate": 3.2616642008283213e-05, "loss": 0.459, "step": 2095 }, { "epoch": 7.6225045372050815, "grad_norm": 0.76953125, "learning_rate": 3.21490588442868e-05, "loss": 0.4579, "step": 2100 }, { "epoch": 7.64065335753176, "grad_norm": 0.81640625, "learning_rate": 3.1684208706306574e-05, "loss": 0.4431, "step": 2105 }, { "epoch": 7.658802177858439, "grad_norm": 0.734375, "learning_rate": 3.1222110318438304e-05, "loss": 0.4554, "step": 2110 }, { "epoch": 7.676950998185118, "grad_norm": 0.734375, "learning_rate": 3.076278229393773e-05, "loss": 0.4574, "step": 2115 }, { "epoch": 7.695099818511797, "grad_norm": 0.75, "learning_rate": 3.030624313447067e-05, "loss": 0.466, "step": 2120 }, { "epoch": 7.713248638838476, "grad_norm": 0.72265625, "learning_rate": 2.9852511229367865e-05, "loss": 0.4592, "step": 2125 }, { "epoch": 7.731397459165154, "grad_norm": 0.67578125, "learning_rate": 2.9401604854884357e-05, "loss": 0.4622, "step": 2130 }, { "epoch": 7.749546279491833, "grad_norm": 0.7890625, "learning_rate": 2.8953542173463133e-05, "loss": 0.4527, "step": 2135 }, { "epoch": 7.7676950998185115, "grad_norm": 0.73828125, "learning_rate": 2.8508341233003654e-05, "loss": 0.4565, "step": 2140 }, { "epoch": 7.78584392014519, "grad_norm": 0.7734375, "learning_rate": 2.8066019966134904e-05, "loss": 0.4548, "step": 2145 }, { "epoch": 7.80399274047187, "grad_norm": 0.73046875, "learning_rate": 2.7626596189492983e-05, "loss": 0.4584, "step": 2150 }, { "epoch": 7.822141560798548, "grad_norm": 0.79296875, "learning_rate": 2.719008760300359e-05, "loss": 0.4523, "step": 2155 }, { "epoch": 7.840290381125227, "grad_norm": 0.70703125, "learning_rate": 2.6756511789168925e-05, "loss": 0.4503, "step": 2160 }, { "epoch": 7.8584392014519056, "grad_norm": 0.73046875, "learning_rate": 2.6325886212359498e-05, "loss": 0.451, "step": 2165 }, { "epoch": 7.876588021778584, "grad_norm": 0.75390625, "learning_rate": 2.589822821811083e-05, "loss": 0.4574, "step": 2170 }, { "epoch": 7.894736842105263, "grad_norm": 0.7109375, "learning_rate": 2.5473555032424533e-05, "loss": 0.4692, "step": 2175 }, { "epoch": 7.9128856624319415, "grad_norm": 0.76953125, "learning_rate": 2.5051883761074614e-05, "loss": 0.4595, "step": 2180 }, { "epoch": 7.931034482758621, "grad_norm": 0.66796875, "learning_rate": 2.4633231388918378e-05, "loss": 0.4543, "step": 2185 }, { "epoch": 7.9491833030853, "grad_norm": 0.7265625, "learning_rate": 2.4217614779212315e-05, "loss": 0.4552, "step": 2190 }, { "epoch": 7.967332123411978, "grad_norm": 0.8203125, "learning_rate": 2.3805050672932928e-05, "loss": 0.4542, "step": 2195 }, { "epoch": 7.985480943738657, "grad_norm": 0.72265625, "learning_rate": 2.339555568810221e-05, "loss": 0.4678, "step": 2200 }, { "epoch": 8.0, "eval_loss": 2.8434367179870605, "eval_runtime": 1.027, "eval_samples_per_second": 4.869, "eval_steps_per_second": 1.947, "step": 2204 }, { "epoch": 8.003629764065336, "grad_norm": 0.6875, "learning_rate": 2.2989146319118425e-05, "loss": 0.4538, "step": 2205 }, { "epoch": 8.021778584392015, "grad_norm": 0.6484375, "learning_rate": 2.2585838936091754e-05, "loss": 0.4327, "step": 2210 }, { "epoch": 8.039927404718693, "grad_norm": 0.71875, "learning_rate": 2.2185649784184746e-05, "loss": 0.4241, "step": 2215 }, { "epoch": 8.058076225045372, "grad_norm": 0.76171875, "learning_rate": 2.178859498295809e-05, "loss": 0.4201, "step": 2220 }, { "epoch": 8.07622504537205, "grad_norm": 0.77734375, "learning_rate": 2.139469052572127e-05, "loss": 0.4283, "step": 2225 }, { "epoch": 8.09437386569873, "grad_norm": 0.703125, "learning_rate": 2.1003952278888382e-05, "loss": 0.4324, "step": 2230 }, { "epoch": 8.11252268602541, "grad_norm": 0.6953125, "learning_rate": 2.0616395981339075e-05, "loss": 0.4288, "step": 2235 }, { "epoch": 8.130671506352087, "grad_norm": 0.7421875, "learning_rate": 2.0232037243784475e-05, "loss": 0.4235, "step": 2240 }, { "epoch": 8.148820326678766, "grad_norm": 0.7734375, "learning_rate": 1.985089154813846e-05, "loss": 0.4291, "step": 2245 }, { "epoch": 8.166969147005444, "grad_norm": 0.71875, "learning_rate": 1.947297424689414e-05, "loss": 0.4272, "step": 2250 }, { "epoch": 8.185117967332124, "grad_norm": 0.69921875, "learning_rate": 1.9098300562505266e-05, "loss": 0.4274, "step": 2255 }, { "epoch": 8.203266787658801, "grad_norm": 0.74609375, "learning_rate": 1.8726885586773212e-05, "loss": 0.4336, "step": 2260 }, { "epoch": 8.221415607985481, "grad_norm": 0.6953125, "learning_rate": 1.835874428023905e-05, "loss": 0.4359, "step": 2265 }, { "epoch": 8.23956442831216, "grad_norm": 0.703125, "learning_rate": 1.7993891471580893e-05, "loss": 0.4275, "step": 2270 }, { "epoch": 8.257713248638838, "grad_norm": 0.6796875, "learning_rate": 1.763234185701673e-05, "loss": 0.4315, "step": 2275 }, { "epoch": 8.275862068965518, "grad_norm": 0.70703125, "learning_rate": 1.7274109999712295e-05, "loss": 0.4249, "step": 2280 }, { "epoch": 8.294010889292196, "grad_norm": 0.734375, "learning_rate": 1.6919210329194533e-05, "loss": 0.4276, "step": 2285 }, { "epoch": 8.312159709618875, "grad_norm": 0.765625, "learning_rate": 1.6567657140770475e-05, "loss": 0.4315, "step": 2290 }, { "epoch": 8.330308529945553, "grad_norm": 0.71875, "learning_rate": 1.621946459495127e-05, "loss": 0.4247, "step": 2295 }, { "epoch": 8.348457350272232, "grad_norm": 0.76171875, "learning_rate": 1.587464671688187e-05, "loss": 0.4441, "step": 2300 }, { "epoch": 8.366606170598912, "grad_norm": 0.6875, "learning_rate": 1.553321739577619e-05, "loss": 0.4267, "step": 2305 }, { "epoch": 8.38475499092559, "grad_norm": 0.66796875, "learning_rate": 1.5195190384357404e-05, "loss": 0.4224, "step": 2310 }, { "epoch": 8.40290381125227, "grad_norm": 0.73828125, "learning_rate": 1.4860579298304312e-05, "loss": 0.4348, "step": 2315 }, { "epoch": 8.421052631578947, "grad_norm": 0.703125, "learning_rate": 1.4529397615702656e-05, "loss": 0.4321, "step": 2320 }, { "epoch": 8.439201451905626, "grad_norm": 0.73828125, "learning_rate": 1.4201658676502294e-05, "loss": 0.4386, "step": 2325 }, { "epoch": 8.457350272232304, "grad_norm": 0.77734375, "learning_rate": 1.3877375681979943e-05, "loss": 0.4366, "step": 2330 }, { "epoch": 8.475499092558984, "grad_norm": 0.72265625, "learning_rate": 1.3556561694207338e-05, "loss": 0.4336, "step": 2335 }, { "epoch": 8.493647912885663, "grad_norm": 0.75, "learning_rate": 1.3239229635525074e-05, "loss": 0.4274, "step": 2340 }, { "epoch": 8.511796733212341, "grad_norm": 0.7109375, "learning_rate": 1.2925392288022298e-05, "loss": 0.4259, "step": 2345 }, { "epoch": 8.52994555353902, "grad_norm": 0.66796875, "learning_rate": 1.2615062293021507e-05, "loss": 0.427, "step": 2350 }, { "epoch": 8.548094373865698, "grad_norm": 0.67578125, "learning_rate": 1.230825215056971e-05, "loss": 0.4226, "step": 2355 }, { "epoch": 8.566243194192378, "grad_norm": 0.7421875, "learning_rate": 1.2004974218934695e-05, "loss": 0.4361, "step": 2360 }, { "epoch": 8.584392014519056, "grad_norm": 0.65625, "learning_rate": 1.1705240714107302e-05, "loss": 0.4291, "step": 2365 }, { "epoch": 8.602540834845735, "grad_norm": 1.171875, "learning_rate": 1.1409063709309442e-05, "loss": 0.4416, "step": 2370 }, { "epoch": 8.620689655172415, "grad_norm": 0.69921875, "learning_rate": 1.1116455134507664e-05, "loss": 0.425, "step": 2375 }, { "epoch": 8.638838475499092, "grad_norm": 0.74609375, "learning_rate": 1.0827426775932658e-05, "loss": 0.4316, "step": 2380 }, { "epoch": 8.656987295825772, "grad_norm": 0.703125, "learning_rate": 1.054199027560463e-05, "loss": 0.425, "step": 2385 }, { "epoch": 8.67513611615245, "grad_norm": 0.6953125, "learning_rate": 1.026015713086418e-05, "loss": 0.4344, "step": 2390 }, { "epoch": 8.69328493647913, "grad_norm": 0.76171875, "learning_rate": 9.98193869390922e-06, "loss": 0.4261, "step": 2395 }, { "epoch": 8.711433756805807, "grad_norm": 0.75, "learning_rate": 9.707346171337894e-06, "loss": 0.4323, "step": 2400 }, { "epoch": 8.729582577132486, "grad_norm": 0.71484375, "learning_rate": 9.436390623696911e-06, "loss": 0.4309, "step": 2405 }, { "epoch": 8.747731397459166, "grad_norm": 0.76171875, "learning_rate": 9.16908296503628e-06, "loss": 0.4278, "step": 2410 }, { "epoch": 8.765880217785844, "grad_norm": 0.765625, "learning_rate": 8.905433962469489e-06, "loss": 0.4289, "step": 2415 }, { "epoch": 8.784029038112523, "grad_norm": 0.76171875, "learning_rate": 8.645454235739903e-06, "loss": 0.4323, "step": 2420 }, { "epoch": 8.802177858439201, "grad_norm": 0.66796875, "learning_rate": 8.38915425679304e-06, "loss": 0.4286, "step": 2425 }, { "epoch": 8.82032667876588, "grad_norm": 0.69140625, "learning_rate": 8.13654434935467e-06, "loss": 0.4267, "step": 2430 }, { "epoch": 8.838475499092558, "grad_norm": 0.75390625, "learning_rate": 7.887634688515e-06, "loss": 0.4351, "step": 2435 }, { "epoch": 8.856624319419238, "grad_norm": 0.7265625, "learning_rate": 7.642435300318907e-06, "loss": 0.4294, "step": 2440 }, { "epoch": 8.874773139745917, "grad_norm": 0.73828125, "learning_rate": 7.400956061361974e-06, "loss": 0.4301, "step": 2445 }, { "epoch": 8.892921960072595, "grad_norm": 0.69921875, "learning_rate": 7.163206698392744e-06, "loss": 0.4306, "step": 2450 }, { "epoch": 8.911070780399275, "grad_norm": 0.7265625, "learning_rate": 6.929196787920899e-06, "loss": 0.433, "step": 2455 }, { "epoch": 8.929219600725952, "grad_norm": 0.7265625, "learning_rate": 6.698935755831492e-06, "loss": 0.4332, "step": 2460 }, { "epoch": 8.947368421052632, "grad_norm": 0.70703125, "learning_rate": 6.472432877005341e-06, "loss": 0.4364, "step": 2465 }, { "epoch": 8.96551724137931, "grad_norm": 0.72265625, "learning_rate": 6.2496972749453766e-06, "loss": 0.434, "step": 2470 }, { "epoch": 8.98366606170599, "grad_norm": 0.65625, "learning_rate": 6.030737921409169e-06, "loss": 0.433, "step": 2475 }, { "epoch": 8.998185117967331, "eval_loss": 2.9603512287139893, "eval_runtime": 1.0237, "eval_samples_per_second": 4.884, "eval_steps_per_second": 1.954, "step": 2479 }, { "epoch": 9.001814882032669, "grad_norm": 0.68359375, "learning_rate": 5.8155636360475385e-06, "loss": 0.4369, "step": 2480 }, { "epoch": 9.019963702359346, "grad_norm": 0.68359375, "learning_rate": 5.604183086049342e-06, "loss": 0.4232, "step": 2485 }, { "epoch": 9.038112522686026, "grad_norm": 0.7109375, "learning_rate": 5.396604785792281e-06, "loss": 0.4265, "step": 2490 }, { "epoch": 9.056261343012704, "grad_norm": 0.73828125, "learning_rate": 5.192837096500058e-06, "loss": 0.4255, "step": 2495 }, { "epoch": 9.074410163339383, "grad_norm": 0.7265625, "learning_rate": 4.992888225905468e-06, "loss": 0.4207, "step": 2500 }, { "epoch": 9.092558983666061, "grad_norm": 0.6875, "learning_rate": 4.796766227919857e-06, "loss": 0.4223, "step": 2505 }, { "epoch": 9.11070780399274, "grad_norm": 0.72265625, "learning_rate": 4.604479002308737e-06, "loss": 0.4237, "step": 2510 }, { "epoch": 9.12885662431942, "grad_norm": 0.7421875, "learning_rate": 4.416034294373472e-06, "loss": 0.4417, "step": 2515 }, { "epoch": 9.147005444646098, "grad_norm": 0.69140625, "learning_rate": 4.231439694639483e-06, "loss": 0.4247, "step": 2520 }, { "epoch": 9.165154264972777, "grad_norm": 0.70703125, "learning_rate": 4.050702638550275e-06, "loss": 0.4244, "step": 2525 }, { "epoch": 9.183303085299455, "grad_norm": 0.69921875, "learning_rate": 3.873830406168111e-06, "loss": 0.4309, "step": 2530 }, { "epoch": 9.201451905626135, "grad_norm": 0.71875, "learning_rate": 3.7008301218807716e-06, "loss": 0.4264, "step": 2535 }, { "epoch": 9.219600725952812, "grad_norm": 0.73046875, "learning_rate": 3.5317087541144377e-06, "loss": 0.4226, "step": 2540 }, { "epoch": 9.237749546279492, "grad_norm": 0.6953125, "learning_rate": 3.3664731150531482e-06, "loss": 0.4263, "step": 2545 }, { "epoch": 9.255898366606171, "grad_norm": 0.7578125, "learning_rate": 3.2051298603643753e-06, "loss": 0.4272, "step": 2550 }, { "epoch": 9.27404718693285, "grad_norm": 0.84375, "learning_rate": 3.047685488930874e-06, "loss": 0.4207, "step": 2555 }, { "epoch": 9.292196007259529, "grad_norm": 0.73046875, "learning_rate": 2.894146342588977e-06, "loss": 0.426, "step": 2560 }, { "epoch": 9.310344827586206, "grad_norm": 0.75390625, "learning_rate": 2.744518605873092e-06, "loss": 0.4308, "step": 2565 }, { "epoch": 9.328493647912886, "grad_norm": 0.76953125, "learning_rate": 2.5988083057666533e-06, "loss": 0.4262, "step": 2570 }, { "epoch": 9.346642468239564, "grad_norm": 0.75, "learning_rate": 2.4570213114592954e-06, "loss": 0.4225, "step": 2575 }, { "epoch": 9.364791288566243, "grad_norm": 0.73046875, "learning_rate": 2.3191633341104856e-06, "loss": 0.4198, "step": 2580 }, { "epoch": 9.382940108892923, "grad_norm": 0.69921875, "learning_rate": 2.1852399266194314e-06, "loss": 0.4168, "step": 2585 }, { "epoch": 9.4010889292196, "grad_norm": 0.69921875, "learning_rate": 2.05525648340148e-06, "loss": 0.4234, "step": 2590 }, { "epoch": 9.41923774954628, "grad_norm": 0.75, "learning_rate": 1.9292182401707603e-06, "loss": 0.4383, "step": 2595 }, { "epoch": 9.437386569872958, "grad_norm": 0.68359375, "learning_rate": 1.8071302737293295e-06, "loss": 0.4203, "step": 2600 }, { "epoch": 9.455535390199637, "grad_norm": 0.7265625, "learning_rate": 1.6889975017626903e-06, "loss": 0.4152, "step": 2605 }, { "epoch": 9.473684210526315, "grad_norm": 0.6875, "learning_rate": 1.574824682641629e-06, "loss": 0.4275, "step": 2610 }, { "epoch": 9.491833030852995, "grad_norm": 0.6640625, "learning_rate": 1.4646164152307018e-06, "loss": 0.4209, "step": 2615 }, { "epoch": 9.509981851179674, "grad_norm": 0.734375, "learning_rate": 1.3583771387028265e-06, "loss": 0.429, "step": 2620 }, { "epoch": 9.528130671506352, "grad_norm": 0.69140625, "learning_rate": 1.2561111323605712e-06, "loss": 0.4182, "step": 2625 }, { "epoch": 9.546279491833031, "grad_norm": 0.7109375, "learning_rate": 1.157822515463758e-06, "loss": 0.424, "step": 2630 }, { "epoch": 9.56442831215971, "grad_norm": 0.6875, "learning_rate": 1.0635152470635512e-06, "loss": 0.4211, "step": 2635 }, { "epoch": 9.582577132486389, "grad_norm": 0.69140625, "learning_rate": 9.731931258429638e-07, "loss": 0.4202, "step": 2640 }, { "epoch": 9.600725952813066, "grad_norm": 0.6953125, "learning_rate": 8.868597899638898e-07, "loss": 0.4183, "step": 2645 }, { "epoch": 9.618874773139746, "grad_norm": 0.6796875, "learning_rate": 8.04518716920466e-07, "loss": 0.4258, "step": 2650 }, { "epoch": 9.637023593466425, "grad_norm": 0.6953125, "learning_rate": 7.261732233991513e-07, "loss": 0.426, "step": 2655 }, { "epoch": 9.655172413793103, "grad_norm": 0.69921875, "learning_rate": 6.518264651449779e-07, "loss": 0.4341, "step": 2660 }, { "epoch": 9.673321234119783, "grad_norm": 0.7265625, "learning_rate": 5.814814368345412e-07, "loss": 0.4311, "step": 2665 }, { "epoch": 9.69147005444646, "grad_norm": 0.72265625, "learning_rate": 5.151409719553079e-07, "loss": 0.4219, "step": 2670 }, { "epoch": 9.70961887477314, "grad_norm": 0.6796875, "learning_rate": 4.5280774269154115e-07, "loss": 0.4205, "step": 2675 }, { "epoch": 9.727767695099818, "grad_norm": 0.74609375, "learning_rate": 3.9448425981661876e-07, "loss": 0.4268, "step": 2680 }, { "epoch": 9.745916515426497, "grad_norm": 0.69921875, "learning_rate": 3.401728725919373e-07, "loss": 0.4258, "step": 2685 }, { "epoch": 9.764065335753177, "grad_norm": 0.73046875, "learning_rate": 2.898757686722542e-07, "loss": 0.4191, "step": 2690 }, { "epoch": 9.782214156079855, "grad_norm": 0.78125, "learning_rate": 2.4359497401758024e-07, "loss": 0.4277, "step": 2695 }, { "epoch": 9.800362976406534, "grad_norm": 0.68359375, "learning_rate": 2.0133235281156736e-07, "loss": 0.434, "step": 2700 }, { "epoch": 9.818511796733212, "grad_norm": 0.69921875, "learning_rate": 1.630896073864352e-07, "loss": 0.4307, "step": 2705 }, { "epoch": 9.836660617059891, "grad_norm": 0.71484375, "learning_rate": 1.2886827815440372e-07, "loss": 0.4291, "step": 2710 }, { "epoch": 9.85480943738657, "grad_norm": 0.73046875, "learning_rate": 9.866974354560965e-08, "loss": 0.4284, "step": 2715 }, { "epoch": 9.872958257713249, "grad_norm": 0.796875, "learning_rate": 7.249521995263964e-08, "loss": 0.4319, "step": 2720 }, { "epoch": 9.891107078039928, "grad_norm": 0.71484375, "learning_rate": 5.0345761681491746e-08, "loss": 0.4329, "step": 2725 }, { "epoch": 9.909255898366606, "grad_norm": 0.7421875, "learning_rate": 3.2222260909087196e-08, "loss": 0.4218, "step": 2730 }, { "epoch": 9.927404718693285, "grad_norm": 1.0, "learning_rate": 1.81254476474213e-08, "loss": 0.4236, "step": 2735 }, { "epoch": 9.945553539019963, "grad_norm": 0.69921875, "learning_rate": 8.055889714064791e-09, "loss": 0.424, "step": 2740 }, { "epoch": 9.963702359346643, "grad_norm": 0.71875, "learning_rate": 2.0139927093487664e-09, "loss": 0.4362, "step": 2745 }, { "epoch": 9.98185117967332, "grad_norm": 0.69140625, "learning_rate": 0.0, "loss": 0.4229, "step": 2750 }, { "epoch": 9.98185117967332, "eval_loss": 2.9801077842712402, "eval_runtime": 1.0256, "eval_samples_per_second": 4.875, "eval_steps_per_second": 1.95, "step": 2750 }, { "epoch": 9.98185117967332, "step": 2750, "total_flos": 4.2044012259841147e+18, "train_loss": 1.3294648733139038, "train_runtime": 22350.6599, "train_samples_per_second": 1.971, "train_steps_per_second": 0.123 } ], "logging_steps": 5, "max_steps": 2750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.2044012259841147e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }