{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995621747712183, "eval_steps": 500, "global_step": 1088, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009187152341647227, "grad_norm": 0.12890625, "learning_rate": 1.8348623853211011e-06, "loss": 1.1775, "step": 1 }, { "epoch": 0.004593576170823614, "grad_norm": 0.1240234375, "learning_rate": 9.174311926605506e-06, "loss": 1.1877, "step": 5 }, { "epoch": 0.009187152341647228, "grad_norm": 0.1298828125, "learning_rate": 1.834862385321101e-05, "loss": 1.1744, "step": 10 }, { "epoch": 0.01378072851247084, "grad_norm": 0.11865234375, "learning_rate": 2.7522935779816515e-05, "loss": 1.1666, "step": 15 }, { "epoch": 0.018374304683294457, "grad_norm": 0.0947265625, "learning_rate": 3.669724770642202e-05, "loss": 1.1771, "step": 20 }, { "epoch": 0.02296788085411807, "grad_norm": 0.083984375, "learning_rate": 4.587155963302753e-05, "loss": 1.1437, "step": 25 }, { "epoch": 0.02756145702494168, "grad_norm": 0.07080078125, "learning_rate": 5.504587155963303e-05, "loss": 1.1373, "step": 30 }, { "epoch": 0.0321550331957653, "grad_norm": 0.0517578125, "learning_rate": 6.422018348623854e-05, "loss": 1.0934, "step": 35 }, { "epoch": 0.036748609366588914, "grad_norm": 0.0439453125, "learning_rate": 7.339449541284404e-05, "loss": 1.0912, "step": 40 }, { "epoch": 0.04134218553741253, "grad_norm": 0.041748046875, "learning_rate": 8.256880733944955e-05, "loss": 1.1026, "step": 45 }, { "epoch": 0.04593576170823614, "grad_norm": 0.0380859375, "learning_rate": 9.174311926605506e-05, "loss": 1.0824, "step": 50 }, { "epoch": 0.05052933787905975, "grad_norm": 0.03369140625, "learning_rate": 0.00010091743119266055, "loss": 1.1075, "step": 55 }, { "epoch": 0.05512291404988336, "grad_norm": 0.034423828125, "learning_rate": 0.00011009174311926606, "loss": 1.1073, "step": 60 }, { "epoch": 0.05971649022070698, "grad_norm": 0.0322265625, "learning_rate": 0.00011926605504587157, "loss": 1.0848, "step": 65 }, { "epoch": 0.0643100663915306, "grad_norm": 0.0294189453125, "learning_rate": 0.00012844036697247707, "loss": 1.072, "step": 70 }, { "epoch": 0.0689036425623542, "grad_norm": 0.0311279296875, "learning_rate": 0.00013761467889908258, "loss": 1.0763, "step": 75 }, { "epoch": 0.07349721873317783, "grad_norm": 0.0294189453125, "learning_rate": 0.0001467889908256881, "loss": 1.0737, "step": 80 }, { "epoch": 0.07809079490400143, "grad_norm": 0.0308837890625, "learning_rate": 0.0001559633027522936, "loss": 1.0564, "step": 85 }, { "epoch": 0.08268437107482506, "grad_norm": 0.030517578125, "learning_rate": 0.0001651376146788991, "loss": 1.0754, "step": 90 }, { "epoch": 0.08727794724564866, "grad_norm": 0.0296630859375, "learning_rate": 0.00017431192660550458, "loss": 1.0635, "step": 95 }, { "epoch": 0.09187152341647228, "grad_norm": 0.02978515625, "learning_rate": 0.00018348623853211012, "loss": 1.0757, "step": 100 }, { "epoch": 0.09646509958729589, "grad_norm": 0.0299072265625, "learning_rate": 0.0001926605504587156, "loss": 1.0715, "step": 105 }, { "epoch": 0.1010586757581195, "grad_norm": 0.0301513671875, "learning_rate": 0.00019999948512240548, "loss": 1.0733, "step": 110 }, { "epoch": 0.10565225192894312, "grad_norm": 0.0289306640625, "learning_rate": 0.00019998146496329837, "loss": 1.0891, "step": 115 }, { "epoch": 0.11024582809976673, "grad_norm": 0.03125, "learning_rate": 0.00019993770622619782, "loss": 1.0702, "step": 120 }, { "epoch": 0.11483940427059035, "grad_norm": 0.033447265625, "learning_rate": 0.00019986822017606848, "loss": 1.0861, "step": 125 }, { "epoch": 0.11943298044141396, "grad_norm": 0.0294189453125, "learning_rate": 0.00019977302470094708, "loss": 1.0615, "step": 130 }, { "epoch": 0.12402655661223758, "grad_norm": 0.0296630859375, "learning_rate": 0.00019965214430733754, "loss": 1.0673, "step": 135 }, { "epoch": 0.1286201327830612, "grad_norm": 0.031982421875, "learning_rate": 0.00019950561011390213, "loss": 1.0648, "step": 140 }, { "epoch": 0.1332137089538848, "grad_norm": 0.031494140625, "learning_rate": 0.00019933345984345037, "loss": 1.0685, "step": 145 }, { "epoch": 0.1378072851247084, "grad_norm": 0.0301513671875, "learning_rate": 0.00019913573781322818, "loss": 1.0663, "step": 150 }, { "epoch": 0.14240086129553203, "grad_norm": 0.0291748046875, "learning_rate": 0.00019891249492350887, "loss": 1.079, "step": 155 }, { "epoch": 0.14699443746635565, "grad_norm": 0.03076171875, "learning_rate": 0.00019866378864448985, "loss": 1.0959, "step": 160 }, { "epoch": 0.15158801363717925, "grad_norm": 0.030029296875, "learning_rate": 0.00019838968300149782, "loss": 1.0805, "step": 165 }, { "epoch": 0.15618158980800287, "grad_norm": 0.0302734375, "learning_rate": 0.00019809024855850662, "loss": 1.0644, "step": 170 }, { "epoch": 0.1607751659788265, "grad_norm": 0.02978515625, "learning_rate": 0.00019776556239997146, "loss": 1.0657, "step": 175 }, { "epoch": 0.1653687421496501, "grad_norm": 0.0289306640625, "learning_rate": 0.000197415708110985, "loss": 1.0559, "step": 180 }, { "epoch": 0.1699623183204737, "grad_norm": 0.02978515625, "learning_rate": 0.00019704077575575978, "loss": 1.0507, "step": 185 }, { "epoch": 0.17455589449129733, "grad_norm": 0.031005859375, "learning_rate": 0.00019664086185444246, "loss": 1.1005, "step": 190 }, { "epoch": 0.17914947066212095, "grad_norm": 0.02978515625, "learning_rate": 0.0001962160693582665, "loss": 1.0795, "step": 195 }, { "epoch": 0.18374304683294457, "grad_norm": 0.0291748046875, "learning_rate": 0.00019576650762304903, "loss": 1.0786, "step": 200 }, { "epoch": 0.18833662300376816, "grad_norm": 0.0284423828125, "learning_rate": 0.00019529229238103883, "loss": 1.0602, "step": 205 }, { "epoch": 0.19293019917459178, "grad_norm": 0.0301513671875, "learning_rate": 0.00019479354571112323, "loss": 1.072, "step": 210 }, { "epoch": 0.1975237753454154, "grad_norm": 0.0286865234375, "learning_rate": 0.00019427039600740072, "loss": 1.0903, "step": 215 }, { "epoch": 0.202117351516239, "grad_norm": 0.03076171875, "learning_rate": 0.00019372297794612817, "loss": 1.0734, "step": 220 }, { "epoch": 0.20671092768706262, "grad_norm": 0.029541015625, "learning_rate": 0.00019315143245105047, "loss": 1.0638, "step": 225 }, { "epoch": 0.21130450385788624, "grad_norm": 0.0302734375, "learning_rate": 0.00019255590665712214, "loss": 1.0663, "step": 230 }, { "epoch": 0.21589808002870986, "grad_norm": 0.029052734375, "learning_rate": 0.00019193655387262984, "loss": 1.0778, "step": 235 }, { "epoch": 0.22049165619953345, "grad_norm": 0.02978515625, "learning_rate": 0.00019129353353972581, "loss": 1.0732, "step": 240 }, { "epoch": 0.22508523237035707, "grad_norm": 0.029052734375, "learning_rate": 0.00019062701119338185, "loss": 1.0463, "step": 245 }, { "epoch": 0.2296788085411807, "grad_norm": 0.030517578125, "learning_rate": 0.0001899371584187753, "loss": 1.0674, "step": 250 }, { "epoch": 0.23427238471200432, "grad_norm": 0.0294189453125, "learning_rate": 0.00018922415280711716, "loss": 1.0611, "step": 255 }, { "epoch": 0.2388659608828279, "grad_norm": 0.0291748046875, "learning_rate": 0.00018848817790993432, "loss": 1.0603, "step": 260 }, { "epoch": 0.24345953705365153, "grad_norm": 0.0299072265625, "learning_rate": 0.00018772942319181696, "loss": 1.0497, "step": 265 }, { "epoch": 0.24805311322447515, "grad_norm": 0.0311279296875, "learning_rate": 0.0001869480839816443, "loss": 1.056, "step": 270 }, { "epoch": 0.2526466893952988, "grad_norm": 0.029541015625, "learning_rate": 0.0001861443614223002, "loss": 1.0677, "step": 275 }, { "epoch": 0.2572402655661224, "grad_norm": 0.0301513671875, "learning_rate": 0.00018531846241889245, "loss": 1.0646, "step": 280 }, { "epoch": 0.261833841736946, "grad_norm": 0.030029296875, "learning_rate": 0.0001844705995854882, "loss": 1.0962, "step": 285 }, { "epoch": 0.2664274179077696, "grad_norm": 0.0296630859375, "learning_rate": 0.0001836009911903803, "loss": 1.0587, "step": 290 }, { "epoch": 0.2710209940785932, "grad_norm": 0.0299072265625, "learning_rate": 0.00018270986109989744, "loss": 1.0724, "step": 295 }, { "epoch": 0.2756145702494168, "grad_norm": 0.0299072265625, "learning_rate": 0.00018179743872077359, "loss": 1.0816, "step": 300 }, { "epoch": 0.28020814642024044, "grad_norm": 0.030029296875, "learning_rate": 0.000180863958941091, "loss": 1.0654, "step": 305 }, { "epoch": 0.28480172259106407, "grad_norm": 0.032470703125, "learning_rate": 0.00017990966206981224, "loss": 1.0828, "step": 310 }, { "epoch": 0.2893952987618877, "grad_norm": 0.02978515625, "learning_rate": 0.0001789347937749164, "loss": 1.0492, "step": 315 }, { "epoch": 0.2939888749327113, "grad_norm": 0.0294189453125, "learning_rate": 0.00017793960502015613, "loss": 1.0845, "step": 320 }, { "epoch": 0.2985824511035349, "grad_norm": 0.0294189453125, "learning_rate": 0.0001769243520004511, "loss": 1.0536, "step": 325 }, { "epoch": 0.3031760272743585, "grad_norm": 0.030029296875, "learning_rate": 0.0001758892960759348, "loss": 1.0649, "step": 330 }, { "epoch": 0.3077696034451821, "grad_norm": 0.031005859375, "learning_rate": 0.00017483470370467178, "loss": 1.064, "step": 335 }, { "epoch": 0.31236317961600574, "grad_norm": 0.0296630859375, "learning_rate": 0.00017376084637406222, "loss": 1.0584, "step": 340 }, { "epoch": 0.31695675578682936, "grad_norm": 0.02978515625, "learning_rate": 0.00017266800053095232, "loss": 1.0616, "step": 345 }, { "epoch": 0.321550331957653, "grad_norm": 0.030029296875, "learning_rate": 0.0001715564475104673, "loss": 1.0868, "step": 350 }, { "epoch": 0.3261439081284766, "grad_norm": 0.030517578125, "learning_rate": 0.00017042647346358645, "loss": 1.0808, "step": 355 }, { "epoch": 0.3307374842993002, "grad_norm": 0.0294189453125, "learning_rate": 0.00016927836928347826, "loss": 1.0739, "step": 360 }, { "epoch": 0.3353310604701238, "grad_norm": 0.030517578125, "learning_rate": 0.00016811243053061487, "loss": 1.078, "step": 365 }, { "epoch": 0.3399246366409474, "grad_norm": 0.030517578125, "learning_rate": 0.00016692895735668476, "loss": 1.0578, "step": 370 }, { "epoch": 0.34451821281177103, "grad_norm": 0.0308837890625, "learning_rate": 0.00016572825442732366, "loss": 1.0535, "step": 375 }, { "epoch": 0.34911178898259465, "grad_norm": 0.03125, "learning_rate": 0.0001645106308436836, "loss": 1.0668, "step": 380 }, { "epoch": 0.35370536515341827, "grad_norm": 0.031494140625, "learning_rate": 0.00016327640006285967, "loss": 1.05, "step": 385 }, { "epoch": 0.3582989413242419, "grad_norm": 0.031494140625, "learning_rate": 0.00016202587981719584, "loss": 1.066, "step": 390 }, { "epoch": 0.3628925174950655, "grad_norm": 0.030517578125, "learning_rate": 0.0001607593920324899, "loss": 1.0622, "step": 395 }, { "epoch": 0.36748609366588914, "grad_norm": 0.031982421875, "learning_rate": 0.00015947726274511908, "loss": 1.0666, "step": 400 }, { "epoch": 0.3720796698367127, "grad_norm": 0.031494140625, "learning_rate": 0.0001581798220181073, "loss": 1.0531, "step": 405 }, { "epoch": 0.3766732460075363, "grad_norm": 0.0311279296875, "learning_rate": 0.00015686740385615586, "loss": 1.0829, "step": 410 }, { "epoch": 0.38126682217835994, "grad_norm": 0.030517578125, "learning_rate": 0.00015554034611965954, "loss": 1.0592, "step": 415 }, { "epoch": 0.38586039834918356, "grad_norm": 0.031494140625, "learning_rate": 0.00015419899043772994, "loss": 1.05, "step": 420 }, { "epoch": 0.3904539745200072, "grad_norm": 0.030029296875, "learning_rate": 0.00015284368212024877, "loss": 1.0732, "step": 425 }, { "epoch": 0.3950475506908308, "grad_norm": 0.031494140625, "learning_rate": 0.00015147477006897337, "loss": 1.0728, "step": 430 }, { "epoch": 0.3996411268616544, "grad_norm": 0.030029296875, "learning_rate": 0.000150092606687718, "loss": 1.0481, "step": 435 }, { "epoch": 0.404234703032478, "grad_norm": 0.03173828125, "learning_rate": 0.0001486975477916329, "loss": 1.0781, "step": 440 }, { "epoch": 0.4088282792033016, "grad_norm": 0.03125, "learning_rate": 0.00014728995251560596, "loss": 1.0703, "step": 445 }, { "epoch": 0.41342185537412524, "grad_norm": 0.0322265625, "learning_rate": 0.00014587018322180905, "loss": 1.0554, "step": 450 }, { "epoch": 0.41801543154494886, "grad_norm": 0.03076171875, "learning_rate": 0.00014443860540641406, "loss": 1.0626, "step": 455 }, { "epoch": 0.4226090077157725, "grad_norm": 0.033203125, "learning_rate": 0.00014299558760550184, "loss": 1.0918, "step": 460 }, { "epoch": 0.4272025838865961, "grad_norm": 0.0302734375, "learning_rate": 0.00014154150130018866, "loss": 1.0488, "step": 465 }, { "epoch": 0.4317961600574197, "grad_norm": 0.031494140625, "learning_rate": 0.0001400767208209946, "loss": 1.0599, "step": 470 }, { "epoch": 0.43638973622824334, "grad_norm": 0.0308837890625, "learning_rate": 0.0001386016232514784, "loss": 1.0669, "step": 475 }, { "epoch": 0.4409833123990669, "grad_norm": 0.0311279296875, "learning_rate": 0.00013711658833116323, "loss": 1.0721, "step": 480 }, { "epoch": 0.44557688856989053, "grad_norm": 0.03173828125, "learning_rate": 0.00013562199835777934, "loss": 1.0829, "step": 485 }, { "epoch": 0.45017046474071415, "grad_norm": 0.03125, "learning_rate": 0.00013411823808884765, "loss": 1.0611, "step": 490 }, { "epoch": 0.45476404091153777, "grad_norm": 0.031494140625, "learning_rate": 0.00013260569464263036, "loss": 1.0547, "step": 495 }, { "epoch": 0.4593576170823614, "grad_norm": 0.032958984375, "learning_rate": 0.00013108475739847362, "loss": 1.0635, "step": 500 }, { "epoch": 0.463951193253185, "grad_norm": 0.0302734375, "learning_rate": 0.00012955581789656843, "loss": 1.0746, "step": 505 }, { "epoch": 0.46854476942400863, "grad_norm": 0.03125, "learning_rate": 0.00012801926973715483, "loss": 1.0665, "step": 510 }, { "epoch": 0.4731383455948322, "grad_norm": 0.03125, "learning_rate": 0.0001264755084791963, "loss": 1.0547, "step": 515 }, { "epoch": 0.4777319217656558, "grad_norm": 0.03271484375, "learning_rate": 0.00012492493153854937, "loss": 1.0508, "step": 520 }, { "epoch": 0.48232549793647944, "grad_norm": 0.03173828125, "learning_rate": 0.0001233679380856557, "loss": 1.082, "step": 525 }, { "epoch": 0.48691907410730306, "grad_norm": 0.03173828125, "learning_rate": 0.00012180492894278206, "loss": 1.0656, "step": 530 }, { "epoch": 0.4915126502781267, "grad_norm": 0.032958984375, "learning_rate": 0.00012023630648083528, "loss": 1.0703, "step": 535 }, { "epoch": 0.4961062264489503, "grad_norm": 0.031982421875, "learning_rate": 0.00011866247451577864, "loss": 1.0685, "step": 540 }, { "epoch": 0.5006998026197739, "grad_norm": 0.0306396484375, "learning_rate": 0.00011708383820467595, "loss": 1.0568, "step": 545 }, { "epoch": 0.5052933787905975, "grad_norm": 0.0322265625, "learning_rate": 0.00011550080394139062, "loss": 1.0405, "step": 550 }, { "epoch": 0.5098869549614211, "grad_norm": 0.033203125, "learning_rate": 0.00011391377925196626, "loss": 1.0654, "step": 555 }, { "epoch": 0.5144805311322448, "grad_norm": 0.031005859375, "learning_rate": 0.00011232317268971585, "loss": 1.0578, "step": 560 }, { "epoch": 0.5190741073030684, "grad_norm": 0.03125, "learning_rate": 0.00011072939373004647, "loss": 1.0581, "step": 565 }, { "epoch": 0.523667683473892, "grad_norm": 0.031005859375, "learning_rate": 0.00010913285266504636, "loss": 1.0464, "step": 570 }, { "epoch": 0.5282612596447156, "grad_norm": 0.0308837890625, "learning_rate": 0.0001075339604978624, "loss": 1.0744, "step": 575 }, { "epoch": 0.5328548358155392, "grad_norm": 0.0308837890625, "learning_rate": 0.0001059331288368938, "loss": 1.0675, "step": 580 }, { "epoch": 0.5374484119863628, "grad_norm": 0.031982421875, "learning_rate": 0.0001043307697898305, "loss": 1.0682, "step": 585 }, { "epoch": 0.5420419881571864, "grad_norm": 0.032470703125, "learning_rate": 0.00010272729585756275, "loss": 1.0577, "step": 590 }, { "epoch": 0.5466355643280101, "grad_norm": 0.0311279296875, "learning_rate": 0.00010112311982798959, "loss": 1.0535, "step": 595 }, { "epoch": 0.5512291404988336, "grad_norm": 0.0306396484375, "learning_rate": 9.951865466975344e-05, "loss": 1.0713, "step": 600 }, { "epoch": 0.5558227166696573, "grad_norm": 0.0311279296875, "learning_rate": 9.791431342592811e-05, "loss": 1.0532, "step": 605 }, { "epoch": 0.5604162928404809, "grad_norm": 0.0311279296875, "learning_rate": 9.631050910768773e-05, "loss": 1.0807, "step": 610 }, { "epoch": 0.5650098690113045, "grad_norm": 0.031005859375, "learning_rate": 9.470765458798368e-05, "loss": 1.0504, "step": 615 }, { "epoch": 0.5696034451821281, "grad_norm": 0.032470703125, "learning_rate": 9.310616249525759e-05, "loss": 1.0507, "step": 620 }, { "epoch": 0.5741970213529517, "grad_norm": 0.0308837890625, "learning_rate": 9.15064451072165e-05, "loss": 1.0434, "step": 625 }, { "epoch": 0.5787905975237754, "grad_norm": 0.031005859375, "learning_rate": 8.990891424469927e-05, "loss": 1.0744, "step": 630 }, { "epoch": 0.5833841736945989, "grad_norm": 0.03173828125, "learning_rate": 8.831398116565964e-05, "loss": 1.0792, "step": 635 }, { "epoch": 0.5879777498654226, "grad_norm": 0.03173828125, "learning_rate": 8.672205645929503e-05, "loss": 1.0723, "step": 640 }, { "epoch": 0.5925713260362462, "grad_norm": 0.031982421875, "learning_rate": 8.513354994034681e-05, "loss": 1.0627, "step": 645 }, { "epoch": 0.5971649022070697, "grad_norm": 0.031982421875, "learning_rate": 8.35488705436006e-05, "loss": 1.0844, "step": 650 }, { "epoch": 0.6017584783778934, "grad_norm": 0.031982421875, "learning_rate": 8.196842621861242e-05, "loss": 1.0566, "step": 655 }, { "epoch": 0.606352054548717, "grad_norm": 0.03173828125, "learning_rate": 8.039262382468905e-05, "loss": 1.0697, "step": 660 }, { "epoch": 0.6109456307195407, "grad_norm": 0.03271484375, "learning_rate": 7.882186902614865e-05, "loss": 1.0719, "step": 665 }, { "epoch": 0.6155392068903642, "grad_norm": 0.032470703125, "learning_rate": 7.725656618788937e-05, "loss": 1.0519, "step": 670 }, { "epoch": 0.6201327830611879, "grad_norm": 0.03271484375, "learning_rate": 7.569711827129208e-05, "loss": 1.0508, "step": 675 }, { "epoch": 0.6247263592320115, "grad_norm": 0.031982421875, "learning_rate": 7.414392673048478e-05, "loss": 1.0631, "step": 680 }, { "epoch": 0.6293199354028352, "grad_norm": 0.031494140625, "learning_rate": 7.259739140899462e-05, "loss": 1.0702, "step": 685 }, { "epoch": 0.6339135115736587, "grad_norm": 0.031494140625, "learning_rate": 7.105791043681519e-05, "loss": 1.0642, "step": 690 }, { "epoch": 0.6385070877444823, "grad_norm": 0.031494140625, "learning_rate": 6.952588012791405e-05, "loss": 1.0585, "step": 695 }, { "epoch": 0.643100663915306, "grad_norm": 0.03125, "learning_rate": 6.80016948782086e-05, "loss": 1.0691, "step": 700 }, { "epoch": 0.6476942400861295, "grad_norm": 0.031494140625, "learning_rate": 6.648574706403522e-05, "loss": 1.0614, "step": 705 }, { "epoch": 0.6522878162569532, "grad_norm": 0.031982421875, "learning_rate": 6.497842694113842e-05, "loss": 1.0649, "step": 710 }, { "epoch": 0.6568813924277768, "grad_norm": 0.03173828125, "learning_rate": 6.348012254420606e-05, "loss": 1.07, "step": 715 }, { "epoch": 0.6614749685986004, "grad_norm": 0.0308837890625, "learning_rate": 6.199121958697604e-05, "loss": 1.0709, "step": 720 }, { "epoch": 0.666068544769424, "grad_norm": 0.03466796875, "learning_rate": 6.051210136294089e-05, "loss": 1.063, "step": 725 }, { "epoch": 0.6706621209402476, "grad_norm": 0.03173828125, "learning_rate": 5.904314864667497e-05, "loss": 1.0622, "step": 730 }, { "epoch": 0.6752556971110713, "grad_norm": 0.031005859375, "learning_rate": 5.758473959581061e-05, "loss": 1.0558, "step": 735 }, { "epoch": 0.6798492732818948, "grad_norm": 0.03125, "learning_rate": 5.613724965368723e-05, "loss": 1.0724, "step": 740 }, { "epoch": 0.6844428494527185, "grad_norm": 0.031982421875, "learning_rate": 5.4701051452700245e-05, "loss": 1.0675, "step": 745 }, { "epoch": 0.6890364256235421, "grad_norm": 0.0322265625, "learning_rate": 5.327651471837242e-05, "loss": 1.079, "step": 750 }, { "epoch": 0.6936300017943657, "grad_norm": 0.03173828125, "learning_rate": 5.1864006174174504e-05, "loss": 1.0626, "step": 755 }, { "epoch": 0.6982235779651893, "grad_norm": 0.033447265625, "learning_rate": 5.046388944711824e-05, "loss": 1.0637, "step": 760 }, { "epoch": 0.7028171541360129, "grad_norm": 0.0322265625, "learning_rate": 4.9076524974146507e-05, "loss": 1.0565, "step": 765 }, { "epoch": 0.7074107303068365, "grad_norm": 0.031494140625, "learning_rate": 4.7702269909344907e-05, "loss": 1.068, "step": 770 }, { "epoch": 0.7120043064776601, "grad_norm": 0.031494140625, "learning_rate": 4.6341478031998265e-05, "loss": 1.0632, "step": 775 }, { "epoch": 0.7165978826484838, "grad_norm": 0.0322265625, "learning_rate": 4.4994499655515865e-05, "loss": 1.0809, "step": 780 }, { "epoch": 0.7211914588193074, "grad_norm": 0.03173828125, "learning_rate": 4.3661681537249455e-05, "loss": 1.0694, "step": 785 }, { "epoch": 0.725785034990131, "grad_norm": 0.03271484375, "learning_rate": 4.234336678922569e-05, "loss": 1.06, "step": 790 }, { "epoch": 0.7303786111609546, "grad_norm": 0.03173828125, "learning_rate": 4.103989478981827e-05, "loss": 1.0498, "step": 795 }, { "epoch": 0.7349721873317783, "grad_norm": 0.0322265625, "learning_rate": 3.975160109637992e-05, "loss": 1.0527, "step": 800 }, { "epoch": 0.7395657635026018, "grad_norm": 0.033447265625, "learning_rate": 3.847881735885918e-05, "loss": 1.0593, "step": 805 }, { "epoch": 0.7441593396734254, "grad_norm": 0.031982421875, "learning_rate": 3.722187123442249e-05, "loss": 1.0568, "step": 810 }, { "epoch": 0.7487529158442491, "grad_norm": 0.0322265625, "learning_rate": 3.598108630310399e-05, "loss": 1.0651, "step": 815 }, { "epoch": 0.7533464920150726, "grad_norm": 0.031494140625, "learning_rate": 3.475678198450555e-05, "loss": 1.0451, "step": 820 }, { "epoch": 0.7579400681858963, "grad_norm": 0.032470703125, "learning_rate": 3.354927345556723e-05, "loss": 1.0591, "step": 825 }, { "epoch": 0.7625336443567199, "grad_norm": 0.03173828125, "learning_rate": 3.235887156943029e-05, "loss": 1.0519, "step": 830 }, { "epoch": 0.7671272205275436, "grad_norm": 0.031982421875, "learning_rate": 3.118588277541312e-05, "loss": 1.0833, "step": 835 }, { "epoch": 0.7717207966983671, "grad_norm": 0.03271484375, "learning_rate": 3.003060904012096e-05, "loss": 1.0543, "step": 840 }, { "epoch": 0.7763143728691907, "grad_norm": 0.031982421875, "learning_rate": 2.8893347769709476e-05, "loss": 1.0755, "step": 845 }, { "epoch": 0.7809079490400144, "grad_norm": 0.03271484375, "learning_rate": 2.7774391733322713e-05, "loss": 1.0682, "step": 850 }, { "epoch": 0.7855015252108379, "grad_norm": 0.03173828125, "learning_rate": 2.6674028987724163e-05, "loss": 1.064, "step": 855 }, { "epoch": 0.7900951013816616, "grad_norm": 0.031494140625, "learning_rate": 2.559254280314156e-05, "loss": 1.0648, "step": 860 }, { "epoch": 0.7946886775524852, "grad_norm": 0.031982421875, "learning_rate": 2.4530211590343578e-05, "loss": 1.0699, "step": 865 }, { "epoch": 0.7992822537233089, "grad_norm": 0.03173828125, "learning_rate": 2.3487308828967493e-05, "loss": 1.0647, "step": 870 }, { "epoch": 0.8038758298941324, "grad_norm": 0.031982421875, "learning_rate": 2.2464102997116475e-05, "loss": 1.0636, "step": 875 }, { "epoch": 0.808469406064956, "grad_norm": 0.03125, "learning_rate": 2.1460857502244248e-05, "loss": 1.0793, "step": 880 }, { "epoch": 0.8130629822357797, "grad_norm": 0.03125, "learning_rate": 2.047783061334523e-05, "loss": 1.0501, "step": 885 }, { "epoch": 0.8176565584066032, "grad_norm": 0.031494140625, "learning_rate": 1.9515275394467446e-05, "loss": 1.0553, "step": 890 }, { "epoch": 0.8222501345774269, "grad_norm": 0.03173828125, "learning_rate": 1.8573439639565282e-05, "loss": 1.059, "step": 895 }, { "epoch": 0.8268437107482505, "grad_norm": 0.031494140625, "learning_rate": 1.765256580870924e-05, "loss": 1.0653, "step": 900 }, { "epoch": 0.8314372869190741, "grad_norm": 0.0322265625, "learning_rate": 1.6752890965668266e-05, "loss": 1.0616, "step": 905 }, { "epoch": 0.8360308630898977, "grad_norm": 0.031982421875, "learning_rate": 1.587464671688187e-05, "loss": 1.0612, "step": 910 }, { "epoch": 0.8406244392607214, "grad_norm": 0.031982421875, "learning_rate": 1.501805915183685e-05, "loss": 1.0723, "step": 915 }, { "epoch": 0.845218015431545, "grad_norm": 0.031982421875, "learning_rate": 1.4183348784864037e-05, "loss": 1.0664, "step": 920 }, { "epoch": 0.8498115916023685, "grad_norm": 0.03125, "learning_rate": 1.3370730498370831e-05, "loss": 1.0693, "step": 925 }, { "epoch": 0.8544051677731922, "grad_norm": 0.03173828125, "learning_rate": 1.258041348752308e-05, "loss": 1.0715, "step": 930 }, { "epoch": 0.8589987439440158, "grad_norm": 0.031982421875, "learning_rate": 1.1812601206391304e-05, "loss": 1.0755, "step": 935 }, { "epoch": 0.8635923201148394, "grad_norm": 0.03125, "learning_rate": 1.1067491315574797e-05, "loss": 1.0522, "step": 940 }, { "epoch": 0.868185896285663, "grad_norm": 0.03173828125, "learning_rate": 1.0345275631317163e-05, "loss": 1.0495, "step": 945 }, { "epoch": 0.8727794724564867, "grad_norm": 0.03173828125, "learning_rate": 9.64614007612633e-06, "loss": 1.0637, "step": 950 }, { "epoch": 0.8773730486273102, "grad_norm": 0.031494140625, "learning_rate": 8.970264630912061e-06, "loss": 1.064, "step": 955 }, { "epoch": 0.8819666247981338, "grad_norm": 0.031494140625, "learning_rate": 8.317823288652526e-06, "loss": 1.0457, "step": 960 }, { "epoch": 0.8865602009689575, "grad_norm": 0.03173828125, "learning_rate": 7.688984009603062e-06, "loss": 1.0633, "step": 965 }, { "epoch": 0.8911537771397811, "grad_norm": 0.0322265625, "learning_rate": 7.083908678057194e-06, "loss": 1.0677, "step": 970 }, { "epoch": 0.8957473533106047, "grad_norm": 0.03125, "learning_rate": 6.502753060672495e-06, "loss": 1.0482, "step": 975 }, { "epoch": 0.9003409294814283, "grad_norm": 0.0322265625, "learning_rate": 5.945666766370861e-06, "loss": 1.0749, "step": 980 }, { "epoch": 0.904934505652252, "grad_norm": 0.03173828125, "learning_rate": 5.412793207824252e-06, "loss": 1.0588, "step": 985 }, { "epoch": 0.9095280818230755, "grad_norm": 0.031494140625, "learning_rate": 4.904269564535391e-06, "loss": 1.0575, "step": 990 }, { "epoch": 0.9141216579938991, "grad_norm": 0.03173828125, "learning_rate": 4.42022674752326e-06, "loss": 1.0554, "step": 995 }, { "epoch": 0.9187152341647228, "grad_norm": 0.03271484375, "learning_rate": 3.960789365622075e-06, "loss": 1.0504, "step": 1000 }, { "epoch": 0.9233088103355463, "grad_norm": 0.031494140625, "learning_rate": 3.526075693402986e-06, "loss": 1.064, "step": 1005 }, { "epoch": 0.92790238650637, "grad_norm": 0.031494140625, "learning_rate": 3.116197640726104e-06, "loss": 1.0566, "step": 1010 }, { "epoch": 0.9324959626771936, "grad_norm": 0.03173828125, "learning_rate": 2.7312607239311505e-06, "loss": 1.0623, "step": 1015 }, { "epoch": 0.9370895388480173, "grad_norm": 0.032958984375, "learning_rate": 2.3713640386741396e-06, "loss": 1.0708, "step": 1020 }, { "epoch": 0.9416831150188408, "grad_norm": 0.032470703125, "learning_rate": 2.0366002344166745e-06, "loss": 1.0646, "step": 1025 }, { "epoch": 0.9462766911896644, "grad_norm": 0.03173828125, "learning_rate": 1.7270554905750137e-06, "loss": 1.0753, "step": 1030 }, { "epoch": 0.9508702673604881, "grad_norm": 0.031494140625, "learning_rate": 1.4428094943345294e-06, "loss": 1.0618, "step": 1035 }, { "epoch": 0.9554638435313116, "grad_norm": 0.03173828125, "learning_rate": 1.1839354201355513e-06, "loss": 1.0533, "step": 1040 }, { "epoch": 0.9600574197021353, "grad_norm": 0.03125, "learning_rate": 9.50499910835867e-07, "loss": 1.0471, "step": 1045 }, { "epoch": 0.9646509958729589, "grad_norm": 0.03173828125, "learning_rate": 7.425630605545575e-07, "loss": 1.0564, "step": 1050 }, { "epoch": 0.9692445720437826, "grad_norm": 0.0322265625, "learning_rate": 5.60178399201805e-07, "loss": 1.0793, "step": 1055 }, { "epoch": 0.9738381482146061, "grad_norm": 0.03173828125, "learning_rate": 4.0339287869847197e-07, "loss": 1.0729, "step": 1060 }, { "epoch": 0.9784317243854298, "grad_norm": 0.031982421875, "learning_rate": 2.7224686088910265e-07, "loss": 1.0542, "step": 1065 }, { "epoch": 0.9830253005562534, "grad_norm": 0.03173828125, "learning_rate": 1.6677410715149054e-07, "loss": 1.0532, "step": 1070 }, { "epoch": 0.9876188767270769, "grad_norm": 0.031982421875, "learning_rate": 8.700176970527497e-08, "loss": 1.0724, "step": 1075 }, { "epoch": 0.9922124528979006, "grad_norm": 0.0322265625, "learning_rate": 3.29503846221213e-08, "loss": 1.0617, "step": 1080 }, { "epoch": 0.9968060290687242, "grad_norm": 0.031494140625, "learning_rate": 4.633866539005549e-09, "loss": 1.0546, "step": 1085 }, { "epoch": 0.9995621747712183, "eval_loss": 1.0676658153533936, "eval_runtime": 1909.6941, "eval_samples_per_second": 8.075, "eval_steps_per_second": 8.075, "step": 1088 }, { "epoch": 0.9995621747712183, "step": 1088, "total_flos": 3.023110834516656e+18, "train_loss": 0.22257287449696483, "train_runtime": 15104.5697, "train_samples_per_second": 9.224, "train_steps_per_second": 0.072 } ], "logging_steps": 5, "max_steps": 1088, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 3.023110834516656e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }