{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1025, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004878048780487805, "grad_norm": 616.0, "learning_rate": 1.941747572815534e-06, "loss": 39.9034, "step": 1 }, { "epoch": 0.024390243902439025, "grad_norm": 466.0, "learning_rate": 9.70873786407767e-06, "loss": 34.9341, "step": 5 }, { "epoch": 0.04878048780487805, "grad_norm": 251.0, "learning_rate": 1.941747572815534e-05, "loss": 29.4015, "step": 10 }, { "epoch": 0.07317073170731707, "grad_norm": 56.25, "learning_rate": 2.912621359223301e-05, "loss": 17.9289, "step": 15 }, { "epoch": 0.0975609756097561, "grad_norm": 39.75, "learning_rate": 3.883495145631068e-05, "loss": 14.9717, "step": 20 }, { "epoch": 0.12195121951219512, "grad_norm": 15.75, "learning_rate": 4.854368932038835e-05, "loss": 12.4657, "step": 25 }, { "epoch": 0.14634146341463414, "grad_norm": 5.15625, "learning_rate": 5.825242718446602e-05, "loss": 10.6748, "step": 30 }, { "epoch": 0.17073170731707318, "grad_norm": 3.890625, "learning_rate": 6.79611650485437e-05, "loss": 9.811, "step": 35 }, { "epoch": 0.1951219512195122, "grad_norm": 6.75, "learning_rate": 7.766990291262136e-05, "loss": 9.6309, "step": 40 }, { "epoch": 0.21951219512195122, "grad_norm": 15.3125, "learning_rate": 8.737864077669902e-05, "loss": 8.5315, "step": 45 }, { "epoch": 0.24390243902439024, "grad_norm": 31.75, "learning_rate": 9.70873786407767e-05, "loss": 6.2519, "step": 50 }, { "epoch": 0.2682926829268293, "grad_norm": 5.25, "learning_rate": 0.00010679611650485437, "loss": 2.2665, "step": 55 }, { "epoch": 0.2926829268292683, "grad_norm": 2.40625, "learning_rate": 0.00011650485436893204, "loss": 1.5008, "step": 60 }, { "epoch": 0.3170731707317073, "grad_norm": 1.2578125, "learning_rate": 0.00012621359223300972, "loss": 1.3289, "step": 65 }, { "epoch": 0.34146341463414637, "grad_norm": 2.796875, "learning_rate": 0.0001359223300970874, "loss": 1.2198, "step": 70 }, { "epoch": 0.36585365853658536, "grad_norm": 1.4375, "learning_rate": 0.00014563106796116506, "loss": 1.1493, "step": 75 }, { "epoch": 0.3902439024390244, "grad_norm": 8.1875, "learning_rate": 0.0001553398058252427, "loss": 1.0862, "step": 80 }, { "epoch": 0.4146341463414634, "grad_norm": 3.0625, "learning_rate": 0.0001650485436893204, "loss": 1.0768, "step": 85 }, { "epoch": 0.43902439024390244, "grad_norm": 1.2109375, "learning_rate": 0.00017475728155339805, "loss": 1.0468, "step": 90 }, { "epoch": 0.4634146341463415, "grad_norm": 4.21875, "learning_rate": 0.00018446601941747576, "loss": 0.9991, "step": 95 }, { "epoch": 0.4878048780487805, "grad_norm": 2.0625, "learning_rate": 0.0001941747572815534, "loss": 0.9596, "step": 100 }, { "epoch": 0.5121951219512195, "grad_norm": 1.5078125, "learning_rate": 0.00019999767797859854, "loss": 0.9712, "step": 105 }, { "epoch": 0.5365853658536586, "grad_norm": 0.890625, "learning_rate": 0.0001999715564762413, "loss": 0.9139, "step": 110 }, { "epoch": 0.5609756097560976, "grad_norm": 1.1875, "learning_rate": 0.00019991641855173097, "loss": 0.9175, "step": 115 }, { "epoch": 0.5853658536585366, "grad_norm": 1.25, "learning_rate": 0.00019983228020867242, "loss": 0.8959, "step": 120 }, { "epoch": 0.6097560975609756, "grad_norm": 1.3203125, "learning_rate": 0.00019971916586794867, "loss": 0.8605, "step": 125 }, { "epoch": 0.6341463414634146, "grad_norm": 0.98828125, "learning_rate": 0.00019957710836063263, "loss": 0.8857, "step": 130 }, { "epoch": 0.6585365853658537, "grad_norm": 1.8046875, "learning_rate": 0.00019940614891845809, "loss": 0.8533, "step": 135 }, { "epoch": 0.6829268292682927, "grad_norm": 1.9140625, "learning_rate": 0.00019920633716185226, "loss": 0.8332, "step": 140 }, { "epoch": 0.7073170731707317, "grad_norm": 0.9140625, "learning_rate": 0.00019897773108553378, "loss": 0.8485, "step": 145 }, { "epoch": 0.7317073170731707, "grad_norm": 1.9296875, "learning_rate": 0.00019872039704167964, "loss": 0.8479, "step": 150 }, { "epoch": 0.7560975609756098, "grad_norm": 1.4921875, "learning_rate": 0.00019843440972066697, "loss": 0.8418, "step": 155 }, { "epoch": 0.7804878048780488, "grad_norm": 1.9296875, "learning_rate": 0.00019811985212939416, "loss": 0.8513, "step": 160 }, { "epoch": 0.8048780487804879, "grad_norm": 1.5234375, "learning_rate": 0.00019777681556718864, "loss": 0.8217, "step": 165 }, { "epoch": 0.8292682926829268, "grad_norm": 1.8671875, "learning_rate": 0.00019740539959930725, "loss": 0.8169, "step": 170 }, { "epoch": 0.8536585365853658, "grad_norm": 1.046875, "learning_rate": 0.00019700571202803797, "loss": 0.8386, "step": 175 }, { "epoch": 0.8780487804878049, "grad_norm": 1.2421875, "learning_rate": 0.00019657786886141052, "loss": 0.8144, "step": 180 }, { "epoch": 0.9024390243902439, "grad_norm": 4.15625, "learning_rate": 0.00019612199427952552, "loss": 0.8229, "step": 185 }, { "epoch": 0.926829268292683, "grad_norm": 0.70703125, "learning_rate": 0.00019563822059851145, "loss": 0.8128, "step": 190 }, { "epoch": 0.9512195121951219, "grad_norm": 0.97265625, "learning_rate": 0.00019512668823212055, "loss": 0.8072, "step": 195 }, { "epoch": 0.975609756097561, "grad_norm": 0.6484375, "learning_rate": 0.0001945875456509739, "loss": 0.8097, "step": 200 }, { "epoch": 1.0, "grad_norm": 0.6953125, "learning_rate": 0.00019402094933946857, "loss": 0.7892, "step": 205 }, { "epoch": 1.0, "eval_loss": 2.2406020164489746, "eval_runtime": 0.9998, "eval_samples_per_second": 5.001, "eval_steps_per_second": 2.0, "step": 205 }, { "epoch": 1.024390243902439, "grad_norm": 0.6796875, "learning_rate": 0.0001934270637503584, "loss": 0.7753, "step": 210 }, { "epoch": 1.048780487804878, "grad_norm": 0.71875, "learning_rate": 0.00019280606125702203, "loss": 0.7289, "step": 215 }, { "epoch": 1.0731707317073171, "grad_norm": 0.625, "learning_rate": 0.00019215812210343226, "loss": 0.7656, "step": 220 }, { "epoch": 1.0975609756097562, "grad_norm": 0.65625, "learning_rate": 0.00019148343435184079, "loss": 0.7603, "step": 225 }, { "epoch": 1.1219512195121952, "grad_norm": 0.671875, "learning_rate": 0.00019078219382819353, "loss": 0.7491, "step": 230 }, { "epoch": 1.146341463414634, "grad_norm": 0.56640625, "learning_rate": 0.00019005460406529311, "loss": 0.7249, "step": 235 }, { "epoch": 1.170731707317073, "grad_norm": 0.57421875, "learning_rate": 0.00018930087624372387, "loss": 0.7755, "step": 240 }, { "epoch": 1.1951219512195121, "grad_norm": 0.703125, "learning_rate": 0.00018852122913055742, "loss": 0.7553, "step": 245 }, { "epoch": 1.2195121951219512, "grad_norm": 0.890625, "learning_rate": 0.00018771588901585635, "loss": 0.7482, "step": 250 }, { "epoch": 1.2439024390243902, "grad_norm": 0.77734375, "learning_rate": 0.00018688508964699404, "loss": 0.7402, "step": 255 }, { "epoch": 1.2682926829268293, "grad_norm": 1.015625, "learning_rate": 0.00018602907216081044, "loss": 0.7263, "step": 260 }, { "epoch": 1.2926829268292683, "grad_norm": 1.3828125, "learning_rate": 0.0001851480850136228, "loss": 0.7435, "step": 265 }, { "epoch": 1.3170731707317074, "grad_norm": 0.58984375, "learning_rate": 0.00018424238390911198, "loss": 0.7457, "step": 270 }, { "epoch": 1.3414634146341464, "grad_norm": 1.0078125, "learning_rate": 0.00018331223172410535, "loss": 0.7483, "step": 275 }, { "epoch": 1.3658536585365852, "grad_norm": 0.609375, "learning_rate": 0.00018235789843227756, "loss": 0.7486, "step": 280 }, { "epoch": 1.3902439024390243, "grad_norm": 1.265625, "learning_rate": 0.00018137966102579176, "loss": 0.7548, "step": 285 }, { "epoch": 1.4146341463414633, "grad_norm": 1.1953125, "learning_rate": 0.00018037780343490312, "loss": 0.7282, "step": 290 }, { "epoch": 1.4390243902439024, "grad_norm": 0.6875, "learning_rate": 0.00017935261644554942, "loss": 0.7415, "step": 295 }, { "epoch": 1.4634146341463414, "grad_norm": 0.73046875, "learning_rate": 0.0001783043976149511, "loss": 0.7362, "step": 300 }, { "epoch": 1.4878048780487805, "grad_norm": 0.578125, "learning_rate": 0.0001772334511852463, "loss": 0.724, "step": 305 }, { "epoch": 1.5121951219512195, "grad_norm": 0.59765625, "learning_rate": 0.0001761400879951856, "loss": 0.7249, "step": 310 }, { "epoch": 1.5365853658536586, "grad_norm": 1.6953125, "learning_rate": 0.00017502462538991205, "loss": 0.7212, "step": 315 }, { "epoch": 1.5609756097560976, "grad_norm": 0.6328125, "learning_rate": 0.00017388738712885275, "loss": 0.7194, "step": 320 }, { "epoch": 1.5853658536585367, "grad_norm": 1.3125, "learning_rate": 0.0001727287032917487, "loss": 0.765, "step": 325 }, { "epoch": 1.6097560975609757, "grad_norm": 1.578125, "learning_rate": 0.00017154891018285028, "loss": 0.7318, "step": 330 }, { "epoch": 1.6341463414634148, "grad_norm": 0.50390625, "learning_rate": 0.00017034835023330597, "loss": 0.7396, "step": 335 }, { "epoch": 1.6585365853658538, "grad_norm": 2.0625, "learning_rate": 0.00016912737190177292, "loss": 0.737, "step": 340 }, { "epoch": 1.6829268292682928, "grad_norm": 0.59765625, "learning_rate": 0.00016788632957327772, "loss": 0.7302, "step": 345 }, { "epoch": 1.7073170731707317, "grad_norm": 0.65234375, "learning_rate": 0.00016662558345635753, "loss": 0.7193, "step": 350 }, { "epoch": 1.7317073170731707, "grad_norm": 0.76953125, "learning_rate": 0.00016534549947851062, "loss": 0.7147, "step": 355 }, { "epoch": 1.7560975609756098, "grad_norm": 0.73828125, "learning_rate": 0.00016404644917998698, "loss": 0.7539, "step": 360 }, { "epoch": 1.7804878048780488, "grad_norm": 0.9453125, "learning_rate": 0.00016272880960595024, "loss": 0.754, "step": 365 }, { "epoch": 1.8048780487804879, "grad_norm": 0.90625, "learning_rate": 0.00016139296319704117, "loss": 0.7346, "step": 370 }, { "epoch": 1.8292682926829267, "grad_norm": 0.89453125, "learning_rate": 0.00016003929767837588, "loss": 0.6961, "step": 375 }, { "epoch": 1.8536585365853657, "grad_norm": 1.078125, "learning_rate": 0.00015866820594700944, "loss": 0.7259, "step": 380 }, { "epoch": 1.8780487804878048, "grad_norm": 0.65625, "learning_rate": 0.00015728008595789926, "loss": 0.7198, "step": 385 }, { "epoch": 1.9024390243902438, "grad_norm": 0.515625, "learning_rate": 0.0001558753406083995, "loss": 0.6858, "step": 390 }, { "epoch": 1.9268292682926829, "grad_norm": 0.60546875, "learning_rate": 0.00015445437762132174, "loss": 0.6987, "step": 395 }, { "epoch": 1.951219512195122, "grad_norm": 0.5625, "learning_rate": 0.0001530176094265945, "loss": 0.7054, "step": 400 }, { "epoch": 1.975609756097561, "grad_norm": 0.62890625, "learning_rate": 0.00015156545304155698, "loss": 0.7226, "step": 405 }, { "epoch": 2.0, "grad_norm": 0.52734375, "learning_rate": 0.00015009832994992102, "loss": 0.7033, "step": 410 }, { "epoch": 2.0, "eval_loss": 2.2004494667053223, "eval_runtime": 1.0005, "eval_samples_per_second": 4.998, "eval_steps_per_second": 1.999, "step": 410 }, { "epoch": 2.024390243902439, "grad_norm": 0.6328125, "learning_rate": 0.0001486166659794368, "loss": 0.6398, "step": 415 }, { "epoch": 2.048780487804878, "grad_norm": 0.65234375, "learning_rate": 0.00014712089117829776, "loss": 0.647, "step": 420 }, { "epoch": 2.073170731707317, "grad_norm": 0.66796875, "learning_rate": 0.0001456114396903204, "loss": 0.6333, "step": 425 }, { "epoch": 2.097560975609756, "grad_norm": 0.60546875, "learning_rate": 0.0001440887496289356, "loss": 0.6296, "step": 430 }, { "epoch": 2.1219512195121952, "grad_norm": 0.546875, "learning_rate": 0.00014255326295002754, "loss": 0.6516, "step": 435 }, { "epoch": 2.1463414634146343, "grad_norm": 0.671875, "learning_rate": 0.00014100542532365724, "loss": 0.6468, "step": 440 }, { "epoch": 2.1707317073170733, "grad_norm": 0.578125, "learning_rate": 0.0001394456860047086, "loss": 0.6472, "step": 445 }, { "epoch": 2.1951219512195124, "grad_norm": 0.77734375, "learning_rate": 0.00013787449770249336, "loss": 0.6587, "step": 450 }, { "epoch": 2.2195121951219514, "grad_norm": 0.70703125, "learning_rate": 0.0001362923164493538, "loss": 0.6505, "step": 455 }, { "epoch": 2.2439024390243905, "grad_norm": 0.69140625, "learning_rate": 0.00013469960146830073, "loss": 0.6463, "step": 460 }, { "epoch": 2.2682926829268295, "grad_norm": 0.72265625, "learning_rate": 0.00013309681503972565, "loss": 0.6372, "step": 465 }, { "epoch": 2.292682926829268, "grad_norm": 0.80078125, "learning_rate": 0.00013148442236722506, "loss": 0.6285, "step": 470 }, { "epoch": 2.317073170731707, "grad_norm": 0.7578125, "learning_rate": 0.00012986289144257705, "loss": 0.6645, "step": 475 }, { "epoch": 2.341463414634146, "grad_norm": 1.0234375, "learning_rate": 0.00012823269290990777, "loss": 0.6663, "step": 480 }, { "epoch": 2.3658536585365852, "grad_norm": 1.0625, "learning_rate": 0.0001265942999290887, "loss": 0.6495, "step": 485 }, { "epoch": 2.3902439024390243, "grad_norm": 1.0078125, "learning_rate": 0.00012494818803840367, "loss": 0.6429, "step": 490 }, { "epoch": 2.4146341463414633, "grad_norm": 1.796875, "learning_rate": 0.00012329483501652492, "loss": 0.6573, "step": 495 }, { "epoch": 2.4390243902439024, "grad_norm": 0.921875, "learning_rate": 0.00012163472074383994, "loss": 0.6487, "step": 500 }, { "epoch": 2.4634146341463414, "grad_norm": 0.59375, "learning_rate": 0.00011996832706316739, "loss": 0.6726, "step": 505 }, { "epoch": 2.4878048780487805, "grad_norm": 0.5546875, "learning_rate": 0.00011829613763990384, "loss": 0.6546, "step": 510 }, { "epoch": 2.5121951219512195, "grad_norm": 0.57421875, "learning_rate": 0.00011661863782164153, "loss": 0.6306, "step": 515 }, { "epoch": 2.5365853658536586, "grad_norm": 0.61328125, "learning_rate": 0.00011493631449729767, "loss": 0.6591, "step": 520 }, { "epoch": 2.5609756097560976, "grad_norm": 0.578125, "learning_rate": 0.00011324965595579666, "loss": 0.6387, "step": 525 }, { "epoch": 2.5853658536585367, "grad_norm": 0.60546875, "learning_rate": 0.00011155915174434561, "loss": 0.6539, "step": 530 }, { "epoch": 2.6097560975609757, "grad_norm": 0.87109375, "learning_rate": 0.00010986529252634503, "loss": 0.6654, "step": 535 }, { "epoch": 2.6341463414634148, "grad_norm": 0.859375, "learning_rate": 0.00010816856993897522, "loss": 0.6471, "step": 540 }, { "epoch": 2.658536585365854, "grad_norm": 0.5703125, "learning_rate": 0.00010646947645050023, "loss": 0.6482, "step": 545 }, { "epoch": 2.682926829268293, "grad_norm": 0.5703125, "learning_rate": 0.00010476850521733048, "loss": 0.6252, "step": 550 }, { "epoch": 2.7073170731707314, "grad_norm": 0.69140625, "learning_rate": 0.00010306614994088582, "loss": 0.6605, "step": 555 }, { "epoch": 2.7317073170731705, "grad_norm": 0.52734375, "learning_rate": 0.00010136290472430013, "loss": 0.646, "step": 560 }, { "epoch": 2.7560975609756095, "grad_norm": 0.64453125, "learning_rate": 9.965926392900956e-05, "loss": 0.6492, "step": 565 }, { "epoch": 2.7804878048780486, "grad_norm": 0.63671875, "learning_rate": 9.795572203126573e-05, "loss": 0.6477, "step": 570 }, { "epoch": 2.8048780487804876, "grad_norm": 0.703125, "learning_rate": 9.625277347861553e-05, "loss": 0.6268, "step": 575 }, { "epoch": 2.8292682926829267, "grad_norm": 0.80078125, "learning_rate": 9.455091254638939e-05, "loss": 0.6364, "step": 580 }, { "epoch": 2.8536585365853657, "grad_norm": 0.62890625, "learning_rate": 9.285063319423939e-05, "loss": 0.6497, "step": 585 }, { "epoch": 2.8780487804878048, "grad_norm": 0.58203125, "learning_rate": 9.115242892276909e-05, "loss": 0.6446, "step": 590 }, { "epoch": 2.902439024390244, "grad_norm": 0.70703125, "learning_rate": 8.945679263029661e-05, "loss": 0.6657, "step": 595 }, { "epoch": 2.926829268292683, "grad_norm": 0.64453125, "learning_rate": 8.776421646979232e-05, "loss": 0.6498, "step": 600 }, { "epoch": 2.951219512195122, "grad_norm": 0.5546875, "learning_rate": 8.607519170603328e-05, "loss": 0.6623, "step": 605 }, { "epoch": 2.975609756097561, "grad_norm": 0.55078125, "learning_rate": 8.439020857301503e-05, "loss": 0.6467, "step": 610 }, { "epoch": 3.0, "grad_norm": 0.6875, "learning_rate": 8.270975613166281e-05, "loss": 0.6346, "step": 615 }, { "epoch": 3.0, "eval_loss": 2.245811939239502, "eval_runtime": 1.001, "eval_samples_per_second": 4.995, "eval_steps_per_second": 1.998, "step": 615 }, { "epoch": 3.024390243902439, "grad_norm": 0.609375, "learning_rate": 8.103432212788323e-05, "loss": 0.5718, "step": 620 }, { "epoch": 3.048780487804878, "grad_norm": 0.66796875, "learning_rate": 7.936439285099752e-05, "loss": 0.5989, "step": 625 }, { "epoch": 3.073170731707317, "grad_norm": 0.62890625, "learning_rate": 7.770045299259774e-05, "loss": 0.598, "step": 630 }, { "epoch": 3.097560975609756, "grad_norm": 0.5625, "learning_rate": 7.60429855058664e-05, "loss": 0.5731, "step": 635 }, { "epoch": 3.1219512195121952, "grad_norm": 0.56640625, "learning_rate": 7.439247146540109e-05, "loss": 0.6034, "step": 640 }, { "epoch": 3.1463414634146343, "grad_norm": 0.6015625, "learning_rate": 7.274938992758403e-05, "loss": 0.5669, "step": 645 }, { "epoch": 3.1707317073170733, "grad_norm": 0.59765625, "learning_rate": 7.111421779153745e-05, "loss": 0.5873, "step": 650 }, { "epoch": 3.1951219512195124, "grad_norm": 0.60546875, "learning_rate": 6.94874296607052e-05, "loss": 0.5674, "step": 655 }, { "epoch": 3.2195121951219514, "grad_norm": 0.54296875, "learning_rate": 6.786949770510071e-05, "loss": 0.5726, "step": 660 }, { "epoch": 3.2439024390243905, "grad_norm": 0.578125, "learning_rate": 6.626089152426097e-05, "loss": 0.6006, "step": 665 }, { "epoch": 3.2682926829268295, "grad_norm": 0.58984375, "learning_rate": 6.4662078010947e-05, "loss": 0.5729, "step": 670 }, { "epoch": 3.292682926829268, "grad_norm": 0.546875, "learning_rate": 6.307352121562949e-05, "loss": 0.5719, "step": 675 }, { "epoch": 3.317073170731707, "grad_norm": 0.5859375, "learning_rate": 6.149568221179993e-05, "loss": 0.571, "step": 680 }, { "epoch": 3.341463414634146, "grad_norm": 0.5859375, "learning_rate": 5.992901896214526e-05, "loss": 0.5674, "step": 685 }, { "epoch": 3.3658536585365852, "grad_norm": 0.6640625, "learning_rate": 5.837398618562584e-05, "loss": 0.5772, "step": 690 }, { "epoch": 3.3902439024390243, "grad_norm": 0.58203125, "learning_rate": 5.68310352254946e-05, "loss": 0.5893, "step": 695 }, { "epoch": 3.4146341463414633, "grad_norm": 0.62109375, "learning_rate": 5.5300613918296295e-05, "loss": 0.5771, "step": 700 }, { "epoch": 3.4390243902439024, "grad_norm": 0.58203125, "learning_rate": 5.378316646388424e-05, "loss": 0.5721, "step": 705 }, { "epoch": 3.4634146341463414, "grad_norm": 0.578125, "learning_rate": 5.227913329649271e-05, "loss": 0.5788, "step": 710 }, { "epoch": 3.4878048780487805, "grad_norm": 0.59765625, "learning_rate": 5.078895095690249e-05, "loss": 0.5693, "step": 715 }, { "epoch": 3.5121951219512195, "grad_norm": 0.5703125, "learning_rate": 4.931305196573621e-05, "loss": 0.6107, "step": 720 }, { "epoch": 3.5365853658536586, "grad_norm": 0.58203125, "learning_rate": 4.78518646979206e-05, "loss": 0.5695, "step": 725 }, { "epoch": 3.5609756097560976, "grad_norm": 0.58984375, "learning_rate": 4.6405813258352135e-05, "loss": 0.5707, "step": 730 }, { "epoch": 3.5853658536585367, "grad_norm": 0.62109375, "learning_rate": 4.4975317358801885e-05, "loss": 0.5635, "step": 735 }, { "epoch": 3.6097560975609757, "grad_norm": 0.57421875, "learning_rate": 4.3560792196095543e-05, "loss": 0.5747, "step": 740 }, { "epoch": 3.6341463414634148, "grad_norm": 0.57421875, "learning_rate": 4.216264833160396e-05, "loss": 0.5856, "step": 745 }, { "epoch": 3.658536585365854, "grad_norm": 0.6015625, "learning_rate": 4.0781291572078806e-05, "loss": 0.5797, "step": 750 }, { "epoch": 3.682926829268293, "grad_norm": 0.55859375, "learning_rate": 3.941712285186878e-05, "loss": 0.5643, "step": 755 }, { "epoch": 3.7073170731707314, "grad_norm": 0.5859375, "learning_rate": 3.807053811654948e-05, "loss": 0.5723, "step": 760 }, { "epoch": 3.7317073170731705, "grad_norm": 0.59375, "learning_rate": 3.674192820800156e-05, "loss": 0.5894, "step": 765 }, { "epoch": 3.7560975609756095, "grad_norm": 0.55859375, "learning_rate": 3.543167875097013e-05, "loss": 0.5505, "step": 770 }, { "epoch": 3.7804878048780486, "grad_norm": 0.5859375, "learning_rate": 3.4140170041138385e-05, "loss": 0.5752, "step": 775 }, { "epoch": 3.8048780487804876, "grad_norm": 0.55859375, "learning_rate": 3.286777693474803e-05, "loss": 0.5629, "step": 780 }, { "epoch": 3.8292682926829267, "grad_norm": 0.59765625, "learning_rate": 3.1614868739798495e-05, "loss": 0.5787, "step": 785 }, { "epoch": 3.8536585365853657, "grad_norm": 0.6015625, "learning_rate": 3.0381809108856398e-05, "loss": 0.574, "step": 790 }, { "epoch": 3.8780487804878048, "grad_norm": 0.6171875, "learning_rate": 2.9168955933506648e-05, "loss": 0.5826, "step": 795 }, { "epoch": 3.902439024390244, "grad_norm": 0.578125, "learning_rate": 2.79766612404755e-05, "loss": 0.5725, "step": 800 }, { "epoch": 3.926829268292683, "grad_norm": 0.55859375, "learning_rate": 2.6805271089455986e-05, "loss": 0.5612, "step": 805 }, { "epoch": 3.951219512195122, "grad_norm": 0.6328125, "learning_rate": 2.565512547266511e-05, "loss": 0.5721, "step": 810 }, { "epoch": 3.975609756097561, "grad_norm": 0.5625, "learning_rate": 2.4526558216162322e-05, "loss": 0.5725, "step": 815 }, { "epoch": 4.0, "grad_norm": 0.56640625, "learning_rate": 2.3419896882957527e-05, "loss": 0.5755, "step": 820 }, { "epoch": 4.0, "eval_loss": 2.2934529781341553, "eval_runtime": 0.9995, "eval_samples_per_second": 5.002, "eval_steps_per_second": 2.001, "step": 820 }, { "epoch": 4.024390243902439, "grad_norm": 0.55859375, "learning_rate": 2.2335462677936957e-05, "loss": 0.5324, "step": 825 }, { "epoch": 4.048780487804878, "grad_norm": 0.61328125, "learning_rate": 2.1273570354634508e-05, "loss": 0.5386, "step": 830 }, { "epoch": 4.073170731707317, "grad_norm": 0.55859375, "learning_rate": 2.023452812387555e-05, "loss": 0.5296, "step": 835 }, { "epoch": 4.097560975609756, "grad_norm": 0.62890625, "learning_rate": 1.9218637564319696e-05, "loss": 0.5304, "step": 840 }, { "epoch": 4.121951219512195, "grad_norm": 0.57421875, "learning_rate": 1.8226193534928604e-05, "loss": 0.5554, "step": 845 }, { "epoch": 4.146341463414634, "grad_norm": 0.578125, "learning_rate": 1.725748408938408e-05, "loss": 0.5376, "step": 850 }, { "epoch": 4.170731707317073, "grad_norm": 0.58984375, "learning_rate": 1.63127903924815e-05, "loss": 0.5335, "step": 855 }, { "epoch": 4.195121951219512, "grad_norm": 0.5625, "learning_rate": 1.5392386638522482e-05, "loss": 0.5298, "step": 860 }, { "epoch": 4.219512195121951, "grad_norm": 0.60546875, "learning_rate": 1.4496539971731026e-05, "loss": 0.5239, "step": 865 }, { "epoch": 4.2439024390243905, "grad_norm": 0.57421875, "learning_rate": 1.3625510408715714e-05, "loss": 0.5255, "step": 870 }, { "epoch": 4.2682926829268295, "grad_norm": 0.5859375, "learning_rate": 1.2779550763000703e-05, "loss": 0.5382, "step": 875 }, { "epoch": 4.2926829268292686, "grad_norm": 0.609375, "learning_rate": 1.1958906571647421e-05, "loss": 0.5469, "step": 880 }, { "epoch": 4.317073170731708, "grad_norm": 0.59375, "learning_rate": 1.1163816023988261e-05, "loss": 0.547, "step": 885 }, { "epoch": 4.341463414634147, "grad_norm": 0.56640625, "learning_rate": 1.0394509892492833e-05, "loss": 0.5387, "step": 890 }, { "epoch": 4.365853658536586, "grad_norm": 0.58203125, "learning_rate": 9.65121146578709e-06, "loss": 0.5487, "step": 895 }, { "epoch": 4.390243902439025, "grad_norm": 0.57421875, "learning_rate": 8.934136483844391e-06, "loss": 0.5306, "step": 900 }, { "epoch": 4.414634146341464, "grad_norm": 0.58984375, "learning_rate": 8.243493075367813e-06, "loss": 0.5259, "step": 905 }, { "epoch": 4.439024390243903, "grad_norm": 0.5703125, "learning_rate": 7.579481697381363e-06, "loss": 0.5473, "step": 910 }, { "epoch": 4.463414634146342, "grad_norm": 0.55859375, "learning_rate": 6.942295077048011e-06, "loss": 0.5329, "step": 915 }, { "epoch": 4.487804878048781, "grad_norm": 0.60546875, "learning_rate": 6.3321181557312815e-06, "loss": 0.551, "step": 920 }, { "epoch": 4.512195121951219, "grad_norm": 0.5703125, "learning_rate": 5.749128035316553e-06, "loss": 0.5508, "step": 925 }, { "epoch": 4.536585365853659, "grad_norm": 0.59375, "learning_rate": 5.193493926807835e-06, "loss": 0.5348, "step": 930 }, { "epoch": 4.560975609756097, "grad_norm": 0.58984375, "learning_rate": 4.665377101214863e-06, "loss": 0.5168, "step": 935 }, { "epoch": 4.585365853658536, "grad_norm": 0.61328125, "learning_rate": 4.164930842744608e-06, "loss": 0.5442, "step": 940 }, { "epoch": 4.609756097560975, "grad_norm": 0.59375, "learning_rate": 3.6923004043111444e-06, "loss": 0.5296, "step": 945 }, { "epoch": 4.634146341463414, "grad_norm": 0.546875, "learning_rate": 3.2476229653763734e-06, "loss": 0.5566, "step": 950 }, { "epoch": 4.658536585365853, "grad_norm": 0.58984375, "learning_rate": 2.8310275921341944e-06, "loss": 0.5219, "step": 955 }, { "epoch": 4.682926829268292, "grad_norm": 0.578125, "learning_rate": 2.44263520004937e-06, "loss": 0.5577, "step": 960 }, { "epoch": 4.7073170731707314, "grad_norm": 0.57421875, "learning_rate": 2.0825585187623007e-06, "loss": 0.5235, "step": 965 }, { "epoch": 4.7317073170731705, "grad_norm": 0.578125, "learning_rate": 1.7509020593695302e-06, "loss": 0.547, "step": 970 }, { "epoch": 4.7560975609756095, "grad_norm": 0.57421875, "learning_rate": 1.4477620840897766e-06, "loss": 0.5331, "step": 975 }, { "epoch": 4.780487804878049, "grad_norm": 0.59375, "learning_rate": 1.1732265783241492e-06, "loss": 0.5251, "step": 980 }, { "epoch": 4.804878048780488, "grad_norm": 0.60546875, "learning_rate": 9.273752251186096e-07, "loss": 0.527, "step": 985 }, { "epoch": 4.829268292682927, "grad_norm": 0.57421875, "learning_rate": 7.102793820362829e-07, "loss": 0.5353, "step": 990 }, { "epoch": 4.853658536585366, "grad_norm": 0.58203125, "learning_rate": 5.22002060446125e-07, "loss": 0.5347, "step": 995 }, { "epoch": 4.878048780487805, "grad_norm": 0.58984375, "learning_rate": 3.6259790723409683e-07, "loss": 0.5436, "step": 1000 }, { "epoch": 4.902439024390244, "grad_norm": 0.5625, "learning_rate": 2.3211318894205136e-07, "loss": 0.535, "step": 1005 }, { "epoch": 4.926829268292683, "grad_norm": 0.58984375, "learning_rate": 1.3058577833905404e-07, "loss": 0.5213, "step": 1010 }, { "epoch": 4.951219512195122, "grad_norm": 0.55859375, "learning_rate": 5.804514342889755e-08, "loss": 0.5247, "step": 1015 }, { "epoch": 4.975609756097561, "grad_norm": 0.60546875, "learning_rate": 1.4512338897121335e-08, "loss": 0.5333, "step": 1020 }, { "epoch": 5.0, "grad_norm": 0.546875, "learning_rate": 0.0, "loss": 0.5144, "step": 1025 }, { "epoch": 5.0, "eval_loss": 2.326845407485962, "eval_runtime": 1.0003, "eval_samples_per_second": 4.998, "eval_steps_per_second": 1.999, "step": 1025 }, { "epoch": 5.0, "step": 1025, "total_flos": 1.5670950020754964e+18, "train_loss": 1.4066738275202308, "train_runtime": 8016.4558, "train_samples_per_second": 2.044, "train_steps_per_second": 0.128 } ], "logging_steps": 5, "max_steps": 1025, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5670950020754964e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }