{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.950738916256158, "eval_steps": 500, "global_step": 1010, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009852216748768473, "grad_norm": 1.765625, "learning_rate": 1.9801980198019803e-06, "loss": 2.8604, "step": 1 }, { "epoch": 0.04926108374384237, "grad_norm": 1.4921875, "learning_rate": 9.900990099009901e-06, "loss": 2.8185, "step": 5 }, { "epoch": 0.09852216748768473, "grad_norm": 2.1875, "learning_rate": 1.9801980198019803e-05, "loss": 2.8289, "step": 10 }, { "epoch": 0.1477832512315271, "grad_norm": 2.0625, "learning_rate": 2.9702970297029702e-05, "loss": 2.7485, "step": 15 }, { "epoch": 0.19704433497536947, "grad_norm": 3.3125, "learning_rate": 3.9603960396039605e-05, "loss": 2.6773, "step": 20 }, { "epoch": 0.24630541871921183, "grad_norm": 2.3125, "learning_rate": 4.950495049504951e-05, "loss": 2.5348, "step": 25 }, { "epoch": 0.2955665024630542, "grad_norm": 1.3359375, "learning_rate": 5.9405940594059404e-05, "loss": 2.375, "step": 30 }, { "epoch": 0.3448275862068966, "grad_norm": 1.5, "learning_rate": 6.93069306930693e-05, "loss": 2.225, "step": 35 }, { "epoch": 0.39408866995073893, "grad_norm": 1.1796875, "learning_rate": 7.920792079207921e-05, "loss": 2.0577, "step": 40 }, { "epoch": 0.4433497536945813, "grad_norm": 2.3125, "learning_rate": 8.910891089108912e-05, "loss": 1.9332, "step": 45 }, { "epoch": 0.49261083743842365, "grad_norm": 0.890625, "learning_rate": 9.900990099009902e-05, "loss": 1.8054, "step": 50 }, { "epoch": 0.541871921182266, "grad_norm": 0.82421875, "learning_rate": 0.00010891089108910893, "loss": 1.7066, "step": 55 }, { "epoch": 0.5911330049261084, "grad_norm": 0.55078125, "learning_rate": 0.00011881188118811881, "loss": 1.6055, "step": 60 }, { "epoch": 0.6403940886699507, "grad_norm": 0.44921875, "learning_rate": 0.00012871287128712872, "loss": 1.5102, "step": 65 }, { "epoch": 0.6896551724137931, "grad_norm": 0.8125, "learning_rate": 0.0001386138613861386, "loss": 1.4332, "step": 70 }, { "epoch": 0.7389162561576355, "grad_norm": 0.48046875, "learning_rate": 0.0001485148514851485, "loss": 1.3703, "step": 75 }, { "epoch": 0.7881773399014779, "grad_norm": 0.62109375, "learning_rate": 0.00015841584158415842, "loss": 1.3483, "step": 80 }, { "epoch": 0.8374384236453202, "grad_norm": 0.5625, "learning_rate": 0.00016831683168316833, "loss": 1.2989, "step": 85 }, { "epoch": 0.8866995073891626, "grad_norm": 0.55078125, "learning_rate": 0.00017821782178217824, "loss": 1.2627, "step": 90 }, { "epoch": 0.9359605911330049, "grad_norm": 0.71875, "learning_rate": 0.00018811881188118812, "loss": 1.2456, "step": 95 }, { "epoch": 0.9852216748768473, "grad_norm": 0.5, "learning_rate": 0.00019801980198019803, "loss": 1.2361, "step": 100 }, { "epoch": 0.9950738916256158, "eval_loss": 2.518617868423462, "eval_runtime": 0.6245, "eval_samples_per_second": 16.014, "eval_steps_per_second": 1.601, "step": 101 }, { "epoch": 1.0344827586206897, "grad_norm": 0.609375, "learning_rate": 0.000199990444464082, "loss": 1.2023, "step": 105 }, { "epoch": 1.083743842364532, "grad_norm": 0.5546875, "learning_rate": 0.00019995162822919883, "loss": 1.1906, "step": 110 }, { "epoch": 1.1330049261083743, "grad_norm": 0.4375, "learning_rate": 0.00019988296565626987, "loss": 1.1786, "step": 115 }, { "epoch": 1.1822660098522166, "grad_norm": 0.62109375, "learning_rate": 0.00019978447724847652, "loss": 1.1654, "step": 120 }, { "epoch": 1.2315270935960592, "grad_norm": 0.466796875, "learning_rate": 0.0001996561924152278, "loss": 1.1533, "step": 125 }, { "epoch": 1.2807881773399015, "grad_norm": 0.466796875, "learning_rate": 0.00019949814946337838, "loss": 1.149, "step": 130 }, { "epoch": 1.3300492610837438, "grad_norm": 0.5234375, "learning_rate": 0.00019931039558578997, "loss": 1.1363, "step": 135 }, { "epoch": 1.3793103448275863, "grad_norm": 0.427734375, "learning_rate": 0.00019909298684723904, "loss": 1.1292, "step": 140 }, { "epoch": 1.4285714285714286, "grad_norm": 0.7109375, "learning_rate": 0.00019884598816767563, "loss": 1.1319, "step": 145 }, { "epoch": 1.477832512315271, "grad_norm": 0.384765625, "learning_rate": 0.00019856947330283752, "loss": 1.1248, "step": 150 }, { "epoch": 1.5270935960591134, "grad_norm": 0.578125, "learning_rate": 0.00019826352482222638, "loss": 1.1241, "step": 155 }, { "epoch": 1.5763546798029555, "grad_norm": 0.47265625, "learning_rate": 0.00019792823408445174, "loss": 1.1179, "step": 160 }, { "epoch": 1.625615763546798, "grad_norm": 0.55859375, "learning_rate": 0.00019756370120995066, "loss": 1.1112, "step": 165 }, { "epoch": 1.6748768472906403, "grad_norm": 0.462890625, "learning_rate": 0.00019717003505109095, "loss": 1.1021, "step": 170 }, { "epoch": 1.7241379310344827, "grad_norm": 0.93359375, "learning_rate": 0.0001967473531596671, "loss": 1.1162, "step": 175 }, { "epoch": 1.7733990147783252, "grad_norm": 0.46875, "learning_rate": 0.0001962957817517982, "loss": 1.0989, "step": 180 }, { "epoch": 1.8226600985221675, "grad_norm": 0.482421875, "learning_rate": 0.000195815455670239, "loss": 1.1004, "step": 185 }, { "epoch": 1.8719211822660098, "grad_norm": 0.55859375, "learning_rate": 0.00019530651834411474, "loss": 1.0986, "step": 190 }, { "epoch": 1.9211822660098523, "grad_norm": 0.40625, "learning_rate": 0.0001947691217460921, "loss": 1.0921, "step": 195 }, { "epoch": 1.9704433497536946, "grad_norm": 0.59375, "learning_rate": 0.0001942034263469989, "loss": 1.0968, "step": 200 }, { "epoch": 2.0, "eval_loss": 2.484497547149658, "eval_runtime": 0.5386, "eval_samples_per_second": 18.567, "eval_steps_per_second": 1.857, "step": 203 }, { "epoch": 2.019704433497537, "grad_norm": 0.75390625, "learning_rate": 0.00019360960106790643, "loss": 1.0792, "step": 205 }, { "epoch": 2.0689655172413794, "grad_norm": 0.455078125, "learning_rate": 0.00019298782322968815, "loss": 1.0645, "step": 210 }, { "epoch": 2.1182266009852215, "grad_norm": 0.6015625, "learning_rate": 0.00019233827850007027, "loss": 1.069, "step": 215 }, { "epoch": 2.167487684729064, "grad_norm": 0.48046875, "learning_rate": 0.00019166116083819002, "loss": 1.0589, "step": 220 }, { "epoch": 2.2167487684729066, "grad_norm": 0.53515625, "learning_rate": 0.0001909566724366779, "loss": 1.0646, "step": 225 }, { "epoch": 2.2660098522167487, "grad_norm": 0.5546875, "learning_rate": 0.00019022502366128135, "loss": 1.0631, "step": 230 }, { "epoch": 2.315270935960591, "grad_norm": 0.55078125, "learning_rate": 0.00018946643298804793, "loss": 1.0559, "step": 235 }, { "epoch": 2.3645320197044333, "grad_norm": 0.609375, "learning_rate": 0.00018868112693808665, "loss": 1.0469, "step": 240 }, { "epoch": 2.413793103448276, "grad_norm": 0.6953125, "learning_rate": 0.00018786934000992688, "loss": 1.0502, "step": 245 }, { "epoch": 2.4630541871921183, "grad_norm": 0.53515625, "learning_rate": 0.00018703131460949554, "loss": 1.058, "step": 250 }, { "epoch": 2.512315270935961, "grad_norm": 0.53515625, "learning_rate": 0.0001861673009777325, "loss": 1.0501, "step": 255 }, { "epoch": 2.561576354679803, "grad_norm": 0.5546875, "learning_rate": 0.00018527755711586678, "loss": 1.0516, "step": 260 }, { "epoch": 2.6108374384236455, "grad_norm": 0.6484375, "learning_rate": 0.00018436234870837547, "loss": 1.0503, "step": 265 }, { "epoch": 2.6600985221674875, "grad_norm": 0.52734375, "learning_rate": 0.00018342194904364813, "loss": 1.0539, "step": 270 }, { "epoch": 2.70935960591133, "grad_norm": 0.5078125, "learning_rate": 0.00018245663893238075, "loss": 1.0407, "step": 275 }, { "epoch": 2.7586206896551726, "grad_norm": 0.466796875, "learning_rate": 0.00018146670662372354, "loss": 1.0412, "step": 280 }, { "epoch": 2.8078817733990147, "grad_norm": 0.72265625, "learning_rate": 0.0001804524477192075, "loss": 1.0412, "step": 285 }, { "epoch": 2.857142857142857, "grad_norm": 0.4921875, "learning_rate": 0.00017941416508447536, "loss": 1.0286, "step": 290 }, { "epoch": 2.9064039408866993, "grad_norm": 0.63671875, "learning_rate": 0.00017835216875884368, "loss": 1.0476, "step": 295 }, { "epoch": 2.955665024630542, "grad_norm": 0.5390625, "learning_rate": 0.00017726677586272263, "loss": 1.0436, "step": 300 }, { "epoch": 2.9950738916256157, "eval_loss": 2.479555606842041, "eval_runtime": 0.6669, "eval_samples_per_second": 14.995, "eval_steps_per_second": 1.5, "step": 304 }, { "epoch": 3.0049261083743843, "grad_norm": 0.455078125, "learning_rate": 0.0001761583105029213, "loss": 1.0275, "step": 305 }, { "epoch": 3.0541871921182264, "grad_norm": 0.6640625, "learning_rate": 0.00017502710367586687, "loss": 1.0062, "step": 310 }, { "epoch": 3.103448275862069, "grad_norm": 0.6484375, "learning_rate": 0.00017387349316876666, "loss": 1.018, "step": 315 }, { "epoch": 3.1527093596059115, "grad_norm": 0.68359375, "learning_rate": 0.00017269782345874203, "loss": 1.005, "step": 320 }, { "epoch": 3.2019704433497536, "grad_norm": 0.66015625, "learning_rate": 0.00017150044560996488, "loss": 1.0104, "step": 325 }, { "epoch": 3.251231527093596, "grad_norm": 0.609375, "learning_rate": 0.00017028171716882714, "loss": 1.0039, "step": 330 }, { "epoch": 3.3004926108374386, "grad_norm": 0.494140625, "learning_rate": 0.0001690420020571747, "loss": 1.0177, "step": 335 }, { "epoch": 3.3497536945812807, "grad_norm": 0.76171875, "learning_rate": 0.00016778167046363734, "loss": 1.0066, "step": 340 }, { "epoch": 3.399014778325123, "grad_norm": 0.5859375, "learning_rate": 0.00016650109873308765, "loss": 1.0187, "step": 345 }, { "epoch": 3.4482758620689653, "grad_norm": 0.498046875, "learning_rate": 0.00016520066925426144, "loss": 1.0157, "step": 350 }, { "epoch": 3.497536945812808, "grad_norm": 0.734375, "learning_rate": 0.00016388077034557355, "loss": 1.0104, "step": 355 }, { "epoch": 3.5467980295566504, "grad_norm": 0.5546875, "learning_rate": 0.00016254179613916278, "loss": 1.0177, "step": 360 }, { "epoch": 3.596059113300493, "grad_norm": 0.4765625, "learning_rate": 0.0001611841464632011, "loss": 1.0193, "step": 365 }, { "epoch": 3.645320197044335, "grad_norm": 0.52734375, "learning_rate": 0.0001598082267225018, "loss": 1.0096, "step": 370 }, { "epoch": 3.6945812807881775, "grad_norm": 0.6640625, "learning_rate": 0.0001584144477774623, "loss": 1.025, "step": 375 }, { "epoch": 3.7438423645320196, "grad_norm": 0.53515625, "learning_rate": 0.00015700322582137827, "loss": 1.0125, "step": 380 }, { "epoch": 3.793103448275862, "grad_norm": 0.50390625, "learning_rate": 0.00015557498225616487, "loss": 1.0022, "step": 385 }, { "epoch": 3.8423645320197046, "grad_norm": 0.578125, "learning_rate": 0.00015413014356652286, "loss": 1.007, "step": 390 }, { "epoch": 3.8916256157635467, "grad_norm": 0.5078125, "learning_rate": 0.000152669141192587, "loss": 1.0013, "step": 395 }, { "epoch": 3.9408866995073892, "grad_norm": 0.50390625, "learning_rate": 0.00015119241140109467, "loss": 1.0009, "step": 400 }, { "epoch": 3.9901477832512313, "grad_norm": 0.5625, "learning_rate": 0.00014970039515511304, "loss": 1.0084, "step": 405 }, { "epoch": 4.0, "eval_loss": 2.494363307952881, "eval_runtime": 0.5386, "eval_samples_per_second": 18.567, "eval_steps_per_second": 1.857, "step": 406 }, { "epoch": 4.039408866995074, "grad_norm": 0.50390625, "learning_rate": 0.00014819353798236427, "loss": 0.9801, "step": 410 }, { "epoch": 4.088669950738916, "grad_norm": 0.60546875, "learning_rate": 0.0001466722898421873, "loss": 0.9817, "step": 415 }, { "epoch": 4.137931034482759, "grad_norm": 0.5390625, "learning_rate": 0.00014513710499117647, "loss": 0.988, "step": 420 }, { "epoch": 4.187192118226601, "grad_norm": 0.51171875, "learning_rate": 0.00014358844184753712, "loss": 0.9782, "step": 425 }, { "epoch": 4.236453201970443, "grad_norm": 0.546875, "learning_rate": 0.00014202676285419812, "loss": 0.9812, "step": 430 }, { "epoch": 4.285714285714286, "grad_norm": 0.7421875, "learning_rate": 0.0001404525343407228, "loss": 0.9897, "step": 435 }, { "epoch": 4.334975369458128, "grad_norm": 0.859375, "learning_rate": 0.00013886622638405952, "loss": 0.992, "step": 440 }, { "epoch": 4.384236453201971, "grad_norm": 0.5390625, "learning_rate": 0.00013726831266817278, "loss": 0.9933, "step": 445 }, { "epoch": 4.433497536945813, "grad_norm": 0.54296875, "learning_rate": 0.0001356592703425976, "loss": 0.9742, "step": 450 }, { "epoch": 4.482758620689655, "grad_norm": 0.6953125, "learning_rate": 0.00013403957987995882, "loss": 0.9777, "step": 455 }, { "epoch": 4.532019704433497, "grad_norm": 0.63671875, "learning_rate": 0.00013240972493249847, "loss": 0.9853, "step": 460 }, { "epoch": 4.58128078817734, "grad_norm": 0.474609375, "learning_rate": 0.00013077019218765305, "loss": 0.9791, "step": 465 }, { "epoch": 4.630541871921182, "grad_norm": 0.53125, "learning_rate": 0.00012912147122272523, "loss": 0.9857, "step": 470 }, { "epoch": 4.679802955665025, "grad_norm": 0.48046875, "learning_rate": 0.00012746405435869198, "loss": 0.9854, "step": 475 }, { "epoch": 4.7290640394088665, "grad_norm": 0.43359375, "learning_rate": 0.0001257984365131938, "loss": 0.9836, "step": 480 }, { "epoch": 4.778325123152709, "grad_norm": 0.478515625, "learning_rate": 0.00012412511505274844, "loss": 0.9939, "step": 485 }, { "epoch": 4.827586206896552, "grad_norm": 0.494140625, "learning_rate": 0.00012244458964423327, "loss": 0.9685, "step": 490 }, { "epoch": 4.876847290640394, "grad_norm": 0.55859375, "learning_rate": 0.0001207573621056809, "loss": 0.9806, "step": 495 }, { "epoch": 4.926108374384237, "grad_norm": 0.494140625, "learning_rate": 0.00011906393625643244, "loss": 0.979, "step": 500 }, { "epoch": 4.975369458128079, "grad_norm": 0.6484375, "learning_rate": 0.00011736481776669306, "loss": 0.9913, "step": 505 }, { "epoch": 4.995073891625616, "eval_loss": 2.50097918510437, "eval_runtime": 0.6794, "eval_samples_per_second": 14.718, "eval_steps_per_second": 1.472, "step": 507 }, { "epoch": 5.024630541871921, "grad_norm": 0.57421875, "learning_rate": 0.00011566051400653486, "loss": 0.9714, "step": 510 }, { "epoch": 5.073891625615763, "grad_norm": 0.6171875, "learning_rate": 0.00011395153389439233, "loss": 0.9602, "step": 515 }, { "epoch": 5.123152709359606, "grad_norm": 0.5234375, "learning_rate": 0.00011223838774509514, "loss": 0.9657, "step": 520 }, { "epoch": 5.172413793103448, "grad_norm": 0.490234375, "learning_rate": 0.00011052158711748434, "loss": 0.9526, "step": 525 }, { "epoch": 5.221674876847291, "grad_norm": 0.5234375, "learning_rate": 0.00010880164466165674, "loss": 0.958, "step": 530 }, { "epoch": 5.2709359605911335, "grad_norm": 0.498046875, "learning_rate": 0.00010707907396588361, "loss": 0.9666, "step": 535 }, { "epoch": 5.320197044334975, "grad_norm": 0.52734375, "learning_rate": 0.0001053543894032493, "loss": 0.9625, "step": 540 }, { "epoch": 5.369458128078818, "grad_norm": 0.515625, "learning_rate": 0.00010362810597805526, "loss": 0.9657, "step": 545 }, { "epoch": 5.41871921182266, "grad_norm": 0.482421875, "learning_rate": 0.00010190073917203589, "loss": 0.9655, "step": 550 }, { "epoch": 5.467980295566503, "grad_norm": 0.515625, "learning_rate": 0.00010017280479043147, "loss": 0.9665, "step": 555 }, { "epoch": 5.517241379310345, "grad_norm": 0.498046875, "learning_rate": 9.844481880796491e-05, "loss": 0.9587, "step": 560 }, { "epoch": 5.566502463054187, "grad_norm": 0.609375, "learning_rate": 9.671729721476746e-05, "loss": 0.9665, "step": 565 }, { "epoch": 5.615763546798029, "grad_norm": 0.5390625, "learning_rate": 9.499075586230013e-05, "loss": 0.9554, "step": 570 }, { "epoch": 5.665024630541872, "grad_norm": 0.625, "learning_rate": 9.326571030931637e-05, "loss": 0.9607, "step": 575 }, { "epoch": 5.714285714285714, "grad_norm": 0.68359375, "learning_rate": 9.154267566791223e-05, "loss": 0.9669, "step": 580 }, { "epoch": 5.763546798029557, "grad_norm": 0.61328125, "learning_rate": 8.982216644970979e-05, "loss": 0.9628, "step": 585 }, { "epoch": 5.812807881773399, "grad_norm": 0.6171875, "learning_rate": 8.810469641222001e-05, "loss": 0.9511, "step": 590 }, { "epoch": 5.862068965517241, "grad_norm": 0.51171875, "learning_rate": 8.639077840543077e-05, "loss": 0.9684, "step": 595 }, { "epoch": 5.911330049261084, "grad_norm": 0.671875, "learning_rate": 8.468092421866573e-05, "loss": 0.9678, "step": 600 }, { "epoch": 5.960591133004926, "grad_norm": 0.65625, "learning_rate": 8.297564442776014e-05, "loss": 0.9588, "step": 605 }, { "epoch": 6.0, "eval_loss": 2.5066332817077637, "eval_runtime": 0.5391, "eval_samples_per_second": 18.548, "eval_steps_per_second": 1.855, "step": 609 }, { "epoch": 6.009852216748769, "grad_norm": 0.68359375, "learning_rate": 8.127544824259889e-05, "loss": 0.953, "step": 610 }, { "epoch": 6.059113300492611, "grad_norm": 0.734375, "learning_rate": 7.958084335506239e-05, "loss": 0.9536, "step": 615 }, { "epoch": 6.108374384236453, "grad_norm": 0.50390625, "learning_rate": 7.789233578742582e-05, "loss": 0.9446, "step": 620 }, { "epoch": 6.157635467980295, "grad_norm": 0.498046875, "learning_rate": 7.6210429741257e-05, "loss": 0.9353, "step": 625 }, { "epoch": 6.206896551724138, "grad_norm": 0.5078125, "learning_rate": 7.453562744685778e-05, "loss": 0.9449, "step": 630 }, { "epoch": 6.25615763546798, "grad_norm": 0.5625, "learning_rate": 7.286842901329412e-05, "loss": 0.9535, "step": 635 }, { "epoch": 6.305418719211823, "grad_norm": 0.498046875, "learning_rate": 7.12093322790597e-05, "loss": 0.9376, "step": 640 }, { "epoch": 6.3546798029556655, "grad_norm": 0.51953125, "learning_rate": 6.955883266341741e-05, "loss": 0.9461, "step": 645 }, { "epoch": 6.403940886699507, "grad_norm": 0.5, "learning_rate": 6.791742301846326e-05, "loss": 0.9503, "step": 650 }, { "epoch": 6.45320197044335, "grad_norm": 0.46875, "learning_rate": 6.62855934819569e-05, "loss": 0.9447, "step": 655 }, { "epoch": 6.502463054187192, "grad_norm": 0.6328125, "learning_rate": 6.466383133096267e-05, "loss": 0.9453, "step": 660 }, { "epoch": 6.551724137931035, "grad_norm": 0.53125, "learning_rate": 6.305262083634488e-05, "loss": 0.944, "step": 665 }, { "epoch": 6.600985221674877, "grad_norm": 0.53125, "learning_rate": 6.145244311816063e-05, "loss": 0.9366, "step": 670 }, { "epoch": 6.650246305418719, "grad_norm": 0.5234375, "learning_rate": 5.986377600199371e-05, "loss": 0.9405, "step": 675 }, { "epoch": 6.699507389162561, "grad_norm": 0.482421875, "learning_rate": 5.828709387627218e-05, "loss": 0.9521, "step": 680 }, { "epoch": 6.748768472906404, "grad_norm": 0.4453125, "learning_rate": 5.6722867550612116e-05, "loss": 0.9509, "step": 685 }, { "epoch": 6.798029556650246, "grad_norm": 0.59765625, "learning_rate": 5.5171564115230254e-05, "loss": 0.9625, "step": 690 }, { "epoch": 6.847290640394089, "grad_norm": 0.447265625, "learning_rate": 5.363364680146725e-05, "loss": 0.9496, "step": 695 }, { "epoch": 6.896551724137931, "grad_norm": 0.439453125, "learning_rate": 5.210957484346314e-05, "loss": 0.9457, "step": 700 }, { "epoch": 6.945812807881773, "grad_norm": 0.482421875, "learning_rate": 5.059980334102637e-05, "loss": 0.9377, "step": 705 }, { "epoch": 6.995073891625616, "grad_norm": 0.453125, "learning_rate": 4.9104783123737566e-05, "loss": 0.9459, "step": 710 }, { "epoch": 6.995073891625616, "eval_loss": 2.516418933868408, "eval_runtime": 0.674, "eval_samples_per_second": 14.837, "eval_steps_per_second": 1.484, "step": 710 }, { "epoch": 7.044334975369458, "grad_norm": 0.48828125, "learning_rate": 4.762496061632814e-05, "loss": 0.9341, "step": 715 }, { "epoch": 7.093596059113301, "grad_norm": 0.51953125, "learning_rate": 4.6160777705374524e-05, "loss": 0.938, "step": 720 }, { "epoch": 7.142857142857143, "grad_norm": 0.50390625, "learning_rate": 4.471267160734731e-05, "loss": 0.9366, "step": 725 }, { "epoch": 7.192118226600985, "grad_norm": 0.53125, "learning_rate": 4.328107473805487e-05, "loss": 0.9403, "step": 730 }, { "epoch": 7.241379310344827, "grad_norm": 0.45703125, "learning_rate": 4.1866414583520877e-05, "loss": 0.9387, "step": 735 }, { "epoch": 7.29064039408867, "grad_norm": 0.71484375, "learning_rate": 4.046911357233343e-05, "loss": 0.9334, "step": 740 }, { "epoch": 7.3399014778325125, "grad_norm": 0.498046875, "learning_rate": 3.9089588949504655e-05, "loss": 0.93, "step": 745 }, { "epoch": 7.389162561576355, "grad_norm": 0.453125, "learning_rate": 3.772825265187802e-05, "loss": 0.9298, "step": 750 }, { "epoch": 7.4384236453201975, "grad_norm": 0.474609375, "learning_rate": 3.638551118512089e-05, "loss": 0.9486, "step": 755 }, { "epoch": 7.487684729064039, "grad_norm": 0.447265625, "learning_rate": 3.506176550233863e-05, "loss": 0.9373, "step": 760 }, { "epoch": 7.536945812807882, "grad_norm": 0.423828125, "learning_rate": 3.3757410884346894e-05, "loss": 0.939, "step": 765 }, { "epoch": 7.586206896551724, "grad_norm": 0.42578125, "learning_rate": 3.2472836821637744e-05, "loss": 0.9325, "step": 770 }, { "epoch": 7.635467980295567, "grad_norm": 0.458984375, "learning_rate": 3.120842689807468e-05, "loss": 0.932, "step": 775 }, { "epoch": 7.684729064039409, "grad_norm": 0.51171875, "learning_rate": 2.996455867635155e-05, "loss": 0.9259, "step": 780 }, { "epoch": 7.733990147783251, "grad_norm": 0.44921875, "learning_rate": 2.874160358524931e-05, "loss": 0.9277, "step": 785 }, { "epoch": 7.783251231527093, "grad_norm": 0.458984375, "learning_rate": 2.753992680872457e-05, "loss": 0.9259, "step": 790 }, { "epoch": 7.832512315270936, "grad_norm": 0.447265625, "learning_rate": 2.6359887176862718e-05, "loss": 0.9431, "step": 795 }, { "epoch": 7.8817733990147785, "grad_norm": 0.46875, "learning_rate": 2.5201837058728505e-05, "loss": 0.9323, "step": 800 }, { "epoch": 7.931034482758621, "grad_norm": 0.4296875, "learning_rate": 2.4066122257145894e-05, "loss": 0.9294, "step": 805 }, { "epoch": 7.980295566502463, "grad_norm": 0.453125, "learning_rate": 2.295308190543859e-05, "loss": 0.943, "step": 810 }, { "epoch": 8.0, "eval_loss": 2.523322582244873, "eval_runtime": 0.5404, "eval_samples_per_second": 18.504, "eval_steps_per_second": 1.85, "step": 812 }, { "epoch": 8.029556650246306, "grad_norm": 0.419921875, "learning_rate": 2.1863048366162208e-05, "loss": 0.9428, "step": 815 }, { "epoch": 8.078817733990148, "grad_norm": 0.44140625, "learning_rate": 2.0796347131858186e-05, "loss": 0.931, "step": 820 }, { "epoch": 8.12807881773399, "grad_norm": 0.431640625, "learning_rate": 1.9753296727859195e-05, "loss": 0.9263, "step": 825 }, { "epoch": 8.177339901477833, "grad_norm": 0.455078125, "learning_rate": 1.8734208617174988e-05, "loss": 0.9426, "step": 830 }, { "epoch": 8.226600985221674, "grad_norm": 0.453125, "learning_rate": 1.773938710748706e-05, "loss": 0.9267, "step": 835 }, { "epoch": 8.275862068965518, "grad_norm": 0.427734375, "learning_rate": 1.676912926028007e-05, "loss": 0.9269, "step": 840 }, { "epoch": 8.32512315270936, "grad_norm": 0.427734375, "learning_rate": 1.5823724802136865e-05, "loss": 0.9294, "step": 845 }, { "epoch": 8.374384236453203, "grad_norm": 0.419921875, "learning_rate": 1.4903456038223939e-05, "loss": 0.9222, "step": 850 }, { "epoch": 8.423645320197044, "grad_norm": 0.412109375, "learning_rate": 1.4008597767992871e-05, "loss": 0.929, "step": 855 }, { "epoch": 8.472906403940886, "grad_norm": 0.42578125, "learning_rate": 1.3139417203123027e-05, "loss": 0.9285, "step": 860 }, { "epoch": 8.52216748768473, "grad_norm": 0.421875, "learning_rate": 1.2296173887730123e-05, "loss": 0.9346, "step": 865 }, { "epoch": 8.571428571428571, "grad_norm": 0.42578125, "learning_rate": 1.1479119620864276e-05, "loss": 0.9252, "step": 870 }, { "epoch": 8.620689655172415, "grad_norm": 0.4375, "learning_rate": 1.0688498381320855e-05, "loss": 0.929, "step": 875 }, { "epoch": 8.669950738916256, "grad_norm": 0.66796875, "learning_rate": 9.924546254786493e-06, "loss": 0.9381, "step": 880 }, { "epoch": 8.719211822660098, "grad_norm": 0.453125, "learning_rate": 9.187491363342093e-06, "loss": 0.9374, "step": 885 }, { "epoch": 8.768472906403941, "grad_norm": 0.458984375, "learning_rate": 8.47755379734373e-06, "loss": 0.9267, "step": 890 }, { "epoch": 8.817733990147783, "grad_norm": 0.447265625, "learning_rate": 7.794945549701993e-06, "loss": 0.9369, "step": 895 }, { "epoch": 8.866995073891626, "grad_norm": 0.43359375, "learning_rate": 7.1398704525792e-06, "loss": 0.9371, "step": 900 }, { "epoch": 8.916256157635468, "grad_norm": 0.4375, "learning_rate": 6.512524116523633e-06, "loss": 0.9348, "step": 905 }, { "epoch": 8.96551724137931, "grad_norm": 0.44140625, "learning_rate": 5.913093872058528e-06, "loss": 0.9169, "step": 910 }, { "epoch": 8.995073891625616, "eval_loss": 2.523988723754883, "eval_runtime": 0.6711, "eval_samples_per_second": 14.9, "eval_steps_per_second": 1.49, "step": 913 }, { "epoch": 9.014778325123153, "grad_norm": 0.427734375, "learning_rate": 5.341758713743828e-06, "loss": 0.9254, "step": 915 }, { "epoch": 9.064039408866995, "grad_norm": 0.427734375, "learning_rate": 4.798689246727006e-06, "loss": 0.929, "step": 920 }, { "epoch": 9.113300492610838, "grad_norm": 0.421875, "learning_rate": 4.2840476357989825e-06, "loss": 0.9346, "step": 925 }, { "epoch": 9.16256157635468, "grad_norm": 0.4140625, "learning_rate": 3.797987556970495e-06, "loss": 0.936, "step": 930 }, { "epoch": 9.211822660098521, "grad_norm": 0.419921875, "learning_rate": 3.3406541515832003e-06, "loss": 0.9283, "step": 935 }, { "epoch": 9.261083743842365, "grad_norm": 0.41015625, "learning_rate": 2.912183982969385e-06, "loss": 0.9255, "step": 940 }, { "epoch": 9.310344827586206, "grad_norm": 0.421875, "learning_rate": 2.5127049956730207e-06, "loss": 0.9252, "step": 945 }, { "epoch": 9.35960591133005, "grad_norm": 0.431640625, "learning_rate": 2.1423364772445887e-06, "loss": 0.9276, "step": 950 }, { "epoch": 9.408866995073891, "grad_norm": 0.423828125, "learning_rate": 1.8011890226208527e-06, "loss": 0.9251, "step": 955 }, { "epoch": 9.458128078817733, "grad_norm": 0.4140625, "learning_rate": 1.489364501100332e-06, "loss": 0.9291, "step": 960 }, { "epoch": 9.507389162561577, "grad_norm": 0.423828125, "learning_rate": 1.2069560259243328e-06, "loss": 0.9333, "step": 965 }, { "epoch": 9.556650246305418, "grad_norm": 0.4375, "learning_rate": 9.540479264726676e-07, "loss": 0.9332, "step": 970 }, { "epoch": 9.605911330049262, "grad_norm": 0.4140625, "learning_rate": 7.307157230821426e-07, "loss": 0.9297, "step": 975 }, { "epoch": 9.655172413793103, "grad_norm": 0.40234375, "learning_rate": 5.370261044956971e-07, "loss": 0.9222, "step": 980 }, { "epoch": 9.704433497536947, "grad_norm": 0.421875, "learning_rate": 3.73036907948543e-07, "loss": 0.9396, "step": 985 }, { "epoch": 9.753694581280788, "grad_norm": 0.416015625, "learning_rate": 2.3879710189753656e-07, "loss": 0.931, "step": 990 }, { "epoch": 9.80295566502463, "grad_norm": 0.43359375, "learning_rate": 1.3434677139885222e-07, "loss": 0.9309, "step": 995 }, { "epoch": 9.852216748768473, "grad_norm": 0.4140625, "learning_rate": 5.971710613821291e-08, "loss": 0.9223, "step": 1000 }, { "epoch": 9.901477832512315, "grad_norm": 0.470703125, "learning_rate": 1.4930391117451426e-08, "loss": 0.9251, "step": 1005 }, { "epoch": 9.950738916256158, "grad_norm": 0.427734375, "learning_rate": 0.0, "loss": 0.925, "step": 1010 }, { "epoch": 9.950738916256158, "eval_loss": 2.523752212524414, "eval_runtime": 0.5416, "eval_samples_per_second": 18.464, "eval_steps_per_second": 1.846, "step": 1010 }, { "epoch": 9.950738916256158, "step": 1010, "total_flos": 5.932470720905871e+17, "train_loss": 1.07745361446154, "train_runtime": 3468.402, "train_samples_per_second": 13.998, "train_steps_per_second": 0.291 } ], "logging_steps": 5, "max_steps": 1010, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 5.932470720905871e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }