{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.19990295972828723, "eval_steps": 103, "global_step": 206, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009704027171276079, "grad_norm": 0.1630859375, "learning_rate": 2e-07, "loss": 2.2674, "step": 1 }, { "epoch": 0.0009704027171276079, "eval_loss": 2.3277485370635986, "eval_runtime": 707.6734, "eval_samples_per_second": 0.916, "eval_steps_per_second": 0.229, "step": 1 }, { "epoch": 0.0019408054342552159, "grad_norm": 0.1728515625, "learning_rate": 4e-07, "loss": 2.2913, "step": 2 }, { "epoch": 0.002911208151382824, "grad_norm": 0.169921875, "learning_rate": 6e-07, "loss": 2.2295, "step": 3 }, { "epoch": 0.0038816108685104317, "grad_norm": 0.1611328125, "learning_rate": 8e-07, "loss": 2.3423, "step": 4 }, { "epoch": 0.0048520135856380394, "grad_norm": 0.17578125, "learning_rate": 1e-06, "loss": 2.4168, "step": 5 }, { "epoch": 0.005822416302765648, "grad_norm": 0.17578125, "learning_rate": 1.2e-06, "loss": 2.1116, "step": 6 }, { "epoch": 0.006792819019893256, "grad_norm": 0.171875, "learning_rate": 1.4e-06, "loss": 2.3571, "step": 7 }, { "epoch": 0.0077632217370208634, "grad_norm": 0.1708984375, "learning_rate": 1.6e-06, "loss": 2.336, "step": 8 }, { "epoch": 0.008733624454148471, "grad_norm": 0.1474609375, "learning_rate": 1.8e-06, "loss": 2.2734, "step": 9 }, { "epoch": 0.009704027171276079, "grad_norm": 0.158203125, "learning_rate": 2e-06, "loss": 2.3191, "step": 10 }, { "epoch": 0.010674429888403688, "grad_norm": 0.140625, "learning_rate": 1.9999957311433394e-06, "loss": 2.1462, "step": 11 }, { "epoch": 0.011644832605531296, "grad_norm": 0.1455078125, "learning_rate": 1.999982924613854e-06, "loss": 2.3692, "step": 12 }, { "epoch": 0.012615235322658904, "grad_norm": 0.134765625, "learning_rate": 1.999961580533031e-06, "loss": 2.1327, "step": 13 }, { "epoch": 0.013585638039786511, "grad_norm": 0.130859375, "learning_rate": 1.9999316991033473e-06, "loss": 2.33, "step": 14 }, { "epoch": 0.01455604075691412, "grad_norm": 0.1357421875, "learning_rate": 1.999893280608269e-06, "loss": 2.1897, "step": 15 }, { "epoch": 0.015526443474041727, "grad_norm": 0.125, "learning_rate": 1.9998463254122472e-06, "loss": 2.3865, "step": 16 }, { "epoch": 0.016496846191169336, "grad_norm": 0.1240234375, "learning_rate": 1.9997908339607153e-06, "loss": 2.2324, "step": 17 }, { "epoch": 0.017467248908296942, "grad_norm": 0.12353515625, "learning_rate": 1.9997268067800845e-06, "loss": 2.3878, "step": 18 }, { "epoch": 0.018437651625424552, "grad_norm": 0.1396484375, "learning_rate": 1.9996542444777386e-06, "loss": 2.2695, "step": 19 }, { "epoch": 0.019408054342552158, "grad_norm": 0.1318359375, "learning_rate": 1.9995731477420292e-06, "loss": 2.2875, "step": 20 }, { "epoch": 0.020378457059679767, "grad_norm": 0.1357421875, "learning_rate": 1.999483517342268e-06, "loss": 2.4014, "step": 21 }, { "epoch": 0.021348859776807377, "grad_norm": 0.1376953125, "learning_rate": 1.9993853541287205e-06, "loss": 2.3639, "step": 22 }, { "epoch": 0.022319262493934983, "grad_norm": 0.1376953125, "learning_rate": 1.999278659032597e-06, "loss": 2.2434, "step": 23 }, { "epoch": 0.023289665211062592, "grad_norm": 0.1328125, "learning_rate": 1.9991634330660437e-06, "loss": 2.3897, "step": 24 }, { "epoch": 0.024260067928190198, "grad_norm": 0.126953125, "learning_rate": 1.999039677322135e-06, "loss": 2.2402, "step": 25 }, { "epoch": 0.025230470645317808, "grad_norm": 0.1279296875, "learning_rate": 1.998907392974861e-06, "loss": 2.401, "step": 26 }, { "epoch": 0.026200873362445413, "grad_norm": 0.12890625, "learning_rate": 1.9987665812791164e-06, "loss": 2.2646, "step": 27 }, { "epoch": 0.027171276079573023, "grad_norm": 0.12890625, "learning_rate": 1.9986172435706903e-06, "loss": 2.3168, "step": 28 }, { "epoch": 0.028141678796700632, "grad_norm": 0.1279296875, "learning_rate": 1.9984593812662525e-06, "loss": 2.3874, "step": 29 }, { "epoch": 0.02911208151382824, "grad_norm": 0.134765625, "learning_rate": 1.9982929958633397e-06, "loss": 2.3126, "step": 30 }, { "epoch": 0.030082484230955848, "grad_norm": 0.11376953125, "learning_rate": 1.998118088940341e-06, "loss": 2.233, "step": 31 }, { "epoch": 0.031052886948083454, "grad_norm": 0.125, "learning_rate": 1.9979346621564857e-06, "loss": 2.1122, "step": 32 }, { "epoch": 0.03202328966521106, "grad_norm": 0.1279296875, "learning_rate": 1.9977427172518227e-06, "loss": 2.2075, "step": 33 }, { "epoch": 0.03299369238233867, "grad_norm": 0.1259765625, "learning_rate": 1.9975422560472093e-06, "loss": 2.3974, "step": 34 }, { "epoch": 0.033964095099466275, "grad_norm": 0.1181640625, "learning_rate": 1.9973332804442895e-06, "loss": 2.344, "step": 35 }, { "epoch": 0.034934497816593885, "grad_norm": 0.11962890625, "learning_rate": 1.997115792425479e-06, "loss": 2.2733, "step": 36 }, { "epoch": 0.035904900533721494, "grad_norm": 0.1318359375, "learning_rate": 1.996889794053945e-06, "loss": 2.2443, "step": 37 }, { "epoch": 0.036875303250849104, "grad_norm": 0.1220703125, "learning_rate": 1.9966552874735863e-06, "loss": 2.3253, "step": 38 }, { "epoch": 0.03784570596797671, "grad_norm": 0.1201171875, "learning_rate": 1.9964122749090145e-06, "loss": 2.393, "step": 39 }, { "epoch": 0.038816108685104316, "grad_norm": 0.1142578125, "learning_rate": 1.996160758665531e-06, "loss": 2.234, "step": 40 }, { "epoch": 0.039786511402231925, "grad_norm": 0.11181640625, "learning_rate": 1.9959007411291063e-06, "loss": 2.1832, "step": 41 }, { "epoch": 0.040756914119359534, "grad_norm": 0.11962890625, "learning_rate": 1.995632224766358e-06, "loss": 2.2993, "step": 42 }, { "epoch": 0.041727316836487144, "grad_norm": 0.12060546875, "learning_rate": 1.995355212124525e-06, "loss": 2.4293, "step": 43 }, { "epoch": 0.04269771955361475, "grad_norm": 0.115234375, "learning_rate": 1.9950697058314457e-06, "loss": 2.1791, "step": 44 }, { "epoch": 0.043668122270742356, "grad_norm": 0.11328125, "learning_rate": 1.994775708595533e-06, "loss": 2.402, "step": 45 }, { "epoch": 0.044638524987869965, "grad_norm": 0.11181640625, "learning_rate": 1.9944732232057465e-06, "loss": 2.271, "step": 46 }, { "epoch": 0.045608927704997575, "grad_norm": 0.10888671875, "learning_rate": 1.994162252531567e-06, "loss": 2.1897, "step": 47 }, { "epoch": 0.046579330422125184, "grad_norm": 0.11279296875, "learning_rate": 1.9938427995229723e-06, "loss": 2.268, "step": 48 }, { "epoch": 0.04754973313925279, "grad_norm": 0.125, "learning_rate": 1.993514867210404e-06, "loss": 2.2919, "step": 49 }, { "epoch": 0.048520135856380396, "grad_norm": 0.126953125, "learning_rate": 1.9931784587047422e-06, "loss": 2.4426, "step": 50 }, { "epoch": 0.049490538573508006, "grad_norm": 0.11767578125, "learning_rate": 1.9928335771972748e-06, "loss": 2.3823, "step": 51 }, { "epoch": 0.050460941290635615, "grad_norm": 0.11572265625, "learning_rate": 1.9924802259596686e-06, "loss": 2.2299, "step": 52 }, { "epoch": 0.051431344007763224, "grad_norm": 0.11767578125, "learning_rate": 1.9921184083439354e-06, "loss": 2.4699, "step": 53 }, { "epoch": 0.05240174672489083, "grad_norm": 0.11083984375, "learning_rate": 1.991748127782404e-06, "loss": 2.2251, "step": 54 }, { "epoch": 0.053372149442018436, "grad_norm": 0.1142578125, "learning_rate": 1.9913693877876844e-06, "loss": 2.204, "step": 55 }, { "epoch": 0.054342552159146046, "grad_norm": 0.111328125, "learning_rate": 1.9909821919526363e-06, "loss": 2.228, "step": 56 }, { "epoch": 0.055312954876273655, "grad_norm": 0.109375, "learning_rate": 1.9905865439503337e-06, "loss": 2.2021, "step": 57 }, { "epoch": 0.056283357593401265, "grad_norm": 0.10888671875, "learning_rate": 1.9901824475340314e-06, "loss": 2.2048, "step": 58 }, { "epoch": 0.05725376031052887, "grad_norm": 0.11279296875, "learning_rate": 1.9897699065371285e-06, "loss": 2.2993, "step": 59 }, { "epoch": 0.05822416302765648, "grad_norm": 0.111328125, "learning_rate": 1.9893489248731336e-06, "loss": 2.2354, "step": 60 }, { "epoch": 0.059194565744784086, "grad_norm": 0.123046875, "learning_rate": 1.9889195065356238e-06, "loss": 2.3262, "step": 61 }, { "epoch": 0.060164968461911696, "grad_norm": 0.11767578125, "learning_rate": 1.988481655598212e-06, "loss": 2.4042, "step": 62 }, { "epoch": 0.0611353711790393, "grad_norm": 0.1171875, "learning_rate": 1.988035376214504e-06, "loss": 2.2258, "step": 63 }, { "epoch": 0.06210577389616691, "grad_norm": 0.11669921875, "learning_rate": 1.987580672618062e-06, "loss": 2.2404, "step": 64 }, { "epoch": 0.06307617661329452, "grad_norm": 0.1142578125, "learning_rate": 1.987117549122363e-06, "loss": 2.398, "step": 65 }, { "epoch": 0.06404657933042213, "grad_norm": 0.1181640625, "learning_rate": 1.986646010120756e-06, "loss": 2.264, "step": 66 }, { "epoch": 0.06501698204754973, "grad_norm": 0.1162109375, "learning_rate": 1.986166060086425e-06, "loss": 2.3085, "step": 67 }, { "epoch": 0.06598738476467735, "grad_norm": 0.11669921875, "learning_rate": 1.985677703572344e-06, "loss": 2.219, "step": 68 }, { "epoch": 0.06695778748180495, "grad_norm": 0.1181640625, "learning_rate": 1.9851809452112317e-06, "loss": 2.2302, "step": 69 }, { "epoch": 0.06792819019893255, "grad_norm": 0.1142578125, "learning_rate": 1.9846757897155116e-06, "loss": 2.3431, "step": 70 }, { "epoch": 0.06889859291606017, "grad_norm": 0.11669921875, "learning_rate": 1.984162241877264e-06, "loss": 2.2894, "step": 71 }, { "epoch": 0.06986899563318777, "grad_norm": 0.11083984375, "learning_rate": 1.983640306568183e-06, "loss": 2.3387, "step": 72 }, { "epoch": 0.07083939835031539, "grad_norm": 0.11669921875, "learning_rate": 1.9831099887395287e-06, "loss": 2.092, "step": 73 }, { "epoch": 0.07180980106744299, "grad_norm": 0.123046875, "learning_rate": 1.98257129342208e-06, "loss": 2.3139, "step": 74 }, { "epoch": 0.07278020378457059, "grad_norm": 0.12158203125, "learning_rate": 1.9820242257260884e-06, "loss": 2.3842, "step": 75 }, { "epoch": 0.07375060650169821, "grad_norm": 0.111328125, "learning_rate": 1.981468790841229e-06, "loss": 2.2733, "step": 76 }, { "epoch": 0.07472100921882581, "grad_norm": 0.1044921875, "learning_rate": 1.9809049940365504e-06, "loss": 2.1439, "step": 77 }, { "epoch": 0.07569141193595343, "grad_norm": 0.10888671875, "learning_rate": 1.980332840660425e-06, "loss": 2.316, "step": 78 }, { "epoch": 0.07666181465308103, "grad_norm": 0.12255859375, "learning_rate": 1.979752336140499e-06, "loss": 2.4238, "step": 79 }, { "epoch": 0.07763221737020863, "grad_norm": 0.10986328125, "learning_rate": 1.9791634859836408e-06, "loss": 2.2081, "step": 80 }, { "epoch": 0.07860262008733625, "grad_norm": 0.11572265625, "learning_rate": 1.978566295775887e-06, "loss": 2.3303, "step": 81 }, { "epoch": 0.07957302280446385, "grad_norm": 0.111328125, "learning_rate": 1.977960771182393e-06, "loss": 2.3657, "step": 82 }, { "epoch": 0.08054342552159147, "grad_norm": 0.1103515625, "learning_rate": 1.9773469179473754e-06, "loss": 2.3921, "step": 83 }, { "epoch": 0.08151382823871907, "grad_norm": 0.1123046875, "learning_rate": 1.9767247418940593e-06, "loss": 2.3947, "step": 84 }, { "epoch": 0.08248423095584667, "grad_norm": 0.12109375, "learning_rate": 1.9760942489246236e-06, "loss": 2.2361, "step": 85 }, { "epoch": 0.08345463367297429, "grad_norm": 0.1083984375, "learning_rate": 1.975455445020144e-06, "loss": 2.1025, "step": 86 }, { "epoch": 0.08442503639010189, "grad_norm": 0.115234375, "learning_rate": 1.9748083362405373e-06, "loss": 2.4577, "step": 87 }, { "epoch": 0.0853954391072295, "grad_norm": 0.1171875, "learning_rate": 1.974152928724502e-06, "loss": 2.3706, "step": 88 }, { "epoch": 0.08636584182435711, "grad_norm": 0.11181640625, "learning_rate": 1.973489228689463e-06, "loss": 2.4223, "step": 89 }, { "epoch": 0.08733624454148471, "grad_norm": 0.12158203125, "learning_rate": 1.9728172424315087e-06, "loss": 2.1975, "step": 90 }, { "epoch": 0.08830664725861233, "grad_norm": 0.10986328125, "learning_rate": 1.9721369763253348e-06, "loss": 2.3638, "step": 91 }, { "epoch": 0.08927704997573993, "grad_norm": 0.10986328125, "learning_rate": 1.9714484368241828e-06, "loss": 2.278, "step": 92 }, { "epoch": 0.09024745269286755, "grad_norm": 0.1123046875, "learning_rate": 1.9707516304597783e-06, "loss": 2.3421, "step": 93 }, { "epoch": 0.09121785540999515, "grad_norm": 0.1171875, "learning_rate": 1.9700465638422686e-06, "loss": 2.3418, "step": 94 }, { "epoch": 0.09218825812712275, "grad_norm": 0.11181640625, "learning_rate": 1.9693332436601613e-06, "loss": 2.1814, "step": 95 }, { "epoch": 0.09315866084425037, "grad_norm": 0.1259765625, "learning_rate": 1.96861167668026e-06, "loss": 2.299, "step": 96 }, { "epoch": 0.09412906356137797, "grad_norm": 0.10791015625, "learning_rate": 1.9678818697476e-06, "loss": 2.1708, "step": 97 }, { "epoch": 0.09509946627850557, "grad_norm": 0.1171875, "learning_rate": 1.9671438297853845e-06, "loss": 2.3062, "step": 98 }, { "epoch": 0.09606986899563319, "grad_norm": 0.11669921875, "learning_rate": 1.9663975637949172e-06, "loss": 2.2044, "step": 99 }, { "epoch": 0.09704027171276079, "grad_norm": 0.11181640625, "learning_rate": 1.9656430788555372e-06, "loss": 2.1654, "step": 100 }, { "epoch": 0.09801067442988841, "grad_norm": 0.1279296875, "learning_rate": 1.964880382124551e-06, "loss": 2.3448, "step": 101 }, { "epoch": 0.09898107714701601, "grad_norm": 0.1103515625, "learning_rate": 1.964109480837165e-06, "loss": 2.2951, "step": 102 }, { "epoch": 0.09995147986414361, "grad_norm": 0.10986328125, "learning_rate": 1.9633303823064186e-06, "loss": 2.2252, "step": 103 }, { "epoch": 0.09995147986414361, "eval_loss": 2.3053860664367676, "eval_runtime": 714.5363, "eval_samples_per_second": 0.907, "eval_steps_per_second": 0.227, "step": 103 }, { "epoch": 0.10092188258127123, "grad_norm": 0.11474609375, "learning_rate": 1.96254309392311e-06, "loss": 2.2504, "step": 104 }, { "epoch": 0.10189228529839883, "grad_norm": 0.10302734375, "learning_rate": 1.9617476231557315e-06, "loss": 2.3294, "step": 105 }, { "epoch": 0.10286268801552645, "grad_norm": 0.107421875, "learning_rate": 1.960943977550397e-06, "loss": 2.2063, "step": 106 }, { "epoch": 0.10383309073265405, "grad_norm": 0.10888671875, "learning_rate": 1.960132164730766e-06, "loss": 2.2253, "step": 107 }, { "epoch": 0.10480349344978165, "grad_norm": 0.1162109375, "learning_rate": 1.95931219239798e-06, "loss": 2.3711, "step": 108 }, { "epoch": 0.10577389616690927, "grad_norm": 0.1064453125, "learning_rate": 1.9584840683305802e-06, "loss": 2.2904, "step": 109 }, { "epoch": 0.10674429888403687, "grad_norm": 0.11669921875, "learning_rate": 1.957647800384441e-06, "loss": 2.3879, "step": 110 }, { "epoch": 0.10771470160116449, "grad_norm": 0.10498046875, "learning_rate": 1.9568033964926904e-06, "loss": 2.3677, "step": 111 }, { "epoch": 0.10868510431829209, "grad_norm": 0.119140625, "learning_rate": 1.9559508646656384e-06, "loss": 2.4216, "step": 112 }, { "epoch": 0.1096555070354197, "grad_norm": 0.10791015625, "learning_rate": 1.9550902129906976e-06, "loss": 2.197, "step": 113 }, { "epoch": 0.11062590975254731, "grad_norm": 0.126953125, "learning_rate": 1.954221449632311e-06, "loss": 2.2467, "step": 114 }, { "epoch": 0.11159631246967491, "grad_norm": 0.11083984375, "learning_rate": 1.953344582831869e-06, "loss": 2.1098, "step": 115 }, { "epoch": 0.11256671518680253, "grad_norm": 0.1123046875, "learning_rate": 1.952459620907636e-06, "loss": 2.2719, "step": 116 }, { "epoch": 0.11353711790393013, "grad_norm": 0.11572265625, "learning_rate": 1.951566572254669e-06, "loss": 2.3066, "step": 117 }, { "epoch": 0.11450752062105773, "grad_norm": 0.11376953125, "learning_rate": 1.9506654453447375e-06, "loss": 2.3814, "step": 118 }, { "epoch": 0.11547792333818535, "grad_norm": 0.109375, "learning_rate": 1.9497562487262457e-06, "loss": 2.233, "step": 119 }, { "epoch": 0.11644832605531295, "grad_norm": 0.115234375, "learning_rate": 1.9488389910241497e-06, "loss": 2.3341, "step": 120 }, { "epoch": 0.11741872877244056, "grad_norm": 0.11376953125, "learning_rate": 1.947913680939874e-06, "loss": 2.1674, "step": 121 }, { "epoch": 0.11838913148956817, "grad_norm": 0.12255859375, "learning_rate": 1.9469803272512343e-06, "loss": 2.3772, "step": 122 }, { "epoch": 0.11935953420669577, "grad_norm": 0.1142578125, "learning_rate": 1.946038938812347e-06, "loss": 2.1713, "step": 123 }, { "epoch": 0.12032993692382339, "grad_norm": 0.1083984375, "learning_rate": 1.9450895245535508e-06, "loss": 2.1877, "step": 124 }, { "epoch": 0.121300339640951, "grad_norm": 0.1142578125, "learning_rate": 1.9441320934813205e-06, "loss": 2.2995, "step": 125 }, { "epoch": 0.1222707423580786, "grad_norm": 0.1044921875, "learning_rate": 1.9431666546781793e-06, "loss": 2.3268, "step": 126 }, { "epoch": 0.12324114507520621, "grad_norm": 0.10693359375, "learning_rate": 1.942193217302617e-06, "loss": 2.3007, "step": 127 }, { "epoch": 0.12421154779233382, "grad_norm": 0.12353515625, "learning_rate": 1.9412117905889995e-06, "loss": 2.1957, "step": 128 }, { "epoch": 0.12518195050946143, "grad_norm": 0.1123046875, "learning_rate": 1.9402223838474815e-06, "loss": 2.3008, "step": 129 }, { "epoch": 0.12615235322658905, "grad_norm": 0.103515625, "learning_rate": 1.93922500646392e-06, "loss": 2.3233, "step": 130 }, { "epoch": 0.12712275594371664, "grad_norm": 0.1240234375, "learning_rate": 1.9382196678997843e-06, "loss": 2.342, "step": 131 }, { "epoch": 0.12809315866084425, "grad_norm": 0.11962890625, "learning_rate": 1.937206377692066e-06, "loss": 2.2727, "step": 132 }, { "epoch": 0.12906356137797187, "grad_norm": 0.11474609375, "learning_rate": 1.936185145453189e-06, "loss": 2.3136, "step": 133 }, { "epoch": 0.13003396409509946, "grad_norm": 0.11376953125, "learning_rate": 1.9351559808709173e-06, "loss": 2.3071, "step": 134 }, { "epoch": 0.13100436681222707, "grad_norm": 0.11474609375, "learning_rate": 1.9341188937082645e-06, "loss": 2.3851, "step": 135 }, { "epoch": 0.1319747695293547, "grad_norm": 0.10888671875, "learning_rate": 1.9330738938034004e-06, "loss": 2.1945, "step": 136 }, { "epoch": 0.13294517224648228, "grad_norm": 0.1220703125, "learning_rate": 1.9320209910695587e-06, "loss": 2.3302, "step": 137 }, { "epoch": 0.1339155749636099, "grad_norm": 0.1083984375, "learning_rate": 1.9309601954949403e-06, "loss": 2.1342, "step": 138 }, { "epoch": 0.1348859776807375, "grad_norm": 0.1162109375, "learning_rate": 1.9298915171426217e-06, "loss": 2.3877, "step": 139 }, { "epoch": 0.1358563803978651, "grad_norm": 0.1064453125, "learning_rate": 1.9288149661504584e-06, "loss": 2.246, "step": 140 }, { "epoch": 0.13682678311499272, "grad_norm": 0.111328125, "learning_rate": 1.9277305527309865e-06, "loss": 2.2299, "step": 141 }, { "epoch": 0.13779718583212033, "grad_norm": 0.1123046875, "learning_rate": 1.9266382871713306e-06, "loss": 2.2701, "step": 142 }, { "epoch": 0.13876758854924795, "grad_norm": 0.107421875, "learning_rate": 1.9255381798331015e-06, "loss": 2.1286, "step": 143 }, { "epoch": 0.13973799126637554, "grad_norm": 0.11083984375, "learning_rate": 1.9244302411523e-06, "loss": 2.3215, "step": 144 }, { "epoch": 0.14070839398350315, "grad_norm": 0.107421875, "learning_rate": 1.9233144816392187e-06, "loss": 2.0745, "step": 145 }, { "epoch": 0.14167879670063077, "grad_norm": 0.10791015625, "learning_rate": 1.9221909118783407e-06, "loss": 2.3284, "step": 146 }, { "epoch": 0.14264919941775836, "grad_norm": 0.1142578125, "learning_rate": 1.92105954252824e-06, "loss": 2.3162, "step": 147 }, { "epoch": 0.14361960213488598, "grad_norm": 0.1103515625, "learning_rate": 1.9199203843214794e-06, "loss": 2.1617, "step": 148 }, { "epoch": 0.1445900048520136, "grad_norm": 0.1171875, "learning_rate": 1.918773448064511e-06, "loss": 2.3815, "step": 149 }, { "epoch": 0.14556040756914118, "grad_norm": 0.12158203125, "learning_rate": 1.9176187446375713e-06, "loss": 2.1546, "step": 150 }, { "epoch": 0.1465308102862688, "grad_norm": 0.11376953125, "learning_rate": 1.9164562849945785e-06, "loss": 2.2754, "step": 151 }, { "epoch": 0.14750121300339641, "grad_norm": 0.1240234375, "learning_rate": 1.9152860801630306e-06, "loss": 2.1837, "step": 152 }, { "epoch": 0.14847161572052403, "grad_norm": 0.11279296875, "learning_rate": 1.9141081412438966e-06, "loss": 2.2695, "step": 153 }, { "epoch": 0.14944201843765162, "grad_norm": 0.11474609375, "learning_rate": 1.912922479411516e-06, "loss": 2.2561, "step": 154 }, { "epoch": 0.15041242115477924, "grad_norm": 0.10986328125, "learning_rate": 1.9117291059134893e-06, "loss": 2.201, "step": 155 }, { "epoch": 0.15138282387190685, "grad_norm": 0.10986328125, "learning_rate": 1.9105280320705733e-06, "loss": 2.2406, "step": 156 }, { "epoch": 0.15235322658903444, "grad_norm": 0.11376953125, "learning_rate": 1.9093192692765728e-06, "loss": 2.2358, "step": 157 }, { "epoch": 0.15332362930616206, "grad_norm": 0.1171875, "learning_rate": 1.9081028289982322e-06, "loss": 2.2847, "step": 158 }, { "epoch": 0.15429403202328967, "grad_norm": 0.1171875, "learning_rate": 1.9068787227751273e-06, "loss": 2.23, "step": 159 }, { "epoch": 0.15526443474041726, "grad_norm": 0.10986328125, "learning_rate": 1.9056469622195561e-06, "loss": 2.1406, "step": 160 }, { "epoch": 0.15623483745754488, "grad_norm": 0.10986328125, "learning_rate": 1.904407559016428e-06, "loss": 2.3344, "step": 161 }, { "epoch": 0.1572052401746725, "grad_norm": 0.11474609375, "learning_rate": 1.9031605249231528e-06, "loss": 2.307, "step": 162 }, { "epoch": 0.15817564289180008, "grad_norm": 0.1201171875, "learning_rate": 1.9019058717695309e-06, "loss": 2.3153, "step": 163 }, { "epoch": 0.1591460456089277, "grad_norm": 0.10986328125, "learning_rate": 1.9006436114576385e-06, "loss": 2.2753, "step": 164 }, { "epoch": 0.16011644832605532, "grad_norm": 0.10791015625, "learning_rate": 1.8993737559617163e-06, "loss": 2.3172, "step": 165 }, { "epoch": 0.16108685104318293, "grad_norm": 0.11669921875, "learning_rate": 1.8980963173280556e-06, "loss": 2.419, "step": 166 }, { "epoch": 0.16205725376031052, "grad_norm": 0.11474609375, "learning_rate": 1.8968113076748841e-06, "loss": 2.3402, "step": 167 }, { "epoch": 0.16302765647743814, "grad_norm": 0.11669921875, "learning_rate": 1.8955187391922506e-06, "loss": 2.2971, "step": 168 }, { "epoch": 0.16399805919456575, "grad_norm": 0.1083984375, "learning_rate": 1.89421862414191e-06, "loss": 2.4186, "step": 169 }, { "epoch": 0.16496846191169334, "grad_norm": 0.1220703125, "learning_rate": 1.892910974857206e-06, "loss": 2.2934, "step": 170 }, { "epoch": 0.16593886462882096, "grad_norm": 0.10986328125, "learning_rate": 1.891595803742955e-06, "loss": 2.2529, "step": 171 }, { "epoch": 0.16690926734594858, "grad_norm": 0.11181640625, "learning_rate": 1.890273123275328e-06, "loss": 2.2834, "step": 172 }, { "epoch": 0.16787967006307616, "grad_norm": 0.115234375, "learning_rate": 1.8889429460017328e-06, "loss": 2.3267, "step": 173 }, { "epoch": 0.16885007278020378, "grad_norm": 0.1142578125, "learning_rate": 1.8876052845406932e-06, "loss": 2.3528, "step": 174 }, { "epoch": 0.1698204754973314, "grad_norm": 0.11669921875, "learning_rate": 1.8862601515817325e-06, "loss": 2.2583, "step": 175 }, { "epoch": 0.170790878214459, "grad_norm": 0.11328125, "learning_rate": 1.8849075598852497e-06, "loss": 2.0822, "step": 176 }, { "epoch": 0.1717612809315866, "grad_norm": 0.1142578125, "learning_rate": 1.8835475222824004e-06, "loss": 2.2797, "step": 177 }, { "epoch": 0.17273168364871422, "grad_norm": 0.1064453125, "learning_rate": 1.8821800516749753e-06, "loss": 2.1859, "step": 178 }, { "epoch": 0.17370208636584183, "grad_norm": 0.1064453125, "learning_rate": 1.8808051610352764e-06, "loss": 2.3002, "step": 179 }, { "epoch": 0.17467248908296942, "grad_norm": 0.12109375, "learning_rate": 1.8794228634059947e-06, "loss": 2.4352, "step": 180 }, { "epoch": 0.17564289180009704, "grad_norm": 0.11767578125, "learning_rate": 1.878033171900087e-06, "loss": 2.2839, "step": 181 }, { "epoch": 0.17661329451722466, "grad_norm": 0.11767578125, "learning_rate": 1.8766360997006506e-06, "loss": 2.3349, "step": 182 }, { "epoch": 0.17758369723435224, "grad_norm": 0.11669921875, "learning_rate": 1.8752316600607989e-06, "loss": 2.3102, "step": 183 }, { "epoch": 0.17855409995147986, "grad_norm": 0.11474609375, "learning_rate": 1.8738198663035351e-06, "loss": 2.2898, "step": 184 }, { "epoch": 0.17952450266860748, "grad_norm": 0.11474609375, "learning_rate": 1.8724007318216263e-06, "loss": 2.3098, "step": 185 }, { "epoch": 0.1804949053857351, "grad_norm": 0.1220703125, "learning_rate": 1.8709742700774764e-06, "loss": 2.6074, "step": 186 }, { "epoch": 0.18146530810286268, "grad_norm": 0.11083984375, "learning_rate": 1.869540494602998e-06, "loss": 2.3196, "step": 187 }, { "epoch": 0.1824357108199903, "grad_norm": 0.1162109375, "learning_rate": 1.8680994189994842e-06, "loss": 2.365, "step": 188 }, { "epoch": 0.18340611353711792, "grad_norm": 0.1142578125, "learning_rate": 1.8666510569374804e-06, "loss": 2.2361, "step": 189 }, { "epoch": 0.1843765162542455, "grad_norm": 0.11083984375, "learning_rate": 1.8651954221566527e-06, "loss": 2.2856, "step": 190 }, { "epoch": 0.18534691897137312, "grad_norm": 0.11181640625, "learning_rate": 1.8637325284656598e-06, "loss": 2.3472, "step": 191 }, { "epoch": 0.18631732168850074, "grad_norm": 0.10888671875, "learning_rate": 1.8622623897420201e-06, "loss": 2.2649, "step": 192 }, { "epoch": 0.18728772440562833, "grad_norm": 0.1103515625, "learning_rate": 1.860785019931982e-06, "loss": 2.281, "step": 193 }, { "epoch": 0.18825812712275594, "grad_norm": 0.12451171875, "learning_rate": 1.85930043305039e-06, "loss": 2.2947, "step": 194 }, { "epoch": 0.18922852983988356, "grad_norm": 0.1142578125, "learning_rate": 1.8578086431805507e-06, "loss": 2.193, "step": 195 }, { "epoch": 0.19019893255701115, "grad_norm": 0.111328125, "learning_rate": 1.8563096644741034e-06, "loss": 2.351, "step": 196 }, { "epoch": 0.19116933527413876, "grad_norm": 0.11083984375, "learning_rate": 1.8548035111508815e-06, "loss": 2.0055, "step": 197 }, { "epoch": 0.19213973799126638, "grad_norm": 0.11279296875, "learning_rate": 1.85329019749878e-06, "loss": 2.2949, "step": 198 }, { "epoch": 0.193110140708394, "grad_norm": 0.119140625, "learning_rate": 1.8517697378736188e-06, "loss": 2.0795, "step": 199 }, { "epoch": 0.19408054342552158, "grad_norm": 0.11181640625, "learning_rate": 1.8502421466990075e-06, "loss": 2.2709, "step": 200 }, { "epoch": 0.1950509461426492, "grad_norm": 0.11474609375, "learning_rate": 1.8487074384662076e-06, "loss": 2.2284, "step": 201 }, { "epoch": 0.19602134885977682, "grad_norm": 0.11328125, "learning_rate": 1.8471656277339956e-06, "loss": 2.2379, "step": 202 }, { "epoch": 0.1969917515769044, "grad_norm": 0.1240234375, "learning_rate": 1.845616729128525e-06, "loss": 2.3339, "step": 203 }, { "epoch": 0.19796215429403202, "grad_norm": 0.10693359375, "learning_rate": 1.844060757343187e-06, "loss": 2.2919, "step": 204 }, { "epoch": 0.19893255701115964, "grad_norm": 0.1142578125, "learning_rate": 1.842497727138472e-06, "loss": 2.1203, "step": 205 }, { "epoch": 0.19990295972828723, "grad_norm": 0.10693359375, "learning_rate": 1.8409276533418283e-06, "loss": 2.1531, "step": 206 }, { "epoch": 0.19990295972828723, "eval_loss": 2.299978494644165, "eval_runtime": 713.9402, "eval_samples_per_second": 0.908, "eval_steps_per_second": 0.227, "step": 206 } ], "logging_steps": 1, "max_steps": 1030, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 103, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.253350449813914e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }