{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.14881915237787124, "eval_steps": 386, "global_step": 115, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012940795858945326, "grad_norm": 1.7405146360397339, "learning_rate": 2.0000000000000003e-06, "loss": 2.4269, "step": 1 }, { "epoch": 0.0012940795858945326, "eval_loss": 2.247628688812256, "eval_runtime": 189.8853, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.858, "step": 1 }, { "epoch": 0.002588159171789065, "grad_norm": 1.6643542051315308, "learning_rate": 4.000000000000001e-06, "loss": 2.2583, "step": 2 }, { "epoch": 0.0038822387576835974, "grad_norm": 1.8690767288208008, "learning_rate": 6e-06, "loss": 2.2696, "step": 3 }, { "epoch": 0.00517631834357813, "grad_norm": 1.828118085861206, "learning_rate": 8.000000000000001e-06, "loss": 2.3646, "step": 4 }, { "epoch": 0.006470397929472662, "grad_norm": 1.9319926500320435, "learning_rate": 1e-05, "loss": 2.4196, "step": 5 }, { "epoch": 0.007764477515367195, "grad_norm": 1.7723782062530518, "learning_rate": 1.2e-05, "loss": 2.4177, "step": 6 }, { "epoch": 0.009058557101261728, "grad_norm": 1.9500815868377686, "learning_rate": 1.4000000000000001e-05, "loss": 2.3497, "step": 7 }, { "epoch": 0.01035263668715626, "grad_norm": 2.3909075260162354, "learning_rate": 1.6000000000000003e-05, "loss": 2.405, "step": 8 }, { "epoch": 0.011646716273050793, "grad_norm": 2.0620856285095215, "learning_rate": 1.8e-05, "loss": 2.4098, "step": 9 }, { "epoch": 0.012940795858945324, "grad_norm": 1.8054910898208618, "learning_rate": 2e-05, "loss": 2.1233, "step": 10 }, { "epoch": 0.014234875444839857, "grad_norm": 2.190964937210083, "learning_rate": 2.2000000000000003e-05, "loss": 2.3985, "step": 11 }, { "epoch": 0.01552895503073439, "grad_norm": 1.9412921667099, "learning_rate": 2.4e-05, "loss": 2.462, "step": 12 }, { "epoch": 0.016823034616628922, "grad_norm": 1.9161555767059326, "learning_rate": 2.6000000000000002e-05, "loss": 2.2118, "step": 13 }, { "epoch": 0.018117114202523456, "grad_norm": 1.7161599397659302, "learning_rate": 2.8000000000000003e-05, "loss": 2.2175, "step": 14 }, { "epoch": 0.019411193788417987, "grad_norm": 2.173877000808716, "learning_rate": 3e-05, "loss": 2.2521, "step": 15 }, { "epoch": 0.02070527337431252, "grad_norm": 2.0000555515289307, "learning_rate": 3.2000000000000005e-05, "loss": 2.1615, "step": 16 }, { "epoch": 0.021999352960207053, "grad_norm": 1.5915080308914185, "learning_rate": 3.4000000000000007e-05, "loss": 1.9522, "step": 17 }, { "epoch": 0.023293432546101587, "grad_norm": 1.6972448825836182, "learning_rate": 3.6e-05, "loss": 1.7224, "step": 18 }, { "epoch": 0.024587512131996118, "grad_norm": 1.7509772777557373, "learning_rate": 3.8e-05, "loss": 2.0414, "step": 19 }, { "epoch": 0.02588159171789065, "grad_norm": 1.697340488433838, "learning_rate": 4e-05, "loss": 2.0427, "step": 20 }, { "epoch": 0.027175671303785183, "grad_norm": 1.8733758926391602, "learning_rate": 4.2e-05, "loss": 1.6772, "step": 21 }, { "epoch": 0.028469750889679714, "grad_norm": 1.6085255146026611, "learning_rate": 4.4000000000000006e-05, "loss": 1.6527, "step": 22 }, { "epoch": 0.029763830475574248, "grad_norm": 1.5792337656021118, "learning_rate": 4.600000000000001e-05, "loss": 1.6567, "step": 23 }, { "epoch": 0.03105791006146878, "grad_norm": 1.4392567873001099, "learning_rate": 4.8e-05, "loss": 1.508, "step": 24 }, { "epoch": 0.03235198964736331, "grad_norm": 1.5222433805465698, "learning_rate": 5e-05, "loss": 1.4606, "step": 25 }, { "epoch": 0.033646069233257844, "grad_norm": 1.5480064153671265, "learning_rate": 5.2000000000000004e-05, "loss": 1.5027, "step": 26 }, { "epoch": 0.034940148819152375, "grad_norm": 1.6736445426940918, "learning_rate": 5.4000000000000005e-05, "loss": 1.2426, "step": 27 }, { "epoch": 0.03623422840504691, "grad_norm": 1.7392551898956299, "learning_rate": 5.6000000000000006e-05, "loss": 1.4703, "step": 28 }, { "epoch": 0.037528307990941444, "grad_norm": 1.6173359155654907, "learning_rate": 5.8e-05, "loss": 1.4546, "step": 29 }, { "epoch": 0.038822387576835975, "grad_norm": 1.3955802917480469, "learning_rate": 6e-05, "loss": 1.3808, "step": 30 }, { "epoch": 0.040116467162730506, "grad_norm": 1.353873372077942, "learning_rate": 6.2e-05, "loss": 1.229, "step": 31 }, { "epoch": 0.04141054674862504, "grad_norm": 1.2547746896743774, "learning_rate": 6.400000000000001e-05, "loss": 1.1668, "step": 32 }, { "epoch": 0.042704626334519574, "grad_norm": 1.3806778192520142, "learning_rate": 6.6e-05, "loss": 1.0691, "step": 33 }, { "epoch": 0.043998705920414105, "grad_norm": 1.2815773487091064, "learning_rate": 6.800000000000001e-05, "loss": 1.2409, "step": 34 }, { "epoch": 0.045292785506308636, "grad_norm": 1.3677266836166382, "learning_rate": 7e-05, "loss": 0.9668, "step": 35 }, { "epoch": 0.046586865092203174, "grad_norm": 1.5457032918930054, "learning_rate": 7.2e-05, "loss": 1.1385, "step": 36 }, { "epoch": 0.047880944678097705, "grad_norm": 1.5587060451507568, "learning_rate": 7.4e-05, "loss": 1.1707, "step": 37 }, { "epoch": 0.049175024263992236, "grad_norm": 1.079053282737732, "learning_rate": 7.6e-05, "loss": 1.0655, "step": 38 }, { "epoch": 0.050469103849886766, "grad_norm": 1.1773897409439087, "learning_rate": 7.800000000000001e-05, "loss": 1.0465, "step": 39 }, { "epoch": 0.0517631834357813, "grad_norm": 1.2437673807144165, "learning_rate": 8e-05, "loss": 1.2779, "step": 40 }, { "epoch": 0.053057263021675835, "grad_norm": 1.254847526550293, "learning_rate": 8.2e-05, "loss": 1.0898, "step": 41 }, { "epoch": 0.054351342607570366, "grad_norm": 1.1771515607833862, "learning_rate": 8.4e-05, "loss": 1.1827, "step": 42 }, { "epoch": 0.0556454221934649, "grad_norm": 1.1400648355484009, "learning_rate": 8.6e-05, "loss": 1.1066, "step": 43 }, { "epoch": 0.05693950177935943, "grad_norm": 1.2047138214111328, "learning_rate": 8.800000000000001e-05, "loss": 0.8974, "step": 44 }, { "epoch": 0.058233581365253966, "grad_norm": 1.1269346475601196, "learning_rate": 9e-05, "loss": 1.0146, "step": 45 }, { "epoch": 0.059527660951148496, "grad_norm": 1.169231653213501, "learning_rate": 9.200000000000001e-05, "loss": 1.1266, "step": 46 }, { "epoch": 0.06082174053704303, "grad_norm": 0.9771779179573059, "learning_rate": 9.4e-05, "loss": 0.8351, "step": 47 }, { "epoch": 0.06211582012293756, "grad_norm": 1.2849314212799072, "learning_rate": 9.6e-05, "loss": 1.1822, "step": 48 }, { "epoch": 0.0634098997088321, "grad_norm": 1.023181676864624, "learning_rate": 9.8e-05, "loss": 0.9082, "step": 49 }, { "epoch": 0.06470397929472663, "grad_norm": 1.135751724243164, "learning_rate": 0.0001, "loss": 0.9407, "step": 50 }, { "epoch": 0.06599805888062116, "grad_norm": 0.9701154828071594, "learning_rate": 9.999998300231494e-05, "loss": 0.9423, "step": 51 }, { "epoch": 0.06729213846651569, "grad_norm": 1.2891143560409546, "learning_rate": 9.999993200927133e-05, "loss": 0.9757, "step": 52 }, { "epoch": 0.06858621805241022, "grad_norm": 1.3360975980758667, "learning_rate": 9.999984702090383e-05, "loss": 1.0158, "step": 53 }, { "epoch": 0.06988029763830475, "grad_norm": 0.977446436882019, "learning_rate": 9.999972803727024e-05, "loss": 0.8175, "step": 54 }, { "epoch": 0.0711743772241993, "grad_norm": 0.9943827390670776, "learning_rate": 9.999957505845144e-05, "loss": 0.8627, "step": 55 }, { "epoch": 0.07246845681009383, "grad_norm": 1.1531224250793457, "learning_rate": 9.999938808455145e-05, "loss": 1.143, "step": 56 }, { "epoch": 0.07376253639598836, "grad_norm": 1.287972092628479, "learning_rate": 9.99991671156974e-05, "loss": 1.2342, "step": 57 }, { "epoch": 0.07505661598188289, "grad_norm": 1.1554590463638306, "learning_rate": 9.999891215203949e-05, "loss": 0.9692, "step": 58 }, { "epoch": 0.07635069556777742, "grad_norm": 1.0786008834838867, "learning_rate": 9.999862319375113e-05, "loss": 1.1254, "step": 59 }, { "epoch": 0.07764477515367195, "grad_norm": 1.0764508247375488, "learning_rate": 9.999830024102874e-05, "loss": 0.9312, "step": 60 }, { "epoch": 0.07893885473956648, "grad_norm": 1.1909526586532593, "learning_rate": 9.999794329409194e-05, "loss": 0.9959, "step": 61 }, { "epoch": 0.08023293432546101, "grad_norm": 0.9989166259765625, "learning_rate": 9.999755235318337e-05, "loss": 0.934, "step": 62 }, { "epoch": 0.08152701391135554, "grad_norm": 1.0302046537399292, "learning_rate": 9.999712741856889e-05, "loss": 1.1017, "step": 63 }, { "epoch": 0.08282109349725009, "grad_norm": 0.9583478569984436, "learning_rate": 9.999666849053738e-05, "loss": 1.1384, "step": 64 }, { "epoch": 0.08411517308314462, "grad_norm": 1.001126766204834, "learning_rate": 9.999617556940085e-05, "loss": 0.9279, "step": 65 }, { "epoch": 0.08540925266903915, "grad_norm": 1.0130903720855713, "learning_rate": 9.999564865549449e-05, "loss": 0.9381, "step": 66 }, { "epoch": 0.08670333225493368, "grad_norm": 1.1210829019546509, "learning_rate": 9.999508774917652e-05, "loss": 0.9607, "step": 67 }, { "epoch": 0.08799741184082821, "grad_norm": 1.045749545097351, "learning_rate": 9.999449285082831e-05, "loss": 1.0037, "step": 68 }, { "epoch": 0.08929149142672274, "grad_norm": 1.1308139562606812, "learning_rate": 9.999386396085434e-05, "loss": 0.9086, "step": 69 }, { "epoch": 0.09058557101261727, "grad_norm": 1.1013413667678833, "learning_rate": 9.999320107968219e-05, "loss": 1.0712, "step": 70 }, { "epoch": 0.0918796505985118, "grad_norm": 1.0830566883087158, "learning_rate": 9.999250420776258e-05, "loss": 1.0326, "step": 71 }, { "epoch": 0.09317373018440635, "grad_norm": 1.0673171281814575, "learning_rate": 9.999177334556929e-05, "loss": 1.0034, "step": 72 }, { "epoch": 0.09446780977030088, "grad_norm": 1.1546461582183838, "learning_rate": 9.999100849359926e-05, "loss": 1.059, "step": 73 }, { "epoch": 0.09576188935619541, "grad_norm": 0.9139528870582581, "learning_rate": 9.999020965237249e-05, "loss": 0.8596, "step": 74 }, { "epoch": 0.09705596894208994, "grad_norm": 1.1570812463760376, "learning_rate": 9.998937682243215e-05, "loss": 1.0456, "step": 75 }, { "epoch": 0.09835004852798447, "grad_norm": 1.3232612609863281, "learning_rate": 9.998851000434448e-05, "loss": 0.9994, "step": 76 }, { "epoch": 0.099644128113879, "grad_norm": 1.2017115354537964, "learning_rate": 9.998760919869883e-05, "loss": 1.2664, "step": 77 }, { "epoch": 0.10093820769977353, "grad_norm": 1.0694175958633423, "learning_rate": 9.998667440610765e-05, "loss": 0.9483, "step": 78 }, { "epoch": 0.10223228728566806, "grad_norm": 0.9963059425354004, "learning_rate": 9.998570562720654e-05, "loss": 0.9577, "step": 79 }, { "epoch": 0.1035263668715626, "grad_norm": 0.8873535394668579, "learning_rate": 9.998470286265416e-05, "loss": 0.8498, "step": 80 }, { "epoch": 0.10482044645745714, "grad_norm": 1.1350760459899902, "learning_rate": 9.99836661131323e-05, "loss": 1.0024, "step": 81 }, { "epoch": 0.10611452604335167, "grad_norm": 0.8355389833450317, "learning_rate": 9.998259537934586e-05, "loss": 0.7399, "step": 82 }, { "epoch": 0.1074086056292462, "grad_norm": 0.9935446381568909, "learning_rate": 9.998149066202284e-05, "loss": 0.9809, "step": 83 }, { "epoch": 0.10870268521514073, "grad_norm": 1.0571558475494385, "learning_rate": 9.998035196191435e-05, "loss": 1.0144, "step": 84 }, { "epoch": 0.10999676480103526, "grad_norm": 0.9860286116600037, "learning_rate": 9.99791792797946e-05, "loss": 1.0467, "step": 85 }, { "epoch": 0.1112908443869298, "grad_norm": 1.1422507762908936, "learning_rate": 9.997797261646089e-05, "loss": 0.9535, "step": 86 }, { "epoch": 0.11258492397282432, "grad_norm": 0.8561545014381409, "learning_rate": 9.997673197273365e-05, "loss": 1.007, "step": 87 }, { "epoch": 0.11387900355871886, "grad_norm": 1.0027543306350708, "learning_rate": 9.997545734945639e-05, "loss": 0.9861, "step": 88 }, { "epoch": 0.1151730831446134, "grad_norm": 0.8489773273468018, "learning_rate": 9.997414874749575e-05, "loss": 0.9672, "step": 89 }, { "epoch": 0.11646716273050793, "grad_norm": 1.0517115592956543, "learning_rate": 9.997280616774147e-05, "loss": 1.1672, "step": 90 }, { "epoch": 0.11776124231640246, "grad_norm": 1.0035395622253418, "learning_rate": 9.997142961110634e-05, "loss": 0.9294, "step": 91 }, { "epoch": 0.11905532190229699, "grad_norm": 1.1194915771484375, "learning_rate": 9.997001907852635e-05, "loss": 1.0857, "step": 92 }, { "epoch": 0.12034940148819152, "grad_norm": 1.5234825611114502, "learning_rate": 9.996857457096047e-05, "loss": 1.027, "step": 93 }, { "epoch": 0.12164348107408605, "grad_norm": 0.949878454208374, "learning_rate": 9.996709608939088e-05, "loss": 0.8173, "step": 94 }, { "epoch": 0.12293756065998059, "grad_norm": 0.8736472129821777, "learning_rate": 9.996558363482277e-05, "loss": 0.855, "step": 95 }, { "epoch": 0.12423164024587512, "grad_norm": 0.8604567050933838, "learning_rate": 9.996403720828449e-05, "loss": 0.9485, "step": 96 }, { "epoch": 0.12552571983176966, "grad_norm": 1.020851492881775, "learning_rate": 9.996245681082748e-05, "loss": 1.0024, "step": 97 }, { "epoch": 0.1268197994176642, "grad_norm": 1.0704892873764038, "learning_rate": 9.996084244352623e-05, "loss": 0.9246, "step": 98 }, { "epoch": 0.12811387900355872, "grad_norm": 0.8441987037658691, "learning_rate": 9.99591941074784e-05, "loss": 1.0343, "step": 99 }, { "epoch": 0.12940795858945325, "grad_norm": 1.0280612707138062, "learning_rate": 9.995751180380466e-05, "loss": 0.9644, "step": 100 }, { "epoch": 0.13070203817534778, "grad_norm": 0.9827906489372253, "learning_rate": 9.995579553364887e-05, "loss": 0.9583, "step": 101 }, { "epoch": 0.13199611776124232, "grad_norm": 1.035618543624878, "learning_rate": 9.995404529817791e-05, "loss": 1.0366, "step": 102 }, { "epoch": 0.13329019734713685, "grad_norm": 1.2775524854660034, "learning_rate": 9.995226109858178e-05, "loss": 0.9353, "step": 103 }, { "epoch": 0.13458427693303138, "grad_norm": 1.0101919174194336, "learning_rate": 9.995044293607355e-05, "loss": 0.9045, "step": 104 }, { "epoch": 0.1358783565189259, "grad_norm": 0.8396942019462585, "learning_rate": 9.994859081188943e-05, "loss": 0.867, "step": 105 }, { "epoch": 0.13717243610482044, "grad_norm": 1.04515540599823, "learning_rate": 9.99467047272887e-05, "loss": 0.9693, "step": 106 }, { "epoch": 0.13846651569071497, "grad_norm": 1.099042534828186, "learning_rate": 9.994478468355369e-05, "loss": 0.8879, "step": 107 }, { "epoch": 0.1397605952766095, "grad_norm": 0.8710360527038574, "learning_rate": 9.994283068198988e-05, "loss": 0.9018, "step": 108 }, { "epoch": 0.14105467486250403, "grad_norm": 0.961025059223175, "learning_rate": 9.99408427239258e-05, "loss": 0.8806, "step": 109 }, { "epoch": 0.1423487544483986, "grad_norm": 0.915665328502655, "learning_rate": 9.993882081071306e-05, "loss": 0.8628, "step": 110 }, { "epoch": 0.14364283403429312, "grad_norm": 1.2776648998260498, "learning_rate": 9.993676494372642e-05, "loss": 0.9742, "step": 111 }, { "epoch": 0.14493691362018765, "grad_norm": 1.1270071268081665, "learning_rate": 9.993467512436364e-05, "loss": 0.9729, "step": 112 }, { "epoch": 0.14623099320608218, "grad_norm": 0.8188664317131042, "learning_rate": 9.99325513540456e-05, "loss": 0.9428, "step": 113 }, { "epoch": 0.1475250727919767, "grad_norm": 1.0760393142700195, "learning_rate": 9.993039363421627e-05, "loss": 0.9482, "step": 114 }, { "epoch": 0.14881915237787124, "grad_norm": 1.019920825958252, "learning_rate": 9.992820196634273e-05, "loss": 0.9785, "step": 115 } ], "logging_steps": 1, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2856277474476032e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }