{ "best_metric": 1.2635489702224731, "best_model_checkpoint": "data/Llama-31-8B_task-1_120-samples_config-4/checkpoint-137", "epoch": 32.0, "eval_steps": 500, "global_step": 176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18181818181818182, "grad_norm": 1.874562382698059, "learning_rate": 1.3333333333333336e-07, "loss": 2.2898, "step": 1 }, { "epoch": 0.36363636363636365, "grad_norm": 1.8042479753494263, "learning_rate": 2.666666666666667e-07, "loss": 2.0811, "step": 2 }, { "epoch": 0.7272727272727273, "grad_norm": 1.760155200958252, "learning_rate": 5.333333333333335e-07, "loss": 2.121, "step": 4 }, { "epoch": 0.9090909090909091, "eval_loss": 2.101999044418335, "eval_runtime": 9.6381, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "step": 5 }, { "epoch": 1.0909090909090908, "grad_norm": 1.5958633422851562, "learning_rate": 8.000000000000001e-07, "loss": 1.9468, "step": 6 }, { "epoch": 1.4545454545454546, "grad_norm": 1.8069952726364136, "learning_rate": 1.066666666666667e-06, "loss": 2.1471, "step": 8 }, { "epoch": 1.8181818181818183, "grad_norm": 1.736191749572754, "learning_rate": 1.3333333333333334e-06, "loss": 2.0709, "step": 10 }, { "epoch": 2.0, "eval_loss": 2.0930726528167725, "eval_runtime": 9.6259, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 11 }, { "epoch": 2.1818181818181817, "grad_norm": 1.6809439659118652, "learning_rate": 1.6000000000000001e-06, "loss": 2.1306, "step": 12 }, { "epoch": 2.5454545454545454, "grad_norm": 1.747848391532898, "learning_rate": 1.8666666666666669e-06, "loss": 2.0776, "step": 14 }, { "epoch": 2.909090909090909, "grad_norm": 1.8399686813354492, "learning_rate": 2.133333333333334e-06, "loss": 2.0454, "step": 16 }, { "epoch": 2.909090909090909, "eval_loss": 2.075482130050659, "eval_runtime": 9.6366, "eval_samples_per_second": 2.491, "eval_steps_per_second": 2.491, "step": 16 }, { "epoch": 3.2727272727272725, "grad_norm": 1.8816026449203491, "learning_rate": 2.4000000000000003e-06, "loss": 2.1284, "step": 18 }, { "epoch": 3.6363636363636362, "grad_norm": 1.5757534503936768, "learning_rate": 2.666666666666667e-06, "loss": 2.03, "step": 20 }, { "epoch": 4.0, "grad_norm": 1.9466415643692017, "learning_rate": 2.9333333333333338e-06, "loss": 2.0502, "step": 22 }, { "epoch": 4.0, "eval_loss": 2.0472075939178467, "eval_runtime": 9.6277, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 22 }, { "epoch": 4.363636363636363, "grad_norm": 1.6615264415740967, "learning_rate": 3.2000000000000003e-06, "loss": 2.0197, "step": 24 }, { "epoch": 4.7272727272727275, "grad_norm": 1.7619231939315796, "learning_rate": 3.4666666666666672e-06, "loss": 2.0511, "step": 26 }, { "epoch": 4.909090909090909, "eval_loss": 2.0100386142730713, "eval_runtime": 9.6313, "eval_samples_per_second": 2.492, "eval_steps_per_second": 2.492, "step": 27 }, { "epoch": 5.090909090909091, "grad_norm": 1.8135912418365479, "learning_rate": 3.7333333333333337e-06, "loss": 1.9759, "step": 28 }, { "epoch": 5.454545454545454, "grad_norm": 1.8354995250701904, "learning_rate": 4.000000000000001e-06, "loss": 2.0128, "step": 30 }, { "epoch": 5.818181818181818, "grad_norm": 1.8015000820159912, "learning_rate": 4.266666666666668e-06, "loss": 1.9554, "step": 32 }, { "epoch": 6.0, "eval_loss": 1.9471648931503296, "eval_runtime": 9.6325, "eval_samples_per_second": 2.492, "eval_steps_per_second": 2.492, "step": 33 }, { "epoch": 6.181818181818182, "grad_norm": 1.7802411317825317, "learning_rate": 4.533333333333334e-06, "loss": 1.9607, "step": 34 }, { "epoch": 6.545454545454545, "grad_norm": 1.5615451335906982, "learning_rate": 4.800000000000001e-06, "loss": 1.9032, "step": 36 }, { "epoch": 6.909090909090909, "grad_norm": 1.8741137981414795, "learning_rate": 5.0666666666666676e-06, "loss": 1.8921, "step": 38 }, { "epoch": 6.909090909090909, "eval_loss": 1.8795281648635864, "eval_runtime": 9.6403, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "step": 38 }, { "epoch": 7.2727272727272725, "grad_norm": 1.7604111433029175, "learning_rate": 5.333333333333334e-06, "loss": 1.8681, "step": 40 }, { "epoch": 7.636363636363637, "grad_norm": 1.6821084022521973, "learning_rate": 5.600000000000001e-06, "loss": 1.769, "step": 42 }, { "epoch": 8.0, "grad_norm": 1.665964126586914, "learning_rate": 5.8666666666666675e-06, "loss": 1.8104, "step": 44 }, { "epoch": 8.0, "eval_loss": 1.7812843322753906, "eval_runtime": 9.6236, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 44 }, { "epoch": 8.363636363636363, "grad_norm": 1.5216890573501587, "learning_rate": 6.133333333333334e-06, "loss": 1.7145, "step": 46 }, { "epoch": 8.727272727272727, "grad_norm": 1.4722410440444946, "learning_rate": 6.4000000000000006e-06, "loss": 1.7636, "step": 48 }, { "epoch": 8.909090909090908, "eval_loss": 1.6937414407730103, "eval_runtime": 9.6259, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 49 }, { "epoch": 9.090909090909092, "grad_norm": 1.2136281728744507, "learning_rate": 6.666666666666667e-06, "loss": 1.659, "step": 50 }, { "epoch": 9.454545454545455, "grad_norm": 1.0023685693740845, "learning_rate": 6.9333333333333344e-06, "loss": 1.6509, "step": 52 }, { "epoch": 9.818181818181818, "grad_norm": 1.0440162420272827, "learning_rate": 7.2000000000000005e-06, "loss": 1.6011, "step": 54 }, { "epoch": 10.0, "eval_loss": 1.6141911745071411, "eval_runtime": 9.6302, "eval_samples_per_second": 2.492, "eval_steps_per_second": 2.492, "step": 55 }, { "epoch": 10.181818181818182, "grad_norm": 0.7877157926559448, "learning_rate": 7.4666666666666675e-06, "loss": 1.5814, "step": 56 }, { "epoch": 10.545454545454545, "grad_norm": 0.6534942984580994, "learning_rate": 7.733333333333334e-06, "loss": 1.5824, "step": 58 }, { "epoch": 10.909090909090908, "grad_norm": 0.6240991950035095, "learning_rate": 8.000000000000001e-06, "loss": 1.5128, "step": 60 }, { "epoch": 10.909090909090908, "eval_loss": 1.5751093626022339, "eval_runtime": 9.6475, "eval_samples_per_second": 2.488, "eval_steps_per_second": 2.488, "step": 60 }, { "epoch": 11.272727272727273, "grad_norm": 0.6224139928817749, "learning_rate": 8.266666666666667e-06, "loss": 1.5444, "step": 62 }, { "epoch": 11.636363636363637, "grad_norm": 0.6345284581184387, "learning_rate": 8.533333333333335e-06, "loss": 1.5682, "step": 64 }, { "epoch": 12.0, "grad_norm": 0.5680299997329712, "learning_rate": 8.8e-06, "loss": 1.4277, "step": 66 }, { "epoch": 12.0, "eval_loss": 1.5352739095687866, "eval_runtime": 9.6312, "eval_samples_per_second": 2.492, "eval_steps_per_second": 2.492, "step": 66 }, { "epoch": 12.363636363636363, "grad_norm": 0.5991209745407104, "learning_rate": 9.066666666666667e-06, "loss": 1.4703, "step": 68 }, { "epoch": 12.727272727272727, "grad_norm": 0.5993205308914185, "learning_rate": 9.333333333333334e-06, "loss": 1.4998, "step": 70 }, { "epoch": 12.909090909090908, "eval_loss": 1.5001062154769897, "eval_runtime": 9.6248, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 71 }, { "epoch": 13.090909090909092, "grad_norm": 0.5633314251899719, "learning_rate": 9.600000000000001e-06, "loss": 1.445, "step": 72 }, { "epoch": 13.454545454545455, "grad_norm": 0.5419648885726929, "learning_rate": 9.866666666666668e-06, "loss": 1.4256, "step": 74 }, { "epoch": 13.818181818181818, "grad_norm": 0.5384172201156616, "learning_rate": 9.999945845889795e-06, "loss": 1.4154, "step": 76 }, { "epoch": 14.0, "eval_loss": 1.4582971334457397, "eval_runtime": 9.6218, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 77 }, { "epoch": 14.181818181818182, "grad_norm": 0.6161755323410034, "learning_rate": 9.999512620046523e-06, "loss": 1.4459, "step": 78 }, { "epoch": 14.545454545454545, "grad_norm": 0.5570430159568787, "learning_rate": 9.99864620589731e-06, "loss": 1.3661, "step": 80 }, { "epoch": 14.909090909090908, "grad_norm": 0.5637471675872803, "learning_rate": 9.99734667851357e-06, "loss": 1.4201, "step": 82 }, { "epoch": 14.909090909090908, "eval_loss": 1.4252301454544067, "eval_runtime": 9.6319, "eval_samples_per_second": 2.492, "eval_steps_per_second": 2.492, "step": 82 }, { "epoch": 15.272727272727273, "grad_norm": 0.5539014935493469, "learning_rate": 9.995614150494293e-06, "loss": 1.3497, "step": 84 }, { "epoch": 15.636363636363637, "grad_norm": 0.5583813786506653, "learning_rate": 9.993448771956285e-06, "loss": 1.3512, "step": 86 }, { "epoch": 16.0, "grad_norm": 0.5377728939056396, "learning_rate": 9.99085073052117e-06, "loss": 1.3364, "step": 88 }, { "epoch": 16.0, "eval_loss": 1.3921159505844116, "eval_runtime": 9.635, "eval_samples_per_second": 2.491, "eval_steps_per_second": 2.491, "step": 88 }, { "epoch": 16.363636363636363, "grad_norm": 0.5390649437904358, "learning_rate": 9.987820251299121e-06, "loss": 1.3614, "step": 90 }, { "epoch": 16.727272727272727, "grad_norm": 0.5126790404319763, "learning_rate": 9.984357596869369e-06, "loss": 1.2762, "step": 92 }, { "epoch": 16.90909090909091, "eval_loss": 1.3691315650939941, "eval_runtime": 9.6319, "eval_samples_per_second": 2.492, "eval_steps_per_second": 2.492, "step": 93 }, { "epoch": 17.09090909090909, "grad_norm": 0.5642189383506775, "learning_rate": 9.980463067257437e-06, "loss": 1.2961, "step": 94 }, { "epoch": 17.454545454545453, "grad_norm": 0.5290245413780212, "learning_rate": 9.976136999909156e-06, "loss": 1.1987, "step": 96 }, { "epoch": 17.818181818181817, "grad_norm": 0.5963008403778076, "learning_rate": 9.971379769661422e-06, "loss": 1.2851, "step": 98 }, { "epoch": 18.0, "eval_loss": 1.3436861038208008, "eval_runtime": 9.6214, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 99 }, { "epoch": 18.181818181818183, "grad_norm": 0.5820615291595459, "learning_rate": 9.966191788709716e-06, "loss": 1.329, "step": 100 }, { "epoch": 18.545454545454547, "grad_norm": 0.5619508624076843, "learning_rate": 9.960573506572391e-06, "loss": 1.2428, "step": 102 }, { "epoch": 18.90909090909091, "grad_norm": 0.5272645950317383, "learning_rate": 9.95452541005172e-06, "loss": 1.2239, "step": 104 }, { "epoch": 18.90909090909091, "eval_loss": 1.3261139392852783, "eval_runtime": 9.6475, "eval_samples_per_second": 2.488, "eval_steps_per_second": 2.488, "step": 104 }, { "epoch": 19.272727272727273, "grad_norm": 0.5720901489257812, "learning_rate": 9.948048023191728e-06, "loss": 1.1753, "step": 106 }, { "epoch": 19.636363636363637, "grad_norm": 0.5877869725227356, "learning_rate": 9.941141907232766e-06, "loss": 1.2334, "step": 108 }, { "epoch": 20.0, "grad_norm": 0.5674625039100647, "learning_rate": 9.933807660562898e-06, "loss": 1.221, "step": 110 }, { "epoch": 20.0, "eval_loss": 1.308407187461853, "eval_runtime": 9.6226, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 110 }, { "epoch": 20.363636363636363, "grad_norm": 0.5934170484542847, "learning_rate": 9.926045918666045e-06, "loss": 1.1685, "step": 112 }, { "epoch": 20.727272727272727, "grad_norm": 0.6199212670326233, "learning_rate": 9.91785735406693e-06, "loss": 1.2011, "step": 114 }, { "epoch": 20.90909090909091, "eval_loss": 1.2950953245162964, "eval_runtime": 9.6285, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 115 }, { "epoch": 21.09090909090909, "grad_norm": 0.5995011329650879, "learning_rate": 9.909242676272797e-06, "loss": 1.1717, "step": 116 }, { "epoch": 21.454545454545453, "grad_norm": 0.6024748682975769, "learning_rate": 9.90020263171194e-06, "loss": 1.1654, "step": 118 }, { "epoch": 21.818181818181817, "grad_norm": 0.6147428750991821, "learning_rate": 9.890738003669029e-06, "loss": 1.1433, "step": 120 }, { "epoch": 22.0, "eval_loss": 1.2823587656021118, "eval_runtime": 9.6228, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 121 }, { "epoch": 22.181818181818183, "grad_norm": 0.612140953540802, "learning_rate": 9.880849612217238e-06, "loss": 1.0765, "step": 122 }, { "epoch": 22.545454545454547, "grad_norm": 0.647298276424408, "learning_rate": 9.870538314147194e-06, "loss": 1.1183, "step": 124 }, { "epoch": 22.90909090909091, "grad_norm": 0.6705971360206604, "learning_rate": 9.859805002892733e-06, "loss": 1.1579, "step": 126 }, { "epoch": 22.90909090909091, "eval_loss": 1.2746213674545288, "eval_runtime": 9.6328, "eval_samples_per_second": 2.491, "eval_steps_per_second": 2.491, "step": 126 }, { "epoch": 23.272727272727273, "grad_norm": 0.670023500919342, "learning_rate": 9.84865060845349e-06, "loss": 1.0965, "step": 128 }, { "epoch": 23.636363636363637, "grad_norm": 0.6824691891670227, "learning_rate": 9.83707609731432e-06, "loss": 1.061, "step": 130 }, { "epoch": 24.0, "grad_norm": 0.6598721146583557, "learning_rate": 9.825082472361558e-06, "loss": 1.0871, "step": 132 }, { "epoch": 24.0, "eval_loss": 1.268039345741272, "eval_runtime": 9.6219, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 132 }, { "epoch": 24.363636363636363, "grad_norm": 0.6824683547019958, "learning_rate": 9.812670772796113e-06, "loss": 1.0733, "step": 134 }, { "epoch": 24.727272727272727, "grad_norm": 0.7309969663619995, "learning_rate": 9.799842074043438e-06, "loss": 1.0745, "step": 136 }, { "epoch": 24.90909090909091, "eval_loss": 1.2635489702224731, "eval_runtime": 9.6334, "eval_samples_per_second": 2.491, "eval_steps_per_second": 2.491, "step": 137 }, { "epoch": 25.09090909090909, "grad_norm": 0.8717047572135925, "learning_rate": 9.786597487660336e-06, "loss": 1.0049, "step": 138 }, { "epoch": 25.454545454545453, "grad_norm": 0.7290262579917908, "learning_rate": 9.77293816123866e-06, "loss": 1.0355, "step": 140 }, { "epoch": 25.818181818181817, "grad_norm": 0.8125291466712952, "learning_rate": 9.75886527830587e-06, "loss": 1.0006, "step": 142 }, { "epoch": 26.0, "eval_loss": 1.2674241065979004, "eval_runtime": 9.6242, "eval_samples_per_second": 2.494, "eval_steps_per_second": 2.494, "step": 143 }, { "epoch": 26.181818181818183, "grad_norm": 0.777037501335144, "learning_rate": 9.744380058222483e-06, "loss": 1.0235, "step": 144 }, { "epoch": 26.545454545454547, "grad_norm": 0.7910988330841064, "learning_rate": 9.729483756076436e-06, "loss": 1.0119, "step": 146 }, { "epoch": 26.90909090909091, "grad_norm": 0.8250744342803955, "learning_rate": 9.714177662574316e-06, "loss": 0.9628, "step": 148 }, { "epoch": 26.90909090909091, "eval_loss": 1.2688733339309692, "eval_runtime": 9.6388, "eval_samples_per_second": 2.49, "eval_steps_per_second": 2.49, "step": 148 }, { "epoch": 27.272727272727273, "grad_norm": 0.9542063474655151, "learning_rate": 9.698463103929542e-06, "loss": 0.9176, "step": 150 }, { "epoch": 27.636363636363637, "grad_norm": 0.8577086925506592, "learning_rate": 9.682341441747446e-06, "loss": 0.9908, "step": 152 }, { "epoch": 28.0, "grad_norm": 0.8569504022598267, "learning_rate": 9.665814072907293e-06, "loss": 0.9237, "step": 154 }, { "epoch": 28.0, "eval_loss": 1.2716994285583496, "eval_runtime": 9.6271, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 154 }, { "epoch": 28.363636363636363, "grad_norm": 0.933702826499939, "learning_rate": 9.648882429441258e-06, "loss": 0.9053, "step": 156 }, { "epoch": 28.727272727272727, "grad_norm": 1.002100944519043, "learning_rate": 9.63154797841033e-06, "loss": 0.8824, "step": 158 }, { "epoch": 28.90909090909091, "eval_loss": 1.2879880666732788, "eval_runtime": 9.6501, "eval_samples_per_second": 2.487, "eval_steps_per_second": 2.487, "step": 159 }, { "epoch": 29.09090909090909, "grad_norm": 0.9883065819740295, "learning_rate": 9.613812221777212e-06, "loss": 0.9089, "step": 160 }, { "epoch": 29.454545454545453, "grad_norm": 1.0561895370483398, "learning_rate": 9.595676696276173e-06, "loss": 0.9285, "step": 162 }, { "epoch": 29.818181818181817, "grad_norm": 1.1776067018508911, "learning_rate": 9.577142973279896e-06, "loss": 0.8706, "step": 164 }, { "epoch": 30.0, "eval_loss": 1.296054482460022, "eval_runtime": 9.6279, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 165 }, { "epoch": 30.181818181818183, "grad_norm": 1.0879513025283813, "learning_rate": 9.55821265866333e-06, "loss": 0.7961, "step": 166 }, { "epoch": 30.545454545454547, "grad_norm": 1.1668307781219482, "learning_rate": 9.538887392664544e-06, "loss": 0.7865, "step": 168 }, { "epoch": 30.90909090909091, "grad_norm": 1.065364956855774, "learning_rate": 9.519168849742603e-06, "loss": 0.8328, "step": 170 }, { "epoch": 30.90909090909091, "eval_loss": 1.326621174812317, "eval_runtime": 9.6286, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 170 }, { "epoch": 31.272727272727273, "grad_norm": 1.143373727798462, "learning_rate": 9.499058738432492e-06, "loss": 0.8381, "step": 172 }, { "epoch": 31.636363636363637, "grad_norm": 1.1452257633209229, "learning_rate": 9.478558801197065e-06, "loss": 0.7725, "step": 174 }, { "epoch": 32.0, "grad_norm": 1.2163513898849487, "learning_rate": 9.457670814276083e-06, "loss": 0.7667, "step": 176 }, { "epoch": 32.0, "eval_loss": 1.344734787940979, "eval_runtime": 9.6252, "eval_samples_per_second": 2.493, "eval_steps_per_second": 2.493, "step": 176 }, { "epoch": 32.0, "step": 176, "total_flos": 4.322507713465549e+16, "train_loss": 1.4077397015961735, "train_runtime": 3298.1261, "train_samples_per_second": 4.002, "train_steps_per_second": 0.227 } ], "logging_steps": 2, "max_steps": 750, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 7, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.322507713465549e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }