{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10982976386600769, "eval_steps": 1000, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 11.704396157183714, "learning_rate": 0.0, "loss": 2.6917, "step": 1 }, { "epoch": 0.0, "grad_norm": 14.261265634920543, "learning_rate": 6.020599913279623e-05, "loss": 2.0455, "step": 2 }, { "epoch": 0.0, "grad_norm": 9.252464584699219, "learning_rate": 9.542425094393248e-05, "loss": 2.1024, "step": 3 }, { "epoch": 0.0, "grad_norm": 7.478072090774749, "learning_rate": 0.00012041199826559246, "loss": 1.9419, "step": 4 }, { "epoch": 0.01, "grad_norm": 6.996421444202911, "learning_rate": 0.00013979400086720374, "loss": 1.2142, "step": 5 }, { "epoch": 0.01, "grad_norm": 9.62925923224712, "learning_rate": 0.00015563025007672872, "loss": 1.4192, "step": 6 }, { "epoch": 0.01, "grad_norm": 5.9971192119861545, "learning_rate": 0.00016901960800285134, "loss": 1.1698, "step": 7 }, { "epoch": 0.01, "grad_norm": 15.448410811680446, "learning_rate": 0.00018061799739838867, "loss": 1.0663, "step": 8 }, { "epoch": 0.01, "grad_norm": 6.664012698560983, "learning_rate": 0.00019084850188786495, "loss": 1.0346, "step": 9 }, { "epoch": 0.01, "grad_norm": 5.9872198688383325, "learning_rate": 0.00019999999999999998, "loss": 1.2899, "step": 10 }, { "epoch": 0.01, "grad_norm": 5.492858411635369, "learning_rate": 0.0002, "loss": 0.733, "step": 11 }, { "epoch": 0.01, "grad_norm": 4.117272508239465, "learning_rate": 0.00019931271477663232, "loss": 0.7529, "step": 12 }, { "epoch": 0.01, "grad_norm": 4.57016142244955, "learning_rate": 0.0001986254295532646, "loss": 0.9221, "step": 13 }, { "epoch": 0.02, "grad_norm": 5.106703686957636, "learning_rate": 0.00019793814432989693, "loss": 1.1292, "step": 14 }, { "epoch": 0.02, "grad_norm": 4.4331559960790115, "learning_rate": 0.00019725085910652924, "loss": 0.9505, "step": 15 }, { "epoch": 0.02, "grad_norm": 4.871898936371301, "learning_rate": 0.0001965635738831615, "loss": 1.0894, "step": 16 }, { "epoch": 0.02, "grad_norm": 5.546128257668831, "learning_rate": 0.00019587628865979381, "loss": 1.0915, "step": 17 }, { "epoch": 0.02, "grad_norm": 5.426135233570764, "learning_rate": 0.00019518900343642613, "loss": 1.2174, "step": 18 }, { "epoch": 0.02, "grad_norm": 3.7481567939197316, "learning_rate": 0.00019450171821305842, "loss": 0.8281, "step": 19 }, { "epoch": 0.02, "grad_norm": 6.264747615275931, "learning_rate": 0.00019381443298969073, "loss": 1.1069, "step": 20 }, { "epoch": 0.02, "grad_norm": 4.850720515264914, "learning_rate": 0.00019312714776632305, "loss": 1.1216, "step": 21 }, { "epoch": 0.02, "grad_norm": 4.583277384098615, "learning_rate": 0.00019243986254295533, "loss": 1.0751, "step": 22 }, { "epoch": 0.03, "grad_norm": 4.923174586611154, "learning_rate": 0.00019175257731958765, "loss": 1.1915, "step": 23 }, { "epoch": 0.03, "grad_norm": 4.21422328394722, "learning_rate": 0.00019106529209621996, "loss": 1.1412, "step": 24 }, { "epoch": 0.03, "grad_norm": 3.9312652393524803, "learning_rate": 0.00019037800687285222, "loss": 0.9699, "step": 25 }, { "epoch": 0.03, "grad_norm": 4.511161102577495, "learning_rate": 0.00018969072164948454, "loss": 1.1289, "step": 26 }, { "epoch": 0.03, "grad_norm": 3.9233152624949454, "learning_rate": 0.00018900343642611685, "loss": 0.8803, "step": 27 }, { "epoch": 0.03, "grad_norm": 5.413459689226915, "learning_rate": 0.00018831615120274914, "loss": 1.0594, "step": 28 }, { "epoch": 0.03, "grad_norm": 4.2106489602848685, "learning_rate": 0.00018762886597938145, "loss": 1.1003, "step": 29 }, { "epoch": 0.03, "grad_norm": 4.568263346040736, "learning_rate": 0.00018694158075601377, "loss": 1.0816, "step": 30 }, { "epoch": 0.03, "grad_norm": 3.12202434010765, "learning_rate": 0.00018625429553264605, "loss": 0.9053, "step": 31 }, { "epoch": 0.04, "grad_norm": 4.282623293516478, "learning_rate": 0.00018556701030927837, "loss": 1.206, "step": 32 }, { "epoch": 0.04, "grad_norm": 6.45030819651884, "learning_rate": 0.00018487972508591068, "loss": 0.9049, "step": 33 }, { "epoch": 0.04, "grad_norm": 4.871975082534186, "learning_rate": 0.00018419243986254294, "loss": 1.1436, "step": 34 }, { "epoch": 0.04, "grad_norm": 4.715014058776802, "learning_rate": 0.00018350515463917526, "loss": 1.3847, "step": 35 }, { "epoch": 0.04, "grad_norm": 3.3808462582338707, "learning_rate": 0.00018281786941580757, "loss": 0.7817, "step": 36 }, { "epoch": 0.04, "grad_norm": 3.8976964456454866, "learning_rate": 0.00018213058419243986, "loss": 1.0001, "step": 37 }, { "epoch": 0.04, "grad_norm": 5.355867444757705, "learning_rate": 0.00018144329896907217, "loss": 0.9259, "step": 38 }, { "epoch": 0.04, "grad_norm": 5.178991855035252, "learning_rate": 0.0001807560137457045, "loss": 0.794, "step": 39 }, { "epoch": 0.04, "grad_norm": 4.14441205427679, "learning_rate": 0.00018006872852233677, "loss": 1.0384, "step": 40 }, { "epoch": 0.05, "grad_norm": 3.7176836552561974, "learning_rate": 0.0001793814432989691, "loss": 0.75, "step": 41 }, { "epoch": 0.05, "grad_norm": 5.011588210132538, "learning_rate": 0.0001786941580756014, "loss": 1.3034, "step": 42 }, { "epoch": 0.05, "grad_norm": 5.111725152950483, "learning_rate": 0.00017800687285223366, "loss": 1.3023, "step": 43 }, { "epoch": 0.05, "grad_norm": 4.021959822322685, "learning_rate": 0.00017731958762886598, "loss": 0.9624, "step": 44 }, { "epoch": 0.05, "grad_norm": 2.943541786297702, "learning_rate": 0.0001766323024054983, "loss": 0.6659, "step": 45 }, { "epoch": 0.05, "grad_norm": 4.534002060279327, "learning_rate": 0.00017594501718213058, "loss": 1.1354, "step": 46 }, { "epoch": 0.05, "grad_norm": 5.108426720583385, "learning_rate": 0.0001752577319587629, "loss": 1.4258, "step": 47 }, { "epoch": 0.05, "grad_norm": 7.235810963301271, "learning_rate": 0.0001745704467353952, "loss": 1.3918, "step": 48 }, { "epoch": 0.05, "grad_norm": 4.207791712185817, "learning_rate": 0.0001738831615120275, "loss": 1.2796, "step": 49 }, { "epoch": 0.05, "grad_norm": 4.050243728273998, "learning_rate": 0.0001731958762886598, "loss": 1.244, "step": 50 }, { "epoch": 0.06, "grad_norm": 4.657182228931627, "learning_rate": 0.00017250859106529212, "loss": 1.2068, "step": 51 }, { "epoch": 0.06, "grad_norm": 3.5652582299228643, "learning_rate": 0.00017182130584192438, "loss": 0.8897, "step": 52 }, { "epoch": 0.06, "grad_norm": 4.529701968193304, "learning_rate": 0.0001711340206185567, "loss": 1.1582, "step": 53 }, { "epoch": 0.06, "grad_norm": 4.49586529984068, "learning_rate": 0.000170446735395189, "loss": 1.1379, "step": 54 }, { "epoch": 0.06, "grad_norm": 5.845994813972232, "learning_rate": 0.0001697594501718213, "loss": 1.4724, "step": 55 }, { "epoch": 0.06, "grad_norm": 4.732950229095352, "learning_rate": 0.00016907216494845361, "loss": 0.886, "step": 56 }, { "epoch": 0.06, "grad_norm": 3.8567901480339426, "learning_rate": 0.00016838487972508593, "loss": 0.7763, "step": 57 }, { "epoch": 0.06, "grad_norm": 4.931804800794497, "learning_rate": 0.00016769759450171822, "loss": 1.1761, "step": 58 }, { "epoch": 0.06, "grad_norm": 3.7294788904744625, "learning_rate": 0.00016701030927835053, "loss": 0.6722, "step": 59 }, { "epoch": 0.07, "grad_norm": 3.688890615204965, "learning_rate": 0.00016632302405498285, "loss": 0.9642, "step": 60 }, { "epoch": 0.07, "grad_norm": 4.022317141495382, "learning_rate": 0.00016563573883161513, "loss": 0.8865, "step": 61 }, { "epoch": 0.07, "grad_norm": 6.55573614414137, "learning_rate": 0.00016494845360824742, "loss": 1.217, "step": 62 }, { "epoch": 0.07, "grad_norm": 4.5890564068710615, "learning_rate": 0.00016426116838487973, "loss": 1.0791, "step": 63 }, { "epoch": 0.07, "grad_norm": 5.435288078022071, "learning_rate": 0.00016357388316151202, "loss": 1.1501, "step": 64 }, { "epoch": 0.07, "grad_norm": 4.965664463354534, "learning_rate": 0.00016288659793814434, "loss": 1.4109, "step": 65 }, { "epoch": 0.07, "grad_norm": 7.483997056692371, "learning_rate": 0.00016219931271477665, "loss": 1.0735, "step": 66 }, { "epoch": 0.07, "grad_norm": 4.9923544603584915, "learning_rate": 0.00016151202749140894, "loss": 0.9927, "step": 67 }, { "epoch": 0.07, "grad_norm": 5.143678077389599, "learning_rate": 0.00016082474226804125, "loss": 1.1299, "step": 68 }, { "epoch": 0.08, "grad_norm": 4.420855401236021, "learning_rate": 0.00016013745704467357, "loss": 1.0263, "step": 69 }, { "epoch": 0.08, "grad_norm": 5.02485050719212, "learning_rate": 0.00015945017182130585, "loss": 1.1168, "step": 70 }, { "epoch": 0.08, "grad_norm": 4.524608717382107, "learning_rate": 0.00015876288659793814, "loss": 1.1015, "step": 71 }, { "epoch": 0.08, "grad_norm": 3.9028726855074054, "learning_rate": 0.00015807560137457046, "loss": 1.091, "step": 72 }, { "epoch": 0.08, "grad_norm": 3.271315148375007, "learning_rate": 0.00015738831615120274, "loss": 0.9397, "step": 73 }, { "epoch": 0.08, "grad_norm": 4.5227145076761515, "learning_rate": 0.00015670103092783506, "loss": 1.0062, "step": 74 }, { "epoch": 0.08, "grad_norm": 3.344752651586208, "learning_rate": 0.00015601374570446737, "loss": 1.0027, "step": 75 }, { "epoch": 0.08, "grad_norm": 4.102353206893142, "learning_rate": 0.00015532646048109966, "loss": 1.3915, "step": 76 }, { "epoch": 0.08, "grad_norm": 4.722343844673861, "learning_rate": 0.00015463917525773197, "loss": 1.1678, "step": 77 }, { "epoch": 0.09, "grad_norm": 6.017428516681286, "learning_rate": 0.0001539518900343643, "loss": 1.329, "step": 78 }, { "epoch": 0.09, "grad_norm": 3.7313811619387516, "learning_rate": 0.00015326460481099657, "loss": 0.887, "step": 79 }, { "epoch": 0.09, "grad_norm": 3.7179924970759406, "learning_rate": 0.00015257731958762886, "loss": 1.0153, "step": 80 }, { "epoch": 0.09, "grad_norm": 4.499981774187266, "learning_rate": 0.00015189003436426118, "loss": 1.3911, "step": 81 }, { "epoch": 0.09, "grad_norm": 5.955286629774712, "learning_rate": 0.00015120274914089346, "loss": 1.215, "step": 82 }, { "epoch": 0.09, "grad_norm": 3.5953610996772984, "learning_rate": 0.00015051546391752578, "loss": 0.9343, "step": 83 }, { "epoch": 0.09, "grad_norm": 3.8040779769157687, "learning_rate": 0.0001498281786941581, "loss": 1.062, "step": 84 }, { "epoch": 0.09, "grad_norm": 5.755142565965664, "learning_rate": 0.00014914089347079038, "loss": 1.2539, "step": 85 }, { "epoch": 0.09, "grad_norm": 3.7550494212874264, "learning_rate": 0.0001484536082474227, "loss": 0.8594, "step": 86 }, { "epoch": 0.1, "grad_norm": 6.7330159455584635, "learning_rate": 0.000147766323024055, "loss": 0.9157, "step": 87 }, { "epoch": 0.1, "grad_norm": 4.438479463179745, "learning_rate": 0.0001470790378006873, "loss": 1.1235, "step": 88 }, { "epoch": 0.1, "grad_norm": 3.87421384650703, "learning_rate": 0.00014639175257731958, "loss": 0.9763, "step": 89 }, { "epoch": 0.1, "grad_norm": 2.4000323333151683, "learning_rate": 0.0001457044673539519, "loss": 0.5867, "step": 90 }, { "epoch": 0.1, "grad_norm": 5.866412115354472, "learning_rate": 0.00014501718213058418, "loss": 1.5134, "step": 91 }, { "epoch": 0.1, "grad_norm": 3.023945137404242, "learning_rate": 0.0001443298969072165, "loss": 0.7887, "step": 92 }, { "epoch": 0.1, "grad_norm": 3.7174822786238435, "learning_rate": 0.00014364261168384881, "loss": 0.925, "step": 93 }, { "epoch": 0.1, "grad_norm": 4.007705895268358, "learning_rate": 0.0001429553264604811, "loss": 0.9114, "step": 94 }, { "epoch": 0.1, "grad_norm": 3.852559484855566, "learning_rate": 0.00014226804123711342, "loss": 1.2322, "step": 95 }, { "epoch": 0.11, "grad_norm": 3.9331976985714006, "learning_rate": 0.00014158075601374573, "loss": 1.021, "step": 96 }, { "epoch": 0.11, "grad_norm": 3.7472774795317676, "learning_rate": 0.00014089347079037802, "loss": 0.9222, "step": 97 }, { "epoch": 0.11, "grad_norm": 5.430655545449399, "learning_rate": 0.0001402061855670103, "loss": 1.0648, "step": 98 }, { "epoch": 0.11, "grad_norm": 3.8259440534841365, "learning_rate": 0.00013951890034364262, "loss": 0.9769, "step": 99 }, { "epoch": 0.11, "grad_norm": 4.784794509604932, "learning_rate": 0.0001388316151202749, "loss": 1.0134, "step": 100 } ], "logging_steps": 1, "max_steps": 301, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "total_flos": 1192784269148160.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }