|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.10982976386600769, |
|
"eval_steps": 1000, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 11.704396157183714, |
|
"learning_rate": 0.0, |
|
"loss": 2.6917, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 14.261265634920543, |
|
"learning_rate": 6.020599913279623e-05, |
|
"loss": 2.0455, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 9.252464584699219, |
|
"learning_rate": 9.542425094393248e-05, |
|
"loss": 2.1024, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 7.478072090774749, |
|
"learning_rate": 0.00012041199826559246, |
|
"loss": 1.9419, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.996421444202911, |
|
"learning_rate": 0.00013979400086720374, |
|
"loss": 1.2142, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.62925923224712, |
|
"learning_rate": 0.00015563025007672872, |
|
"loss": 1.4192, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.9971192119861545, |
|
"learning_rate": 0.00016901960800285134, |
|
"loss": 1.1698, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 15.448410811680446, |
|
"learning_rate": 0.00018061799739838867, |
|
"loss": 1.0663, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.664012698560983, |
|
"learning_rate": 0.00019084850188786495, |
|
"loss": 1.0346, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.9872198688383325, |
|
"learning_rate": 0.00019999999999999998, |
|
"loss": 1.2899, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.492858411635369, |
|
"learning_rate": 0.0002, |
|
"loss": 0.733, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.117272508239465, |
|
"learning_rate": 0.00019931271477663232, |
|
"loss": 0.7529, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.57016142244955, |
|
"learning_rate": 0.0001986254295532646, |
|
"loss": 0.9221, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.106703686957636, |
|
"learning_rate": 0.00019793814432989693, |
|
"loss": 1.1292, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.4331559960790115, |
|
"learning_rate": 0.00019725085910652924, |
|
"loss": 0.9505, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.871898936371301, |
|
"learning_rate": 0.0001965635738831615, |
|
"loss": 1.0894, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.546128257668831, |
|
"learning_rate": 0.00019587628865979381, |
|
"loss": 1.0915, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.426135233570764, |
|
"learning_rate": 0.00019518900343642613, |
|
"loss": 1.2174, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.7481567939197316, |
|
"learning_rate": 0.00019450171821305842, |
|
"loss": 0.8281, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.264747615275931, |
|
"learning_rate": 0.00019381443298969073, |
|
"loss": 1.1069, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.850720515264914, |
|
"learning_rate": 0.00019312714776632305, |
|
"loss": 1.1216, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.583277384098615, |
|
"learning_rate": 0.00019243986254295533, |
|
"loss": 1.0751, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.923174586611154, |
|
"learning_rate": 0.00019175257731958765, |
|
"loss": 1.1915, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.21422328394722, |
|
"learning_rate": 0.00019106529209621996, |
|
"loss": 1.1412, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.9312652393524803, |
|
"learning_rate": 0.00019037800687285222, |
|
"loss": 0.9699, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.511161102577495, |
|
"learning_rate": 0.00018969072164948454, |
|
"loss": 1.1289, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.9233152624949454, |
|
"learning_rate": 0.00018900343642611685, |
|
"loss": 0.8803, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.413459689226915, |
|
"learning_rate": 0.00018831615120274914, |
|
"loss": 1.0594, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.2106489602848685, |
|
"learning_rate": 0.00018762886597938145, |
|
"loss": 1.1003, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.568263346040736, |
|
"learning_rate": 0.00018694158075601377, |
|
"loss": 1.0816, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.12202434010765, |
|
"learning_rate": 0.00018625429553264605, |
|
"loss": 0.9053, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.282623293516478, |
|
"learning_rate": 0.00018556701030927837, |
|
"loss": 1.206, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.45030819651884, |
|
"learning_rate": 0.00018487972508591068, |
|
"loss": 0.9049, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.871975082534186, |
|
"learning_rate": 0.00018419243986254294, |
|
"loss": 1.1436, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.715014058776802, |
|
"learning_rate": 0.00018350515463917526, |
|
"loss": 1.3847, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.3808462582338707, |
|
"learning_rate": 0.00018281786941580757, |
|
"loss": 0.7817, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.8976964456454866, |
|
"learning_rate": 0.00018213058419243986, |
|
"loss": 1.0001, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.355867444757705, |
|
"learning_rate": 0.00018144329896907217, |
|
"loss": 0.9259, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.178991855035252, |
|
"learning_rate": 0.0001807560137457045, |
|
"loss": 0.794, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.14441205427679, |
|
"learning_rate": 0.00018006872852233677, |
|
"loss": 1.0384, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.7176836552561974, |
|
"learning_rate": 0.0001793814432989691, |
|
"loss": 0.75, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.011588210132538, |
|
"learning_rate": 0.0001786941580756014, |
|
"loss": 1.3034, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.111725152950483, |
|
"learning_rate": 0.00017800687285223366, |
|
"loss": 1.3023, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.021959822322685, |
|
"learning_rate": 0.00017731958762886598, |
|
"loss": 0.9624, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.943541786297702, |
|
"learning_rate": 0.0001766323024054983, |
|
"loss": 0.6659, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.534002060279327, |
|
"learning_rate": 0.00017594501718213058, |
|
"loss": 1.1354, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.108426720583385, |
|
"learning_rate": 0.0001752577319587629, |
|
"loss": 1.4258, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.235810963301271, |
|
"learning_rate": 0.0001745704467353952, |
|
"loss": 1.3918, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.207791712185817, |
|
"learning_rate": 0.0001738831615120275, |
|
"loss": 1.2796, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.050243728273998, |
|
"learning_rate": 0.0001731958762886598, |
|
"loss": 1.244, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.657182228931627, |
|
"learning_rate": 0.00017250859106529212, |
|
"loss": 1.2068, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.5652582299228643, |
|
"learning_rate": 0.00017182130584192438, |
|
"loss": 0.8897, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.529701968193304, |
|
"learning_rate": 0.0001711340206185567, |
|
"loss": 1.1582, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.49586529984068, |
|
"learning_rate": 0.000170446735395189, |
|
"loss": 1.1379, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.845994813972232, |
|
"learning_rate": 0.0001697594501718213, |
|
"loss": 1.4724, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.732950229095352, |
|
"learning_rate": 0.00016907216494845361, |
|
"loss": 0.886, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.8567901480339426, |
|
"learning_rate": 0.00016838487972508593, |
|
"loss": 0.7763, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.931804800794497, |
|
"learning_rate": 0.00016769759450171822, |
|
"loss": 1.1761, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.7294788904744625, |
|
"learning_rate": 0.00016701030927835053, |
|
"loss": 0.6722, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.688890615204965, |
|
"learning_rate": 0.00016632302405498285, |
|
"loss": 0.9642, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.022317141495382, |
|
"learning_rate": 0.00016563573883161513, |
|
"loss": 0.8865, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.55573614414137, |
|
"learning_rate": 0.00016494845360824742, |
|
"loss": 1.217, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.5890564068710615, |
|
"learning_rate": 0.00016426116838487973, |
|
"loss": 1.0791, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.435288078022071, |
|
"learning_rate": 0.00016357388316151202, |
|
"loss": 1.1501, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.965664463354534, |
|
"learning_rate": 0.00016288659793814434, |
|
"loss": 1.4109, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 7.483997056692371, |
|
"learning_rate": 0.00016219931271477665, |
|
"loss": 1.0735, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.9923544603584915, |
|
"learning_rate": 0.00016151202749140894, |
|
"loss": 0.9927, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.143678077389599, |
|
"learning_rate": 0.00016082474226804125, |
|
"loss": 1.1299, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.420855401236021, |
|
"learning_rate": 0.00016013745704467357, |
|
"loss": 1.0263, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.02485050719212, |
|
"learning_rate": 0.00015945017182130585, |
|
"loss": 1.1168, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.524608717382107, |
|
"learning_rate": 0.00015876288659793814, |
|
"loss": 1.1015, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.9028726855074054, |
|
"learning_rate": 0.00015807560137457046, |
|
"loss": 1.091, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.271315148375007, |
|
"learning_rate": 0.00015738831615120274, |
|
"loss": 0.9397, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.5227145076761515, |
|
"learning_rate": 0.00015670103092783506, |
|
"loss": 1.0062, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.344752651586208, |
|
"learning_rate": 0.00015601374570446737, |
|
"loss": 1.0027, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.102353206893142, |
|
"learning_rate": 0.00015532646048109966, |
|
"loss": 1.3915, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.722343844673861, |
|
"learning_rate": 0.00015463917525773197, |
|
"loss": 1.1678, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.017428516681286, |
|
"learning_rate": 0.0001539518900343643, |
|
"loss": 1.329, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.7313811619387516, |
|
"learning_rate": 0.00015326460481099657, |
|
"loss": 0.887, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.7179924970759406, |
|
"learning_rate": 0.00015257731958762886, |
|
"loss": 1.0153, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.499981774187266, |
|
"learning_rate": 0.00015189003436426118, |
|
"loss": 1.3911, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.955286629774712, |
|
"learning_rate": 0.00015120274914089346, |
|
"loss": 1.215, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.5953610996772984, |
|
"learning_rate": 0.00015051546391752578, |
|
"loss": 0.9343, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.8040779769157687, |
|
"learning_rate": 0.0001498281786941581, |
|
"loss": 1.062, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.755142565965664, |
|
"learning_rate": 0.00014914089347079038, |
|
"loss": 1.2539, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.7550494212874264, |
|
"learning_rate": 0.0001484536082474227, |
|
"loss": 0.8594, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.7330159455584635, |
|
"learning_rate": 0.000147766323024055, |
|
"loss": 0.9157, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.438479463179745, |
|
"learning_rate": 0.0001470790378006873, |
|
"loss": 1.1235, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.87421384650703, |
|
"learning_rate": 0.00014639175257731958, |
|
"loss": 0.9763, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.4000323333151683, |
|
"learning_rate": 0.0001457044673539519, |
|
"loss": 0.5867, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.866412115354472, |
|
"learning_rate": 0.00014501718213058418, |
|
"loss": 1.5134, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.023945137404242, |
|
"learning_rate": 0.0001443298969072165, |
|
"loss": 0.7887, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.7174822786238435, |
|
"learning_rate": 0.00014364261168384881, |
|
"loss": 0.925, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.007705895268358, |
|
"learning_rate": 0.0001429553264604811, |
|
"loss": 0.9114, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.852559484855566, |
|
"learning_rate": 0.00014226804123711342, |
|
"loss": 1.2322, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.9331976985714006, |
|
"learning_rate": 0.00014158075601374573, |
|
"loss": 1.021, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.7472774795317676, |
|
"learning_rate": 0.00014089347079037802, |
|
"loss": 0.9222, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.430655545449399, |
|
"learning_rate": 0.0001402061855670103, |
|
"loss": 1.0648, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.8259440534841365, |
|
"learning_rate": 0.00013951890034364262, |
|
"loss": 0.9769, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.784794509604932, |
|
"learning_rate": 0.0001388316151202749, |
|
"loss": 1.0134, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 301, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"total_flos": 1192784269148160.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|