|
{ |
|
"best_metric": 2.1916589736938477, |
|
"best_model_checkpoint": "/home/datta0/models/lora_final/Meta-Llama-3-8B_pct_reverse/checkpoint-384", |
|
"epoch": 0.9990344383649823, |
|
"eval_steps": 8, |
|
"global_step": 388, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002574831026713872, |
|
"grad_norm": 7.234233379364014, |
|
"learning_rate": 3.75e-05, |
|
"loss": 2.3601, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.010299324106855488, |
|
"grad_norm": 10.67747974395752, |
|
"learning_rate": 0.00015, |
|
"loss": 2.3309, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.020598648213710977, |
|
"grad_norm": 5.488127708435059, |
|
"learning_rate": 0.0003, |
|
"loss": 2.2547, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.020598648213710977, |
|
"eval_loss": 2.265151262283325, |
|
"eval_runtime": 10.4761, |
|
"eval_samples_per_second": 23.387, |
|
"eval_steps_per_second": 2.959, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03089797232056646, |
|
"grad_norm": 8.297589302062988, |
|
"learning_rate": 0.0002999179886011389, |
|
"loss": 2.2409, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04119729642742195, |
|
"grad_norm": 9.902826309204102, |
|
"learning_rate": 0.00029967204408281613, |
|
"loss": 2.2857, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04119729642742195, |
|
"eval_loss": 2.2721691131591797, |
|
"eval_runtime": 10.4981, |
|
"eval_samples_per_second": 23.338, |
|
"eval_steps_per_second": 2.953, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.051496620534277435, |
|
"grad_norm": 7.015585422515869, |
|
"learning_rate": 0.0002992624353817517, |
|
"loss": 2.2483, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06179594464113292, |
|
"grad_norm": 10.114925384521484, |
|
"learning_rate": 0.00029868961039904624, |
|
"loss": 2.217, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06179594464113292, |
|
"eval_loss": 2.2662720680236816, |
|
"eval_runtime": 10.4882, |
|
"eval_samples_per_second": 23.36, |
|
"eval_steps_per_second": 2.956, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07209526874798841, |
|
"grad_norm": 8.40988540649414, |
|
"learning_rate": 0.00029795419551040833, |
|
"loss": 2.2508, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0823945928548439, |
|
"grad_norm": 7.934015274047852, |
|
"learning_rate": 0.0002970569948812214, |
|
"loss": 2.2942, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0823945928548439, |
|
"eval_loss": 2.2549476623535156, |
|
"eval_runtime": 10.4835, |
|
"eval_samples_per_second": 23.37, |
|
"eval_steps_per_second": 2.957, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09269391696169939, |
|
"grad_norm": 10.256850242614746, |
|
"learning_rate": 0.0002959989895872009, |
|
"loss": 2.2494, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10299324106855487, |
|
"grad_norm": 5.103182792663574, |
|
"learning_rate": 0.0002947813365416023, |
|
"loss": 2.281, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10299324106855487, |
|
"eval_loss": 2.250826358795166, |
|
"eval_runtime": 10.4745, |
|
"eval_samples_per_second": 23.39, |
|
"eval_steps_per_second": 2.96, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11329256517541036, |
|
"grad_norm": 14.361717224121094, |
|
"learning_rate": 0.0002934053672301536, |
|
"loss": 2.2826, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12359188928226585, |
|
"grad_norm": 14.451160430908203, |
|
"learning_rate": 0.00029187258625509513, |
|
"loss": 2.2541, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12359188928226585, |
|
"eval_loss": 2.270808219909668, |
|
"eval_runtime": 10.4893, |
|
"eval_samples_per_second": 23.357, |
|
"eval_steps_per_second": 2.955, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13389121338912133, |
|
"grad_norm": 7.362520217895508, |
|
"learning_rate": 0.0002901846696899191, |
|
"loss": 2.3205, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.14419053749597682, |
|
"grad_norm": 6.173586368560791, |
|
"learning_rate": 0.0002883434632466077, |
|
"loss": 2.2672, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.14419053749597682, |
|
"eval_loss": 2.264765977859497, |
|
"eval_runtime": 10.4685, |
|
"eval_samples_per_second": 23.404, |
|
"eval_steps_per_second": 2.961, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15448986160283232, |
|
"grad_norm": 6.935930252075195, |
|
"learning_rate": 0.00028635098025737434, |
|
"loss": 2.2811, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1647891857096878, |
|
"grad_norm": 6.581336498260498, |
|
"learning_rate": 0.0002842093994731145, |
|
"loss": 2.2887, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1647891857096878, |
|
"eval_loss": 2.2697689533233643, |
|
"eval_runtime": 10.4495, |
|
"eval_samples_per_second": 23.446, |
|
"eval_steps_per_second": 2.967, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17508850981654328, |
|
"grad_norm": 7.652212619781494, |
|
"learning_rate": 0.00028192106268097334, |
|
"loss": 2.3141, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18538783392339878, |
|
"grad_norm": 6.344089508056641, |
|
"learning_rate": 0.0002794884721436361, |
|
"loss": 2.2464, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.18538783392339878, |
|
"eval_loss": 2.265443801879883, |
|
"eval_runtime": 10.4326, |
|
"eval_samples_per_second": 23.484, |
|
"eval_steps_per_second": 2.971, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19568715803025427, |
|
"grad_norm": 7.209957599639893, |
|
"learning_rate": 0.0002769142878631403, |
|
"loss": 2.3189, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.20598648213710974, |
|
"grad_norm": 7.181511402130127, |
|
"learning_rate": 0.000274201324672203, |
|
"loss": 2.2805, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.20598648213710974, |
|
"eval_loss": 2.2734484672546387, |
|
"eval_runtime": 10.4193, |
|
"eval_samples_per_second": 23.514, |
|
"eval_steps_per_second": 2.975, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.21628580624396523, |
|
"grad_norm": 4.833886623382568, |
|
"learning_rate": 0.0002713525491562421, |
|
"loss": 2.2785, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22658513035082073, |
|
"grad_norm": 6.109076499938965, |
|
"learning_rate": 0.00026837107640945905, |
|
"loss": 2.3111, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.22658513035082073, |
|
"eval_loss": 2.2742276191711426, |
|
"eval_runtime": 10.3908, |
|
"eval_samples_per_second": 23.578, |
|
"eval_steps_per_second": 2.983, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.23688445445767622, |
|
"grad_norm": 7.419241428375244, |
|
"learning_rate": 0.00026526016662852886, |
|
"loss": 2.2952, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2471837785645317, |
|
"grad_norm": 7.077712059020996, |
|
"learning_rate": 0.0002620232215476231, |
|
"loss": 2.361, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2471837785645317, |
|
"eval_loss": 2.280755043029785, |
|
"eval_runtime": 10.3724, |
|
"eval_samples_per_second": 23.62, |
|
"eval_steps_per_second": 2.989, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2574831026713872, |
|
"grad_norm": 5.053241729736328, |
|
"learning_rate": 0.00025866378071866334, |
|
"loss": 2.3453, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26778242677824265, |
|
"grad_norm": 6.051340579986572, |
|
"learning_rate": 0.00025518551764087326, |
|
"loss": 2.3418, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.26778242677824265, |
|
"eval_loss": 2.2801687717437744, |
|
"eval_runtime": 10.3642, |
|
"eval_samples_per_second": 23.639, |
|
"eval_steps_per_second": 2.991, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2780817508850982, |
|
"grad_norm": 6.38856840133667, |
|
"learning_rate": 0.00025159223574386114, |
|
"loss": 2.2998, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.28838107499195365, |
|
"grad_norm": 8.506232261657715, |
|
"learning_rate": 0.00024788786422862526, |
|
"loss": 2.3064, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.28838107499195365, |
|
"eval_loss": 2.295248031616211, |
|
"eval_runtime": 42.5567, |
|
"eval_samples_per_second": 5.757, |
|
"eval_steps_per_second": 0.728, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.29868039909880917, |
|
"grad_norm": 4.6549577713012695, |
|
"learning_rate": 0.00024407645377103054, |
|
"loss": 2.3127, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.30897972320566464, |
|
"grad_norm": 5.354101657867432, |
|
"learning_rate": 0.00024016217209245374, |
|
"loss": 2.3509, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.30897972320566464, |
|
"eval_loss": 2.284128427505493, |
|
"eval_runtime": 46.025, |
|
"eval_samples_per_second": 5.323, |
|
"eval_steps_per_second": 0.674, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3192790473125201, |
|
"grad_norm": 5.410560607910156, |
|
"learning_rate": 0.0002361492994024415, |
|
"loss": 2.2929, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3295783714193756, |
|
"grad_norm": 4.379918098449707, |
|
"learning_rate": 0.00023204222371836405, |
|
"loss": 2.3507, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3295783714193756, |
|
"eval_loss": 2.2785849571228027, |
|
"eval_runtime": 43.533, |
|
"eval_samples_per_second": 5.628, |
|
"eval_steps_per_second": 0.712, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3398776955262311, |
|
"grad_norm": 7.4106645584106445, |
|
"learning_rate": 0.00022784543606718227, |
|
"loss": 2.3353, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.35017701963308656, |
|
"grad_norm": 5.213589191436768, |
|
"learning_rate": 0.0002235635255745762, |
|
"loss": 2.3, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.35017701963308656, |
|
"eval_loss": 2.280118465423584, |
|
"eval_runtime": 43.9318, |
|
"eval_samples_per_second": 5.577, |
|
"eval_steps_per_second": 0.706, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3604763437399421, |
|
"grad_norm": 5.06958532333374, |
|
"learning_rate": 0.00021920117444680317, |
|
"loss": 2.3789, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.37077566784679755, |
|
"grad_norm": 3.743401527404785, |
|
"learning_rate": 0.0002147631528507739, |
|
"loss": 2.2953, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.37077566784679755, |
|
"eval_loss": 2.2771708965301514, |
|
"eval_runtime": 44.2996, |
|
"eval_samples_per_second": 5.531, |
|
"eval_steps_per_second": 0.7, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.381074991953653, |
|
"grad_norm": 4.105078220367432, |
|
"learning_rate": 0.0002102543136979454, |
|
"loss": 2.2907, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.39137431606050854, |
|
"grad_norm": 5.587433815002441, |
|
"learning_rate": 0.0002056795873377331, |
|
"loss": 2.3224, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.39137431606050854, |
|
"eval_loss": 2.2822823524475098, |
|
"eval_runtime": 45.5585, |
|
"eval_samples_per_second": 5.378, |
|
"eval_steps_per_second": 0.68, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.401673640167364, |
|
"grad_norm": 4.818370819091797, |
|
"learning_rate": 0.00020104397616624645, |
|
"loss": 2.2946, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.4119729642742195, |
|
"grad_norm": 6.417372703552246, |
|
"learning_rate": 0.0001963525491562421, |
|
"loss": 2.3055, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4119729642742195, |
|
"eval_loss": 2.273859977722168, |
|
"eval_runtime": 43.2134, |
|
"eval_samples_per_second": 5.67, |
|
"eval_steps_per_second": 0.717, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.422272288381075, |
|
"grad_norm": 4.668494701385498, |
|
"learning_rate": 0.00019161043631427666, |
|
"loss": 2.2872, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.43257161248793047, |
|
"grad_norm": 4.450340270996094, |
|
"learning_rate": 0.00018682282307111987, |
|
"loss": 2.3519, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.43257161248793047, |
|
"eval_loss": 2.2795205116271973, |
|
"eval_runtime": 10.4715, |
|
"eval_samples_per_second": 23.397, |
|
"eval_steps_per_second": 2.96, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.442870936594786, |
|
"grad_norm": 3.7239012718200684, |
|
"learning_rate": 0.00018199494461156203, |
|
"loss": 2.3101, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.45317026070164146, |
|
"grad_norm": 4.4652533531188965, |
|
"learning_rate": 0.00017713208014981648, |
|
"loss": 2.2988, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.45317026070164146, |
|
"eval_loss": 2.269359827041626, |
|
"eval_runtime": 10.4803, |
|
"eval_samples_per_second": 23.377, |
|
"eval_steps_per_second": 2.958, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4634695848084969, |
|
"grad_norm": 5.2811079025268555, |
|
"learning_rate": 0.00017223954715677627, |
|
"loss": 2.3234, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.47376890891535245, |
|
"grad_norm": 5.2930707931518555, |
|
"learning_rate": 0.00016732269554543794, |
|
"loss": 2.3046, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.47376890891535245, |
|
"eval_loss": 2.264807939529419, |
|
"eval_runtime": 10.4796, |
|
"eval_samples_per_second": 23.379, |
|
"eval_steps_per_second": 2.958, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4840682330222079, |
|
"grad_norm": 4.5643229484558105, |
|
"learning_rate": 0.00016238690182084986, |
|
"loss": 2.2673, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4943675571290634, |
|
"grad_norm": 5.457281589508057, |
|
"learning_rate": 0.00015743756320098332, |
|
"loss": 2.296, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4943675571290634, |
|
"eval_loss": 2.2660765647888184, |
|
"eval_runtime": 10.4734, |
|
"eval_samples_per_second": 23.393, |
|
"eval_steps_per_second": 2.96, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5046668812359189, |
|
"grad_norm": 5.561792373657227, |
|
"learning_rate": 0.00015248009171495378, |
|
"loss": 2.2962, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5149662053427744, |
|
"grad_norm": 4.571716785430908, |
|
"learning_rate": 0.00014751990828504622, |
|
"loss": 2.2908, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5149662053427744, |
|
"eval_loss": 2.2650046348571777, |
|
"eval_runtime": 10.4665, |
|
"eval_samples_per_second": 23.408, |
|
"eval_steps_per_second": 2.962, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5252655294496299, |
|
"grad_norm": 4.987983226776123, |
|
"learning_rate": 0.00014256243679901663, |
|
"loss": 2.2651, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5355648535564853, |
|
"grad_norm": 4.410604476928711, |
|
"learning_rate": 0.00013761309817915014, |
|
"loss": 2.2923, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5355648535564853, |
|
"eval_loss": 2.2632699012756348, |
|
"eval_runtime": 10.4757, |
|
"eval_samples_per_second": 23.387, |
|
"eval_steps_per_second": 2.959, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5458641776633408, |
|
"grad_norm": 4.124587535858154, |
|
"learning_rate": 0.00013267730445456208, |
|
"loss": 2.2416, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5561635017701964, |
|
"grad_norm": 4.6150031089782715, |
|
"learning_rate": 0.00012776045284322368, |
|
"loss": 2.3062, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5561635017701964, |
|
"eval_loss": 2.246860980987549, |
|
"eval_runtime": 10.4825, |
|
"eval_samples_per_second": 23.372, |
|
"eval_steps_per_second": 2.957, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5664628258770518, |
|
"grad_norm": 5.15360164642334, |
|
"learning_rate": 0.00012286791985018355, |
|
"loss": 2.2691, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5767621499839073, |
|
"grad_norm": 3.6853907108306885, |
|
"learning_rate": 0.00011800505538843798, |
|
"loss": 2.289, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5767621499839073, |
|
"eval_loss": 2.251561164855957, |
|
"eval_runtime": 10.4661, |
|
"eval_samples_per_second": 23.409, |
|
"eval_steps_per_second": 2.962, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5870614740907628, |
|
"grad_norm": 5.001940727233887, |
|
"learning_rate": 0.00011317717692888012, |
|
"loss": 2.2904, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5973607981976183, |
|
"grad_norm": 4.5493245124816895, |
|
"learning_rate": 0.00010838956368572334, |
|
"loss": 2.2736, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5973607981976183, |
|
"eval_loss": 2.2452073097229004, |
|
"eval_runtime": 10.4568, |
|
"eval_samples_per_second": 23.43, |
|
"eval_steps_per_second": 2.965, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6076601223044737, |
|
"grad_norm": 4.061275959014893, |
|
"learning_rate": 0.0001036474508437579, |
|
"loss": 2.3138, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.6179594464113293, |
|
"grad_norm": 4.110962867736816, |
|
"learning_rate": 9.895602383375353e-05, |
|
"loss": 2.2414, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6179594464113293, |
|
"eval_loss": 2.2406225204467773, |
|
"eval_runtime": 10.4008, |
|
"eval_samples_per_second": 23.556, |
|
"eval_steps_per_second": 2.981, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6282587705181848, |
|
"grad_norm": 5.820373058319092, |
|
"learning_rate": 9.432041266226686e-05, |
|
"loss": 2.2842, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6385580946250402, |
|
"grad_norm": 4.578804016113281, |
|
"learning_rate": 8.97456863020546e-05, |
|
"loss": 2.2667, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6385580946250402, |
|
"eval_loss": 2.2354886531829834, |
|
"eval_runtime": 10.394, |
|
"eval_samples_per_second": 23.571, |
|
"eval_steps_per_second": 2.982, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6488574187318957, |
|
"grad_norm": 4.047770023345947, |
|
"learning_rate": 8.523684714922608e-05, |
|
"loss": 2.2321, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6591567428387513, |
|
"grad_norm": 4.675099849700928, |
|
"learning_rate": 8.079882555319684e-05, |
|
"loss": 2.2595, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6591567428387513, |
|
"eval_loss": 2.235430955886841, |
|
"eval_runtime": 10.3763, |
|
"eval_samples_per_second": 23.612, |
|
"eval_steps_per_second": 2.988, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6694560669456067, |
|
"grad_norm": 4.365800857543945, |
|
"learning_rate": 7.643647442542382e-05, |
|
"loss": 2.2597, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6797553910524622, |
|
"grad_norm": 3.6257271766662598, |
|
"learning_rate": 7.215456393281776e-05, |
|
"loss": 2.2175, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6797553910524622, |
|
"eval_loss": 2.227555751800537, |
|
"eval_runtime": 10.353, |
|
"eval_samples_per_second": 23.665, |
|
"eval_steps_per_second": 2.994, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6900547151593177, |
|
"grad_norm": 3.8250067234039307, |
|
"learning_rate": 6.795777628163599e-05, |
|
"loss": 2.2509, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7003540392661731, |
|
"grad_norm": 3.7792561054229736, |
|
"learning_rate": 6.385070059755846e-05, |
|
"loss": 2.277, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7003540392661731, |
|
"eval_loss": 2.2220957279205322, |
|
"eval_runtime": 44.4356, |
|
"eval_samples_per_second": 5.514, |
|
"eval_steps_per_second": 0.698, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7106533633730286, |
|
"grad_norm": 3.0634820461273193, |
|
"learning_rate": 5.983782790754623e-05, |
|
"loss": 2.2659, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7209526874798842, |
|
"grad_norm": 3.05484676361084, |
|
"learning_rate": 5.592354622896944e-05, |
|
"loss": 2.2576, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7209526874798842, |
|
"eval_loss": 2.216092824935913, |
|
"eval_runtime": 43.7777, |
|
"eval_samples_per_second": 5.596, |
|
"eval_steps_per_second": 0.708, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7312520115867396, |
|
"grad_norm": 3.351827383041382, |
|
"learning_rate": 5.211213577137469e-05, |
|
"loss": 2.2508, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7415513356935951, |
|
"grad_norm": 4.593320846557617, |
|
"learning_rate": 4.840776425613886e-05, |
|
"loss": 2.2604, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7415513356935951, |
|
"eval_loss": 2.212282657623291, |
|
"eval_runtime": 44.6778, |
|
"eval_samples_per_second": 5.484, |
|
"eval_steps_per_second": 0.694, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7518506598004506, |
|
"grad_norm": 3.539661407470703, |
|
"learning_rate": 4.481448235912671e-05, |
|
"loss": 2.2531, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.762149983907306, |
|
"grad_norm": 3.0205695629119873, |
|
"learning_rate": 4.133621928133665e-05, |
|
"loss": 2.2526, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.762149983907306, |
|
"eval_loss": 2.2118477821350098, |
|
"eval_runtime": 43.6297, |
|
"eval_samples_per_second": 5.615, |
|
"eval_steps_per_second": 0.711, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7724493080141616, |
|
"grad_norm": 3.0903685092926025, |
|
"learning_rate": 3.797677845237696e-05, |
|
"loss": 2.2038, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7827486321210171, |
|
"grad_norm": 3.416120767593384, |
|
"learning_rate": 3.473983337147118e-05, |
|
"loss": 2.2838, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7827486321210171, |
|
"eval_loss": 2.20328688621521, |
|
"eval_runtime": 43.6906, |
|
"eval_samples_per_second": 5.608, |
|
"eval_steps_per_second": 0.71, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7930479562278725, |
|
"grad_norm": 4.284872055053711, |
|
"learning_rate": 3.162892359054098e-05, |
|
"loss": 2.2062, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.803347280334728, |
|
"grad_norm": 4.279630184173584, |
|
"learning_rate": 2.8647450843757897e-05, |
|
"loss": 2.2214, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.803347280334728, |
|
"eval_loss": 2.2009222507476807, |
|
"eval_runtime": 44.2495, |
|
"eval_samples_per_second": 5.537, |
|
"eval_steps_per_second": 0.701, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8136466044415835, |
|
"grad_norm": 2.9478297233581543, |
|
"learning_rate": 2.5798675327796993e-05, |
|
"loss": 2.181, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.823945928548439, |
|
"grad_norm": 3.1356239318847656, |
|
"learning_rate": 2.3085712136859668e-05, |
|
"loss": 2.2034, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.823945928548439, |
|
"eval_loss": 2.2014718055725098, |
|
"eval_runtime": 44.3189, |
|
"eval_samples_per_second": 5.528, |
|
"eval_steps_per_second": 0.699, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8342452526552945, |
|
"grad_norm": 3.170116901397705, |
|
"learning_rate": 2.0511527856363912e-05, |
|
"loss": 2.2624, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.84454457676215, |
|
"grad_norm": 3.725310802459717, |
|
"learning_rate": 1.8078937319026654e-05, |
|
"loss": 2.235, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.84454457676215, |
|
"eval_loss": 2.1954057216644287, |
|
"eval_runtime": 10.4846, |
|
"eval_samples_per_second": 23.368, |
|
"eval_steps_per_second": 2.957, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8548439008690055, |
|
"grad_norm": 2.9525537490844727, |
|
"learning_rate": 1.579060052688548e-05, |
|
"loss": 2.209, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8651432249758609, |
|
"grad_norm": 3.0569522380828857, |
|
"learning_rate": 1.3649019742625623e-05, |
|
"loss": 2.2444, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8651432249758609, |
|
"eval_loss": 2.1970837116241455, |
|
"eval_runtime": 10.4801, |
|
"eval_samples_per_second": 23.378, |
|
"eval_steps_per_second": 2.958, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8754425490827165, |
|
"grad_norm": 3.5634520053863525, |
|
"learning_rate": 1.1656536753392287e-05, |
|
"loss": 2.2384, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.885741873189572, |
|
"grad_norm": 3.441944122314453, |
|
"learning_rate": 9.815330310080887e-06, |
|
"loss": 2.2593, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.885741873189572, |
|
"eval_loss": 2.1938905715942383, |
|
"eval_runtime": 10.4695, |
|
"eval_samples_per_second": 23.401, |
|
"eval_steps_per_second": 2.961, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8960411972964274, |
|
"grad_norm": 3.692169666290283, |
|
"learning_rate": 8.127413744904804e-06, |
|
"loss": 2.231, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9063405214032829, |
|
"grad_norm": 4.078197956085205, |
|
"learning_rate": 6.594632769846353e-06, |
|
"loss": 2.2222, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9063405214032829, |
|
"eval_loss": 2.1928889751434326, |
|
"eval_runtime": 10.4765, |
|
"eval_samples_per_second": 23.386, |
|
"eval_steps_per_second": 2.959, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9166398455101384, |
|
"grad_norm": 2.998924493789673, |
|
"learning_rate": 5.218663458397715e-06, |
|
"loss": 2.2152, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9269391696169939, |
|
"grad_norm": 4.00682258605957, |
|
"learning_rate": 4.001010412799138e-06, |
|
"loss": 2.1894, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9269391696169939, |
|
"eval_loss": 2.194350481033325, |
|
"eval_runtime": 10.4754, |
|
"eval_samples_per_second": 23.388, |
|
"eval_steps_per_second": 2.959, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9372384937238494, |
|
"grad_norm": 2.969897508621216, |
|
"learning_rate": 2.9430051187785962e-06, |
|
"loss": 2.2449, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9475378178307049, |
|
"grad_norm": 2.7603042125701904, |
|
"learning_rate": 2.0458044895916513e-06, |
|
"loss": 2.2138, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9475378178307049, |
|
"eval_loss": 2.192730188369751, |
|
"eval_runtime": 10.4662, |
|
"eval_samples_per_second": 23.409, |
|
"eval_steps_per_second": 2.962, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9578371419375603, |
|
"grad_norm": 3.089548110961914, |
|
"learning_rate": 1.3103896009537207e-06, |
|
"loss": 2.1948, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9681364660444158, |
|
"grad_norm": 2.956756114959717, |
|
"learning_rate": 7.375646182482875e-07, |
|
"loss": 2.2543, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9681364660444158, |
|
"eval_loss": 2.1918137073516846, |
|
"eval_runtime": 10.4575, |
|
"eval_samples_per_second": 23.428, |
|
"eval_steps_per_second": 2.964, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9784357901512714, |
|
"grad_norm": 3.6336631774902344, |
|
"learning_rate": 3.2795591718381975e-07, |
|
"loss": 2.1958, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9887351142581268, |
|
"grad_norm": 3.5543529987335205, |
|
"learning_rate": 8.201139886109264e-08, |
|
"loss": 2.2462, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9887351142581268, |
|
"eval_loss": 2.1916589736938477, |
|
"eval_runtime": 10.4533, |
|
"eval_samples_per_second": 23.438, |
|
"eval_steps_per_second": 2.966, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9990344383649823, |
|
"grad_norm": 3.0281052589416504, |
|
"learning_rate": 0.0, |
|
"loss": 2.2377, |
|
"step": 388 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 388, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 8, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.087198145554678e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|