|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 371900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.0679160356521606, |
|
"learning_rate": 3.125e-06, |
|
"loss": 7.5946, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7341670989990234, |
|
"learning_rate": 6.25e-06, |
|
"loss": 5.8196, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8071977496147156, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 5.3842, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.9924404621124268, |
|
"learning_rate": 1.25e-05, |
|
"loss": 5.1603, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.9524043202400208, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 4.999, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0067890882492065, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 4.8622, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.0616331100463867, |
|
"learning_rate": 2.1875e-05, |
|
"loss": 4.7445, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.4308688640594482, |
|
"learning_rate": 2.5e-05, |
|
"loss": 4.6507, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.0376019477844238, |
|
"learning_rate": 2.8125000000000003e-05, |
|
"loss": 4.558, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0812149047851562, |
|
"learning_rate": 3.125e-05, |
|
"loss": 4.4792, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.039007544517517, |
|
"learning_rate": 3.4375e-05, |
|
"loss": 4.4158, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0472838878631592, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 4.3502, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0667674541473389, |
|
"learning_rate": 4.061875e-05, |
|
"loss": 4.2864, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0470460653305054, |
|
"learning_rate": 4.374375e-05, |
|
"loss": 4.2366, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.0131665468215942, |
|
"learning_rate": 4.6865625e-05, |
|
"loss": 4.1872, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0265806913375854, |
|
"learning_rate": 4.9990625000000004e-05, |
|
"loss": 4.1391, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.9650479555130005, |
|
"learning_rate": 5.3115625000000005e-05, |
|
"loss": 4.0979, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.008900761604309, |
|
"learning_rate": 5.6240625e-05, |
|
"loss": 4.0574, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.3094689486155737, |
|
"eval_loss": 4.26678466796875, |
|
"eval_runtime": 152.067, |
|
"eval_samples_per_second": 380.891, |
|
"eval_steps_per_second": 5.958, |
|
"step": 18595 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.9984724521636963, |
|
"learning_rate": 5.93625e-05, |
|
"loss": 4.0058, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.0308632850646973, |
|
"learning_rate": 6.24875e-05, |
|
"loss": 3.9657, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.9864353537559509, |
|
"learning_rate": 6.56125e-05, |
|
"loss": 3.925, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.0101842880249023, |
|
"learning_rate": 6.8734375e-05, |
|
"loss": 3.8953, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.9528496861457825, |
|
"learning_rate": 7.185937500000001e-05, |
|
"loss": 3.8541, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.021189570426941, |
|
"learning_rate": 7.4978125e-05, |
|
"loss": 3.8288, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.9835717082023621, |
|
"learning_rate": 7.8103125e-05, |
|
"loss": 3.7995, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.0188689231872559, |
|
"learning_rate": 8.1228125e-05, |
|
"loss": 3.7699, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.9111623764038086, |
|
"learning_rate": 8.435e-05, |
|
"loss": 3.7484, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.8624160289764404, |
|
"learning_rate": 8.747500000000001e-05, |
|
"loss": 3.7345, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.8700354695320129, |
|
"learning_rate": 9.0596875e-05, |
|
"loss": 3.7102, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.8851209878921509, |
|
"learning_rate": 9.3721875e-05, |
|
"loss": 3.6873, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.8330232501029968, |
|
"learning_rate": 9.684375e-05, |
|
"loss": 3.6671, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.8477538824081421, |
|
"learning_rate": 9.9965625e-05, |
|
"loss": 3.6533, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.8847412467002869, |
|
"learning_rate": 9.970903206825538e-05, |
|
"loss": 3.6399, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.8788692951202393, |
|
"learning_rate": 9.941482789055604e-05, |
|
"loss": 3.6138, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.8514883518218994, |
|
"learning_rate": 9.912091791703443e-05, |
|
"loss": 3.6049, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.8380881547927856, |
|
"learning_rate": 9.882671373933511e-05, |
|
"loss": 3.5864, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.7953729033470154, |
|
"learning_rate": 9.853250956163578e-05, |
|
"loss": 3.574, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.36377712108475674, |
|
"eval_loss": 3.7340097427368164, |
|
"eval_runtime": 153.4304, |
|
"eval_samples_per_second": 377.507, |
|
"eval_steps_per_second": 5.905, |
|
"step": 37190 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.8078764081001282, |
|
"learning_rate": 9.823830538393646e-05, |
|
"loss": 3.5281, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.8253265619277954, |
|
"learning_rate": 9.794439541041483e-05, |
|
"loss": 3.52, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.8200889825820923, |
|
"learning_rate": 9.765048543689321e-05, |
|
"loss": 3.5065, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.7781252861022949, |
|
"learning_rate": 9.735628125919388e-05, |
|
"loss": 3.4945, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.7886627912521362, |
|
"learning_rate": 9.706207708149456e-05, |
|
"loss": 3.4924, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.7948366403579712, |
|
"learning_rate": 9.676787290379524e-05, |
|
"loss": 3.4834, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.7732539176940918, |
|
"learning_rate": 9.647425713445132e-05, |
|
"loss": 3.4722, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.769277036190033, |
|
"learning_rate": 9.618005295675198e-05, |
|
"loss": 3.4659, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.7610235214233398, |
|
"learning_rate": 9.588584877905267e-05, |
|
"loss": 3.4572, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.7955553531646729, |
|
"learning_rate": 9.559164460135335e-05, |
|
"loss": 3.4458, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.772249698638916, |
|
"learning_rate": 9.529773462783173e-05, |
|
"loss": 3.4369, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.7617048025131226, |
|
"learning_rate": 9.50035304501324e-05, |
|
"loss": 3.4327, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.7853600382804871, |
|
"learning_rate": 9.470962047661077e-05, |
|
"loss": 3.4226, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.7761564254760742, |
|
"learning_rate": 9.441541629891145e-05, |
|
"loss": 3.4227, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.7742950916290283, |
|
"learning_rate": 9.412121212121212e-05, |
|
"loss": 3.4126, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.7954655885696411, |
|
"learning_rate": 9.38270079435128e-05, |
|
"loss": 3.4063, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.7167351841926575, |
|
"learning_rate": 9.353309796999118e-05, |
|
"loss": 3.3961, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.7676237225532532, |
|
"learning_rate": 9.323889379229186e-05, |
|
"loss": 3.3998, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3792631275512326, |
|
"eval_loss": 3.594615936279297, |
|
"eval_runtime": 153.2776, |
|
"eval_samples_per_second": 377.883, |
|
"eval_steps_per_second": 5.911, |
|
"step": 55785 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.7646710276603699, |
|
"learning_rate": 9.294468961459253e-05, |
|
"loss": 3.3752, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.7935709953308105, |
|
"learning_rate": 9.26507796410709e-05, |
|
"loss": 3.3398, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.746233344078064, |
|
"learning_rate": 9.235657546337158e-05, |
|
"loss": 3.3393, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.7573293447494507, |
|
"learning_rate": 9.206266548984997e-05, |
|
"loss": 3.3332, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.7440087199211121, |
|
"learning_rate": 9.176875551632832e-05, |
|
"loss": 3.3342, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.8584343194961548, |
|
"learning_rate": 9.1474551338629e-05, |
|
"loss": 3.3328, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.7430059909820557, |
|
"learning_rate": 9.118034716092969e-05, |
|
"loss": 3.3274, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.7765605449676514, |
|
"learning_rate": 9.088614298323037e-05, |
|
"loss": 3.3256, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.7461695075035095, |
|
"learning_rate": 9.059193880553104e-05, |
|
"loss": 3.323, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.767082691192627, |
|
"learning_rate": 9.029802883200942e-05, |
|
"loss": 3.318, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.749569296836853, |
|
"learning_rate": 9.00038246543101e-05, |
|
"loss": 3.3153, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.7436603307723999, |
|
"learning_rate": 8.971020888496617e-05, |
|
"loss": 3.3119, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.7696906924247742, |
|
"learning_rate": 8.941600470726684e-05, |
|
"loss": 3.3112, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.7421345114707947, |
|
"learning_rate": 8.912180052956752e-05, |
|
"loss": 3.3105, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.753808319568634, |
|
"learning_rate": 8.88275963518682e-05, |
|
"loss": 3.2991, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.7684649229049683, |
|
"learning_rate": 8.853368637834658e-05, |
|
"loss": 3.2998, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.7607173919677734, |
|
"learning_rate": 8.823948220064724e-05, |
|
"loss": 3.2958, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.7405970096588135, |
|
"learning_rate": 8.794557222712563e-05, |
|
"loss": 3.2941, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.7150483131408691, |
|
"learning_rate": 8.765166225360401e-05, |
|
"loss": 3.2908, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.38712933630182045, |
|
"eval_loss": 3.5169150829315186, |
|
"eval_runtime": 153.253, |
|
"eval_samples_per_second": 377.944, |
|
"eval_steps_per_second": 5.912, |
|
"step": 74380 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.7505189776420593, |
|
"learning_rate": 8.735745807590468e-05, |
|
"loss": 3.254, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.8018031120300293, |
|
"learning_rate": 8.706325389820536e-05, |
|
"loss": 3.2441, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.7522112727165222, |
|
"learning_rate": 8.676934392468373e-05, |
|
"loss": 3.2338, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.7681379318237305, |
|
"learning_rate": 8.647513974698441e-05, |
|
"loss": 3.2398, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.7102425694465637, |
|
"learning_rate": 8.61809355692851e-05, |
|
"loss": 3.2401, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.7491008043289185, |
|
"learning_rate": 8.588702559576346e-05, |
|
"loss": 3.2365, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.7204309701919556, |
|
"learning_rate": 8.559282141806415e-05, |
|
"loss": 3.2395, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.7859947085380554, |
|
"learning_rate": 8.529861724036483e-05, |
|
"loss": 3.2378, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.76870197057724, |
|
"learning_rate": 8.50044130626655e-05, |
|
"loss": 3.2353, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.7622324228286743, |
|
"learning_rate": 8.471050308914386e-05, |
|
"loss": 3.2353, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.7557118535041809, |
|
"learning_rate": 8.441629891144455e-05, |
|
"loss": 3.2375, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.7363464832305908, |
|
"learning_rate": 8.412238893792293e-05, |
|
"loss": 3.233, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.7295689582824707, |
|
"learning_rate": 8.38281847602236e-05, |
|
"loss": 3.2331, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.7375525832176208, |
|
"learning_rate": 8.353456899087967e-05, |
|
"loss": 3.2309, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.7149308919906616, |
|
"learning_rate": 8.324036481318035e-05, |
|
"loss": 3.228, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.7252637147903442, |
|
"learning_rate": 8.294616063548103e-05, |
|
"loss": 3.2236, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.7209696173667908, |
|
"learning_rate": 8.26522506619594e-05, |
|
"loss": 3.23, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.7185742855072021, |
|
"learning_rate": 8.235804648426007e-05, |
|
"loss": 3.2244, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.3919204455122256, |
|
"eval_loss": 3.484323501586914, |
|
"eval_runtime": 153.2201, |
|
"eval_samples_per_second": 378.025, |
|
"eval_steps_per_second": 5.913, |
|
"step": 92975 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.7267788648605347, |
|
"learning_rate": 8.206384230656075e-05, |
|
"loss": 3.2244, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 0.7365211248397827, |
|
"learning_rate": 8.176993233303914e-05, |
|
"loss": 3.1676, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.7716740369796753, |
|
"learning_rate": 8.147572815533982e-05, |
|
"loss": 3.1672, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 0.7571492791175842, |
|
"learning_rate": 8.118181818181818e-05, |
|
"loss": 3.1731, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 0.7364664673805237, |
|
"learning_rate": 8.088761400411886e-05, |
|
"loss": 3.1719, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 0.7242543697357178, |
|
"learning_rate": 8.059370403059724e-05, |
|
"loss": 3.1739, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.7749021053314209, |
|
"learning_rate": 8.029949985289792e-05, |
|
"loss": 3.1771, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.740552544593811, |
|
"learning_rate": 8.000529567519859e-05, |
|
"loss": 3.1771, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 0.7202236652374268, |
|
"learning_rate": 7.971109149749927e-05, |
|
"loss": 3.1755, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 0.7516536712646484, |
|
"learning_rate": 7.941688731979995e-05, |
|
"loss": 3.1761, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.7896652221679688, |
|
"learning_rate": 7.912268314210062e-05, |
|
"loss": 3.1821, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 0.7660993337631226, |
|
"learning_rate": 7.882906737275669e-05, |
|
"loss": 3.1744, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"grad_norm": 0.7442207336425781, |
|
"learning_rate": 7.853486319505737e-05, |
|
"loss": 3.17, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 0.7453558444976807, |
|
"learning_rate": 7.824065901735806e-05, |
|
"loss": 3.1778, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.7240561842918396, |
|
"learning_rate": 7.794674904383643e-05, |
|
"loss": 3.1805, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.7229368090629578, |
|
"learning_rate": 7.76525448661371e-05, |
|
"loss": 3.1764, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 0.7163822054862976, |
|
"learning_rate": 7.735834068843777e-05, |
|
"loss": 3.1734, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 0.7416737675666809, |
|
"learning_rate": 7.706413651073846e-05, |
|
"loss": 3.174, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.7351878881454468, |
|
"learning_rate": 7.677022653721683e-05, |
|
"loss": 3.1723, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.3955316301279006, |
|
"eval_loss": 3.442295551300049, |
|
"eval_runtime": 153.2217, |
|
"eval_samples_per_second": 378.021, |
|
"eval_steps_per_second": 5.913, |
|
"step": 111570 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.7372068762779236, |
|
"learning_rate": 7.647631656369521e-05, |
|
"loss": 3.1476, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 0.734592854976654, |
|
"learning_rate": 7.618211238599588e-05, |
|
"loss": 3.1148, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.731022298336029, |
|
"learning_rate": 7.588790820829656e-05, |
|
"loss": 3.1139, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 0.7731876373291016, |
|
"learning_rate": 7.559429243895265e-05, |
|
"loss": 3.1289, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.7593401074409485, |
|
"learning_rate": 7.530008826125331e-05, |
|
"loss": 3.1261, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"grad_norm": 0.7750897407531738, |
|
"learning_rate": 7.5005884083554e-05, |
|
"loss": 3.1222, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 0.7594813704490662, |
|
"learning_rate": 7.471167990585468e-05, |
|
"loss": 3.128, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.7291180491447449, |
|
"learning_rate": 7.441747572815534e-05, |
|
"loss": 3.1269, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.7330898642539978, |
|
"learning_rate": 7.412327155045603e-05, |
|
"loss": 3.1344, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 0.7279812693595886, |
|
"learning_rate": 7.38290673727567e-05, |
|
"loss": 3.1298, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.7451930642127991, |
|
"learning_rate": 7.353515739923508e-05, |
|
"loss": 3.1308, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 0.7272422909736633, |
|
"learning_rate": 7.324124742571345e-05, |
|
"loss": 3.1364, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.7475588917732239, |
|
"learning_rate": 7.294704324801413e-05, |
|
"loss": 3.1333, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.7194586992263794, |
|
"learning_rate": 7.265283907031481e-05, |
|
"loss": 3.1342, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 0.7266201972961426, |
|
"learning_rate": 7.235863489261548e-05, |
|
"loss": 3.1322, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.7517876029014587, |
|
"learning_rate": 7.206472491909385e-05, |
|
"loss": 3.1335, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.7235228419303894, |
|
"learning_rate": 7.177081494557223e-05, |
|
"loss": 3.1315, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 0.7359281182289124, |
|
"learning_rate": 7.147661076787291e-05, |
|
"loss": 3.1309, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 0.7120690941810608, |
|
"learning_rate": 7.11824065901736e-05, |
|
"loss": 3.1287, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.3986633612117726, |
|
"eval_loss": 3.4223978519439697, |
|
"eval_runtime": 153.3442, |
|
"eval_samples_per_second": 377.719, |
|
"eval_steps_per_second": 5.908, |
|
"step": 130165 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 0.7617224454879761, |
|
"learning_rate": 7.088820241247426e-05, |
|
"loss": 3.0854, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.7914534211158752, |
|
"learning_rate": 7.059429243895263e-05, |
|
"loss": 3.0764, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.7849559187889099, |
|
"learning_rate": 7.030008826125331e-05, |
|
"loss": 3.0822, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"grad_norm": 0.7480010390281677, |
|
"learning_rate": 7.000588408355398e-05, |
|
"loss": 3.0807, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 0.7867963910102844, |
|
"learning_rate": 6.971167990585466e-05, |
|
"loss": 3.0868, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 0.7274609208106995, |
|
"learning_rate": 6.941806413651074e-05, |
|
"loss": 3.0906, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 0.7924407720565796, |
|
"learning_rate": 6.912385995881142e-05, |
|
"loss": 3.0901, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.751484215259552, |
|
"learning_rate": 6.882965578111209e-05, |
|
"loss": 3.0894, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 0.761904239654541, |
|
"learning_rate": 6.853574580759047e-05, |
|
"loss": 3.098, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"grad_norm": 0.7824081778526306, |
|
"learning_rate": 6.824154162989115e-05, |
|
"loss": 3.0904, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.7663974761962891, |
|
"learning_rate": 6.794733745219183e-05, |
|
"loss": 3.0915, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.7567524909973145, |
|
"learning_rate": 6.765342747867019e-05, |
|
"loss": 3.0938, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.7632171511650085, |
|
"learning_rate": 6.735922330097087e-05, |
|
"loss": 3.0936, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 0.7414125204086304, |
|
"learning_rate": 6.706501912327155e-05, |
|
"loss": 3.0933, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.7134759426116943, |
|
"learning_rate": 6.677081494557223e-05, |
|
"loss": 3.0941, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.7270065546035767, |
|
"learning_rate": 6.64769049720506e-05, |
|
"loss": 3.0953, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 0.7573560476303101, |
|
"learning_rate": 6.618299499852897e-05, |
|
"loss": 3.094, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.7566157579421997, |
|
"learning_rate": 6.588879082082966e-05, |
|
"loss": 3.0995, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.40014243202720035, |
|
"eval_loss": 3.411925792694092, |
|
"eval_runtime": 153.1198, |
|
"eval_samples_per_second": 378.272, |
|
"eval_steps_per_second": 5.917, |
|
"step": 148760 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.7916214466094971, |
|
"learning_rate": 6.559488084730804e-05, |
|
"loss": 3.0778, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 0.7633106708526611, |
|
"learning_rate": 6.530067666960871e-05, |
|
"loss": 3.0389, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.7663868069648743, |
|
"learning_rate": 6.500647249190939e-05, |
|
"loss": 3.0467, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.772544801235199, |
|
"learning_rate": 6.471226831421007e-05, |
|
"loss": 3.0511, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"grad_norm": 0.7798665165901184, |
|
"learning_rate": 6.441835834068844e-05, |
|
"loss": 3.0481, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.7501771450042725, |
|
"learning_rate": 6.412415416298911e-05, |
|
"loss": 3.0541, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 0.7601937651634216, |
|
"learning_rate": 6.383024418946749e-05, |
|
"loss": 3.0572, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.7670091986656189, |
|
"learning_rate": 6.353604001176817e-05, |
|
"loss": 3.0551, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.7640057802200317, |
|
"learning_rate": 6.324183583406885e-05, |
|
"loss": 3.0579, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.7653687596321106, |
|
"learning_rate": 6.294763165636952e-05, |
|
"loss": 3.0599, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.7580332159996033, |
|
"learning_rate": 6.265372168284789e-05, |
|
"loss": 3.0579, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.7833405137062073, |
|
"learning_rate": 6.235951750514857e-05, |
|
"loss": 3.0605, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 0.7054916024208069, |
|
"learning_rate": 6.206560753162696e-05, |
|
"loss": 3.0629, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.7401750087738037, |
|
"learning_rate": 6.177169755810533e-05, |
|
"loss": 3.0622, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 0.7965439558029175, |
|
"learning_rate": 6.147749338040601e-05, |
|
"loss": 3.0641, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 0.747765064239502, |
|
"learning_rate": 6.118328920270668e-05, |
|
"loss": 3.0636, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 0.7850915193557739, |
|
"learning_rate": 6.088908502500735e-05, |
|
"loss": 3.0647, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 0.762381911277771, |
|
"learning_rate": 6.0595469255663425e-05, |
|
"loss": 3.0674, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.7457260489463806, |
|
"learning_rate": 6.0301265077964107e-05, |
|
"loss": 3.0666, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.4014027022046734, |
|
"eval_loss": 3.4093098640441895, |
|
"eval_runtime": 153.0419, |
|
"eval_samples_per_second": 378.465, |
|
"eval_steps_per_second": 5.92, |
|
"step": 167355 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.8109008073806763, |
|
"learning_rate": 6.000706090026479e-05, |
|
"loss": 3.0281, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 0.7763618230819702, |
|
"learning_rate": 5.9712856722565455e-05, |
|
"loss": 3.0099, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 0.7697040438652039, |
|
"learning_rate": 5.941865254486614e-05, |
|
"loss": 3.0167, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 0.8015623688697815, |
|
"learning_rate": 5.9124742571344514e-05, |
|
"loss": 3.021, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.8062806725502014, |
|
"learning_rate": 5.883083259782289e-05, |
|
"loss": 3.0259, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 0.7650690078735352, |
|
"learning_rate": 5.853662842012357e-05, |
|
"loss": 3.0219, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 0.7815614342689514, |
|
"learning_rate": 5.824271844660194e-05, |
|
"loss": 3.0225, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"grad_norm": 0.7516615986824036, |
|
"learning_rate": 5.794880847308032e-05, |
|
"loss": 3.0309, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 0.7918343544006348, |
|
"learning_rate": 5.7654604295381e-05, |
|
"loss": 3.0323, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 0.7646675109863281, |
|
"learning_rate": 5.7360400117681676e-05, |
|
"loss": 3.0324, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"grad_norm": 0.764613151550293, |
|
"learning_rate": 5.706619593998235e-05, |
|
"loss": 3.0294, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"grad_norm": 0.7855122685432434, |
|
"learning_rate": 5.6771991762283025e-05, |
|
"loss": 3.0325, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 0.7663726210594177, |
|
"learning_rate": 5.6477787584583706e-05, |
|
"loss": 3.0352, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"grad_norm": 0.741322934627533, |
|
"learning_rate": 5.618387761106208e-05, |
|
"loss": 3.0389, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"grad_norm": 0.8002005219459534, |
|
"learning_rate": 5.5889967637540454e-05, |
|
"loss": 3.0358, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 0.7639887928962708, |
|
"learning_rate": 5.5595763459841135e-05, |
|
"loss": 3.0366, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"grad_norm": 0.7922505736351013, |
|
"learning_rate": 5.530155928214181e-05, |
|
"loss": 3.0392, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"grad_norm": 0.7338058352470398, |
|
"learning_rate": 5.5007355104442484e-05, |
|
"loss": 3.0395, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.40239089912954695, |
|
"eval_loss": 3.3992717266082764, |
|
"eval_runtime": 153.2452, |
|
"eval_samples_per_second": 377.963, |
|
"eval_steps_per_second": 5.912, |
|
"step": 185950 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.7729306817054749, |
|
"learning_rate": 5.471344513092086e-05, |
|
"loss": 3.0359, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"grad_norm": 0.8014710545539856, |
|
"learning_rate": 5.441924095322154e-05, |
|
"loss": 2.9824, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"grad_norm": 0.7774161100387573, |
|
"learning_rate": 5.412503677552222e-05, |
|
"loss": 2.9844, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 0.7719259262084961, |
|
"learning_rate": 5.383112680200059e-05, |
|
"loss": 2.9878, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"grad_norm": 0.77918940782547, |
|
"learning_rate": 5.353692262430127e-05, |
|
"loss": 2.9942, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"grad_norm": 0.803722620010376, |
|
"learning_rate": 5.3242718446601943e-05, |
|
"loss": 2.9966, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"grad_norm": 0.7969011664390564, |
|
"learning_rate": 5.294851426890262e-05, |
|
"loss": 3.001, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"grad_norm": 0.7830320000648499, |
|
"learning_rate": 5.2654604295380995e-05, |
|
"loss": 3.0019, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"grad_norm": 0.788979709148407, |
|
"learning_rate": 5.236069432185937e-05, |
|
"loss": 3.003, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.49, |
|
"grad_norm": 0.7922942042350769, |
|
"learning_rate": 5.2066490144160054e-05, |
|
"loss": 3.0082, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"grad_norm": 0.7848927974700928, |
|
"learning_rate": 5.177258017063843e-05, |
|
"loss": 3.0071, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.59, |
|
"grad_norm": 0.7817524075508118, |
|
"learning_rate": 5.14783759929391e-05, |
|
"loss": 3.0102, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"grad_norm": 0.7726117968559265, |
|
"learning_rate": 5.118417181523978e-05, |
|
"loss": 3.0095, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"grad_norm": 0.7940185070037842, |
|
"learning_rate": 5.089026184171816e-05, |
|
"loss": 3.0116, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"grad_norm": 0.8069621324539185, |
|
"learning_rate": 5.059605766401884e-05, |
|
"loss": 3.0125, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"grad_norm": 0.7631500959396362, |
|
"learning_rate": 5.03021476904972e-05, |
|
"loss": 3.0131, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"grad_norm": 0.7978447675704956, |
|
"learning_rate": 5.0007943512797884e-05, |
|
"loss": 3.0115, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"grad_norm": 0.8167823553085327, |
|
"learning_rate": 4.971403353927626e-05, |
|
"loss": 3.0125, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"grad_norm": 0.8195194005966187, |
|
"learning_rate": 4.9419829361576935e-05, |
|
"loss": 3.0097, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.40306617822464075, |
|
"eval_loss": 3.4086954593658447, |
|
"eval_runtime": 154.7236, |
|
"eval_samples_per_second": 374.352, |
|
"eval_steps_per_second": 5.856, |
|
"step": 204545 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"grad_norm": 0.7763456106185913, |
|
"learning_rate": 4.9125625183877617e-05, |
|
"loss": 2.9904, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"grad_norm": 0.8207082152366638, |
|
"learning_rate": 4.883142100617829e-05, |
|
"loss": 2.9575, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"grad_norm": 0.8150632977485657, |
|
"learning_rate": 4.853751103265667e-05, |
|
"loss": 2.9651, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.19, |
|
"grad_norm": 0.8188782334327698, |
|
"learning_rate": 4.824360105913504e-05, |
|
"loss": 2.9688, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 0.898553192615509, |
|
"learning_rate": 4.794939688143572e-05, |
|
"loss": 2.9716, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.29, |
|
"grad_norm": 0.8035799860954285, |
|
"learning_rate": 4.7655192703736395e-05, |
|
"loss": 2.977, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.35, |
|
"grad_norm": 0.8373109102249146, |
|
"learning_rate": 4.736098852603707e-05, |
|
"loss": 2.9771, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 0.8262482285499573, |
|
"learning_rate": 4.7067078552515446e-05, |
|
"loss": 2.9828, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.45, |
|
"grad_norm": 0.8329269289970398, |
|
"learning_rate": 4.677287437481612e-05, |
|
"loss": 2.9779, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.51, |
|
"grad_norm": 0.8244719505310059, |
|
"learning_rate": 4.64789644012945e-05, |
|
"loss": 2.9854, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 0.800463855266571, |
|
"learning_rate": 4.618476022359517e-05, |
|
"loss": 2.9844, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.62, |
|
"grad_norm": 0.8081605434417725, |
|
"learning_rate": 4.589085025007355e-05, |
|
"loss": 2.9834, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"grad_norm": 0.8294093608856201, |
|
"learning_rate": 4.5596646072374224e-05, |
|
"loss": 2.9862, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"grad_norm": 0.8135597109794617, |
|
"learning_rate": 4.530273609885261e-05, |
|
"loss": 2.9889, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.78, |
|
"grad_norm": 0.8094319105148315, |
|
"learning_rate": 4.500853192115328e-05, |
|
"loss": 2.9857, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 0.8075074553489685, |
|
"learning_rate": 4.471462194763166e-05, |
|
"loss": 2.9917, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"grad_norm": 0.8017106652259827, |
|
"learning_rate": 4.4420417769932335e-05, |
|
"loss": 2.988, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.94, |
|
"grad_norm": 0.8178462982177734, |
|
"learning_rate": 4.412621359223301e-05, |
|
"loss": 2.9942, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"grad_norm": 0.8240616917610168, |
|
"learning_rate": 4.383230361871139e-05, |
|
"loss": 2.9923, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.4041937035262776, |
|
"eval_loss": 3.402968406677246, |
|
"eval_runtime": 154.7261, |
|
"eval_samples_per_second": 374.345, |
|
"eval_steps_per_second": 5.856, |
|
"step": 223140 |
|
}, |
|
{ |
|
"epoch": 12.05, |
|
"grad_norm": 0.816884458065033, |
|
"learning_rate": 4.353809944101207e-05, |
|
"loss": 2.9496, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 0.8183420896530151, |
|
"learning_rate": 4.3244189467490445e-05, |
|
"loss": 2.9446, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.15, |
|
"grad_norm": 0.8411849737167358, |
|
"learning_rate": 4.2950279493968815e-05, |
|
"loss": 2.9469, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.21, |
|
"grad_norm": 0.836554765701294, |
|
"learning_rate": 4.26560753162695e-05, |
|
"loss": 2.948, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"grad_norm": 0.8519123196601868, |
|
"learning_rate": 4.236187113857017e-05, |
|
"loss": 2.952, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.32, |
|
"grad_norm": 0.8291841745376587, |
|
"learning_rate": 4.2067666960870846e-05, |
|
"loss": 2.9543, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"grad_norm": 0.8258066177368164, |
|
"learning_rate": 4.177375698734922e-05, |
|
"loss": 2.959, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"grad_norm": 0.8320387005805969, |
|
"learning_rate": 4.14798470138276e-05, |
|
"loss": 2.9631, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"grad_norm": 0.8495768308639526, |
|
"learning_rate": 4.1185642836128275e-05, |
|
"loss": 2.9607, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"grad_norm": 0.7984282970428467, |
|
"learning_rate": 4.0891438658428956e-05, |
|
"loss": 2.9598, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.58, |
|
"grad_norm": 0.8098889589309692, |
|
"learning_rate": 4.059723448072963e-05, |
|
"loss": 2.9605, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"grad_norm": 0.8174999356269836, |
|
"learning_rate": 4.030332450720801e-05, |
|
"loss": 2.9623, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"grad_norm": 0.8120893239974976, |
|
"learning_rate": 4.000912032950868e-05, |
|
"loss": 2.9613, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"grad_norm": 0.8061412572860718, |
|
"learning_rate": 3.9714916151809357e-05, |
|
"loss": 2.9632, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 0.8751248121261597, |
|
"learning_rate": 3.942071197411004e-05, |
|
"loss": 2.9681, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"grad_norm": 0.8276800513267517, |
|
"learning_rate": 3.912680200058841e-05, |
|
"loss": 2.9684, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"grad_norm": 0.861995279788971, |
|
"learning_rate": 3.883259782288909e-05, |
|
"loss": 2.967, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 0.8120051026344299, |
|
"learning_rate": 3.853898205354516e-05, |
|
"loss": 2.9703, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.404724884565365, |
|
"eval_loss": 3.393812894821167, |
|
"eval_runtime": 154.9346, |
|
"eval_samples_per_second": 373.842, |
|
"eval_steps_per_second": 5.848, |
|
"step": 241735 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"grad_norm": 0.8200232982635498, |
|
"learning_rate": 3.824477787584584e-05, |
|
"loss": 2.9557, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.07, |
|
"grad_norm": 0.857276201248169, |
|
"learning_rate": 3.795057369814651e-05, |
|
"loss": 2.9224, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 0.8572235107421875, |
|
"learning_rate": 3.765666372462489e-05, |
|
"loss": 2.9268, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.18, |
|
"grad_norm": 0.8236098885536194, |
|
"learning_rate": 3.7362459546925564e-05, |
|
"loss": 2.9321, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"grad_norm": 0.8436420559883118, |
|
"learning_rate": 3.7068255369226245e-05, |
|
"loss": 2.9301, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 0.850407063961029, |
|
"learning_rate": 3.6774345395704615e-05, |
|
"loss": 2.9385, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.34, |
|
"grad_norm": 0.8579990863800049, |
|
"learning_rate": 3.64801412180053e-05, |
|
"loss": 2.934, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"grad_norm": 0.8318254351615906, |
|
"learning_rate": 3.618593704030597e-05, |
|
"loss": 2.9366, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.44, |
|
"grad_norm": 0.8493559956550598, |
|
"learning_rate": 3.589202706678435e-05, |
|
"loss": 2.9407, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.8592551350593567, |
|
"learning_rate": 3.559782288908502e-05, |
|
"loss": 2.942, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"grad_norm": 0.8678461909294128, |
|
"learning_rate": 3.5303618711385704e-05, |
|
"loss": 2.9419, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.61, |
|
"grad_norm": 0.8739249110221863, |
|
"learning_rate": 3.500941453368638e-05, |
|
"loss": 2.9452, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.66, |
|
"grad_norm": 0.8372821807861328, |
|
"learning_rate": 3.4715504560164756e-05, |
|
"loss": 2.944, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"grad_norm": 0.8296213746070862, |
|
"learning_rate": 3.442159458664313e-05, |
|
"loss": 2.9454, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"grad_norm": 0.7957202792167664, |
|
"learning_rate": 3.412739040894381e-05, |
|
"loss": 2.9452, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"grad_norm": 0.8241844773292542, |
|
"learning_rate": 3.3833480435422185e-05, |
|
"loss": 2.9427, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.87, |
|
"grad_norm": 0.8526179194450378, |
|
"learning_rate": 3.353927625772286e-05, |
|
"loss": 2.9461, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.93, |
|
"grad_norm": 0.8483251333236694, |
|
"learning_rate": 3.3245072080023534e-05, |
|
"loss": 2.95, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"grad_norm": 0.8320598602294922, |
|
"learning_rate": 3.2950867902324215e-05, |
|
"loss": 2.9483, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.4050780423650972, |
|
"eval_loss": 3.400024175643921, |
|
"eval_runtime": 154.9428, |
|
"eval_samples_per_second": 373.822, |
|
"eval_steps_per_second": 5.847, |
|
"step": 260330 |
|
}, |
|
{ |
|
"epoch": 14.04, |
|
"grad_norm": 0.8741620182991028, |
|
"learning_rate": 3.265725213298029e-05, |
|
"loss": 2.9159, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.09, |
|
"grad_norm": 0.8629675507545471, |
|
"learning_rate": 3.236304795528097e-05, |
|
"loss": 2.9064, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.14, |
|
"grad_norm": 0.8614948987960815, |
|
"learning_rate": 3.206913798175935e-05, |
|
"loss": 2.9098, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 0.8749313354492188, |
|
"learning_rate": 3.177493380406002e-05, |
|
"loss": 2.9085, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.8873867392539978, |
|
"learning_rate": 3.1480729626360696e-05, |
|
"loss": 2.9161, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.3, |
|
"grad_norm": 0.8229598999023438, |
|
"learning_rate": 3.118681965283907e-05, |
|
"loss": 2.9146, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.36, |
|
"grad_norm": 0.8612464070320129, |
|
"learning_rate": 3.089261547513975e-05, |
|
"loss": 2.9189, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"grad_norm": 0.8653061985969543, |
|
"learning_rate": 3.059841129744043e-05, |
|
"loss": 2.9204, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"grad_norm": 0.8890775442123413, |
|
"learning_rate": 3.03045013239188e-05, |
|
"loss": 2.9217, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.52, |
|
"grad_norm": 0.8957571983337402, |
|
"learning_rate": 3.001029714621948e-05, |
|
"loss": 2.9262, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.57, |
|
"grad_norm": 0.8220794796943665, |
|
"learning_rate": 2.9716092968520155e-05, |
|
"loss": 2.9236, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.63, |
|
"grad_norm": 0.841556191444397, |
|
"learning_rate": 2.942188879082083e-05, |
|
"loss": 2.9248, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"grad_norm": 0.8488039374351501, |
|
"learning_rate": 2.9127684613121508e-05, |
|
"loss": 2.9272, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.74, |
|
"grad_norm": 0.8521600365638733, |
|
"learning_rate": 2.8833774639599885e-05, |
|
"loss": 2.9286, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"grad_norm": 0.878585696220398, |
|
"learning_rate": 2.8539570461900563e-05, |
|
"loss": 2.9254, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"grad_norm": 0.8403483033180237, |
|
"learning_rate": 2.8245954692556636e-05, |
|
"loss": 2.9266, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.9, |
|
"grad_norm": 0.8805426955223083, |
|
"learning_rate": 2.795175051485731e-05, |
|
"loss": 2.932, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.95, |
|
"grad_norm": 0.8541871309280396, |
|
"learning_rate": 2.7657546337157992e-05, |
|
"loss": 2.9286, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.40484641061819276, |
|
"eval_loss": 3.40685772895813, |
|
"eval_runtime": 155.3799, |
|
"eval_samples_per_second": 372.77, |
|
"eval_steps_per_second": 5.831, |
|
"step": 278925 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.851596474647522, |
|
"learning_rate": 2.7363636363636362e-05, |
|
"loss": 2.9252, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.06, |
|
"grad_norm": 0.8839541077613831, |
|
"learning_rate": 2.7069726390114743e-05, |
|
"loss": 2.8894, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"grad_norm": 0.9067598581314087, |
|
"learning_rate": 2.6775522212415417e-05, |
|
"loss": 2.8899, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.17, |
|
"grad_norm": 0.8887162208557129, |
|
"learning_rate": 2.6481318034716095e-05, |
|
"loss": 2.8969, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.22, |
|
"grad_norm": 0.9024575352668762, |
|
"learning_rate": 2.618740806119447e-05, |
|
"loss": 2.8941, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"grad_norm": 0.9097668528556824, |
|
"learning_rate": 2.5893203883495147e-05, |
|
"loss": 2.899, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.33, |
|
"grad_norm": 0.8730289936065674, |
|
"learning_rate": 2.559899970579582e-05, |
|
"loss": 2.8947, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"grad_norm": 0.8862543702125549, |
|
"learning_rate": 2.53047955280965e-05, |
|
"loss": 2.9005, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.43, |
|
"grad_norm": 0.8661286234855652, |
|
"learning_rate": 2.5010885554574877e-05, |
|
"loss": 2.903, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"grad_norm": 0.8767553567886353, |
|
"learning_rate": 2.4716975581053254e-05, |
|
"loss": 2.9094, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"grad_norm": 0.8723335266113281, |
|
"learning_rate": 2.442277140335393e-05, |
|
"loss": 2.9033, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 0.8956754207611084, |
|
"learning_rate": 2.4128567225654606e-05, |
|
"loss": 2.9059, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.65, |
|
"grad_norm": 0.8903990387916565, |
|
"learning_rate": 2.3834363047955284e-05, |
|
"loss": 2.9088, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"grad_norm": 0.8938170671463013, |
|
"learning_rate": 2.354015887025596e-05, |
|
"loss": 2.9057, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.76, |
|
"grad_norm": 0.9013971090316772, |
|
"learning_rate": 2.3245954692556633e-05, |
|
"loss": 2.9101, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.81, |
|
"grad_norm": 0.8826420307159424, |
|
"learning_rate": 2.2952044719035014e-05, |
|
"loss": 2.9129, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.86, |
|
"grad_norm": 0.8986074328422546, |
|
"learning_rate": 2.2658134745513388e-05, |
|
"loss": 2.9102, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.92, |
|
"grad_norm": 0.8564696311950684, |
|
"learning_rate": 2.2363930567814065e-05, |
|
"loss": 2.9093, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"grad_norm": 0.8886106610298157, |
|
"learning_rate": 2.206972639011474e-05, |
|
"loss": 2.9143, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.4056046552606841, |
|
"eval_loss": 3.4019551277160645, |
|
"eval_runtime": 154.6299, |
|
"eval_samples_per_second": 374.578, |
|
"eval_steps_per_second": 5.859, |
|
"step": 297520 |
|
}, |
|
{ |
|
"epoch": 16.03, |
|
"grad_norm": 0.8847858905792236, |
|
"learning_rate": 2.1775522212415418e-05, |
|
"loss": 2.8953, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.08, |
|
"grad_norm": 0.8800062537193298, |
|
"learning_rate": 2.1481318034716096e-05, |
|
"loss": 2.8747, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.13, |
|
"grad_norm": 0.9220243692398071, |
|
"learning_rate": 2.118711385701677e-05, |
|
"loss": 2.8779, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.19, |
|
"grad_norm": 0.9044596552848816, |
|
"learning_rate": 2.0893203883495147e-05, |
|
"loss": 2.8786, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.24, |
|
"grad_norm": 0.9245201945304871, |
|
"learning_rate": 2.0598999705795825e-05, |
|
"loss": 2.883, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.29, |
|
"grad_norm": 0.9172502160072327, |
|
"learning_rate": 2.03050897322742e-05, |
|
"loss": 2.8844, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.35, |
|
"grad_norm": 0.8956001996994019, |
|
"learning_rate": 2.0010885554574877e-05, |
|
"loss": 2.8817, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 0.9131536483764648, |
|
"learning_rate": 1.971697558105325e-05, |
|
"loss": 2.8848, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.46, |
|
"grad_norm": 0.8909400105476379, |
|
"learning_rate": 1.942277140335393e-05, |
|
"loss": 2.8869, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.51, |
|
"grad_norm": 0.9004851579666138, |
|
"learning_rate": 1.9128567225654607e-05, |
|
"loss": 2.8903, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.56, |
|
"grad_norm": 0.8977559804916382, |
|
"learning_rate": 1.883436304795528e-05, |
|
"loss": 2.8893, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.62, |
|
"grad_norm": 0.9052317142486572, |
|
"learning_rate": 1.854045307443366e-05, |
|
"loss": 2.8895, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.67, |
|
"grad_norm": 0.9036689400672913, |
|
"learning_rate": 1.8246248896734333e-05, |
|
"loss": 2.8875, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.72, |
|
"grad_norm": 0.9088098406791687, |
|
"learning_rate": 1.795204471903501e-05, |
|
"loss": 2.8949, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.78, |
|
"grad_norm": 0.8923940658569336, |
|
"learning_rate": 1.7658134745513388e-05, |
|
"loss": 2.8941, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"grad_norm": 0.8931061625480652, |
|
"learning_rate": 1.7363930567814063e-05, |
|
"loss": 2.8923, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.89, |
|
"grad_norm": 0.8664661049842834, |
|
"learning_rate": 1.707002059429244e-05, |
|
"loss": 2.896, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.94, |
|
"grad_norm": 0.880660355091095, |
|
"learning_rate": 1.6775816416593114e-05, |
|
"loss": 2.8917, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 16.99, |
|
"grad_norm": 0.8778538703918457, |
|
"learning_rate": 1.6481612238893792e-05, |
|
"loss": 2.8935, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.4054564593112435, |
|
"eval_loss": 3.409980058670044, |
|
"eval_runtime": 155.1392, |
|
"eval_samples_per_second": 373.349, |
|
"eval_steps_per_second": 5.84, |
|
"step": 316115 |
|
}, |
|
{ |
|
"epoch": 17.05, |
|
"grad_norm": 0.8941461443901062, |
|
"learning_rate": 1.618740806119447e-05, |
|
"loss": 2.8677, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.1, |
|
"grad_norm": 0.8646876215934753, |
|
"learning_rate": 1.5893498087672844e-05, |
|
"loss": 2.8632, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.16, |
|
"grad_norm": 0.9284677505493164, |
|
"learning_rate": 1.5599293909973522e-05, |
|
"loss": 2.8627, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.21, |
|
"grad_norm": 0.9292691349983215, |
|
"learning_rate": 1.53056781406296e-05, |
|
"loss": 2.8676, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.26, |
|
"grad_norm": 0.8917170166969299, |
|
"learning_rate": 1.5011473962930275e-05, |
|
"loss": 2.8713, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.32, |
|
"grad_norm": 0.964272141456604, |
|
"learning_rate": 1.4717269785230949e-05, |
|
"loss": 2.8679, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.37, |
|
"grad_norm": 1.0006158351898193, |
|
"learning_rate": 1.4423359811709328e-05, |
|
"loss": 2.8746, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.42, |
|
"grad_norm": 0.9144309163093567, |
|
"learning_rate": 1.4129155634010003e-05, |
|
"loss": 2.8703, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.48, |
|
"grad_norm": 0.8891538381576538, |
|
"learning_rate": 1.383524566048838e-05, |
|
"loss": 2.8737, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"grad_norm": 0.9083254337310791, |
|
"learning_rate": 1.3541041482789058e-05, |
|
"loss": 2.8691, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.59, |
|
"grad_norm": 0.8879551291465759, |
|
"learning_rate": 1.3247131509267433e-05, |
|
"loss": 2.875, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.64, |
|
"grad_norm": 0.8899810314178467, |
|
"learning_rate": 1.295292733156811e-05, |
|
"loss": 2.8744, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.69, |
|
"grad_norm": 0.9027150273323059, |
|
"learning_rate": 1.2658723153868784e-05, |
|
"loss": 2.8767, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 0.8921009302139282, |
|
"learning_rate": 1.2364518976169462e-05, |
|
"loss": 2.8727, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"grad_norm": 0.9295068979263306, |
|
"learning_rate": 1.207031479847014e-05, |
|
"loss": 2.8767, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.85, |
|
"grad_norm": 0.896858274936676, |
|
"learning_rate": 1.1776404824948514e-05, |
|
"loss": 2.8779, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.91, |
|
"grad_norm": 0.9167591333389282, |
|
"learning_rate": 1.1482200647249191e-05, |
|
"loss": 2.8762, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"grad_norm": 0.9663364887237549, |
|
"learning_rate": 1.1188290673727569e-05, |
|
"loss": 2.8782, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.40578731382212063, |
|
"eval_loss": 3.407121181488037, |
|
"eval_runtime": 155.0726, |
|
"eval_samples_per_second": 373.509, |
|
"eval_steps_per_second": 5.842, |
|
"step": 334710 |
|
}, |
|
{ |
|
"epoch": 18.02, |
|
"grad_norm": 0.9326697587966919, |
|
"learning_rate": 1.0894086496028245e-05, |
|
"loss": 2.8691, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.07, |
|
"grad_norm": 0.922129213809967, |
|
"learning_rate": 1.060017652250662e-05, |
|
"loss": 2.8486, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.12, |
|
"grad_norm": 0.9094411730766296, |
|
"learning_rate": 1.0306266548984996e-05, |
|
"loss": 2.8516, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"grad_norm": 0.9079362154006958, |
|
"learning_rate": 1.0012062371285672e-05, |
|
"loss": 2.854, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.23, |
|
"grad_norm": 0.9157450199127197, |
|
"learning_rate": 9.717858193586348e-06, |
|
"loss": 2.8547, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.28, |
|
"grad_norm": 0.9174096584320068, |
|
"learning_rate": 9.423654015887026e-06, |
|
"loss": 2.8564, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.34, |
|
"grad_norm": 0.9197123050689697, |
|
"learning_rate": 9.130038246543101e-06, |
|
"loss": 2.8525, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"grad_norm": 0.9466710090637207, |
|
"learning_rate": 8.835834068843777e-06, |
|
"loss": 2.8571, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.45, |
|
"grad_norm": 0.9464150667190552, |
|
"learning_rate": 8.541629891144455e-06, |
|
"loss": 2.8611, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 0.9168482422828674, |
|
"learning_rate": 8.247425713445131e-06, |
|
"loss": 2.8564, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"grad_norm": 0.9207190871238708, |
|
"learning_rate": 7.953221535745808e-06, |
|
"loss": 2.8604, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.61, |
|
"grad_norm": 0.9423150420188904, |
|
"learning_rate": 7.659605766401884e-06, |
|
"loss": 2.8611, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.66, |
|
"grad_norm": 0.9121553301811218, |
|
"learning_rate": 7.36540158870256e-06, |
|
"loss": 2.8624, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.71, |
|
"grad_norm": 0.9213626980781555, |
|
"learning_rate": 7.0711974110032376e-06, |
|
"loss": 2.8603, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.77, |
|
"grad_norm": 0.9198666214942932, |
|
"learning_rate": 6.776993233303913e-06, |
|
"loss": 2.8627, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.82, |
|
"grad_norm": 0.9301998615264893, |
|
"learning_rate": 6.483083259782289e-06, |
|
"loss": 2.8613, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.88, |
|
"grad_norm": 0.899235725402832, |
|
"learning_rate": 6.1888790820829655e-06, |
|
"loss": 2.8622, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.93, |
|
"grad_norm": 0.9574356079101562, |
|
"learning_rate": 5.894969108561342e-06, |
|
"loss": 2.8614, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.98, |
|
"grad_norm": 0.9441936016082764, |
|
"learning_rate": 5.6010591350397175e-06, |
|
"loss": 2.8613, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4061827941278128, |
|
"eval_loss": 3.4122915267944336, |
|
"eval_runtime": 154.3443, |
|
"eval_samples_per_second": 375.271, |
|
"eval_steps_per_second": 5.87, |
|
"step": 353305 |
|
}, |
|
{ |
|
"epoch": 19.04, |
|
"grad_norm": 0.9131889939308167, |
|
"learning_rate": 5.3068549573403945e-06, |
|
"loss": 2.8451, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.09, |
|
"grad_norm": 0.9485396146774292, |
|
"learning_rate": 5.0126507796410715e-06, |
|
"loss": 2.8459, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.14, |
|
"grad_norm": 0.9703813791275024, |
|
"learning_rate": 4.718446601941748e-06, |
|
"loss": 2.8399, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 0.9274348616600037, |
|
"learning_rate": 4.424536628420123e-06, |
|
"loss": 2.8446, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.9653975367546082, |
|
"learning_rate": 4.1303324507208e-06, |
|
"loss": 2.8447, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.31, |
|
"grad_norm": 0.9628462195396423, |
|
"learning_rate": 3.8361282730214765e-06, |
|
"loss": 2.8465, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.36, |
|
"grad_norm": 0.9582130312919617, |
|
"learning_rate": 3.5422182994998533e-06, |
|
"loss": 2.8486, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.41, |
|
"grad_norm": 0.9385794401168823, |
|
"learning_rate": 3.24801412180053e-06, |
|
"loss": 2.8457, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.47, |
|
"grad_norm": 0.973229706287384, |
|
"learning_rate": 2.954398352456605e-06, |
|
"loss": 2.8481, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.52, |
|
"grad_norm": 0.9264838099479675, |
|
"learning_rate": 2.660194174757282e-06, |
|
"loss": 2.8433, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.58, |
|
"grad_norm": 0.9257712960243225, |
|
"learning_rate": 2.3659899970579586e-06, |
|
"loss": 2.8487, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"grad_norm": 0.898076057434082, |
|
"learning_rate": 2.071785819358635e-06, |
|
"loss": 2.8459, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.68, |
|
"grad_norm": 0.9263227581977844, |
|
"learning_rate": 1.777875845837011e-06, |
|
"loss": 2.8443, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.74, |
|
"grad_norm": 0.931954026222229, |
|
"learning_rate": 1.4836716681376875e-06, |
|
"loss": 2.8468, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.79, |
|
"grad_norm": 0.9403001666069031, |
|
"learning_rate": 1.1897616946160636e-06, |
|
"loss": 2.8486, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.84, |
|
"grad_norm": 0.9270106554031372, |
|
"learning_rate": 8.955575169167403e-07, |
|
"loss": 2.8456, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.9, |
|
"grad_norm": 0.9441812634468079, |
|
"learning_rate": 6.016475433951162e-07, |
|
"loss": 2.8481, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.95, |
|
"grad_norm": 0.9611223340034485, |
|
"learning_rate": 3.0744336569579287e-07, |
|
"loss": 2.8439, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.40601854249753977, |
|
"eval_loss": 3.4158718585968018, |
|
"eval_runtime": 154.2832, |
|
"eval_samples_per_second": 375.42, |
|
"eval_steps_per_second": 5.872, |
|
"step": 371900 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 371900, |
|
"total_flos": 1.5669257538816e+18, |
|
"train_loss": 3.157264780781544, |
|
"train_runtime": 81435.8042, |
|
"train_samples_per_second": 146.136, |
|
"train_steps_per_second": 4.567 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 371900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.5669257538816e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|