{
  "best_metric": 0.8163847923278809,
  "best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-2500",
  "epoch": 0.0625,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0, "grad_norm": 3.086414337158203, "learning_rate": 4.0000000000000003e-07, "loss": 0.7892, "step": 10 },
    { "epoch": 0.0, "grad_norm": 8.478134155273438, "learning_rate": 8.000000000000001e-07, "loss": 0.7746, "step": 20 },
    { "epoch": 0.0, "grad_norm": 5.574502468109131, "learning_rate": 1.2000000000000002e-06, "loss": 0.8222, "step": 30 },
    { "epoch": 0.0, "grad_norm": 2.6497371196746826, "learning_rate": 1.6000000000000001e-06, "loss": 0.7423, "step": 40 },
    { "epoch": 0.0, "grad_norm": 3.116753339767456, "learning_rate": 2.0000000000000003e-06, "loss": 0.7622, "step": 50 },
    { "epoch": 0.0, "grad_norm": 3.179832696914673, "learning_rate": 2.4000000000000003e-06, "loss": 0.8183, "step": 60 },
    { "epoch": 0.0, "grad_norm": 3.9869463443756104, "learning_rate": 2.8000000000000003e-06, "loss": 0.822, "step": 70 },
    { "epoch": 0.0, "grad_norm": 5.093494415283203, "learning_rate": 3.2000000000000003e-06, "loss": 0.7966, "step": 80 },
    { "epoch": 0.0, "grad_norm": 5.230633735656738, "learning_rate": 3.6000000000000003e-06, "loss": 0.8113, "step": 90 },
    { "epoch": 0.0, "grad_norm": 9.374403953552246, "learning_rate": 4.000000000000001e-06, "loss": 0.7582, "step": 100 },
    { "epoch": 0.0, "grad_norm": 6.465492248535156, "learning_rate": 4.4e-06, "loss": 0.7662, "step": 110 },
    { "epoch": 0.0, "grad_norm": 6.279934883117676, "learning_rate": 4.800000000000001e-06, "loss": 0.8376, "step": 120 },
    { "epoch": 0.0, "grad_norm": 5.799221992492676, "learning_rate": 5.2e-06, "loss": 0.7965, "step": 130 },
    { "epoch": 0.0, "grad_norm": 3.222240686416626, "learning_rate": 5.600000000000001e-06, "loss": 0.8855, "step": 140 },
    { "epoch": 0.0, "grad_norm": 9.009174346923828, "learning_rate": 6e-06, "loss": 0.8394, "step": 150 },
    { "epoch": 0.0, "grad_norm": 8.040350914001465, "learning_rate": 6.4000000000000006e-06, "loss": 0.8426, "step": 160 },
    { "epoch": 0.0, "grad_norm": 4.131030559539795, "learning_rate": 6.800000000000001e-06, "loss": 0.7747, "step": 170 },
    { "epoch": 0.0, "grad_norm": 3.31986927986145, "learning_rate": 7.2000000000000005e-06, "loss": 0.7125, "step": 180 },
    { "epoch": 0.0, "grad_norm": 5.7623395919799805, "learning_rate": 7.600000000000001e-06, "loss": 0.7854, "step": 190 },
    { "epoch": 0.01, "grad_norm": 10.848206520080566, "learning_rate": 8.000000000000001e-06, "loss": 0.7756, "step": 200 },
    { "epoch": 0.01, "grad_norm": 13.455166816711426, "learning_rate": 8.400000000000001e-06, "loss": 0.7894, "step": 210 },
    { "epoch": 0.01, "grad_norm": 12.759767532348633, "learning_rate": 8.8e-06, "loss": 0.7454, "step": 220 },
    { "epoch": 0.01, "grad_norm": 4.262899875640869, "learning_rate": 9.200000000000002e-06, "loss": 0.8555, "step": 230 },
    { "epoch": 0.01, "grad_norm": 4.28985071182251, "learning_rate": 9.600000000000001e-06, "loss": 0.6845, "step": 240 },
    { "epoch": 0.01, "grad_norm": 4.174241542816162, "learning_rate": 1e-05, "loss": 0.7983, "step": 250 },
    { "epoch": 0.01, "grad_norm": 12.931599617004395, "learning_rate": 1.04e-05, "loss": 0.9041, "step": 260 },
    { "epoch": 0.01, "grad_norm": 7.004627227783203, "learning_rate": 1.0800000000000002e-05, "loss": 0.817, "step": 270 },
    { "epoch": 0.01, "grad_norm": 3.6102757453918457, "learning_rate": 1.1200000000000001e-05, "loss": 0.7292, "step": 280 },
    { "epoch": 0.01, "grad_norm": 2.764902353286743, "learning_rate": 1.16e-05, "loss": 0.9042, "step": 290 },
    { "epoch": 0.01, "grad_norm": 3.958317995071411, "learning_rate": 1.2e-05, "loss": 0.7539, "step": 300 },
    { "epoch": 0.01, "grad_norm": 7.098923683166504, "learning_rate": 1.2400000000000002e-05, "loss": 0.7955, "step": 310 },
    { "epoch": 0.01, "grad_norm": 12.129098892211914, "learning_rate": 1.2800000000000001e-05, "loss": 0.849, "step": 320 },
    { "epoch": 0.01, "grad_norm": 2.054119825363159, "learning_rate": 1.3200000000000002e-05, "loss": 0.8645, "step": 330 },
    { "epoch": 0.01, "grad_norm": 5.205028057098389, "learning_rate": 1.3600000000000002e-05, "loss": 0.8175, "step": 340 },
    { "epoch": 0.01, "grad_norm": 2.614790439605713, "learning_rate": 1.4e-05, "loss": 0.8998, "step": 350 },
    { "epoch": 0.01, "grad_norm": 2.9891204833984375, "learning_rate": 1.4400000000000001e-05, "loss": 0.8108, "step": 360 },
    { "epoch": 0.01, "grad_norm": 4.152099609375, "learning_rate": 1.48e-05, "loss": 0.7855, "step": 370 },
    { "epoch": 0.01, "grad_norm": 9.833850860595703, "learning_rate": 1.5200000000000002e-05, "loss": 0.7736, "step": 380 },
    { "epoch": 0.01, "grad_norm": 3.849621295928955, "learning_rate": 1.5600000000000003e-05, "loss": 0.7668, "step": 390 },
    { "epoch": 0.01, "grad_norm": 5.4542975425720215, "learning_rate": 1.6000000000000003e-05, "loss": 0.7781, "step": 400 },
    { "epoch": 0.01, "grad_norm": 6.197661876678467, "learning_rate": 1.64e-05, "loss": 0.8654, "step": 410 },
    { "epoch": 0.01, "grad_norm": 3.2606770992279053, "learning_rate": 1.6800000000000002e-05, "loss": 0.7565, "step": 420 },
    { "epoch": 0.01, "grad_norm": 3.9680209159851074, "learning_rate": 1.72e-05, "loss": 0.7886, "step": 430 },
    { "epoch": 0.01, "grad_norm": 18.749984741210938, "learning_rate": 1.76e-05, "loss": 0.7305, "step": 440 },
    { "epoch": 0.01, "grad_norm": 5.822000503540039, "learning_rate": 1.8e-05, "loss": 0.7833, "step": 450 },
    { "epoch": 0.01, "grad_norm": 12.999715805053711, "learning_rate": 1.8400000000000003e-05, "loss": 0.8483, "step": 460 },
    { "epoch": 0.01, "grad_norm": 7.193736553192139, "learning_rate": 1.88e-05, "loss": 0.84, "step": 470 },
    { "epoch": 0.01, "grad_norm": 12.573124885559082, "learning_rate": 1.9200000000000003e-05, "loss": 0.8437, "step": 480 },
    { "epoch": 0.01, "grad_norm": 4.4221601486206055, "learning_rate": 1.9600000000000002e-05, "loss": 0.6836, "step": 490 },
    { "epoch": 0.01, "grad_norm": 3.0399410724639893, "learning_rate": 2e-05, "loss": 0.8264, "step": 500 },
    { "epoch": 0.01, "eval_loss": 0.8175864219665527, "eval_runtime": 67.7802, "eval_samples_per_second": 14.754, "eval_steps_per_second": 14.754, "step": 500 },
    { "epoch": 0.01, "grad_norm": 3.971303701400757, "learning_rate": 1.9978947368421054e-05, "loss": 0.7385, "step": 510 },
    { "epoch": 0.01, "grad_norm": 3.8043839931488037, "learning_rate": 1.9957894736842107e-05, "loss": 0.7826, "step": 520 },
    { "epoch": 0.01, "grad_norm": 11.702253341674805, "learning_rate": 1.993684210526316e-05, "loss": 0.7971, "step": 530 },
    { "epoch": 0.01, "grad_norm": 5.176826000213623, "learning_rate": 1.9915789473684212e-05, "loss": 0.748, "step": 540 },
    { "epoch": 0.01, "grad_norm": 7.120133876800537, "learning_rate": 1.9894736842105265e-05, "loss": 0.8461, "step": 550 },
    { "epoch": 0.01, "grad_norm": 12.286151885986328, "learning_rate": 1.9873684210526318e-05, "loss": 0.8335, "step": 560 },
    { "epoch": 0.01, "grad_norm": 7.857172966003418, "learning_rate": 1.985263157894737e-05, "loss": 0.7231, "step": 570 },
    { "epoch": 0.01, "grad_norm": 5.327859401702881, "learning_rate": 1.9831578947368423e-05, "loss": 0.877, "step": 580 },
    { "epoch": 0.01, "grad_norm": 6.9340362548828125, "learning_rate": 1.9810526315789476e-05, "loss": 0.8984, "step": 590 },
    { "epoch": 0.01, "grad_norm": 2.1034326553344727, "learning_rate": 1.9789473684210528e-05, "loss": 0.7045, "step": 600 },
    { "epoch": 0.02, "grad_norm": 3.853721857070923, "learning_rate": 1.976842105263158e-05, "loss": 0.761, "step": 610 },
    { "epoch": 0.02, "grad_norm": 7.6926398277282715, "learning_rate": 1.9747368421052633e-05, "loss": 0.9493, "step": 620 },
    { "epoch": 0.02, "grad_norm": 6.261799335479736, "learning_rate": 1.9726315789473686e-05, "loss": 0.7719, "step": 630 },
    { "epoch": 0.02, "grad_norm": 3.864114284515381, "learning_rate": 1.970526315789474e-05, "loss": 0.9406, "step": 640 },
    { "epoch": 0.02, "grad_norm": 7.093533515930176, "learning_rate": 1.968421052631579e-05, "loss": 0.7951, "step": 650 },
    { "epoch": 0.02, "grad_norm": 2.3724496364593506, "learning_rate": 1.9663157894736844e-05, "loss": 0.8648, "step": 660 },
    { "epoch": 0.02, "grad_norm": 10.12341022491455, "learning_rate": 1.9642105263157897e-05, "loss": 0.7823, "step": 670 },
    { "epoch": 0.02, "grad_norm": 2.80940842628479, "learning_rate": 1.962105263157895e-05, "loss": 0.706, "step": 680 },
    { "epoch": 0.02, "grad_norm": 8.243487358093262, "learning_rate": 1.9600000000000002e-05, "loss": 0.8244, "step": 690 },
    { "epoch": 0.02, "grad_norm": 11.420123100280762, "learning_rate": 1.9578947368421055e-05, "loss": 0.6753, "step": 700 },
    { "epoch": 0.02, "grad_norm": 63.8618278503418, "learning_rate": 1.9557894736842107e-05, "loss": 0.8309, "step": 710 },
    { "epoch": 0.02, "grad_norm": 4.521258354187012, "learning_rate": 1.953684210526316e-05, "loss": 0.8101, "step": 720 },
    { "epoch": 0.02, "grad_norm": 2.9532318115234375, "learning_rate": 1.9515789473684213e-05, "loss": 0.8533, "step": 730 },
    { "epoch": 0.02, "grad_norm": 3.792180061340332, "learning_rate": 1.9494736842105265e-05, "loss": 0.7573, "step": 740 },
    { "epoch": 0.02, "grad_norm": 5.155513286590576, "learning_rate": 1.9473684210526318e-05, "loss": 0.8961, "step": 750 },
    { "epoch": 0.02, "grad_norm": 9.195950508117676, "learning_rate": 1.945263157894737e-05, "loss": 0.8398, "step": 760 },
    { "epoch": 0.02, "grad_norm": 6.699478626251221, "learning_rate": 1.9431578947368423e-05, "loss": 0.8018, "step": 770 },
    { "epoch": 0.02, "grad_norm": 5.254507541656494, "learning_rate": 1.9410526315789476e-05, "loss": 0.8408, "step": 780 },
    { "epoch": 0.02, "grad_norm": 4.351966857910156, "learning_rate": 1.9389473684210525e-05, "loss": 0.7323, "step": 790 },
    { "epoch": 0.02, "grad_norm": 2.361276626586914, "learning_rate": 1.936842105263158e-05, "loss": 0.8401, "step": 800 },
    { "epoch": 0.02, "grad_norm": 5.449990272521973, "learning_rate": 1.9347368421052634e-05, "loss": 0.726, "step": 810 },
    { "epoch": 0.02, "grad_norm": 5.375738143920898, "learning_rate": 1.9326315789473687e-05, "loss": 0.8305, "step": 820 },
    { "epoch": 0.02, "grad_norm": 2.601025342941284, "learning_rate": 1.930526315789474e-05, "loss": 0.9152, "step": 830 },
    { "epoch": 0.02, "grad_norm": 12.153268814086914, "learning_rate": 1.9284210526315792e-05, "loss": 0.8423, "step": 840 },
    { "epoch": 0.02, "grad_norm": 3.785663604736328, "learning_rate": 1.9263157894736845e-05, "loss": 0.7733, "step": 850 },
    { "epoch": 0.02, "grad_norm": 10.162787437438965, "learning_rate": 1.9242105263157894e-05, "loss": 0.893, "step": 860 },
    { "epoch": 0.02, "grad_norm": 3.871621608734131, "learning_rate": 1.922105263157895e-05, "loss": 0.798, "step": 870 },
    { "epoch": 0.02, "grad_norm": 2.9919800758361816, "learning_rate": 1.9200000000000003e-05, "loss": 0.8484, "step": 880 },
    { "epoch": 0.02, "grad_norm": 5.40109920501709, "learning_rate": 1.9178947368421055e-05, "loss": 0.9129, "step": 890 },
    { "epoch": 0.02, "grad_norm": 6.794926643371582, "learning_rate": 1.9157894736842108e-05, "loss": 0.8687, "step": 900 },
    { "epoch": 0.02, "grad_norm": 5.942440986633301, "learning_rate": 1.913684210526316e-05, "loss": 0.8564, "step": 910 },
    { "epoch": 0.02, "grad_norm": 5.968307018280029, "learning_rate": 1.9115789473684213e-05, "loss": 0.8495, "step": 920 },
    { "epoch": 0.02, "grad_norm": 8.425616264343262, "learning_rate": 1.9094736842105262e-05, "loss": 0.7242, "step": 930 },
    { "epoch": 0.02, "grad_norm": 2.819301128387451, "learning_rate": 1.907368421052632e-05, "loss": 0.8381, "step": 940 },
    { "epoch": 0.02, "grad_norm": 6.81688117980957, "learning_rate": 1.9052631578947368e-05, "loss": 0.8817, "step": 950 },
    { "epoch": 0.02, "grad_norm": 5.102423191070557, "learning_rate": 1.9031578947368424e-05, "loss": 0.8274, "step": 960 },
    { "epoch": 0.02, "grad_norm": 4.12994909286499, "learning_rate": 1.9010526315789476e-05, "loss": 0.7052, "step": 970 },
    { "epoch": 0.02, "grad_norm": 5.15468692779541, "learning_rate": 1.898947368421053e-05, "loss": 0.772, "step": 980 },
    { "epoch": 0.02, "grad_norm": 1.62323796749115, "learning_rate": 1.8968421052631582e-05, "loss": 0.7764, "step": 990 },
    { "epoch": 0.03, "grad_norm": 2.546677589416504, "learning_rate": 1.894736842105263e-05, "loss": 0.8365, "step": 1000 },
    { "epoch": 0.03, "eval_loss": 0.7952949404716492, "eval_runtime": 67.7544, "eval_samples_per_second": 14.759, "eval_steps_per_second": 14.759, "step": 1000 },
    { "epoch": 0.03, "grad_norm": 9.28386402130127, "learning_rate": 1.8926315789473687e-05, "loss": 0.8765, "step": 1010 },
    { "epoch": 0.03, "grad_norm": 7.3430304527282715, "learning_rate": 1.8905263157894736e-05, "loss": 0.8763, "step": 1020 },
    { "epoch": 0.03, "grad_norm": 4.0531206130981445, "learning_rate": 1.8884210526315792e-05, "loss": 0.7943, "step": 1030 },
    { "epoch": 0.03, "grad_norm": 3.028320074081421, "learning_rate": 1.886315789473684e-05, "loss": 0.836, "step": 1040 },
    { "epoch": 0.03, "grad_norm": 3.3861188888549805, "learning_rate": 1.8842105263157898e-05, "loss": 0.7336, "step": 1050 },
    { "epoch": 0.03, "grad_norm": 3.7832908630371094, "learning_rate": 1.882105263157895e-05, "loss": 0.9283, "step": 1060 },
    { "epoch": 0.03, "grad_norm": 3.8170342445373535, "learning_rate": 1.88e-05, "loss": 0.7655, "step": 1070 },
    { "epoch": 0.03, "grad_norm": 6.15322732925415, "learning_rate": 1.8778947368421056e-05, "loss": 0.9341, "step": 1080 },
    { "epoch": 0.03, "grad_norm": 7.066686153411865, "learning_rate": 1.8757894736842105e-05, "loss": 0.85, "step": 1090 },
    { "epoch": 0.03, "grad_norm": 2.986961603164673, "learning_rate": 1.873684210526316e-05, "loss": 0.8943, "step": 1100 },
    { "epoch": 0.03, "grad_norm": 2.8456902503967285, "learning_rate": 1.871578947368421e-05, "loss": 0.8279, "step": 1110 },
    { "epoch": 0.03, "grad_norm": 3.6177377700805664, "learning_rate": 1.8694736842105266e-05, "loss": 0.8192, "step": 1120 },
    { "epoch": 0.03, "grad_norm": 14.768010139465332, "learning_rate": 1.8673684210526316e-05, "loss": 0.8005, "step": 1130 },
    { "epoch": 0.03, "grad_norm": 11.347342491149902, "learning_rate": 1.8652631578947368e-05, "loss": 0.8081, "step": 1140 },
    { "epoch": 0.03, "grad_norm": 4.0560150146484375, "learning_rate": 1.8631578947368424e-05, "loss": 0.9389, "step": 1150 },
    { "epoch": 0.03, "grad_norm": 3.3164710998535156, "learning_rate": 1.8610526315789473e-05, "loss": 0.8501, "step": 1160 },
    { "epoch": 0.03, "grad_norm": 11.112225532531738, "learning_rate": 1.858947368421053e-05, "loss": 0.7162, "step": 1170 },
    { "epoch": 0.03, "grad_norm": 6.200588703155518, "learning_rate": 1.856842105263158e-05, "loss": 0.7448, "step": 1180 },
    { "epoch": 0.03, "grad_norm": 6.573482513427734, "learning_rate": 1.8547368421052635e-05, "loss": 0.8071, "step": 1190 },
    { "epoch": 0.03, "grad_norm": 5.153548717498779, "learning_rate": 1.8526315789473684e-05, "loss": 0.7957, "step": 1200 },
    { "epoch": 0.03, "grad_norm": 5.3308305740356445, "learning_rate": 1.8505263157894737e-05, "loss": 0.7301, "step": 1210 },
    { "epoch": 0.03, "grad_norm": 5.269808769226074, "learning_rate": 1.8484210526315793e-05, "loss": 0.8072, "step": 1220 },
    { "epoch": 0.03, "grad_norm": 5.588324546813965, "learning_rate": 1.8463157894736842e-05, "loss": 0.8587, "step": 1230 },
    { "epoch": 0.03, "grad_norm": 4.593557357788086, "learning_rate": 1.8442105263157898e-05, "loss": 0.856, "step": 1240 },
    { "epoch": 0.03, "grad_norm": 5.2591094970703125, "learning_rate": 1.8421052631578947e-05, "loss": 0.7717, "step": 1250 },
    { "epoch": 0.03, "grad_norm": 4.052567958831787, "learning_rate": 1.8400000000000003e-05, "loss": 0.7823, "step": 1260 },
    { "epoch": 0.03, "grad_norm": 4.447838306427002, "learning_rate": 1.8378947368421053e-05, "loss": 0.83, "step": 1270 },
    { "epoch": 0.03, "grad_norm": 4.029257774353027, "learning_rate": 1.8357894736842105e-05, "loss": 0.7504, "step": 1280 },
    { "epoch": 0.03, "grad_norm": 9.053960800170898, "learning_rate": 1.8336842105263158e-05, "loss": 0.9074, "step": 1290 },
    { "epoch": 0.03, "grad_norm": 2.2877705097198486, "learning_rate": 1.831578947368421e-05, "loss": 0.772, "step": 1300 },
    { "epoch": 0.03, "grad_norm": 3.4482290744781494, "learning_rate": 1.8294736842105267e-05, "loss": 0.8658, "step": 1310 },
    { "epoch": 0.03, "grad_norm": 6.684794902801514, "learning_rate": 1.8273684210526316e-05, "loss": 0.7848, "step": 1320 },
    { "epoch": 0.03, "grad_norm": 3.553828716278076, "learning_rate": 1.8252631578947372e-05, "loss": 0.8219, "step": 1330 },
    { "epoch": 0.03, "grad_norm": 2.5203397274017334, "learning_rate": 1.823157894736842e-05, "loss": 0.9071, "step": 1340 },
    { "epoch": 0.03, "grad_norm": 4.961795806884766, "learning_rate": 1.8210526315789477e-05, "loss": 0.6542, "step": 1350 },
    { "epoch": 0.03, "grad_norm": 3.663081645965576, "learning_rate": 1.8189473684210527e-05, "loss": 0.7402, "step": 1360 },
    { "epoch": 0.03, "grad_norm": 8.785040855407715, "learning_rate": 1.816842105263158e-05, "loss": 0.7462, "step": 1370 },
    { "epoch": 0.03, "grad_norm": 4.659074783325195, "learning_rate": 1.8147368421052632e-05, "loss": 0.6951, "step": 1380 },
    { "epoch": 0.03, "grad_norm": 3.5885703563690186, "learning_rate": 1.8126315789473685e-05, "loss": 0.7008, "step": 1390 },
    { "epoch": 0.04, "grad_norm": 3.1295347213745117, "learning_rate": 1.810526315789474e-05, "loss": 0.9103, "step": 1400 },
    { "epoch": 0.04, "grad_norm": 2.4699888229370117, "learning_rate": 1.808421052631579e-05, "loss": 0.841, "step": 1410 },
    { "epoch": 0.04, "grad_norm": 5.3273444175720215, "learning_rate": 1.8063157894736846e-05, "loss": 0.9041, "step": 1420 },
    { "epoch": 0.04, "grad_norm": 5.149638652801514, "learning_rate": 1.8042105263157895e-05, "loss": 0.7784, "step": 1430 },
    { "epoch": 0.04, "grad_norm": 3.4124910831451416, "learning_rate": 1.8021052631578948e-05, "loss": 0.8208, "step": 1440 },
    { "epoch": 0.04, "grad_norm": 2.9231085777282715, "learning_rate": 1.8e-05, "loss": 0.7173, "step": 1450 },
    { "epoch": 0.04, "grad_norm": 4.008113384246826, "learning_rate": 1.7978947368421053e-05, "loss": 0.7383, "step": 1460 },
    { "epoch": 0.04, "grad_norm": 5.1748046875, "learning_rate": 1.795789473684211e-05, "loss": 0.8399, "step": 1470 },
    { "epoch": 0.04, "grad_norm": 3.4990293979644775, "learning_rate": 1.793684210526316e-05, "loss": 0.6721, "step": 1480 },
    { "epoch": 0.04, "grad_norm": 3.1186299324035645, "learning_rate": 1.7915789473684214e-05, "loss": 0.782, "step": 1490 },
    { "epoch": 0.04, "grad_norm": 5.12732458114624, "learning_rate": 1.7894736842105264e-05, "loss": 0.7211, "step": 1500 },
    { "epoch": 0.04, "eval_loss": 0.811568021774292, "eval_runtime": 67.7961, "eval_samples_per_second": 14.75, "eval_steps_per_second": 14.75, "step": 1500 },
    { "epoch": 0.04, "grad_norm": 3.631096124649048, "learning_rate": 1.7873684210526316e-05, "loss": 0.7557, "step": 1510 },
    { "epoch": 0.04, "grad_norm": 8.850045204162598, "learning_rate": 1.785263157894737e-05, "loss": 0.8757, "step": 1520 },
    { "epoch": 0.04, "grad_norm": 3.1114978790283203, "learning_rate": 1.7831578947368422e-05, "loss": 0.7613, "step": 1530 },
    { "epoch": 0.04, "grad_norm": 4.5038743019104, "learning_rate": 1.7810526315789474e-05, "loss": 0.8049, "step": 1540 },
    { "epoch": 0.04, "grad_norm": 4.2331156730651855, "learning_rate": 1.7789473684210527e-05, "loss": 0.8277, "step": 1550 },
    { "epoch": 0.04, "grad_norm": 5.05696964263916, "learning_rate": 1.7768421052631583e-05, "loss": 0.7973, "step": 1560 },
    { "epoch": 0.04, "grad_norm": 2.1331920623779297, "learning_rate": 1.7747368421052632e-05, "loss": 0.7688, "step": 1570 },
    { "epoch": 0.04, "grad_norm": 4.984541416168213, "learning_rate": 1.7726315789473685e-05, "loss": 0.7865, "step": 1580 },
    { "epoch": 0.04, "grad_norm": 7.149406433105469, "learning_rate": 1.7705263157894738e-05, "loss": 0.7728, "step": 1590 },
    { "epoch": 0.04, "grad_norm": 8.092243194580078, "learning_rate": 1.768421052631579e-05, "loss": 0.935, "step": 1600 },
    { "epoch": 0.04, "grad_norm": 13.16551399230957, "learning_rate": 1.7663157894736843e-05, "loss": 0.8286, "step": 1610 },
    { "epoch": 0.04, "grad_norm": 2.131350517272949, "learning_rate": 1.7642105263157896e-05, "loss": 0.7864, "step": 1620 },
    { "epoch": 0.04, "grad_norm": 7.870023727416992, "learning_rate": 1.7621052631578948e-05, "loss": 0.8645, "step": 1630 },
    { "epoch": 0.04, "grad_norm": 10.631692886352539, "learning_rate": 1.76e-05, "loss": 0.8473, "step": 1640 },
    { "epoch": 0.04, "grad_norm": 6.421032905578613, "learning_rate": 1.7578947368421054e-05, "loss": 0.7868, "step": 1650 },
    { "epoch": 0.04, "grad_norm": 4.57529878616333, "learning_rate": 1.7557894736842106e-05, "loss": 0.7882, "step": 1660 },
    { "epoch": 0.04, "grad_norm": 3.8785624504089355, "learning_rate": 1.753684210526316e-05, "loss": 0.7543, "step": 1670 },
    { "epoch": 0.04, "grad_norm": 5.722006320953369, "learning_rate": 1.751578947368421e-05, "loss": 0.9626, "step": 1680 },
    { "epoch": 0.04, "grad_norm": 2.466771364212036, "learning_rate": 1.7494736842105264e-05, "loss": 0.783, "step": 1690 },
    { "epoch": 0.04, "grad_norm": 3.072049856185913, "learning_rate": 1.7473684210526317e-05, "loss": 0.7503, "step": 1700 },
    { "epoch": 0.04, "grad_norm": 5.768575668334961, "learning_rate": 1.745263157894737e-05, "loss": 0.8193, "step": 1710 },
    { "epoch": 0.04, "grad_norm": 2.585022211074829, "learning_rate": 1.7431578947368422e-05, "loss": 0.8808, "step": 1720 },
    { "epoch": 0.04, "grad_norm": 3.0711567401885986, "learning_rate": 1.7410526315789475e-05, "loss": 0.8098, "step": 1730 },
    { "epoch": 0.04, "grad_norm": 3.3020272254943848, "learning_rate": 1.7389473684210527e-05, "loss": 0.7196, "step": 1740 },
    { "epoch": 0.04, "grad_norm": 3.645238161087036, "learning_rate": 1.736842105263158e-05, "loss": 0.8904, "step": 1750 },
    { "epoch": 0.04, "grad_norm": 6.018638610839844, "learning_rate": 1.7347368421052633e-05, "loss": 0.7937, "step": 1760 },
    { "epoch": 0.04, "grad_norm": 3.629096746444702, "learning_rate": 1.7326315789473685e-05, "loss": 0.9171, "step": 1770 },
    { "epoch": 0.04, "grad_norm": 2.5619189739227295, "learning_rate": 1.7305263157894738e-05, "loss": 0.9488, "step": 1780 },
    { "epoch": 0.04, "grad_norm": 9.464752197265625, "learning_rate": 1.728421052631579e-05, "loss": 0.8459, "step": 1790 },
    { "epoch": 0.04, "grad_norm": 3.9856364727020264, "learning_rate": 1.7263157894736843e-05, "loss": 0.8378, "step": 1800 },
    { "epoch": 0.05, "grad_norm": 3.753553867340088, "learning_rate": 1.7242105263157896e-05, "loss": 0.8093, "step": 1810 },
    { "epoch": 0.05, "grad_norm": 3.4593358039855957, "learning_rate": 1.722105263157895e-05, "loss": 0.7896, "step": 1820 },
    { "epoch": 0.05, "grad_norm": 2.7163546085357666, "learning_rate": 1.72e-05, "loss": 0.7188, "step": 1830 },
    { "epoch": 0.05, "grad_norm": 3.105628728866577, "learning_rate": 1.7178947368421054e-05, "loss": 0.7643, "step": 1840 },
    { "epoch": 0.05, "grad_norm": 2.387368679046631, "learning_rate": 1.7157894736842107e-05, "loss": 0.8465, "step": 1850 },
    { "epoch": 0.05, "grad_norm": 6.020385265350342, "learning_rate": 1.713684210526316e-05, "loss": 0.7798, "step": 1860 },
    { "epoch": 0.05, "grad_norm": 4.560520172119141, "learning_rate": 1.7115789473684212e-05, "loss": 0.7704, "step": 1870 },
    { "epoch": 0.05, "grad_norm": 15.739727973937988, "learning_rate": 1.7094736842105265e-05, "loss": 0.7148, "step": 1880 },
    { "epoch": 0.05, "grad_norm": 5.79690408706665, "learning_rate": 1.7073684210526317e-05, "loss": 0.798, "step": 1890 },
    { "epoch": 0.05, "grad_norm": 2.6939146518707275, "learning_rate": 1.705263157894737e-05, "loss": 0.7641, "step": 1900 },
    { "epoch": 0.05, "grad_norm": 5.193384170532227, "learning_rate": 1.7031578947368423e-05, "loss": 0.7866, "step": 1910 },
    { "epoch": 0.05, "grad_norm": 4.940731525421143, "learning_rate": 1.7010526315789475e-05, "loss": 0.8261, "step": 1920 },
    { "epoch": 0.05, "grad_norm": 2.1812446117401123, "learning_rate": 1.6989473684210528e-05, "loss": 0.7973, "step": 1930 },
    { "epoch": 0.05, "grad_norm": 3.7413289546966553, "learning_rate": 1.696842105263158e-05, "loss": 0.7818, "step": 1940 },
    { "epoch": 0.05, "grad_norm": 4.024014472961426, "learning_rate": 1.6947368421052633e-05, "loss": 0.7237, "step": 1950 },
    { "epoch": 0.05, "grad_norm": 3.0871291160583496, "learning_rate": 1.6926315789473686e-05, "loss": 0.772, "step": 1960 },
    { "epoch": 0.05, "grad_norm": 3.28814435005188, "learning_rate": 1.690526315789474e-05, "loss": 0.7067, "step": 1970 },
    { "epoch": 0.05, "grad_norm": 2.8241286277770996, "learning_rate": 1.688421052631579e-05, "loss": 0.8175, "step": 1980 },
    { "epoch": 0.05, "grad_norm": 2.5942068099975586, "learning_rate": 1.6863157894736844e-05, "loss": 0.9265, "step": 1990 },
    { "epoch": 0.05, "grad_norm": 6.6822662353515625, "learning_rate": 1.6842105263157896e-05, "loss": 0.8593, "step": 2000 },
    { "epoch": 0.05, "eval_loss": 0.8064771890640259, "eval_runtime": 67.7887, "eval_samples_per_second": 14.752, "eval_steps_per_second": 14.752, "step": 2000 },
    { "epoch": 0.05, "grad_norm": 7.032164573669434, "learning_rate": 1.682105263157895e-05, "loss": 0.8819, "step": 2010 },
    { "epoch": 0.05, "grad_norm": 4.874982833862305, "learning_rate": 1.6800000000000002e-05, "loss": 0.8021, "step": 2020 },
    { "epoch": 0.05, "grad_norm": 2.6172547340393066, "learning_rate": 1.6778947368421054e-05, "loss": 0.8017, "step": 2030 },
    { "epoch": 0.05, "grad_norm": 10.659741401672363, "learning_rate": 1.6757894736842107e-05, "loss": 0.8896, "step": 2040 },
    { "epoch": 0.05, "grad_norm": 6.189141750335693, "learning_rate": 1.673684210526316e-05, "loss": 0.7997, "step": 2050 },
    { "epoch": 0.05, "grad_norm": 4.523468971252441, "learning_rate": 1.6715789473684212e-05, "loss": 0.8498, "step": 2060 },
    { "epoch": 0.05, "grad_norm": 8.533658981323242, "learning_rate": 1.6694736842105265e-05, "loss": 0.8857, "step": 2070 },
    { "epoch": 0.05, "grad_norm": 3.0041606426239014, "learning_rate": 1.6673684210526318e-05, "loss": 0.8112, "step": 2080 },
    { "epoch": 0.05, "grad_norm": 5.055651664733887, "learning_rate": 1.665263157894737e-05, "loss": 0.7872, "step": 2090 },
    { "epoch": 0.05, "grad_norm": 5.761922836303711, "learning_rate": 1.6631578947368423e-05, "loss": 0.7727, "step": 2100 },
    { "epoch": 0.05, "grad_norm": 2.518223524093628, "learning_rate": 1.6610526315789476e-05, "loss": 0.7997, "step": 2110 },
    { "epoch": 0.05, "grad_norm": 4.975761890411377, "learning_rate": 1.658947368421053e-05, "loss": 0.7457, "step": 2120 },
    { "epoch": 0.05, "grad_norm": 3.2227561473846436, "learning_rate": 1.656842105263158e-05, "loss": 0.816, "step": 2130 },
    { "epoch": 0.05, "grad_norm": 4.705923080444336, "learning_rate": 1.6547368421052634e-05, "loss": 0.8113, "step": 2140 },
    { "epoch": 0.05, "grad_norm": 2.655057430267334, "learning_rate": 1.6526315789473686e-05, "loss": 0.7912, "step": 2150 },
    { "epoch": 0.05, "grad_norm": 3.0186755657196045, "learning_rate": 1.650526315789474e-05, "loss": 0.8608, "step": 2160 },
    { "epoch": 0.05, "grad_norm": 1.232386827468872, "learning_rate": 1.648421052631579e-05, "loss": 0.8549, "step": 2170 },
    { "epoch": 0.05, "grad_norm": 11.968620300292969, "learning_rate": 1.6463157894736844e-05, "loss": 0.868, "step": 2180 },
    { "epoch": 0.05, "grad_norm": 3.5853216648101807, "learning_rate": 1.6442105263157897e-05, "loss": 0.8388, "step": 2190 },
    { "epoch": 0.06, "grad_norm": 2.375610589981079, "learning_rate": 1.642105263157895e-05, "loss": 0.9111, "step": 2200 },
    { "epoch": 0.06, "grad_norm": 1.9734487533569336, "learning_rate": 1.64e-05, "loss": 0.7288, "step": 2210 },
    { "epoch": 0.06, "grad_norm": 10.517192840576172, "learning_rate": 1.6378947368421055e-05, "loss": 0.698, "step": 2220 },
    { "epoch": 0.06, "grad_norm": 4.183718204498291, "learning_rate": 1.6357894736842108e-05, "loss": 0.7759, "step": 2230 },
    { "epoch": 0.06, "grad_norm": 3.9075675010681152, "learning_rate": 1.633684210526316e-05, "loss": 0.7829, "step": 2240 },
    { "epoch": 0.06, "grad_norm": 5.287744998931885, "learning_rate": 1.6315789473684213e-05, "loss": 0.7057, "step": 2250 },
    { "epoch": 0.06, "grad_norm": 4.977657318115234, "learning_rate": 1.6294736842105265e-05, "loss": 0.8346, "step": 2260 },
    { "epoch": 0.06, "grad_norm": 7.196689128875732, "learning_rate": 1.6273684210526318e-05, "loss": 0.8508, "step": 2270 },
    { "epoch": 0.06, "grad_norm": 2.467477798461914, "learning_rate": 1.6252631578947367e-05, "loss": 0.7179, "step": 2280 },
    { "epoch": 0.06, "grad_norm": 7.059762954711914, "learning_rate": 1.6231578947368423e-05, "loss": 0.7549, "step": 2290 },
    { "epoch": 0.06, "grad_norm": 3.980865955352783, "learning_rate": 1.6210526315789473e-05, "loss": 0.814, "step": 2300 },
    { "epoch": 0.06, "grad_norm": 7.675939559936523, "learning_rate": 1.618947368421053e-05, "loss": 0.8227, "step": 2310 },
    { "epoch": 0.06, "grad_norm": 3.530073642730713, "learning_rate": 1.616842105263158e-05, "loss": 0.8517, "step": 2320 },
    { "epoch": 0.06, "grad_norm": 3.6851344108581543, "learning_rate": 1.6147368421052634e-05, "loss": 0.7684, "step": 2330 },
    { "epoch": 0.06, "grad_norm": 5.206923961639404, "learning_rate": 1.6126315789473687e-05, "loss": 0.8199, "step": 2340 },
    { "epoch": 0.06, "grad_norm": 5.220828056335449, "learning_rate": 1.6105263157894736e-05, "loss": 0.8871, "step": 2350 },
    { "epoch": 0.06, "grad_norm": 3.5062482357025146, "learning_rate": 1.6084210526315792e-05, "loss": 0.8281, "step": 2360 },
    { "epoch": 0.06, "grad_norm": 1.9830796718597412, "learning_rate": 1.606315789473684e-05, "loss": 0.8678, "step": 2370 },
    { "epoch": 0.06, "grad_norm": 3.3255491256713867, "learning_rate": 1.6042105263157897e-05, "loss": 0.8337, "step": 2380 },
    { "epoch": 0.06, "grad_norm": 5.259572505950928, "learning_rate": 1.6021052631578947e-05, "loss": 0.7954, "step": 2390 },
    { "epoch": 0.06, "grad_norm": 3.6201376914978027, "learning_rate": 1.6000000000000003e-05, "loss": 0.818, "step": 2400 },
    { "epoch": 0.06, "grad_norm": 3.3598544597625732, "learning_rate": 1.5978947368421055e-05, "loss": 0.7697, "step": 2410 },
    { "epoch": 0.06, "grad_norm": 6.34808349609375, "learning_rate": 1.5957894736842105e-05, "loss": 0.6347, "step": 2420 },
    { "epoch": 0.06, "grad_norm": 3.967682361602783, "learning_rate": 1.593684210526316e-05, "loss": 0.7178, "step": 2430 },
    { "epoch": 0.06, "grad_norm": 10.222978591918945, "learning_rate": 1.591578947368421e-05, "loss": 0.7642, "step": 2440 },
    { "epoch": 0.06, "grad_norm": 3.9339826107025146, "learning_rate": 1.5894736842105266e-05, "loss": 0.8197, "step": 2450 },
    { "epoch": 0.06, "grad_norm": 2.3337771892547607, "learning_rate": 1.5873684210526315e-05, "loss": 0.9375, "step": 2460 },
    { "epoch": 0.06, "grad_norm": 2.8479838371276855, "learning_rate": 1.585263157894737e-05, "loss": 0.9196, "step": 2470 },
    { "epoch": 0.06, "grad_norm": 9.294541358947754, "learning_rate": 1.5831578947368424e-05, "loss": 0.7144, "step": 2480 },
    { "epoch": 0.06, "grad_norm": 5.325323104858398, "learning_rate": 1.5810526315789473e-05, "loss": 0.7897, "step": 2490 },
    { "epoch": 0.06, "grad_norm": 4.377369403839111, "learning_rate": 1.578947368421053e-05, "loss": 0.9008, "step": 2500 },
    { "epoch": 0.06, "eval_loss": 0.8163847923278809, "eval_runtime": 67.7994, "eval_samples_per_second": 14.749, "eval_steps_per_second": 14.749, "step": 2500 }
  ],
  "logging_steps": 10,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 2500,
  "total_flos": 4.025531498496e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}