|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0977734753146176, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.030977734753146177, |
|
"grad_norm": 99.0, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.9661, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.061955469506292354, |
|
"grad_norm": 374.0, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.2124, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09293320425943853, |
|
"grad_norm": 1392640.0, |
|
"learning_rate": 3e-06, |
|
"loss": 2.2122, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12391093901258471, |
|
"grad_norm": 78.5, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.9339, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15488867376573087, |
|
"grad_norm": 13500416.0, |
|
"learning_rate": 5e-06, |
|
"loss": 1.8448, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18586640851887706, |
|
"grad_norm": 98.0, |
|
"learning_rate": 6e-06, |
|
"loss": 1.9577, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21684414327202323, |
|
"grad_norm": 11337728.0, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 2.089, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24782187802516942, |
|
"grad_norm": 20736.0, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.8571, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2787996127783156, |
|
"grad_norm": 8512.0, |
|
"learning_rate": 9e-06, |
|
"loss": 2.0639, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.30977734753146174, |
|
"grad_norm": 1004.0, |
|
"learning_rate": 1e-05, |
|
"loss": 1.978, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.34075508228460794, |
|
"grad_norm": 470.0, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 1.9845, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3717328170377541, |
|
"grad_norm": 268435456.0, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.9825, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4027105517909003, |
|
"grad_norm": 50593792.0, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 2.0309, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.43368828654404645, |
|
"grad_norm": 942080.0, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 1.804, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.46466602129719264, |
|
"grad_norm": 93.5, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.9188, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.49564375605033884, |
|
"grad_norm": 138.0, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.7755, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.526621490803485, |
|
"grad_norm": 2310144.0, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 1.9404, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5575992255566312, |
|
"grad_norm": 252928.0, |
|
"learning_rate": 1.8e-05, |
|
"loss": 1.9115, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5885769603097774, |
|
"grad_norm": 36096.0, |
|
"learning_rate": 1.9e-05, |
|
"loss": 2.0781, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6195546950629235, |
|
"grad_norm": 143.0, |
|
"learning_rate": 2e-05, |
|
"loss": 1.8338, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6505324298160697, |
|
"grad_norm": 222208.0, |
|
"learning_rate": 2.1e-05, |
|
"loss": 1.9065, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6815101645692159, |
|
"grad_norm": 11599872.0, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 1.8142, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.712487899322362, |
|
"grad_norm": 3568.0, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 1.7811, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7434656340755083, |
|
"grad_norm": 81.5, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.7512, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7744433688286544, |
|
"grad_norm": 3696.0, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.8697, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8054211035818006, |
|
"grad_norm": 63744.0, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.7677, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8363988383349468, |
|
"grad_norm": 55.5, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 1.7307, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8673765730880929, |
|
"grad_norm": 202.0, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.9074, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8983543078412392, |
|
"grad_norm": 348.0, |
|
"learning_rate": 2.9e-05, |
|
"loss": 1.7037, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9293320425943853, |
|
"grad_norm": 342.0, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5886, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9603097773475314, |
|
"grad_norm": 62.5, |
|
"learning_rate": 3.1e-05, |
|
"loss": 1.6942, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9912875121006777, |
|
"grad_norm": 91.5, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.5326, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0222652468538238, |
|
"grad_norm": 6560.0, |
|
"learning_rate": 3.3e-05, |
|
"loss": 1.6642, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.05324298160697, |
|
"grad_norm": 51.5, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.5624, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.084220716360116, |
|
"grad_norm": 1752.0, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.447, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1151984511132624, |
|
"grad_norm": 48.25, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.6133, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1461761858664086, |
|
"grad_norm": 52.25, |
|
"learning_rate": 3.7e-05, |
|
"loss": 1.4596, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1771539206195547, |
|
"grad_norm": 12032.0, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.5244, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2081316553727008, |
|
"grad_norm": 49.0, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 1.5972, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.239109390125847, |
|
"grad_norm": 39.25, |
|
"learning_rate": 4e-05, |
|
"loss": 1.2712, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2700871248789931, |
|
"grad_norm": 1120.0, |
|
"learning_rate": 4.1e-05, |
|
"loss": 1.4318, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3010648596321395, |
|
"grad_norm": 28160.0, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.3211, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3320425943852856, |
|
"grad_norm": 12845056.0, |
|
"learning_rate": 4.3e-05, |
|
"loss": 1.4051, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.3630203291384317, |
|
"grad_norm": 6979584.0, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.2505, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3939980638915779, |
|
"grad_norm": 27.75, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.1342, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.424975798644724, |
|
"grad_norm": 73.0, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.2342, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4559535333978704, |
|
"grad_norm": 19.75, |
|
"learning_rate": 4.7e-05, |
|
"loss": 1.0688, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4869312681510165, |
|
"grad_norm": 604.0, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.0641, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.5179090029041626, |
|
"grad_norm": 79.5, |
|
"learning_rate": 4.9e-05, |
|
"loss": 1.0869, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5488867376573088, |
|
"grad_norm": 2144.0, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0356, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5488867376573088, |
|
"eval_loss": 0.21533620357513428, |
|
"eval_runtime": 145.9664, |
|
"eval_samples_per_second": 10.276, |
|
"eval_steps_per_second": 2.569, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.579864472410455, |
|
"grad_norm": 125.0, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 1.0417, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.6108422071636013, |
|
"grad_norm": 17.875, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.0518, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6418199419167472, |
|
"grad_norm": 68.0, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 1.1404, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.6727976766698935, |
|
"grad_norm": 19.5, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.9938, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.7037754114230397, |
|
"grad_norm": 290.0, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.9374, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.7347531461761858, |
|
"grad_norm": 58.5, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.0777, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7657308809293322, |
|
"grad_norm": 422.0, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 1.059, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.796708615682478, |
|
"grad_norm": 28.625, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.965, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.8276863504356244, |
|
"grad_norm": 43.75, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.9527, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8586640851887706, |
|
"grad_norm": 12.75, |
|
"learning_rate": 6e-05, |
|
"loss": 0.8296, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.8896418199419167, |
|
"grad_norm": 16.0, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.933, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.920619554695063, |
|
"grad_norm": 11.875, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.8117, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.951597289448209, |
|
"grad_norm": 50.0, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.9475, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9825750242013553, |
|
"grad_norm": 143.0, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.8241, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.0135527589545013, |
|
"grad_norm": 108.5, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.7861, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0445304937076476, |
|
"grad_norm": 13184.0, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.8384, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.075508228460794, |
|
"grad_norm": 1736704.0, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.8896, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.10648596321394, |
|
"grad_norm": 48.75, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.9377, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.1374636979670862, |
|
"grad_norm": 2816.0, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.8322, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.168441432720232, |
|
"grad_norm": 3.125, |
|
"learning_rate": 7e-05, |
|
"loss": 0.8397, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.1994191674733785, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.9265, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.230396902226525, |
|
"grad_norm": 9.875, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.8141, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.261374636979671, |
|
"grad_norm": 14.8125, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.7629, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.292352371732817, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.85, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.323330106485963, |
|
"grad_norm": 3.796875, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.9116, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.3543078412391094, |
|
"grad_norm": 544768.0, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.8437, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.3852855759922553, |
|
"grad_norm": 117760.0, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.9072, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.4162633107454017, |
|
"grad_norm": 22.75, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.0168, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.447241045498548, |
|
"grad_norm": 2211840.0, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 1.1433, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.478218780251694, |
|
"grad_norm": 41.5, |
|
"learning_rate": 8e-05, |
|
"loss": 0.7485, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.5091965150048403, |
|
"grad_norm": 1810432.0, |
|
"learning_rate": 8.1e-05, |
|
"loss": 0.9516, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.5401742497579862, |
|
"grad_norm": 1081344.0, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.0742, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.5711519845111326, |
|
"grad_norm": 337641472.0, |
|
"learning_rate": 8.3e-05, |
|
"loss": 1.023, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.602129719264279, |
|
"grad_norm": 14548992.0, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.185, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.633107454017425, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.3584, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.664085188770571, |
|
"grad_norm": 618496.0, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.9947, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.695062923523717, |
|
"grad_norm": 22151168.0, |
|
"learning_rate": 8.7e-05, |
|
"loss": 1.0296, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.7260406582768635, |
|
"grad_norm": 77824.0, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.8889, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.75701839303001, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.733, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.7879961277831558, |
|
"grad_norm": 18.625, |
|
"learning_rate": 9e-05, |
|
"loss": 0.7786, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.818973862536302, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 9.1e-05, |
|
"loss": 0.7221, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.849951597289448, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.6316, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.8809293320425944, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.7015, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.9119070667957407, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.7161, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.9428848015488867, |
|
"grad_norm": 2.8125, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.7325, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.973862536302033, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.6447, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.004840271055179, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 9.7e-05, |
|
"loss": 0.7079, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.0358180058083253, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.6075, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.0667957405614716, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 0.697, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.0977734753146176, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7175, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.0977734753146176, |
|
"eval_loss": 0.1426076889038086, |
|
"eval_runtime": 146.0018, |
|
"eval_samples_per_second": 10.274, |
|
"eval_steps_per_second": 2.568, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 9660, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0946451186709955e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|