{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.195546950629235,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030977734753146177,
      "grad_norm": 99.0,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.9661,
      "step": 10
    },
    {
      "epoch": 0.061955469506292354,
      "grad_norm": 374.0,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.2124,
      "step": 20
    },
    {
      "epoch": 0.09293320425943853,
      "grad_norm": 1392640.0,
      "learning_rate": 3e-06,
      "loss": 2.2122,
      "step": 30
    },
    {
      "epoch": 0.12391093901258471,
      "grad_norm": 78.5,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.9339,
      "step": 40
    },
    {
      "epoch": 0.15488867376573087,
      "grad_norm": 13500416.0,
      "learning_rate": 5e-06,
      "loss": 1.8448,
      "step": 50
    },
    {
      "epoch": 0.18586640851887706,
      "grad_norm": 98.0,
      "learning_rate": 6e-06,
      "loss": 1.9577,
      "step": 60
    },
    {
      "epoch": 0.21684414327202323,
      "grad_norm": 11337728.0,
      "learning_rate": 7.000000000000001e-06,
      "loss": 2.089,
      "step": 70
    },
    {
      "epoch": 0.24782187802516942,
      "grad_norm": 20736.0,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.8571,
      "step": 80
    },
    {
      "epoch": 0.2787996127783156,
      "grad_norm": 8512.0,
      "learning_rate": 9e-06,
      "loss": 2.0639,
      "step": 90
    },
    {
      "epoch": 0.30977734753146174,
      "grad_norm": 1004.0,
      "learning_rate": 1e-05,
      "loss": 1.978,
      "step": 100
    },
    {
      "epoch": 0.34075508228460794,
      "grad_norm": 470.0,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.9845,
      "step": 110
    },
    {
      "epoch": 0.3717328170377541,
      "grad_norm": 268435456.0,
      "learning_rate": 1.2e-05,
      "loss": 1.9825,
      "step": 120
    },
    {
      "epoch": 0.4027105517909003,
      "grad_norm": 50593792.0,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 2.0309,
      "step": 130
    },
    {
      "epoch": 0.43368828654404645,
      "grad_norm": 942080.0,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 1.804,
      "step": 140
    },
    {
      "epoch": 0.46466602129719264,
      "grad_norm": 93.5,
      "learning_rate": 1.5e-05,
      "loss": 1.9188,
      "step": 150
    },
    {
      "epoch": 0.49564375605033884,
      "grad_norm": 138.0,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.7755,
      "step": 160
    },
    {
      "epoch": 0.526621490803485,
      "grad_norm": 2310144.0,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 1.9404,
      "step": 170
    },
    {
      "epoch": 0.5575992255566312,
      "grad_norm": 252928.0,
      "learning_rate": 1.8e-05,
      "loss": 1.9115,
      "step": 180
    },
    {
      "epoch": 0.5885769603097774,
      "grad_norm": 36096.0,
      "learning_rate": 1.9e-05,
      "loss": 2.0781,
      "step": 190
    },
    {
      "epoch": 0.6195546950629235,
      "grad_norm": 143.0,
      "learning_rate": 2e-05,
      "loss": 1.8338,
      "step": 200
    },
    {
      "epoch": 0.6505324298160697,
      "grad_norm": 222208.0,
      "learning_rate": 2.1e-05,
      "loss": 1.9065,
      "step": 210
    },
    {
      "epoch": 0.6815101645692159,
      "grad_norm": 11599872.0,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 1.8142,
      "step": 220
    },
    {
      "epoch": 0.712487899322362,
      "grad_norm": 3568.0,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 1.7811,
      "step": 230
    },
    {
      "epoch": 0.7434656340755083,
      "grad_norm": 81.5,
      "learning_rate": 2.4e-05,
      "loss": 1.7512,
      "step": 240
    },
    {
      "epoch": 0.7744433688286544,
      "grad_norm": 3696.0,
      "learning_rate": 2.5e-05,
      "loss": 1.8697,
      "step": 250
    },
    {
      "epoch": 0.8054211035818006,
      "grad_norm": 63744.0,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.7677,
      "step": 260
    },
    {
      "epoch": 0.8363988383349468,
      "grad_norm": 55.5,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 1.7307,
      "step": 270
    },
    {
      "epoch": 0.8673765730880929,
      "grad_norm": 202.0,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 1.9074,
      "step": 280
    },
    {
      "epoch": 0.8983543078412392,
      "grad_norm": 348.0,
      "learning_rate": 2.9e-05,
      "loss": 1.7037,
      "step": 290
    },
    {
      "epoch": 0.9293320425943853,
      "grad_norm": 342.0,
      "learning_rate": 3e-05,
      "loss": 1.5886,
      "step": 300
    },
    {
      "epoch": 0.9603097773475314,
      "grad_norm": 62.5,
      "learning_rate": 3.1e-05,
      "loss": 1.6942,
      "step": 310
    },
    {
      "epoch": 0.9912875121006777,
      "grad_norm": 91.5,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.5326,
      "step": 320
    },
    {
      "epoch": 1.0222652468538238,
      "grad_norm": 6560.0,
      "learning_rate": 3.3e-05,
      "loss": 1.6642,
      "step": 330
    },
    {
      "epoch": 1.05324298160697,
      "grad_norm": 51.5,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.5624,
      "step": 340
    },
    {
      "epoch": 1.084220716360116,
      "grad_norm": 1752.0,
      "learning_rate": 3.5e-05,
      "loss": 1.447,
      "step": 350
    },
    {
      "epoch": 1.1151984511132624,
      "grad_norm": 48.25,
      "learning_rate": 3.6e-05,
      "loss": 1.6133,
      "step": 360
    },
    {
      "epoch": 1.1461761858664086,
      "grad_norm": 52.25,
      "learning_rate": 3.7e-05,
      "loss": 1.4596,
      "step": 370
    },
    {
      "epoch": 1.1771539206195547,
      "grad_norm": 12032.0,
      "learning_rate": 3.8e-05,
      "loss": 1.5244,
      "step": 380
    },
    {
      "epoch": 1.2081316553727008,
      "grad_norm": 49.0,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 1.5972,
      "step": 390
    },
    {
      "epoch": 1.239109390125847,
      "grad_norm": 39.25,
      "learning_rate": 4e-05,
      "loss": 1.2712,
      "step": 400
    },
    {
      "epoch": 1.2700871248789931,
      "grad_norm": 1120.0,
      "learning_rate": 4.1e-05,
      "loss": 1.4318,
      "step": 410
    },
    {
      "epoch": 1.3010648596321395,
      "grad_norm": 28160.0,
      "learning_rate": 4.2e-05,
      "loss": 1.3211,
      "step": 420
    },
    {
      "epoch": 1.3320425943852856,
      "grad_norm": 12845056.0,
      "learning_rate": 4.3e-05,
      "loss": 1.4051,
      "step": 430
    },
    {
      "epoch": 1.3630203291384317,
      "grad_norm": 6979584.0,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.2505,
      "step": 440
    },
    {
      "epoch": 1.3939980638915779,
      "grad_norm": 27.75,
      "learning_rate": 4.5e-05,
      "loss": 1.1342,
      "step": 450
    },
    {
      "epoch": 1.424975798644724,
      "grad_norm": 73.0,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.2342,
      "step": 460
    },
    {
      "epoch": 1.4559535333978704,
      "grad_norm": 19.75,
      "learning_rate": 4.7e-05,
      "loss": 1.0688,
      "step": 470
    },
    {
      "epoch": 1.4869312681510165,
      "grad_norm": 604.0,
      "learning_rate": 4.8e-05,
      "loss": 1.0641,
      "step": 480
    },
    {
      "epoch": 1.5179090029041626,
      "grad_norm": 79.5,
      "learning_rate": 4.9e-05,
      "loss": 1.0869,
      "step": 490
    },
    {
      "epoch": 1.5488867376573088,
      "grad_norm": 2144.0,
      "learning_rate": 5e-05,
      "loss": 1.0356,
      "step": 500
    },
    {
      "epoch": 1.5488867376573088,
      "eval_loss": 0.21533620357513428,
      "eval_runtime": 145.9664,
      "eval_samples_per_second": 10.276,
      "eval_steps_per_second": 2.569,
      "step": 500
    },
    {
      "epoch": 1.579864472410455,
      "grad_norm": 125.0,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 1.0417,
      "step": 510
    },
    {
      "epoch": 1.6108422071636013,
      "grad_norm": 17.875,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 1.0518,
      "step": 520
    },
    {
      "epoch": 1.6418199419167472,
      "grad_norm": 68.0,
      "learning_rate": 5.300000000000001e-05,
      "loss": 1.1404,
      "step": 530
    },
    {
      "epoch": 1.6727976766698935,
      "grad_norm": 19.5,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 0.9938,
      "step": 540
    },
    {
      "epoch": 1.7037754114230397,
      "grad_norm": 290.0,
      "learning_rate": 5.500000000000001e-05,
      "loss": 0.9374,
      "step": 550
    },
    {
      "epoch": 1.7347531461761858,
      "grad_norm": 58.5,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.0777,
      "step": 560
    },
    {
      "epoch": 1.7657308809293322,
      "grad_norm": 422.0,
      "learning_rate": 5.6999999999999996e-05,
      "loss": 1.059,
      "step": 570
    },
    {
      "epoch": 1.796708615682478,
      "grad_norm": 28.625,
      "learning_rate": 5.8e-05,
      "loss": 0.965,
      "step": 580
    },
    {
      "epoch": 1.8276863504356244,
      "grad_norm": 43.75,
      "learning_rate": 5.9e-05,
      "loss": 0.9527,
      "step": 590
    },
    {
      "epoch": 1.8586640851887706,
      "grad_norm": 12.75,
      "learning_rate": 6e-05,
      "loss": 0.8296,
      "step": 600
    },
    {
      "epoch": 1.8896418199419167,
      "grad_norm": 16.0,
      "learning_rate": 6.1e-05,
      "loss": 0.933,
      "step": 610
    },
    {
      "epoch": 1.920619554695063,
      "grad_norm": 11.875,
      "learning_rate": 6.2e-05,
      "loss": 0.8117,
      "step": 620
    },
    {
      "epoch": 1.951597289448209,
      "grad_norm": 50.0,
      "learning_rate": 6.3e-05,
      "loss": 0.9475,
      "step": 630
    },
    {
      "epoch": 1.9825750242013553,
      "grad_norm": 143.0,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.8241,
      "step": 640
    },
    {
      "epoch": 2.0135527589545013,
      "grad_norm": 108.5,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.7861,
      "step": 650
    },
    {
      "epoch": 2.0445304937076476,
      "grad_norm": 13184.0,
      "learning_rate": 6.6e-05,
      "loss": 0.8384,
      "step": 660
    },
    {
      "epoch": 2.075508228460794,
      "grad_norm": 1736704.0,
      "learning_rate": 6.7e-05,
      "loss": 0.8896,
      "step": 670
    },
    {
      "epoch": 2.10648596321394,
      "grad_norm": 48.75,
      "learning_rate": 6.800000000000001e-05,
      "loss": 0.9377,
      "step": 680
    },
    {
      "epoch": 2.1374636979670862,
      "grad_norm": 2816.0,
      "learning_rate": 6.9e-05,
      "loss": 0.8322,
      "step": 690
    },
    {
      "epoch": 2.168441432720232,
      "grad_norm": 3.125,
      "learning_rate": 7e-05,
      "loss": 0.8397,
      "step": 700
    },
    {
      "epoch": 2.1994191674733785,
      "grad_norm": 8.6875,
      "learning_rate": 7.1e-05,
      "loss": 0.9265,
      "step": 710
    },
    {
      "epoch": 2.230396902226525,
      "grad_norm": 9.875,
      "learning_rate": 7.2e-05,
      "loss": 0.8141,
      "step": 720
    },
    {
      "epoch": 2.261374636979671,
      "grad_norm": 14.8125,
      "learning_rate": 7.3e-05,
      "loss": 0.7629,
      "step": 730
    },
    {
      "epoch": 2.292352371732817,
      "grad_norm": 6.4375,
      "learning_rate": 7.4e-05,
      "loss": 0.85,
      "step": 740
    },
    {
      "epoch": 2.323330106485963,
      "grad_norm": 3.796875,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.9116,
      "step": 750
    },
    {
      "epoch": 2.3543078412391094,
      "grad_norm": 544768.0,
      "learning_rate": 7.6e-05,
      "loss": 0.8437,
      "step": 760
    },
    {
      "epoch": 2.3852855759922553,
      "grad_norm": 117760.0,
      "learning_rate": 7.7e-05,
      "loss": 0.9072,
      "step": 770
    },
    {
      "epoch": 2.4162633107454017,
      "grad_norm": 22.75,
      "learning_rate": 7.800000000000001e-05,
      "loss": 1.0168,
      "step": 780
    },
    {
      "epoch": 2.447241045498548,
      "grad_norm": 2211840.0,
      "learning_rate": 7.900000000000001e-05,
      "loss": 1.1433,
      "step": 790
    },
    {
      "epoch": 2.478218780251694,
      "grad_norm": 41.5,
      "learning_rate": 8e-05,
      "loss": 0.7485,
      "step": 800
    },
    {
      "epoch": 2.5091965150048403,
      "grad_norm": 1810432.0,
      "learning_rate": 8.1e-05,
      "loss": 0.9516,
      "step": 810
    },
    {
      "epoch": 2.5401742497579862,
      "grad_norm": 1081344.0,
      "learning_rate": 8.2e-05,
      "loss": 1.0742,
      "step": 820
    },
    {
      "epoch": 2.5711519845111326,
      "grad_norm": 337641472.0,
      "learning_rate": 8.3e-05,
      "loss": 1.023,
      "step": 830
    },
    {
      "epoch": 2.602129719264279,
      "grad_norm": 14548992.0,
      "learning_rate": 8.4e-05,
      "loss": 1.185,
      "step": 840
    },
    {
      "epoch": 2.633107454017425,
      "grad_norm": 4.71875,
      "learning_rate": 8.5e-05,
      "loss": 1.3584,
      "step": 850
    },
    {
      "epoch": 2.664085188770571,
      "grad_norm": 618496.0,
      "learning_rate": 8.6e-05,
      "loss": 0.9947,
      "step": 860
    },
    {
      "epoch": 2.695062923523717,
      "grad_norm": 22151168.0,
      "learning_rate": 8.7e-05,
      "loss": 1.0296,
      "step": 870
    },
    {
      "epoch": 2.7260406582768635,
      "grad_norm": 77824.0,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.8889,
      "step": 880
    },
    {
      "epoch": 2.75701839303001,
      "grad_norm": 2.640625,
      "learning_rate": 8.900000000000001e-05,
      "loss": 0.733,
      "step": 890
    },
    {
      "epoch": 2.7879961277831558,
      "grad_norm": 18.625,
      "learning_rate": 9e-05,
      "loss": 0.7786,
      "step": 900
    },
    {
      "epoch": 2.818973862536302,
      "grad_norm": 9.0625,
      "learning_rate": 9.1e-05,
      "loss": 0.7221,
      "step": 910
    },
    {
      "epoch": 2.849951597289448,
      "grad_norm": 10.4375,
      "learning_rate": 9.200000000000001e-05,
      "loss": 0.6316,
      "step": 920
    },
    {
      "epoch": 2.8809293320425944,
      "grad_norm": 4.84375,
      "learning_rate": 9.300000000000001e-05,
      "loss": 0.7015,
      "step": 930
    },
    {
      "epoch": 2.9119070667957407,
      "grad_norm": 4.1875,
      "learning_rate": 9.4e-05,
      "loss": 0.7161,
      "step": 940
    },
    {
      "epoch": 2.9428848015488867,
      "grad_norm": 2.8125,
      "learning_rate": 9.5e-05,
      "loss": 0.7325,
      "step": 950
    },
    {
      "epoch": 2.973862536302033,
      "grad_norm": 5.90625,
      "learning_rate": 9.6e-05,
      "loss": 0.6447,
      "step": 960
    },
    {
      "epoch": 3.004840271055179,
      "grad_norm": 1.6015625,
      "learning_rate": 9.7e-05,
      "loss": 0.7079,
      "step": 970
    },
    {
      "epoch": 3.0358180058083253,
      "grad_norm": 3.8125,
      "learning_rate": 9.8e-05,
      "loss": 0.6075,
      "step": 980
    },
    {
      "epoch": 3.0667957405614716,
      "grad_norm": 1.9140625,
      "learning_rate": 9.900000000000001e-05,
      "loss": 0.697,
      "step": 990
    },
    {
      "epoch": 3.0977734753146176,
      "grad_norm": 1.140625,
      "learning_rate": 0.0001,
      "loss": 0.7175,
      "step": 1000
    },
    {
      "epoch": 3.0977734753146176,
      "eval_loss": 0.1426076889038086,
      "eval_runtime": 146.0018,
      "eval_samples_per_second": 10.274,
      "eval_steps_per_second": 2.568,
      "step": 1000
    },
    {
      "epoch": 3.128751210067764,
      "grad_norm": 1.6328125,
      "learning_rate": 9.988452655889145e-05,
      "loss": 0.5774,
      "step": 1010
    },
    {
      "epoch": 3.15972894482091,
      "grad_norm": 24.125,
      "learning_rate": 9.976905311778292e-05,
      "loss": 0.5575,
      "step": 1020
    },
    {
      "epoch": 3.190706679574056,
      "grad_norm": 1.8046875,
      "learning_rate": 9.965357967667437e-05,
      "loss": 0.5609,
      "step": 1030
    },
    {
      "epoch": 3.2216844143272025,
      "grad_norm": 0.6015625,
      "learning_rate": 9.953810623556582e-05,
      "loss": 0.5562,
      "step": 1040
    },
    {
      "epoch": 3.2526621490803485,
      "grad_norm": 37748736.0,
      "learning_rate": 9.942263279445728e-05,
      "loss": 0.6574,
      "step": 1050
    },
    {
      "epoch": 3.283639883833495,
      "grad_norm": 0.66015625,
      "learning_rate": 9.930715935334873e-05,
      "loss": 0.5348,
      "step": 1060
    },
    {
      "epoch": 3.3146176185866407,
      "grad_norm": 2.0625,
      "learning_rate": 9.919168591224018e-05,
      "loss": 0.5574,
      "step": 1070
    },
    {
      "epoch": 3.345595353339787,
      "grad_norm": 0.9453125,
      "learning_rate": 9.907621247113164e-05,
      "loss": 0.5546,
      "step": 1080
    },
    {
      "epoch": 3.3765730880929334,
      "grad_norm": 0.49609375,
      "learning_rate": 9.896073903002309e-05,
      "loss": 0.4716,
      "step": 1090
    },
    {
      "epoch": 3.4075508228460794,
      "grad_norm": 1.1015625,
      "learning_rate": 9.884526558891456e-05,
      "loss": 0.4992,
      "step": 1100
    },
    {
      "epoch": 3.4385285575992257,
      "grad_norm": 1.9609375,
      "learning_rate": 9.8729792147806e-05,
      "loss": 0.4814,
      "step": 1110
    },
    {
      "epoch": 3.4695062923523716,
      "grad_norm": 0.81640625,
      "learning_rate": 9.861431870669747e-05,
      "loss": 0.5451,
      "step": 1120
    },
    {
      "epoch": 3.500484027105518,
      "grad_norm": 1.3046875,
      "learning_rate": 9.849884526558892e-05,
      "loss": 0.5436,
      "step": 1130
    },
    {
      "epoch": 3.5314617618586643,
      "grad_norm": 0.91015625,
      "learning_rate": 9.838337182448038e-05,
      "loss": 0.5237,
      "step": 1140
    },
    {
      "epoch": 3.5624394966118103,
      "grad_norm": 1.5078125,
      "learning_rate": 9.826789838337183e-05,
      "loss": 0.4595,
      "step": 1150
    },
    {
      "epoch": 3.593417231364956,
      "grad_norm": 3.015625,
      "learning_rate": 9.81524249422633e-05,
      "loss": 0.5892,
      "step": 1160
    },
    {
      "epoch": 3.6243949661181025,
      "grad_norm": 1.703125,
      "learning_rate": 9.803695150115474e-05,
      "loss": 0.4804,
      "step": 1170
    },
    {
      "epoch": 3.655372700871249,
      "grad_norm": 0.484375,
      "learning_rate": 9.79214780600462e-05,
      "loss": 0.471,
      "step": 1180
    },
    {
      "epoch": 3.6863504356243952,
      "grad_norm": 0.6171875,
      "learning_rate": 9.780600461893766e-05,
      "loss": 0.5353,
      "step": 1190
    },
    {
      "epoch": 3.717328170377541,
      "grad_norm": 1.6015625,
      "learning_rate": 9.76905311778291e-05,
      "loss": 0.5112,
      "step": 1200
    },
    {
      "epoch": 3.748305905130687,
      "grad_norm": 28.25,
      "learning_rate": 9.757505773672056e-05,
      "loss": 0.5225,
      "step": 1210
    },
    {
      "epoch": 3.7792836398838334,
      "grad_norm": 0.98828125,
      "learning_rate": 9.745958429561202e-05,
      "loss": 0.4663,
      "step": 1220
    },
    {
      "epoch": 3.81026137463698,
      "grad_norm": 0.6640625,
      "learning_rate": 9.734411085450347e-05,
      "loss": 0.5263,
      "step": 1230
    },
    {
      "epoch": 3.8412391093901257,
      "grad_norm": 1.0703125,
      "learning_rate": 9.722863741339492e-05,
      "loss": 0.5353,
      "step": 1240
    },
    {
      "epoch": 3.872216844143272,
      "grad_norm": 0.28515625,
      "learning_rate": 9.711316397228638e-05,
      "loss": 0.4615,
      "step": 1250
    },
    {
      "epoch": 3.903194578896418,
      "grad_norm": 0.515625,
      "learning_rate": 9.699769053117783e-05,
      "loss": 0.4887,
      "step": 1260
    },
    {
      "epoch": 3.9341723136495643,
      "grad_norm": 0.70703125,
      "learning_rate": 9.68822170900693e-05,
      "loss": 0.4658,
      "step": 1270
    },
    {
      "epoch": 3.9651500484027107,
      "grad_norm": 0.455078125,
      "learning_rate": 9.676674364896074e-05,
      "loss": 0.4914,
      "step": 1280
    },
    {
      "epoch": 3.9961277831558566,
      "grad_norm": 0.67578125,
      "learning_rate": 9.66512702078522e-05,
      "loss": 0.5355,
      "step": 1290
    },
    {
      "epoch": 4.0271055179090025,
      "grad_norm": 0.5390625,
      "learning_rate": 9.653579676674366e-05,
      "loss": 0.5068,
      "step": 1300
    },
    {
      "epoch": 4.058083252662149,
      "grad_norm": 0.71875,
      "learning_rate": 9.64203233256351e-05,
      "loss": 0.4409,
      "step": 1310
    },
    {
      "epoch": 4.089060987415295,
      "grad_norm": 0.7421875,
      "learning_rate": 9.630484988452656e-05,
      "loss": 0.4788,
      "step": 1320
    },
    {
      "epoch": 4.120038722168442,
      "grad_norm": 0.443359375,
      "learning_rate": 9.618937644341802e-05,
      "loss": 0.4901,
      "step": 1330
    },
    {
      "epoch": 4.151016456921588,
      "grad_norm": 0.33203125,
      "learning_rate": 9.607390300230947e-05,
      "loss": 0.4119,
      "step": 1340
    },
    {
      "epoch": 4.181994191674733,
      "grad_norm": 1.421875,
      "learning_rate": 9.595842956120092e-05,
      "loss": 0.4597,
      "step": 1350
    },
    {
      "epoch": 4.21297192642788,
      "grad_norm": 0.953125,
      "learning_rate": 9.584295612009238e-05,
      "loss": 0.5134,
      "step": 1360
    },
    {
      "epoch": 4.243949661181026,
      "grad_norm": 0.73046875,
      "learning_rate": 9.572748267898383e-05,
      "loss": 0.4222,
      "step": 1370
    },
    {
      "epoch": 4.2749273959341725,
      "grad_norm": 0.412109375,
      "learning_rate": 9.56120092378753e-05,
      "loss": 0.4851,
      "step": 1380
    },
    {
      "epoch": 4.305905130687319,
      "grad_norm": 0.7578125,
      "learning_rate": 9.549653579676674e-05,
      "loss": 0.505,
      "step": 1390
    },
    {
      "epoch": 4.336882865440464,
      "grad_norm": 0.453125,
      "learning_rate": 9.538106235565821e-05,
      "loss": 0.4439,
      "step": 1400
    },
    {
      "epoch": 4.367860600193611,
      "grad_norm": 0.609375,
      "learning_rate": 9.526558891454966e-05,
      "loss": 0.4311,
      "step": 1410
    },
    {
      "epoch": 4.398838334946757,
      "grad_norm": 0.302734375,
      "learning_rate": 9.515011547344112e-05,
      "loss": 0.4369,
      "step": 1420
    },
    {
      "epoch": 4.429816069699903,
      "grad_norm": 0.30859375,
      "learning_rate": 9.503464203233257e-05,
      "loss": 0.4863,
      "step": 1430
    },
    {
      "epoch": 4.46079380445305,
      "grad_norm": 0.43359375,
      "learning_rate": 9.491916859122403e-05,
      "loss": 0.4471,
      "step": 1440
    },
    {
      "epoch": 4.491771539206195,
      "grad_norm": 0.345703125,
      "learning_rate": 9.480369515011548e-05,
      "loss": 0.4816,
      "step": 1450
    },
    {
      "epoch": 4.522749273959342,
      "grad_norm": 0.28125,
      "learning_rate": 9.468822170900693e-05,
      "loss": 0.4363,
      "step": 1460
    },
    {
      "epoch": 4.553727008712488,
      "grad_norm": 0.5390625,
      "learning_rate": 9.45727482678984e-05,
      "loss": 0.414,
      "step": 1470
    },
    {
      "epoch": 4.584704743465634,
      "grad_norm": 0.29296875,
      "learning_rate": 9.445727482678985e-05,
      "loss": 0.4318,
      "step": 1480
    },
    {
      "epoch": 4.615682478218781,
      "grad_norm": 0.298828125,
      "learning_rate": 9.43418013856813e-05,
      "loss": 0.456,
      "step": 1490
    },
    {
      "epoch": 4.646660212971926,
      "grad_norm": 0.4375,
      "learning_rate": 9.422632794457276e-05,
      "loss": 0.4453,
      "step": 1500
    },
    {
      "epoch": 4.646660212971926,
      "eval_loss": 0.10397772490978241,
      "eval_runtime": 145.9588,
      "eval_samples_per_second": 10.277,
      "eval_steps_per_second": 2.569,
      "step": 1500
    },
    {
      "epoch": 4.6776379477250725,
      "grad_norm": 0.365234375,
      "learning_rate": 9.411085450346421e-05,
      "loss": 0.4319,
      "step": 1510
    },
    {
      "epoch": 4.708615682478219,
      "grad_norm": 0.578125,
      "learning_rate": 9.399538106235566e-05,
      "loss": 0.4488,
      "step": 1520
    },
    {
      "epoch": 4.739593417231365,
      "grad_norm": 0.2373046875,
      "learning_rate": 9.387990762124712e-05,
      "loss": 0.4171,
      "step": 1530
    },
    {
      "epoch": 4.770571151984511,
      "grad_norm": 0.4296875,
      "learning_rate": 9.376443418013857e-05,
      "loss": 0.4042,
      "step": 1540
    },
    {
      "epoch": 4.801548886737657,
      "grad_norm": 0.578125,
      "learning_rate": 9.364896073903002e-05,
      "loss": 0.4238,
      "step": 1550
    },
    {
      "epoch": 4.832526621490803,
      "grad_norm": 0.515625,
      "learning_rate": 9.353348729792148e-05,
      "loss": 0.4288,
      "step": 1560
    },
    {
      "epoch": 4.86350435624395,
      "grad_norm": 1.7265625,
      "learning_rate": 9.341801385681293e-05,
      "loss": 0.46,
      "step": 1570
    },
    {
      "epoch": 4.894482090997096,
      "grad_norm": 0.287109375,
      "learning_rate": 9.330254041570438e-05,
      "loss": 0.4821,
      "step": 1580
    },
    {
      "epoch": 4.9254598257502415,
      "grad_norm": 0.228515625,
      "learning_rate": 9.318706697459585e-05,
      "loss": 0.3854,
      "step": 1590
    },
    {
      "epoch": 4.956437560503388,
      "grad_norm": 0.41796875,
      "learning_rate": 9.30715935334873e-05,
      "loss": 0.3778,
      "step": 1600
    },
    {
      "epoch": 4.987415295256534,
      "grad_norm": 0.2333984375,
      "learning_rate": 9.295612009237876e-05,
      "loss": 0.425,
      "step": 1610
    },
    {
      "epoch": 5.018393030009681,
      "grad_norm": 0.41796875,
      "learning_rate": 9.284064665127021e-05,
      "loss": 0.3792,
      "step": 1620
    },
    {
      "epoch": 5.049370764762827,
      "grad_norm": 0.6171875,
      "learning_rate": 9.272517321016166e-05,
      "loss": 0.4225,
      "step": 1630
    },
    {
      "epoch": 5.0803484995159724,
      "grad_norm": 0.28125,
      "learning_rate": 9.260969976905312e-05,
      "loss": 0.3942,
      "step": 1640
    },
    {
      "epoch": 5.111326234269119,
      "grad_norm": 0.412109375,
      "learning_rate": 9.249422632794457e-05,
      "loss": 0.3928,
      "step": 1650
    },
    {
      "epoch": 5.142303969022265,
      "grad_norm": 0.185546875,
      "learning_rate": 9.237875288683603e-05,
      "loss": 0.4357,
      "step": 1660
    },
    {
      "epoch": 5.1732817037754115,
      "grad_norm": 0.47265625,
      "learning_rate": 9.226327944572748e-05,
      "loss": 0.4773,
      "step": 1670
    },
    {
      "epoch": 5.204259438528558,
      "grad_norm": 0.470703125,
      "learning_rate": 9.214780600461895e-05,
      "loss": 0.4383,
      "step": 1680
    },
    {
      "epoch": 5.235237173281703,
      "grad_norm": 0.31640625,
      "learning_rate": 9.20323325635104e-05,
      "loss": 0.4585,
      "step": 1690
    },
    {
      "epoch": 5.26621490803485,
      "grad_norm": 2.078125,
      "learning_rate": 9.191685912240186e-05,
      "loss": 0.4713,
      "step": 1700
    },
    {
      "epoch": 5.297192642787996,
      "grad_norm": 0.275390625,
      "learning_rate": 9.180138568129331e-05,
      "loss": 0.4222,
      "step": 1710
    },
    {
      "epoch": 5.328170377541142,
      "grad_norm": 0.30859375,
      "learning_rate": 9.168591224018476e-05,
      "loss": 0.3981,
      "step": 1720
    },
    {
      "epoch": 5.359148112294289,
      "grad_norm": 0.244140625,
      "learning_rate": 9.157043879907622e-05,
      "loss": 0.4118,
      "step": 1730
    },
    {
      "epoch": 5.390125847047434,
      "grad_norm": 0.357421875,
      "learning_rate": 9.145496535796767e-05,
      "loss": 0.4211,
      "step": 1740
    },
    {
      "epoch": 5.421103581800581,
      "grad_norm": 0.3671875,
      "learning_rate": 9.133949191685914e-05,
      "loss": 0.3755,
      "step": 1750
    },
    {
      "epoch": 5.452081316553727,
      "grad_norm": 0.5390625,
      "learning_rate": 9.122401847575059e-05,
      "loss": 0.4243,
      "step": 1760
    },
    {
      "epoch": 5.483059051306873,
      "grad_norm": 0.2109375,
      "learning_rate": 9.110854503464203e-05,
      "loss": 0.4099,
      "step": 1770
    },
    {
      "epoch": 5.51403678606002,
      "grad_norm": 0.408203125,
      "learning_rate": 9.09930715935335e-05,
      "loss": 0.4153,
      "step": 1780
    },
    {
      "epoch": 5.545014520813165,
      "grad_norm": 0.345703125,
      "learning_rate": 9.087759815242495e-05,
      "loss": 0.3927,
      "step": 1790
    },
    {
      "epoch": 5.5759922555663115,
      "grad_norm": 0.578125,
      "learning_rate": 9.07621247113164e-05,
      "loss": 0.3794,
      "step": 1800
    },
    {
      "epoch": 5.606969990319458,
      "grad_norm": 0.43359375,
      "learning_rate": 9.064665127020786e-05,
      "loss": 0.4094,
      "step": 1810
    },
    {
      "epoch": 5.637947725072604,
      "grad_norm": 0.259765625,
      "learning_rate": 9.053117782909931e-05,
      "loss": 0.4592,
      "step": 1820
    },
    {
      "epoch": 5.668925459825751,
      "grad_norm": 0.458984375,
      "learning_rate": 9.041570438799076e-05,
      "loss": 0.4026,
      "step": 1830
    },
    {
      "epoch": 5.699903194578896,
      "grad_norm": 0.376953125,
      "learning_rate": 9.030023094688222e-05,
      "loss": 0.4556,
      "step": 1840
    },
    {
      "epoch": 5.730880929332042,
      "grad_norm": 0.30859375,
      "learning_rate": 9.018475750577367e-05,
      "loss": 0.4606,
      "step": 1850
    },
    {
      "epoch": 5.761858664085189,
      "grad_norm": 0.458984375,
      "learning_rate": 9.006928406466512e-05,
      "loss": 0.4363,
      "step": 1860
    },
    {
      "epoch": 5.792836398838335,
      "grad_norm": 0.228515625,
      "learning_rate": 8.995381062355659e-05,
      "loss": 0.3631,
      "step": 1870
    },
    {
      "epoch": 5.8238141335914815,
      "grad_norm": 0.58203125,
      "learning_rate": 8.983833718244804e-05,
      "loss": 0.4075,
      "step": 1880
    },
    {
      "epoch": 5.854791868344627,
      "grad_norm": 0.59375,
      "learning_rate": 8.972286374133948e-05,
      "loss": 0.4456,
      "step": 1890
    },
    {
      "epoch": 5.885769603097773,
      "grad_norm": 0.3515625,
      "learning_rate": 8.960739030023095e-05,
      "loss": 0.4451,
      "step": 1900
    },
    {
      "epoch": 5.91674733785092,
      "grad_norm": 0.318359375,
      "learning_rate": 8.94919168591224e-05,
      "loss": 0.4127,
      "step": 1910
    },
    {
      "epoch": 5.947725072604066,
      "grad_norm": 0.369140625,
      "learning_rate": 8.937644341801386e-05,
      "loss": 0.4007,
      "step": 1920
    },
    {
      "epoch": 5.978702807357212,
      "grad_norm": 0.357421875,
      "learning_rate": 8.926096997690532e-05,
      "loss": 0.3923,
      "step": 1930
    },
    {
      "epoch": 6.009680542110358,
      "grad_norm": 0.228515625,
      "learning_rate": 8.914549653579677e-05,
      "loss": 0.3395,
      "step": 1940
    },
    {
      "epoch": 6.040658276863504,
      "grad_norm": 0.216796875,
      "learning_rate": 8.903002309468824e-05,
      "loss": 0.376,
      "step": 1950
    },
    {
      "epoch": 6.071636011616651,
      "grad_norm": 0.3046875,
      "learning_rate": 8.891454965357969e-05,
      "loss": 0.3858,
      "step": 1960
    },
    {
      "epoch": 6.102613746369797,
      "grad_norm": 0.30859375,
      "learning_rate": 8.879907621247114e-05,
      "loss": 0.3368,
      "step": 1970
    },
    {
      "epoch": 6.133591481122943,
      "grad_norm": 0.41015625,
      "learning_rate": 8.86836027713626e-05,
      "loss": 0.4098,
      "step": 1980
    },
    {
      "epoch": 6.164569215876089,
      "grad_norm": 0.322265625,
      "learning_rate": 8.856812933025405e-05,
      "loss": 0.3701,
      "step": 1990
    },
    {
      "epoch": 6.195546950629235,
      "grad_norm": 0.3125,
      "learning_rate": 8.84526558891455e-05,
      "loss": 0.3528,
      "step": 2000
    },
    {
      "epoch": 6.195546950629235,
      "eval_loss": 0.09939394146203995,
      "eval_runtime": 145.9427,
      "eval_samples_per_second": 10.278,
      "eval_steps_per_second": 2.57,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 9660,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 30,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.189290237341991e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}