{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.113590263691684,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 16.835479736328125,
      "learning_rate": 0.0002,
      "loss": 3.766,
      "step": 10
    },
    {
      "epoch": 0.16,
      "grad_norm": 15.441985130310059,
      "learning_rate": 0.0002,
      "loss": 1.4106,
      "step": 20
    },
    {
      "epoch": 0.24,
      "grad_norm": 13.51206111907959,
      "learning_rate": 0.0002,
      "loss": 1.1355,
      "step": 30
    },
    {
      "epoch": 0.32,
      "grad_norm": 14.920987129211426,
      "learning_rate": 0.0002,
      "loss": 0.9941,
      "step": 40
    },
    {
      "epoch": 0.41,
      "grad_norm": 8.289106369018555,
      "learning_rate": 0.0002,
      "loss": 0.8887,
      "step": 50
    },
    {
      "epoch": 0.49,
      "grad_norm": 8.003510475158691,
      "learning_rate": 0.0002,
      "loss": 0.8275,
      "step": 60
    },
    {
      "epoch": 0.57,
      "grad_norm": 7.877600193023682,
      "learning_rate": 0.0002,
      "loss": 0.798,
      "step": 70
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.8833744525909424,
      "learning_rate": 0.0002,
      "loss": 0.8464,
      "step": 80
    },
    {
      "epoch": 0.73,
      "grad_norm": 8.206995964050293,
      "learning_rate": 0.0002,
      "loss": 0.7849,
      "step": 90
    },
    {
      "epoch": 0.81,
      "grad_norm": 7.678927421569824,
      "learning_rate": 0.0002,
      "loss": 0.7184,
      "step": 100
    },
    {
      "epoch": 0.89,
      "grad_norm": 7.089763164520264,
      "learning_rate": 0.0002,
      "loss": 0.7099,
      "step": 110
    },
    {
      "epoch": 0.97,
      "grad_norm": 5.643054485321045,
      "learning_rate": 0.0002,
      "loss": 0.6989,
      "step": 120
    },
    {
      "epoch": 1.05,
      "grad_norm": 7.985189914703369,
      "learning_rate": 0.0002,
      "loss": 0.6394,
      "step": 130
    },
    {
      "epoch": 1.14,
      "grad_norm": 7.676026344299316,
      "learning_rate": 0.0002,
      "loss": 0.5882,
      "step": 140
    },
    {
      "epoch": 1.22,
      "grad_norm": 8.714900016784668,
      "learning_rate": 0.0002,
      "loss": 0.5707,
      "step": 150
    },
    {
      "epoch": 1.3,
      "grad_norm": 5.221066474914551,
      "learning_rate": 0.0002,
      "loss": 0.5345,
      "step": 160
    },
    {
      "epoch": 1.38,
      "grad_norm": 9.008663177490234,
      "learning_rate": 0.0002,
      "loss": 0.5568,
      "step": 170
    },
    {
      "epoch": 1.46,
      "grad_norm": 6.81847620010376,
      "learning_rate": 0.0002,
      "loss": 0.5249,
      "step": 180
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.8458152413368225,
      "learning_rate": 0.0002,
      "loss": 0.491,
      "step": 190
    },
    {
      "epoch": 1.62,
      "grad_norm": 7.401742935180664,
      "learning_rate": 0.0002,
      "loss": 0.514,
      "step": 200
    },
    {
      "epoch": 1.7,
      "grad_norm": 7.641871452331543,
      "learning_rate": 0.0002,
      "loss": 0.5557,
      "step": 210
    },
    {
      "epoch": 1.78,
      "grad_norm": 5.442512512207031,
      "learning_rate": 0.0002,
      "loss": 0.5314,
      "step": 220
    },
    {
      "epoch": 1.87,
      "grad_norm": 8.104277610778809,
      "learning_rate": 0.0002,
      "loss": 0.4907,
      "step": 230
    },
    {
      "epoch": 1.95,
      "grad_norm": 6.416436195373535,
      "learning_rate": 0.0002,
      "loss": 0.5092,
      "step": 240
    },
    {
      "epoch": 2.03,
      "grad_norm": 3.008178949356079,
      "learning_rate": 0.0002,
      "loss": 0.4633,
      "step": 250
    },
    {
      "epoch": 2.11,
      "grad_norm": 9.349288940429688,
      "learning_rate": 0.0002,
      "loss": 0.378,
      "step": 260
    },
    {
      "epoch": 2.19,
      "grad_norm": 6.459526062011719,
      "learning_rate": 0.0002,
      "loss": 0.3765,
      "step": 270
    },
    {
      "epoch": 2.27,
      "grad_norm": 7.634103775024414,
      "learning_rate": 0.0002,
      "loss": 0.3819,
      "step": 280
    },
    {
      "epoch": 2.35,
      "grad_norm": 9.084547996520996,
      "learning_rate": 0.0002,
      "loss": 0.3703,
      "step": 290
    },
    {
      "epoch": 2.43,
      "grad_norm": 2.535707473754883,
      "learning_rate": 0.0002,
      "loss": 0.3501,
      "step": 300
    },
    {
      "epoch": 2.52,
      "grad_norm": 7.023160457611084,
      "learning_rate": 0.0002,
      "loss": 0.3491,
      "step": 310
    },
    {
      "epoch": 2.6,
      "grad_norm": 9.15241527557373,
      "learning_rate": 0.0002,
      "loss": 0.3592,
      "step": 320
    },
    {
      "epoch": 2.68,
      "grad_norm": 3.0726027488708496,
      "learning_rate": 0.0002,
      "loss": 0.3641,
      "step": 330
    },
    {
      "epoch": 2.76,
      "grad_norm": 5.8857879638671875,
      "learning_rate": 0.0002,
      "loss": 0.3553,
      "step": 340
    },
    {
      "epoch": 2.84,
      "grad_norm": 8.316734313964844,
      "learning_rate": 0.0002,
      "loss": 0.36,
      "step": 350
    },
    {
      "epoch": 2.92,
      "grad_norm": 3.8568618297576904,
      "learning_rate": 0.0002,
      "loss": 0.3607,
      "step": 360
    },
    {
      "epoch": 3.0,
      "grad_norm": 6.664178848266602,
      "learning_rate": 0.0002,
      "loss": 0.3716,
      "step": 370
    },
    {
      "epoch": 3.08,
      "grad_norm": 9.07357406616211,
      "learning_rate": 0.0002,
      "loss": 0.242,
      "step": 380
    },
    {
      "epoch": 3.16,
      "grad_norm": 4.3498921394348145,
      "learning_rate": 0.0002,
      "loss": 0.2588,
      "step": 390
    },
    {
      "epoch": 3.25,
      "grad_norm": 6.539889812469482,
      "learning_rate": 0.0002,
      "loss": 0.2523,
      "step": 400
    },
    {
      "epoch": 3.33,
      "grad_norm": 8.80722427368164,
      "learning_rate": 0.0002,
      "loss": 0.2388,
      "step": 410
    },
    {
      "epoch": 3.41,
      "grad_norm": 5.052772521972656,
      "learning_rate": 0.0002,
      "loss": 0.2524,
      "step": 420
    },
    {
      "epoch": 3.49,
      "grad_norm": 5.22416877746582,
      "learning_rate": 0.0002,
      "loss": 0.2452,
      "step": 430
    },
    {
      "epoch": 3.57,
      "grad_norm": 8.989995956420898,
      "learning_rate": 0.0002,
      "loss": 0.2558,
      "step": 440
    },
    {
      "epoch": 3.65,
      "grad_norm": 2.628777027130127,
      "learning_rate": 0.0002,
      "loss": 0.2449,
      "step": 450
    },
    {
      "epoch": 3.73,
      "grad_norm": 5.890882968902588,
      "learning_rate": 0.0002,
      "loss": 0.2591,
      "step": 460
    },
    {
      "epoch": 3.81,
      "grad_norm": 8.577589988708496,
      "learning_rate": 0.0002,
      "loss": 0.2557,
      "step": 470
    },
    {
      "epoch": 3.89,
      "grad_norm": 5.092458724975586,
      "learning_rate": 0.0002,
      "loss": 0.2603,
      "step": 480
    },
    {
      "epoch": 3.98,
      "grad_norm": 5.97870397567749,
      "learning_rate": 0.0002,
      "loss": 0.2676,
      "step": 490
    },
    {
      "epoch": 4.06,
      "grad_norm": 8.259415626525879,
      "learning_rate": 0.0002,
      "loss": 0.2051,
      "step": 500
    },
    {
      "epoch": 4.14,
      "grad_norm": 7.2230072021484375,
      "learning_rate": 0.0002,
      "loss": 0.1467,
      "step": 510
    },
    {
      "epoch": 4.22,
      "grad_norm": 6.01918888092041,
      "learning_rate": 0.0002,
      "loss": 0.1713,
      "step": 520
    },
    {
      "epoch": 4.3,
      "grad_norm": 8.41965103149414,
      "learning_rate": 0.0002,
      "loss": 0.1803,
      "step": 530
    },
    {
      "epoch": 4.38,
      "grad_norm": 4.342984676361084,
      "learning_rate": 0.0002,
      "loss": 0.1769,
      "step": 540
    },
    {
      "epoch": 4.46,
      "grad_norm": 6.167740821838379,
      "learning_rate": 0.0002,
      "loss": 0.1763,
      "step": 550
    },
    {
      "epoch": 4.54,
      "grad_norm": 8.30461597442627,
      "learning_rate": 0.0002,
      "loss": 0.1716,
      "step": 560
    },
    {
      "epoch": 4.62,
      "grad_norm": 3.77854323387146,
      "learning_rate": 0.0002,
      "loss": 0.1679,
      "step": 570
    },
    {
      "epoch": 4.71,
      "grad_norm": 5.807821273803711,
      "learning_rate": 0.0002,
      "loss": 0.1775,
      "step": 580
    },
    {
      "epoch": 4.79,
      "grad_norm": 7.620412349700928,
      "learning_rate": 0.0002,
      "loss": 0.1769,
      "step": 590
    },
    {
      "epoch": 4.87,
      "grad_norm": 6.027496814727783,
      "learning_rate": 0.0002,
      "loss": 0.1828,
      "step": 600
    },
    {
      "epoch": 4.95,
      "grad_norm": 5.835544109344482,
      "learning_rate": 0.0002,
      "loss": 0.1783,
      "step": 610
    },
    {
      "epoch": 5.03,
      "grad_norm": 7.455280780792236,
      "learning_rate": 0.0002,
      "loss": 0.1545,
      "step": 620
    },
    {
      "epoch": 5.11,
      "grad_norm": 3.6531035900115967,
      "learning_rate": 0.0002,
      "loss": 0.1213,
      "step": 630
    },
    {
      "epoch": 5.19,
      "grad_norm": 6.005002498626709,
      "learning_rate": 0.0002,
      "loss": 0.1211,
      "step": 640
    },
    {
      "epoch": 5.27,
      "grad_norm": 8.0129976272583,
      "learning_rate": 0.0002,
      "loss": 0.1268,
      "step": 650
    },
    {
      "epoch": 5.35,
      "grad_norm": 4.5181169509887695,
      "learning_rate": 0.0002,
      "loss": 0.1215,
      "step": 660
    },
    {
      "epoch": 5.44,
      "grad_norm": 5.109263896942139,
      "learning_rate": 0.0002,
      "loss": 0.1264,
      "step": 670
    },
    {
      "epoch": 5.52,
      "grad_norm": 7.003955364227295,
      "learning_rate": 0.0002,
      "loss": 0.1332,
      "step": 680
    },
    {
      "epoch": 5.6,
      "grad_norm": 4.416803359985352,
      "learning_rate": 0.0002,
      "loss": 0.1333,
      "step": 690
    },
    {
      "epoch": 5.68,
      "grad_norm": 4.775448322296143,
      "learning_rate": 0.0002,
      "loss": 0.1384,
      "step": 700
    },
    {
      "epoch": 5.76,
      "grad_norm": 6.8711018562316895,
      "learning_rate": 0.0002,
      "loss": 0.1388,
      "step": 710
    },
    {
      "epoch": 5.84,
      "grad_norm": 4.3299760818481445,
      "learning_rate": 0.0002,
      "loss": 0.1392,
      "step": 720
    },
    {
      "epoch": 5.92,
      "grad_norm": 4.7512006759643555,
      "learning_rate": 0.0002,
      "loss": 0.1477,
      "step": 730
    },
    {
      "epoch": 6.0,
      "grad_norm": 6.701217174530029,
      "learning_rate": 0.0002,
      "loss": 0.1401,
      "step": 740
    },
    {
      "epoch": 6.09,
      "grad_norm": 4.163561820983887,
      "learning_rate": 0.0002,
      "loss": 0.099,
      "step": 750
    },
    {
      "epoch": 6.17,
      "grad_norm": 4.784251689910889,
      "learning_rate": 0.0002,
      "loss": 0.1025,
      "step": 760
    },
    {
      "epoch": 6.25,
      "grad_norm": 7.253482341766357,
      "learning_rate": 0.0002,
      "loss": 0.1019,
      "step": 770
    },
    {
      "epoch": 6.33,
      "grad_norm": 4.389810085296631,
      "learning_rate": 0.0002,
      "loss": 0.1052,
      "step": 780
    },
    {
      "epoch": 6.41,
      "grad_norm": 4.857180595397949,
      "learning_rate": 0.0002,
      "loss": 0.1065,
      "step": 790
    },
    {
      "epoch": 6.49,
      "grad_norm": 6.876203536987305,
      "learning_rate": 0.0002,
      "loss": 0.1092,
      "step": 800
    },
    {
      "epoch": 6.57,
      "grad_norm": 4.4614715576171875,
      "learning_rate": 0.0002,
      "loss": 0.112,
      "step": 810
    },
    {
      "epoch": 6.65,
      "grad_norm": 5.019026756286621,
      "learning_rate": 0.0002,
      "loss": 0.1127,
      "step": 820
    },
    {
      "epoch": 6.73,
      "grad_norm": 6.864689350128174,
      "learning_rate": 0.0002,
      "loss": 0.1131,
      "step": 830
    },
    {
      "epoch": 6.82,
      "grad_norm": 4.057292938232422,
      "learning_rate": 0.0002,
      "loss": 0.113,
      "step": 840
    },
    {
      "epoch": 6.9,
      "grad_norm": 4.780605792999268,
      "learning_rate": 0.0002,
      "loss": 0.1122,
      "step": 850
    },
    {
      "epoch": 6.98,
      "grad_norm": 6.377821445465088,
      "learning_rate": 0.0002,
      "loss": 0.1153,
      "step": 860
    },
    {
      "epoch": 7.06,
      "grad_norm": 3.6482489109039307,
      "learning_rate": 0.0002,
      "loss": 0.0962,
      "step": 870
    },
    {
      "epoch": 7.14,
      "grad_norm": 4.848484516143799,
      "learning_rate": 0.0002,
      "loss": 0.0892,
      "step": 880
    },
    {
      "epoch": 7.22,
      "grad_norm": 6.297725677490234,
      "learning_rate": 0.0002,
      "loss": 0.0913,
      "step": 890
    },
    {
      "epoch": 7.3,
      "grad_norm": 3.4698007106781006,
      "learning_rate": 0.0002,
      "loss": 0.0927,
      "step": 900
    },
    {
      "epoch": 7.38,
      "grad_norm": 4.614325523376465,
      "learning_rate": 0.0002,
      "loss": 0.0933,
      "step": 910
    },
    {
      "epoch": 7.46,
      "grad_norm": 6.8649163246154785,
      "learning_rate": 0.0002,
      "loss": 0.0951,
      "step": 920
    },
    {
      "epoch": 7.55,
      "grad_norm": 3.5742478370666504,
      "learning_rate": 0.0002,
      "loss": 0.0975,
      "step": 930
    },
    {
      "epoch": 7.63,
      "grad_norm": 4.66102409362793,
      "learning_rate": 0.0002,
      "loss": 0.0971,
      "step": 940
    },
    {
      "epoch": 7.71,
      "grad_norm": 6.615488052368164,
      "learning_rate": 0.0002,
      "loss": 0.0992,
      "step": 950
    },
    {
      "epoch": 7.79,
      "grad_norm": 4.218813419342041,
      "learning_rate": 0.0002,
      "loss": 0.1021,
      "step": 960
    },
    {
      "epoch": 7.87,
      "grad_norm": 4.6225905418396,
      "learning_rate": 0.0002,
      "loss": 0.1012,
      "step": 970
    },
    {
      "epoch": 7.95,
      "grad_norm": 6.005787372589111,
      "learning_rate": 0.0002,
      "loss": 0.1059,
      "step": 980
    },
    {
      "epoch": 8.03,
      "grad_norm": 3.2653019428253174,
      "learning_rate": 0.0002,
      "loss": 0.0929,
      "step": 990
    },
    {
      "epoch": 8.11,
      "grad_norm": 5.000682353973389,
      "learning_rate": 0.0002,
      "loss": 0.0787,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 10,
  "total_flos": 1.9346683868368896e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}