|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.999438727782975,
  "eval_steps": 500,
  "global_step": 1002,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029934518241347054,
      "grad_norm": 1.3711546008530633,
      "learning_rate": 5e-06,
      "loss": 0.7728,
      "step": 10
    },
    {
      "epoch": 0.05986903648269411,
      "grad_norm": 0.8532931970235523,
      "learning_rate": 5e-06,
      "loss": 0.6958,
      "step": 20
    },
    {
      "epoch": 0.08980355472404115,
      "grad_norm": 0.9311714188883703,
      "learning_rate": 5e-06,
      "loss": 0.6702,
      "step": 30
    },
    {
      "epoch": 0.11973807296538821,
      "grad_norm": 0.7891322057145671,
      "learning_rate": 5e-06,
      "loss": 0.6661,
      "step": 40
    },
    {
      "epoch": 0.14967259120673526,
      "grad_norm": 0.6667175607439161,
      "learning_rate": 5e-06,
      "loss": 0.6572,
      "step": 50
    },
    {
      "epoch": 0.1796071094480823,
      "grad_norm": 0.6870807802747252,
      "learning_rate": 5e-06,
      "loss": 0.6491,
      "step": 60
    },
    {
      "epoch": 0.20954162768942938,
      "grad_norm": 0.7923438547434251,
      "learning_rate": 5e-06,
      "loss": 0.6458,
      "step": 70
    },
    {
      "epoch": 0.23947614593077643,
      "grad_norm": 0.7026808650849072,
      "learning_rate": 5e-06,
      "loss": 0.6473,
      "step": 80
    },
    {
      "epoch": 0.2694106641721235,
      "grad_norm": 0.9113469922865229,
      "learning_rate": 5e-06,
      "loss": 0.6385,
      "step": 90
    },
    {
      "epoch": 0.2993451824134705,
      "grad_norm": 0.6673532156562788,
      "learning_rate": 5e-06,
      "loss": 0.6397,
      "step": 100
    },
    {
      "epoch": 0.3292797006548176,
      "grad_norm": 0.7024293184544007,
      "learning_rate": 5e-06,
      "loss": 0.6355,
      "step": 110
    },
    {
      "epoch": 0.3592142188961646,
      "grad_norm": 0.7252389528859061,
      "learning_rate": 5e-06,
      "loss": 0.6343,
      "step": 120
    },
    {
      "epoch": 0.3891487371375117,
      "grad_norm": 0.7166274742588952,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 130
    },
    {
      "epoch": 0.41908325537885877,
      "grad_norm": 1.044790711824065,
      "learning_rate": 5e-06,
      "loss": 0.6283,
      "step": 140
    },
    {
      "epoch": 0.4490177736202058,
      "grad_norm": 0.6941559597539316,
      "learning_rate": 5e-06,
      "loss": 0.6299,
      "step": 150
    },
    {
      "epoch": 0.47895229186155286,
      "grad_norm": 0.6459395559042749,
      "learning_rate": 5e-06,
      "loss": 0.6287,
      "step": 160
    },
    {
      "epoch": 0.5088868101028999,
      "grad_norm": 0.8200206082293998,
      "learning_rate": 5e-06,
      "loss": 0.6231,
      "step": 170
    },
    {
      "epoch": 0.538821328344247,
      "grad_norm": 0.8147867268820529,
      "learning_rate": 5e-06,
      "loss": 0.6247,
      "step": 180
    },
    {
      "epoch": 0.568755846585594,
      "grad_norm": 0.7086057878705686,
      "learning_rate": 5e-06,
      "loss": 0.6225,
      "step": 190
    },
    {
      "epoch": 0.598690364826941,
      "grad_norm": 0.6936168023515021,
      "learning_rate": 5e-06,
      "loss": 0.6302,
      "step": 200
    },
    {
      "epoch": 0.6286248830682881,
      "grad_norm": 0.7668440736078843,
      "learning_rate": 5e-06,
      "loss": 0.6242,
      "step": 210
    },
    {
      "epoch": 0.6585594013096352,
      "grad_norm": 0.9231622196440751,
      "learning_rate": 5e-06,
      "loss": 0.6223,
      "step": 220
    },
    {
      "epoch": 0.6884939195509823,
      "grad_norm": 0.7268095301330812,
      "learning_rate": 5e-06,
      "loss": 0.6248,
      "step": 230
    },
    {
      "epoch": 0.7184284377923292,
      "grad_norm": 0.7253911527197313,
      "learning_rate": 5e-06,
      "loss": 0.6237,
      "step": 240
    },
    {
      "epoch": 0.7483629560336763,
      "grad_norm": 0.624367345766973,
      "learning_rate": 5e-06,
      "loss": 0.6202,
      "step": 250
    },
    {
      "epoch": 0.7782974742750234,
      "grad_norm": 0.7459567304697926,
      "learning_rate": 5e-06,
      "loss": 0.619,
      "step": 260
    },
    {
      "epoch": 0.8082319925163705,
      "grad_norm": 0.7173470697470193,
      "learning_rate": 5e-06,
      "loss": 0.6264,
      "step": 270
    },
    {
      "epoch": 0.8381665107577175,
      "grad_norm": 0.6676024114471235,
      "learning_rate": 5e-06,
      "loss": 0.6165,
      "step": 280
    },
    {
      "epoch": 0.8681010289990645,
      "grad_norm": 0.7422170215057577,
      "learning_rate": 5e-06,
      "loss": 0.6133,
      "step": 290
    },
    {
      "epoch": 0.8980355472404116,
      "grad_norm": 0.770476706070457,
      "learning_rate": 5e-06,
      "loss": 0.6147,
      "step": 300
    },
    {
      "epoch": 0.9279700654817586,
      "grad_norm": 0.7280386561072427,
      "learning_rate": 5e-06,
      "loss": 0.6135,
      "step": 310
    },
    {
      "epoch": 0.9579045837231057,
      "grad_norm": 0.6377527403933052,
      "learning_rate": 5e-06,
      "loss": 0.6155,
      "step": 320
    },
    {
      "epoch": 0.9878391019644528,
      "grad_norm": 0.9160036426859673,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 330
    },
    {
      "epoch": 0.9998129092609915,
      "eval_loss": 0.6195828914642334,
      "eval_runtime": 514.5322,
      "eval_samples_per_second": 17.494,
      "eval_steps_per_second": 0.548,
      "step": 334
    },
    {
      "epoch": 1.0177736202057999,
      "grad_norm": 0.8698832997202014,
      "learning_rate": 5e-06,
      "loss": 0.6386,
      "step": 340
    },
    {
      "epoch": 1.047708138447147,
      "grad_norm": 0.818554628982315,
      "learning_rate": 5e-06,
      "loss": 0.5525,
      "step": 350
    },
    {
      "epoch": 1.077642656688494,
      "grad_norm": 0.7038909119697851,
      "learning_rate": 5e-06,
      "loss": 0.5481,
      "step": 360
    },
    {
      "epoch": 1.1075771749298409,
      "grad_norm": 0.7417662574743294,
      "learning_rate": 5e-06,
      "loss": 0.5481,
      "step": 370
    },
    {
      "epoch": 1.137511693171188,
      "grad_norm": 0.6971090185954912,
      "learning_rate": 5e-06,
      "loss": 0.5483,
      "step": 380
    },
    {
      "epoch": 1.167446211412535,
      "grad_norm": 0.6877839912070208,
      "learning_rate": 5e-06,
      "loss": 0.55,
      "step": 390
    },
    {
      "epoch": 1.197380729653882,
      "grad_norm": 0.7726546070457438,
      "learning_rate": 5e-06,
      "loss": 0.55,
      "step": 400
    },
    {
      "epoch": 1.2273152478952292,
      "grad_norm": 0.627193492530289,
      "learning_rate": 5e-06,
      "loss": 0.5524,
      "step": 410
    },
    {
      "epoch": 1.2572497661365762,
      "grad_norm": 0.7305679948147004,
      "learning_rate": 5e-06,
      "loss": 0.5567,
      "step": 420
    },
    {
      "epoch": 1.2871842843779233,
      "grad_norm": 0.6901834732706111,
      "learning_rate": 5e-06,
      "loss": 0.5513,
      "step": 430
    },
    {
      "epoch": 1.3171188026192704,
      "grad_norm": 0.7078951488952088,
      "learning_rate": 5e-06,
      "loss": 0.5533,
      "step": 440
    },
    {
      "epoch": 1.3470533208606175,
      "grad_norm": 0.6469932896858731,
      "learning_rate": 5e-06,
      "loss": 0.5474,
      "step": 450
    },
    {
      "epoch": 1.3769878391019645,
      "grad_norm": 0.6984755612841984,
      "learning_rate": 5e-06,
      "loss": 0.5579,
      "step": 460
    },
    {
      "epoch": 1.4069223573433116,
      "grad_norm": 0.7417844126416642,
      "learning_rate": 5e-06,
      "loss": 0.5571,
      "step": 470
    },
    {
      "epoch": 1.4368568755846587,
      "grad_norm": 0.717182519772729,
      "learning_rate": 5e-06,
      "loss": 0.5591,
      "step": 480
    },
    {
      "epoch": 1.4667913938260055,
      "grad_norm": 0.6313303871675323,
      "learning_rate": 5e-06,
      "loss": 0.5585,
      "step": 490
    },
    {
      "epoch": 1.4967259120673526,
      "grad_norm": 0.7084037752703477,
      "learning_rate": 5e-06,
      "loss": 0.5546,
      "step": 500
    },
    {
      "epoch": 1.5266604303086997,
      "grad_norm": 0.6531206728146401,
      "learning_rate": 5e-06,
      "loss": 0.5577,
      "step": 510
    },
    {
      "epoch": 1.5565949485500468,
      "grad_norm": 0.654485495629501,
      "learning_rate": 5e-06,
      "loss": 0.5555,
      "step": 520
    },
    {
      "epoch": 1.5865294667913938,
      "grad_norm": 0.6725661891430049,
      "learning_rate": 5e-06,
      "loss": 0.5561,
      "step": 530
    },
    {
      "epoch": 1.616463985032741,
      "grad_norm": 0.698329402386774,
      "learning_rate": 5e-06,
      "loss": 0.5599,
      "step": 540
    },
    {
      "epoch": 1.646398503274088,
      "grad_norm": 0.6690114035686613,
      "learning_rate": 5e-06,
      "loss": 0.5592,
      "step": 550
    },
    {
      "epoch": 1.6763330215154348,
      "grad_norm": 0.66011999214286,
      "learning_rate": 5e-06,
      "loss": 0.5498,
      "step": 560
    },
    {
      "epoch": 1.706267539756782,
      "grad_norm": 0.8040306195233741,
      "learning_rate": 5e-06,
      "loss": 0.5518,
      "step": 570
    },
    {
      "epoch": 1.736202057998129,
      "grad_norm": 0.7060300162023749,
      "learning_rate": 5e-06,
      "loss": 0.5529,
      "step": 580
    },
    {
      "epoch": 1.766136576239476,
      "grad_norm": 0.6673011361622929,
      "learning_rate": 5e-06,
      "loss": 0.5558,
      "step": 590
    },
    {
      "epoch": 1.7960710944808231,
      "grad_norm": 0.7571655613304361,
      "learning_rate": 5e-06,
      "loss": 0.5564,
      "step": 600
    },
    {
      "epoch": 1.8260056127221702,
      "grad_norm": 0.642019011432012,
      "learning_rate": 5e-06,
      "loss": 0.5543,
      "step": 610
    },
    {
      "epoch": 1.8559401309635173,
      "grad_norm": 0.6324305634532394,
      "learning_rate": 5e-06,
      "loss": 0.5524,
      "step": 620
    },
    {
      "epoch": 1.8858746492048644,
      "grad_norm": 0.6741301602507832,
      "learning_rate": 5e-06,
      "loss": 0.5543,
      "step": 630
    },
    {
      "epoch": 1.9158091674462114,
      "grad_norm": 0.7541104518831577,
      "learning_rate": 5e-06,
      "loss": 0.556,
      "step": 640
    },
    {
      "epoch": 1.9457436856875585,
      "grad_norm": 0.6493122341038763,
      "learning_rate": 5e-06,
      "loss": 0.5541,
      "step": 650
    },
    {
      "epoch": 1.9756782039289056,
      "grad_norm": 0.8173207800207763,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 660
    },
    {
      "epoch": 1.999625818521983,
      "eval_loss": 0.6155872941017151,
      "eval_runtime": 513.9247,
      "eval_samples_per_second": 17.514,
      "eval_steps_per_second": 0.549,
      "step": 668
    },
    {
      "epoch": 2.0056127221702527,
      "grad_norm": 1.2127948243552868,
      "learning_rate": 5e-06,
      "loss": 0.6011,
      "step": 670
    },
    {
      "epoch": 2.0355472404115997,
      "grad_norm": 0.8183124334466546,
      "learning_rate": 5e-06,
      "loss": 0.4861,
      "step": 680
    },
    {
      "epoch": 2.065481758652947,
      "grad_norm": 0.7278809818448668,
      "learning_rate": 5e-06,
      "loss": 0.4822,
      "step": 690
    },
    {
      "epoch": 2.095416276894294,
      "grad_norm": 0.6907315590571528,
      "learning_rate": 5e-06,
      "loss": 0.4866,
      "step": 700
    },
    {
      "epoch": 2.125350795135641,
      "grad_norm": 0.7350401872049226,
      "learning_rate": 5e-06,
      "loss": 0.4864,
      "step": 710
    },
    {
      "epoch": 2.155285313376988,
      "grad_norm": 0.8001247911358127,
      "learning_rate": 5e-06,
      "loss": 0.4898,
      "step": 720
    },
    {
      "epoch": 2.185219831618335,
      "grad_norm": 0.7054933220563573,
      "learning_rate": 5e-06,
      "loss": 0.4897,
      "step": 730
    },
    {
      "epoch": 2.2151543498596817,
      "grad_norm": 0.7802502700344892,
      "learning_rate": 5e-06,
      "loss": 0.4901,
      "step": 740
    },
    {
      "epoch": 2.245088868101029,
      "grad_norm": 0.7215036811131982,
      "learning_rate": 5e-06,
      "loss": 0.4923,
      "step": 750
    },
    {
      "epoch": 2.275023386342376,
      "grad_norm": 0.6977267806305402,
      "learning_rate": 5e-06,
      "loss": 0.4878,
      "step": 760
    },
    {
      "epoch": 2.304957904583723,
      "grad_norm": 0.7014967299126638,
      "learning_rate": 5e-06,
      "loss": 0.4929,
      "step": 770
    },
    {
      "epoch": 2.33489242282507,
      "grad_norm": 0.6887436747500622,
      "learning_rate": 5e-06,
      "loss": 0.4887,
      "step": 780
    },
    {
      "epoch": 2.364826941066417,
      "grad_norm": 0.6792765123813863,
      "learning_rate": 5e-06,
      "loss": 0.4905,
      "step": 790
    },
    {
      "epoch": 2.394761459307764,
      "grad_norm": 0.7699608340540777,
      "learning_rate": 5e-06,
      "loss": 0.4933,
      "step": 800
    },
    {
      "epoch": 2.4246959775491113,
      "grad_norm": 0.7845823284512902,
      "learning_rate": 5e-06,
      "loss": 0.5002,
      "step": 810
    },
    {
      "epoch": 2.4546304957904583,
      "grad_norm": 0.7943786395213068,
      "learning_rate": 5e-06,
      "loss": 0.4973,
      "step": 820
    },
    {
      "epoch": 2.4845650140318054,
      "grad_norm": 0.7944786954591401,
      "learning_rate": 5e-06,
      "loss": 0.497,
      "step": 830
    },
    {
      "epoch": 2.5144995322731525,
      "grad_norm": 0.7339137535531504,
      "learning_rate": 5e-06,
      "loss": 0.4944,
      "step": 840
    },
    {
      "epoch": 2.5444340505144996,
      "grad_norm": 0.6904479862339539,
      "learning_rate": 5e-06,
      "loss": 0.4978,
      "step": 850
    },
    {
      "epoch": 2.5743685687558466,
      "grad_norm": 0.7239438693443128,
      "learning_rate": 5e-06,
      "loss": 0.496,
      "step": 860
    },
    {
      "epoch": 2.6043030869971937,
      "grad_norm": 0.6527578782432896,
      "learning_rate": 5e-06,
      "loss": 0.4936,
      "step": 870
    },
    {
      "epoch": 2.634237605238541,
      "grad_norm": 0.7144829092908652,
      "learning_rate": 5e-06,
      "loss": 0.492,
      "step": 880
    },
    {
      "epoch": 2.664172123479888,
      "grad_norm": 0.7507359205267485,
      "learning_rate": 5e-06,
      "loss": 0.498,
      "step": 890
    },
    {
      "epoch": 2.694106641721235,
      "grad_norm": 0.698227000026423,
      "learning_rate": 5e-06,
      "loss": 0.5006,
      "step": 900
    },
    {
      "epoch": 2.724041159962582,
      "grad_norm": 0.6978873719386385,
      "learning_rate": 5e-06,
      "loss": 0.4942,
      "step": 910
    },
    {
      "epoch": 2.753975678203929,
      "grad_norm": 0.6619027440664284,
      "learning_rate": 5e-06,
      "loss": 0.4955,
      "step": 920
    },
    {
      "epoch": 2.7839101964452757,
      "grad_norm": 0.6892171519418343,
      "learning_rate": 5e-06,
      "loss": 0.5013,
      "step": 930
    },
    {
      "epoch": 2.8138447146866232,
      "grad_norm": 0.7177100967350676,
      "learning_rate": 5e-06,
      "loss": 0.4994,
      "step": 940
    },
    {
      "epoch": 2.84377923292797,
      "grad_norm": 0.7555576282537906,
      "learning_rate": 5e-06,
      "loss": 0.4988,
      "step": 950
    },
    {
      "epoch": 2.8737137511693174,
      "grad_norm": 0.8751247226062729,
      "learning_rate": 5e-06,
      "loss": 0.5009,
      "step": 960
    },
    {
      "epoch": 2.903648269410664,
      "grad_norm": 0.730851224108063,
      "learning_rate": 5e-06,
      "loss": 0.4964,
      "step": 970
    },
    {
      "epoch": 2.933582787652011,
      "grad_norm": 0.7044959813969391,
      "learning_rate": 5e-06,
      "loss": 0.502,
      "step": 980
    },
    {
      "epoch": 2.963517305893358,
      "grad_norm": 0.7404990276858111,
      "learning_rate": 5e-06,
      "loss": 0.5036,
      "step": 990
    },
    {
      "epoch": 2.9934518241347052,
      "grad_norm": 0.7940138869090708,
      "learning_rate": 5e-06,
      "loss": 0.5047,
      "step": 1000
    },
    {
      "epoch": 2.999438727782975,
      "eval_loss": 0.6338862776756287,
      "eval_runtime": 513.4832,
      "eval_samples_per_second": 17.529,
      "eval_steps_per_second": 0.549,
      "step": 1002
    },
    {
      "epoch": 2.999438727782975,
      "step": 1002,
      "total_flos": 3818092983484416.0,
      "train_loss": 0.5629288574416719,
      "train_runtime": 90608.8212,
      "train_samples_per_second": 5.662,
      "train_steps_per_second": 0.011
    }
  ],
  "logging_steps": 10,
  "max_steps": 1002,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3818092983484416.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}