|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 371860, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7712463140487671, |
|
"learning_rate": 3.125e-05, |
|
"loss": 6.2181, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8663497567176819, |
|
"learning_rate": 6.25e-05, |
|
"loss": 5.0051, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.8350703716278076, |
|
"learning_rate": 9.375e-05, |
|
"loss": 4.6784, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.8322649598121643, |
|
"learning_rate": 0.000125, |
|
"loss": 4.462, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7712095379829407, |
|
"learning_rate": 0.00015625, |
|
"loss": 4.2961, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7217949628829956, |
|
"learning_rate": 0.0001875, |
|
"loss": 4.1748, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7187008261680603, |
|
"learning_rate": 0.00021875, |
|
"loss": 4.0752, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.807208776473999, |
|
"learning_rate": 0.00025, |
|
"loss": 3.9772, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6315006017684937, |
|
"learning_rate": 0.00028121875, |
|
"loss": 3.9054, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6685634255409241, |
|
"learning_rate": 0.00031246875000000003, |
|
"loss": 3.8504, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7673210501670837, |
|
"learning_rate": 0.00034368749999999997, |
|
"loss": 3.8023, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5318849086761475, |
|
"learning_rate": 0.0003749375, |
|
"loss": 3.7615, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5182545781135559, |
|
"learning_rate": 0.00040615625, |
|
"loss": 3.7276, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.4659978151321411, |
|
"learning_rate": 0.00043737500000000005, |
|
"loss": 3.6939, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.41747376322746277, |
|
"learning_rate": 0.000468625, |
|
"loss": 3.6688, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.4067228436470032, |
|
"learning_rate": 0.00049984375, |
|
"loss": 3.6371, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.38340994715690613, |
|
"learning_rate": 0.00053109375, |
|
"loss": 3.6204, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.39036816358566284, |
|
"learning_rate": 0.0005623125, |
|
"loss": 3.6013, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.35949335589575654, |
|
"eval_loss": 3.7859134674072266, |
|
"eval_runtime": 154.8449, |
|
"eval_samples_per_second": 374.039, |
|
"eval_steps_per_second": 5.845, |
|
"step": 18593 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.35525622963905334, |
|
"learning_rate": 0.0005935625, |
|
"loss": 3.5798, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.33763423562049866, |
|
"learning_rate": 0.00062478125, |
|
"loss": 3.5498, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.3208175599575043, |
|
"learning_rate": 0.0006560312499999999, |
|
"loss": 3.5362, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.3225935995578766, |
|
"learning_rate": 0.00068728125, |
|
"loss": 3.5245, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.3024989068508148, |
|
"learning_rate": 0.00071853125, |
|
"loss": 3.5121, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.29528677463531494, |
|
"learning_rate": 0.00074978125, |
|
"loss": 3.5067, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.27072256803512573, |
|
"learning_rate": 0.0007810312499999999, |
|
"loss": 3.4965, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.2673095166683197, |
|
"learning_rate": 0.00081225, |
|
"loss": 3.4828, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.25490203499794006, |
|
"learning_rate": 0.0008435000000000001, |
|
"loss": 3.4763, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.25206422805786133, |
|
"learning_rate": 0.0008746874999999999, |
|
"loss": 3.466, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.26875174045562744, |
|
"learning_rate": 0.0009059375, |
|
"loss": 3.4619, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.23937861621379852, |
|
"learning_rate": 0.0009371875, |
|
"loss": 3.4511, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.25659725069999695, |
|
"learning_rate": 0.0009684375, |
|
"loss": 3.4456, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.23310545086860657, |
|
"learning_rate": 0.0009996562500000001, |
|
"loss": 3.4315, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.23862479627132416, |
|
"learning_rate": 0.0009970899782263285, |
|
"loss": 3.4262, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.22742129862308502, |
|
"learning_rate": 0.0009941475901841935, |
|
"loss": 3.4174, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.23835882544517517, |
|
"learning_rate": 0.0009912081445301007, |
|
"loss": 3.4016, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.216399684548378, |
|
"learning_rate": 0.0009882657564879657, |
|
"loss": 3.3936, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.24816367030143738, |
|
"learning_rate": 0.0009853233684458307, |
|
"loss": 3.3808, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.38059430056958293, |
|
"eval_loss": 3.5963242053985596, |
|
"eval_runtime": 155.1582, |
|
"eval_samples_per_second": 373.284, |
|
"eval_steps_per_second": 5.833, |
|
"step": 37186 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.22436316311359406, |
|
"learning_rate": 0.0009823839227917379, |
|
"loss": 3.3359, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.2343701869249344, |
|
"learning_rate": 0.0009794415347496028, |
|
"loss": 3.3285, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.21360653638839722, |
|
"learning_rate": 0.00097650208909551, |
|
"loss": 3.3206, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.2347899228334427, |
|
"learning_rate": 0.000973559701053375, |
|
"loss": 3.3169, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.24300511181354523, |
|
"learning_rate": 0.0009706173130112399, |
|
"loss": 3.3166, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.24823549389839172, |
|
"learning_rate": 0.0009676778673571472, |
|
"loss": 3.3092, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.23891581594944, |
|
"learning_rate": 0.0009647384217030542, |
|
"loss": 3.3051, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.23468884825706482, |
|
"learning_rate": 0.0009617960336609192, |
|
"loss": 3.2993, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.22591985762119293, |
|
"learning_rate": 0.0009588536456187842, |
|
"loss": 3.2944, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.2134736180305481, |
|
"learning_rate": 0.0009559141999646913, |
|
"loss": 3.2886, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.22116614878177643, |
|
"learning_rate": 0.0009529718119225564, |
|
"loss": 3.2876, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.23003488779067993, |
|
"learning_rate": 0.0009500294238804213, |
|
"loss": 3.2784, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.2259034961462021, |
|
"learning_rate": 0.0009470870358382863, |
|
"loss": 3.2811, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.20596662163734436, |
|
"learning_rate": 0.0009441475901841935, |
|
"loss": 3.2733, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.19811898469924927, |
|
"learning_rate": 0.0009412052021420585, |
|
"loss": 3.2686, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.21489368379116058, |
|
"learning_rate": 0.0009382628140999236, |
|
"loss": 3.2667, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.2073436975479126, |
|
"learning_rate": 0.0009353233684458307, |
|
"loss": 3.2586, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.20562222599983215, |
|
"learning_rate": 0.0009323809804036956, |
|
"loss": 3.2581, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.3930808255277856, |
|
"eval_loss": 3.4525861740112305, |
|
"eval_runtime": 154.9124, |
|
"eval_samples_per_second": 373.876, |
|
"eval_steps_per_second": 5.842, |
|
"step": 55779 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.21636074781417847, |
|
"learning_rate": 0.0009294415347496028, |
|
"loss": 3.237, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.2338261902332306, |
|
"learning_rate": 0.0009264991467074678, |
|
"loss": 3.1874, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.20794998109340668, |
|
"learning_rate": 0.000923559701053375, |
|
"loss": 3.1927, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.26810312271118164, |
|
"learning_rate": 0.00092061731301124, |
|
"loss": 3.2002, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.22468192875385284, |
|
"learning_rate": 0.0009176749249691049, |
|
"loss": 3.1917, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.20156365633010864, |
|
"learning_rate": 0.0009147325369269699, |
|
"loss": 3.1967, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.25422027707099915, |
|
"learning_rate": 0.0009117930912728771, |
|
"loss": 3.1909, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.24394932389259338, |
|
"learning_rate": 0.0009088507032307421, |
|
"loss": 3.1931, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.1917160004377365, |
|
"learning_rate": 0.0009059083151886071, |
|
"loss": 3.1907, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.22413145005702972, |
|
"learning_rate": 0.0009029688695345141, |
|
"loss": 3.1937, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.22528770565986633, |
|
"learning_rate": 0.0009000264814923792, |
|
"loss": 3.1912, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.19711093604564667, |
|
"learning_rate": 0.0008970840934502442, |
|
"loss": 3.1899, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.21881811320781708, |
|
"learning_rate": 0.0008941446477961514, |
|
"loss": 3.1904, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.21984833478927612, |
|
"learning_rate": 0.0008912022597540164, |
|
"loss": 3.185, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.21477735042572021, |
|
"learning_rate": 0.0008882628140999234, |
|
"loss": 3.1845, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.2399149090051651, |
|
"learning_rate": 0.0008853204260577885, |
|
"loss": 3.1808, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.24318818747997284, |
|
"learning_rate": 0.0008823780380156535, |
|
"loss": 3.1788, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.2135390043258667, |
|
"learning_rate": 0.0008794385923615607, |
|
"loss": 3.1798, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.2475263923406601, |
|
"learning_rate": 0.0008764962043194257, |
|
"loss": 3.174, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.39836288067538933, |
|
"eval_loss": 3.415846586227417, |
|
"eval_runtime": 155.1031, |
|
"eval_samples_per_second": 373.416, |
|
"eval_steps_per_second": 5.835, |
|
"step": 74372 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.24471835792064667, |
|
"learning_rate": 0.0008735567586653328, |
|
"loss": 3.1366, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.2247551828622818, |
|
"learning_rate": 0.0008706143706231978, |
|
"loss": 3.111, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.22561417520046234, |
|
"learning_rate": 0.0008676719825810628, |
|
"loss": 3.1136, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.22179843485355377, |
|
"learning_rate": 0.0008647295945389278, |
|
"loss": 3.1203, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.215871661901474, |
|
"learning_rate": 0.000861790148884835, |
|
"loss": 3.122, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.22627811133861542, |
|
"learning_rate": 0.0008588477608427, |
|
"loss": 3.1234, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.25328510999679565, |
|
"learning_rate": 0.000855905372800565, |
|
"loss": 3.123, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.21424081921577454, |
|
"learning_rate": 0.0008529659271464721, |
|
"loss": 3.1287, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.2255638986825943, |
|
"learning_rate": 0.000850023539104337, |
|
"loss": 3.1251, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.22828561067581177, |
|
"learning_rate": 0.0008470811510622021, |
|
"loss": 3.1235, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.23357871174812317, |
|
"learning_rate": 0.0008441417054081092, |
|
"loss": 3.1257, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.21216043829917908, |
|
"learning_rate": 0.0008411993173659743, |
|
"loss": 3.1265, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.2156064212322235, |
|
"learning_rate": 0.0008382598717118813, |
|
"loss": 3.1266, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.23667491972446442, |
|
"learning_rate": 0.0008353174836697463, |
|
"loss": 3.1261, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.2780851125717163, |
|
"learning_rate": 0.0008323750956276114, |
|
"loss": 3.1197, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.22059884667396545, |
|
"learning_rate": 0.0008294327075854764, |
|
"loss": 3.1227, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.22738564014434814, |
|
"learning_rate": 0.0008264932619313836, |
|
"loss": 3.1256, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.21828770637512207, |
|
"learning_rate": 0.0008235508738892485, |
|
"loss": 3.1218, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.402172842511948, |
|
"eval_loss": 3.4018235206604004, |
|
"eval_runtime": 155.3626, |
|
"eval_samples_per_second": 372.792, |
|
"eval_steps_per_second": 5.825, |
|
"step": 92965 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.22099024057388306, |
|
"learning_rate": 0.0008206114282351556, |
|
"loss": 3.1158, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 0.2708933651447296, |
|
"learning_rate": 0.0008176690401930207, |
|
"loss": 3.0556, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.21532170474529266, |
|
"learning_rate": 0.0008147266521508857, |
|
"loss": 3.0574, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 0.2272021472454071, |
|
"learning_rate": 0.0008117842641087507, |
|
"loss": 3.0636, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 0.2151593714952469, |
|
"learning_rate": 0.0008088418760666157, |
|
"loss": 3.0634, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 0.23398688435554504, |
|
"learning_rate": 0.0008059053728005649, |
|
"loss": 3.0707, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.2310827672481537, |
|
"learning_rate": 0.00080296298475843, |
|
"loss": 3.0753, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.2718166708946228, |
|
"learning_rate": 0.0008000235391043371, |
|
"loss": 3.0739, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 0.2552712559700012, |
|
"learning_rate": 0.0007970811510622022, |
|
"loss": 3.0729, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 0.21800163388252258, |
|
"learning_rate": 0.0007941417054081093, |
|
"loss": 3.0746, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.24081352353096008, |
|
"learning_rate": 0.0007911993173659742, |
|
"loss": 3.076, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 0.22881019115447998, |
|
"learning_rate": 0.0007882598717118814, |
|
"loss": 3.078, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"grad_norm": 0.23741869628429413, |
|
"learning_rate": 0.0007853174836697464, |
|
"loss": 3.0759, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 0.22118040919303894, |
|
"learning_rate": 0.0007823750956276114, |
|
"loss": 3.0776, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.24097305536270142, |
|
"learning_rate": 0.0007794356499735186, |
|
"loss": 3.0793, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.23857030272483826, |
|
"learning_rate": 0.0007764932619313834, |
|
"loss": 3.0783, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 0.2336532175540924, |
|
"learning_rate": 0.0007735508738892485, |
|
"loss": 3.0787, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 0.23847945034503937, |
|
"learning_rate": 0.0007706084858471135, |
|
"loss": 3.0753, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.2423369288444519, |
|
"learning_rate": 0.0007676690401930207, |
|
"loss": 3.0764, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.40599126928813756, |
|
"eval_loss": 3.3646838665008545, |
|
"eval_runtime": 155.7738, |
|
"eval_samples_per_second": 371.808, |
|
"eval_steps_per_second": 5.81, |
|
"step": 111558 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.23392504453659058, |
|
"learning_rate": 0.0007647266521508857, |
|
"loss": 3.0458, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 0.22365610301494598, |
|
"learning_rate": 0.0007617872064967927, |
|
"loss": 3.015, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.23814208805561066, |
|
"learning_rate": 0.0007588448184546578, |
|
"loss": 3.0154, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 0.2653498351573944, |
|
"learning_rate": 0.0007559053728005649, |
|
"loss": 3.0237, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.23730780184268951, |
|
"learning_rate": 0.0007529659271464721, |
|
"loss": 3.0289, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"grad_norm": 0.21152764558792114, |
|
"learning_rate": 0.0007500235391043371, |
|
"loss": 3.0267, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 0.22933965921401978, |
|
"learning_rate": 0.000747081151062202, |
|
"loss": 3.0313, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.260565847158432, |
|
"learning_rate": 0.0007441387630200671, |
|
"loss": 3.033, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.24269381165504456, |
|
"learning_rate": 0.0007411963749779321, |
|
"loss": 3.0375, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 0.2649300694465637, |
|
"learning_rate": 0.0007382539869357971, |
|
"loss": 3.0384, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.2362346202135086, |
|
"learning_rate": 0.0007353145412817043, |
|
"loss": 3.0386, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 0.22504200041294098, |
|
"learning_rate": 0.0007323750956276114, |
|
"loss": 3.0408, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.24480938911437988, |
|
"learning_rate": 0.0007294327075854764, |
|
"loss": 3.0336, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.2642379403114319, |
|
"learning_rate": 0.0007264903195433414, |
|
"loss": 3.0409, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 0.2179790735244751, |
|
"learning_rate": 0.0007235479315012063, |
|
"loss": 3.0449, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.22886839509010315, |
|
"learning_rate": 0.0007206114282351557, |
|
"loss": 3.0367, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.24182362854480743, |
|
"learning_rate": 0.0007176690401930207, |
|
"loss": 3.0396, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 0.2924305498600006, |
|
"learning_rate": 0.0007147266521508858, |
|
"loss": 3.0383, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 0.24420206248760223, |
|
"learning_rate": 0.0007117842641087506, |
|
"loss": 3.0403, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.4073320888528435, |
|
"eval_loss": 3.3496501445770264, |
|
"eval_runtime": 154.8896, |
|
"eval_samples_per_second": 373.931, |
|
"eval_steps_per_second": 5.843, |
|
"step": 130151 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"grad_norm": 0.24674294888973236, |
|
"learning_rate": 0.0007088448184546579, |
|
"loss": 2.9807, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.2546931803226471, |
|
"learning_rate": 0.000705905372800565, |
|
"loss": 2.9767, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.28846633434295654, |
|
"learning_rate": 0.0007029629847584299, |
|
"loss": 2.9842, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"grad_norm": 0.26209449768066406, |
|
"learning_rate": 0.000700020596716295, |
|
"loss": 2.9883, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 0.2578311860561371, |
|
"learning_rate": 0.000697081151062202, |
|
"loss": 2.9879, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 0.22354114055633545, |
|
"learning_rate": 0.0006941387630200671, |
|
"loss": 3.0001, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 0.2687912583351135, |
|
"learning_rate": 0.0006912022597540164, |
|
"loss": 3.0011, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.24351321160793304, |
|
"learning_rate": 0.0006882598717118814, |
|
"loss": 2.9989, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 0.22413307428359985, |
|
"learning_rate": 0.0006853174836697464, |
|
"loss": 3.0014, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"grad_norm": 0.2596534192562103, |
|
"learning_rate": 0.0006823750956276113, |
|
"loss": 3.0022, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.2720980644226074, |
|
"learning_rate": 0.0006794356499735185, |
|
"loss": 3.0017, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.23778241872787476, |
|
"learning_rate": 0.0006764932619313835, |
|
"loss": 3.0093, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.21840888261795044, |
|
"learning_rate": 0.0006735508738892485, |
|
"loss": 3.0078, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 0.2463883012533188, |
|
"learning_rate": 0.0006706084858471136, |
|
"loss": 3.0098, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.2569877803325653, |
|
"learning_rate": 0.0006676690401930206, |
|
"loss": 3.0075, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.23889389634132385, |
|
"learning_rate": 0.0006647266521508856, |
|
"loss": 3.0091, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 0.2982379198074341, |
|
"learning_rate": 0.0006617842641087507, |
|
"loss": 3.009, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.23698513209819794, |
|
"learning_rate": 0.0006588418760666157, |
|
"loss": 3.0123, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4083857051086448, |
|
"eval_loss": 3.357700824737549, |
|
"eval_runtime": 155.2269, |
|
"eval_samples_per_second": 373.118, |
|
"eval_steps_per_second": 5.83, |
|
"step": 148744 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.23715341091156006, |
|
"learning_rate": 0.000655905372800565, |
|
"loss": 2.9891, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 0.2874661087989807, |
|
"learning_rate": 0.00065296298475843, |
|
"loss": 2.9449, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.2575448751449585, |
|
"learning_rate": 0.0006500205967162949, |
|
"loss": 2.9515, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"grad_norm": 0.26473337411880493, |
|
"learning_rate": 0.0006470811510622021, |
|
"loss": 2.9553, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"grad_norm": 0.2735915184020996, |
|
"learning_rate": 0.0006441387630200671, |
|
"loss": 2.9573, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.23796038329601288, |
|
"learning_rate": 0.0006412022597540164, |
|
"loss": 2.9581, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 0.27315831184387207, |
|
"learning_rate": 0.0006382598717118814, |
|
"loss": 2.9645, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.24405047297477722, |
|
"learning_rate": 0.0006353174836697464, |
|
"loss": 2.967, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.24863894283771515, |
|
"learning_rate": 0.0006323750956276114, |
|
"loss": 2.9708, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.3322613835334778, |
|
"learning_rate": 0.0006294327075854764, |
|
"loss": 2.9707, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.25508448481559753, |
|
"learning_rate": 0.0006264903195433414, |
|
"loss": 2.9772, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.61, |
|
"grad_norm": 0.2909291088581085, |
|
"learning_rate": 0.0006235508738892486, |
|
"loss": 2.9788, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 0.24499082565307617, |
|
"learning_rate": 0.0006206084858471134, |
|
"loss": 2.9756, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.2746281325817108, |
|
"learning_rate": 0.0006176690401930206, |
|
"loss": 2.9791, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 0.23963476717472076, |
|
"learning_rate": 0.0006147266521508856, |
|
"loss": 2.9828, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 0.26015135645866394, |
|
"learning_rate": 0.0006117872064967928, |
|
"loss": 2.9838, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 0.25682857632637024, |
|
"learning_rate": 0.0006088448184546578, |
|
"loss": 2.982, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 0.25143811106681824, |
|
"learning_rate": 0.0006059024304125227, |
|
"loss": 2.9858, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.22260454297065735, |
|
"learning_rate": 0.0006029600423703878, |
|
"loss": 2.9806, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.40961016534445654, |
|
"eval_loss": 3.3481061458587646, |
|
"eval_runtime": 155.0817, |
|
"eval_samples_per_second": 373.468, |
|
"eval_steps_per_second": 5.836, |
|
"step": 167337 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 0.29417407512664795, |
|
"learning_rate": 0.0006000176543282528, |
|
"loss": 2.9358, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 0.28162261843681335, |
|
"learning_rate": 0.0005970752662861179, |
|
"loss": 2.9193, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 0.259048193693161, |
|
"learning_rate": 0.000594135820632025, |
|
"loss": 2.9268, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 0.2524789571762085, |
|
"learning_rate": 0.00059119343258989, |
|
"loss": 2.9288, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.2836909294128418, |
|
"learning_rate": 0.0005882510445477551, |
|
"loss": 2.9358, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 0.2743144929409027, |
|
"learning_rate": 0.0005853115988936621, |
|
"loss": 2.9375, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 0.25862225890159607, |
|
"learning_rate": 0.0005823721532395693, |
|
"loss": 2.9424, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"grad_norm": 0.2715602517127991, |
|
"learning_rate": 0.0005794297651974343, |
|
"loss": 2.9374, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.47, |
|
"grad_norm": 0.28599709272384644, |
|
"learning_rate": 0.0005764873771552992, |
|
"loss": 2.9472, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": 0.24361692368984222, |
|
"learning_rate": 0.0005735449891131643, |
|
"loss": 2.9485, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"grad_norm": 0.2627822458744049, |
|
"learning_rate": 0.0005706055434590713, |
|
"loss": 2.9482, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.63, |
|
"grad_norm": 0.2567738890647888, |
|
"learning_rate": 0.0005676660978049786, |
|
"loss": 2.9525, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 0.24835285544395447, |
|
"learning_rate": 0.0005647237097628435, |
|
"loss": 2.9507, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"grad_norm": 0.2631739675998688, |
|
"learning_rate": 0.0005617842641087507, |
|
"loss": 2.9561, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.79, |
|
"grad_norm": 0.26964136958122253, |
|
"learning_rate": 0.0005588418760666157, |
|
"loss": 2.9534, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 0.2449122667312622, |
|
"learning_rate": 0.0005558994880244806, |
|
"loss": 2.9538, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.9, |
|
"grad_norm": 0.27662867307662964, |
|
"learning_rate": 0.0005529570999823456, |
|
"loss": 2.9545, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"grad_norm": 0.2661544382572174, |
|
"learning_rate": 0.0005500176543282528, |
|
"loss": 2.9559, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.41066129586401706, |
|
"eval_loss": 3.3229050636291504, |
|
"eval_runtime": 155.89, |
|
"eval_samples_per_second": 371.531, |
|
"eval_steps_per_second": 5.805, |
|
"step": 185930 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.2743024230003357, |
|
"learning_rate": 0.0005470782086741599, |
|
"loss": 2.955, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"grad_norm": 0.23735611140727997, |
|
"learning_rate": 0.000544135820632025, |
|
"loss": 2.8913, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"grad_norm": 0.26629218459129333, |
|
"learning_rate": 0.000541196374977932, |
|
"loss": 2.8993, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.17, |
|
"grad_norm": 0.28622978925704956, |
|
"learning_rate": 0.0005382539869357971, |
|
"loss": 2.9015, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.22, |
|
"grad_norm": 0.2874310314655304, |
|
"learning_rate": 0.0005353115988936621, |
|
"loss": 2.9063, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"grad_norm": 0.29681921005249023, |
|
"learning_rate": 0.0005323721532395692, |
|
"loss": 2.9151, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"grad_norm": 0.27875566482543945, |
|
"learning_rate": 0.0005294297651974343, |
|
"loss": 2.9134, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"grad_norm": 0.26678669452667236, |
|
"learning_rate": 0.0005264873771552992, |
|
"loss": 2.9179, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"grad_norm": 0.26234498620033264, |
|
"learning_rate": 0.0005235479315012064, |
|
"loss": 2.9206, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.49, |
|
"grad_norm": 0.2867945432662964, |
|
"learning_rate": 0.0005206055434590714, |
|
"loss": 2.9235, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"grad_norm": 0.2825027108192444, |
|
"learning_rate": 0.0005176631554169364, |
|
"loss": 2.9234, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"grad_norm": 0.26379555463790894, |
|
"learning_rate": 0.0005147237097628436, |
|
"loss": 2.9269, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.65, |
|
"grad_norm": 0.278078556060791, |
|
"learning_rate": 0.0005117813217207086, |
|
"loss": 2.9264, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"grad_norm": 0.29586726427078247, |
|
"learning_rate": 0.0005088418760666157, |
|
"loss": 2.9271, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.76, |
|
"grad_norm": 0.2894950807094574, |
|
"learning_rate": 0.0005058994880244807, |
|
"loss": 2.9308, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"grad_norm": 0.2889408767223358, |
|
"learning_rate": 0.0005029600423703878, |
|
"loss": 2.9312, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"grad_norm": 0.2617069184780121, |
|
"learning_rate": 0.0005000176543282529, |
|
"loss": 2.9308, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.92, |
|
"grad_norm": 0.26267439126968384, |
|
"learning_rate": 0.0004970752662861179, |
|
"loss": 2.9297, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"grad_norm": 0.2771267890930176, |
|
"learning_rate": 0.0004941358206320249, |
|
"loss": 2.9341, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.41094117632832033, |
|
"eval_loss": 3.334763765335083, |
|
"eval_runtime": 155.3382, |
|
"eval_samples_per_second": 372.851, |
|
"eval_steps_per_second": 5.826, |
|
"step": 204523 |
|
}, |
|
{ |
|
"epoch": 11.03, |
|
"grad_norm": 0.30724549293518066, |
|
"learning_rate": 0.0004911934325898899, |
|
"loss": 2.9026, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"grad_norm": 0.27692487835884094, |
|
"learning_rate": 0.0004882539869357971, |
|
"loss": 2.8723, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"grad_norm": 0.27652502059936523, |
|
"learning_rate": 0.00048531159889366214, |
|
"loss": 2.8784, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.19, |
|
"grad_norm": 0.3087480366230011, |
|
"learning_rate": 0.0004823721532395692, |
|
"loss": 2.8843, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 0.28440648317337036, |
|
"learning_rate": 0.00047942976519743425, |
|
"loss": 2.8876, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.29, |
|
"grad_norm": 0.2762095034122467, |
|
"learning_rate": 0.0004764873771552993, |
|
"loss": 2.8929, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.35, |
|
"grad_norm": 0.29456937313079834, |
|
"learning_rate": 0.0004735479315012064, |
|
"loss": 2.8907, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 0.2555305063724518, |
|
"learning_rate": 0.00047060554345907144, |
|
"loss": 2.8932, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.46, |
|
"grad_norm": 0.3067052960395813, |
|
"learning_rate": 0.00046766315541693637, |
|
"loss": 2.9, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.51, |
|
"grad_norm": 0.2779659628868103, |
|
"learning_rate": 0.00046472370976284355, |
|
"loss": 2.9, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 0.2907882034778595, |
|
"learning_rate": 0.0004617842641087507, |
|
"loss": 2.9031, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.62, |
|
"grad_norm": 0.2984541058540344, |
|
"learning_rate": 0.00045884187606661566, |
|
"loss": 2.8988, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"grad_norm": 0.258587509393692, |
|
"learning_rate": 0.00045589948802448064, |
|
"loss": 2.9043, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"grad_norm": 0.2560119926929474, |
|
"learning_rate": 0.0004529570999823457, |
|
"loss": 2.9092, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.78, |
|
"grad_norm": 0.28085601329803467, |
|
"learning_rate": 0.0004500147119402107, |
|
"loss": 2.9032, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 0.2866780161857605, |
|
"learning_rate": 0.0004470723238980757, |
|
"loss": 2.9093, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.89, |
|
"grad_norm": 0.2737733721733093, |
|
"learning_rate": 0.0004441328782439829, |
|
"loss": 2.9087, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.94, |
|
"grad_norm": 0.27077600359916687, |
|
"learning_rate": 0.0004411904902018478, |
|
"loss": 2.9117, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"grad_norm": 0.25244736671447754, |
|
"learning_rate": 0.00043825104454775493, |
|
"loss": 2.9141, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.41132122524492226, |
|
"eval_loss": 3.3268089294433594, |
|
"eval_runtime": 154.8788, |
|
"eval_samples_per_second": 373.957, |
|
"eval_steps_per_second": 5.843, |
|
"step": 223116 |
|
}, |
|
{ |
|
"epoch": 12.05, |
|
"grad_norm": 0.29427435994148254, |
|
"learning_rate": 0.00043530865650561997, |
|
"loss": 2.8583, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 0.2908006012439728, |
|
"learning_rate": 0.0004323692108515271, |
|
"loss": 2.8585, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.16, |
|
"grad_norm": 0.2678658068180084, |
|
"learning_rate": 0.0004294297651974343, |
|
"loss": 2.8635, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.21, |
|
"grad_norm": 0.2714795172214508, |
|
"learning_rate": 0.0004264873771552992, |
|
"loss": 2.8639, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"grad_norm": 0.27271273732185364, |
|
"learning_rate": 0.00042354498911316424, |
|
"loss": 2.8664, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.32, |
|
"grad_norm": 0.2761640250682831, |
|
"learning_rate": 0.0004206026010710293, |
|
"loss": 2.8666, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.37, |
|
"grad_norm": 0.2725890576839447, |
|
"learning_rate": 0.0004176631554169364, |
|
"loss": 2.8758, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"grad_norm": 0.28392326831817627, |
|
"learning_rate": 0.0004147207673748014, |
|
"loss": 2.8737, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.48, |
|
"grad_norm": 0.2806406319141388, |
|
"learning_rate": 0.0004117813217207085, |
|
"loss": 2.8759, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"grad_norm": 0.2793915867805481, |
|
"learning_rate": 0.00040884187606661564, |
|
"loss": 2.8795, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.59, |
|
"grad_norm": 0.31969547271728516, |
|
"learning_rate": 0.0004058994880244807, |
|
"loss": 2.8809, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.64, |
|
"grad_norm": 0.2924957573413849, |
|
"learning_rate": 0.0004029570999823457, |
|
"loss": 2.8791, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"grad_norm": 0.3155530095100403, |
|
"learning_rate": 0.00040001471194021064, |
|
"loss": 2.8859, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"grad_norm": 0.29097285866737366, |
|
"learning_rate": 0.0003970752662861178, |
|
"loss": 2.8828, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 0.27443572878837585, |
|
"learning_rate": 0.0003941328782439828, |
|
"loss": 2.8873, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"grad_norm": 0.2774432301521301, |
|
"learning_rate": 0.00039119049020184784, |
|
"loss": 2.8886, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.91, |
|
"grad_norm": 0.30703893303871155, |
|
"learning_rate": 0.00038824810215971287, |
|
"loss": 2.8911, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 0.2948872447013855, |
|
"learning_rate": 0.00038530865650561995, |
|
"loss": 2.8904, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.41215334224704914, |
|
"eval_loss": 3.327643871307373, |
|
"eval_runtime": 156.0786, |
|
"eval_samples_per_second": 371.082, |
|
"eval_steps_per_second": 5.798, |
|
"step": 241709 |
|
}, |
|
{ |
|
"epoch": 13.02, |
|
"grad_norm": 0.2956237196922302, |
|
"learning_rate": 0.000382366268463485, |
|
"loss": 2.8708, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.07, |
|
"grad_norm": 0.28290843963623047, |
|
"learning_rate": 0.0003794268228093921, |
|
"loss": 2.8364, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 0.2849504053592682, |
|
"learning_rate": 0.00037648443476725714, |
|
"loss": 2.8406, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.18, |
|
"grad_norm": 0.31363728642463684, |
|
"learning_rate": 0.00037354204672512207, |
|
"loss": 2.8425, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.23, |
|
"grad_norm": 0.2917320430278778, |
|
"learning_rate": 0.00037060260107102925, |
|
"loss": 2.8473, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 0.2930636703968048, |
|
"learning_rate": 0.00036766021302889423, |
|
"loss": 2.8499, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.34, |
|
"grad_norm": 0.31320619583129883, |
|
"learning_rate": 0.00036471782498675927, |
|
"loss": 2.8541, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"grad_norm": 0.2878115177154541, |
|
"learning_rate": 0.0003617783793326664, |
|
"loss": 2.8538, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.45, |
|
"grad_norm": 0.30532023310661316, |
|
"learning_rate": 0.0003588359912905314, |
|
"loss": 2.8557, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.2783012092113495, |
|
"learning_rate": 0.0003558936032483964, |
|
"loss": 2.8569, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"grad_norm": 0.28491777181625366, |
|
"learning_rate": 0.0003529512152062614, |
|
"loss": 2.8572, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.61, |
|
"grad_norm": 0.2942630648612976, |
|
"learning_rate": 0.0003500117695521686, |
|
"loss": 2.8595, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.66, |
|
"grad_norm": 0.283105731010437, |
|
"learning_rate": 0.00034706938151003356, |
|
"loss": 2.8645, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"grad_norm": 0.2897133231163025, |
|
"learning_rate": 0.00034412699346789854, |
|
"loss": 2.8625, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"grad_norm": 0.2736916244029999, |
|
"learning_rate": 0.0003411846054257636, |
|
"loss": 2.8669, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"grad_norm": 0.271679550409317, |
|
"learning_rate": 0.0003382451597716707, |
|
"loss": 2.8667, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.88, |
|
"grad_norm": 0.28864508867263794, |
|
"learning_rate": 0.00033530277172953574, |
|
"loss": 2.8629, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.93, |
|
"grad_norm": 0.2957324981689453, |
|
"learning_rate": 0.00033236038368740067, |
|
"loss": 2.8712, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"grad_norm": 0.28916797041893005, |
|
"learning_rate": 0.00032942093803330785, |
|
"loss": 2.8682, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.4127784041385224, |
|
"eval_loss": 3.3431813716888428, |
|
"eval_runtime": 154.3026, |
|
"eval_samples_per_second": 375.353, |
|
"eval_steps_per_second": 5.865, |
|
"step": 260302 |
|
}, |
|
{ |
|
"epoch": 14.04, |
|
"grad_norm": 0.2934631407260895, |
|
"learning_rate": 0.00032647854999117283, |
|
"loss": 2.8319, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.09, |
|
"grad_norm": 0.29204925894737244, |
|
"learning_rate": 0.00032353910433708, |
|
"loss": 2.8212, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.15, |
|
"grad_norm": 0.2898711860179901, |
|
"learning_rate": 0.000320596716294945, |
|
"loss": 2.8233, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"grad_norm": 0.30352848768234253, |
|
"learning_rate": 0.00031765432825281, |
|
"loss": 2.824, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.31454843282699585, |
|
"learning_rate": 0.000314711940210675, |
|
"loss": 2.8312, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.31, |
|
"grad_norm": 0.3224280774593353, |
|
"learning_rate": 0.00031177249455658214, |
|
"loss": 2.8355, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.36, |
|
"grad_norm": 0.30281195044517517, |
|
"learning_rate": 0.0003088301065144472, |
|
"loss": 2.8334, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"grad_norm": 0.283155232667923, |
|
"learning_rate": 0.0003058906608603543, |
|
"loss": 2.8364, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"grad_norm": 0.2802044451236725, |
|
"learning_rate": 0.0003029482728182193, |
|
"loss": 2.8358, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.52, |
|
"grad_norm": 0.28235703706741333, |
|
"learning_rate": 0.0003000088271641264, |
|
"loss": 2.8382, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.58, |
|
"grad_norm": 0.33522123098373413, |
|
"learning_rate": 0.00029706643912199145, |
|
"loss": 2.8378, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.63, |
|
"grad_norm": 0.2946733236312866, |
|
"learning_rate": 0.00029412405107985643, |
|
"loss": 2.8444, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"grad_norm": 0.28696757555007935, |
|
"learning_rate": 0.0002911816630377214, |
|
"loss": 2.8421, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.74, |
|
"grad_norm": 0.26879528164863586, |
|
"learning_rate": 0.00028824221738362854, |
|
"loss": 2.8413, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.79, |
|
"grad_norm": 0.31551554799079895, |
|
"learning_rate": 0.00028529982934149357, |
|
"loss": 2.8448, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"grad_norm": 0.35561737418174744, |
|
"learning_rate": 0.0002823574412993586, |
|
"loss": 2.8468, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.9, |
|
"grad_norm": 0.29787468910217285, |
|
"learning_rate": 0.00027941799564526574, |
|
"loss": 2.8465, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.95, |
|
"grad_norm": 0.3122011423110962, |
|
"learning_rate": 0.0002764756076031307, |
|
"loss": 2.8518, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.41202596506022265, |
|
"eval_loss": 3.353341579437256, |
|
"eval_runtime": 154.4211, |
|
"eval_samples_per_second": 375.065, |
|
"eval_steps_per_second": 5.861, |
|
"step": 278895 |
|
}, |
|
{ |
|
"epoch": 15.01, |
|
"grad_norm": 0.2871023118495941, |
|
"learning_rate": 0.00027353616194903784, |
|
"loss": 2.8454, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.06, |
|
"grad_norm": 0.3092021048069, |
|
"learning_rate": 0.0002705937739069029, |
|
"loss": 2.7967, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"grad_norm": 0.30402371287345886, |
|
"learning_rate": 0.00026765138586476786, |
|
"loss": 2.8006, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.17, |
|
"grad_norm": 0.3319647014141083, |
|
"learning_rate": 0.00026471488259871713, |
|
"loss": 2.8097, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.22, |
|
"grad_norm": 0.3174166977405548, |
|
"learning_rate": 0.0002617724945565821, |
|
"loss": 2.809, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"grad_norm": 0.30577802658081055, |
|
"learning_rate": 0.0002588301065144471, |
|
"loss": 2.8155, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.33, |
|
"grad_norm": 0.30882659554481506, |
|
"learning_rate": 0.0002558906608603543, |
|
"loss": 2.8143, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"grad_norm": 0.310907781124115, |
|
"learning_rate": 0.00025294827281821926, |
|
"loss": 2.817, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.44, |
|
"grad_norm": 0.32856982946395874, |
|
"learning_rate": 0.0002500058847760843, |
|
"loss": 2.8181, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"grad_norm": 0.3121114671230316, |
|
"learning_rate": 0.0002470634967339493, |
|
"loss": 2.8216, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"grad_norm": 0.2881309390068054, |
|
"learning_rate": 0.0002441211086918143, |
|
"loss": 2.8199, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"grad_norm": 0.30656737089157104, |
|
"learning_rate": 0.00024118166303772144, |
|
"loss": 2.82, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.65, |
|
"grad_norm": 0.33178097009658813, |
|
"learning_rate": 0.00023824221738362857, |
|
"loss": 2.8245, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"grad_norm": 0.274804025888443, |
|
"learning_rate": 0.00023529982934149355, |
|
"loss": 2.8266, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.76, |
|
"grad_norm": 0.3083294928073883, |
|
"learning_rate": 0.00023235744129935856, |
|
"loss": 2.8237, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.81, |
|
"grad_norm": 0.28113245964050293, |
|
"learning_rate": 0.00022941505325722357, |
|
"loss": 2.8288, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.87, |
|
"grad_norm": 0.28199005126953125, |
|
"learning_rate": 0.00022647560760313072, |
|
"loss": 2.8286, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.92, |
|
"grad_norm": 0.27350249886512756, |
|
"learning_rate": 0.00022353616194903785, |
|
"loss": 2.8299, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"grad_norm": 0.2859692871570587, |
|
"learning_rate": 0.00022059377390690286, |
|
"loss": 2.8294, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.411999965602785, |
|
"eval_loss": 3.3578226566314697, |
|
"eval_runtime": 154.1292, |
|
"eval_samples_per_second": 375.776, |
|
"eval_steps_per_second": 5.872, |
|
"step": 297488 |
|
}, |
|
{ |
|
"epoch": 16.03, |
|
"grad_norm": 0.3390689790248871, |
|
"learning_rate": 0.00021765138586476784, |
|
"loss": 2.8065, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.08, |
|
"grad_norm": 0.31765133142471313, |
|
"learning_rate": 0.00021470899782263285, |
|
"loss": 2.7857, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.14, |
|
"grad_norm": 0.3472673296928406, |
|
"learning_rate": 0.00021176955216854, |
|
"loss": 2.7928, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.19, |
|
"grad_norm": 0.2980306148529053, |
|
"learning_rate": 0.00020882716412640498, |
|
"loss": 2.7942, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.24, |
|
"grad_norm": 0.3002999424934387, |
|
"learning_rate": 0.00020588477608427, |
|
"loss": 2.7965, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.3, |
|
"grad_norm": 0.31571468710899353, |
|
"learning_rate": 0.00020294533043017712, |
|
"loss": 2.7946, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.35, |
|
"grad_norm": 0.3243144750595093, |
|
"learning_rate": 0.00020000294238804216, |
|
"loss": 2.8006, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 0.3149029314517975, |
|
"learning_rate": 0.0001970664391219914, |
|
"loss": 2.7965, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.46, |
|
"grad_norm": 0.2891409695148468, |
|
"learning_rate": 0.00019412405107985644, |
|
"loss": 2.8011, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.51, |
|
"grad_norm": 0.299628883600235, |
|
"learning_rate": 0.00019118166303772142, |
|
"loss": 2.7987, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.57, |
|
"grad_norm": 0.3170998990535736, |
|
"learning_rate": 0.00018823927499558643, |
|
"loss": 2.808, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.62, |
|
"grad_norm": 0.3119168281555176, |
|
"learning_rate": 0.00018529688695345144, |
|
"loss": 2.8046, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.67, |
|
"grad_norm": 0.31267720460891724, |
|
"learning_rate": 0.00018235744129935856, |
|
"loss": 2.8054, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.73, |
|
"grad_norm": 0.30472496151924133, |
|
"learning_rate": 0.00017941505325722357, |
|
"loss": 2.8049, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.78, |
|
"grad_norm": 0.30369436740875244, |
|
"learning_rate": 0.00017647266521508855, |
|
"loss": 2.8076, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"grad_norm": 0.3209957182407379, |
|
"learning_rate": 0.0001735302771729536, |
|
"loss": 2.8084, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.89, |
|
"grad_norm": 0.3150200843811035, |
|
"learning_rate": 0.00017059377390690283, |
|
"loss": 2.8094, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.94, |
|
"grad_norm": 0.3224383592605591, |
|
"learning_rate": 0.00016765432825281, |
|
"loss": 2.8082, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 0.30561330914497375, |
|
"learning_rate": 0.00016471194021067497, |
|
"loss": 2.8079, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.4124089028054288, |
|
"eval_loss": 3.35547137260437, |
|
"eval_runtime": 154.4008, |
|
"eval_samples_per_second": 375.115, |
|
"eval_steps_per_second": 5.861, |
|
"step": 316081 |
|
}, |
|
{ |
|
"epoch": 17.05, |
|
"grad_norm": 0.3246624767780304, |
|
"learning_rate": 0.00016176955216854, |
|
"loss": 2.7768, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.1, |
|
"grad_norm": 0.32693788409233093, |
|
"learning_rate": 0.000158827164126405, |
|
"loss": 2.7745, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.16, |
|
"grad_norm": 0.29351866245269775, |
|
"learning_rate": 0.00015588771847231214, |
|
"loss": 2.7746, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.21, |
|
"grad_norm": 0.3380851149559021, |
|
"learning_rate": 0.00015294533043017715, |
|
"loss": 2.7814, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.26, |
|
"grad_norm": 0.326935350894928, |
|
"learning_rate": 0.00015000588477608425, |
|
"loss": 2.7746, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.32, |
|
"grad_norm": 0.30314844846725464, |
|
"learning_rate": 0.0001470634967339493, |
|
"loss": 2.7811, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.37, |
|
"grad_norm": 0.33331480622291565, |
|
"learning_rate": 0.00014412110869181427, |
|
"loss": 2.7809, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.43, |
|
"grad_norm": 0.3350047767162323, |
|
"learning_rate": 0.0001411787206496793, |
|
"loss": 2.7831, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.48, |
|
"grad_norm": 0.3177855908870697, |
|
"learning_rate": 0.00013823633260754429, |
|
"loss": 2.7884, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.53, |
|
"grad_norm": 0.3424097001552582, |
|
"learning_rate": 0.00013529688695345144, |
|
"loss": 2.7879, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.59, |
|
"grad_norm": 0.3361828029155731, |
|
"learning_rate": 0.00013235449891131642, |
|
"loss": 2.7846, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.64, |
|
"grad_norm": 0.3659563362598419, |
|
"learning_rate": 0.00012941505325722355, |
|
"loss": 2.7847, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.69, |
|
"grad_norm": 0.34645649790763855, |
|
"learning_rate": 0.00012647266521508858, |
|
"loss": 2.7868, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 0.3232506513595581, |
|
"learning_rate": 0.0001235332195609957, |
|
"loss": 2.7882, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"grad_norm": 0.32720622420310974, |
|
"learning_rate": 0.00012059083151886072, |
|
"loss": 2.7895, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.86, |
|
"grad_norm": 0.30321887135505676, |
|
"learning_rate": 0.0001176484434767257, |
|
"loss": 2.7889, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.91, |
|
"grad_norm": 0.3072613775730133, |
|
"learning_rate": 0.00011470899782263284, |
|
"loss": 2.7921, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"grad_norm": 0.3558065593242645, |
|
"learning_rate": 0.00011176660978049785, |
|
"loss": 2.7936, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.412132784536517, |
|
"eval_loss": 3.3698041439056396, |
|
"eval_runtime": 155.5815, |
|
"eval_samples_per_second": 372.268, |
|
"eval_steps_per_second": 5.817, |
|
"step": 334674 |
|
}, |
|
{ |
|
"epoch": 18.02, |
|
"grad_norm": 0.2892885208129883, |
|
"learning_rate": 0.00010882422173836286, |
|
"loss": 2.7782, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.07, |
|
"grad_norm": 0.31055745482444763, |
|
"learning_rate": 0.00010588477608427, |
|
"loss": 2.7608, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.13, |
|
"grad_norm": 0.3110492527484894, |
|
"learning_rate": 0.000102942388042135, |
|
"loss": 2.7596, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"grad_norm": 0.32563742995262146, |
|
"learning_rate": 0.00010000294238804214, |
|
"loss": 2.7604, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.23, |
|
"grad_norm": 0.29414141178131104, |
|
"learning_rate": 9.706055434590715e-05, |
|
"loss": 2.7646, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.29, |
|
"grad_norm": 0.3376915752887726, |
|
"learning_rate": 9.411816630377214e-05, |
|
"loss": 2.7653, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.34, |
|
"grad_norm": 0.3259836733341217, |
|
"learning_rate": 9.117872064967928e-05, |
|
"loss": 2.7683, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"grad_norm": 0.35515841841697693, |
|
"learning_rate": 8.823633260754428e-05, |
|
"loss": 2.7668, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.45, |
|
"grad_norm": 0.32276225090026855, |
|
"learning_rate": 8.529688695345142e-05, |
|
"loss": 2.7718, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 0.3123570382595062, |
|
"learning_rate": 8.235449891131643e-05, |
|
"loss": 2.7713, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.56, |
|
"grad_norm": 0.3183230459690094, |
|
"learning_rate": 7.941211086918143e-05, |
|
"loss": 2.7693, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.61, |
|
"grad_norm": 0.33093953132629395, |
|
"learning_rate": 7.646972282704644e-05, |
|
"loss": 2.7721, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.66, |
|
"grad_norm": 0.36558830738067627, |
|
"learning_rate": 7.35332195609957e-05, |
|
"loss": 2.7734, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.72, |
|
"grad_norm": 0.3600742220878601, |
|
"learning_rate": 7.059083151886071e-05, |
|
"loss": 2.7704, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.77, |
|
"grad_norm": 0.3038669526576996, |
|
"learning_rate": 6.764844347672572e-05, |
|
"loss": 2.771, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.82, |
|
"grad_norm": 0.31358402967453003, |
|
"learning_rate": 6.470605543459072e-05, |
|
"loss": 2.7703, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.88, |
|
"grad_norm": 0.32478052377700806, |
|
"learning_rate": 6.176660978049786e-05, |
|
"loss": 2.7749, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.93, |
|
"grad_norm": 0.31476205587387085, |
|
"learning_rate": 5.882422173836285e-05, |
|
"loss": 2.7685, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.99, |
|
"grad_norm": 0.32048299908638, |
|
"learning_rate": 5.588183369622786e-05, |
|
"loss": 2.7716, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.4123962053959825, |
|
"eval_loss": 3.3712940216064453, |
|
"eval_runtime": 155.1138, |
|
"eval_samples_per_second": 373.39, |
|
"eval_steps_per_second": 5.834, |
|
"step": 353267 |
|
}, |
|
{ |
|
"epoch": 19.04, |
|
"grad_norm": 0.3352065980434418, |
|
"learning_rate": 5.2942388042135e-05, |
|
"loss": 2.7582, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.09, |
|
"grad_norm": 0.3091578185558319, |
|
"learning_rate": 5e-05, |
|
"loss": 2.7489, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.15, |
|
"grad_norm": 0.3163221776485443, |
|
"learning_rate": 4.706055434590714e-05, |
|
"loss": 2.7524, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"grad_norm": 0.3469810485839844, |
|
"learning_rate": 4.412110869181428e-05, |
|
"loss": 2.7538, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.33250272274017334, |
|
"learning_rate": 4.117872064967928e-05, |
|
"loss": 2.7526, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.31, |
|
"grad_norm": 0.32294201850891113, |
|
"learning_rate": 3.823927499558642e-05, |
|
"loss": 2.753, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.36, |
|
"grad_norm": 0.32193583250045776, |
|
"learning_rate": 3.529688695345142e-05, |
|
"loss": 2.7514, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.42, |
|
"grad_norm": 0.32082927227020264, |
|
"learning_rate": 3.235449891131643e-05, |
|
"loss": 2.7545, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.47, |
|
"grad_norm": 0.2935381233692169, |
|
"learning_rate": 2.9412110869181425e-05, |
|
"loss": 2.7542, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.52, |
|
"grad_norm": 0.35660600662231445, |
|
"learning_rate": 2.6472665215088566e-05, |
|
"loss": 2.7546, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.58, |
|
"grad_norm": 0.3313020169734955, |
|
"learning_rate": 2.3533219560995707e-05, |
|
"loss": 2.7527, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"grad_norm": 0.3213827610015869, |
|
"learning_rate": 2.059083151886071e-05, |
|
"loss": 2.7564, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.68, |
|
"grad_norm": 0.3247123062610626, |
|
"learning_rate": 1.764844347672571e-05, |
|
"loss": 2.7562, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.74, |
|
"grad_norm": 0.32302042841911316, |
|
"learning_rate": 1.4706055434590713e-05, |
|
"loss": 2.7534, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.79, |
|
"grad_norm": 0.3282775282859802, |
|
"learning_rate": 1.1766609780497853e-05, |
|
"loss": 2.7548, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.85, |
|
"grad_norm": 0.3025127947330475, |
|
"learning_rate": 8.82716412640499e-06, |
|
"loss": 2.7546, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.9, |
|
"grad_norm": 0.3381025195121765, |
|
"learning_rate": 5.8847760842699935e-06, |
|
"loss": 2.7557, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.95, |
|
"grad_norm": 0.3549807667732239, |
|
"learning_rate": 2.9423880421349967e-06, |
|
"loss": 2.7573, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.41202925698119025, |
|
"eval_loss": 3.3818423748016357, |
|
"eval_runtime": 155.053, |
|
"eval_samples_per_second": 373.537, |
|
"eval_steps_per_second": 5.837, |
|
"step": 371860 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 371860, |
|
"total_flos": 1.5667414205184e+18, |
|
"train_loss": 3.025039612285832, |
|
"train_runtime": 81424.795, |
|
"train_samples_per_second": 146.139, |
|
"train_steps_per_second": 4.567 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 371860, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.5667414205184e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|