|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 372040, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.772857129573822, |
|
"learning_rate": 3.125e-05, |
|
"loss": 6.1994, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.9309808611869812, |
|
"learning_rate": 6.25e-05, |
|
"loss": 4.9973, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.893162190914154, |
|
"learning_rate": 9.375e-05, |
|
"loss": 4.6739, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7998523116111755, |
|
"learning_rate": 0.000125, |
|
"loss": 4.4521, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7740798592567444, |
|
"learning_rate": 0.00015625, |
|
"loss": 4.2992, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.725793719291687, |
|
"learning_rate": 0.0001875, |
|
"loss": 4.181, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7274988889694214, |
|
"learning_rate": 0.00021875, |
|
"loss": 4.0768, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6781344413757324, |
|
"learning_rate": 0.00025, |
|
"loss": 3.9903, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6776793599128723, |
|
"learning_rate": 0.00028121875, |
|
"loss": 3.9081, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6176994442939758, |
|
"learning_rate": 0.00031246875000000003, |
|
"loss": 3.8477, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5840777158737183, |
|
"learning_rate": 0.00034368749999999997, |
|
"loss": 3.8126, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5407571792602539, |
|
"learning_rate": 0.00037490625, |
|
"loss": 3.7654, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5007668137550354, |
|
"learning_rate": 0.00040615625, |
|
"loss": 3.728, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.46616649627685547, |
|
"learning_rate": 0.00043737500000000005, |
|
"loss": 3.6994, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.40752965211868286, |
|
"learning_rate": 0.000468625, |
|
"loss": 3.6746, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.4074413478374481, |
|
"learning_rate": 0.00049984375, |
|
"loss": 3.6544, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.38538795709609985, |
|
"learning_rate": 0.00053109375, |
|
"loss": 3.6247, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.39283987879753113, |
|
"learning_rate": 0.0005623125, |
|
"loss": 3.6071, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.35732314046161223, |
|
"eval_loss": 3.788609266281128, |
|
"eval_runtime": 153.8947, |
|
"eval_samples_per_second": 376.368, |
|
"eval_steps_per_second": 5.887, |
|
"step": 18602 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.33553165197372437, |
|
"learning_rate": 0.0005935625, |
|
"loss": 3.5753, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.33198973536491394, |
|
"learning_rate": 0.0006248125, |
|
"loss": 3.549, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.30177804827690125, |
|
"learning_rate": 0.000656, |
|
"loss": 3.5373, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.2844315767288208, |
|
"learning_rate": 0.00068725, |
|
"loss": 3.5339, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.2813923954963684, |
|
"learning_rate": 0.0007185000000000001, |
|
"loss": 3.5241, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.25613418221473694, |
|
"learning_rate": 0.00074971875, |
|
"loss": 3.5079, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.26424020528793335, |
|
"learning_rate": 0.00078096875, |
|
"loss": 3.4997, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.24137265980243683, |
|
"learning_rate": 0.00081221875, |
|
"loss": 3.4857, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.23364698886871338, |
|
"learning_rate": 0.00084346875, |
|
"loss": 3.4751, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.21730710566043854, |
|
"learning_rate": 0.0008746874999999999, |
|
"loss": 3.4681, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.2167159467935562, |
|
"learning_rate": 0.0009059375, |
|
"loss": 3.4576, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.2263418734073639, |
|
"learning_rate": 0.0009371562500000001, |
|
"loss": 3.4514, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2026355117559433, |
|
"learning_rate": 0.0009684062500000001, |
|
"loss": 3.4421, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.2083364725112915, |
|
"learning_rate": 0.0009996562500000001, |
|
"loss": 3.4402, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.1914134919643402, |
|
"learning_rate": 0.0009970944594753558, |
|
"loss": 3.4246, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.2077130675315857, |
|
"learning_rate": 0.0009941536289848254, |
|
"loss": 3.4173, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.19742459058761597, |
|
"learning_rate": 0.0009912127984942948, |
|
"loss": 3.4043, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.18328115344047546, |
|
"learning_rate": 0.000988274908834255, |
|
"loss": 3.3924, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.17804497480392456, |
|
"learning_rate": 0.0009853340783437243, |
|
"loss": 3.379, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.3797682433009351, |
|
"eval_loss": 3.5653116703033447, |
|
"eval_runtime": 155.3698, |
|
"eval_samples_per_second": 372.794, |
|
"eval_steps_per_second": 5.831, |
|
"step": 37204 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.1852262169122696, |
|
"learning_rate": 0.0009823932478531937, |
|
"loss": 3.3362, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.18120849132537842, |
|
"learning_rate": 0.0009794524173626633, |
|
"loss": 3.3286, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.1793203055858612, |
|
"learning_rate": 0.0009765145277026233, |
|
"loss": 3.3231, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.17847037315368652, |
|
"learning_rate": 0.0009735736972120928, |
|
"loss": 3.3184, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.17294944822788239, |
|
"learning_rate": 0.0009706358075520528, |
|
"loss": 3.3086, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.16597671806812286, |
|
"learning_rate": 0.0009676949770615222, |
|
"loss": 3.3067, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.16891473531723022, |
|
"learning_rate": 0.0009647570874014821, |
|
"loss": 3.3, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.17510074377059937, |
|
"learning_rate": 0.0009618162569109516, |
|
"loss": 3.3023, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.17169316112995148, |
|
"learning_rate": 0.0009588754264204211, |
|
"loss": 3.295, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.174598827958107, |
|
"learning_rate": 0.0009559345959298906, |
|
"loss": 3.2861, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.18967097997665405, |
|
"learning_rate": 0.0009529967062698506, |
|
"loss": 3.2845, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.1692247986793518, |
|
"learning_rate": 0.0009500617574403012, |
|
"loss": 3.2772, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.1717061698436737, |
|
"learning_rate": 0.0009471209269497706, |
|
"loss": 3.2767, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.18360279500484467, |
|
"learning_rate": 0.0009441800964592401, |
|
"loss": 3.2632, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.19667457044124603, |
|
"learning_rate": 0.0009412392659687096, |
|
"loss": 3.2638, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.16959011554718018, |
|
"learning_rate": 0.0009382984354781791, |
|
"loss": 3.2597, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.19635465741157532, |
|
"learning_rate": 0.0009353576049876486, |
|
"loss": 3.2503, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.17817912995815277, |
|
"learning_rate": 0.0009324167744971181, |
|
"loss": 3.2515, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.39181961046231156, |
|
"eval_loss": 3.4691600799560547, |
|
"eval_runtime": 155.8093, |
|
"eval_samples_per_second": 371.743, |
|
"eval_steps_per_second": 5.815, |
|
"step": 55806 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.18061339855194092, |
|
"learning_rate": 0.0009294759440065876, |
|
"loss": 3.2344, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.17083428800106049, |
|
"learning_rate": 0.0009265380543465475, |
|
"loss": 3.1818, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.18623469769954681, |
|
"learning_rate": 0.000923597223856017, |
|
"loss": 3.1867, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.16947512328624725, |
|
"learning_rate": 0.0009206593341959769, |
|
"loss": 3.1896, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.18858687579631805, |
|
"learning_rate": 0.0009177185037054464, |
|
"loss": 3.1933, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.1784023642539978, |
|
"learning_rate": 0.0009147776732149159, |
|
"loss": 3.1876, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.18037760257720947, |
|
"learning_rate": 0.0009118427243853664, |
|
"loss": 3.1871, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.1854478120803833, |
|
"learning_rate": 0.0009089018938948359, |
|
"loss": 3.1915, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.17140689492225647, |
|
"learning_rate": 0.0009059610634043054, |
|
"loss": 3.1875, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.18045330047607422, |
|
"learning_rate": 0.0009030202329137749, |
|
"loss": 3.1863, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.1827496588230133, |
|
"learning_rate": 0.0009000794024232444, |
|
"loss": 3.1871, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.20431484282016754, |
|
"learning_rate": 0.0008971415127632044, |
|
"loss": 3.1825, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.17922812700271606, |
|
"learning_rate": 0.0008942006822726738, |
|
"loss": 3.181, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.18560312688350677, |
|
"learning_rate": 0.0008912627926126339, |
|
"loss": 3.1767, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.182236447930336, |
|
"learning_rate": 0.0008883219621221033, |
|
"loss": 3.1808, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.1626577526330948, |
|
"learning_rate": 0.0008853870132925539, |
|
"loss": 3.1743, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.1721421480178833, |
|
"learning_rate": 0.0008824461828020232, |
|
"loss": 3.1734, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.17298929393291473, |
|
"learning_rate": 0.0008795053523114927, |
|
"loss": 3.1718, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.19776953756809235, |
|
"learning_rate": 0.0008765645218209622, |
|
"loss": 3.1729, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.3983427178451906, |
|
"eval_loss": 3.419321060180664, |
|
"eval_runtime": 154.6724, |
|
"eval_samples_per_second": 374.475, |
|
"eval_steps_per_second": 5.858, |
|
"step": 74408 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.19919687509536743, |
|
"learning_rate": 0.0008736236913304317, |
|
"loss": 3.1245, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.18255846202373505, |
|
"learning_rate": 0.0008706858016703917, |
|
"loss": 3.1058, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.22742678225040436, |
|
"learning_rate": 0.0008677449711798612, |
|
"loss": 3.108, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.23507148027420044, |
|
"learning_rate": 0.0008648041406893307, |
|
"loss": 3.1162, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.17946326732635498, |
|
"learning_rate": 0.0008618662510292908, |
|
"loss": 3.1139, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.20196717977523804, |
|
"learning_rate": 0.0008589254205387602, |
|
"loss": 3.1171, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.1969883292913437, |
|
"learning_rate": 0.0008559845900482296, |
|
"loss": 3.1167, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.20422372221946716, |
|
"learning_rate": 0.0008530437595576991, |
|
"loss": 3.1144, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.2068530172109604, |
|
"learning_rate": 0.0008501088107281496, |
|
"loss": 3.1169, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.20425015687942505, |
|
"learning_rate": 0.0008471679802376191, |
|
"loss": 3.1188, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.20677131414413452, |
|
"learning_rate": 0.0008442271497470886, |
|
"loss": 3.1181, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.1946183443069458, |
|
"learning_rate": 0.000841286319256558, |
|
"loss": 3.1165, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.19427254796028137, |
|
"learning_rate": 0.000838348429596518, |
|
"loss": 3.1205, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.2612309455871582, |
|
"learning_rate": 0.0008354075991059875, |
|
"loss": 3.1168, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.19651499390602112, |
|
"learning_rate": 0.0008324697094459476, |
|
"loss": 3.1181, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.21289727091789246, |
|
"learning_rate": 0.0008295288789554171, |
|
"loss": 3.1144, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.18825019896030426, |
|
"learning_rate": 0.000826590989295377, |
|
"loss": 3.1141, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.20688407123088837, |
|
"learning_rate": 0.0008236501588048465, |
|
"loss": 3.114, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.18017753958702087, |
|
"learning_rate": 0.000820709328314316, |
|
"loss": 3.1139, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.40260902798169274, |
|
"eval_loss": 3.3907480239868164, |
|
"eval_runtime": 155.5488, |
|
"eval_samples_per_second": 372.365, |
|
"eval_steps_per_second": 5.825, |
|
"step": 93010 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"grad_norm": 0.2159438282251358, |
|
"learning_rate": 0.0008177714386542759, |
|
"loss": 3.0438, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.1938624382019043, |
|
"learning_rate": 0.0008148306081637454, |
|
"loss": 3.0471, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 0.1948939859867096, |
|
"learning_rate": 0.0008118897776732149, |
|
"loss": 3.0519, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"grad_norm": 0.21813590824604034, |
|
"learning_rate": 0.000808951888013175, |
|
"loss": 3.0615, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 0.25585538148880005, |
|
"learning_rate": 0.0008060110575226443, |
|
"loss": 3.0615, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.2105061262845993, |
|
"learning_rate": 0.0008030702270321138, |
|
"loss": 3.0617, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.18835408985614777, |
|
"learning_rate": 0.0008001293965415833, |
|
"loss": 3.0677, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 0.1814804971218109, |
|
"learning_rate": 0.0007971915068815434, |
|
"loss": 3.07, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 0.2165517359972, |
|
"learning_rate": 0.0007942536172215034, |
|
"loss": 3.0654, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.21530425548553467, |
|
"learning_rate": 0.0007913127867309729, |
|
"loss": 3.0686, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": null,
|
"learning_rate": 0.0007883748970709328, |
|
"loss": 3.0676, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 0.20055846869945526, |
|
"learning_rate": 0.0007854340665804023, |
|
"loss": 3.0705, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 0.19475945830345154, |
|
"learning_rate": 0.0007824932360898718, |
|
"loss": 3.0684, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.2444075047969818, |
|
"learning_rate": 0.0007795553464298318, |
|
"loss": 3.0644, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.3142812252044678, |
|
"learning_rate": 0.0007766145159393013, |
|
"loss": 3.0701, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 0.1972387135028839, |
|
"learning_rate": 0.0007736736854487707, |
|
"loss": 3.0648, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 0.18938925862312317, |
|
"learning_rate": 0.0007707328549582402, |
|
"loss": 3.0686, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.22070257365703583, |
|
"learning_rate": 0.0007677949652982002, |
|
"loss": 3.0709, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.40427472089482946, |
|
"eval_loss": 3.3642256259918213, |
|
"eval_runtime": 154.6657, |
|
"eval_samples_per_second": 374.492, |
|
"eval_steps_per_second": 5.858, |
|
"step": 111612 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.20117591321468353, |
|
"learning_rate": 0.0007648541348076697, |
|
"loss": 3.0385, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"grad_norm": 0.2254948914051056, |
|
"learning_rate": 0.0007619133043171391, |
|
"loss": 2.9981, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.21827402710914612, |
|
"learning_rate": 0.0007589754146570992, |
|
"loss": 3.007, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 0.2094143182039261, |
|
"learning_rate": 0.0007560345841665687, |
|
"loss": 3.0119, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.23713943362236023, |
|
"learning_rate": 0.0007530937536760382, |
|
"loss": 3.0158, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"grad_norm": 0.21203161776065826, |
|
"learning_rate": 0.0007501558640159981, |
|
"loss": 3.0179, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"grad_norm": 0.223785862326622, |
|
"learning_rate": 0.0007472150335254676, |
|
"loss": 3.0211, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.3047547936439514, |
|
"learning_rate": 0.0007442742030349371, |
|
"loss": 3.0232, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.22039389610290527, |
|
"learning_rate": 0.0007413333725444066, |
|
"loss": 3.0261, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.22225935757160187, |
|
"learning_rate": 0.0007383954828843665, |
|
"loss": 3.0268, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.30233263969421387, |
|
"learning_rate": 0.000735454652393836, |
|
"loss": 3.0299, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 0.23030564188957214, |
|
"learning_rate": 0.0007325138219033055, |
|
"loss": 3.0301, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.2047109752893448, |
|
"learning_rate": 0.0007295759322432654, |
|
"loss": 3.027, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.21085889637470245, |
|
"learning_rate": 0.0007266380425832255, |
|
"loss": 3.0278, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"grad_norm": 0.2596847712993622, |
|
"learning_rate": 0.000723697212092695, |
|
"loss": 3.0358, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.1978030651807785, |
|
"learning_rate": 0.0007207563816021645, |
|
"loss": 3.0315, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.21782337129116058, |
|
"learning_rate": 0.000717815551111634, |
|
"loss": 3.028, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"grad_norm": 0.2578461766242981, |
|
"learning_rate": 0.000714877661451594, |
|
"loss": 3.0329, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 0.20211657881736755, |
|
"learning_rate": 0.0007119368309610634, |
|
"loss": 3.0297, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.40667091369655045, |
|
"eval_loss": 3.3544673919677734, |
|
"eval_runtime": 155.1034, |
|
"eval_samples_per_second": 373.435, |
|
"eval_steps_per_second": 5.841, |
|
"step": 130214 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 0.2451675981283188, |
|
"learning_rate": 0.0007089989413010234, |
|
"loss": 2.9744, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.26030394434928894, |
|
"learning_rate": 0.0007060581108104929, |
|
"loss": 2.9677, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.22699125111103058, |
|
"learning_rate": 0.0007031172803199624, |
|
"loss": 2.9753, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 0.22514161467552185, |
|
"learning_rate": 0.0007001764498294319, |
|
"loss": 2.974, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 0.23178595304489136, |
|
"learning_rate": 0.0006972356193389014, |
|
"loss": 2.9784, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 0.27629485726356506, |
|
"learning_rate": 0.0006942977296788612, |
|
"loss": 2.9856, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 0.2211887091398239, |
|
"learning_rate": 0.0006913598400188213, |
|
"loss": 2.9859, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.2578117847442627, |
|
"learning_rate": 0.0006884190095282908, |
|
"loss": 2.985, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"grad_norm": 0.25054463744163513, |
|
"learning_rate": 0.0006854781790377603, |
|
"loss": 2.9902, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"grad_norm": 0.21288973093032837, |
|
"learning_rate": 0.0006825402893777203, |
|
"loss": 2.9922, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.21266783773899078, |
|
"learning_rate": 0.0006795994588871898, |
|
"loss": 2.994, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 7.63, |
|
"grad_norm": 0.26795753836631775, |
|
"learning_rate": 0.0006766615692271497, |
|
"loss": 2.9945, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.2051943689584732, |
|
"learning_rate": 0.0006737207387366192, |
|
"loss": 2.994, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 0.2313968390226364, |
|
"learning_rate": 0.0006707799082460887, |
|
"loss": 2.9988, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"grad_norm": 0.24470819532871246, |
|
"learning_rate": 0.0006678390777555582, |
|
"loss": 2.998, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.203665092587471, |
|
"learning_rate": 0.0006649011880955182, |
|
"loss": 2.9962, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"grad_norm": 0.21225391328334808, |
|
"learning_rate": 0.0006619603576049877, |
|
"loss": 2.9975, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.25866907835006714, |
|
"learning_rate": 0.0006590195271144572, |
|
"loss": 2.9988, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.4079528153770697, |
|
"eval_loss": 3.359600782394409, |
|
"eval_runtime": 154.5906, |
|
"eval_samples_per_second": 374.673, |
|
"eval_steps_per_second": 5.861, |
|
"step": 148816 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.25523641705513, |
|
"learning_rate": 0.0006560816374544171, |
|
"loss": 2.9863, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 0.22232286632061005, |
|
"learning_rate": 0.0006531408069638866, |
|
"loss": 2.9316, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.22783271968364716, |
|
"learning_rate": 0.000650199976473356, |
|
"loss": 2.9369, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.21875810623168945, |
|
"learning_rate": 0.0006472591459828255, |
|
"loss": 2.9418, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"grad_norm": 0.2610374093055725, |
|
"learning_rate": 0.0006443212563227856, |
|
"loss": 2.9468, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.2271444946527481, |
|
"learning_rate": 0.0006413804258322551, |
|
"loss": 2.9532, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"grad_norm": 0.23558945953845978, |
|
"learning_rate": 0.000638442536172215, |
|
"loss": 2.9513, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.23239487409591675, |
|
"learning_rate": 0.0006355046465121751, |
|
"loss": 2.9544, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.2163064181804657, |
|
"learning_rate": 0.0006325638160216445, |
|
"loss": 2.9594, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"grad_norm": 0.25110113620758057, |
|
"learning_rate": 0.000629622985531114, |
|
"loss": 2.9587, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.23023970425128937, |
|
"learning_rate": 0.0006266850958710741, |
|
"loss": 2.9635, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.2888554036617279, |
|
"learning_rate": 0.0006237442653805435, |
|
"loss": 2.9645, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"grad_norm": 0.22880807518959045, |
|
"learning_rate": 0.000620803434890013, |
|
"loss": 2.9623, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.25806036591529846, |
|
"learning_rate": 0.0006178626043994823, |
|
"loss": 2.971, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"grad_norm": 0.25819092988967896, |
|
"learning_rate": 0.0006149247147394424, |
|
"loss": 2.9689, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 0.2252822071313858, |
|
"learning_rate": 0.0006119838842489119, |
|
"loss": 2.9672, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 0.22955825924873352, |
|
"learning_rate": 0.0006090489354193625, |
|
"loss": 2.9666, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 0.33369356393814087, |
|
"learning_rate": 0.000606108104928832, |
|
"loss": 2.9677, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.2391689568758011, |
|
"learning_rate": 0.0006031672744383015, |
|
"loss": 2.9717, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.40866631908468914, |
|
"eval_loss": 3.3723206520080566, |
|
"eval_runtime": 155.2809, |
|
"eval_samples_per_second": 373.008, |
|
"eval_steps_per_second": 5.835, |
|
"step": 167418 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.22217118740081787, |
|
"learning_rate": 0.0006002293847782614, |
|
"loss": 2.9298, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 0.249114990234375, |
|
"learning_rate": 0.0005972885542877309, |
|
"loss": 2.9048, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 9.14, |
|
"grad_norm": 0.28138816356658936, |
|
"learning_rate": 0.0005943506646276909, |
|
"loss": 2.9122, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 0.26497504115104675, |
|
"learning_rate": 0.0005914098341371603, |
|
"loss": 2.9165, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.33365172147750854, |
|
"learning_rate": 0.0005884690036466298, |
|
"loss": 2.9174, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 0.257129043340683, |
|
"learning_rate": 0.0005855281731560992, |
|
"loss": 2.924, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 9.35, |
|
"grad_norm": 0.29947561025619507, |
|
"learning_rate": 0.0005825902834960592, |
|
"loss": 2.9282, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 9.41, |
|
"grad_norm": 0.23127929866313934, |
|
"learning_rate": 0.0005796494530055287, |
|
"loss": 2.9272, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 0.2609017491340637, |
|
"learning_rate": 0.0005767086225149982, |
|
"loss": 2.9311, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 9.52, |
|
"grad_norm": null,
|
"learning_rate": 0.0005737707328549583, |
|
"loss": 2.9321, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"grad_norm": 0.27535927295684814, |
|
"learning_rate": 0.0005708299023644278, |
|
"loss": 2.9343, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 0.2315855473279953, |
|
"learning_rate": 0.0005678890718738972, |
|
"loss": 2.9342, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 9.68, |
|
"grad_norm": 0.2345881462097168, |
|
"learning_rate": 0.0005649482413833666, |
|
"loss": 2.9377, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 9.73, |
|
"grad_norm": 0.36270418763160706, |
|
"learning_rate": 0.0005620103517233267, |
|
"loss": 2.9415, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"grad_norm": 0.22433358430862427, |
|
"learning_rate": 0.0005590695212327962, |
|
"loss": 2.9433, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 9.84, |
|
"grad_norm": 0.23198537528514862, |
|
"learning_rate": 0.0005561286907422656, |
|
"loss": 2.9435, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 9.89, |
|
"grad_norm": 0.3433922827243805, |
|
"learning_rate": 0.0005531908010822257, |
|
"loss": 2.9422, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"grad_norm": 0.2841480076313019, |
|
"learning_rate": 0.0005502499705916952, |
|
"loss": 2.9457, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.2232799530029297, |
|
"learning_rate": 0.0005473091401011647, |
|
"loss": 2.9432, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.4092650817761506, |
|
"eval_loss": 3.357872486114502, |
|
"eval_runtime": 155.1007, |
|
"eval_samples_per_second": 373.441, |
|
"eval_steps_per_second": 5.841, |
|
"step": 186020 |
|
}, |
|
{ |
|
"epoch": 10.05, |
|
"grad_norm": 0.2610397934913635, |
|
"learning_rate": 0.0005443712504411245, |
|
"loss": 2.8782, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 10.11, |
|
"grad_norm": 0.3216498792171478, |
|
"learning_rate": 0.000541430419950594, |
|
"loss": 2.8815, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 0.2474890649318695, |
|
"learning_rate": 0.0005384895894600635, |
|
"loss": 2.8881, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 10.21, |
|
"grad_norm": 0.35237178206443787, |
|
"learning_rate": 0.000535548758969533, |
|
"loss": 2.8933, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 10.27, |
|
"grad_norm": 0.3191303610801697, |
|
"learning_rate": 0.000532610869309493, |
|
"loss": 2.8989, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 10.32, |
|
"grad_norm": 0.2325054258108139, |
|
"learning_rate": 0.0005296700388189625, |
|
"loss": 2.8978, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"grad_norm": 0.24588368833065033, |
|
"learning_rate": 0.0005267321491589225, |
|
"loss": 2.9019, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"grad_norm": 0.2485446035861969, |
|
"learning_rate": 0.0005237913186683919, |
|
"loss": 2.9035, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 10.48, |
|
"grad_norm": 0.24470220506191254, |
|
"learning_rate": 0.000520853429008352, |
|
"loss": 2.9077, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"grad_norm": 0.248806893825531, |
|
"learning_rate": 0.0005179125985178215, |
|
"loss": 2.9089, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 10.59, |
|
"grad_norm": 0.22887374460697174, |
|
"learning_rate": 0.000514971768027291, |
|
"loss": 2.908, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 10.64, |
|
"grad_norm": 0.2529465854167938, |
|
"learning_rate": 0.0005120309375367605, |
|
"loss": 2.9127, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 10.7, |
|
"grad_norm": 0.25075119733810425, |
|
"learning_rate": 0.0005090901070462299, |
|
"loss": 2.911, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"grad_norm": 0.23723989725112915, |
|
"learning_rate": 0.0005061522173861898, |
|
"loss": 2.9152, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 10.81, |
|
"grad_norm": 0.25637319684028625, |
|
"learning_rate": 0.0005032143277261498, |
|
"loss": 2.9149, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 10.86, |
|
"grad_norm": 0.28713148832321167, |
|
"learning_rate": 0.0005002734972356193, |
|
"loss": 2.9178, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"grad_norm": 0.27700188755989075, |
|
"learning_rate": 0.0004973356075755794, |
|
"loss": 2.9214, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 10.97, |
|
"grad_norm": 0.33580395579338074, |
|
"learning_rate": 0.0004943947770850489, |
|
"loss": 2.9217, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.4097510516302999, |
|
"eval_loss": 3.370105028152466, |
|
"eval_runtime": 155.2297, |
|
"eval_samples_per_second": 373.131, |
|
"eval_steps_per_second": 5.837, |
|
"step": 204622 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"grad_norm": 0.23724067211151123, |
|
"learning_rate": 0.0004914539465945184, |
|
"loss": 2.8953, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 11.07, |
|
"grad_norm": 0.33235830068588257, |
|
"learning_rate": 0.0004885160569344783, |
|
"loss": 2.8533, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"grad_norm": 0.2660665810108185, |
|
"learning_rate": 0.0004855752264439478, |
|
"loss": 2.8628, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 11.18, |
|
"grad_norm": 0.24949678778648376, |
|
"learning_rate": 0.00048263439595341726, |
|
"loss": 2.8692, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 11.24, |
|
"grad_norm": 0.2526322901248932, |
|
"learning_rate": 0.00047969356546288675, |
|
"loss": 2.8702, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 11.29, |
|
"grad_norm": 0.29348504543304443, |
|
"learning_rate": 0.00047676155746382784, |
|
"loss": 2.8716, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 11.34, |
|
"grad_norm": 0.2396526336669922, |
|
"learning_rate": 0.0004738207269732973, |
|
"loss": 2.8788, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"grad_norm": 0.24263983964920044, |
|
"learning_rate": 0.00047087989648276677, |
|
"loss": 2.8842, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 11.45, |
|
"grad_norm": 0.24339748919010162, |
|
"learning_rate": 0.00046793906599223626, |
|
"loss": 2.8805, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 11.5, |
|
"grad_norm": 0.35075515508651733, |
|
"learning_rate": 0.00046499823550170564, |
|
"loss": 2.8861, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 11.56, |
|
"grad_norm": 0.2587752938270569, |
|
"learning_rate": 0.00046205740501117514, |
|
"loss": 2.8861, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 11.61, |
|
"grad_norm": 0.2334243208169937, |
|
"learning_rate": 0.00045911657452064463, |
|
"loss": 2.8904, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 11.67, |
|
"grad_norm": 0.2610035836696625, |
|
"learning_rate": 0.00045617868486060465, |
|
"loss": 2.8888, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 11.72, |
|
"grad_norm": 0.2527429461479187, |
|
"learning_rate": 0.00045323785437007414, |
|
"loss": 2.8869, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 11.77, |
|
"grad_norm": 0.27201133966445923, |
|
"learning_rate": 0.00045030290554052464, |
|
"loss": 2.8921, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 0.25473257899284363, |
|
"learning_rate": 0.00044736207504999414, |
|
"loss": 2.8933, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"grad_norm": 0.24522466957569122, |
|
"learning_rate": 0.0004444212445594636, |
|
"loss": 2.8943, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 11.93, |
|
"grad_norm": 0.2609509825706482, |
|
"learning_rate": 0.00044148041406893307, |
|
"loss": 2.8972, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 11.99, |
|
"grad_norm": 0.252936989068985, |
|
"learning_rate": 0.0004385425244088931, |
|
"loss": 2.8986, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.41030937281606633, |
|
"eval_loss": 3.364614963531494, |
|
"eval_runtime": 154.8866, |
|
"eval_samples_per_second": 373.957, |
|
"eval_steps_per_second": 5.849, |
|
"step": 223224 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"grad_norm": 0.2545512914657593, |
|
"learning_rate": 0.0004356016939183626, |
|
"loss": 2.8465, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 0.28531843423843384, |
|
"learning_rate": 0.0004326638042583226, |
|
"loss": 2.8391, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 12.15, |
|
"grad_norm": 0.26895156502723694, |
|
"learning_rate": 0.000429722973767792, |
|
"loss": 2.8444, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"grad_norm": 0.28283581137657166, |
|
"learning_rate": 0.0004267821432772615, |
|
"loss": 2.849, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 12.26, |
|
"grad_norm": 0.3257947564125061, |
|
"learning_rate": 0.0004238442536172215, |
|
"loss": 2.8473, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 12.31, |
|
"grad_norm": 0.26338285207748413, |
|
"learning_rate": 0.000420903423126691, |
|
"loss": 2.8562, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 12.36, |
|
"grad_norm": 0.24919289350509644, |
|
"learning_rate": 0.0004179625926361605, |
|
"loss": 2.8545, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 12.42, |
|
"grad_norm": 0.24178272485733032, |
|
"learning_rate": 0.00041502176214562993, |
|
"loss": 2.8579, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 12.47, |
|
"grad_norm": 0.2602218985557556, |
|
"learning_rate": 0.00041208387248558995, |
|
"loss": 2.8588, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 12.53, |
|
"grad_norm": 0.23942138254642487, |
|
"learning_rate": 0.0004091459828255499, |
|
"loss": 2.8592, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 12.58, |
|
"grad_norm": 0.24679596722126007, |
|
"learning_rate": 0.0004062051523350194, |
|
"loss": 2.8692, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 12.63, |
|
"grad_norm": 0.24444027245044708, |
|
"learning_rate": 0.0004032643218444889, |
|
"loss": 2.8655, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 12.69, |
|
"grad_norm": 0.30420973896980286, |
|
"learning_rate": 0.00040032349135395834, |
|
"loss": 2.8658, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 12.74, |
|
"grad_norm": 0.2618982493877411, |
|
"learning_rate": 0.00039738560169391837, |
|
"loss": 2.8709, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 12.79, |
|
"grad_norm": 0.2887171804904938, |
|
"learning_rate": 0.00039444477120338786, |
|
"loss": 2.8724, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 12.85, |
|
"grad_norm": 0.26698777079582214, |
|
"learning_rate": 0.00039150394071285735, |
|
"loss": 2.8689, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 12.9, |
|
"grad_norm": 0.2446519136428833, |
|
"learning_rate": 0.0003885631102223268, |
|
"loss": 2.8778, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 12.96, |
|
"grad_norm": 0.25453439354896545, |
|
"learning_rate": 0.00038562522056228676, |
|
"loss": 2.8745, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.4104802079472664, |
|
"eval_loss": 3.367594003677368, |
|
"eval_runtime": 154.8681, |
|
"eval_samples_per_second": 374.002, |
|
"eval_steps_per_second": 5.85, |
|
"step": 241826 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"grad_norm": 0.2726215124130249, |
|
"learning_rate": 0.0003826873309022468, |
|
"loss": 2.8634, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 13.06, |
|
"grad_norm": 0.2711837887763977, |
|
"learning_rate": 0.0003797465004117163, |
|
"loss": 2.8164, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 0.2655225694179535, |
|
"learning_rate": 0.00037680566992118577, |
|
"loss": 2.8182, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 13.17, |
|
"grad_norm": 0.3055564761161804, |
|
"learning_rate": 0.00037386778026114574, |
|
"loss": 2.8278, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 13.22, |
|
"grad_norm": 0.3115604817867279, |
|
"learning_rate": 0.00037092694977061523, |
|
"loss": 2.8282, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 13.28, |
|
"grad_norm": 0.27207228541374207, |
|
"learning_rate": 0.00036798611928008467, |
|
"loss": 2.8323, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 13.33, |
|
"grad_norm": 0.2732410132884979, |
|
"learning_rate": 0.00036504528878955416, |
|
"loss": 2.8363, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 13.39, |
|
"grad_norm": 0.24663548171520233, |
|
"learning_rate": 0.0003621073991295142, |
|
"loss": 2.8382, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 13.44, |
|
"grad_norm": 0.2780630588531494, |
|
"learning_rate": 0.0003591665686389837, |
|
"loss": 2.8379, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 13.49, |
|
"grad_norm": 0.2623123526573181, |
|
"learning_rate": 0.0003562257381484531, |
|
"loss": 2.8383, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 13.55, |
|
"grad_norm": 0.26465824246406555, |
|
"learning_rate": 0.00035329078931890367, |
|
"loss": 2.8445, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"grad_norm": 0.2646311819553375, |
|
"learning_rate": 0.0003503499588283731, |
|
"loss": 2.8417, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 13.65, |
|
"grad_norm": 0.2561984956264496, |
|
"learning_rate": 0.00034741206916833313, |
|
"loss": 2.8439, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"grad_norm": 0.3358238637447357, |
|
"learning_rate": 0.0003444712386778026, |
|
"loss": 2.8495, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 13.76, |
|
"grad_norm": 0.3044319748878479, |
|
"learning_rate": 0.0003415304081872721, |
|
"loss": 2.8472, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"grad_norm": 0.2482803910970688, |
|
"learning_rate": 0.0003385925185272321, |
|
"loss": 2.8555, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 13.87, |
|
"grad_norm": 0.2835788130760193, |
|
"learning_rate": 0.0003356516880367016, |
|
"loss": 2.8494, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 13.92, |
|
"grad_norm": 0.2810549736022949, |
|
"learning_rate": 0.000332710857546171, |
|
"loss": 2.8522, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"grad_norm": 0.30381470918655396, |
|
"learning_rate": 0.0003297700270556405, |
|
"loss": 2.8518, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.4109941912696463, |
|
"eval_loss": 3.3750314712524414, |
|
"eval_runtime": 154.897, |
|
"eval_samples_per_second": 373.932, |
|
"eval_steps_per_second": 5.849, |
|
"step": 260428 |
|
}, |
|
{ |
|
"epoch": 14.03, |
|
"grad_norm": 0.27919963002204895, |
|
"learning_rate": 0.00032683213739560053, |
|
"loss": 2.8197, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 14.08, |
|
"grad_norm": 0.27463871240615845, |
|
"learning_rate": 0.00032389424773556055, |
|
"loss": 2.7989, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 14.14, |
|
"grad_norm": 0.26133376359939575, |
|
"learning_rate": 0.00032095341724503005, |
|
"loss": 2.8029, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 14.19, |
|
"grad_norm": 0.34036701917648315, |
|
"learning_rate": 0.00031801258675449943, |
|
"loss": 2.8059, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.293765664100647, |
|
"learning_rate": 0.00031507469709445945, |
|
"loss": 2.8085, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 14.3, |
|
"grad_norm": 0.26148098707199097, |
|
"learning_rate": 0.00031213386660392895, |
|
"loss": 2.8132, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 14.35, |
|
"grad_norm": 0.25458094477653503, |
|
"learning_rate": 0.00030919303611339844, |
|
"loss": 2.8154, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 14.41, |
|
"grad_norm": 0.26752936840057373, |
|
"learning_rate": 0.00030625220562286793, |
|
"loss": 2.8209, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 14.46, |
|
"grad_norm": 0.26262667775154114, |
|
"learning_rate": 0.00030331137513233737, |
|
"loss": 2.8175, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 14.51, |
|
"grad_norm": 0.3177816867828369, |
|
"learning_rate": 0.00030037054464180686, |
|
"loss": 2.8221, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 14.57, |
|
"grad_norm": 0.34930920600891113, |
|
"learning_rate": 0.0002974326549817669, |
|
"loss": 2.8277, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 14.62, |
|
"grad_norm": 0.25501158833503723, |
|
"learning_rate": 0.0002944918244912364, |
|
"loss": 2.8237, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 14.68, |
|
"grad_norm": 0.25757795572280884, |
|
"learning_rate": 0.00029155393483119635, |
|
"loss": 2.823, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 14.73, |
|
"grad_norm": 0.262439489364624, |
|
"learning_rate": 0.0002886131043406658, |
|
"loss": 2.8256, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 14.78, |
|
"grad_norm": 0.27561119198799133, |
|
"learning_rate": 0.0002856722738501353, |
|
"loss": 2.8325, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 14.84, |
|
"grad_norm": 0.30357280373573303, |
|
"learning_rate": 0.0002827343841900953, |
|
"loss": 2.8292, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 14.89, |
|
"grad_norm": 0.27511313557624817, |
|
"learning_rate": 0.0002797935536995648, |
|
"loss": 2.8319, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 14.94, |
|
"grad_norm": 0.26999151706695557, |
|
"learning_rate": 0.0002768527232090343, |
|
"loss": 2.8341, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.27331700921058655, |
|
"learning_rate": 0.00027391483354899425, |
|
"loss": 2.8328, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.4111497096844038, |
|
"eval_loss": 3.3722476959228516, |
|
"eval_runtime": 154.7468, |
|
"eval_samples_per_second": 374.295, |
|
"eval_steps_per_second": 5.855, |
|
"step": 279030 |
|
}, |
|
{ |
|
"epoch": 15.05, |
|
"grad_norm": 0.2742418944835663, |
|
"learning_rate": 0.0002709769438889542, |
|
"loss": 2.7834, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 15.11, |
|
"grad_norm": 0.2747434675693512, |
|
"learning_rate": 0.0002680361133984237, |
|
"loss": 2.7848, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 15.16, |
|
"grad_norm": 0.308196485042572, |
|
"learning_rate": 0.0002650952829078932, |
|
"loss": 2.7865, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 15.21, |
|
"grad_norm": 0.3813375234603882, |
|
"learning_rate": 0.0002621544524173627, |
|
"loss": 2.7904, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"grad_norm": 0.25685274600982666, |
|
"learning_rate": 0.00025921656275732267, |
|
"loss": 2.7944, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 15.32, |
|
"grad_norm": 0.2853144109249115, |
|
"learning_rate": 0.0002562757322667921, |
|
"loss": 2.7967, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 15.37, |
|
"grad_norm": 0.2811448276042938, |
|
"learning_rate": 0.0002533349017762616, |
|
"loss": 2.7989, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 15.43, |
|
"grad_norm": 0.32674285769462585, |
|
"learning_rate": 0.0002503970121162216, |
|
"loss": 2.8013, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 15.48, |
|
"grad_norm": 0.2677679657936096, |
|
"learning_rate": 0.0002474561816256911, |
|
"loss": 2.8003, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 15.54, |
|
"grad_norm": 0.2848023474216461, |
|
"learning_rate": 0.00024451829196565114, |
|
"loss": 2.8051, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 15.59, |
|
"grad_norm": 0.3296356499195099, |
|
"learning_rate": 0.00024157746147512058, |
|
"loss": 2.8004, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 15.64, |
|
"grad_norm": 0.3256130516529083, |
|
"learning_rate": 0.00023863663098459004, |
|
"loss": 2.8051, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"grad_norm": 0.30205684900283813, |
|
"learning_rate": 0.0002357016821550406, |
|
"loss": 2.8075, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 15.75, |
|
"grad_norm": 0.2870996296405792, |
|
"learning_rate": 0.00023276085166451006, |
|
"loss": 2.8081, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 15.8, |
|
"grad_norm": 0.26530566811561584, |
|
"learning_rate": 0.00022982002117397955, |
|
"loss": 2.8119, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 15.86, |
|
"grad_norm": 0.27719756960868835, |
|
"learning_rate": 0.000226879190683449, |
|
"loss": 2.8085, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 15.91, |
|
"grad_norm": 0.27279022336006165, |
|
"learning_rate": 0.00022393836019291848, |
|
"loss": 2.8137, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 15.97, |
|
"grad_norm": 0.2712692618370056, |
|
"learning_rate": 0.0002210004705328785, |
|
"loss": 2.8089, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.4115371957389701, |
|
"eval_loss": 3.3797037601470947, |
|
"eval_runtime": 154.9333, |
|
"eval_samples_per_second": 373.845, |
|
"eval_steps_per_second": 5.848, |
|
"step": 297632 |
|
}, |
|
{ |
|
"epoch": 16.02, |
|
"grad_norm": 0.2609094977378845, |
|
"learning_rate": 0.0002180625808728385, |
|
"loss": 2.7927, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 16.07, |
|
"grad_norm": 0.33733800053596497, |
|
"learning_rate": 0.00021512175038230797, |
|
"loss": 2.7628, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 16.13, |
|
"grad_norm": 0.27339014410972595, |
|
"learning_rate": 0.00021218091989177743, |
|
"loss": 2.7711, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 16.18, |
|
"grad_norm": 0.27283820509910583, |
|
"learning_rate": 0.00020924303023173743, |
|
"loss": 2.7731, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 16.23, |
|
"grad_norm": 0.29361584782600403, |
|
"learning_rate": 0.00020630219974120692, |
|
"loss": 2.7739, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 16.29, |
|
"grad_norm": 0.2751254439353943, |
|
"learning_rate": 0.0002033613692506764, |
|
"loss": 2.7733, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 16.34, |
|
"grad_norm": 0.31571540236473083, |
|
"learning_rate": 0.00020042053876014588, |
|
"loss": 2.7803, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"grad_norm": 0.2743053138256073, |
|
"learning_rate": 0.00019748264910010588, |
|
"loss": 2.7825, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 16.45, |
|
"grad_norm": 0.30572009086608887, |
|
"learning_rate": 0.00019454181860957534, |
|
"loss": 2.7831, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 0.27128228545188904, |
|
"learning_rate": 0.00019160392894953534, |
|
"loss": 2.7858, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 16.56, |
|
"grad_norm": 0.28637221455574036, |
|
"learning_rate": 0.00018866603928949536, |
|
"loss": 2.7873, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 16.61, |
|
"grad_norm": 0.30166834592819214, |
|
"learning_rate": 0.00018572520879896485, |
|
"loss": 2.7859, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 16.66, |
|
"grad_norm": 0.26175713539123535, |
|
"learning_rate": 0.00018278437830843432, |
|
"loss": 2.7902, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 16.72, |
|
"grad_norm": 0.3338593542575836, |
|
"learning_rate": 0.00017984648864839432, |
|
"loss": 2.7876, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 16.77, |
|
"grad_norm": 0.291354775428772, |
|
"learning_rate": 0.00017690565815786378, |
|
"loss": 2.789, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 16.83, |
|
"grad_norm": 0.3241804242134094, |
|
"learning_rate": 0.00017396482766733327, |
|
"loss": 2.7875, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 16.88, |
|
"grad_norm": 0.27583855390548706, |
|
"learning_rate": 0.00017102399717680274, |
|
"loss": 2.7928, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 16.93, |
|
"grad_norm": 0.38700050115585327, |
|
"learning_rate": 0.00016808610751676274, |
|
"loss": 2.7902, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 16.99, |
|
"grad_norm": 0.2652926445007324, |
|
"learning_rate": 0.00016514527702623223, |
|
"loss": 2.7911, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.4108795174320692, |
|
"eval_loss": 3.3881633281707764, |
|
"eval_runtime": 154.5367, |
|
"eval_samples_per_second": 374.804, |
|
"eval_steps_per_second": 5.863, |
|
"step": 316234 |
|
}, |
|
{ |
|
"epoch": 17.04, |
|
"grad_norm": 0.2926689684391022, |
|
"learning_rate": 0.0001622044465357017, |
|
"loss": 2.7624, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 17.09, |
|
"grad_norm": 0.2626492977142334, |
|
"learning_rate": 0.0001592665568756617, |
|
"loss": 2.7546, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 17.15, |
|
"grad_norm": 0.27373695373535156, |
|
"learning_rate": 0.00015632572638513115, |
|
"loss": 2.7568, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"grad_norm": 0.3076825737953186, |
|
"learning_rate": 0.00015338489589460065, |
|
"loss": 2.7601, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 17.26, |
|
"grad_norm": 0.2832266092300415, |
|
"learning_rate": 0.0001504440654040701, |
|
"loss": 2.7627, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 17.31, |
|
"grad_norm": 0.3134935200214386, |
|
"learning_rate": 0.0001475032349135396, |
|
"loss": 2.7633, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 17.36, |
|
"grad_norm": 0.28282949328422546, |
|
"learning_rate": 0.0001445653452534996, |
|
"loss": 2.7616, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 17.42, |
|
"grad_norm": 0.2717437744140625, |
|
"learning_rate": 0.00014162451476296906, |
|
"loss": 2.7664, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 17.47, |
|
"grad_norm": 0.3018600642681122, |
|
"learning_rate": 0.00013868662510292906, |
|
"loss": 2.7672, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 17.52, |
|
"grad_norm": 0.41076505184173584, |
|
"learning_rate": 0.00013574579461239855, |
|
"loss": 2.7662, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 17.58, |
|
"grad_norm": 0.28314197063446045, |
|
"learning_rate": 0.00013280790495235855, |
|
"loss": 2.7679, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 17.63, |
|
"grad_norm": 0.2659911811351776, |
|
"learning_rate": 0.00012986707446182802, |
|
"loss": 2.7656, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 17.69, |
|
"grad_norm": 0.28878411650657654, |
|
"learning_rate": 0.00012692918480178804, |
|
"loss": 2.7627, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 17.74, |
|
"grad_norm": 0.2714906930923462, |
|
"learning_rate": 0.0001239883543112575, |
|
"loss": 2.7715, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 17.79, |
|
"grad_norm": 0.3475322425365448, |
|
"learning_rate": 0.00012105046465121751, |
|
"loss": 2.7704, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 17.85, |
|
"grad_norm": 0.2924496531486511, |
|
"learning_rate": 0.00011810963416068698, |
|
"loss": 2.7695, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 17.9, |
|
"grad_norm": 0.2651541829109192, |
|
"learning_rate": 0.00011516880367015646, |
|
"loss": 2.7744, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 17.96, |
|
"grad_norm": 0.27891284227371216, |
|
"learning_rate": 0.00011223091401011645, |
|
"loss": 2.773, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.41146578490748537, |
|
"eval_loss": 3.395080804824829, |
|
"eval_runtime": 154.4425, |
|
"eval_samples_per_second": 375.033, |
|
"eval_steps_per_second": 5.866, |
|
"step": 334836 |
|
}, |
|
{ |
|
"epoch": 18.01, |
|
"grad_norm": 0.3232966363430023, |
|
"learning_rate": 0.00010929008351958593, |
|
"loss": 2.767, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 18.06, |
|
"grad_norm": 0.3170246481895447, |
|
"learning_rate": 0.00010635219385954593, |
|
"loss": 2.7387, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 18.12, |
|
"grad_norm": 0.28522101044654846, |
|
"learning_rate": 0.00010341136336901541, |
|
"loss": 2.7415, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 18.17, |
|
"grad_norm": 0.35791322588920593, |
|
"learning_rate": 0.00010047347370897542, |
|
"loss": 2.741, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 18.22, |
|
"grad_norm": 0.2994053065776825, |
|
"learning_rate": 9.75326432184449e-05, |
|
"loss": 2.7415, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 18.28, |
|
"grad_norm": 0.31925782561302185, |
|
"learning_rate": 9.459181272791436e-05, |
|
"loss": 2.7482, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 18.33, |
|
"grad_norm": 0.27817079424858093, |
|
"learning_rate": 9.165098223738384e-05, |
|
"loss": 2.7465, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"grad_norm": 0.27790531516075134, |
|
"learning_rate": 8.871309257734385e-05, |
|
"loss": 2.7479, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 18.44, |
|
"grad_norm": 0.33701515197753906, |
|
"learning_rate": 8.577226208681333e-05, |
|
"loss": 2.749, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 18.49, |
|
"grad_norm": 0.32259926199913025, |
|
"learning_rate": 8.283143159628278e-05, |
|
"loss": 2.7484, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"grad_norm": 0.2922397255897522, |
|
"learning_rate": 7.989060110575226e-05, |
|
"loss": 2.7539, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 18.6, |
|
"grad_norm": 0.27831122279167175, |
|
"learning_rate": 7.694977061522174e-05, |
|
"loss": 2.7501, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 18.65, |
|
"grad_norm": 0.29930371046066284, |
|
"learning_rate": 7.401188095518175e-05, |
|
"loss": 2.7528, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 18.71, |
|
"grad_norm": 0.2781750559806824, |
|
"learning_rate": 7.107105046465121e-05, |
|
"loss": 2.7542, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 18.76, |
|
"grad_norm": 0.34259232878685, |
|
"learning_rate": 6.813316080461122e-05, |
|
"loss": 2.7519, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 18.82, |
|
"grad_norm": 0.27175283432006836, |
|
"learning_rate": 6.51923303140807e-05, |
|
"loss": 2.7517, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 18.87, |
|
"grad_norm": 0.28221696615219116, |
|
"learning_rate": 6.225149982355017e-05, |
|
"loss": 2.7564, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 18.92, |
|
"grad_norm": 0.29943257570266724, |
|
"learning_rate": 5.931361016351018e-05, |
|
"loss": 2.7532, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 18.98, |
|
"grad_norm": 0.31146806478500366, |
|
"learning_rate": 5.637277967297965e-05, |
|
"loss": 2.7517, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.41124107255441245, |
|
"eval_loss": 3.40229868888855, |
|
"eval_runtime": 154.6135, |
|
"eval_samples_per_second": 374.618, |
|
"eval_steps_per_second": 5.86, |
|
"step": 353438 |
|
}, |
|
{ |
|
"epoch": 19.03, |
|
"grad_norm": 0.2778169512748718, |
|
"learning_rate": 5.343194918244913e-05, |
|
"loss": 2.7402, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 19.08, |
|
"grad_norm": 0.27510160207748413, |
|
"learning_rate": 5.04911186919186e-05, |
|
"loss": 2.733, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 19.14, |
|
"grad_norm": 0.29721584916114807, |
|
"learning_rate": 4.75532290318786e-05, |
|
"loss": 2.7334, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 19.19, |
|
"grad_norm": 0.3098326623439789, |
|
"learning_rate": 4.4612398541348075e-05, |
|
"loss": 2.7346, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.3339441418647766, |
|
"learning_rate": 4.1671568050817554e-05, |
|
"loss": 2.7324, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 19.3, |
|
"grad_norm": 0.29926493763923645, |
|
"learning_rate": 3.873367839077756e-05, |
|
"loss": 2.7342, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 19.35, |
|
"grad_norm": 0.32908961176872253, |
|
"learning_rate": 3.579578873073756e-05, |
|
"loss": 2.7342, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 19.41, |
|
"grad_norm": 0.3179979622364044, |
|
"learning_rate": 3.285495824020703e-05, |
|
"loss": 2.7333, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 19.46, |
|
"grad_norm": 0.34039121866226196, |
|
"learning_rate": 2.9914127749676508e-05, |
|
"loss": 2.7342, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 19.51, |
|
"grad_norm": 0.29654234647750854, |
|
"learning_rate": 2.6973297259145983e-05, |
|
"loss": 2.7365, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 19.57, |
|
"grad_norm": 0.26721277832984924, |
|
"learning_rate": 2.403540759910599e-05, |
|
"loss": 2.7342, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 19.62, |
|
"grad_norm": 0.33862730860710144, |
|
"learning_rate": 2.109457710857546e-05, |
|
"loss": 2.7342, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 19.68, |
|
"grad_norm": 0.35417601466178894, |
|
"learning_rate": 1.8153746618044937e-05, |
|
"loss": 2.7328, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 19.73, |
|
"grad_norm": 0.2858126163482666, |
|
"learning_rate": 1.5215856958004942e-05, |
|
"loss": 2.7381, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 19.78, |
|
"grad_norm": 0.28896602988243103, |
|
"learning_rate": 1.2275026467474415e-05, |
|
"loss": 2.7363, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 19.84, |
|
"grad_norm": 0.28532859683036804, |
|
"learning_rate": 9.334195976943889e-06, |
|
"loss": 2.7336, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 19.89, |
|
"grad_norm": 0.28909268975257874, |
|
"learning_rate": 6.393365486413364e-06, |
|
"loss": 2.7353, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 19.94, |
|
"grad_norm": 0.32802271842956543, |
|
"learning_rate": 3.455475826373368e-06, |
|
"loss": 2.7323, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.28372275829315186, |
|
"learning_rate": 5.146453358428421e-07, |
|
"loss": 2.7342, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.4110677518157195, |
|
"eval_loss": 3.411550760269165, |
|
"eval_runtime": 155.3905, |
|
"eval_samples_per_second": 372.745, |
|
"eval_steps_per_second": 5.83, |
|
"step": 372040 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 372040, |
|
"total_flos": 1.56748665397248e+18, |
|
"train_loss": 3.012804690259411, |
|
"train_runtime": 81277.7938, |
|
"train_samples_per_second": 146.472, |
|
"train_steps_per_second": 4.577 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 372040, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 5000, |
|
"total_flos": 1.56748665397248e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|