{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1473, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020366598778004074, "grad_norm": 7.34504383318826, "learning_rate": 6.756756756756758e-07, "loss": 0.5197, "step": 10 }, { "epoch": 0.04073319755600815, "grad_norm": 1.9502076909187314, "learning_rate": 1.3513513513513515e-06, "loss": 0.4425, "step": 20 }, { "epoch": 0.06109979633401222, "grad_norm": 1.3703382365758696, "learning_rate": 2.0270270270270273e-06, "loss": 0.3956, "step": 30 }, { "epoch": 0.0814663951120163, "grad_norm": 1.5274033164070318, "learning_rate": 2.702702702702703e-06, "loss": 0.3706, "step": 40 }, { "epoch": 0.10183299389002037, "grad_norm": 1.5387129397990396, "learning_rate": 3.3783783783783788e-06, "loss": 0.3558, "step": 50 }, { "epoch": 0.12219959266802444, "grad_norm": 1.5285386592148773, "learning_rate": 4.0540540540540545e-06, "loss": 0.3439, "step": 60 }, { "epoch": 0.1425661914460285, "grad_norm": 1.4343248038161889, "learning_rate": 4.72972972972973e-06, "loss": 0.3351, "step": 70 }, { "epoch": 0.1629327902240326, "grad_norm": 1.3758371691545166, "learning_rate": 4.999795773182583e-06, "loss": 0.3273, "step": 80 }, { "epoch": 0.18329938900203666, "grad_norm": 1.5978694880041386, "learning_rate": 4.998547854667808e-06, "loss": 0.3217, "step": 90 }, { "epoch": 0.20366598778004075, "grad_norm": 1.3851731195685373, "learning_rate": 4.9961661054815474e-06, "loss": 0.322, "step": 100 }, { "epoch": 0.2240325865580448, "grad_norm": 1.3521127973831295, "learning_rate": 4.992651726621268e-06, "loss": 0.3146, "step": 110 }, { "epoch": 0.24439918533604887, "grad_norm": 1.3516467751032153, "learning_rate": 4.988006490213162e-06, "loss": 0.3143, "step": 120 }, { "epoch": 0.26476578411405294, "grad_norm": 1.2701037565844242, "learning_rate": 4.9822327386185436e-06, "loss": 0.3077, "step": 130 }, { "epoch": 0.285132382892057, "grad_norm": 1.6979740720879781, "learning_rate": 4.975333383252724e-06, "loss": 0.307, "step": 140 }, { "epoch": 0.3054989816700611, "grad_norm": 1.3490436638938983, "learning_rate": 4.967311903116924e-06, "loss": 0.2987, "step": 150 }, { "epoch": 0.3258655804480652, "grad_norm": 1.7161350768628911, "learning_rate": 4.958172343043982e-06, "loss": 0.2999, "step": 160 }, { "epoch": 0.34623217922606925, "grad_norm": 1.382612250934794, "learning_rate": 4.947919311658756e-06, "loss": 0.2991, "step": 170 }, { "epoch": 0.3665987780040733, "grad_norm": 1.3151580723383025, "learning_rate": 4.936557979054219e-06, "loss": 0.2949, "step": 180 }, { "epoch": 0.3869653767820774, "grad_norm": 1.3980284114864507, "learning_rate": 4.924094074184436e-06, "loss": 0.2931, "step": 190 }, { "epoch": 0.4073319755600815, "grad_norm": 1.4995260622729383, "learning_rate": 4.910533881975742e-06, "loss": 0.2897, "step": 200 }, { "epoch": 0.42769857433808556, "grad_norm": 1.3091750919874185, "learning_rate": 4.895884240157568e-06, "loss": 0.2897, "step": 210 }, { "epoch": 0.4480651731160896, "grad_norm": 1.4747628264164012, "learning_rate": 4.880152535814507e-06, "loss": 0.2868, "step": 220 }, { "epoch": 0.4684317718940937, "grad_norm": 1.4470008213149266, "learning_rate": 4.863346701661384e-06, "loss": 0.2874, "step": 230 }, { "epoch": 0.48879837067209775, "grad_norm": 1.4515164854269305, "learning_rate": 4.845475212043178e-06, "loss": 0.2878, "step": 240 }, { "epoch": 0.5091649694501018, "grad_norm": 1.2578053743674917, "learning_rate": 4.8265470786618345e-06, "loss": 0.2811, "step": 250 }, { "epoch": 0.5295315682281059, "grad_norm": 1.5098983649320945, "learning_rate": 4.806571846032109e-06, "loss": 0.2773, "step": 260 }, { "epoch": 0.5498981670061099, "grad_norm": 1.260736734336587, "learning_rate": 4.785559586668751e-06, "loss": 0.28, "step": 270 }, { "epoch": 0.570264765784114, "grad_norm": 1.1734721692297405, "learning_rate": 4.763520896007427e-06, "loss": 0.2825, "step": 280 }, { "epoch": 0.5906313645621182, "grad_norm": 1.1031919455840085, "learning_rate": 4.74046688706198e-06, "loss": 0.2759, "step": 290 }, { "epoch": 0.6109979633401222, "grad_norm": 1.0944761517106407, "learning_rate": 4.716409184820684e-06, "loss": 0.2766, "step": 300 }, { "epoch": 0.6313645621181263, "grad_norm": 1.23087624035179, "learning_rate": 4.691359920384341e-06, "loss": 0.2777, "step": 310 }, { "epoch": 0.6517311608961304, "grad_norm": 1.1545647683752378, "learning_rate": 4.6653317248491754e-06, "loss": 0.2752, "step": 320 }, { "epoch": 0.6720977596741344, "grad_norm": 1.2931986965229072, "learning_rate": 4.638337722937599e-06, "loss": 0.2746, "step": 330 }, { "epoch": 0.6924643584521385, "grad_norm": 1.1273344784116983, "learning_rate": 4.610391526380066e-06, "loss": 0.2688, "step": 340 }, { "epoch": 0.7128309572301426, "grad_norm": 1.415362493265054, "learning_rate": 4.581507227051359e-06, "loss": 0.2703, "step": 350 }, { "epoch": 0.7331975560081466, "grad_norm": 1.1591318242459208, "learning_rate": 4.551699389864756e-06, "loss": 0.2665, "step": 360 }, { "epoch": 0.7535641547861507, "grad_norm": 1.0554558469893176, "learning_rate": 4.520983045427667e-06, "loss": 0.2693, "step": 370 }, { "epoch": 0.7739307535641547, "grad_norm": 1.2265072577256237, "learning_rate": 4.489373682462457e-06, "loss": 0.2663, "step": 380 }, { "epoch": 0.7942973523421588, "grad_norm": 1.2855575031097177, "learning_rate": 4.456887239996245e-06, "loss": 0.2679, "step": 390 }, { "epoch": 0.814663951120163, "grad_norm": 1.081374466408974, "learning_rate": 4.423540099323651e-06, "loss": 0.2661, "step": 400 }, { "epoch": 0.835030549898167, "grad_norm": 1.1990325052929929, "learning_rate": 4.3893490757465264e-06, "loss": 0.2648, "step": 410 }, { "epoch": 0.8553971486761711, "grad_norm": 1.2726034950554865, "learning_rate": 4.354331410094831e-06, "loss": 0.2659, "step": 420 }, { "epoch": 0.8757637474541752, "grad_norm": 1.1644285218921815, "learning_rate": 4.318504760032946e-06, "loss": 0.2613, "step": 430 }, { "epoch": 0.8961303462321792, "grad_norm": 1.236076922309445, "learning_rate": 4.281887191155788e-06, "loss": 0.2636, "step": 440 }, { "epoch": 0.9164969450101833, "grad_norm": 1.1721589750974497, "learning_rate": 4.244497167879234e-06, "loss": 0.261, "step": 450 }, { "epoch": 0.9368635437881874, "grad_norm": 1.085539973119722, "learning_rate": 4.206353544129437e-06, "loss": 0.2632, "step": 460 }, { "epoch": 0.9572301425661914, "grad_norm": 1.1393530942547674, "learning_rate": 4.1674755538357234e-06, "loss": 0.2593, "step": 470 }, { "epoch": 0.9775967413441955, "grad_norm": 1.144980396038313, "learning_rate": 4.12788280123189e-06, "loss": 0.2582, "step": 480 }, { "epoch": 0.9979633401221996, "grad_norm": 1.1413918380162131, "learning_rate": 4.087595250970756e-06, "loss": 0.2555, "step": 490 }, { "epoch": 1.0, "eval_loss": 0.2573759853839874, "eval_runtime": 44.646, "eval_samples_per_second": 296.13, "eval_steps_per_second": 1.165, "step": 491 }, { "epoch": 1.0183299389002036, "grad_norm": 1.5557280500135056, "learning_rate": 4.0466332180569904e-06, "loss": 0.1936, "step": 500 }, { "epoch": 1.0386965376782078, "grad_norm": 1.3238755177486623, "learning_rate": 4.0050173576032644e-06, "loss": 0.1814, "step": 510 }, { "epoch": 1.0590631364562118, "grad_norm": 1.2066120698543659, "learning_rate": 3.9627686544149085e-06, "loss": 0.181, "step": 520 }, { "epoch": 1.079429735234216, "grad_norm": 1.088083679916152, "learning_rate": 3.919908412408315e-06, "loss": 0.1799, "step": 530 }, { "epoch": 1.0997963340122199, "grad_norm": 1.123343825207487, "learning_rate": 3.876458243868441e-06, "loss": 0.1813, "step": 540 }, { "epoch": 1.120162932790224, "grad_norm": 1.3201033277961134, "learning_rate": 3.832440058550809e-06, "loss": 0.1815, "step": 550 }, { "epoch": 1.140529531568228, "grad_norm": 1.278395529927195, "learning_rate": 3.7878760526335024e-06, "loss": 0.1781, "step": 560 }, { "epoch": 1.1608961303462322, "grad_norm": 1.11604395104044, "learning_rate": 3.742788697524746e-06, "loss": 0.1773, "step": 570 }, { "epoch": 1.1812627291242364, "grad_norm": 1.1089540254153334, "learning_rate": 3.6972007285316907e-06, "loss": 0.1806, "step": 580 }, { "epoch": 1.2016293279022403, "grad_norm": 1.1472218678631292, "learning_rate": 3.6511351333961297e-06, "loss": 0.1795, "step": 590 }, { "epoch": 1.2219959266802445, "grad_norm": 1.0597976659827861, "learning_rate": 3.6046151407029294e-06, "loss": 0.1815, "step": 600 }, { "epoch": 1.2423625254582484, "grad_norm": 1.268227483080295, "learning_rate": 3.557664208167006e-06, "loss": 0.182, "step": 610 }, { "epoch": 1.2627291242362526, "grad_norm": 1.3674823513528875, "learning_rate": 3.5103060108047724e-06, "loss": 0.181, "step": 620 }, { "epoch": 1.2830957230142566, "grad_norm": 1.2651346876174, "learning_rate": 3.462564428996006e-06, "loss": 0.1813, "step": 630 }, { "epoch": 1.3034623217922607, "grad_norm": 1.1472859968056397, "learning_rate": 3.414463536442161e-06, "loss": 0.1794, "step": 640 }, { "epoch": 1.3238289205702647, "grad_norm": 1.2499659583847704, "learning_rate": 3.3660275880272014e-06, "loss": 0.1792, "step": 650 }, { "epoch": 1.3441955193482689, "grad_norm": 1.1361447212288704, "learning_rate": 3.3172810075870724e-06, "loss": 0.1791, "step": 660 }, { "epoch": 1.364562118126273, "grad_norm": 1.0924410851905066, "learning_rate": 3.2682483755939735e-06, "loss": 0.1798, "step": 670 }, { "epoch": 1.384928716904277, "grad_norm": 1.1623338702342625, "learning_rate": 3.2189544167616544e-06, "loss": 0.1796, "step": 680 }, { "epoch": 1.405295315682281, "grad_norm": 1.3102412387590796, "learning_rate": 3.1694239875779708e-06, "loss": 0.1758, "step": 690 }, { "epoch": 1.4256619144602851, "grad_norm": 1.179561417494925, "learning_rate": 3.1196820637709976e-06, "loss": 0.1782, "step": 700 }, { "epoch": 1.4460285132382893, "grad_norm": 1.1636897041976815, "learning_rate": 3.0697537277150108e-06, "loss": 0.1796, "step": 710 }, { "epoch": 1.4663951120162932, "grad_norm": 1.0835611859399772, "learning_rate": 3.0196641557826993e-06, "loss": 0.181, "step": 720 }, { "epoch": 1.4867617107942974, "grad_norm": 1.1641341412000015, "learning_rate": 2.969438605649973e-06, "loss": 0.1784, "step": 730 }, { "epoch": 1.5071283095723014, "grad_norm": 1.4309684703117318, "learning_rate": 2.9191024035597715e-06, "loss": 0.1785, "step": 740 }, { "epoch": 1.5274949083503055, "grad_norm": 1.1211318477711534, "learning_rate": 2.868680931551307e-06, "loss": 0.1785, "step": 750 }, { "epoch": 1.5478615071283097, "grad_norm": 1.0844733637132042, "learning_rate": 2.818199614661163e-06, "loss": 0.1784, "step": 760 }, { "epoch": 1.5682281059063137, "grad_norm": 1.0733420166347605, "learning_rate": 2.7676839081027153e-06, "loss": 0.177, "step": 770 }, { "epoch": 1.5885947046843176, "grad_norm": 1.0406238069755496, "learning_rate": 2.7171592844303377e-06, "loss": 0.177, "step": 780 }, { "epoch": 1.6089613034623218, "grad_norm": 1.0173564043856271, "learning_rate": 2.6666512206948574e-06, "loss": 0.1776, "step": 790 }, { "epoch": 1.629327902240326, "grad_norm": 1.098211071807489, "learning_rate": 2.6161851855967546e-06, "loss": 0.1766, "step": 800 }, { "epoch": 1.64969450101833, "grad_norm": 1.23874084216799, "learning_rate": 2.565786626643557e-06, "loss": 0.1805, "step": 810 }, { "epoch": 1.6700610997963339, "grad_norm": 1.0816973984606886, "learning_rate": 2.5154809573179374e-06, "loss": 0.1797, "step": 820 }, { "epoch": 1.690427698574338, "grad_norm": 1.0857968496669568, "learning_rate": 2.465293544262953e-06, "loss": 0.1767, "step": 830 }, { "epoch": 1.7107942973523422, "grad_norm": 1.0614368562971397, "learning_rate": 2.4152496944909185e-06, "loss": 0.1747, "step": 840 }, { "epoch": 1.7311608961303462, "grad_norm": 1.1079176704743572, "learning_rate": 2.365374642622334e-06, "loss": 0.1749, "step": 850 }, { "epoch": 1.7515274949083504, "grad_norm": 1.149185145242321, "learning_rate": 2.315693538161327e-06, "loss": 0.1746, "step": 860 }, { "epoch": 1.7718940936863543, "grad_norm": 1.1364076771423424, "learning_rate": 2.2662314328140077e-06, "loss": 0.1757, "step": 870 }, { "epoch": 1.7922606924643585, "grad_norm": 1.0481149270348264, "learning_rate": 2.217013267856143e-06, "loss": 0.175, "step": 880 }, { "epoch": 1.8126272912423627, "grad_norm": 1.121398301379226, "learning_rate": 2.168063861556515e-06, "loss": 0.1739, "step": 890 }, { "epoch": 1.8329938900203666, "grad_norm": 1.1627216220108434, "learning_rate": 2.1194078966623004e-06, "loss": 0.1739, "step": 900 }, { "epoch": 1.8533604887983706, "grad_norm": 1.0423836344485042, "learning_rate": 2.0710699079527936e-06, "loss": 0.1737, "step": 910 }, { "epoch": 1.8737270875763747, "grad_norm": 1.1321126420156151, "learning_rate": 2.0230742698677407e-06, "loss": 0.1752, "step": 920 }, { "epoch": 1.894093686354379, "grad_norm": 1.0982860659977216, "learning_rate": 1.9754451842165187e-06, "loss": 0.1732, "step": 930 }, { "epoch": 1.9144602851323829, "grad_norm": 1.0607568085093442, "learning_rate": 1.9282066679743756e-06, "loss": 0.1772, "step": 940 }, { "epoch": 1.9348268839103868, "grad_norm": 1.1195338613324972, "learning_rate": 1.8813825411718608e-06, "loss": 0.1739, "step": 950 }, { "epoch": 1.955193482688391, "grad_norm": 1.2105165500055401, "learning_rate": 1.834996414883573e-06, "loss": 0.1745, "step": 960 }, { "epoch": 1.9755600814663952, "grad_norm": 1.0195074211250463, "learning_rate": 1.789071679322268e-06, "loss": 0.1726, "step": 970 }, { "epoch": 1.9959266802443993, "grad_norm": 1.1218629665522688, "learning_rate": 1.7436314920443475e-06, "loss": 0.1732, "step": 980 }, { "epoch": 2.0, "eval_loss": 0.2461756318807602, "eval_runtime": 44.0297, "eval_samples_per_second": 300.275, "eval_steps_per_second": 1.181, "step": 982 }, { "epoch": 2.016293279022403, "grad_norm": 1.3922840302017268, "learning_rate": 1.6986987662726467e-06, "loss": 0.1202, "step": 990 }, { "epoch": 2.0366598778004072, "grad_norm": 1.2614971724968627, "learning_rate": 1.6542961593424448e-06, "loss": 0.1048, "step": 1000 }, { "epoch": 2.0570264765784114, "grad_norm": 1.1248418232642112, "learning_rate": 1.6104460612764963e-06, "loss": 0.1039, "step": 1010 }, { "epoch": 2.0773930753564156, "grad_norm": 1.1221931057869212, "learning_rate": 1.5671705834948596e-06, "loss": 0.1052, "step": 1020 }, { "epoch": 2.0977596741344193, "grad_norm": 1.3233889385769144, "learning_rate": 1.5244915476652073e-06, "loss": 0.1045, "step": 1030 }, { "epoch": 2.1181262729124235, "grad_norm": 1.219740632707147, "learning_rate": 1.4824304746992488e-06, "loss": 0.1045, "step": 1040 }, { "epoch": 2.1384928716904277, "grad_norm": 1.1317738274870492, "learning_rate": 1.441008573900804e-06, "loss": 0.1038, "step": 1050 }, { "epoch": 2.158859470468432, "grad_norm": 1.1214324601378942, "learning_rate": 1.4002467322710097e-06, "loss": 0.1041, "step": 1060 }, { "epoch": 2.179226069246436, "grad_norm": 1.113920354990809, "learning_rate": 1.3601655039760452e-06, "loss": 0.1051, "step": 1070 }, { "epoch": 2.1995926680244398, "grad_norm": 1.1594270616003397, "learning_rate": 1.3207850999826893e-06, "loss": 0.1048, "step": 1080 }, { "epoch": 2.219959266802444, "grad_norm": 1.026595442656351, "learning_rate": 1.2821253778669374e-06, "loss": 0.104, "step": 1090 }, { "epoch": 2.240325865580448, "grad_norm": 1.0868748314482166, "learning_rate": 1.2442058318008143e-06, "loss": 0.1045, "step": 1100 }, { "epoch": 2.2606924643584523, "grad_norm": 1.1163655110568669, "learning_rate": 1.2070455827224361e-06, "loss": 0.1028, "step": 1110 }, { "epoch": 2.281059063136456, "grad_norm": 1.1265872495516769, "learning_rate": 1.170663368694271e-06, "loss": 0.1049, "step": 1120 }, { "epoch": 2.30142566191446, "grad_norm": 1.0787109584417223, "learning_rate": 1.1350775354544707e-06, "loss": 0.104, "step": 1130 }, { "epoch": 2.3217922606924644, "grad_norm": 1.2647428474671398, "learning_rate": 1.1003060271660259e-06, "loss": 0.104, "step": 1140 }, { "epoch": 2.3421588594704685, "grad_norm": 1.1925610582272053, "learning_rate": 1.06636637736842e-06, "loss": 0.1042, "step": 1150 }, { "epoch": 2.3625254582484727, "grad_norm": 1.0856982970751745, "learning_rate": 1.0332757001363402e-06, "loss": 0.1043, "step": 1160 }, { "epoch": 2.3828920570264764, "grad_norm": 1.1357554208144642, "learning_rate": 1.0010506814498994e-06, "loss": 0.1031, "step": 1170 }, { "epoch": 2.4032586558044806, "grad_norm": 1.185189241987044, "learning_rate": 9.697075707807282e-07, "loss": 0.1041, "step": 1180 }, { "epoch": 2.423625254582485, "grad_norm": 1.134389319376477, "learning_rate": 9.392621728981774e-07, "loss": 0.1045, "step": 1190 }, { "epoch": 2.443991853360489, "grad_norm": 1.1120088243562505, "learning_rate": 9.097298398997584e-07, "loss": 0.1027, "step": 1200 }, { "epoch": 2.4643584521384927, "grad_norm": 1.1115040698718408, "learning_rate": 8.811254634698468e-07, "loss": 0.105, "step": 1210 }, { "epoch": 2.484725050916497, "grad_norm": 1.0851503339063027, "learning_rate": 8.534634673705461e-07, "loss": 0.1038, "step": 1220 }, { "epoch": 2.505091649694501, "grad_norm": 1.1183787883183298, "learning_rate": 8.267578001685048e-07, "loss": 0.102, "step": 1230 }, { "epoch": 2.525458248472505, "grad_norm": 1.1421505006564803, "learning_rate": 8.010219282013471e-07, "loss": 0.1047, "step": 1240 }, { "epoch": 2.5458248472505094, "grad_norm": 1.1411521931249191, "learning_rate": 7.76268828787271e-07, "loss": 0.1037, "step": 1250 }, { "epoch": 2.566191446028513, "grad_norm": 1.0934492001333824, "learning_rate": 7.525109836812336e-07, "loss": 0.1028, "step": 1260 }, { "epoch": 2.5865580448065173, "grad_norm": 1.098352714811253, "learning_rate": 7.297603727810222e-07, "loss": 0.1034, "step": 1270 }, { "epoch": 2.6069246435845215, "grad_norm": 1.1012241466696564, "learning_rate": 7.0802846808639e-07, "loss": 0.1039, "step": 1280 }, { "epoch": 2.627291242362525, "grad_norm": 1.082718927412021, "learning_rate": 6.873262279142977e-07, "loss": 0.1026, "step": 1290 }, { "epoch": 2.6476578411405294, "grad_norm": 1.0913895786684868, "learning_rate": 6.676640913731799e-07, "loss": 0.1026, "step": 1300 }, { "epoch": 2.6680244399185336, "grad_norm": 1.114449983311916, "learning_rate": 6.490519730990235e-07, "loss": 0.1036, "step": 1310 }, { "epoch": 2.6883910386965377, "grad_norm": 1.0981797148792212, "learning_rate": 6.314992582559093e-07, "loss": 0.1044, "step": 1320 }, { "epoch": 2.708757637474542, "grad_norm": 1.1031210331032415, "learning_rate": 6.150147978035423e-07, "loss": 0.1028, "step": 1330 }, { "epoch": 2.729124236252546, "grad_norm": 1.0231804838279508, "learning_rate": 5.996069040341544e-07, "loss": 0.1007, "step": 1340 }, { "epoch": 2.74949083503055, "grad_norm": 1.1134674052519227, "learning_rate": 5.852833463810277e-07, "loss": 0.1032, "step": 1350 }, { "epoch": 2.769857433808554, "grad_norm": 1.0615744074260045, "learning_rate": 5.720513475007569e-07, "loss": 0.1042, "step": 1360 }, { "epoch": 2.790224032586558, "grad_norm": 1.0551133440758214, "learning_rate": 5.599175796312243e-07, "loss": 0.1037, "step": 1370 }, { "epoch": 2.810590631364562, "grad_norm": 1.085953698001666, "learning_rate": 5.488881612271214e-07, "loss": 0.1027, "step": 1380 }, { "epoch": 2.830957230142566, "grad_norm": 1.069753873055028, "learning_rate": 5.389686538747183e-07, "loss": 0.1036, "step": 1390 }, { "epoch": 2.8513238289205702, "grad_norm": 1.1482168307825666, "learning_rate": 5.301640594874314e-07, "loss": 0.1035, "step": 1400 }, { "epoch": 2.8716904276985744, "grad_norm": 1.0724177471486462, "learning_rate": 5.224788177836086e-07, "loss": 0.1028, "step": 1410 }, { "epoch": 2.8920570264765786, "grad_norm": 1.0644407890849343, "learning_rate": 5.159168040478003e-07, "loss": 0.1036, "step": 1420 }, { "epoch": 2.9124236252545828, "grad_norm": 1.1122712461112416, "learning_rate": 5.104813271766455e-07, "loss": 0.1011, "step": 1430 }, { "epoch": 2.9327902240325865, "grad_norm": 1.0864458509689723, "learning_rate": 5.061751280103621e-07, "loss": 0.1027, "step": 1440 }, { "epoch": 2.9531568228105907, "grad_norm": 1.0960544633048377, "learning_rate": 5.030003779506757e-07, "loss": 0.1016, "step": 1450 }, { "epoch": 2.973523421588595, "grad_norm": 1.0723267354322656, "learning_rate": 5.009586778658912e-07, "loss": 0.1012, "step": 1460 }, { "epoch": 2.9938900203665986, "grad_norm": 1.0958889614425518, "learning_rate": 5.000510572836534e-07, "loss": 0.1013, "step": 1470 }, { "epoch": 3.0, "eval_loss": 0.2728930115699768, "eval_runtime": 45.1101, "eval_samples_per_second": 293.083, "eval_steps_per_second": 1.153, "step": 1473 }, { "epoch": 3.0, "step": 1473, "total_flos": 2467329862533120.0, "train_loss": 0.19335093406374892, "train_runtime": 8782.5965, "train_samples_per_second": 85.801, "train_steps_per_second": 0.168 } ], "logging_steps": 10, "max_steps": 1473, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2467329862533120.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }