{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9920779176240612, "eval_steps": 500, "global_step": 226, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004389725299221509, "grad_norm": 0.4991928040981293, "learning_rate": 2.0000000000000003e-06, "loss": 2.1881, "step": 1 }, { "epoch": 0.008779450598443019, "grad_norm": 0.5059462785720825, "learning_rate": 4.000000000000001e-06, "loss": 2.1777, "step": 2 }, { "epoch": 0.01316917589766453, "grad_norm": 0.46635618805885315, "learning_rate": 6e-06, "loss": 2.1398, "step": 3 }, { "epoch": 0.017558901196886037, "grad_norm": 0.5066556930541992, "learning_rate": 8.000000000000001e-06, "loss": 2.1882, "step": 4 }, { "epoch": 0.02194862649610755, "grad_norm": 0.48363861441612244, "learning_rate": 1e-05, "loss": 2.14, "step": 5 }, { "epoch": 0.02633835179532906, "grad_norm": 0.4686345160007477, "learning_rate": 1.2e-05, "loss": 2.1519, "step": 6 }, { "epoch": 0.030728077094550568, "grad_norm": 0.4926372766494751, "learning_rate": 1.4000000000000001e-05, "loss": 2.2138, "step": 7 }, { "epoch": 0.035117802393772074, "grad_norm": 0.4613856375217438, "learning_rate": 1.6000000000000003e-05, "loss": 2.1158, "step": 8 }, { "epoch": 0.03950752769299359, "grad_norm": 0.45599788427352905, "learning_rate": 1.8e-05, "loss": 2.1337, "step": 9 }, { "epoch": 0.0438972529922151, "grad_norm": 0.44868260622024536, "learning_rate": 2e-05, "loss": 2.1684, "step": 10 }, { "epoch": 0.048286978291436605, "grad_norm": 0.41381940245628357, "learning_rate": 2.2000000000000003e-05, "loss": 2.1257, "step": 11 }, { "epoch": 0.05267670359065812, "grad_norm": 0.3904091417789459, "learning_rate": 2.4e-05, "loss": 2.0883, "step": 12 }, { "epoch": 0.057066428889879624, "grad_norm": 0.3864079415798187, "learning_rate": 2.6000000000000002e-05, "loss": 2.0833, "step": 13 }, { "epoch": 0.061456154189101136, "grad_norm": 0.4045635163784027, "learning_rate": 2.8000000000000003e-05, "loss": 2.0999, "step": 14 }, { "epoch": 0.06584587948832264, "grad_norm": 0.3710981011390686, "learning_rate": 3e-05, "loss": 2.064, "step": 15 }, { "epoch": 0.07023560478754415, "grad_norm": 0.37955793738365173, "learning_rate": 3.2000000000000005e-05, "loss": 2.0859, "step": 16 }, { "epoch": 0.07462533008676567, "grad_norm": 0.3688313961029053, "learning_rate": 3.4000000000000007e-05, "loss": 2.0448, "step": 17 }, { "epoch": 0.07901505538598717, "grad_norm": 0.36689838767051697, "learning_rate": 3.6e-05, "loss": 2.0175, "step": 18 }, { "epoch": 0.08340478068520868, "grad_norm": 0.3566659390926361, "learning_rate": 3.8e-05, "loss": 2.0004, "step": 19 }, { "epoch": 0.0877945059844302, "grad_norm": 0.3700122535228729, "learning_rate": 4e-05, "loss": 1.9991, "step": 20 }, { "epoch": 0.0921842312836517, "grad_norm": 0.39829716086387634, "learning_rate": 4.2e-05, "loss": 1.9925, "step": 21 }, { "epoch": 0.09657395658287321, "grad_norm": 0.3828795552253723, "learning_rate": 4.4000000000000006e-05, "loss": 1.9868, "step": 22 }, { "epoch": 0.10096368188209472, "grad_norm": 0.3811214566230774, "learning_rate": 4.600000000000001e-05, "loss": 1.9273, "step": 23 }, { "epoch": 0.10535340718131624, "grad_norm": 0.39460840821266174, "learning_rate": 4.8e-05, "loss": 1.8869, "step": 24 }, { "epoch": 0.10974313248053774, "grad_norm": 0.4100440740585327, "learning_rate": 5e-05, "loss": 1.8803, "step": 25 }, { "epoch": 0.11413285777975925, "grad_norm": 0.41026589274406433, "learning_rate": 5.2000000000000004e-05, "loss": 1.8366, "step": 26 }, { "epoch": 0.11852258307898077, "grad_norm": 0.4242023229598999, "learning_rate": 5.4000000000000005e-05, "loss": 1.7924, "step": 27 }, { "epoch": 0.12291230837820227, "grad_norm": 0.44090697169303894, "learning_rate": 5.6000000000000006e-05, "loss": 1.7925, "step": 28 }, { "epoch": 0.1273020336774238, "grad_norm": 0.40956804156303406, "learning_rate": 5.8e-05, "loss": 1.7417, "step": 29 }, { "epoch": 0.13169175897664528, "grad_norm": 0.44362565875053406, "learning_rate": 6e-05, "loss": 1.7549, "step": 30 }, { "epoch": 0.1360814842758668, "grad_norm": 0.49523505568504333, "learning_rate": 6.2e-05, "loss": 1.724, "step": 31 }, { "epoch": 0.1404712095750883, "grad_norm": 0.5311077833175659, "learning_rate": 6.400000000000001e-05, "loss": 1.6494, "step": 32 }, { "epoch": 0.14486093487430982, "grad_norm": 0.48909229040145874, "learning_rate": 6.6e-05, "loss": 1.6347, "step": 33 }, { "epoch": 0.14925066017353134, "grad_norm": 0.4973181486129761, "learning_rate": 6.800000000000001e-05, "loss": 1.5957, "step": 34 }, { "epoch": 0.15364038547275283, "grad_norm": 0.48119479417800903, "learning_rate": 7e-05, "loss": 1.5704, "step": 35 }, { "epoch": 0.15803011077197435, "grad_norm": 0.4916652739048004, "learning_rate": 7.2e-05, "loss": 1.5255, "step": 36 }, { "epoch": 0.16241983607119587, "grad_norm": 0.5174223780632019, "learning_rate": 7.4e-05, "loss": 1.4716, "step": 37 }, { "epoch": 0.16680956137041736, "grad_norm": 0.3963719606399536, "learning_rate": 7.6e-05, "loss": 1.4648, "step": 38 }, { "epoch": 0.17119928666963888, "grad_norm": 0.38646385073661804, "learning_rate": 7.800000000000001e-05, "loss": 1.3944, "step": 39 }, { "epoch": 0.1755890119688604, "grad_norm": 0.4066337049007416, "learning_rate": 8e-05, "loss": 1.4056, "step": 40 }, { "epoch": 0.1799787372680819, "grad_norm": 0.3413829803466797, "learning_rate": 8.2e-05, "loss": 1.3835, "step": 41 }, { "epoch": 0.1843684625673034, "grad_norm": 0.3751814663410187, "learning_rate": 8.4e-05, "loss": 1.4115, "step": 42 }, { "epoch": 0.1887581878665249, "grad_norm": 0.3934766352176666, "learning_rate": 8.6e-05, "loss": 1.3613, "step": 43 }, { "epoch": 0.19314791316574642, "grad_norm": 0.3946179151535034, "learning_rate": 8.800000000000001e-05, "loss": 1.3711, "step": 44 }, { "epoch": 0.19753763846496794, "grad_norm": 0.4430006742477417, "learning_rate": 9e-05, "loss": 1.3717, "step": 45 }, { "epoch": 0.20192736376418943, "grad_norm": 0.5315638184547424, "learning_rate": 9.200000000000001e-05, "loss": 1.3699, "step": 46 }, { "epoch": 0.20631708906341095, "grad_norm": 0.567436933517456, "learning_rate": 9.4e-05, "loss": 1.3513, "step": 47 }, { "epoch": 0.21070681436263247, "grad_norm": 0.642360270023346, "learning_rate": 9.6e-05, "loss": 1.335, "step": 48 }, { "epoch": 0.21509653966185396, "grad_norm": 0.5742576718330383, "learning_rate": 9.8e-05, "loss": 1.3214, "step": 49 }, { "epoch": 0.21948626496107548, "grad_norm": 0.17982161045074463, "learning_rate": 0.0001, "loss": 1.336, "step": 50 }, { "epoch": 0.223875990260297, "grad_norm": 0.1774454414844513, "learning_rate": 0.00010200000000000001, "loss": 1.2765, "step": 51 }, { "epoch": 0.2282657155595185, "grad_norm": 0.13885167241096497, "learning_rate": 0.00010400000000000001, "loss": 1.3445, "step": 52 }, { "epoch": 0.23265544085874001, "grad_norm": 0.16647277772426605, "learning_rate": 0.00010600000000000002, "loss": 1.3369, "step": 53 }, { "epoch": 0.23704516615796153, "grad_norm": 0.12684348225593567, "learning_rate": 0.00010800000000000001, "loss": 1.3123, "step": 54 }, { "epoch": 0.24143489145718303, "grad_norm": 0.1202506348490715, "learning_rate": 0.00011000000000000002, "loss": 1.3016, "step": 55 }, { "epoch": 0.24582461675640455, "grad_norm": 0.1340794712305069, "learning_rate": 0.00011200000000000001, "loss": 1.2903, "step": 56 }, { "epoch": 0.25021434205562604, "grad_norm": 0.10907240211963654, "learning_rate": 0.00011399999999999999, "loss": 1.2978, "step": 57 }, { "epoch": 0.2546040673548476, "grad_norm": 0.11926048249006271, "learning_rate": 0.000116, "loss": 1.2768, "step": 58 }, { "epoch": 0.2589937926540691, "grad_norm": 0.12190070748329163, "learning_rate": 0.000118, "loss": 1.2916, "step": 59 }, { "epoch": 0.26338351795329057, "grad_norm": 0.11329507827758789, "learning_rate": 0.00012, "loss": 1.2934, "step": 60 }, { "epoch": 0.2677732432525121, "grad_norm": 0.10332001745700836, "learning_rate": 0.000122, "loss": 1.2937, "step": 61 }, { "epoch": 0.2721629685517336, "grad_norm": 0.11909238249063492, "learning_rate": 0.000124, "loss": 1.2638, "step": 62 }, { "epoch": 0.2765526938509551, "grad_norm": 0.10772378742694855, "learning_rate": 0.000126, "loss": 1.2764, "step": 63 }, { "epoch": 0.2809424191501766, "grad_norm": 0.10739663988351822, "learning_rate": 0.00012800000000000002, "loss": 1.2993, "step": 64 }, { "epoch": 0.28533214444939814, "grad_norm": 0.11664730310440063, "learning_rate": 0.00013000000000000002, "loss": 1.2805, "step": 65 }, { "epoch": 0.28972186974861963, "grad_norm": 0.10890611261129379, "learning_rate": 0.000132, "loss": 1.2912, "step": 66 }, { "epoch": 0.2941115950478411, "grad_norm": 0.09589364379644394, "learning_rate": 0.000134, "loss": 1.2637, "step": 67 }, { "epoch": 0.29850132034706267, "grad_norm": 0.10450520366430283, "learning_rate": 0.00013600000000000003, "loss": 1.259, "step": 68 }, { "epoch": 0.30289104564628416, "grad_norm": 0.09578921645879745, "learning_rate": 0.000138, "loss": 1.2497, "step": 69 }, { "epoch": 0.30728077094550565, "grad_norm": 0.09961879253387451, "learning_rate": 0.00014, "loss": 1.2375, "step": 70 }, { "epoch": 0.3116704962447272, "grad_norm": 0.1068640947341919, "learning_rate": 0.000142, "loss": 1.2567, "step": 71 }, { "epoch": 0.3160602215439487, "grad_norm": 0.09834500402212143, "learning_rate": 0.000144, "loss": 1.2582, "step": 72 }, { "epoch": 0.3204499468431702, "grad_norm": 0.10703941434621811, "learning_rate": 0.000146, "loss": 1.2936, "step": 73 }, { "epoch": 0.32483967214239173, "grad_norm": 0.10532184690237045, "learning_rate": 0.000148, "loss": 1.269, "step": 74 }, { "epoch": 0.3292293974416132, "grad_norm": 0.09534098953008652, "learning_rate": 0.00015000000000000001, "loss": 1.2508, "step": 75 }, { "epoch": 0.3336191227408347, "grad_norm": 0.0927763432264328, "learning_rate": 0.000152, "loss": 1.2742, "step": 76 }, { "epoch": 0.33800884804005626, "grad_norm": 0.10629042237997055, "learning_rate": 0.000154, "loss": 1.2142, "step": 77 }, { "epoch": 0.34239857333927776, "grad_norm": 0.0965496301651001, "learning_rate": 0.00015600000000000002, "loss": 1.2506, "step": 78 }, { "epoch": 0.34678829863849925, "grad_norm": 0.10647477954626083, "learning_rate": 0.00015800000000000002, "loss": 1.2336, "step": 79 }, { "epoch": 0.3511780239377208, "grad_norm": 0.08916866779327393, "learning_rate": 0.00016, "loss": 1.2501, "step": 80 }, { "epoch": 0.3555677492369423, "grad_norm": 0.09164993464946747, "learning_rate": 0.000162, "loss": 1.2468, "step": 81 }, { "epoch": 0.3599574745361638, "grad_norm": 0.09824755787849426, "learning_rate": 0.000164, "loss": 1.2714, "step": 82 }, { "epoch": 0.3643471998353853, "grad_norm": 0.08973097801208496, "learning_rate": 0.000166, "loss": 1.2559, "step": 83 }, { "epoch": 0.3687369251346068, "grad_norm": 0.1210639625787735, "learning_rate": 0.000168, "loss": 1.1907, "step": 84 }, { "epoch": 0.3731266504338283, "grad_norm": 0.09279964119195938, "learning_rate": 0.00017, "loss": 1.2155, "step": 85 }, { "epoch": 0.3775163757330498, "grad_norm": 0.09416475147008896, "learning_rate": 0.000172, "loss": 1.2073, "step": 86 }, { "epoch": 0.38190610103227135, "grad_norm": 0.10760964453220367, "learning_rate": 0.000174, "loss": 1.256, "step": 87 }, { "epoch": 0.38629582633149284, "grad_norm": 0.0914899930357933, "learning_rate": 0.00017600000000000002, "loss": 1.2477, "step": 88 }, { "epoch": 0.39068555163071433, "grad_norm": 0.09619202464818954, "learning_rate": 0.00017800000000000002, "loss": 1.2473, "step": 89 }, { "epoch": 0.3950752769299359, "grad_norm": 0.10301009565591812, "learning_rate": 0.00018, "loss": 1.2413, "step": 90 }, { "epoch": 0.3994650022291574, "grad_norm": 0.10199137032032013, "learning_rate": 0.000182, "loss": 1.1799, "step": 91 }, { "epoch": 0.40385472752837887, "grad_norm": 0.12453147768974304, "learning_rate": 0.00018400000000000003, "loss": 1.2512, "step": 92 }, { "epoch": 0.4082444528276004, "grad_norm": 0.10290256887674332, "learning_rate": 0.00018600000000000002, "loss": 1.1937, "step": 93 }, { "epoch": 0.4126341781268219, "grad_norm": 0.09801364690065384, "learning_rate": 0.000188, "loss": 1.1563, "step": 94 }, { "epoch": 0.4170239034260434, "grad_norm": 0.1032637357711792, "learning_rate": 0.00019, "loss": 1.2187, "step": 95 }, { "epoch": 0.42141362872526494, "grad_norm": 0.09960728138685226, "learning_rate": 0.000192, "loss": 1.212, "step": 96 }, { "epoch": 0.42580335402448644, "grad_norm": 0.10608214884996414, "learning_rate": 0.000194, "loss": 1.1876, "step": 97 }, { "epoch": 0.4301930793237079, "grad_norm": 0.10623721778392792, "learning_rate": 0.000196, "loss": 1.1874, "step": 98 }, { "epoch": 0.4345828046229295, "grad_norm": 0.11170148849487305, "learning_rate": 0.00019800000000000002, "loss": 1.2133, "step": 99 }, { "epoch": 0.43897252992215097, "grad_norm": 0.1105794757604599, "learning_rate": 0.0002, "loss": 1.2159, "step": 100 }, { "epoch": 0.44336225522137246, "grad_norm": 0.10575263202190399, "learning_rate": 0.00019842519685039372, "loss": 1.2328, "step": 101 }, { "epoch": 0.447751980520594, "grad_norm": 0.10251262784004211, "learning_rate": 0.00019685039370078743, "loss": 1.2312, "step": 102 }, { "epoch": 0.4521417058198155, "grad_norm": 0.10712327063083649, "learning_rate": 0.0001952755905511811, "loss": 1.2005, "step": 103 }, { "epoch": 0.456531431119037, "grad_norm": 0.10166290402412415, "learning_rate": 0.0001937007874015748, "loss": 1.2169, "step": 104 }, { "epoch": 0.46092115641825854, "grad_norm": 0.10501708090305328, "learning_rate": 0.0001921259842519685, "loss": 1.272, "step": 105 }, { "epoch": 0.46531088171748003, "grad_norm": 0.09489674866199493, "learning_rate": 0.0001905511811023622, "loss": 1.1831, "step": 106 }, { "epoch": 0.4697006070167015, "grad_norm": 0.11105147004127502, "learning_rate": 0.0001889763779527559, "loss": 1.1796, "step": 107 }, { "epoch": 0.47409033231592307, "grad_norm": 0.12772202491760254, "learning_rate": 0.00018740157480314962, "loss": 1.2027, "step": 108 }, { "epoch": 0.47848005761514456, "grad_norm": 0.09670021384954453, "learning_rate": 0.00018582677165354333, "loss": 1.2079, "step": 109 }, { "epoch": 0.48286978291436605, "grad_norm": 0.11082947999238968, "learning_rate": 0.000184251968503937, "loss": 1.1967, "step": 110 }, { "epoch": 0.48725950821358754, "grad_norm": 0.11094267666339874, "learning_rate": 0.00018267716535433072, "loss": 1.1804, "step": 111 }, { "epoch": 0.4916492335128091, "grad_norm": 0.10237395763397217, "learning_rate": 0.0001811023622047244, "loss": 1.2016, "step": 112 }, { "epoch": 0.4960389588120306, "grad_norm": 0.11759161949157715, "learning_rate": 0.0001795275590551181, "loss": 1.1809, "step": 113 }, { "epoch": 0.5004286841112521, "grad_norm": 0.1055106446146965, "learning_rate": 0.00017795275590551182, "loss": 1.1918, "step": 114 }, { "epoch": 0.5048184094104736, "grad_norm": 0.10989883542060852, "learning_rate": 0.00017637795275590552, "loss": 1.1681, "step": 115 }, { "epoch": 0.5092081347096952, "grad_norm": 0.1036444902420044, "learning_rate": 0.00017480314960629923, "loss": 1.2278, "step": 116 }, { "epoch": 0.5135978600089166, "grad_norm": 0.10821900516748428, "learning_rate": 0.0001732283464566929, "loss": 1.1896, "step": 117 }, { "epoch": 0.5179875853081382, "grad_norm": 0.1072741150856018, "learning_rate": 0.00017165354330708662, "loss": 1.2021, "step": 118 }, { "epoch": 0.5223773106073597, "grad_norm": 0.10644800215959549, "learning_rate": 0.00017007874015748033, "loss": 1.1872, "step": 119 }, { "epoch": 0.5267670359065811, "grad_norm": 0.11100894212722778, "learning_rate": 0.000168503937007874, "loss": 1.1675, "step": 120 }, { "epoch": 0.5311567612058027, "grad_norm": 0.1064189150929451, "learning_rate": 0.00016692913385826772, "loss": 1.1825, "step": 121 }, { "epoch": 0.5355464865050242, "grad_norm": 0.1044350117444992, "learning_rate": 0.00016535433070866143, "loss": 1.1664, "step": 122 }, { "epoch": 0.5399362118042457, "grad_norm": 0.10881777852773666, "learning_rate": 0.00016377952755905514, "loss": 1.196, "step": 123 }, { "epoch": 0.5443259371034672, "grad_norm": 0.11187247931957245, "learning_rate": 0.00016220472440944882, "loss": 1.168, "step": 124 }, { "epoch": 0.5487156624026887, "grad_norm": 0.10924376547336578, "learning_rate": 0.00016062992125984252, "loss": 1.1885, "step": 125 }, { "epoch": 0.5531053877019102, "grad_norm": 0.10683095455169678, "learning_rate": 0.00015905511811023623, "loss": 1.2297, "step": 126 }, { "epoch": 0.5574951130011317, "grad_norm": 0.11374053359031677, "learning_rate": 0.00015748031496062994, "loss": 1.1554, "step": 127 }, { "epoch": 0.5618848383003532, "grad_norm": 0.1083027645945549, "learning_rate": 0.00015590551181102362, "loss": 1.1854, "step": 128 }, { "epoch": 0.5662745635995747, "grad_norm": 0.11480952799320221, "learning_rate": 0.00015433070866141733, "loss": 1.2056, "step": 129 }, { "epoch": 0.5706642888987963, "grad_norm": 0.10962171852588654, "learning_rate": 0.00015275590551181104, "loss": 1.1965, "step": 130 }, { "epoch": 0.5750540141980177, "grad_norm": 0.10890405625104904, "learning_rate": 0.00015118110236220472, "loss": 1.2199, "step": 131 }, { "epoch": 0.5794437394972393, "grad_norm": 0.10688222944736481, "learning_rate": 0.00014960629921259843, "loss": 1.1709, "step": 132 }, { "epoch": 0.5838334647964608, "grad_norm": 0.11433369666337967, "learning_rate": 0.00014803149606299214, "loss": 1.175, "step": 133 }, { "epoch": 0.5882231900956822, "grad_norm": 0.11388689279556274, "learning_rate": 0.00014645669291338584, "loss": 1.2196, "step": 134 }, { "epoch": 0.5926129153949038, "grad_norm": 0.12888666987419128, "learning_rate": 0.00014488188976377955, "loss": 1.1724, "step": 135 }, { "epoch": 0.5970026406941253, "grad_norm": 0.11102350801229477, "learning_rate": 0.00014330708661417323, "loss": 1.1953, "step": 136 }, { "epoch": 0.6013923659933468, "grad_norm": 0.11533666402101517, "learning_rate": 0.00014173228346456694, "loss": 1.1584, "step": 137 }, { "epoch": 0.6057820912925683, "grad_norm": 0.11430997401475906, "learning_rate": 0.00014015748031496062, "loss": 1.2081, "step": 138 }, { "epoch": 0.6101718165917899, "grad_norm": 0.12337413430213928, "learning_rate": 0.00013858267716535433, "loss": 1.2004, "step": 139 }, { "epoch": 0.6145615418910113, "grad_norm": 0.1094527319073677, "learning_rate": 0.00013700787401574804, "loss": 1.1692, "step": 140 }, { "epoch": 0.6189512671902329, "grad_norm": 0.11522892862558365, "learning_rate": 0.00013543307086614175, "loss": 1.178, "step": 141 }, { "epoch": 0.6233409924894544, "grad_norm": 0.11315246671438217, "learning_rate": 0.00013385826771653546, "loss": 1.2025, "step": 142 }, { "epoch": 0.6277307177886758, "grad_norm": 0.1142587885260582, "learning_rate": 0.00013228346456692914, "loss": 1.1447, "step": 143 }, { "epoch": 0.6321204430878974, "grad_norm": 0.1277647316455841, "learning_rate": 0.00013070866141732282, "loss": 1.159, "step": 144 }, { "epoch": 0.6365101683871189, "grad_norm": 0.11825836449861526, "learning_rate": 0.00012913385826771653, "loss": 1.1764, "step": 145 }, { "epoch": 0.6408998936863404, "grad_norm": 0.12381446361541748, "learning_rate": 0.00012755905511811023, "loss": 1.1617, "step": 146 }, { "epoch": 0.6452896189855619, "grad_norm": 0.1099829152226448, "learning_rate": 0.00012598425196850394, "loss": 1.148, "step": 147 }, { "epoch": 0.6496793442847835, "grad_norm": 0.11318068206310272, "learning_rate": 0.00012440944881889765, "loss": 1.1049, "step": 148 }, { "epoch": 0.6540690695840049, "grad_norm": 0.11695291101932526, "learning_rate": 0.00012283464566929136, "loss": 1.1143, "step": 149 }, { "epoch": 0.6584587948832265, "grad_norm": 0.1058238297700882, "learning_rate": 0.00012125984251968505, "loss": 1.1167, "step": 150 }, { "epoch": 0.662848520182448, "grad_norm": 0.11428267508745193, "learning_rate": 0.00011968503937007876, "loss": 1.1269, "step": 151 }, { "epoch": 0.6672382454816694, "grad_norm": 0.11971823871135712, "learning_rate": 0.00011811023622047244, "loss": 1.1636, "step": 152 }, { "epoch": 0.671627970780891, "grad_norm": 0.11238817870616913, "learning_rate": 0.00011653543307086614, "loss": 1.1835, "step": 153 }, { "epoch": 0.6760176960801125, "grad_norm": 0.10920320451259613, "learning_rate": 0.00011496062992125984, "loss": 1.1814, "step": 154 }, { "epoch": 0.680407421379334, "grad_norm": 0.11099740117788315, "learning_rate": 0.00011338582677165355, "loss": 1.1604, "step": 155 }, { "epoch": 0.6847971466785555, "grad_norm": 0.12113165110349655, "learning_rate": 0.00011181102362204725, "loss": 1.1563, "step": 156 }, { "epoch": 0.6891868719777771, "grad_norm": 0.12130718678236008, "learning_rate": 0.00011023622047244096, "loss": 1.1252, "step": 157 }, { "epoch": 0.6935765972769985, "grad_norm": 0.12606468796730042, "learning_rate": 0.00010866141732283466, "loss": 1.1384, "step": 158 }, { "epoch": 0.69796632257622, "grad_norm": 0.11793071776628494, "learning_rate": 0.00010708661417322836, "loss": 1.1104, "step": 159 }, { "epoch": 0.7023560478754416, "grad_norm": 0.1225033551454544, "learning_rate": 0.00010551181102362204, "loss": 1.1666, "step": 160 }, { "epoch": 0.706745773174663, "grad_norm": 0.1288159191608429, "learning_rate": 0.00010393700787401575, "loss": 1.1616, "step": 161 }, { "epoch": 0.7111354984738846, "grad_norm": 0.12077050656080246, "learning_rate": 0.00010236220472440946, "loss": 1.1606, "step": 162 }, { "epoch": 0.7155252237731061, "grad_norm": 0.11567137390375137, "learning_rate": 0.00010078740157480315, "loss": 1.1199, "step": 163 }, { "epoch": 0.7199149490723276, "grad_norm": 0.11462169140577316, "learning_rate": 9.921259842519686e-05, "loss": 1.1273, "step": 164 }, { "epoch": 0.7243046743715491, "grad_norm": 0.10957664996385574, "learning_rate": 9.763779527559055e-05, "loss": 1.1864, "step": 165 }, { "epoch": 0.7286943996707707, "grad_norm": 0.11114432662725449, "learning_rate": 9.606299212598425e-05, "loss": 1.1557, "step": 166 }, { "epoch": 0.7330841249699921, "grad_norm": 0.12677372992038727, "learning_rate": 9.448818897637796e-05, "loss": 1.1381, "step": 167 }, { "epoch": 0.7374738502692136, "grad_norm": 0.12376675754785538, "learning_rate": 9.291338582677166e-05, "loss": 1.2181, "step": 168 }, { "epoch": 0.7418635755684352, "grad_norm": 0.10372158885002136, "learning_rate": 9.133858267716536e-05, "loss": 1.177, "step": 169 }, { "epoch": 0.7462533008676566, "grad_norm": 0.12930211424827576, "learning_rate": 8.976377952755905e-05, "loss": 1.1506, "step": 170 }, { "epoch": 0.7506430261668782, "grad_norm": 0.11346323788166046, "learning_rate": 8.818897637795276e-05, "loss": 1.1328, "step": 171 }, { "epoch": 0.7550327514660996, "grad_norm": 0.11889853328466415, "learning_rate": 8.661417322834646e-05, "loss": 1.1256, "step": 172 }, { "epoch": 0.7594224767653212, "grad_norm": 0.1218617707490921, "learning_rate": 8.503937007874016e-05, "loss": 1.1069, "step": 173 }, { "epoch": 0.7638122020645427, "grad_norm": 0.1208794042468071, "learning_rate": 8.346456692913386e-05, "loss": 1.1376, "step": 174 }, { "epoch": 0.7682019273637641, "grad_norm": 0.12102183699607849, "learning_rate": 8.188976377952757e-05, "loss": 1.1431, "step": 175 }, { "epoch": 0.7725916526629857, "grad_norm": 0.12085499614477158, "learning_rate": 8.031496062992126e-05, "loss": 1.1455, "step": 176 }, { "epoch": 0.7769813779622072, "grad_norm": 0.12233547866344452, "learning_rate": 7.874015748031497e-05, "loss": 1.1791, "step": 177 }, { "epoch": 0.7813711032614287, "grad_norm": 0.12110645323991776, "learning_rate": 7.716535433070867e-05, "loss": 1.1266, "step": 178 }, { "epoch": 0.7857608285606502, "grad_norm": 0.12754279375076294, "learning_rate": 7.559055118110236e-05, "loss": 1.1623, "step": 179 }, { "epoch": 0.7901505538598718, "grad_norm": 0.12367318570613861, "learning_rate": 7.401574803149607e-05, "loss": 1.1562, "step": 180 }, { "epoch": 0.7945402791590932, "grad_norm": 0.11075824499130249, "learning_rate": 7.244094488188978e-05, "loss": 1.1414, "step": 181 }, { "epoch": 0.7989300044583147, "grad_norm": 0.11240646988153458, "learning_rate": 7.086614173228347e-05, "loss": 1.1661, "step": 182 }, { "epoch": 0.8033197297575363, "grad_norm": 0.12439629435539246, "learning_rate": 6.929133858267717e-05, "loss": 1.1282, "step": 183 }, { "epoch": 0.8077094550567577, "grad_norm": 0.1265660971403122, "learning_rate": 6.771653543307087e-05, "loss": 1.1354, "step": 184 }, { "epoch": 0.8120991803559793, "grad_norm": 0.12387965619564056, "learning_rate": 6.614173228346457e-05, "loss": 1.1859, "step": 185 }, { "epoch": 0.8164889056552008, "grad_norm": 0.11839327216148376, "learning_rate": 6.456692913385826e-05, "loss": 1.0988, "step": 186 }, { "epoch": 0.8208786309544223, "grad_norm": 0.12071321904659271, "learning_rate": 6.299212598425197e-05, "loss": 1.1426, "step": 187 }, { "epoch": 0.8252683562536438, "grad_norm": 0.12716011703014374, "learning_rate": 6.141732283464568e-05, "loss": 1.1614, "step": 188 }, { "epoch": 0.8296580815528654, "grad_norm": 0.12180924415588379, "learning_rate": 5.984251968503938e-05, "loss": 1.1693, "step": 189 }, { "epoch": 0.8340478068520868, "grad_norm": 0.11488785594701767, "learning_rate": 5.826771653543307e-05, "loss": 1.0999, "step": 190 }, { "epoch": 0.8384375321513083, "grad_norm": 0.11842205375432968, "learning_rate": 5.6692913385826777e-05, "loss": 1.1727, "step": 191 }, { "epoch": 0.8428272574505299, "grad_norm": 0.11119809746742249, "learning_rate": 5.511811023622048e-05, "loss": 1.1284, "step": 192 }, { "epoch": 0.8472169827497513, "grad_norm": 0.11790735274553299, "learning_rate": 5.354330708661418e-05, "loss": 1.175, "step": 193 }, { "epoch": 0.8516067080489729, "grad_norm": 0.12150382995605469, "learning_rate": 5.1968503937007874e-05, "loss": 1.1447, "step": 194 }, { "epoch": 0.8559964333481944, "grad_norm": 0.11928427219390869, "learning_rate": 5.0393700787401575e-05, "loss": 1.1735, "step": 195 }, { "epoch": 0.8603861586474159, "grad_norm": 0.11008232831954956, "learning_rate": 4.881889763779528e-05, "loss": 1.1286, "step": 196 }, { "epoch": 0.8647758839466374, "grad_norm": 0.11809035390615463, "learning_rate": 4.724409448818898e-05, "loss": 1.1773, "step": 197 }, { "epoch": 0.869165609245859, "grad_norm": 0.12080511450767517, "learning_rate": 4.566929133858268e-05, "loss": 1.1454, "step": 198 }, { "epoch": 0.8735553345450804, "grad_norm": 0.12405778467655182, "learning_rate": 4.409448818897638e-05, "loss": 1.1387, "step": 199 }, { "epoch": 0.8779450598443019, "grad_norm": 0.11586454510688782, "learning_rate": 4.251968503937008e-05, "loss": 1.1949, "step": 200 }, { "epoch": 0.8823347851435235, "grad_norm": 0.11438584327697754, "learning_rate": 4.0944881889763784e-05, "loss": 1.1755, "step": 201 }, { "epoch": 0.8867245104427449, "grad_norm": 0.11868538707494736, "learning_rate": 3.9370078740157485e-05, "loss": 1.1751, "step": 202 }, { "epoch": 0.8911142357419665, "grad_norm": 0.11678507924079895, "learning_rate": 3.779527559055118e-05, "loss": 1.1754, "step": 203 }, { "epoch": 0.895503961041188, "grad_norm": 0.11950727552175522, "learning_rate": 3.622047244094489e-05, "loss": 1.1755, "step": 204 }, { "epoch": 0.8998936863404094, "grad_norm": 0.12049534171819687, "learning_rate": 3.464566929133858e-05, "loss": 1.1524, "step": 205 }, { "epoch": 0.904283411639631, "grad_norm": 0.11280712485313416, "learning_rate": 3.3070866141732284e-05, "loss": 1.1624, "step": 206 }, { "epoch": 0.9086731369388525, "grad_norm": 0.1136147677898407, "learning_rate": 3.1496062992125985e-05, "loss": 1.1677, "step": 207 }, { "epoch": 0.913062862238074, "grad_norm": 0.11815124750137329, "learning_rate": 2.992125984251969e-05, "loss": 1.1323, "step": 208 }, { "epoch": 0.9174525875372955, "grad_norm": 0.12229173630475998, "learning_rate": 2.8346456692913388e-05, "loss": 1.1788, "step": 209 }, { "epoch": 0.9218423128365171, "grad_norm": 0.11396148800849915, "learning_rate": 2.677165354330709e-05, "loss": 1.1305, "step": 210 }, { "epoch": 0.9262320381357385, "grad_norm": 0.11646320670843124, "learning_rate": 2.5196850393700788e-05, "loss": 1.1311, "step": 211 }, { "epoch": 0.9306217634349601, "grad_norm": 0.11506093293428421, "learning_rate": 2.362204724409449e-05, "loss": 1.1457, "step": 212 }, { "epoch": 0.9350114887341816, "grad_norm": 0.11136835813522339, "learning_rate": 2.204724409448819e-05, "loss": 1.1564, "step": 213 }, { "epoch": 0.939401214033403, "grad_norm": 0.1178024634718895, "learning_rate": 2.0472440944881892e-05, "loss": 1.1222, "step": 214 }, { "epoch": 0.9437909393326246, "grad_norm": 0.11787544190883636, "learning_rate": 1.889763779527559e-05, "loss": 1.1828, "step": 215 }, { "epoch": 0.9481806646318461, "grad_norm": 0.11110374331474304, "learning_rate": 1.732283464566929e-05, "loss": 1.1995, "step": 216 }, { "epoch": 0.9525703899310676, "grad_norm": 0.11162659525871277, "learning_rate": 1.5748031496062993e-05, "loss": 1.1405, "step": 217 }, { "epoch": 0.9569601152302891, "grad_norm": 0.11330625414848328, "learning_rate": 1.4173228346456694e-05, "loss": 1.1215, "step": 218 }, { "epoch": 0.9613498405295107, "grad_norm": 0.1166309043765068, "learning_rate": 1.2598425196850394e-05, "loss": 1.1508, "step": 219 }, { "epoch": 0.9657395658287321, "grad_norm": 0.10871995985507965, "learning_rate": 1.1023622047244095e-05, "loss": 1.1895, "step": 220 }, { "epoch": 0.9701292911279537, "grad_norm": 0.11384564638137817, "learning_rate": 9.448818897637795e-06, "loss": 1.1367, "step": 221 }, { "epoch": 0.9745190164271751, "grad_norm": 0.11055337637662888, "learning_rate": 7.874015748031496e-06, "loss": 1.1282, "step": 222 }, { "epoch": 0.9789087417263966, "grad_norm": 0.1121806651353836, "learning_rate": 6.299212598425197e-06, "loss": 1.1434, "step": 223 }, { "epoch": 0.9832984670256182, "grad_norm": 0.10578761994838715, "learning_rate": 4.7244094488188975e-06, "loss": 1.2258, "step": 224 }, { "epoch": 0.9876881923248396, "grad_norm": 0.11392487585544586, "learning_rate": 3.1496062992125985e-06, "loss": 1.1703, "step": 225 }, { "epoch": 0.9920779176240612, "grad_norm": 0.10907357186079025, "learning_rate": 1.5748031496062992e-06, "loss": 1.1232, "step": 226 } ], "logging_steps": 1, "max_steps": 227, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.342231954207949e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }