{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.821256038647343, "eval_steps": 500, "global_step": 170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004830917874396135, "grad_norm": 0.7645118236541748, "learning_rate": 5e-06, "loss": 1.259, "step": 1 }, { "epoch": 0.00966183574879227, "grad_norm": 0.9345910549163818, "learning_rate": 1e-05, "loss": 1.4787, "step": 2 }, { "epoch": 0.014492753623188406, "grad_norm": 0.9917318224906921, "learning_rate": 1.5e-05, "loss": 1.5453, "step": 3 }, { "epoch": 0.01932367149758454, "grad_norm": 1.0239824056625366, "learning_rate": 2e-05, "loss": 1.5964, "step": 4 }, { "epoch": 0.024154589371980676, "grad_norm": 0.9726951718330383, "learning_rate": 2.5e-05, "loss": 1.5687, "step": 5 }, { "epoch": 0.028985507246376812, "grad_norm": 0.7599917650222778, "learning_rate": 3e-05, "loss": 1.5249, "step": 6 }, { "epoch": 0.033816425120772944, "grad_norm": 0.5268093347549438, "learning_rate": 3.5e-05, "loss": 1.4637, "step": 7 }, { "epoch": 0.03864734299516908, "grad_norm": 0.5739946365356445, "learning_rate": 4e-05, "loss": 1.4514, "step": 8 }, { "epoch": 0.043478260869565216, "grad_norm": 0.6630675792694092, "learning_rate": 4.5e-05, "loss": 1.442, "step": 9 }, { "epoch": 0.04830917874396135, "grad_norm": 0.5699703097343445, "learning_rate": 5e-05, "loss": 1.4091, "step": 10 }, { "epoch": 0.05314009661835749, "grad_norm": 0.4952673017978668, "learning_rate": 5.500000000000001e-05, "loss": 1.3877, "step": 11 }, { "epoch": 0.057971014492753624, "grad_norm": 0.5180989503860474, "learning_rate": 6e-05, "loss": 1.3707, "step": 12 }, { "epoch": 0.06280193236714976, "grad_norm": 0.41192442178726196, "learning_rate": 6.500000000000001e-05, "loss": 1.3599, "step": 13 }, { "epoch": 0.06763285024154589, "grad_norm": 0.28801196813583374, "learning_rate": 7e-05, "loss": 1.3484, "step": 14 }, { "epoch": 0.07246376811594203, "grad_norm": 0.2618640959262848, "learning_rate": 7.500000000000001e-05, "loss": 1.3382, "step": 15 }, { "epoch": 0.07729468599033816, "grad_norm": 0.2657703161239624, "learning_rate": 8e-05, "loss": 1.335, "step": 16 }, { "epoch": 0.0821256038647343, "grad_norm": 0.2432931512594223, "learning_rate": 8.5e-05, "loss": 1.323, "step": 17 }, { "epoch": 0.08695652173913043, "grad_norm": 0.24172987043857574, "learning_rate": 9e-05, "loss": 1.3317, "step": 18 }, { "epoch": 0.09178743961352658, "grad_norm": 0.26086804270744324, "learning_rate": 9.5e-05, "loss": 1.3163, "step": 19 }, { "epoch": 0.0966183574879227, "grad_norm": 0.2007642686367035, "learning_rate": 0.0001, "loss": 1.2877, "step": 20 }, { "epoch": 0.10144927536231885, "grad_norm": 0.2327784299850464, "learning_rate": 9.946524064171123e-05, "loss": 1.2899, "step": 21 }, { "epoch": 0.10628019323671498, "grad_norm": 0.20648740231990814, "learning_rate": 9.893048128342246e-05, "loss": 1.287, "step": 22 }, { "epoch": 0.1111111111111111, "grad_norm": 0.22094646096229553, "learning_rate": 9.83957219251337e-05, "loss": 1.2873, "step": 23 }, { "epoch": 0.11594202898550725, "grad_norm": 0.18131175637245178, "learning_rate": 9.786096256684493e-05, "loss": 1.2594, "step": 24 }, { "epoch": 0.12077294685990338, "grad_norm": 0.16657911241054535, "learning_rate": 9.732620320855615e-05, "loss": 1.2639, "step": 25 }, { "epoch": 0.12560386473429952, "grad_norm": 0.1740303933620453, "learning_rate": 9.679144385026739e-05, "loss": 1.2603, "step": 26 }, { "epoch": 0.13043478260869565, "grad_norm": 0.15640808641910553, "learning_rate": 9.625668449197861e-05, "loss": 1.274, "step": 27 }, { "epoch": 0.13526570048309178, "grad_norm": 0.16680403053760529, "learning_rate": 9.572192513368984e-05, "loss": 1.2724, "step": 28 }, { "epoch": 0.14009661835748793, "grad_norm": 0.15997755527496338, "learning_rate": 9.518716577540108e-05, "loss": 1.2672, "step": 29 }, { "epoch": 0.14492753623188406, "grad_norm": 0.15305301547050476, "learning_rate": 9.46524064171123e-05, "loss": 1.2718, "step": 30 }, { "epoch": 0.1497584541062802, "grad_norm": 0.14839769899845123, "learning_rate": 9.411764705882353e-05, "loss": 1.2707, "step": 31 }, { "epoch": 0.15458937198067632, "grad_norm": 0.14878958463668823, "learning_rate": 9.358288770053476e-05, "loss": 1.2648, "step": 32 }, { "epoch": 0.15942028985507245, "grad_norm": 0.17154482007026672, "learning_rate": 9.3048128342246e-05, "loss": 1.2641, "step": 33 }, { "epoch": 0.1642512077294686, "grad_norm": 0.1447138488292694, "learning_rate": 9.251336898395723e-05, "loss": 1.27, "step": 34 }, { "epoch": 0.16908212560386474, "grad_norm": 0.1631896197795868, "learning_rate": 9.197860962566846e-05, "loss": 1.276, "step": 35 }, { "epoch": 0.17391304347826086, "grad_norm": 0.14892889559268951, "learning_rate": 9.144385026737968e-05, "loss": 1.2747, "step": 36 }, { "epoch": 0.178743961352657, "grad_norm": 0.1588708907365799, "learning_rate": 9.090909090909092e-05, "loss": 1.276, "step": 37 }, { "epoch": 0.18357487922705315, "grad_norm": 0.151743546128273, "learning_rate": 9.037433155080214e-05, "loss": 1.2788, "step": 38 }, { "epoch": 0.18840579710144928, "grad_norm": 0.15703994035720825, "learning_rate": 8.983957219251337e-05, "loss": 1.2936, "step": 39 }, { "epoch": 0.1932367149758454, "grad_norm": 0.1660437434911728, "learning_rate": 8.930481283422461e-05, "loss": 1.2824, "step": 40 }, { "epoch": 0.19806763285024154, "grad_norm": 0.15268553793430328, "learning_rate": 8.877005347593583e-05, "loss": 1.3056, "step": 41 }, { "epoch": 0.2028985507246377, "grad_norm": 0.1577601134777069, "learning_rate": 8.823529411764706e-05, "loss": 1.3142, "step": 42 }, { "epoch": 0.20772946859903382, "grad_norm": 0.16757714748382568, "learning_rate": 8.770053475935829e-05, "loss": 1.3389, "step": 43 }, { "epoch": 0.21256038647342995, "grad_norm": 0.1712018847465515, "learning_rate": 8.716577540106952e-05, "loss": 1.3437, "step": 44 }, { "epoch": 0.21739130434782608, "grad_norm": 0.1829441487789154, "learning_rate": 8.663101604278076e-05, "loss": 1.358, "step": 45 }, { "epoch": 0.2222222222222222, "grad_norm": 0.20615732669830322, "learning_rate": 8.609625668449198e-05, "loss": 1.3878, "step": 46 }, { "epoch": 0.22705314009661837, "grad_norm": 0.23940807580947876, "learning_rate": 8.556149732620321e-05, "loss": 1.4699, "step": 47 }, { "epoch": 0.2318840579710145, "grad_norm": 0.41468575596809387, "learning_rate": 8.502673796791443e-05, "loss": 1.4893, "step": 48 }, { "epoch": 0.23671497584541062, "grad_norm": 0.5656126737594604, "learning_rate": 8.449197860962568e-05, "loss": 1.4967, "step": 49 }, { "epoch": 0.24154589371980675, "grad_norm": 4.2125325202941895, "learning_rate": 8.39572192513369e-05, "loss": 1.6295, "step": 50 }, { "epoch": 0.2463768115942029, "grad_norm": 1.100631833076477, "learning_rate": 8.342245989304814e-05, "loss": 1.0523, "step": 51 }, { "epoch": 0.25120772946859904, "grad_norm": 0.44898825883865356, "learning_rate": 8.288770053475936e-05, "loss": 1.106, "step": 52 }, { "epoch": 0.2560386473429952, "grad_norm": 0.2860921025276184, "learning_rate": 8.23529411764706e-05, "loss": 1.1398, "step": 53 }, { "epoch": 0.2608695652173913, "grad_norm": 0.28824105858802795, "learning_rate": 8.181818181818183e-05, "loss": 1.1547, "step": 54 }, { "epoch": 0.26570048309178745, "grad_norm": 0.32123416662216187, "learning_rate": 8.128342245989305e-05, "loss": 1.1646, "step": 55 }, { "epoch": 0.27053140096618356, "grad_norm": 0.2752850353717804, "learning_rate": 8.074866310160429e-05, "loss": 1.1752, "step": 56 }, { "epoch": 0.2753623188405797, "grad_norm": 0.22371803224086761, "learning_rate": 8.021390374331551e-05, "loss": 1.1934, "step": 57 }, { "epoch": 0.28019323671497587, "grad_norm": 0.23126192390918732, "learning_rate": 7.967914438502674e-05, "loss": 1.2057, "step": 58 }, { "epoch": 0.28502415458937197, "grad_norm": 0.24694480001926422, "learning_rate": 7.914438502673798e-05, "loss": 1.1858, "step": 59 }, { "epoch": 0.2898550724637681, "grad_norm": 0.20105589926242828, "learning_rate": 7.86096256684492e-05, "loss": 1.202, "step": 60 }, { "epoch": 0.2946859903381642, "grad_norm": 0.15975232422351837, "learning_rate": 7.807486631016043e-05, "loss": 1.2014, "step": 61 }, { "epoch": 0.2995169082125604, "grad_norm": 0.17269295454025269, "learning_rate": 7.754010695187165e-05, "loss": 1.1878, "step": 62 }, { "epoch": 0.30434782608695654, "grad_norm": 0.1990584284067154, "learning_rate": 7.700534759358289e-05, "loss": 1.1951, "step": 63 }, { "epoch": 0.30917874396135264, "grad_norm": 0.18062998354434967, "learning_rate": 7.647058823529411e-05, "loss": 1.1978, "step": 64 }, { "epoch": 0.3140096618357488, "grad_norm": 0.15606802701950073, "learning_rate": 7.593582887700536e-05, "loss": 1.2153, "step": 65 }, { "epoch": 0.3188405797101449, "grad_norm": 0.1434660404920578, "learning_rate": 7.540106951871658e-05, "loss": 1.2074, "step": 66 }, { "epoch": 0.32367149758454106, "grad_norm": 0.1473468840122223, "learning_rate": 7.486631016042782e-05, "loss": 1.1962, "step": 67 }, { "epoch": 0.3285024154589372, "grad_norm": 0.1452961415052414, "learning_rate": 7.433155080213904e-05, "loss": 1.209, "step": 68 }, { "epoch": 0.3333333333333333, "grad_norm": 0.14672888815402985, "learning_rate": 7.379679144385027e-05, "loss": 1.2126, "step": 69 }, { "epoch": 0.33816425120772947, "grad_norm": 0.14124266803264618, "learning_rate": 7.326203208556151e-05, "loss": 1.2111, "step": 70 }, { "epoch": 0.34299516908212563, "grad_norm": 0.13139352202415466, "learning_rate": 7.272727272727273e-05, "loss": 1.216, "step": 71 }, { "epoch": 0.34782608695652173, "grad_norm": 0.1385214626789093, "learning_rate": 7.219251336898396e-05, "loss": 1.2199, "step": 72 }, { "epoch": 0.3526570048309179, "grad_norm": 0.1288975179195404, "learning_rate": 7.165775401069518e-05, "loss": 1.201, "step": 73 }, { "epoch": 0.357487922705314, "grad_norm": 0.13003361225128174, "learning_rate": 7.112299465240642e-05, "loss": 1.2186, "step": 74 }, { "epoch": 0.36231884057971014, "grad_norm": 0.13762855529785156, "learning_rate": 7.058823529411765e-05, "loss": 1.2209, "step": 75 }, { "epoch": 0.3671497584541063, "grad_norm": 0.13935087621212006, "learning_rate": 7.005347593582889e-05, "loss": 1.2219, "step": 76 }, { "epoch": 0.3719806763285024, "grad_norm": 0.13384683430194855, "learning_rate": 6.951871657754011e-05, "loss": 1.2419, "step": 77 }, { "epoch": 0.37681159420289856, "grad_norm": 0.12453139573335648, "learning_rate": 6.898395721925133e-05, "loss": 1.2154, "step": 78 }, { "epoch": 0.38164251207729466, "grad_norm": 0.13903535902500153, "learning_rate": 6.844919786096257e-05, "loss": 1.2378, "step": 79 }, { "epoch": 0.3864734299516908, "grad_norm": 0.13833968341350555, "learning_rate": 6.79144385026738e-05, "loss": 1.2244, "step": 80 }, { "epoch": 0.391304347826087, "grad_norm": 0.13052114844322205, "learning_rate": 6.737967914438504e-05, "loss": 1.226, "step": 81 }, { "epoch": 0.3961352657004831, "grad_norm": 0.13437196612358093, "learning_rate": 6.684491978609626e-05, "loss": 1.2457, "step": 82 }, { "epoch": 0.40096618357487923, "grad_norm": 0.13693881034851074, "learning_rate": 6.631016042780749e-05, "loss": 1.2384, "step": 83 }, { "epoch": 0.4057971014492754, "grad_norm": 0.13426737487316132, "learning_rate": 6.577540106951871e-05, "loss": 1.2427, "step": 84 }, { "epoch": 0.4106280193236715, "grad_norm": 0.13844414055347443, "learning_rate": 6.524064171122995e-05, "loss": 1.2661, "step": 85 }, { "epoch": 0.41545893719806765, "grad_norm": 0.13957887887954712, "learning_rate": 6.470588235294118e-05, "loss": 1.2696, "step": 86 }, { "epoch": 0.42028985507246375, "grad_norm": 0.13465899229049683, "learning_rate": 6.41711229946524e-05, "loss": 1.2591, "step": 87 }, { "epoch": 0.4251207729468599, "grad_norm": 0.1441555917263031, "learning_rate": 6.363636363636364e-05, "loss": 1.2766, "step": 88 }, { "epoch": 0.42995169082125606, "grad_norm": 0.1500505656003952, "learning_rate": 6.310160427807486e-05, "loss": 1.2887, "step": 89 }, { "epoch": 0.43478260869565216, "grad_norm": 0.15137352049350739, "learning_rate": 6.25668449197861e-05, "loss": 1.2792, "step": 90 }, { "epoch": 0.4396135265700483, "grad_norm": 0.15071454644203186, "learning_rate": 6.203208556149733e-05, "loss": 1.2876, "step": 91 }, { "epoch": 0.4444444444444444, "grad_norm": 0.1570783108472824, "learning_rate": 6.149732620320857e-05, "loss": 1.3223, "step": 92 }, { "epoch": 0.4492753623188406, "grad_norm": 0.16483648121356964, "learning_rate": 6.096256684491979e-05, "loss": 1.3285, "step": 93 }, { "epoch": 0.45410628019323673, "grad_norm": 0.1700102537870407, "learning_rate": 6.0427807486631016e-05, "loss": 1.3349, "step": 94 }, { "epoch": 0.45893719806763283, "grad_norm": 0.1778935194015503, "learning_rate": 5.9893048128342244e-05, "loss": 1.3644, "step": 95 }, { "epoch": 0.463768115942029, "grad_norm": 0.19471201300621033, "learning_rate": 5.9358288770053486e-05, "loss": 1.4028, "step": 96 }, { "epoch": 0.46859903381642515, "grad_norm": 0.25646305084228516, "learning_rate": 5.882352941176471e-05, "loss": 1.4694, "step": 97 }, { "epoch": 0.47342995169082125, "grad_norm": 0.277474045753479, "learning_rate": 5.8288770053475936e-05, "loss": 1.5052, "step": 98 }, { "epoch": 0.4782608695652174, "grad_norm": 0.33792170882225037, "learning_rate": 5.7754010695187164e-05, "loss": 1.5414, "step": 99 }, { "epoch": 0.4830917874396135, "grad_norm": 0.6432341933250427, "learning_rate": 5.721925133689839e-05, "loss": 1.6046, "step": 100 }, { "epoch": 0.48792270531400966, "grad_norm": 0.5846565365791321, "learning_rate": 5.6684491978609634e-05, "loss": 0.9549, "step": 101 }, { "epoch": 0.4927536231884058, "grad_norm": 0.49914559721946716, "learning_rate": 5.614973262032086e-05, "loss": 1.0978, "step": 102 }, { "epoch": 0.4975845410628019, "grad_norm": 0.33365777134895325, "learning_rate": 5.561497326203209e-05, "loss": 1.1242, "step": 103 }, { "epoch": 0.5024154589371981, "grad_norm": 0.7763075828552246, "learning_rate": 5.508021390374332e-05, "loss": 1.1525, "step": 104 }, { "epoch": 0.5072463768115942, "grad_norm": 0.2733679413795471, "learning_rate": 5.4545454545454546e-05, "loss": 1.1607, "step": 105 }, { "epoch": 0.5120772946859904, "grad_norm": 0.26070019602775574, "learning_rate": 5.401069518716578e-05, "loss": 1.1609, "step": 106 }, { "epoch": 0.5169082125603864, "grad_norm": 0.2708110809326172, "learning_rate": 5.347593582887701e-05, "loss": 1.1815, "step": 107 }, { "epoch": 0.5217391304347826, "grad_norm": 0.26362475752830505, "learning_rate": 5.294117647058824e-05, "loss": 1.1943, "step": 108 }, { "epoch": 0.5265700483091788, "grad_norm": 0.21209686994552612, "learning_rate": 5.2406417112299466e-05, "loss": 1.1944, "step": 109 }, { "epoch": 0.5314009661835749, "grad_norm": 0.21895644068717957, "learning_rate": 5.1871657754010694e-05, "loss": 1.192, "step": 110 }, { "epoch": 0.5362318840579711, "grad_norm": 0.20762521028518677, "learning_rate": 5.1336898395721935e-05, "loss": 1.1987, "step": 111 }, { "epoch": 0.5410628019323671, "grad_norm": 0.18395845592021942, "learning_rate": 5.0802139037433164e-05, "loss": 1.1925, "step": 112 }, { "epoch": 0.5458937198067633, "grad_norm": 0.1653558313846588, "learning_rate": 5.026737967914439e-05, "loss": 1.1961, "step": 113 }, { "epoch": 0.5507246376811594, "grad_norm": 0.1628160923719406, "learning_rate": 4.973262032085561e-05, "loss": 1.1993, "step": 114 }, { "epoch": 0.5555555555555556, "grad_norm": 0.17022235691547394, "learning_rate": 4.919786096256685e-05, "loss": 1.2101, "step": 115 }, { "epoch": 0.5603864734299517, "grad_norm": 0.1710771918296814, "learning_rate": 4.8663101604278076e-05, "loss": 1.2059, "step": 116 }, { "epoch": 0.5652173913043478, "grad_norm": 0.17160499095916748, "learning_rate": 4.8128342245989304e-05, "loss": 1.206, "step": 117 }, { "epoch": 0.5700483091787439, "grad_norm": 0.15871083736419678, "learning_rate": 4.759358288770054e-05, "loss": 1.2067, "step": 118 }, { "epoch": 0.5748792270531401, "grad_norm": 0.14880803227424622, "learning_rate": 4.705882352941177e-05, "loss": 1.218, "step": 119 }, { "epoch": 0.5797101449275363, "grad_norm": 0.14413942396640778, "learning_rate": 4.6524064171123e-05, "loss": 1.1958, "step": 120 }, { "epoch": 0.5845410628019324, "grad_norm": 0.14965958893299103, "learning_rate": 4.598930481283423e-05, "loss": 1.2206, "step": 121 }, { "epoch": 0.5893719806763285, "grad_norm": 0.14546802639961243, "learning_rate": 4.545454545454546e-05, "loss": 1.2107, "step": 122 }, { "epoch": 0.5942028985507246, "grad_norm": 0.14043672382831573, "learning_rate": 4.491978609625669e-05, "loss": 1.2059, "step": 123 }, { "epoch": 0.5990338164251208, "grad_norm": 0.1403893083333969, "learning_rate": 4.4385026737967915e-05, "loss": 1.2327, "step": 124 }, { "epoch": 0.6038647342995169, "grad_norm": 0.13266952335834503, "learning_rate": 4.385026737967914e-05, "loss": 1.194, "step": 125 }, { "epoch": 0.6086956521739131, "grad_norm": 0.1347023993730545, "learning_rate": 4.331550802139038e-05, "loss": 1.1951, "step": 126 }, { "epoch": 0.6135265700483091, "grad_norm": 0.13116984069347382, "learning_rate": 4.2780748663101606e-05, "loss": 1.2206, "step": 127 }, { "epoch": 0.6183574879227053, "grad_norm": 0.14027127623558044, "learning_rate": 4.224598930481284e-05, "loss": 1.2304, "step": 128 }, { "epoch": 0.6231884057971014, "grad_norm": 0.13990454375743866, "learning_rate": 4.171122994652407e-05, "loss": 1.2229, "step": 129 }, { "epoch": 0.6280193236714976, "grad_norm": 0.13515028357505798, "learning_rate": 4.11764705882353e-05, "loss": 1.2191, "step": 130 }, { "epoch": 0.6328502415458938, "grad_norm": 0.1329352706670761, "learning_rate": 4.0641711229946525e-05, "loss": 1.2308, "step": 131 }, { "epoch": 0.6376811594202898, "grad_norm": 0.13061358034610748, "learning_rate": 4.0106951871657754e-05, "loss": 1.2446, "step": 132 }, { "epoch": 0.642512077294686, "grad_norm": 0.13514551520347595, "learning_rate": 3.957219251336899e-05, "loss": 1.2425, "step": 133 }, { "epoch": 0.6473429951690821, "grad_norm": 0.13962285220623016, "learning_rate": 3.903743315508022e-05, "loss": 1.2574, "step": 134 }, { "epoch": 0.6521739130434783, "grad_norm": 0.14081381261348724, "learning_rate": 3.8502673796791445e-05, "loss": 1.247, "step": 135 }, { "epoch": 0.6570048309178744, "grad_norm": 0.1396479606628418, "learning_rate": 3.796791443850268e-05, "loss": 1.2379, "step": 136 }, { "epoch": 0.6618357487922706, "grad_norm": 0.13990682363510132, "learning_rate": 3.743315508021391e-05, "loss": 1.2637, "step": 137 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1405869573354721, "learning_rate": 3.6898395721925136e-05, "loss": 1.2458, "step": 138 }, { "epoch": 0.6714975845410628, "grad_norm": 0.14569957554340363, "learning_rate": 3.6363636363636364e-05, "loss": 1.2755, "step": 139 }, { "epoch": 0.6763285024154589, "grad_norm": 0.14868015050888062, "learning_rate": 3.582887700534759e-05, "loss": 1.2836, "step": 140 }, { "epoch": 0.6811594202898551, "grad_norm": 0.1531868726015091, "learning_rate": 3.529411764705883e-05, "loss": 1.295, "step": 141 }, { "epoch": 0.6859903381642513, "grad_norm": 0.16108393669128418, "learning_rate": 3.4759358288770055e-05, "loss": 1.3239, "step": 142 }, { "epoch": 0.6908212560386473, "grad_norm": 0.1609143316745758, "learning_rate": 3.4224598930481284e-05, "loss": 1.3301, "step": 143 }, { "epoch": 0.6956521739130435, "grad_norm": 0.16705213487148285, "learning_rate": 3.368983957219252e-05, "loss": 1.3484, "step": 144 }, { "epoch": 0.7004830917874396, "grad_norm": 0.18058659136295319, "learning_rate": 3.3155080213903747e-05, "loss": 1.3794, "step": 145 }, { "epoch": 0.7053140096618358, "grad_norm": 0.20221418142318726, "learning_rate": 3.2620320855614975e-05, "loss": 1.4019, "step": 146 }, { "epoch": 0.7101449275362319, "grad_norm": 0.24968379735946655, "learning_rate": 3.20855614973262e-05, "loss": 1.4674, "step": 147 }, { "epoch": 0.714975845410628, "grad_norm": 0.3043461740016937, "learning_rate": 3.155080213903743e-05, "loss": 1.4972, "step": 148 }, { "epoch": 0.7198067632850241, "grad_norm": 0.33808770775794983, "learning_rate": 3.1016042780748666e-05, "loss": 1.5237, "step": 149 }, { "epoch": 0.7246376811594203, "grad_norm": 0.5125290155410767, "learning_rate": 3.0481283422459894e-05, "loss": 1.5533, "step": 150 }, { "epoch": 0.7294685990338164, "grad_norm": 0.25133025646209717, "learning_rate": 2.9946524064171122e-05, "loss": 0.8978, "step": 151 }, { "epoch": 0.7342995169082126, "grad_norm": 0.30135515332221985, "learning_rate": 2.9411764705882354e-05, "loss": 1.0278, "step": 152 }, { "epoch": 0.7391304347826086, "grad_norm": 0.2961145043373108, "learning_rate": 2.8877005347593582e-05, "loss": 1.1024, "step": 153 }, { "epoch": 0.7439613526570048, "grad_norm": 0.273294597864151, "learning_rate": 2.8342245989304817e-05, "loss": 1.1373, "step": 154 }, { "epoch": 0.748792270531401, "grad_norm": 0.23799936473369598, "learning_rate": 2.7807486631016045e-05, "loss": 1.1419, "step": 155 }, { "epoch": 0.7536231884057971, "grad_norm": 0.21061824262142181, "learning_rate": 2.7272727272727273e-05, "loss": 1.1512, "step": 156 }, { "epoch": 0.7584541062801933, "grad_norm": 0.21470795571804047, "learning_rate": 2.6737967914438505e-05, "loss": 1.1616, "step": 157 }, { "epoch": 0.7632850241545893, "grad_norm": 0.21231332421302795, "learning_rate": 2.6203208556149733e-05, "loss": 1.1662, "step": 158 }, { "epoch": 0.7681159420289855, "grad_norm": 0.20699279010295868, "learning_rate": 2.5668449197860968e-05, "loss": 1.1716, "step": 159 }, { "epoch": 0.7729468599033816, "grad_norm": 0.20150579512119293, "learning_rate": 2.5133689839572196e-05, "loss": 1.1836, "step": 160 }, { "epoch": 0.7777777777777778, "grad_norm": 0.1880892962217331, "learning_rate": 2.4598930481283424e-05, "loss": 1.1794, "step": 161 }, { "epoch": 0.782608695652174, "grad_norm": 0.17840184271335602, "learning_rate": 2.4064171122994652e-05, "loss": 1.1757, "step": 162 }, { "epoch": 0.7874396135265701, "grad_norm": 0.18409621715545654, "learning_rate": 2.3529411764705884e-05, "loss": 1.1935, "step": 163 }, { "epoch": 0.7922705314009661, "grad_norm": 0.18802137672901154, "learning_rate": 2.2994652406417115e-05, "loss": 1.1795, "step": 164 }, { "epoch": 0.7971014492753623, "grad_norm": 0.1941538006067276, "learning_rate": 2.2459893048128343e-05, "loss": 1.2069, "step": 165 }, { "epoch": 0.8019323671497585, "grad_norm": 0.18578243255615234, "learning_rate": 2.192513368983957e-05, "loss": 1.1965, "step": 166 }, { "epoch": 0.8067632850241546, "grad_norm": 0.17622320353984833, "learning_rate": 2.1390374331550803e-05, "loss": 1.1839, "step": 167 }, { "epoch": 0.8115942028985508, "grad_norm": 0.16080059111118317, "learning_rate": 2.0855614973262035e-05, "loss": 1.1926, "step": 168 }, { "epoch": 0.8164251207729468, "grad_norm": 0.14835765957832336, "learning_rate": 2.0320855614973263e-05, "loss": 1.1958, "step": 169 }, { "epoch": 0.821256038647343, "grad_norm": 0.14560697972774506, "learning_rate": 1.9786096256684494e-05, "loss": 1.2047, "step": 170 } ], "logging_steps": 1, "max_steps": 207, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.493930894749139e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }