{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9476110645431683, "eval_steps": 149, "global_step": 1788, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016764459346186086, "grad_norm": 58.5, "learning_rate": 4.0000000000000003e-07, "loss": 5.1954, "step": 1 }, { "epoch": 0.0016764459346186086, "eval_loss": 5.599944114685059, "eval_runtime": 169.2777, "eval_samples_per_second": 3.101, "eval_steps_per_second": 1.554, "step": 1 }, { "epoch": 0.003352891869237217, "grad_norm": 141.74281311035156, "learning_rate": 8.000000000000001e-07, "loss": 5.9212, "step": 2 }, { "epoch": 0.005029337803855826, "grad_norm": 68.65497589111328, "learning_rate": 1.2000000000000002e-06, "loss": 6.4494, "step": 3 }, { "epoch": 0.006705783738474434, "grad_norm": 50.80929946899414, "learning_rate": 1.6000000000000001e-06, "loss": 5.8571, "step": 4 }, { "epoch": 0.008382229673093043, "grad_norm": 42.32209396362305, "learning_rate": 2.0000000000000003e-06, "loss": 5.4878, "step": 5 }, { "epoch": 0.010058675607711651, "grad_norm": 74.76172637939453, "learning_rate": 2.4000000000000003e-06, "loss": 4.7397, "step": 6 }, { "epoch": 0.01173512154233026, "grad_norm": 48.762447357177734, "learning_rate": 2.8000000000000003e-06, "loss": 5.5576, "step": 7 }, { "epoch": 0.013411567476948869, "grad_norm": 31.65540313720703, "learning_rate": 3.2000000000000003e-06, "loss": 5.4924, "step": 8 }, { "epoch": 0.015088013411567477, "grad_norm": 56.13128662109375, "learning_rate": 3.6000000000000003e-06, "loss": 5.889, "step": 9 }, { "epoch": 0.016764459346186086, "grad_norm": 53.97425842285156, "learning_rate": 4.000000000000001e-06, "loss": 5.4841, "step": 10 }, { "epoch": 0.018440905280804692, "grad_norm": 54.93872833251953, "learning_rate": 4.4e-06, "loss": 5.213, "step": 11 }, { "epoch": 0.020117351215423303, "grad_norm": 23.41396713256836, "learning_rate": 4.800000000000001e-06, "loss": 5.1099, "step": 12 }, { "epoch": 0.02179379715004191, "grad_norm": 62.31817626953125, "learning_rate": 5.2e-06, "loss": 4.6125, "step": 13 }, { "epoch": 0.02347024308466052, "grad_norm": 38.71681594848633, "learning_rate": 5.600000000000001e-06, "loss": 5.3719, "step": 14 }, { "epoch": 0.025146689019279127, "grad_norm": 27.996841430664062, "learning_rate": 6e-06, "loss": 5.4445, "step": 15 }, { "epoch": 0.026823134953897737, "grad_norm": 14.310535430908203, "learning_rate": 6.4000000000000006e-06, "loss": 5.3801, "step": 16 }, { "epoch": 0.028499580888516344, "grad_norm": 17.935075759887695, "learning_rate": 6.800000000000001e-06, "loss": 5.6118, "step": 17 }, { "epoch": 0.030176026823134954, "grad_norm": 17.89678192138672, "learning_rate": 7.2000000000000005e-06, "loss": 5.0448, "step": 18 }, { "epoch": 0.03185247275775356, "grad_norm": 17.445358276367188, "learning_rate": 7.600000000000001e-06, "loss": 5.2372, "step": 19 }, { "epoch": 0.03352891869237217, "grad_norm": 15.668132781982422, "learning_rate": 8.000000000000001e-06, "loss": 4.9143, "step": 20 }, { "epoch": 0.03520536462699078, "grad_norm": 11.796053886413574, "learning_rate": 8.400000000000001e-06, "loss": 5.3982, "step": 21 }, { "epoch": 0.036881810561609385, "grad_norm": 11.359800338745117, "learning_rate": 8.8e-06, "loss": 5.3074, "step": 22 }, { "epoch": 0.038558256496227995, "grad_norm": 12.502777099609375, "learning_rate": 9.200000000000002e-06, "loss": 4.664, "step": 23 }, { "epoch": 0.040234702430846606, "grad_norm": 10.453709602355957, "learning_rate": 9.600000000000001e-06, "loss": 5.0563, "step": 24 }, { "epoch": 0.041911148365465216, "grad_norm": 8.813002586364746, "learning_rate": 1e-05, "loss": 5.2616, "step": 25 }, { "epoch": 0.04358759430008382, "grad_norm": 10.084755897521973, "learning_rate": 1.04e-05, "loss": 4.999, "step": 26 }, { "epoch": 0.04526404023470243, "grad_norm": 9.359565734863281, "learning_rate": 1.0800000000000002e-05, "loss": 4.0741, "step": 27 }, { "epoch": 0.04694048616932104, "grad_norm": 8.244972229003906, "learning_rate": 1.1200000000000001e-05, "loss": 4.6696, "step": 28 }, { "epoch": 0.04861693210393965, "grad_norm": 9.39881420135498, "learning_rate": 1.16e-05, "loss": 4.5297, "step": 29 }, { "epoch": 0.050293378038558254, "grad_norm": 10.552752494812012, "learning_rate": 1.2e-05, "loss": 5.1291, "step": 30 }, { "epoch": 0.051969823973176864, "grad_norm": 7.995792865753174, "learning_rate": 1.2400000000000002e-05, "loss": 4.6988, "step": 31 }, { "epoch": 0.053646269907795474, "grad_norm": 15.120522499084473, "learning_rate": 1.2800000000000001e-05, "loss": 4.7313, "step": 32 }, { "epoch": 0.055322715842414084, "grad_norm": 9.446954727172852, "learning_rate": 1.3200000000000002e-05, "loss": 4.585, "step": 33 }, { "epoch": 0.05699916177703269, "grad_norm": 15.281286239624023, "learning_rate": 1.3600000000000002e-05, "loss": 4.7482, "step": 34 }, { "epoch": 0.0586756077116513, "grad_norm": 11.006072998046875, "learning_rate": 1.4e-05, "loss": 4.7559, "step": 35 }, { "epoch": 0.06035205364626991, "grad_norm": 14.332401275634766, "learning_rate": 1.4400000000000001e-05, "loss": 4.6999, "step": 36 }, { "epoch": 0.06202849958088852, "grad_norm": 12.706934928894043, "learning_rate": 1.48e-05, "loss": 3.7666, "step": 37 }, { "epoch": 0.06370494551550712, "grad_norm": 14.815118789672852, "learning_rate": 1.5200000000000002e-05, "loss": 5.4161, "step": 38 }, { "epoch": 0.06538139145012574, "grad_norm": 10.75166130065918, "learning_rate": 1.5600000000000003e-05, "loss": 4.3832, "step": 39 }, { "epoch": 0.06705783738474434, "grad_norm": 11.158617973327637, "learning_rate": 1.6000000000000003e-05, "loss": 4.5008, "step": 40 }, { "epoch": 0.06873428331936295, "grad_norm": 8.258965492248535, "learning_rate": 1.64e-05, "loss": 4.7895, "step": 41 }, { "epoch": 0.07041072925398156, "grad_norm": 9.493805885314941, "learning_rate": 1.6800000000000002e-05, "loss": 4.272, "step": 42 }, { "epoch": 0.07208717518860017, "grad_norm": 12.44154167175293, "learning_rate": 1.72e-05, "loss": 4.3662, "step": 43 }, { "epoch": 0.07376362112321877, "grad_norm": 14.578902244567871, "learning_rate": 1.76e-05, "loss": 4.7212, "step": 44 }, { "epoch": 0.07544006705783739, "grad_norm": 11.346492767333984, "learning_rate": 1.8e-05, "loss": 3.9937, "step": 45 }, { "epoch": 0.07711651299245599, "grad_norm": 8.398422241210938, "learning_rate": 1.8400000000000003e-05, "loss": 4.2908, "step": 46 }, { "epoch": 0.07879295892707461, "grad_norm": 9.767253875732422, "learning_rate": 1.88e-05, "loss": 4.5815, "step": 47 }, { "epoch": 0.08046940486169321, "grad_norm": 34.539703369140625, "learning_rate": 1.9200000000000003e-05, "loss": 3.979, "step": 48 }, { "epoch": 0.08214585079631181, "grad_norm": 11.162960052490234, "learning_rate": 1.9600000000000002e-05, "loss": 4.245, "step": 49 }, { "epoch": 0.08382229673093043, "grad_norm": 11.143671035766602, "learning_rate": 2e-05, "loss": 4.052, "step": 50 }, { "epoch": 0.08549874266554904, "grad_norm": 15.760330200195312, "learning_rate": 1.999998366308358e-05, "loss": 4.3445, "step": 51 }, { "epoch": 0.08717518860016764, "grad_norm": 11.40070629119873, "learning_rate": 1.9999934652387694e-05, "loss": 4.0951, "step": 52 }, { "epoch": 0.08885163453478626, "grad_norm": 11.450596809387207, "learning_rate": 1.9999852968072475e-05, "loss": 4.2507, "step": 53 }, { "epoch": 0.09052808046940486, "grad_norm": 7.830483913421631, "learning_rate": 1.9999738610404825e-05, "loss": 3.9672, "step": 54 }, { "epoch": 0.09220452640402348, "grad_norm": 18.022193908691406, "learning_rate": 1.9999591579758392e-05, "loss": 4.5989, "step": 55 }, { "epoch": 0.09388097233864208, "grad_norm": 15.520894050598145, "learning_rate": 1.9999411876613578e-05, "loss": 4.3096, "step": 56 }, { "epoch": 0.09555741827326068, "grad_norm": 33.34592819213867, "learning_rate": 1.9999199501557543e-05, "loss": 4.0011, "step": 57 }, { "epoch": 0.0972338642078793, "grad_norm": 8.097467422485352, "learning_rate": 1.9998954455284204e-05, "loss": 3.8791, "step": 58 }, { "epoch": 0.0989103101424979, "grad_norm": 13.86074447631836, "learning_rate": 1.9998676738594214e-05, "loss": 3.9509, "step": 59 }, { "epoch": 0.10058675607711651, "grad_norm": 17.1189022064209, "learning_rate": 1.999836635239498e-05, "loss": 3.9808, "step": 60 }, { "epoch": 0.10226320201173512, "grad_norm": 42.220489501953125, "learning_rate": 1.9998023297700656e-05, "loss": 3.7983, "step": 61 }, { "epoch": 0.10393964794635373, "grad_norm": 36.6920166015625, "learning_rate": 1.9997647575632134e-05, "loss": 4.3651, "step": 62 }, { "epoch": 0.10561609388097234, "grad_norm": 24.66423988342285, "learning_rate": 1.9997239187417036e-05, "loss": 3.7808, "step": 63 }, { "epoch": 0.10729253981559095, "grad_norm": 8.944499015808105, "learning_rate": 1.9996798134389728e-05, "loss": 3.6383, "step": 64 }, { "epoch": 0.10896898575020955, "grad_norm": 12.11451244354248, "learning_rate": 1.99963244179913e-05, "loss": 4.2861, "step": 65 }, { "epoch": 0.11064543168482817, "grad_norm": 12.59256362915039, "learning_rate": 1.9995818039769564e-05, "loss": 4.3145, "step": 66 }, { "epoch": 0.11232187761944677, "grad_norm": 22.575510025024414, "learning_rate": 1.9995279001379045e-05, "loss": 4.3435, "step": 67 }, { "epoch": 0.11399832355406538, "grad_norm": 9.065831184387207, "learning_rate": 1.9994707304581e-05, "loss": 4.0242, "step": 68 }, { "epoch": 0.11567476948868399, "grad_norm": 62.657169342041016, "learning_rate": 1.999410295124337e-05, "loss": 3.7555, "step": 69 }, { "epoch": 0.1173512154233026, "grad_norm": 150.11578369140625, "learning_rate": 1.999346594334082e-05, "loss": 3.4173, "step": 70 }, { "epoch": 0.11902766135792121, "grad_norm": 9.965038299560547, "learning_rate": 1.999279628295469e-05, "loss": 3.7415, "step": 71 }, { "epoch": 0.12070410729253982, "grad_norm": 26.2466983795166, "learning_rate": 1.999209397227302e-05, "loss": 3.7272, "step": 72 }, { "epoch": 0.12238055322715842, "grad_norm": 11.477972984313965, "learning_rate": 1.999135901359053e-05, "loss": 3.9016, "step": 73 }, { "epoch": 0.12405699916177704, "grad_norm": 6.504822731018066, "learning_rate": 1.9990591409308607e-05, "loss": 3.7284, "step": 74 }, { "epoch": 0.12573344509639564, "grad_norm": 5.739537239074707, "learning_rate": 1.9989791161935317e-05, "loss": 3.8759, "step": 75 }, { "epoch": 0.12740989103101424, "grad_norm": 6.07656717300415, "learning_rate": 1.9988958274085367e-05, "loss": 4.062, "step": 76 }, { "epoch": 0.12908633696563285, "grad_norm": 9.112624168395996, "learning_rate": 1.9988092748480126e-05, "loss": 3.7878, "step": 77 }, { "epoch": 0.13076278290025148, "grad_norm": 7.942294597625732, "learning_rate": 1.9987194587947592e-05, "loss": 4.0672, "step": 78 }, { "epoch": 0.13243922883487008, "grad_norm": 7.153411388397217, "learning_rate": 1.9986263795422404e-05, "loss": 4.2913, "step": 79 }, { "epoch": 0.13411567476948869, "grad_norm": 16.938215255737305, "learning_rate": 1.998530037394582e-05, "loss": 4.5853, "step": 80 }, { "epoch": 0.1357921207041073, "grad_norm": 6.260412693023682, "learning_rate": 1.99843043266657e-05, "loss": 3.872, "step": 81 }, { "epoch": 0.1374685666387259, "grad_norm": 7.295062065124512, "learning_rate": 1.9983275656836517e-05, "loss": 3.3244, "step": 82 }, { "epoch": 0.13914501257334452, "grad_norm": 11.486648559570312, "learning_rate": 1.998221436781933e-05, "loss": 3.7609, "step": 83 }, { "epoch": 0.14082145850796313, "grad_norm": 9.9664306640625, "learning_rate": 1.9981120463081778e-05, "loss": 3.5541, "step": 84 }, { "epoch": 0.14249790444258173, "grad_norm": 30.60560417175293, "learning_rate": 1.9979993946198065e-05, "loss": 3.5852, "step": 85 }, { "epoch": 0.14417435037720033, "grad_norm": 29.788236618041992, "learning_rate": 1.9978834820848952e-05, "loss": 4.2221, "step": 86 }, { "epoch": 0.14585079631181894, "grad_norm": 14.765555381774902, "learning_rate": 1.9977643090821747e-05, "loss": 3.205, "step": 87 }, { "epoch": 0.14752724224643754, "grad_norm": 10.866264343261719, "learning_rate": 1.997641876001029e-05, "loss": 3.4118, "step": 88 }, { "epoch": 0.14920368818105617, "grad_norm": 14.126553535461426, "learning_rate": 1.997516183241494e-05, "loss": 4.0312, "step": 89 }, { "epoch": 0.15088013411567477, "grad_norm": 11.85935115814209, "learning_rate": 1.9973872312142556e-05, "loss": 3.2715, "step": 90 }, { "epoch": 0.15255658005029338, "grad_norm": 16.084854125976562, "learning_rate": 1.99725502034065e-05, "loss": 3.5786, "step": 91 }, { "epoch": 0.15423302598491198, "grad_norm": 5.636152267456055, "learning_rate": 1.9971195510526607e-05, "loss": 3.0696, "step": 92 }, { "epoch": 0.15590947191953058, "grad_norm": 14.159217834472656, "learning_rate": 1.9969808237929175e-05, "loss": 3.4602, "step": 93 }, { "epoch": 0.15758591785414922, "grad_norm": 7.173055648803711, "learning_rate": 1.996838839014696e-05, "loss": 3.3581, "step": 94 }, { "epoch": 0.15926236378876782, "grad_norm": 7.017744541168213, "learning_rate": 1.9966935971819145e-05, "loss": 4.3253, "step": 95 }, { "epoch": 0.16093880972338642, "grad_norm": 17.355792999267578, "learning_rate": 1.996545098769134e-05, "loss": 3.6124, "step": 96 }, { "epoch": 0.16261525565800503, "grad_norm": 10.390541076660156, "learning_rate": 1.9963933442615556e-05, "loss": 3.8271, "step": 97 }, { "epoch": 0.16429170159262363, "grad_norm": 8.931659698486328, "learning_rate": 1.9962383341550194e-05, "loss": 4.3342, "step": 98 }, { "epoch": 0.16596814752724226, "grad_norm": 10.540436744689941, "learning_rate": 1.996080068956003e-05, "loss": 3.7479, "step": 99 }, { "epoch": 0.16764459346186086, "grad_norm": 20.35198211669922, "learning_rate": 1.9959185491816192e-05, "loss": 3.2917, "step": 100 }, { "epoch": 0.16932103939647947, "grad_norm": 51.6972541809082, "learning_rate": 1.995753775359615e-05, "loss": 3.4786, "step": 101 }, { "epoch": 0.17099748533109807, "grad_norm": 11.492533683776855, "learning_rate": 1.9955857480283702e-05, "loss": 3.5485, "step": 102 }, { "epoch": 0.17267393126571667, "grad_norm": 9.516109466552734, "learning_rate": 1.9954144677368937e-05, "loss": 3.2608, "step": 103 }, { "epoch": 0.17435037720033528, "grad_norm": 13.29306697845459, "learning_rate": 1.9952399350448247e-05, "loss": 3.7446, "step": 104 }, { "epoch": 0.1760268231349539, "grad_norm": 55.154090881347656, "learning_rate": 1.9950621505224276e-05, "loss": 3.5877, "step": 105 }, { "epoch": 0.1777032690695725, "grad_norm": 14.906719207763672, "learning_rate": 1.994881114750593e-05, "loss": 3.6792, "step": 106 }, { "epoch": 0.17937971500419111, "grad_norm": 6.388235092163086, "learning_rate": 1.9946968283208342e-05, "loss": 3.6596, "step": 107 }, { "epoch": 0.18105616093880972, "grad_norm": 19.995182037353516, "learning_rate": 1.994509291835285e-05, "loss": 3.9101, "step": 108 }, { "epoch": 0.18273260687342832, "grad_norm": 14.770247459411621, "learning_rate": 1.9943185059067e-05, "loss": 3.9598, "step": 109 }, { "epoch": 0.18440905280804695, "grad_norm": 16.106491088867188, "learning_rate": 1.9941244711584497e-05, "loss": 3.2101, "step": 110 }, { "epoch": 0.18608549874266556, "grad_norm": 6.1567559242248535, "learning_rate": 1.993927188224519e-05, "loss": 2.9915, "step": 111 }, { "epoch": 0.18776194467728416, "grad_norm": 7.953608989715576, "learning_rate": 1.993726657749508e-05, "loss": 3.6421, "step": 112 }, { "epoch": 0.18943839061190276, "grad_norm": 15.104812622070312, "learning_rate": 1.993522880388626e-05, "loss": 3.6881, "step": 113 }, { "epoch": 0.19111483654652137, "grad_norm": 10.339803695678711, "learning_rate": 1.993315856807692e-05, "loss": 3.2873, "step": 114 }, { "epoch": 0.19279128248113997, "grad_norm": 7.849524021148682, "learning_rate": 1.9931055876831312e-05, "loss": 3.2027, "step": 115 }, { "epoch": 0.1944677284157586, "grad_norm": 9.873600006103516, "learning_rate": 1.9928920737019735e-05, "loss": 3.9538, "step": 116 }, { "epoch": 0.1961441743503772, "grad_norm": 14.066727638244629, "learning_rate": 1.992675315561851e-05, "loss": 3.0962, "step": 117 }, { "epoch": 0.1978206202849958, "grad_norm": 15.931619644165039, "learning_rate": 1.9924553139709957e-05, "loss": 3.6339, "step": 118 }, { "epoch": 0.1994970662196144, "grad_norm": 7.511065483093262, "learning_rate": 1.992232069648237e-05, "loss": 3.1209, "step": 119 }, { "epoch": 0.20117351215423301, "grad_norm": 6.674152374267578, "learning_rate": 1.9920055833229996e-05, "loss": 3.1333, "step": 120 }, { "epoch": 0.20284995808885165, "grad_norm": 7.845397472381592, "learning_rate": 1.991775855735301e-05, "loss": 3.6301, "step": 121 }, { "epoch": 0.20452640402347025, "grad_norm": 13.73812198638916, "learning_rate": 1.99154288763575e-05, "loss": 3.8211, "step": 122 }, { "epoch": 0.20620284995808885, "grad_norm": 13.48192024230957, "learning_rate": 1.991306679785542e-05, "loss": 3.3921, "step": 123 }, { "epoch": 0.20787929589270746, "grad_norm": 14.931389808654785, "learning_rate": 1.9910672329564584e-05, "loss": 3.5289, "step": 124 }, { "epoch": 0.20955574182732606, "grad_norm": 10.047006607055664, "learning_rate": 1.9908245479308643e-05, "loss": 4.1484, "step": 125 }, { "epoch": 0.2112321877619447, "grad_norm": 7.147514343261719, "learning_rate": 1.9905786255017046e-05, "loss": 3.5322, "step": 126 }, { "epoch": 0.2129086336965633, "grad_norm": 6.860592842102051, "learning_rate": 1.9903294664725023e-05, "loss": 3.6278, "step": 127 }, { "epoch": 0.2145850796311819, "grad_norm": 10.078204154968262, "learning_rate": 1.9900770716573545e-05, "loss": 2.983, "step": 128 }, { "epoch": 0.2162615255658005, "grad_norm": 28.765235900878906, "learning_rate": 1.989821441880933e-05, "loss": 3.5085, "step": 129 }, { "epoch": 0.2179379715004191, "grad_norm": 18.207780838012695, "learning_rate": 1.9895625779784774e-05, "loss": 3.1579, "step": 130 }, { "epoch": 0.2196144174350377, "grad_norm": 30.48822021484375, "learning_rate": 1.9893004807957957e-05, "loss": 2.9274, "step": 131 }, { "epoch": 0.22129086336965634, "grad_norm": 8.024901390075684, "learning_rate": 1.98903515118926e-05, "loss": 3.1684, "step": 132 }, { "epoch": 0.22296730930427494, "grad_norm": 28.956817626953125, "learning_rate": 1.9887665900258035e-05, "loss": 3.464, "step": 133 }, { "epoch": 0.22464375523889354, "grad_norm": 7.361958980560303, "learning_rate": 1.9884947981829186e-05, "loss": 3.7481, "step": 134 }, { "epoch": 0.22632020117351215, "grad_norm": 10.898969650268555, "learning_rate": 1.9882197765486533e-05, "loss": 3.1425, "step": 135 }, { "epoch": 0.22799664710813075, "grad_norm": 43.5575065612793, "learning_rate": 1.987941526021609e-05, "loss": 3.6062, "step": 136 }, { "epoch": 0.22967309304274938, "grad_norm": 62.12515640258789, "learning_rate": 1.987660047510936e-05, "loss": 3.1708, "step": 137 }, { "epoch": 0.23134953897736799, "grad_norm": 15.311452865600586, "learning_rate": 1.9873753419363336e-05, "loss": 3.3333, "step": 138 }, { "epoch": 0.2330259849119866, "grad_norm": 23.646188735961914, "learning_rate": 1.9870874102280433e-05, "loss": 3.3916, "step": 139 }, { "epoch": 0.2347024308466052, "grad_norm": 35.26377868652344, "learning_rate": 1.9867962533268487e-05, "loss": 2.9367, "step": 140 }, { "epoch": 0.2363788767812238, "grad_norm": 5.207340240478516, "learning_rate": 1.9865018721840708e-05, "loss": 3.5029, "step": 141 }, { "epoch": 0.23805532271584243, "grad_norm": 7.34952449798584, "learning_rate": 1.9862042677615657e-05, "loss": 3.3062, "step": 142 }, { "epoch": 0.23973176865046103, "grad_norm": 7.3016862869262695, "learning_rate": 1.985903441031721e-05, "loss": 3.5031, "step": 143 }, { "epoch": 0.24140821458507963, "grad_norm": 10.994498252868652, "learning_rate": 1.9855993929774532e-05, "loss": 3.6388, "step": 144 }, { "epoch": 0.24308466051969824, "grad_norm": 7.479041576385498, "learning_rate": 1.9852921245922035e-05, "loss": 3.0133, "step": 145 }, { "epoch": 0.24476110645431684, "grad_norm": 13.122299194335938, "learning_rate": 1.9849816368799356e-05, "loss": 3.056, "step": 146 }, { "epoch": 0.24643755238893544, "grad_norm": 7.495057582855225, "learning_rate": 1.984667930855132e-05, "loss": 2.8964, "step": 147 }, { "epoch": 0.24811399832355407, "grad_norm": 14.617401123046875, "learning_rate": 1.9843510075427902e-05, "loss": 3.0264, "step": 148 }, { "epoch": 0.24979044425817268, "grad_norm": 8.2734375, "learning_rate": 1.9840308679784207e-05, "loss": 3.6973, "step": 149 }, { "epoch": 0.24979044425817268, "eval_loss": 3.3824267387390137, "eval_runtime": 170.3666, "eval_samples_per_second": 3.082, "eval_steps_per_second": 1.544, "step": 149 }, { "epoch": 0.2514668901927913, "grad_norm": 12.668761253356934, "learning_rate": 1.9837075132080418e-05, "loss": 3.4542, "step": 150 }, { "epoch": 0.2531433361274099, "grad_norm": 9.176738739013672, "learning_rate": 1.9833809442881776e-05, "loss": 3.0109, "step": 151 }, { "epoch": 0.2548197820620285, "grad_norm": 7.975216865539551, "learning_rate": 1.9830511622858535e-05, "loss": 3.3234, "step": 152 }, { "epoch": 0.2564962279966471, "grad_norm": 13.378955841064453, "learning_rate": 1.982718168278594e-05, "loss": 3.5247, "step": 153 }, { "epoch": 0.2581726739312657, "grad_norm": 6.3466105461120605, "learning_rate": 1.9823819633544185e-05, "loss": 3.4305, "step": 154 }, { "epoch": 0.2598491198658843, "grad_norm": 23.52126121520996, "learning_rate": 1.9820425486118364e-05, "loss": 3.3366, "step": 155 }, { "epoch": 0.26152556580050296, "grad_norm": 10.38585090637207, "learning_rate": 1.981699925159847e-05, "loss": 3.4605, "step": 156 }, { "epoch": 0.26320201173512153, "grad_norm": 8.341891288757324, "learning_rate": 1.9813540941179313e-05, "loss": 3.4238, "step": 157 }, { "epoch": 0.26487845766974016, "grad_norm": 6.649683952331543, "learning_rate": 1.981005056616053e-05, "loss": 3.3105, "step": 158 }, { "epoch": 0.26655490360435874, "grad_norm": 6.743122577667236, "learning_rate": 1.9806528137946506e-05, "loss": 3.0913, "step": 159 }, { "epoch": 0.26823134953897737, "grad_norm": 7.338756084442139, "learning_rate": 1.9802973668046364e-05, "loss": 3.3606, "step": 160 }, { "epoch": 0.269907795473596, "grad_norm": 12.344084739685059, "learning_rate": 1.979938716807392e-05, "loss": 3.1622, "step": 161 }, { "epoch": 0.2715842414082146, "grad_norm": 27.183156967163086, "learning_rate": 1.9795768649747648e-05, "loss": 3.1674, "step": 162 }, { "epoch": 0.2732606873428332, "grad_norm": 8.633094787597656, "learning_rate": 1.9792118124890633e-05, "loss": 3.7152, "step": 163 }, { "epoch": 0.2749371332774518, "grad_norm": 21.54610252380371, "learning_rate": 1.9788435605430535e-05, "loss": 3.2184, "step": 164 }, { "epoch": 0.2766135792120704, "grad_norm": 6.796072959899902, "learning_rate": 1.978472110339956e-05, "loss": 3.213, "step": 165 }, { "epoch": 0.27829002514668905, "grad_norm": 14.462117195129395, "learning_rate": 1.978097463093441e-05, "loss": 3.2321, "step": 166 }, { "epoch": 0.2799664710813076, "grad_norm": 33.00666046142578, "learning_rate": 1.9777196200276244e-05, "loss": 3.5638, "step": 167 }, { "epoch": 0.28164291701592625, "grad_norm": 7.086273193359375, "learning_rate": 1.9773385823770646e-05, "loss": 3.385, "step": 168 }, { "epoch": 0.28331936295054483, "grad_norm": 10.10799789428711, "learning_rate": 1.9769543513867576e-05, "loss": 3.3713, "step": 169 }, { "epoch": 0.28499580888516346, "grad_norm": 5.3273606300354, "learning_rate": 1.976566928312133e-05, "loss": 3.1975, "step": 170 }, { "epoch": 0.28667225481978204, "grad_norm": 15.568498611450195, "learning_rate": 1.976176314419051e-05, "loss": 2.7329, "step": 171 }, { "epoch": 0.28834870075440067, "grad_norm": 15.142104148864746, "learning_rate": 1.9757825109837965e-05, "loss": 3.3503, "step": 172 }, { "epoch": 0.2900251466890193, "grad_norm": 8.253128051757812, "learning_rate": 1.9753855192930766e-05, "loss": 3.6897, "step": 173 }, { "epoch": 0.2917015926236379, "grad_norm": 23.802371978759766, "learning_rate": 1.9749853406440147e-05, "loss": 3.138, "step": 174 }, { "epoch": 0.2933780385582565, "grad_norm": 5.703595161437988, "learning_rate": 1.9745819763441483e-05, "loss": 3.504, "step": 175 }, { "epoch": 0.2950544844928751, "grad_norm": 10.943706512451172, "learning_rate": 1.974175427711423e-05, "loss": 3.881, "step": 176 }, { "epoch": 0.2967309304274937, "grad_norm": 7.355160713195801, "learning_rate": 1.9737656960741895e-05, "loss": 3.4582, "step": 177 }, { "epoch": 0.29840737636211234, "grad_norm": 5.9141645431518555, "learning_rate": 1.9733527827711977e-05, "loss": 3.1351, "step": 178 }, { "epoch": 0.3000838222967309, "grad_norm": 5.769615650177002, "learning_rate": 1.9729366891515934e-05, "loss": 2.8578, "step": 179 }, { "epoch": 0.30176026823134955, "grad_norm": 50.10456848144531, "learning_rate": 1.9725174165749143e-05, "loss": 3.0525, "step": 180 }, { "epoch": 0.3034367141659681, "grad_norm": 6.335470199584961, "learning_rate": 1.9720949664110843e-05, "loss": 3.3414, "step": 181 }, { "epoch": 0.30511316010058676, "grad_norm": 5.255411624908447, "learning_rate": 1.97166934004041e-05, "loss": 3.5683, "step": 182 }, { "epoch": 0.3067896060352054, "grad_norm": 16.621810913085938, "learning_rate": 1.9712405388535766e-05, "loss": 3.5738, "step": 183 }, { "epoch": 0.30846605196982396, "grad_norm": 8.535473823547363, "learning_rate": 1.970808564251641e-05, "loss": 3.6592, "step": 184 }, { "epoch": 0.3101424979044426, "grad_norm": 10.178449630737305, "learning_rate": 1.9703734176460302e-05, "loss": 3.6652, "step": 185 }, { "epoch": 0.31181894383906117, "grad_norm": 16.28668785095215, "learning_rate": 1.9699351004585354e-05, "loss": 3.1958, "step": 186 }, { "epoch": 0.3134953897736798, "grad_norm": 20.59012794494629, "learning_rate": 1.969493614121306e-05, "loss": 3.0984, "step": 187 }, { "epoch": 0.31517183570829843, "grad_norm": 9.563077926635742, "learning_rate": 1.9690489600768476e-05, "loss": 2.803, "step": 188 }, { "epoch": 0.316848281642917, "grad_norm": 10.58127212524414, "learning_rate": 1.968601139778015e-05, "loss": 3.0974, "step": 189 }, { "epoch": 0.31852472757753564, "grad_norm": 9.850335121154785, "learning_rate": 1.9681501546880094e-05, "loss": 3.1858, "step": 190 }, { "epoch": 0.3202011735121542, "grad_norm": 7.609768390655518, "learning_rate": 1.9676960062803714e-05, "loss": 2.736, "step": 191 }, { "epoch": 0.32187761944677284, "grad_norm": 23.370101928710938, "learning_rate": 1.967238696038978e-05, "loss": 3.14, "step": 192 }, { "epoch": 0.3235540653813915, "grad_norm": 36.561622619628906, "learning_rate": 1.9667782254580373e-05, "loss": 3.256, "step": 193 }, { "epoch": 0.32523051131601005, "grad_norm": 5.4426140785217285, "learning_rate": 1.9663145960420828e-05, "loss": 2.9023, "step": 194 }, { "epoch": 0.3269069572506287, "grad_norm": 8.721763610839844, "learning_rate": 1.96584780930597e-05, "loss": 3.0318, "step": 195 }, { "epoch": 0.32858340318524726, "grad_norm": 7.816901683807373, "learning_rate": 1.9653778667748695e-05, "loss": 3.4148, "step": 196 }, { "epoch": 0.3302598491198659, "grad_norm": 5.479881763458252, "learning_rate": 1.964904769984264e-05, "loss": 3.2733, "step": 197 }, { "epoch": 0.3319362950544845, "grad_norm": 5.068490028381348, "learning_rate": 1.9644285204799424e-05, "loss": 3.3347, "step": 198 }, { "epoch": 0.3336127409891031, "grad_norm": 50.14521789550781, "learning_rate": 1.9639491198179935e-05, "loss": 3.2234, "step": 199 }, { "epoch": 0.3352891869237217, "grad_norm": 9.903667449951172, "learning_rate": 1.9634665695648035e-05, "loss": 2.8101, "step": 200 }, { "epoch": 0.3369656328583403, "grad_norm": 29.837238311767578, "learning_rate": 1.962980871297049e-05, "loss": 3.05, "step": 201 }, { "epoch": 0.33864207879295893, "grad_norm": 8.479269027709961, "learning_rate": 1.962492026601693e-05, "loss": 2.7328, "step": 202 }, { "epoch": 0.3403185247275775, "grad_norm": 5.382146835327148, "learning_rate": 1.9620000370759767e-05, "loss": 3.0454, "step": 203 }, { "epoch": 0.34199497066219614, "grad_norm": 7.922940254211426, "learning_rate": 1.9615049043274207e-05, "loss": 3.168, "step": 204 }, { "epoch": 0.34367141659681477, "grad_norm": 17.525190353393555, "learning_rate": 1.961006629973812e-05, "loss": 3.3556, "step": 205 }, { "epoch": 0.34534786253143335, "grad_norm": 8.701769828796387, "learning_rate": 1.9605052156432042e-05, "loss": 3.3072, "step": 206 }, { "epoch": 0.347024308466052, "grad_norm": 12.11384391784668, "learning_rate": 1.9600006629739105e-05, "loss": 3.6212, "step": 207 }, { "epoch": 0.34870075440067055, "grad_norm": 9.948774337768555, "learning_rate": 1.9594929736144978e-05, "loss": 3.2593, "step": 208 }, { "epoch": 0.3503772003352892, "grad_norm": 7.080051898956299, "learning_rate": 1.958982149223781e-05, "loss": 3.222, "step": 209 }, { "epoch": 0.3520536462699078, "grad_norm": 12.150545120239258, "learning_rate": 1.95846819147082e-05, "loss": 2.772, "step": 210 }, { "epoch": 0.3537300922045264, "grad_norm": 35.680171966552734, "learning_rate": 1.9579511020349117e-05, "loss": 2.7393, "step": 211 }, { "epoch": 0.355406538139145, "grad_norm": 7.08084774017334, "learning_rate": 1.9574308826055853e-05, "loss": 2.9431, "step": 212 }, { "epoch": 0.3570829840737636, "grad_norm": 11.862239837646484, "learning_rate": 1.956907534882597e-05, "loss": 2.8458, "step": 213 }, { "epoch": 0.35875943000838223, "grad_norm": 22.763423919677734, "learning_rate": 1.9563810605759242e-05, "loss": 3.4575, "step": 214 }, { "epoch": 0.36043587594300086, "grad_norm": 7.000513553619385, "learning_rate": 1.955851461405761e-05, "loss": 3.2901, "step": 215 }, { "epoch": 0.36211232187761944, "grad_norm": 67.07561492919922, "learning_rate": 1.9553187391025102e-05, "loss": 2.5827, "step": 216 }, { "epoch": 0.36378876781223807, "grad_norm": 21.986820220947266, "learning_rate": 1.95478289540678e-05, "loss": 2.8311, "step": 217 }, { "epoch": 0.36546521374685664, "grad_norm": 8.331358909606934, "learning_rate": 1.954243932069377e-05, "loss": 3.2158, "step": 218 }, { "epoch": 0.3671416596814753, "grad_norm": 37.39122009277344, "learning_rate": 1.9537018508513013e-05, "loss": 2.4418, "step": 219 }, { "epoch": 0.3688181056160939, "grad_norm": 13.720285415649414, "learning_rate": 1.9531566535237397e-05, "loss": 2.5423, "step": 220 }, { "epoch": 0.3704945515507125, "grad_norm": 10.579797744750977, "learning_rate": 1.952608341868061e-05, "loss": 3.2097, "step": 221 }, { "epoch": 0.3721709974853311, "grad_norm": 8.911162376403809, "learning_rate": 1.9520569176758096e-05, "loss": 3.2593, "step": 222 }, { "epoch": 0.3738474434199497, "grad_norm": 7.1473469734191895, "learning_rate": 1.9515023827486993e-05, "loss": 2.9501, "step": 223 }, { "epoch": 0.3755238893545683, "grad_norm": 6.834731578826904, "learning_rate": 1.9509447388986086e-05, "loss": 3.0714, "step": 224 }, { "epoch": 0.37720033528918695, "grad_norm": 14.296359062194824, "learning_rate": 1.9503839879475737e-05, "loss": 3.3976, "step": 225 }, { "epoch": 0.3788767812238055, "grad_norm": 5.17194938659668, "learning_rate": 1.949820131727783e-05, "loss": 2.9592, "step": 226 }, { "epoch": 0.38055322715842416, "grad_norm": 7.428761959075928, "learning_rate": 1.9492531720815702e-05, "loss": 3.5138, "step": 227 }, { "epoch": 0.38222967309304273, "grad_norm": 13.421716690063477, "learning_rate": 1.9486831108614106e-05, "loss": 3.0787, "step": 228 }, { "epoch": 0.38390611902766136, "grad_norm": 12.17398738861084, "learning_rate": 1.9481099499299122e-05, "loss": 2.9996, "step": 229 }, { "epoch": 0.38558256496227994, "grad_norm": 12.111760139465332, "learning_rate": 1.9475336911598117e-05, "loss": 3.2421, "step": 230 }, { "epoch": 0.38725901089689857, "grad_norm": 13.06086254119873, "learning_rate": 1.9469543364339674e-05, "loss": 3.3182, "step": 231 }, { "epoch": 0.3889354568315172, "grad_norm": 11.486510276794434, "learning_rate": 1.946371887645353e-05, "loss": 3.5173, "step": 232 }, { "epoch": 0.3906119027661358, "grad_norm": 17.7554874420166, "learning_rate": 1.9457863466970522e-05, "loss": 3.0414, "step": 233 }, { "epoch": 0.3922883487007544, "grad_norm": 13.917510032653809, "learning_rate": 1.9451977155022513e-05, "loss": 3.0203, "step": 234 }, { "epoch": 0.393964794635373, "grad_norm": 5.215061187744141, "learning_rate": 1.9446059959842345e-05, "loss": 2.9301, "step": 235 }, { "epoch": 0.3956412405699916, "grad_norm": 7.793075084686279, "learning_rate": 1.9440111900763758e-05, "loss": 3.1822, "step": 236 }, { "epoch": 0.39731768650461025, "grad_norm": 35.7464714050293, "learning_rate": 1.9434132997221347e-05, "loss": 3.2708, "step": 237 }, { "epoch": 0.3989941324392288, "grad_norm": 15.724870681762695, "learning_rate": 1.9428123268750477e-05, "loss": 2.8611, "step": 238 }, { "epoch": 0.40067057837384745, "grad_norm": 6.313983917236328, "learning_rate": 1.942208273498723e-05, "loss": 3.1243, "step": 239 }, { "epoch": 0.40234702430846603, "grad_norm": 6.267673015594482, "learning_rate": 1.941601141566836e-05, "loss": 3.3727, "step": 240 }, { "epoch": 0.40402347024308466, "grad_norm": 11.894177436828613, "learning_rate": 1.940990933063118e-05, "loss": 3.1259, "step": 241 }, { "epoch": 0.4056999161777033, "grad_norm": 89.67118072509766, "learning_rate": 1.9403776499813546e-05, "loss": 3.1069, "step": 242 }, { "epoch": 0.40737636211232187, "grad_norm": 24.595121383666992, "learning_rate": 1.939761294325376e-05, "loss": 3.0565, "step": 243 }, { "epoch": 0.4090528080469405, "grad_norm": 11.026915550231934, "learning_rate": 1.9391418681090538e-05, "loss": 2.8763, "step": 244 }, { "epoch": 0.4107292539815591, "grad_norm": 8.579183578491211, "learning_rate": 1.9385193733562895e-05, "loss": 3.129, "step": 245 }, { "epoch": 0.4124056999161777, "grad_norm": 8.967629432678223, "learning_rate": 1.9378938121010128e-05, "loss": 2.954, "step": 246 }, { "epoch": 0.41408214585079633, "grad_norm": 9.92216968536377, "learning_rate": 1.937265186387172e-05, "loss": 3.0789, "step": 247 }, { "epoch": 0.4157585917854149, "grad_norm": 6.382918357849121, "learning_rate": 1.936633498268728e-05, "loss": 3.12, "step": 248 }, { "epoch": 0.41743503772003354, "grad_norm": 15.347322463989258, "learning_rate": 1.9359987498096483e-05, "loss": 3.0589, "step": 249 }, { "epoch": 0.4191114836546521, "grad_norm": 11.398663520812988, "learning_rate": 1.9353609430838987e-05, "loss": 2.9631, "step": 250 }, { "epoch": 0.42078792958927075, "grad_norm": 29.425142288208008, "learning_rate": 1.9347200801754395e-05, "loss": 2.9028, "step": 251 }, { "epoch": 0.4224643755238894, "grad_norm": 8.544347763061523, "learning_rate": 1.9340761631782142e-05, "loss": 2.9608, "step": 252 }, { "epoch": 0.42414082145850796, "grad_norm": 10.931676864624023, "learning_rate": 1.9334291941961472e-05, "loss": 3.1047, "step": 253 }, { "epoch": 0.4258172673931266, "grad_norm": 17.0784912109375, "learning_rate": 1.932779175343134e-05, "loss": 3.1698, "step": 254 }, { "epoch": 0.42749371332774516, "grad_norm": 5.1688690185546875, "learning_rate": 1.932126108743035e-05, "loss": 3.246, "step": 255 }, { "epoch": 0.4291701592623638, "grad_norm": 8.136938095092773, "learning_rate": 1.9314699965296694e-05, "loss": 3.5841, "step": 256 }, { "epoch": 0.4308466051969824, "grad_norm": 9.045907020568848, "learning_rate": 1.930810840846807e-05, "loss": 3.4752, "step": 257 }, { "epoch": 0.432523051131601, "grad_norm": 11.020782470703125, "learning_rate": 1.9301486438481628e-05, "loss": 2.7305, "step": 258 }, { "epoch": 0.43419949706621963, "grad_norm": 5.10975980758667, "learning_rate": 1.9294834076973872e-05, "loss": 2.5905, "step": 259 }, { "epoch": 0.4358759430008382, "grad_norm": 10.9817533493042, "learning_rate": 1.9288151345680623e-05, "loss": 2.9957, "step": 260 }, { "epoch": 0.43755238893545684, "grad_norm": 8.335543632507324, "learning_rate": 1.9281438266436923e-05, "loss": 3.1701, "step": 261 }, { "epoch": 0.4392288348700754, "grad_norm": 14.775369644165039, "learning_rate": 1.927469486117698e-05, "loss": 3.2502, "step": 262 }, { "epoch": 0.44090528080469404, "grad_norm": 5.46981954574585, "learning_rate": 1.926792115193407e-05, "loss": 2.9081, "step": 263 }, { "epoch": 0.4425817267393127, "grad_norm": 6.404331207275391, "learning_rate": 1.9261117160840516e-05, "loss": 2.9836, "step": 264 }, { "epoch": 0.44425817267393125, "grad_norm": 11.281539916992188, "learning_rate": 1.9254282910127554e-05, "loss": 3.2841, "step": 265 }, { "epoch": 0.4459346186085499, "grad_norm": 18.983095169067383, "learning_rate": 1.92474184221253e-05, "loss": 2.9025, "step": 266 }, { "epoch": 0.44761106454316846, "grad_norm": 20.735675811767578, "learning_rate": 1.9240523719262672e-05, "loss": 3.2832, "step": 267 }, { "epoch": 0.4492875104777871, "grad_norm": 46.4406852722168, "learning_rate": 1.9233598824067302e-05, "loss": 2.9044, "step": 268 }, { "epoch": 0.4509639564124057, "grad_norm": 7.444986820220947, "learning_rate": 1.922664375916548e-05, "loss": 3.1917, "step": 269 }, { "epoch": 0.4526404023470243, "grad_norm": 32.7586669921875, "learning_rate": 1.921965854728207e-05, "loss": 2.3738, "step": 270 }, { "epoch": 0.4543168482816429, "grad_norm": 6.521704196929932, "learning_rate": 1.9212643211240433e-05, "loss": 3.307, "step": 271 }, { "epoch": 0.4559932942162615, "grad_norm": 14.991122245788574, "learning_rate": 1.920559777396236e-05, "loss": 2.832, "step": 272 }, { "epoch": 0.45766974015088013, "grad_norm": 17.04147720336914, "learning_rate": 1.9198522258468e-05, "loss": 3.1575, "step": 273 }, { "epoch": 0.45934618608549876, "grad_norm": 16.540546417236328, "learning_rate": 1.919141668787577e-05, "loss": 3.1848, "step": 274 }, { "epoch": 0.46102263202011734, "grad_norm": 8.397128105163574, "learning_rate": 1.9184281085402293e-05, "loss": 2.8428, "step": 275 }, { "epoch": 0.46269907795473597, "grad_norm": 8.707103729248047, "learning_rate": 1.917711547436232e-05, "loss": 3.2335, "step": 276 }, { "epoch": 0.46437552388935455, "grad_norm": 12.802251815795898, "learning_rate": 1.9169919878168648e-05, "loss": 2.7268, "step": 277 }, { "epoch": 0.4660519698239732, "grad_norm": 5.073677062988281, "learning_rate": 1.9162694320332047e-05, "loss": 3.0764, "step": 278 }, { "epoch": 0.4677284157585918, "grad_norm": 7.319097995758057, "learning_rate": 1.915543882446118e-05, "loss": 3.3301, "step": 279 }, { "epoch": 0.4694048616932104, "grad_norm": 4.765015602111816, "learning_rate": 1.914815341426254e-05, "loss": 2.3654, "step": 280 }, { "epoch": 0.471081307627829, "grad_norm": 23.900737762451172, "learning_rate": 1.9140838113540347e-05, "loss": 2.6325, "step": 281 }, { "epoch": 0.4727577535624476, "grad_norm": 8.720821380615234, "learning_rate": 1.9133492946196498e-05, "loss": 2.8902, "step": 282 }, { "epoch": 0.4744341994970662, "grad_norm": 6.236704349517822, "learning_rate": 1.912611793623047e-05, "loss": 3.077, "step": 283 }, { "epoch": 0.47611064543168485, "grad_norm": 6.501819610595703, "learning_rate": 1.9118713107739246e-05, "loss": 2.8594, "step": 284 }, { "epoch": 0.47778709136630343, "grad_norm": 11.875177383422852, "learning_rate": 1.9111278484917236e-05, "loss": 2.7059, "step": 285 }, { "epoch": 0.47946353730092206, "grad_norm": 19.49224090576172, "learning_rate": 1.9103814092056205e-05, "loss": 2.6698, "step": 286 }, { "epoch": 0.48113998323554064, "grad_norm": 4.689838409423828, "learning_rate": 1.9096319953545186e-05, "loss": 2.3858, "step": 287 }, { "epoch": 0.48281642917015927, "grad_norm": 18.400123596191406, "learning_rate": 1.90887960938704e-05, "loss": 3.4256, "step": 288 }, { "epoch": 0.4844928751047779, "grad_norm": 6.435288906097412, "learning_rate": 1.908124253761518e-05, "loss": 3.5192, "step": 289 }, { "epoch": 0.4861693210393965, "grad_norm": 9.22422981262207, "learning_rate": 1.9073659309459897e-05, "loss": 3.2422, "step": 290 }, { "epoch": 0.4878457669740151, "grad_norm": 10.713644027709961, "learning_rate": 1.9066046434181854e-05, "loss": 2.756, "step": 291 }, { "epoch": 0.4895222129086337, "grad_norm": 5.319098949432373, "learning_rate": 1.9058403936655235e-05, "loss": 3.1851, "step": 292 }, { "epoch": 0.4911986588432523, "grad_norm": 9.979037284851074, "learning_rate": 1.9050731841851008e-05, "loss": 2.6041, "step": 293 }, { "epoch": 0.4928751047778709, "grad_norm": 7.896168231964111, "learning_rate": 1.9043030174836852e-05, "loss": 3.0118, "step": 294 }, { "epoch": 0.4945515507124895, "grad_norm": 8.449163436889648, "learning_rate": 1.9035298960777063e-05, "loss": 2.863, "step": 295 }, { "epoch": 0.49622799664710815, "grad_norm": 7.167267799377441, "learning_rate": 1.902753822493248e-05, "loss": 2.5699, "step": 296 }, { "epoch": 0.4979044425817267, "grad_norm": 6.998987197875977, "learning_rate": 1.90197479926604e-05, "loss": 2.6554, "step": 297 }, { "epoch": 0.49958088851634536, "grad_norm": 5.500977993011475, "learning_rate": 1.9011928289414502e-05, "loss": 2.8722, "step": 298 }, { "epoch": 0.49958088851634536, "eval_loss": 2.926950454711914, "eval_runtime": 170.0066, "eval_samples_per_second": 3.088, "eval_steps_per_second": 1.547, "step": 298 }, { "epoch": 0.501257334450964, "grad_norm": 5.163046360015869, "learning_rate": 1.9004079140744745e-05, "loss": 2.8579, "step": 299 }, { "epoch": 0.5029337803855826, "grad_norm": 5.7494354248046875, "learning_rate": 1.899620057229732e-05, "loss": 2.978, "step": 300 }, { "epoch": 0.5046102263202011, "grad_norm": 6.08455753326416, "learning_rate": 1.8988292609814513e-05, "loss": 2.8236, "step": 301 }, { "epoch": 0.5062866722548198, "grad_norm": 7.003081798553467, "learning_rate": 1.8980355279134682e-05, "loss": 3.0391, "step": 302 }, { "epoch": 0.5079631181894384, "grad_norm": 4.652783393859863, "learning_rate": 1.8972388606192124e-05, "loss": 2.8344, "step": 303 }, { "epoch": 0.509639564124057, "grad_norm": 10.888507843017578, "learning_rate": 1.8964392617017013e-05, "loss": 2.7384, "step": 304 }, { "epoch": 0.5113160100586757, "grad_norm": 7.572864055633545, "learning_rate": 1.895636733773531e-05, "loss": 2.8011, "step": 305 }, { "epoch": 0.5129924559932942, "grad_norm": 14.348418235778809, "learning_rate": 1.8948312794568674e-05, "loss": 2.9687, "step": 306 }, { "epoch": 0.5146689019279128, "grad_norm": 21.65433692932129, "learning_rate": 1.8940229013834397e-05, "loss": 2.8686, "step": 307 }, { "epoch": 0.5163453478625314, "grad_norm": 14.115485191345215, "learning_rate": 1.8932116021945277e-05, "loss": 2.9705, "step": 308 }, { "epoch": 0.5180217937971501, "grad_norm": 5.344349384307861, "learning_rate": 1.8923973845409575e-05, "loss": 3.1608, "step": 309 }, { "epoch": 0.5196982397317687, "grad_norm": 29.685548782348633, "learning_rate": 1.8915802510830895e-05, "loss": 3.0713, "step": 310 }, { "epoch": 0.5213746856663872, "grad_norm": 11.76620864868164, "learning_rate": 1.8907602044908125e-05, "loss": 2.6452, "step": 311 }, { "epoch": 0.5230511316010059, "grad_norm": 28.28868865966797, "learning_rate": 1.8899372474435335e-05, "loss": 2.9104, "step": 312 }, { "epoch": 0.5247275775356245, "grad_norm": 15.250394821166992, "learning_rate": 1.8891113826301675e-05, "loss": 2.8746, "step": 313 }, { "epoch": 0.5264040234702431, "grad_norm": 24.13912010192871, "learning_rate": 1.888282612749132e-05, "loss": 2.8818, "step": 314 }, { "epoch": 0.5280804694048616, "grad_norm": 17.81316566467285, "learning_rate": 1.8874509405083358e-05, "loss": 2.8672, "step": 315 }, { "epoch": 0.5297569153394803, "grad_norm": 13.147285461425781, "learning_rate": 1.886616368625171e-05, "loss": 2.7462, "step": 316 }, { "epoch": 0.5314333612740989, "grad_norm": 9.764019966125488, "learning_rate": 1.8857788998265033e-05, "loss": 3.1994, "step": 317 }, { "epoch": 0.5331098072087175, "grad_norm": 5.41625452041626, "learning_rate": 1.884938536848665e-05, "loss": 2.6813, "step": 318 }, { "epoch": 0.5347862531433362, "grad_norm": 6.776956558227539, "learning_rate": 1.8840952824374433e-05, "loss": 2.7655, "step": 319 }, { "epoch": 0.5364626990779547, "grad_norm": 19.199420928955078, "learning_rate": 1.883249139348074e-05, "loss": 2.8305, "step": 320 }, { "epoch": 0.5381391450125733, "grad_norm": 9.40645980834961, "learning_rate": 1.8824001103452316e-05, "loss": 3.2963, "step": 321 }, { "epoch": 0.539815590947192, "grad_norm": 11.512080192565918, "learning_rate": 1.8815481982030176e-05, "loss": 2.8924, "step": 322 }, { "epoch": 0.5414920368818106, "grad_norm": 10.006054878234863, "learning_rate": 1.8806934057049564e-05, "loss": 2.4106, "step": 323 }, { "epoch": 0.5431684828164292, "grad_norm": 15.719528198242188, "learning_rate": 1.879835735643983e-05, "loss": 3.1974, "step": 324 }, { "epoch": 0.5448449287510477, "grad_norm": 7.921346664428711, "learning_rate": 1.878975190822434e-05, "loss": 2.3169, "step": 325 }, { "epoch": 0.5465213746856664, "grad_norm": 10.61286449432373, "learning_rate": 1.8781117740520386e-05, "loss": 3.4616, "step": 326 }, { "epoch": 0.548197820620285, "grad_norm": 6.670656681060791, "learning_rate": 1.8772454881539116e-05, "loss": 2.9083, "step": 327 }, { "epoch": 0.5498742665549036, "grad_norm": 4.469232082366943, "learning_rate": 1.8763763359585395e-05, "loss": 2.8416, "step": 328 }, { "epoch": 0.5515507124895223, "grad_norm": 11.258614540100098, "learning_rate": 1.875504320305777e-05, "loss": 3.0544, "step": 329 }, { "epoch": 0.5532271584241408, "grad_norm": 16.866304397583008, "learning_rate": 1.8746294440448324e-05, "loss": 2.6856, "step": 330 }, { "epoch": 0.5549036043587594, "grad_norm": 18.251405715942383, "learning_rate": 1.8737517100342624e-05, "loss": 2.4673, "step": 331 }, { "epoch": 0.5565800502933781, "grad_norm": 42.30506134033203, "learning_rate": 1.872871121141961e-05, "loss": 3.1436, "step": 332 }, { "epoch": 0.5582564962279967, "grad_norm": 9.334754943847656, "learning_rate": 1.8719876802451483e-05, "loss": 2.6472, "step": 333 }, { "epoch": 0.5599329421626152, "grad_norm": 10.522323608398438, "learning_rate": 1.871101390230365e-05, "loss": 2.8134, "step": 334 }, { "epoch": 0.5616093880972338, "grad_norm": 6.456719398498535, "learning_rate": 1.87021225399346e-05, "loss": 2.6746, "step": 335 }, { "epoch": 0.5632858340318525, "grad_norm": 14.442458152770996, "learning_rate": 1.869320274439583e-05, "loss": 2.5594, "step": 336 }, { "epoch": 0.5649622799664711, "grad_norm": 10.787813186645508, "learning_rate": 1.8684254544831718e-05, "loss": 2.8813, "step": 337 }, { "epoch": 0.5666387259010897, "grad_norm": 13.258678436279297, "learning_rate": 1.8675277970479472e-05, "loss": 3.2202, "step": 338 }, { "epoch": 0.5683151718357083, "grad_norm": 8.291848182678223, "learning_rate": 1.8666273050668997e-05, "loss": 2.393, "step": 339 }, { "epoch": 0.5699916177703269, "grad_norm": 9.651192665100098, "learning_rate": 1.8657239814822817e-05, "loss": 2.7176, "step": 340 }, { "epoch": 0.5716680637049455, "grad_norm": 7.801069736480713, "learning_rate": 1.864817829245598e-05, "loss": 2.7717, "step": 341 }, { "epoch": 0.5733445096395641, "grad_norm": 5.9048991203308105, "learning_rate": 1.8639088513175948e-05, "loss": 2.8141, "step": 342 }, { "epoch": 0.5750209555741828, "grad_norm": 14.559867858886719, "learning_rate": 1.8629970506682516e-05, "loss": 2.9477, "step": 343 }, { "epoch": 0.5766974015088013, "grad_norm": 6.023739814758301, "learning_rate": 1.8620824302767707e-05, "loss": 2.686, "step": 344 }, { "epoch": 0.5783738474434199, "grad_norm": 6.415578365325928, "learning_rate": 1.861164993131567e-05, "loss": 2.8317, "step": 345 }, { "epoch": 0.5800502933780386, "grad_norm": 263.0706787109375, "learning_rate": 1.86024474223026e-05, "loss": 2.8038, "step": 346 }, { "epoch": 0.5817267393126572, "grad_norm": 128.69090270996094, "learning_rate": 1.8593216805796612e-05, "loss": 3.1559, "step": 347 }, { "epoch": 0.5834031852472757, "grad_norm": 36.093353271484375, "learning_rate": 1.8583958111957676e-05, "loss": 2.8523, "step": 348 }, { "epoch": 0.5850796311818944, "grad_norm": 7.625601768493652, "learning_rate": 1.857467137103749e-05, "loss": 2.8468, "step": 349 }, { "epoch": 0.586756077116513, "grad_norm": 11.168829917907715, "learning_rate": 1.85653566133794e-05, "loss": 2.8308, "step": 350 }, { "epoch": 0.5884325230511316, "grad_norm": 10.921439170837402, "learning_rate": 1.8556013869418282e-05, "loss": 2.5821, "step": 351 }, { "epoch": 0.5901089689857502, "grad_norm": 11.208488464355469, "learning_rate": 1.854664316968047e-05, "loss": 2.7684, "step": 352 }, { "epoch": 0.5917854149203688, "grad_norm": 26.6365966796875, "learning_rate": 1.8537244544783622e-05, "loss": 2.8621, "step": 353 }, { "epoch": 0.5934618608549874, "grad_norm": 14.89959716796875, "learning_rate": 1.8527818025436662e-05, "loss": 2.5929, "step": 354 }, { "epoch": 0.595138306789606, "grad_norm": 5.045697212219238, "learning_rate": 1.8518363642439628e-05, "loss": 2.5386, "step": 355 }, { "epoch": 0.5968147527242247, "grad_norm": 5.232949256896973, "learning_rate": 1.8508881426683618e-05, "loss": 2.9912, "step": 356 }, { "epoch": 0.5984911986588433, "grad_norm": 8.896004676818848, "learning_rate": 1.849937140915066e-05, "loss": 2.9582, "step": 357 }, { "epoch": 0.6001676445934618, "grad_norm": 7.124967575073242, "learning_rate": 1.8489833620913644e-05, "loss": 2.4379, "step": 358 }, { "epoch": 0.6018440905280805, "grad_norm": 4.953553676605225, "learning_rate": 1.8480268093136157e-05, "loss": 2.8297, "step": 359 }, { "epoch": 0.6035205364626991, "grad_norm": 7.589214324951172, "learning_rate": 1.847067485707246e-05, "loss": 3.015, "step": 360 }, { "epoch": 0.6051969823973177, "grad_norm": 9.251113891601562, "learning_rate": 1.8461053944067324e-05, "loss": 2.8432, "step": 361 }, { "epoch": 0.6068734283319362, "grad_norm": 8.667802810668945, "learning_rate": 1.8451405385555965e-05, "loss": 2.991, "step": 362 }, { "epoch": 0.6085498742665549, "grad_norm": 25.259563446044922, "learning_rate": 1.844172921306392e-05, "loss": 2.5958, "step": 363 }, { "epoch": 0.6102263202011735, "grad_norm": 14.593143463134766, "learning_rate": 1.8432025458206953e-05, "loss": 2.92, "step": 364 }, { "epoch": 0.6119027661357921, "grad_norm": 70.61213684082031, "learning_rate": 1.8422294152690948e-05, "loss": 2.396, "step": 365 }, { "epoch": 0.6135792120704108, "grad_norm": 22.44847297668457, "learning_rate": 1.8412535328311813e-05, "loss": 3.4188, "step": 366 }, { "epoch": 0.6152556580050293, "grad_norm": 45.18448257446289, "learning_rate": 1.8402749016955367e-05, "loss": 2.7729, "step": 367 }, { "epoch": 0.6169321039396479, "grad_norm": 10.672445297241211, "learning_rate": 1.839293525059724e-05, "loss": 2.8894, "step": 368 }, { "epoch": 0.6186085498742665, "grad_norm": 61.39145278930664, "learning_rate": 1.8383094061302767e-05, "loss": 2.7822, "step": 369 }, { "epoch": 0.6202849958088852, "grad_norm": 4.671435356140137, "learning_rate": 1.8373225481226886e-05, "loss": 2.5807, "step": 370 }, { "epoch": 0.6219614417435038, "grad_norm": 8.374921798706055, "learning_rate": 1.8363329542614033e-05, "loss": 2.6379, "step": 371 }, { "epoch": 0.6236378876781223, "grad_norm": 6.115523815155029, "learning_rate": 1.835340627779803e-05, "loss": 2.6256, "step": 372 }, { "epoch": 0.625314333612741, "grad_norm": 13.015432357788086, "learning_rate": 1.8343455719201986e-05, "loss": 2.5948, "step": 373 }, { "epoch": 0.6269907795473596, "grad_norm": 16.026737213134766, "learning_rate": 1.833347789933819e-05, "loss": 2.3516, "step": 374 }, { "epoch": 0.6286672254819782, "grad_norm": 10.86506462097168, "learning_rate": 1.8323472850808005e-05, "loss": 2.4668, "step": 375 }, { "epoch": 0.6303436714165969, "grad_norm": 14.798431396484375, "learning_rate": 1.831344060630176e-05, "loss": 2.9502, "step": 376 }, { "epoch": 0.6320201173512154, "grad_norm": 12.693931579589844, "learning_rate": 1.830338119859864e-05, "loss": 2.5228, "step": 377 }, { "epoch": 0.633696563285834, "grad_norm": 9.94350814819336, "learning_rate": 1.8293294660566593e-05, "loss": 2.7851, "step": 378 }, { "epoch": 0.6353730092204526, "grad_norm": 8.782124519348145, "learning_rate": 1.8283181025162194e-05, "loss": 2.2284, "step": 379 }, { "epoch": 0.6370494551550713, "grad_norm": 5.592970848083496, "learning_rate": 1.8273040325430575e-05, "loss": 2.3087, "step": 380 }, { "epoch": 0.6387259010896899, "grad_norm": 11.876554489135742, "learning_rate": 1.826287259450528e-05, "loss": 3.2925, "step": 381 }, { "epoch": 0.6404023470243084, "grad_norm": 32.023468017578125, "learning_rate": 1.8252677865608196e-05, "loss": 2.8576, "step": 382 }, { "epoch": 0.6420787929589271, "grad_norm": 4.798394680023193, "learning_rate": 1.8242456172049397e-05, "loss": 2.7401, "step": 383 }, { "epoch": 0.6437552388935457, "grad_norm": 45.88468933105469, "learning_rate": 1.823220754722708e-05, "loss": 2.5372, "step": 384 }, { "epoch": 0.6454316848281643, "grad_norm": 12.206032752990723, "learning_rate": 1.8221932024627432e-05, "loss": 2.922, "step": 385 }, { "epoch": 0.647108130762783, "grad_norm": 18.00353240966797, "learning_rate": 1.8211629637824516e-05, "loss": 2.7617, "step": 386 }, { "epoch": 0.6487845766974015, "grad_norm": 37.15439224243164, "learning_rate": 1.8201300420480187e-05, "loss": 2.7826, "step": 387 }, { "epoch": 0.6504610226320201, "grad_norm": 12.052484512329102, "learning_rate": 1.819094440634395e-05, "loss": 2.5883, "step": 388 }, { "epoch": 0.6521374685666387, "grad_norm": 4.733270645141602, "learning_rate": 1.818056162925288e-05, "loss": 2.8675, "step": 389 }, { "epoch": 0.6538139145012574, "grad_norm": 12.65290355682373, "learning_rate": 1.8170152123131485e-05, "loss": 2.8502, "step": 390 }, { "epoch": 0.6554903604358759, "grad_norm": 26.007688522338867, "learning_rate": 1.8159715921991612e-05, "loss": 2.7797, "step": 391 }, { "epoch": 0.6571668063704945, "grad_norm": 6.335994243621826, "learning_rate": 1.814925305993233e-05, "loss": 2.5553, "step": 392 }, { "epoch": 0.6588432523051132, "grad_norm": 23.075742721557617, "learning_rate": 1.8138763571139817e-05, "loss": 2.3218, "step": 393 }, { "epoch": 0.6605196982397318, "grad_norm": 23.86815071105957, "learning_rate": 1.8128247489887257e-05, "loss": 3.0085, "step": 394 }, { "epoch": 0.6621961441743504, "grad_norm": 12.74923324584961, "learning_rate": 1.811770485053472e-05, "loss": 2.4417, "step": 395 }, { "epoch": 0.663872590108969, "grad_norm": 14.195935249328613, "learning_rate": 1.8107135687529044e-05, "loss": 2.7107, "step": 396 }, { "epoch": 0.6655490360435876, "grad_norm": 7.245200157165527, "learning_rate": 1.8096540035403736e-05, "loss": 2.7102, "step": 397 }, { "epoch": 0.6672254819782062, "grad_norm": 6.893463134765625, "learning_rate": 1.8085917928778856e-05, "loss": 2.8849, "step": 398 }, { "epoch": 0.6689019279128248, "grad_norm": 19.567739486694336, "learning_rate": 1.80752694023609e-05, "loss": 2.6283, "step": 399 }, { "epoch": 0.6705783738474435, "grad_norm": 9.866447448730469, "learning_rate": 1.8064594490942675e-05, "loss": 2.5458, "step": 400 }, { "epoch": 0.672254819782062, "grad_norm": 9.496179580688477, "learning_rate": 1.8053893229403218e-05, "loss": 2.3568, "step": 401 }, { "epoch": 0.6739312657166806, "grad_norm": 7.089486598968506, "learning_rate": 1.804316565270765e-05, "loss": 2.6805, "step": 402 }, { "epoch": 0.6756077116512993, "grad_norm": 5.836471080780029, "learning_rate": 1.8032411795907072e-05, "loss": 2.7495, "step": 403 }, { "epoch": 0.6772841575859179, "grad_norm": 6.555230140686035, "learning_rate": 1.802163169413846e-05, "loss": 2.1888, "step": 404 }, { "epoch": 0.6789606035205364, "grad_norm": 6.780150890350342, "learning_rate": 1.801082538262454e-05, "loss": 2.8941, "step": 405 }, { "epoch": 0.680637049455155, "grad_norm": 10.555006980895996, "learning_rate": 1.7999992896673667e-05, "loss": 2.1833, "step": 406 }, { "epoch": 0.6823134953897737, "grad_norm": 10.927568435668945, "learning_rate": 1.7989134271679726e-05, "loss": 2.5588, "step": 407 }, { "epoch": 0.6839899413243923, "grad_norm": 11.000985145568848, "learning_rate": 1.7978249543122012e-05, "loss": 2.5676, "step": 408 }, { "epoch": 0.6856663872590109, "grad_norm": 15.294034957885742, "learning_rate": 1.7967338746565103e-05, "loss": 2.5535, "step": 409 }, { "epoch": 0.6873428331936295, "grad_norm": 10.137288093566895, "learning_rate": 1.7956401917658747e-05, "loss": 2.6371, "step": 410 }, { "epoch": 0.6890192791282481, "grad_norm": 36.3173828125, "learning_rate": 1.7945439092137763e-05, "loss": 2.4347, "step": 411 }, { "epoch": 0.6906957250628667, "grad_norm": 7.624430179595947, "learning_rate": 1.7934450305821905e-05, "loss": 2.7075, "step": 412 }, { "epoch": 0.6923721709974854, "grad_norm": 10.408517837524414, "learning_rate": 1.7923435594615744e-05, "loss": 2.908, "step": 413 }, { "epoch": 0.694048616932104, "grad_norm": 25.367889404296875, "learning_rate": 1.7912394994508568e-05, "loss": 2.4682, "step": 414 }, { "epoch": 0.6957250628667225, "grad_norm": 36.155452728271484, "learning_rate": 1.7901328541574246e-05, "loss": 2.6569, "step": 415 }, { "epoch": 0.6974015088013411, "grad_norm": 31.817476272583008, "learning_rate": 1.789023627197112e-05, "loss": 2.6827, "step": 416 }, { "epoch": 0.6990779547359598, "grad_norm": 6.234879970550537, "learning_rate": 1.7879118221941893e-05, "loss": 1.8783, "step": 417 }, { "epoch": 0.7007544006705784, "grad_norm": 59.8724365234375, "learning_rate": 1.786797442781349e-05, "loss": 2.5798, "step": 418 }, { "epoch": 0.702430846605197, "grad_norm": 25.502656936645508, "learning_rate": 1.785680492599696e-05, "loss": 2.6873, "step": 419 }, { "epoch": 0.7041072925398156, "grad_norm": 10.163752555847168, "learning_rate": 1.784560975298735e-05, "loss": 2.4537, "step": 420 }, { "epoch": 0.7057837384744342, "grad_norm": 17.922996520996094, "learning_rate": 1.783438894536357e-05, "loss": 2.7865, "step": 421 }, { "epoch": 0.7074601844090528, "grad_norm": 5.529216289520264, "learning_rate": 1.7823142539788307e-05, "loss": 2.1909, "step": 422 }, { "epoch": 0.7091366303436715, "grad_norm": 6.702016830444336, "learning_rate": 1.7811870573007878e-05, "loss": 2.5274, "step": 423 }, { "epoch": 0.71081307627829, "grad_norm": 8.64082145690918, "learning_rate": 1.7800573081852124e-05, "loss": 2.1772, "step": 424 }, { "epoch": 0.7124895222129086, "grad_norm": 13.820183753967285, "learning_rate": 1.7789250103234267e-05, "loss": 2.4405, "step": 425 }, { "epoch": 0.7141659681475272, "grad_norm": 10.310776710510254, "learning_rate": 1.7777901674150827e-05, "loss": 2.4232, "step": 426 }, { "epoch": 0.7158424140821459, "grad_norm": 5.837985038757324, "learning_rate": 1.7766527831681472e-05, "loss": 2.8772, "step": 427 }, { "epoch": 0.7175188600167645, "grad_norm": 6.432294845581055, "learning_rate": 1.77551286129889e-05, "loss": 2.8268, "step": 428 }, { "epoch": 0.719195305951383, "grad_norm": 6.231441497802734, "learning_rate": 1.7743704055318728e-05, "loss": 2.6818, "step": 429 }, { "epoch": 0.7208717518860017, "grad_norm": 7.018197059631348, "learning_rate": 1.773225419599937e-05, "loss": 2.5229, "step": 430 }, { "epoch": 0.7225481978206203, "grad_norm": 46.13083267211914, "learning_rate": 1.7720779072441897e-05, "loss": 2.3744, "step": 431 }, { "epoch": 0.7242246437552389, "grad_norm": 18.846233367919922, "learning_rate": 1.770927872213994e-05, "loss": 2.2858, "step": 432 }, { "epoch": 0.7259010896898574, "grad_norm": 9.241786003112793, "learning_rate": 1.7697753182669553e-05, "loss": 2.7473, "step": 433 }, { "epoch": 0.7275775356244761, "grad_norm": 30.916784286499023, "learning_rate": 1.7686202491689086e-05, "loss": 2.7626, "step": 434 }, { "epoch": 0.7292539815590947, "grad_norm": 5.970525741577148, "learning_rate": 1.767462668693908e-05, "loss": 2.6829, "step": 435 }, { "epoch": 0.7309304274937133, "grad_norm": 5.0676422119140625, "learning_rate": 1.7663025806242126e-05, "loss": 2.6413, "step": 436 }, { "epoch": 0.732606873428332, "grad_norm": 21.252967834472656, "learning_rate": 1.765139988750274e-05, "loss": 2.8193, "step": 437 }, { "epoch": 0.7342833193629505, "grad_norm": 6.28440523147583, "learning_rate": 1.763974896870726e-05, "loss": 2.9153, "step": 438 }, { "epoch": 0.7359597652975691, "grad_norm": 13.996260643005371, "learning_rate": 1.7628073087923705e-05, "loss": 2.5421, "step": 439 }, { "epoch": 0.7376362112321878, "grad_norm": 7.920317649841309, "learning_rate": 1.7616372283301644e-05, "loss": 2.493, "step": 440 }, { "epoch": 0.7393126571668064, "grad_norm": 31.403247833251953, "learning_rate": 1.7604646593072107e-05, "loss": 2.5437, "step": 441 }, { "epoch": 0.740989103101425, "grad_norm": 361.50439453125, "learning_rate": 1.75928960555474e-05, "loss": 3.2824, "step": 442 }, { "epoch": 0.7426655490360435, "grad_norm": 121.25823974609375, "learning_rate": 1.758112070912104e-05, "loss": 2.8763, "step": 443 }, { "epoch": 0.7443419949706622, "grad_norm": 102.47718811035156, "learning_rate": 1.75693205922676e-05, "loss": 2.8239, "step": 444 }, { "epoch": 0.7460184409052808, "grad_norm": 30.230337142944336, "learning_rate": 1.7557495743542586e-05, "loss": 2.9018, "step": 445 }, { "epoch": 0.7476948868398994, "grad_norm": 11.892454147338867, "learning_rate": 1.7545646201582304e-05, "loss": 3.1265, "step": 446 }, { "epoch": 0.7493713327745181, "grad_norm": 43.32453918457031, "learning_rate": 1.7533772005103754e-05, "loss": 2.2487, "step": 447 }, { "epoch": 0.7493713327745181, "eval_loss": 2.6958272457122803, "eval_runtime": 170.0573, "eval_samples_per_second": 3.087, "eval_steps_per_second": 1.547, "step": 447 }, { "epoch": 0.7510477787091366, "grad_norm": 9.100330352783203, "learning_rate": 1.7521873192904485e-05, "loss": 2.5685, "step": 448 }, { "epoch": 0.7527242246437552, "grad_norm": 6.081371307373047, "learning_rate": 1.7509949803862477e-05, "loss": 2.5107, "step": 449 }, { "epoch": 0.7544006705783739, "grad_norm": 5.724084377288818, "learning_rate": 1.7498001876936013e-05, "loss": 2.6757, "step": 450 }, { "epoch": 0.7560771165129925, "grad_norm": 19.598569869995117, "learning_rate": 1.7486029451163552e-05, "loss": 2.5465, "step": 451 }, { "epoch": 0.757753562447611, "grad_norm": 5.665065288543701, "learning_rate": 1.7474032565663595e-05, "loss": 2.6421, "step": 452 }, { "epoch": 0.7594300083822296, "grad_norm": 9.71866512298584, "learning_rate": 1.7462011259634565e-05, "loss": 2.5216, "step": 453 }, { "epoch": 0.7611064543168483, "grad_norm": 12.439435958862305, "learning_rate": 1.7449965572354675e-05, "loss": 2.6464, "step": 454 }, { "epoch": 0.7627829002514669, "grad_norm": 5.004015922546387, "learning_rate": 1.743789554318181e-05, "loss": 2.3841, "step": 455 }, { "epoch": 0.7644593461860855, "grad_norm": 9.906461715698242, "learning_rate": 1.742580121155337e-05, "loss": 2.4622, "step": 456 }, { "epoch": 0.7661357921207042, "grad_norm": 6.506420612335205, "learning_rate": 1.7413682616986185e-05, "loss": 2.4924, "step": 457 }, { "epoch": 0.7678122380553227, "grad_norm": 4.7660722732543945, "learning_rate": 1.7401539799076337e-05, "loss": 2.5635, "step": 458 }, { "epoch": 0.7694886839899413, "grad_norm": 10.729870796203613, "learning_rate": 1.738937279749907e-05, "loss": 2.7504, "step": 459 }, { "epoch": 0.7711651299245599, "grad_norm": 9.837868690490723, "learning_rate": 1.7377181652008644e-05, "loss": 2.9089, "step": 460 }, { "epoch": 0.7728415758591786, "grad_norm": 10.955761909484863, "learning_rate": 1.73649664024382e-05, "loss": 2.5363, "step": 461 }, { "epoch": 0.7745180217937971, "grad_norm": 14.6876220703125, "learning_rate": 1.7352727088699645e-05, "loss": 2.6311, "step": 462 }, { "epoch": 0.7761944677284157, "grad_norm": 17.01027488708496, "learning_rate": 1.73404637507835e-05, "loss": 2.8192, "step": 463 }, { "epoch": 0.7778709136630344, "grad_norm": 6.5204668045043945, "learning_rate": 1.73281764287588e-05, "loss": 3.0088, "step": 464 }, { "epoch": 0.779547359597653, "grad_norm": 9.527382850646973, "learning_rate": 1.731586516277293e-05, "loss": 2.4343, "step": 465 }, { "epoch": 0.7812238055322716, "grad_norm": 21.263635635375977, "learning_rate": 1.730352999305152e-05, "loss": 2.3241, "step": 466 }, { "epoch": 0.7829002514668902, "grad_norm": 32.5350227355957, "learning_rate": 1.7291170959898288e-05, "loss": 3.1115, "step": 467 }, { "epoch": 0.7845766974015088, "grad_norm": 13.576776504516602, "learning_rate": 1.7278788103694944e-05, "loss": 2.6608, "step": 468 }, { "epoch": 0.7862531433361274, "grad_norm": 9.036080360412598, "learning_rate": 1.7266381464901015e-05, "loss": 3.0708, "step": 469 }, { "epoch": 0.787929589270746, "grad_norm": 11.212925910949707, "learning_rate": 1.725395108405375e-05, "loss": 2.6661, "step": 470 }, { "epoch": 0.7896060352053647, "grad_norm": 9.199807167053223, "learning_rate": 1.7241497001767967e-05, "loss": 2.804, "step": 471 }, { "epoch": 0.7912824811399832, "grad_norm": 13.182611465454102, "learning_rate": 1.7229019258735923e-05, "loss": 2.4145, "step": 472 }, { "epoch": 0.7929589270746018, "grad_norm": 8.599435806274414, "learning_rate": 1.7216517895727198e-05, "loss": 2.426, "step": 473 }, { "epoch": 0.7946353730092205, "grad_norm": 4.153339385986328, "learning_rate": 1.720399295358852e-05, "loss": 2.4717, "step": 474 }, { "epoch": 0.7963118189438391, "grad_norm": 4.428544521331787, "learning_rate": 1.7191444473243694e-05, "loss": 2.6481, "step": 475 }, { "epoch": 0.7979882648784576, "grad_norm": 28.37386131286621, "learning_rate": 1.7178872495693397e-05, "loss": 2.8006, "step": 476 }, { "epoch": 0.7996647108130763, "grad_norm": 11.782523155212402, "learning_rate": 1.716627706201511e-05, "loss": 2.908, "step": 477 }, { "epoch": 0.8013411567476949, "grad_norm": 5.304565906524658, "learning_rate": 1.715365821336294e-05, "loss": 2.3116, "step": 478 }, { "epoch": 0.8030176026823135, "grad_norm": 10.117254257202148, "learning_rate": 1.71410159909675e-05, "loss": 2.6815, "step": 479 }, { "epoch": 0.8046940486169321, "grad_norm": 8.59892463684082, "learning_rate": 1.7128350436135777e-05, "loss": 2.4734, "step": 480 }, { "epoch": 0.8063704945515507, "grad_norm": 12.518685340881348, "learning_rate": 1.7115661590250992e-05, "loss": 2.8112, "step": 481 }, { "epoch": 0.8080469404861693, "grad_norm": 4.842088222503662, "learning_rate": 1.7102949494772472e-05, "loss": 2.4006, "step": 482 }, { "epoch": 0.8097233864207879, "grad_norm": 9.626520156860352, "learning_rate": 1.7090214191235502e-05, "loss": 2.5598, "step": 483 }, { "epoch": 0.8113998323554066, "grad_norm": 4.74247932434082, "learning_rate": 1.70774557212512e-05, "loss": 2.6791, "step": 484 }, { "epoch": 0.8130762782900252, "grad_norm": 13.712872505187988, "learning_rate": 1.7064674126506378e-05, "loss": 2.498, "step": 485 }, { "epoch": 0.8147527242246437, "grad_norm": 11.633493423461914, "learning_rate": 1.7051869448763406e-05, "loss": 2.8082, "step": 486 }, { "epoch": 0.8164291701592624, "grad_norm": 5.511799335479736, "learning_rate": 1.7039041729860074e-05, "loss": 2.434, "step": 487 }, { "epoch": 0.818105616093881, "grad_norm": 10.040752410888672, "learning_rate": 1.7026191011709456e-05, "loss": 2.3358, "step": 488 }, { "epoch": 0.8197820620284996, "grad_norm": 10.578059196472168, "learning_rate": 1.7013317336299773e-05, "loss": 2.5523, "step": 489 }, { "epoch": 0.8214585079631181, "grad_norm": 8.619462013244629, "learning_rate": 1.7000420745694256e-05, "loss": 2.4232, "step": 490 }, { "epoch": 0.8231349538977368, "grad_norm": 6.91777229309082, "learning_rate": 1.6987501282031013e-05, "loss": 2.6948, "step": 491 }, { "epoch": 0.8248113998323554, "grad_norm": 9.747309684753418, "learning_rate": 1.6974558987522876e-05, "loss": 2.6499, "step": 492 }, { "epoch": 0.826487845766974, "grad_norm": 10.750295639038086, "learning_rate": 1.696159390445729e-05, "loss": 2.1614, "step": 493 }, { "epoch": 0.8281642917015927, "grad_norm": 16.06046485900879, "learning_rate": 1.6948606075196148e-05, "loss": 2.5178, "step": 494 }, { "epoch": 0.8298407376362112, "grad_norm": 9.259533882141113, "learning_rate": 1.6935595542175666e-05, "loss": 2.9301, "step": 495 }, { "epoch": 0.8315171835708298, "grad_norm": 7.662787914276123, "learning_rate": 1.692256234790624e-05, "loss": 2.2103, "step": 496 }, { "epoch": 0.8331936295054484, "grad_norm": 4.963723659515381, "learning_rate": 1.6909506534972316e-05, "loss": 2.5775, "step": 497 }, { "epoch": 0.8348700754400671, "grad_norm": 9.992377281188965, "learning_rate": 1.6896428146032234e-05, "loss": 2.5238, "step": 498 }, { "epoch": 0.8365465213746857, "grad_norm": 14.528229713439941, "learning_rate": 1.6883327223818108e-05, "loss": 2.6701, "step": 499 }, { "epoch": 0.8382229673093042, "grad_norm": 14.423613548278809, "learning_rate": 1.6870203811135665e-05, "loss": 2.851, "step": 500 }, { "epoch": 0.8398994132439229, "grad_norm": 12.226161003112793, "learning_rate": 1.6857057950864134e-05, "loss": 2.3799, "step": 501 }, { "epoch": 0.8415758591785415, "grad_norm": 19.34228515625, "learning_rate": 1.6843889685956073e-05, "loss": 2.8201, "step": 502 }, { "epoch": 0.8432523051131601, "grad_norm": 18.41592788696289, "learning_rate": 1.683069905943725e-05, "loss": 2.8907, "step": 503 }, { "epoch": 0.8449287510477788, "grad_norm": 5.469997406005859, "learning_rate": 1.68174861144065e-05, "loss": 2.2227, "step": 504 }, { "epoch": 0.8466051969823973, "grad_norm": 7.710814952850342, "learning_rate": 1.6804250894035576e-05, "loss": 2.7472, "step": 505 }, { "epoch": 0.8482816429170159, "grad_norm": 5.026285648345947, "learning_rate": 1.6790993441569022e-05, "loss": 2.8893, "step": 506 }, { "epoch": 0.8499580888516345, "grad_norm": 8.987253189086914, "learning_rate": 1.677771380032401e-05, "loss": 2.7487, "step": 507 }, { "epoch": 0.8516345347862532, "grad_norm": 9.631572723388672, "learning_rate": 1.6764412013690222e-05, "loss": 2.5859, "step": 508 }, { "epoch": 0.8533109807208717, "grad_norm": 8.267977714538574, "learning_rate": 1.6751088125129687e-05, "loss": 2.488, "step": 509 }, { "epoch": 0.8549874266554903, "grad_norm": 12.981925964355469, "learning_rate": 1.6737742178176666e-05, "loss": 2.553, "step": 510 }, { "epoch": 0.856663872590109, "grad_norm": 6.194202423095703, "learning_rate": 1.6724374216437475e-05, "loss": 2.5024, "step": 511 }, { "epoch": 0.8583403185247276, "grad_norm": 7.61236572265625, "learning_rate": 1.671098428359037e-05, "loss": 2.4377, "step": 512 }, { "epoch": 0.8600167644593462, "grad_norm": 5.145570755004883, "learning_rate": 1.6697572423385395e-05, "loss": 2.553, "step": 513 }, { "epoch": 0.8616932103939648, "grad_norm": 12.838342666625977, "learning_rate": 1.6684138679644237e-05, "loss": 2.3275, "step": 514 }, { "epoch": 0.8633696563285834, "grad_norm": 6.936984539031982, "learning_rate": 1.667068309626009e-05, "loss": 2.9689, "step": 515 }, { "epoch": 0.865046102263202, "grad_norm": 6.189188003540039, "learning_rate": 1.6657205717197495e-05, "loss": 2.9188, "step": 516 }, { "epoch": 0.8667225481978206, "grad_norm": 7.555627346038818, "learning_rate": 1.6643706586492217e-05, "loss": 2.2903, "step": 517 }, { "epoch": 0.8683989941324393, "grad_norm": 22.666934967041016, "learning_rate": 1.66301857482511e-05, "loss": 2.6758, "step": 518 }, { "epoch": 0.8700754400670578, "grad_norm": 9.236169815063477, "learning_rate": 1.6616643246651888e-05, "loss": 2.3872, "step": 519 }, { "epoch": 0.8717518860016764, "grad_norm": 10.500259399414062, "learning_rate": 1.6603079125943136e-05, "loss": 2.3918, "step": 520 }, { "epoch": 0.8734283319362951, "grad_norm": 13.099411010742188, "learning_rate": 1.6589493430444026e-05, "loss": 2.4583, "step": 521 }, { "epoch": 0.8751047778709137, "grad_norm": 4.944172382354736, "learning_rate": 1.6575886204544223e-05, "loss": 2.4083, "step": 522 }, { "epoch": 0.8767812238055323, "grad_norm": 23.64794158935547, "learning_rate": 1.6562257492703756e-05, "loss": 2.7031, "step": 523 }, { "epoch": 0.8784576697401508, "grad_norm": 31.68302345275879, "learning_rate": 1.6548607339452853e-05, "loss": 2.2841, "step": 524 }, { "epoch": 0.8801341156747695, "grad_norm": 26.529382705688477, "learning_rate": 1.653493578939179e-05, "loss": 2.4463, "step": 525 }, { "epoch": 0.8818105616093881, "grad_norm": 7.102970600128174, "learning_rate": 1.6521242887190764e-05, "loss": 2.8223, "step": 526 }, { "epoch": 0.8834870075440067, "grad_norm": 32.49003982543945, "learning_rate": 1.6507528677589736e-05, "loss": 2.597, "step": 527 }, { "epoch": 0.8851634534786254, "grad_norm": 11.18454647064209, "learning_rate": 1.6493793205398282e-05, "loss": 2.4617, "step": 528 }, { "epoch": 0.8868398994132439, "grad_norm": 5.5496649742126465, "learning_rate": 1.6480036515495456e-05, "loss": 2.7305, "step": 529 }, { "epoch": 0.8885163453478625, "grad_norm": 5.789486408233643, "learning_rate": 1.6466258652829638e-05, "loss": 2.7942, "step": 530 }, { "epoch": 0.8901927912824812, "grad_norm": 5.066472053527832, "learning_rate": 1.6452459662418386e-05, "loss": 2.5676, "step": 531 }, { "epoch": 0.8918692372170998, "grad_norm": 10.830833435058594, "learning_rate": 1.643863958934829e-05, "loss": 2.6049, "step": 532 }, { "epoch": 0.8935456831517183, "grad_norm": 15.551868438720703, "learning_rate": 1.6424798478774823e-05, "loss": 2.6454, "step": 533 }, { "epoch": 0.8952221290863369, "grad_norm": 10.094992637634277, "learning_rate": 1.64109363759222e-05, "loss": 2.5231, "step": 534 }, { "epoch": 0.8968985750209556, "grad_norm": 5.12712287902832, "learning_rate": 1.639705332608323e-05, "loss": 2.2854, "step": 535 }, { "epoch": 0.8985750209555742, "grad_norm": 5.026795387268066, "learning_rate": 1.638314937461915e-05, "loss": 2.7255, "step": 536 }, { "epoch": 0.9002514668901928, "grad_norm": 14.15123462677002, "learning_rate": 1.63692245669595e-05, "loss": 2.6528, "step": 537 }, { "epoch": 0.9019279128248114, "grad_norm": 6.872882843017578, "learning_rate": 1.6355278948601968e-05, "loss": 2.5672, "step": 538 }, { "epoch": 0.90360435875943, "grad_norm": 12.379980087280273, "learning_rate": 1.634131256511223e-05, "loss": 2.5084, "step": 539 }, { "epoch": 0.9052808046940486, "grad_norm": 16.49246597290039, "learning_rate": 1.6327325462123816e-05, "loss": 2.1847, "step": 540 }, { "epoch": 0.9069572506286673, "grad_norm": 10.12289047241211, "learning_rate": 1.6313317685337947e-05, "loss": 2.4811, "step": 541 }, { "epoch": 0.9086336965632859, "grad_norm": 6.666973114013672, "learning_rate": 1.629928928052341e-05, "loss": 2.6822, "step": 542 }, { "epoch": 0.9103101424979044, "grad_norm": 4.159225940704346, "learning_rate": 1.6285240293516375e-05, "loss": 2.1565, "step": 543 }, { "epoch": 0.911986588432523, "grad_norm": 28.310955047607422, "learning_rate": 1.6271170770220258e-05, "loss": 2.5745, "step": 544 }, { "epoch": 0.9136630343671417, "grad_norm": 5.025387763977051, "learning_rate": 1.62570807566056e-05, "loss": 2.5054, "step": 545 }, { "epoch": 0.9153394803017603, "grad_norm": 4.66910982131958, "learning_rate": 1.6242970298709867e-05, "loss": 2.4418, "step": 546 }, { "epoch": 0.9170159262363788, "grad_norm": 51.68278503417969, "learning_rate": 1.6228839442637334e-05, "loss": 2.6826, "step": 547 }, { "epoch": 0.9186923721709975, "grad_norm": 12.514103889465332, "learning_rate": 1.6214688234558927e-05, "loss": 2.7281, "step": 548 }, { "epoch": 0.9203688181056161, "grad_norm": 5.299622535705566, "learning_rate": 1.6200516720712063e-05, "loss": 2.6907, "step": 549 }, { "epoch": 0.9220452640402347, "grad_norm": 19.432247161865234, "learning_rate": 1.618632494740051e-05, "loss": 2.3059, "step": 550 }, { "epoch": 0.9237217099748533, "grad_norm": 9.474685668945312, "learning_rate": 1.6172112960994234e-05, "loss": 2.521, "step": 551 }, { "epoch": 0.9253981559094719, "grad_norm": 13.688502311706543, "learning_rate": 1.615788080792924e-05, "loss": 2.5688, "step": 552 }, { "epoch": 0.9270746018440905, "grad_norm": 4.868835926055908, "learning_rate": 1.6143628534707426e-05, "loss": 2.67, "step": 553 }, { "epoch": 0.9287510477787091, "grad_norm": 5.040724277496338, "learning_rate": 1.612935618789643e-05, "loss": 2.3983, "step": 554 }, { "epoch": 0.9304274937133278, "grad_norm": 9.414543151855469, "learning_rate": 1.611506381412948e-05, "loss": 2.6015, "step": 555 }, { "epoch": 0.9321039396479464, "grad_norm": 6.755870819091797, "learning_rate": 1.6100751460105244e-05, "loss": 2.633, "step": 556 }, { "epoch": 0.9337803855825649, "grad_norm": 8.802675247192383, "learning_rate": 1.6086419172587663e-05, "loss": 2.6481, "step": 557 }, { "epoch": 0.9354568315171836, "grad_norm": 5.96451997756958, "learning_rate": 1.607206699840582e-05, "loss": 2.7599, "step": 558 }, { "epoch": 0.9371332774518022, "grad_norm": 16.79456329345703, "learning_rate": 1.605769498445376e-05, "loss": 2.6351, "step": 559 }, { "epoch": 0.9388097233864208, "grad_norm": 13.080081939697266, "learning_rate": 1.604330317769037e-05, "loss": 2.0857, "step": 560 }, { "epoch": 0.9404861693210393, "grad_norm": 5.653440475463867, "learning_rate": 1.602889162513919e-05, "loss": 2.5387, "step": 561 }, { "epoch": 0.942162615255658, "grad_norm": 5.444548606872559, "learning_rate": 1.6014460373888293e-05, "loss": 2.9065, "step": 562 }, { "epoch": 0.9438390611902766, "grad_norm": 8.392128944396973, "learning_rate": 1.6000009471090107e-05, "loss": 2.5887, "step": 563 }, { "epoch": 0.9455155071248952, "grad_norm": 4.5212225914001465, "learning_rate": 1.5985538963961263e-05, "loss": 2.403, "step": 564 }, { "epoch": 0.9471919530595139, "grad_norm": 11.071004867553711, "learning_rate": 1.5971048899782464e-05, "loss": 2.6196, "step": 565 }, { "epoch": 0.9488683989941324, "grad_norm": 4.670419692993164, "learning_rate": 1.59565393258983e-05, "loss": 2.3765, "step": 566 }, { "epoch": 0.950544844928751, "grad_norm": 9.833883285522461, "learning_rate": 1.5942010289717108e-05, "loss": 2.6484, "step": 567 }, { "epoch": 0.9522212908633697, "grad_norm": 5.180789947509766, "learning_rate": 1.5927461838710818e-05, "loss": 2.0936, "step": 568 }, { "epoch": 0.9538977367979883, "grad_norm": 8.313568115234375, "learning_rate": 1.59128940204148e-05, "loss": 2.4155, "step": 569 }, { "epoch": 0.9555741827326069, "grad_norm": 10.749990463256836, "learning_rate": 1.5898306882427696e-05, "loss": 2.5593, "step": 570 }, { "epoch": 0.9572506286672254, "grad_norm": 11.310686111450195, "learning_rate": 1.588370047241128e-05, "loss": 2.805, "step": 571 }, { "epoch": 0.9589270746018441, "grad_norm": 6.234105587005615, "learning_rate": 1.5869074838090288e-05, "loss": 2.1825, "step": 572 }, { "epoch": 0.9606035205364627, "grad_norm": 18.672943115234375, "learning_rate": 1.5854430027252276e-05, "loss": 2.3714, "step": 573 }, { "epoch": 0.9622799664710813, "grad_norm": 21.65448760986328, "learning_rate": 1.583976608774745e-05, "loss": 2.3245, "step": 574 }, { "epoch": 0.9639564124057, "grad_norm": 9.994019508361816, "learning_rate": 1.5825083067488528e-05, "loss": 2.685, "step": 575 }, { "epoch": 0.9656328583403185, "grad_norm": 44.43074035644531, "learning_rate": 1.5810381014450557e-05, "loss": 2.442, "step": 576 }, { "epoch": 0.9673093042749371, "grad_norm": 13.926177024841309, "learning_rate": 1.5795659976670787e-05, "loss": 2.3111, "step": 577 }, { "epoch": 0.9689857502095558, "grad_norm": 22.379493713378906, "learning_rate": 1.5780920002248484e-05, "loss": 2.6092, "step": 578 }, { "epoch": 0.9706621961441744, "grad_norm": 13.413520812988281, "learning_rate": 1.5766161139344802e-05, "loss": 2.4421, "step": 579 }, { "epoch": 0.972338642078793, "grad_norm": 20.679418563842773, "learning_rate": 1.575138343618259e-05, "loss": 2.5371, "step": 580 }, { "epoch": 0.9740150880134115, "grad_norm": 7.441642761230469, "learning_rate": 1.5736586941046285e-05, "loss": 2.5954, "step": 581 }, { "epoch": 0.9756915339480302, "grad_norm": 8.615164756774902, "learning_rate": 1.5721771702281694e-05, "loss": 2.5537, "step": 582 }, { "epoch": 0.9773679798826488, "grad_norm": 8.618377685546875, "learning_rate": 1.5706937768295887e-05, "loss": 2.5546, "step": 583 }, { "epoch": 0.9790444258172674, "grad_norm": 10.058917045593262, "learning_rate": 1.5692085187557012e-05, "loss": 2.3664, "step": 584 }, { "epoch": 0.980720871751886, "grad_norm": 4.548496246337891, "learning_rate": 1.567721400859414e-05, "loss": 2.1509, "step": 585 }, { "epoch": 0.9823973176865046, "grad_norm": 10.367966651916504, "learning_rate": 1.5662324279997115e-05, "loss": 2.1543, "step": 586 }, { "epoch": 0.9840737636211232, "grad_norm": 6.5322771072387695, "learning_rate": 1.5647416050416386e-05, "loss": 2.3545, "step": 587 }, { "epoch": 0.9857502095557418, "grad_norm": 7.886918067932129, "learning_rate": 1.5632489368562853e-05, "loss": 2.3367, "step": 588 }, { "epoch": 0.9874266554903605, "grad_norm": 8.543756484985352, "learning_rate": 1.561754428320771e-05, "loss": 2.3845, "step": 589 }, { "epoch": 0.989103101424979, "grad_norm": 6.489569187164307, "learning_rate": 1.5602580843182272e-05, "loss": 2.2977, "step": 590 }, { "epoch": 0.9907795473595976, "grad_norm": 4.915396213531494, "learning_rate": 1.5587599097377845e-05, "loss": 2.6069, "step": 591 }, { "epoch": 0.9924559932942163, "grad_norm": 7.197396278381348, "learning_rate": 1.5572599094745522e-05, "loss": 2.4374, "step": 592 }, { "epoch": 0.9941324392288349, "grad_norm": 5.097231864929199, "learning_rate": 1.5557580884296065e-05, "loss": 2.3689, "step": 593 }, { "epoch": 0.9958088851634534, "grad_norm": 3.989947557449341, "learning_rate": 1.554254451509973e-05, "loss": 2.4421, "step": 594 }, { "epoch": 0.9974853310980721, "grad_norm": 5.270840644836426, "learning_rate": 1.5527490036286088e-05, "loss": 2.2985, "step": 595 }, { "epoch": 0.9991617770326907, "grad_norm": 4.876128673553467, "learning_rate": 1.5512417497043898e-05, "loss": 2.2711, "step": 596 }, { "epoch": 0.9991617770326907, "eval_loss": 2.4888453483581543, "eval_runtime": 169.7709, "eval_samples_per_second": 3.092, "eval_steps_per_second": 1.549, "step": 596 }, { "epoch": 1.0008382229673094, "grad_norm": 68.67245483398438, "learning_rate": 1.5497326946620924e-05, "loss": 2.5835, "step": 597 }, { "epoch": 1.002514668901928, "grad_norm": 4.932079315185547, "learning_rate": 1.5482218434323772e-05, "loss": 2.387, "step": 598 }, { "epoch": 1.0041911148365466, "grad_norm": 7.7940826416015625, "learning_rate": 1.5467092009517748e-05, "loss": 3.0089, "step": 599 }, { "epoch": 1.0058675607711651, "grad_norm": 13.789093017578125, "learning_rate": 1.5451947721626676e-05, "loss": 2.6936, "step": 600 }, { "epoch": 1.0075440067057837, "grad_norm": 7.167382717132568, "learning_rate": 1.5436785620132756e-05, "loss": 2.7852, "step": 601 }, { "epoch": 1.0092204526404023, "grad_norm": 4.721502304077148, "learning_rate": 1.5421605754576376e-05, "loss": 2.3277, "step": 602 }, { "epoch": 1.0108968985750209, "grad_norm": 6.722412109375, "learning_rate": 1.5406408174555978e-05, "loss": 2.3408, "step": 603 }, { "epoch": 1.0125733445096397, "grad_norm": 4.608644008636475, "learning_rate": 1.5391192929727884e-05, "loss": 2.1118, "step": 604 }, { "epoch": 1.0142497904442582, "grad_norm": 15.75361156463623, "learning_rate": 1.5375960069806124e-05, "loss": 2.6879, "step": 605 }, { "epoch": 1.0159262363788768, "grad_norm": 11.668586730957031, "learning_rate": 1.5360709644562296e-05, "loss": 2.2299, "step": 606 }, { "epoch": 1.0176026823134954, "grad_norm": 6.130990505218506, "learning_rate": 1.5345441703825388e-05, "loss": 2.4246, "step": 607 }, { "epoch": 1.019279128248114, "grad_norm": 6.221343994140625, "learning_rate": 1.53301562974816e-05, "loss": 2.5036, "step": 608 }, { "epoch": 1.0209555741827325, "grad_norm": 12.746177673339844, "learning_rate": 1.5314853475474233e-05, "loss": 2.3069, "step": 609 }, { "epoch": 1.0226320201173513, "grad_norm": 6.415351867675781, "learning_rate": 1.529953328780345e-05, "loss": 2.435, "step": 610 }, { "epoch": 1.02430846605197, "grad_norm": 14.786396026611328, "learning_rate": 1.5284195784526196e-05, "loss": 2.4999, "step": 611 }, { "epoch": 1.0259849119865885, "grad_norm": 6.676532745361328, "learning_rate": 1.526884101575596e-05, "loss": 2.6835, "step": 612 }, { "epoch": 1.027661357921207, "grad_norm": 7.590729236602783, "learning_rate": 1.5253469031662662e-05, "loss": 2.346, "step": 613 }, { "epoch": 1.0016764459346186, "grad_norm": 8.146598815917969, "learning_rate": 1.5238079882472465e-05, "loss": 2.5531, "step": 614 }, { "epoch": 1.0033528918692372, "grad_norm": 19.027986526489258, "learning_rate": 1.5222673618467615e-05, "loss": 2.5493, "step": 615 }, { "epoch": 1.0050293378038557, "grad_norm": 7.539775371551514, "learning_rate": 1.520725028998629e-05, "loss": 2.5868, "step": 616 }, { "epoch": 1.0067057837384745, "grad_norm": 5.7872467041015625, "learning_rate": 1.5191809947422405e-05, "loss": 2.4742, "step": 617 }, { "epoch": 1.008382229673093, "grad_norm": 6.555319786071777, "learning_rate": 1.5176352641225482e-05, "loss": 2.4683, "step": 618 }, { "epoch": 1.0100586756077117, "grad_norm": 5.78521203994751, "learning_rate": 1.5160878421900464e-05, "loss": 2.5911, "step": 619 }, { "epoch": 1.0117351215423303, "grad_norm": 7.895318984985352, "learning_rate": 1.5145387340007562e-05, "loss": 2.2215, "step": 620 }, { "epoch": 1.0134115674769488, "grad_norm": 6.820655822753906, "learning_rate": 1.512987944616207e-05, "loss": 2.3477, "step": 621 }, { "epoch": 1.0150880134115674, "grad_norm": 6.360649585723877, "learning_rate": 1.5114354791034225e-05, "loss": 2.4192, "step": 622 }, { "epoch": 1.016764459346186, "grad_norm": 5.126014709472656, "learning_rate": 1.5098813425349023e-05, "loss": 2.1464, "step": 623 }, { "epoch": 1.0184409052808048, "grad_norm": 6.055723190307617, "learning_rate": 1.5083255399886069e-05, "loss": 2.3008, "step": 624 }, { "epoch": 1.0201173512154234, "grad_norm": 6.189535617828369, "learning_rate": 1.5067680765479387e-05, "loss": 2.1766, "step": 625 }, { "epoch": 1.021793797150042, "grad_norm": 5.050477027893066, "learning_rate": 1.5052089573017283e-05, "loss": 2.333, "step": 626 }, { "epoch": 1.0234702430846605, "grad_norm": 16.873249053955078, "learning_rate": 1.5036481873442154e-05, "loss": 2.4696, "step": 627 }, { "epoch": 1.025146689019279, "grad_norm": 7.117854595184326, "learning_rate": 1.5020857717750345e-05, "loss": 2.4031, "step": 628 }, { "epoch": 1.0268231349538977, "grad_norm": 4.577010631561279, "learning_rate": 1.5005217156991951e-05, "loss": 2.6052, "step": 629 }, { "epoch": 1.0284995808885165, "grad_norm": 8.765073776245117, "learning_rate": 1.4989560242270686e-05, "loss": 2.2716, "step": 630 }, { "epoch": 1.030176026823135, "grad_norm": 7.53173303604126, "learning_rate": 1.4973887024743686e-05, "loss": 2.3277, "step": 631 }, { "epoch": 1.0318524727577536, "grad_norm": 8.31611156463623, "learning_rate": 1.4958197555621367e-05, "loss": 2.561, "step": 632 }, { "epoch": 1.0335289186923722, "grad_norm": 6.258691310882568, "learning_rate": 1.494249188616723e-05, "loss": 2.4224, "step": 633 }, { "epoch": 1.0352053646269908, "grad_norm": 4.758622646331787, "learning_rate": 1.4926770067697723e-05, "loss": 2.3267, "step": 634 }, { "epoch": 1.0368818105616093, "grad_norm": 5.946450710296631, "learning_rate": 1.4911032151582047e-05, "loss": 2.7048, "step": 635 }, { "epoch": 1.038558256496228, "grad_norm": 6.746589660644531, "learning_rate": 1.4895278189242017e-05, "loss": 1.9649, "step": 636 }, { "epoch": 1.0402347024308467, "grad_norm": 8.715204238891602, "learning_rate": 1.4879508232151852e-05, "loss": 2.1481, "step": 637 }, { "epoch": 1.0419111483654653, "grad_norm": 14.40350341796875, "learning_rate": 1.4863722331838056e-05, "loss": 2.5072, "step": 638 }, { "epoch": 1.0435875943000839, "grad_norm": 9.77599048614502, "learning_rate": 1.484792053987921e-05, "loss": 2.4201, "step": 639 }, { "epoch": 1.0452640402347024, "grad_norm": 7.829461097717285, "learning_rate": 1.4832102907905834e-05, "loss": 2.4308, "step": 640 }, { "epoch": 1.046940486169321, "grad_norm": 37.859127044677734, "learning_rate": 1.4816269487600184e-05, "loss": 2.1956, "step": 641 }, { "epoch": 1.0486169321039396, "grad_norm": 44.68777847290039, "learning_rate": 1.4800420330696118e-05, "loss": 2.6287, "step": 642 }, { "epoch": 1.0502933780385582, "grad_norm": 17.88043212890625, "learning_rate": 1.4784555488978903e-05, "loss": 2.697, "step": 643 }, { "epoch": 1.051969823973177, "grad_norm": 11.546906471252441, "learning_rate": 1.4768675014285063e-05, "loss": 2.4398, "step": 644 }, { "epoch": 1.0536462699077955, "grad_norm": 5.822961807250977, "learning_rate": 1.475277895850219e-05, "loss": 2.2602, "step": 645 }, { "epoch": 1.055322715842414, "grad_norm": 18.23073959350586, "learning_rate": 1.4736867373568795e-05, "loss": 2.2956, "step": 646 }, { "epoch": 1.0569991617770327, "grad_norm": 11.038312911987305, "learning_rate": 1.4720940311474121e-05, "loss": 2.2846, "step": 647 }, { "epoch": 1.0586756077116513, "grad_norm": 6.360939979553223, "learning_rate": 1.4704997824257992e-05, "loss": 2.4518, "step": 648 }, { "epoch": 1.0603520536462698, "grad_norm": 11.454911231994629, "learning_rate": 1.4689039964010612e-05, "loss": 2.4703, "step": 649 }, { "epoch": 1.0620284995808884, "grad_norm": 8.823407173156738, "learning_rate": 1.4673066782872436e-05, "loss": 2.301, "step": 650 }, { "epoch": 1.0637049455155072, "grad_norm": 5.283808708190918, "learning_rate": 1.4657078333033967e-05, "loss": 2.1602, "step": 651 }, { "epoch": 1.0653813914501258, "grad_norm": 5.425901412963867, "learning_rate": 1.4641074666735601e-05, "loss": 2.4133, "step": 652 }, { "epoch": 1.0670578373847444, "grad_norm": 7.320043087005615, "learning_rate": 1.4625055836267443e-05, "loss": 2.3213, "step": 653 }, { "epoch": 1.068734283319363, "grad_norm": 7.585738658905029, "learning_rate": 1.460902189396916e-05, "loss": 2.0821, "step": 654 }, { "epoch": 1.0704107292539815, "grad_norm": 4.476714134216309, "learning_rate": 1.4592972892229779e-05, "loss": 2.2537, "step": 655 }, { "epoch": 1.0720871751886, "grad_norm": 6.959841728210449, "learning_rate": 1.4576908883487549e-05, "loss": 2.5676, "step": 656 }, { "epoch": 1.0737636211232187, "grad_norm": 7.167144298553467, "learning_rate": 1.4560829920229735e-05, "loss": 2.4748, "step": 657 }, { "epoch": 1.0754400670578375, "grad_norm": 4.695733070373535, "learning_rate": 1.4544736054992479e-05, "loss": 2.441, "step": 658 }, { "epoch": 1.077116512992456, "grad_norm": 7.068019866943359, "learning_rate": 1.4528627340360602e-05, "loss": 2.5594, "step": 659 }, { "epoch": 1.0787929589270746, "grad_norm": 4.428487300872803, "learning_rate": 1.4512503828967454e-05, "loss": 2.5124, "step": 660 }, { "epoch": 1.0804694048616932, "grad_norm": 7.4426798820495605, "learning_rate": 1.4496365573494724e-05, "loss": 2.5621, "step": 661 }, { "epoch": 1.0821458507963118, "grad_norm": 5.771642208099365, "learning_rate": 1.4480212626672278e-05, "loss": 2.2478, "step": 662 }, { "epoch": 1.0838222967309303, "grad_norm": 5.8191752433776855, "learning_rate": 1.4464045041277983e-05, "loss": 2.3493, "step": 663 }, { "epoch": 1.0854987426655491, "grad_norm": 5.414979934692383, "learning_rate": 1.4447862870137542e-05, "loss": 2.5108, "step": 664 }, { "epoch": 1.0871751886001677, "grad_norm": 20.760400772094727, "learning_rate": 1.443166616612431e-05, "loss": 2.1701, "step": 665 }, { "epoch": 1.0888516345347863, "grad_norm": 5.042229652404785, "learning_rate": 1.4415454982159121e-05, "loss": 2.7314, "step": 666 }, { "epoch": 1.0905280804694049, "grad_norm": 10.158458709716797, "learning_rate": 1.4399229371210131e-05, "loss": 2.4635, "step": 667 }, { "epoch": 1.0922045264040234, "grad_norm": 12.621578216552734, "learning_rate": 1.4382989386292627e-05, "loss": 2.8241, "step": 668 }, { "epoch": 1.093880972338642, "grad_norm": 10.063324928283691, "learning_rate": 1.4366735080468872e-05, "loss": 2.2745, "step": 669 }, { "epoch": 1.0955574182732606, "grad_norm": 8.74553394317627, "learning_rate": 1.4350466506847901e-05, "loss": 2.4419, "step": 670 }, { "epoch": 1.0972338642078794, "grad_norm": 6.882994174957275, "learning_rate": 1.433418371858539e-05, "loss": 2.312, "step": 671 }, { "epoch": 1.098910310142498, "grad_norm": 4.4843549728393555, "learning_rate": 1.4317886768883445e-05, "loss": 2.4577, "step": 672 }, { "epoch": 1.1005867560771165, "grad_norm": 8.001328468322754, "learning_rate": 1.4301575710990449e-05, "loss": 2.4457, "step": 673 }, { "epoch": 1.102263202011735, "grad_norm": 6.44010066986084, "learning_rate": 1.4285250598200875e-05, "loss": 2.5129, "step": 674 }, { "epoch": 1.1039396479463537, "grad_norm": 6.990611553192139, "learning_rate": 1.4268911483855128e-05, "loss": 2.5095, "step": 675 }, { "epoch": 1.1056160938809723, "grad_norm": 6.912359714508057, "learning_rate": 1.4252558421339355e-05, "loss": 2.7567, "step": 676 }, { "epoch": 1.107292539815591, "grad_norm": 11.179567337036133, "learning_rate": 1.4236191464085286e-05, "loss": 2.464, "step": 677 }, { "epoch": 1.1089689857502096, "grad_norm": 5.879552364349365, "learning_rate": 1.421981066557003e-05, "loss": 2.3121, "step": 678 }, { "epoch": 1.1106454316848282, "grad_norm": 14.740165710449219, "learning_rate": 1.4203416079315944e-05, "loss": 2.2557, "step": 679 }, { "epoch": 1.1123218776194468, "grad_norm": 8.366064071655273, "learning_rate": 1.4187007758890423e-05, "loss": 2.4487, "step": 680 }, { "epoch": 1.1139983235540654, "grad_norm": 8.06722640991211, "learning_rate": 1.4170585757905742e-05, "loss": 2.4179, "step": 681 }, { "epoch": 1.115674769488684, "grad_norm": 8.88497257232666, "learning_rate": 1.4154150130018867e-05, "loss": 2.2504, "step": 682 }, { "epoch": 1.1173512154233025, "grad_norm": 7.274255752563477, "learning_rate": 1.4137700928931294e-05, "loss": 2.1664, "step": 683 }, { "epoch": 1.1190276613579213, "grad_norm": 10.20564079284668, "learning_rate": 1.4121238208388872e-05, "loss": 2.4834, "step": 684 }, { "epoch": 1.1207041072925399, "grad_norm": 7.8582048416137695, "learning_rate": 1.410476202218162e-05, "loss": 2.4194, "step": 685 }, { "epoch": 1.1223805532271585, "grad_norm": 16.65113067626953, "learning_rate": 1.4088272424143546e-05, "loss": 2.5727, "step": 686 }, { "epoch": 1.124056999161777, "grad_norm": 8.339031219482422, "learning_rate": 1.4071769468152492e-05, "loss": 2.3786, "step": 687 }, { "epoch": 1.1257334450963956, "grad_norm": 5.935298442840576, "learning_rate": 1.405525320812994e-05, "loss": 2.3723, "step": 688 }, { "epoch": 1.1274098910310142, "grad_norm": 37.79025650024414, "learning_rate": 1.4038723698040844e-05, "loss": 2.3192, "step": 689 }, { "epoch": 1.1290863369656328, "grad_norm": 9.222002983093262, "learning_rate": 1.4022180991893443e-05, "loss": 2.1408, "step": 690 }, { "epoch": 1.1307627829002516, "grad_norm": 4.768589973449707, "learning_rate": 1.4005625143739103e-05, "loss": 2.4107, "step": 691 }, { "epoch": 1.1324392288348701, "grad_norm": 6.6929545402526855, "learning_rate": 1.3989056207672125e-05, "loss": 2.6431, "step": 692 }, { "epoch": 1.1341156747694887, "grad_norm": 8.254537582397461, "learning_rate": 1.3972474237829577e-05, "loss": 2.5987, "step": 693 }, { "epoch": 1.1357921207041073, "grad_norm": 6.098884582519531, "learning_rate": 1.39558792883911e-05, "loss": 2.2504, "step": 694 }, { "epoch": 1.1374685666387259, "grad_norm": 7.766963005065918, "learning_rate": 1.3939271413578766e-05, "loss": 2.5297, "step": 695 }, { "epoch": 1.1391450125733444, "grad_norm": 12.682686805725098, "learning_rate": 1.3922650667656861e-05, "loss": 2.4865, "step": 696 }, { "epoch": 1.140821458507963, "grad_norm": 3.977105140686035, "learning_rate": 1.3906017104931734e-05, "loss": 2.5172, "step": 697 }, { "epoch": 1.1424979044425818, "grad_norm": 5.040312767028809, "learning_rate": 1.388937077975161e-05, "loss": 2.3193, "step": 698 }, { "epoch": 1.1441743503772004, "grad_norm": 10.801183700561523, "learning_rate": 1.3872711746506413e-05, "loss": 2.2413, "step": 699 }, { "epoch": 1.145850796311819, "grad_norm": 7.879208564758301, "learning_rate": 1.3856040059627588e-05, "loss": 2.2312, "step": 700 }, { "epoch": 1.1475272422464375, "grad_norm": 4.403446674346924, "learning_rate": 1.3839355773587932e-05, "loss": 2.4123, "step": 701 }, { "epoch": 1.1492036881810561, "grad_norm": 5.013376235961914, "learning_rate": 1.3822658942901394e-05, "loss": 2.2382, "step": 702 }, { "epoch": 1.1508801341156747, "grad_norm": 5.702521800994873, "learning_rate": 1.3805949622122927e-05, "loss": 2.4323, "step": 703 }, { "epoch": 1.1525565800502933, "grad_norm": 16.96495819091797, "learning_rate": 1.3789227865848282e-05, "loss": 2.1636, "step": 704 }, { "epoch": 1.154233025984912, "grad_norm": 6.872570037841797, "learning_rate": 1.3772493728713852e-05, "loss": 2.2834, "step": 705 }, { "epoch": 1.1559094719195306, "grad_norm": 8.170105934143066, "learning_rate": 1.3755747265396466e-05, "loss": 2.4428, "step": 706 }, { "epoch": 1.1575859178541492, "grad_norm": 16.639694213867188, "learning_rate": 1.3738988530613247e-05, "loss": 2.4099, "step": 707 }, { "epoch": 1.1592623637887678, "grad_norm": 8.593493461608887, "learning_rate": 1.37222175791214e-05, "loss": 2.5641, "step": 708 }, { "epoch": 1.1609388097233864, "grad_norm": 5.511089324951172, "learning_rate": 1.370543446571806e-05, "loss": 2.3449, "step": 709 }, { "epoch": 1.162615255658005, "grad_norm": 5.080988883972168, "learning_rate": 1.3688639245240078e-05, "loss": 1.9627, "step": 710 }, { "epoch": 1.1642917015926235, "grad_norm": 32.55522918701172, "learning_rate": 1.3671831972563889e-05, "loss": 2.4142, "step": 711 }, { "epoch": 1.1659681475272423, "grad_norm": 5.4992852210998535, "learning_rate": 1.3655012702605288e-05, "loss": 2.3351, "step": 712 }, { "epoch": 1.167644593461861, "grad_norm": 10.683517456054688, "learning_rate": 1.363818149031928e-05, "loss": 2.3027, "step": 713 }, { "epoch": 1.1693210393964795, "grad_norm": 7.025029182434082, "learning_rate": 1.3621338390699881e-05, "loss": 2.3981, "step": 714 }, { "epoch": 1.170997485331098, "grad_norm": 8.616060256958008, "learning_rate": 1.360448345877996e-05, "loss": 2.4668, "step": 715 }, { "epoch": 1.1726739312657166, "grad_norm": 6.0210862159729, "learning_rate": 1.3587616749631037e-05, "loss": 2.556, "step": 716 }, { "epoch": 1.1743503772003352, "grad_norm": 5.155324459075928, "learning_rate": 1.3570738318363113e-05, "loss": 1.8139, "step": 717 }, { "epoch": 1.176026823134954, "grad_norm": 7.9282989501953125, "learning_rate": 1.3553848220124497e-05, "loss": 2.2768, "step": 718 }, { "epoch": 1.1777032690695726, "grad_norm": 4.647676467895508, "learning_rate": 1.353694651010161e-05, "loss": 2.5524, "step": 719 }, { "epoch": 1.1793797150041911, "grad_norm": 5.671473026275635, "learning_rate": 1.3520033243518818e-05, "loss": 2.4651, "step": 720 }, { "epoch": 1.1810561609388097, "grad_norm": 4.144130706787109, "learning_rate": 1.3503108475638244e-05, "loss": 2.2353, "step": 721 }, { "epoch": 1.1827326068734283, "grad_norm": 4.976291179656982, "learning_rate": 1.3486172261759598e-05, "loss": 2.4431, "step": 722 }, { "epoch": 1.1844090528080469, "grad_norm": 7.107454776763916, "learning_rate": 1.3469224657219972e-05, "loss": 2.2559, "step": 723 }, { "epoch": 1.1860854987426657, "grad_norm": 6.96286153793335, "learning_rate": 1.3452265717393693e-05, "loss": 2.6742, "step": 724 }, { "epoch": 1.1877619446772842, "grad_norm": 23.62489128112793, "learning_rate": 1.3435295497692114e-05, "loss": 2.2214, "step": 725 }, { "epoch": 1.1894383906119028, "grad_norm": 9.193338394165039, "learning_rate": 1.3418314053563455e-05, "loss": 2.321, "step": 726 }, { "epoch": 1.1911148365465214, "grad_norm": 7.644625186920166, "learning_rate": 1.340132144049259e-05, "loss": 2.4631, "step": 727 }, { "epoch": 1.19279128248114, "grad_norm": 5.875265121459961, "learning_rate": 1.3384317714000907e-05, "loss": 2.3089, "step": 728 }, { "epoch": 1.1944677284157585, "grad_norm": 6.8550310134887695, "learning_rate": 1.3367302929646095e-05, "loss": 2.2461, "step": 729 }, { "epoch": 1.1961441743503771, "grad_norm": 4.478893756866455, "learning_rate": 1.3350277143021981e-05, "loss": 2.4063, "step": 730 }, { "epoch": 1.197820620284996, "grad_norm": 12.494105339050293, "learning_rate": 1.3333240409758328e-05, "loss": 2.3796, "step": 731 }, { "epoch": 1.1994970662196145, "grad_norm": 8.968586921691895, "learning_rate": 1.331619278552068e-05, "loss": 2.3247, "step": 732 }, { "epoch": 1.201173512154233, "grad_norm": 6.821298122406006, "learning_rate": 1.3299134326010155e-05, "loss": 2.2768, "step": 733 }, { "epoch": 1.2028499580888516, "grad_norm": 11.386137962341309, "learning_rate": 1.328206508696328e-05, "loss": 2.5046, "step": 734 }, { "epoch": 1.2045264040234702, "grad_norm": 4.4509172439575195, "learning_rate": 1.3264985124151801e-05, "loss": 2.6289, "step": 735 }, { "epoch": 1.2062028499580888, "grad_norm": 19.35344123840332, "learning_rate": 1.3247894493382507e-05, "loss": 2.4059, "step": 736 }, { "epoch": 1.2078792958927074, "grad_norm": 5.986297130584717, "learning_rate": 1.323079325049703e-05, "loss": 2.2961, "step": 737 }, { "epoch": 1.2095557418273262, "grad_norm": 13.021050453186035, "learning_rate": 1.3213681451371697e-05, "loss": 2.2731, "step": 738 }, { "epoch": 1.2112321877619447, "grad_norm": 14.130514144897461, "learning_rate": 1.3196559151917305e-05, "loss": 2.0552, "step": 739 }, { "epoch": 1.2129086336965633, "grad_norm": 13.177688598632812, "learning_rate": 1.3179426408078975e-05, "loss": 2.0456, "step": 740 }, { "epoch": 1.214585079631182, "grad_norm": 4.816995620727539, "learning_rate": 1.3162283275835938e-05, "loss": 2.0371, "step": 741 }, { "epoch": 1.2162615255658005, "grad_norm": 5.219287395477295, "learning_rate": 1.3145129811201395e-05, "loss": 2.4395, "step": 742 }, { "epoch": 1.217937971500419, "grad_norm": 4.969995975494385, "learning_rate": 1.3127966070222273e-05, "loss": 2.1573, "step": 743 }, { "epoch": 1.2196144174350376, "grad_norm": 6.253572463989258, "learning_rate": 1.31107921089791e-05, "loss": 2.2704, "step": 744 }, { "epoch": 1.2212908633696564, "grad_norm": 4.564723491668701, "learning_rate": 1.3093607983585787e-05, "loss": 2.0412, "step": 745 }, { "epoch": 1.2212908633696564, "eval_loss": 2.3626017570495605, "eval_runtime": 170.0639, "eval_samples_per_second": 3.087, "eval_steps_per_second": 1.546, "step": 745 }, { "epoch": 1.222967309304275, "grad_norm": 8.809134483337402, "learning_rate": 1.3076413750189468e-05, "loss": 2.1204, "step": 746 }, { "epoch": 1.2246437552388936, "grad_norm": 8.777671813964844, "learning_rate": 1.3059209464970278e-05, "loss": 2.5949, "step": 747 }, { "epoch": 1.2263202011735121, "grad_norm": 4.983223915100098, "learning_rate": 1.304199518414122e-05, "loss": 2.5453, "step": 748 }, { "epoch": 1.2279966471081307, "grad_norm": 12.908360481262207, "learning_rate": 1.3024770963947945e-05, "loss": 2.1027, "step": 749 }, { "epoch": 1.2296730930427493, "grad_norm": 6.457499027252197, "learning_rate": 1.3007536860668588e-05, "loss": 2.2025, "step": 750 }, { "epoch": 1.2313495389773679, "grad_norm": 10.60882568359375, "learning_rate": 1.2990292930613559e-05, "loss": 2.4558, "step": 751 }, { "epoch": 1.2330259849119867, "grad_norm": 5.248461723327637, "learning_rate": 1.2973039230125397e-05, "loss": 2.271, "step": 752 }, { "epoch": 1.2347024308466052, "grad_norm": 9.996805191040039, "learning_rate": 1.2955775815578549e-05, "loss": 2.2297, "step": 753 }, { "epoch": 1.2363788767812238, "grad_norm": 5.056663990020752, "learning_rate": 1.2938502743379212e-05, "loss": 1.8947, "step": 754 }, { "epoch": 1.2380553227158424, "grad_norm": 10.086054801940918, "learning_rate": 1.2921220069965125e-05, "loss": 2.3542, "step": 755 }, { "epoch": 1.239731768650461, "grad_norm": 7.093770503997803, "learning_rate": 1.2903927851805416e-05, "loss": 2.6345, "step": 756 }, { "epoch": 1.2414082145850796, "grad_norm": 4.994962215423584, "learning_rate": 1.288662614540038e-05, "loss": 2.0281, "step": 757 }, { "epoch": 1.2430846605196981, "grad_norm": 7.101597785949707, "learning_rate": 1.2869315007281334e-05, "loss": 2.4193, "step": 758 }, { "epoch": 1.244761106454317, "grad_norm": 5.4213643074035645, "learning_rate": 1.285199449401039e-05, "loss": 2.2455, "step": 759 }, { "epoch": 1.2464375523889355, "grad_norm": 5.204904556274414, "learning_rate": 1.2834664662180311e-05, "loss": 2.376, "step": 760 }, { "epoch": 1.248113998323554, "grad_norm": 6.285387992858887, "learning_rate": 1.2817325568414299e-05, "loss": 2.6934, "step": 761 }, { "epoch": 1.2497904442581727, "grad_norm": 4.555075168609619, "learning_rate": 1.2799977269365818e-05, "loss": 2.2706, "step": 762 }, { "epoch": 1.2514668901927912, "grad_norm": 7.026051044464111, "learning_rate": 1.2782619821718408e-05, "loss": 2.513, "step": 763 }, { "epoch": 1.25314333612741, "grad_norm": 13.471336364746094, "learning_rate": 1.2765253282185505e-05, "loss": 2.2633, "step": 764 }, { "epoch": 1.2548197820620284, "grad_norm": 8.327913284301758, "learning_rate": 1.2747877707510252e-05, "loss": 2.8966, "step": 765 }, { "epoch": 1.2564962279966472, "grad_norm": 11.460576057434082, "learning_rate": 1.2730493154465311e-05, "loss": 2.433, "step": 766 }, { "epoch": 1.2581726739312658, "grad_norm": 21.8690128326416, "learning_rate": 1.2713099679852683e-05, "loss": 2.0391, "step": 767 }, { "epoch": 1.2598491198658843, "grad_norm": 23.172582626342773, "learning_rate": 1.2695697340503506e-05, "loss": 2.2557, "step": 768 }, { "epoch": 1.261525565800503, "grad_norm": 5.4329376220703125, "learning_rate": 1.26782861932779e-05, "loss": 1.7829, "step": 769 }, { "epoch": 1.2632020117351215, "grad_norm": 15.813942909240723, "learning_rate": 1.2660866295064754e-05, "loss": 2.2456, "step": 770 }, { "epoch": 1.2648784576697403, "grad_norm": 27.818836212158203, "learning_rate": 1.2643437702781559e-05, "loss": 1.9923, "step": 771 }, { "epoch": 1.2665549036043586, "grad_norm": 6.520410537719727, "learning_rate": 1.2626000473374194e-05, "loss": 2.4505, "step": 772 }, { "epoch": 1.2682313495389774, "grad_norm": 4.259927272796631, "learning_rate": 1.2608554663816779e-05, "loss": 2.4968, "step": 773 }, { "epoch": 1.269907795473596, "grad_norm": 4.41032600402832, "learning_rate": 1.259110033111146e-05, "loss": 2.3696, "step": 774 }, { "epoch": 1.2715842414082146, "grad_norm": 6.355128288269043, "learning_rate": 1.2573637532288232e-05, "loss": 2.4693, "step": 775 }, { "epoch": 1.2732606873428332, "grad_norm": 16.281635284423828, "learning_rate": 1.2556166324404747e-05, "loss": 2.207, "step": 776 }, { "epoch": 1.2749371332774517, "grad_norm": 4.314234256744385, "learning_rate": 1.2538686764546147e-05, "loss": 2.4122, "step": 777 }, { "epoch": 1.2766135792120705, "grad_norm": 20.29671287536621, "learning_rate": 1.2521198909824844e-05, "loss": 2.1538, "step": 778 }, { "epoch": 1.278290025146689, "grad_norm": 32.163490295410156, "learning_rate": 1.2503702817380368e-05, "loss": 2.5883, "step": 779 }, { "epoch": 1.2799664710813077, "grad_norm": 6.775224208831787, "learning_rate": 1.2486198544379156e-05, "loss": 2.1191, "step": 780 }, { "epoch": 1.2816429170159263, "grad_norm": 5.697524547576904, "learning_rate": 1.2468686148014379e-05, "loss": 2.1629, "step": 781 }, { "epoch": 1.2833193629505448, "grad_norm": 20.441125869750977, "learning_rate": 1.2451165685505746e-05, "loss": 2.2228, "step": 782 }, { "epoch": 1.2849958088851634, "grad_norm": 4.5879058837890625, "learning_rate": 1.243363721409933e-05, "loss": 2.27, "step": 783 }, { "epoch": 1.286672254819782, "grad_norm": 4.568212509155273, "learning_rate": 1.2416100791067356e-05, "loss": 2.0187, "step": 784 }, { "epoch": 1.2883487007544008, "grad_norm": 6.81599235534668, "learning_rate": 1.2398556473708046e-05, "loss": 1.9058, "step": 785 }, { "epoch": 1.2900251466890194, "grad_norm": 8.967713356018066, "learning_rate": 1.2381004319345402e-05, "loss": 1.9508, "step": 786 }, { "epoch": 1.291701592623638, "grad_norm": 6.585296630859375, "learning_rate": 1.2363444385329052e-05, "loss": 2.2702, "step": 787 }, { "epoch": 1.2933780385582565, "grad_norm": 3.841449499130249, "learning_rate": 1.2345876729034018e-05, "loss": 1.9843, "step": 788 }, { "epoch": 1.295054484492875, "grad_norm": 6.293701648712158, "learning_rate": 1.2328301407860575e-05, "loss": 2.2729, "step": 789 }, { "epoch": 1.2967309304274937, "grad_norm": 9.20996379852295, "learning_rate": 1.231071847923403e-05, "loss": 2.2307, "step": 790 }, { "epoch": 1.2984073763621122, "grad_norm": 9.385099411010742, "learning_rate": 1.2293128000604552e-05, "loss": 2.4403, "step": 791 }, { "epoch": 1.300083822296731, "grad_norm": 10.362083435058594, "learning_rate": 1.2275530029446975e-05, "loss": 2.4731, "step": 792 }, { "epoch": 1.3017602682313496, "grad_norm": 7.216504096984863, "learning_rate": 1.2257924623260618e-05, "loss": 2.3733, "step": 793 }, { "epoch": 1.3034367141659682, "grad_norm": 9.503218650817871, "learning_rate": 1.2240311839569085e-05, "loss": 2.1359, "step": 794 }, { "epoch": 1.3051131601005868, "grad_norm": 5.100936412811279, "learning_rate": 1.2222691735920103e-05, "loss": 2.2447, "step": 795 }, { "epoch": 1.3067896060352053, "grad_norm": 10.747591018676758, "learning_rate": 1.2205064369885291e-05, "loss": 2.1219, "step": 796 }, { "epoch": 1.308466051969824, "grad_norm": 19.294620513916016, "learning_rate": 1.2187429799060017e-05, "loss": 2.443, "step": 797 }, { "epoch": 1.3101424979044425, "grad_norm": 21.347679138183594, "learning_rate": 1.2169788081063181e-05, "loss": 2.3474, "step": 798 }, { "epoch": 1.3118189438390613, "grad_norm": 7.256950378417969, "learning_rate": 1.2152139273537042e-05, "loss": 1.9597, "step": 799 }, { "epoch": 1.3134953897736799, "grad_norm": 8.270674705505371, "learning_rate": 1.213448343414701e-05, "loss": 2.3496, "step": 800 }, { "epoch": 1.3151718357082984, "grad_norm": 8.472537994384766, "learning_rate": 1.2116820620581486e-05, "loss": 2.2487, "step": 801 }, { "epoch": 1.316848281642917, "grad_norm": 11.256389617919922, "learning_rate": 1.209915089055165e-05, "loss": 2.2415, "step": 802 }, { "epoch": 1.3185247275775356, "grad_norm": 10.744033813476562, "learning_rate": 1.2081474301791286e-05, "loss": 2.4757, "step": 803 }, { "epoch": 1.3202011735121542, "grad_norm": 8.664780616760254, "learning_rate": 1.2063790912056577e-05, "loss": 2.2214, "step": 804 }, { "epoch": 1.3218776194467727, "grad_norm": 38.659423828125, "learning_rate": 1.2046100779125943e-05, "loss": 2.1786, "step": 805 }, { "epoch": 1.3235540653813915, "grad_norm": 12.288658142089844, "learning_rate": 1.2028403960799821e-05, "loss": 2.6573, "step": 806 }, { "epoch": 1.32523051131601, "grad_norm": 15.211291313171387, "learning_rate": 1.2010700514900509e-05, "loss": 2.6974, "step": 807 }, { "epoch": 1.3269069572506287, "grad_norm": 26.189373016357422, "learning_rate": 1.1992990499271939e-05, "loss": 2.2359, "step": 808 }, { "epoch": 1.3285834031852473, "grad_norm": 18.173555374145508, "learning_rate": 1.1975273971779528e-05, "loss": 2.51, "step": 809 }, { "epoch": 1.3302598491198658, "grad_norm": 56.32579803466797, "learning_rate": 1.1957550990309958e-05, "loss": 2.4327, "step": 810 }, { "epoch": 1.3319362950544846, "grad_norm": 31.29780387878418, "learning_rate": 1.1939821612771008e-05, "loss": 2.2537, "step": 811 }, { "epoch": 1.333612740989103, "grad_norm": 18.143177032470703, "learning_rate": 1.1922085897091342e-05, "loss": 2.3451, "step": 812 }, { "epoch": 1.3352891869237218, "grad_norm": 6.67291784286499, "learning_rate": 1.1904343901220345e-05, "loss": 2.4622, "step": 813 }, { "epoch": 1.3369656328583404, "grad_norm": 8.449662208557129, "learning_rate": 1.1886595683127917e-05, "loss": 2.0544, "step": 814 }, { "epoch": 1.338642078792959, "grad_norm": 5.747905254364014, "learning_rate": 1.1868841300804293e-05, "loss": 2.5466, "step": 815 }, { "epoch": 1.3403185247275775, "grad_norm": 7.238699913024902, "learning_rate": 1.1851080812259839e-05, "loss": 2.0036, "step": 816 }, { "epoch": 1.341994970662196, "grad_norm": 9.35036849975586, "learning_rate": 1.183331427552488e-05, "loss": 2.1333, "step": 817 }, { "epoch": 1.3436714165968149, "grad_norm": 5.896330833435059, "learning_rate": 1.1815541748649504e-05, "loss": 2.1309, "step": 818 }, { "epoch": 1.3453478625314332, "grad_norm": 5.703736782073975, "learning_rate": 1.1797763289703364e-05, "loss": 2.0636, "step": 819 }, { "epoch": 1.347024308466052, "grad_norm": 8.290069580078125, "learning_rate": 1.1779978956775507e-05, "loss": 1.9612, "step": 820 }, { "epoch": 1.3487007544006706, "grad_norm": 8.098957061767578, "learning_rate": 1.1762188807974153e-05, "loss": 2.1532, "step": 821 }, { "epoch": 1.3503772003352892, "grad_norm": 12.725566864013672, "learning_rate": 1.1744392901426549e-05, "loss": 2.0479, "step": 822 }, { "epoch": 1.3520536462699078, "grad_norm": 7.343058109283447, "learning_rate": 1.1726591295278732e-05, "loss": 1.9967, "step": 823 }, { "epoch": 1.3537300922045263, "grad_norm": 7.3642659187316895, "learning_rate": 1.1708784047695385e-05, "loss": 2.5486, "step": 824 }, { "epoch": 1.3554065381391451, "grad_norm": 5.633703708648682, "learning_rate": 1.1690971216859599e-05, "loss": 2.3653, "step": 825 }, { "epoch": 1.3570829840737635, "grad_norm": 5.574102878570557, "learning_rate": 1.1673152860972725e-05, "loss": 2.2722, "step": 826 }, { "epoch": 1.3587594300083823, "grad_norm": 13.572903633117676, "learning_rate": 1.1655329038254159e-05, "loss": 2.3043, "step": 827 }, { "epoch": 1.3604358759430009, "grad_norm": 15.497044563293457, "learning_rate": 1.163749980694117e-05, "loss": 2.2273, "step": 828 }, { "epoch": 1.3621123218776194, "grad_norm": 7.474100112915039, "learning_rate": 1.161966522528868e-05, "loss": 2.2127, "step": 829 }, { "epoch": 1.363788767812238, "grad_norm": 10.419915199279785, "learning_rate": 1.1601825351569105e-05, "loss": 2.5162, "step": 830 }, { "epoch": 1.3654652137468566, "grad_norm": 5.53465461730957, "learning_rate": 1.158398024407215e-05, "loss": 2.3196, "step": 831 }, { "epoch": 1.3671416596814754, "grad_norm": 15.311758995056152, "learning_rate": 1.1566129961104628e-05, "loss": 1.9806, "step": 832 }, { "epoch": 1.368818105616094, "grad_norm": 9.462645530700684, "learning_rate": 1.1548274560990244e-05, "loss": 2.2935, "step": 833 }, { "epoch": 1.3704945515507125, "grad_norm": 12.144463539123535, "learning_rate": 1.1530414102069444e-05, "loss": 2.0679, "step": 834 }, { "epoch": 1.3721709974853311, "grad_norm": 7.723580360412598, "learning_rate": 1.1512548642699186e-05, "loss": 2.2991, "step": 835 }, { "epoch": 1.3738474434199497, "grad_norm": 9.321198463439941, "learning_rate": 1.1494678241252781e-05, "loss": 2.4107, "step": 836 }, { "epoch": 1.3755238893545683, "grad_norm": 8.775720596313477, "learning_rate": 1.147680295611967e-05, "loss": 2.2669, "step": 837 }, { "epoch": 1.3772003352891868, "grad_norm": 7.507436275482178, "learning_rate": 1.1458922845705267e-05, "loss": 2.3821, "step": 838 }, { "epoch": 1.3788767812238056, "grad_norm": 10.198686599731445, "learning_rate": 1.1441037968430743e-05, "loss": 2.3453, "step": 839 }, { "epoch": 1.3805532271584242, "grad_norm": 4.230295658111572, "learning_rate": 1.1423148382732854e-05, "loss": 1.9377, "step": 840 }, { "epoch": 1.3822296730930428, "grad_norm": 4.215802192687988, "learning_rate": 1.1405254147063722e-05, "loss": 2.4109, "step": 841 }, { "epoch": 1.3839061190276614, "grad_norm": 3.8975067138671875, "learning_rate": 1.1387355319890685e-05, "loss": 2.1308, "step": 842 }, { "epoch": 1.38558256496228, "grad_norm": 6.950760841369629, "learning_rate": 1.136945195969606e-05, "loss": 2.0955, "step": 843 }, { "epoch": 1.3872590108968985, "grad_norm": 6.852515697479248, "learning_rate": 1.1351544124977001e-05, "loss": 2.1729, "step": 844 }, { "epoch": 1.388935456831517, "grad_norm": 7.585150718688965, "learning_rate": 1.1333631874245252e-05, "loss": 2.3506, "step": 845 }, { "epoch": 1.3906119027661359, "grad_norm": 9.949862480163574, "learning_rate": 1.1315715266027014e-05, "loss": 2.3656, "step": 846 }, { "epoch": 1.3922883487007545, "grad_norm": 8.54454517364502, "learning_rate": 1.1297794358862705e-05, "loss": 2.4884, "step": 847 }, { "epoch": 1.393964794635373, "grad_norm": 14.651853561401367, "learning_rate": 1.127986921130681e-05, "loss": 2.3233, "step": 848 }, { "epoch": 1.3956412405699916, "grad_norm": 7.670744895935059, "learning_rate": 1.126193988192764e-05, "loss": 2.6365, "step": 849 }, { "epoch": 1.3973176865046102, "grad_norm": 7.118234634399414, "learning_rate": 1.1244006429307195e-05, "loss": 2.077, "step": 850 }, { "epoch": 1.3989941324392288, "grad_norm": 12.22930908203125, "learning_rate": 1.1226068912040935e-05, "loss": 2.8746, "step": 851 }, { "epoch": 1.4006705783738473, "grad_norm": 8.1715669631958, "learning_rate": 1.1208127388737613e-05, "loss": 2.371, "step": 852 }, { "epoch": 1.4023470243084661, "grad_norm": 14.557357788085938, "learning_rate": 1.119018191801905e-05, "loss": 2.2965, "step": 853 }, { "epoch": 1.4040234702430847, "grad_norm": 10.226070404052734, "learning_rate": 1.1172232558519983e-05, "loss": 2.4912, "step": 854 }, { "epoch": 1.4056999161777033, "grad_norm": 13.017776489257812, "learning_rate": 1.1154279368887848e-05, "loss": 2.1846, "step": 855 }, { "epoch": 1.4073763621123219, "grad_norm": 4.491578578948975, "learning_rate": 1.1136322407782603e-05, "loss": 2.4329, "step": 856 }, { "epoch": 1.4090528080469404, "grad_norm": 7.38001823425293, "learning_rate": 1.1118361733876513e-05, "loss": 2.202, "step": 857 }, { "epoch": 1.410729253981559, "grad_norm": 14.184925079345703, "learning_rate": 1.1100397405853988e-05, "loss": 2.1145, "step": 858 }, { "epoch": 1.4124056999161776, "grad_norm": 12.585079193115234, "learning_rate": 1.1082429482411373e-05, "loss": 2.4345, "step": 859 }, { "epoch": 1.4140821458507964, "grad_norm": 6.896921157836914, "learning_rate": 1.1064458022256764e-05, "loss": 2.1899, "step": 860 }, { "epoch": 1.415758591785415, "grad_norm": 7.941607475280762, "learning_rate": 1.1046483084109801e-05, "loss": 2.2611, "step": 861 }, { "epoch": 1.4174350377200335, "grad_norm": 13.742573738098145, "learning_rate": 1.1028504726701502e-05, "loss": 2.5155, "step": 862 }, { "epoch": 1.4191114836546521, "grad_norm": 5.462913990020752, "learning_rate": 1.1010523008774052e-05, "loss": 2.3362, "step": 863 }, { "epoch": 1.4207879295892707, "grad_norm": 9.167533874511719, "learning_rate": 1.0992537989080618e-05, "loss": 2.212, "step": 864 }, { "epoch": 1.4224643755238895, "grad_norm": 12.828566551208496, "learning_rate": 1.0974549726385146e-05, "loss": 2.1003, "step": 865 }, { "epoch": 1.4241408214585078, "grad_norm": 11.575430870056152, "learning_rate": 1.0956558279462188e-05, "loss": 2.2957, "step": 866 }, { "epoch": 1.4258172673931266, "grad_norm": 4.52449893951416, "learning_rate": 1.0938563707096697e-05, "loss": 2.3084, "step": 867 }, { "epoch": 1.4274937133277452, "grad_norm": 7.962285041809082, "learning_rate": 1.092056606808384e-05, "loss": 1.9497, "step": 868 }, { "epoch": 1.4291701592623638, "grad_norm": 5.646916389465332, "learning_rate": 1.09025654212288e-05, "loss": 2.2562, "step": 869 }, { "epoch": 1.4308466051969824, "grad_norm": 10.125725746154785, "learning_rate": 1.0884561825346589e-05, "loss": 2.3797, "step": 870 }, { "epoch": 1.432523051131601, "grad_norm": 7.040199279785156, "learning_rate": 1.0866555339261854e-05, "loss": 2.2331, "step": 871 }, { "epoch": 1.4341994970662197, "grad_norm": 7.917027950286865, "learning_rate": 1.084854602180869e-05, "loss": 2.4089, "step": 872 }, { "epoch": 1.435875943000838, "grad_norm": 7.044780731201172, "learning_rate": 1.083053393183044e-05, "loss": 2.5574, "step": 873 }, { "epoch": 1.437552388935457, "grad_norm": 6.6082868576049805, "learning_rate": 1.0812519128179501e-05, "loss": 2.3254, "step": 874 }, { "epoch": 1.4392288348700755, "grad_norm": 5.847382068634033, "learning_rate": 1.0794501669717146e-05, "loss": 2.4952, "step": 875 }, { "epoch": 1.440905280804694, "grad_norm": 5.139172077178955, "learning_rate": 1.0776481615313314e-05, "loss": 2.6738, "step": 876 }, { "epoch": 1.4425817267393126, "grad_norm": 8.412246704101562, "learning_rate": 1.0758459023846435e-05, "loss": 2.1551, "step": 877 }, { "epoch": 1.4442581726739312, "grad_norm": 14.49990177154541, "learning_rate": 1.0740433954203215e-05, "loss": 2.1967, "step": 878 }, { "epoch": 1.44593461860855, "grad_norm": 6.606904983520508, "learning_rate": 1.0722406465278475e-05, "loss": 2.2967, "step": 879 }, { "epoch": 1.4476110645431683, "grad_norm": 12.395999908447266, "learning_rate": 1.070437661597492e-05, "loss": 2.3293, "step": 880 }, { "epoch": 1.4492875104777871, "grad_norm": 5.6000447273254395, "learning_rate": 1.0686344465202988e-05, "loss": 2.1421, "step": 881 }, { "epoch": 1.4509639564124057, "grad_norm": 6.1536102294921875, "learning_rate": 1.066831007188062e-05, "loss": 2.4295, "step": 882 }, { "epoch": 1.4526404023470243, "grad_norm": 8.98680305480957, "learning_rate": 1.06502734949331e-05, "loss": 2.2217, "step": 883 }, { "epoch": 1.4543168482816429, "grad_norm": 15.467352867126465, "learning_rate": 1.0632234793292826e-05, "loss": 2.2295, "step": 884 }, { "epoch": 1.4559932942162614, "grad_norm": 6.254953861236572, "learning_rate": 1.0614194025899163e-05, "loss": 2.1731, "step": 885 }, { "epoch": 1.4576697401508802, "grad_norm": 5.679813385009766, "learning_rate": 1.05961512516982e-05, "loss": 2.1994, "step": 886 }, { "epoch": 1.4593461860854988, "grad_norm": 12.249408721923828, "learning_rate": 1.0578106529642606e-05, "loss": 2.2795, "step": 887 }, { "epoch": 1.4610226320201174, "grad_norm": 36.43180847167969, "learning_rate": 1.0560059918691396e-05, "loss": 2.2238, "step": 888 }, { "epoch": 1.462699077954736, "grad_norm": 95.63656616210938, "learning_rate": 1.0542011477809776e-05, "loss": 2.2013, "step": 889 }, { "epoch": 1.4643755238893545, "grad_norm": 48.707584381103516, "learning_rate": 1.0523961265968909e-05, "loss": 2.3434, "step": 890 }, { "epoch": 1.4660519698239731, "grad_norm": 8.923469543457031, "learning_rate": 1.050590934214576e-05, "loss": 2.2588, "step": 891 }, { "epoch": 1.4677284157585917, "grad_norm": 29.601642608642578, "learning_rate": 1.0487855765322886e-05, "loss": 2.4926, "step": 892 }, { "epoch": 1.4694048616932105, "grad_norm": 17.259878158569336, "learning_rate": 1.0469800594488237e-05, "loss": 2.0528, "step": 893 }, { "epoch": 1.471081307627829, "grad_norm": 13.11846923828125, "learning_rate": 1.045174388863498e-05, "loss": 2.322, "step": 894 }, { "epoch": 1.471081307627829, "eval_loss": 2.30824875831604, "eval_runtime": 170.0728, "eval_samples_per_second": 3.087, "eval_steps_per_second": 1.546, "step": 894 }, { "epoch": 1.4727577535624476, "grad_norm": 17.859909057617188, "learning_rate": 1.0433685706761291e-05, "loss": 2.0382, "step": 895 }, { "epoch": 1.4744341994970662, "grad_norm": 4.719820022583008, "learning_rate": 1.0415626107870171e-05, "loss": 2.228, "step": 896 }, { "epoch": 1.4761106454316848, "grad_norm": 15.876289367675781, "learning_rate": 1.039756515096926e-05, "loss": 2.1097, "step": 897 }, { "epoch": 1.4777870913663034, "grad_norm": 11.272828102111816, "learning_rate": 1.0379502895070615e-05, "loss": 2.1039, "step": 898 }, { "epoch": 1.479463537300922, "grad_norm": 14.485950469970703, "learning_rate": 1.0361439399190554e-05, "loss": 2.0921, "step": 899 }, { "epoch": 1.4811399832355407, "grad_norm": 8.811751365661621, "learning_rate": 1.0343374722349442e-05, "loss": 2.5925, "step": 900 }, { "epoch": 1.4828164291701593, "grad_norm": 9.245624542236328, "learning_rate": 1.0325308923571506e-05, "loss": 2.268, "step": 901 }, { "epoch": 1.484492875104778, "grad_norm": 12.345224380493164, "learning_rate": 1.0307242061884625e-05, "loss": 2.5139, "step": 902 }, { "epoch": 1.4861693210393965, "grad_norm": 8.364679336547852, "learning_rate": 1.028917419632017e-05, "loss": 2.3901, "step": 903 }, { "epoch": 1.487845766974015, "grad_norm": 9.738297462463379, "learning_rate": 1.0271105385912779e-05, "loss": 2.1516, "step": 904 }, { "epoch": 1.4895222129086336, "grad_norm": 5.286563396453857, "learning_rate": 1.0253035689700186e-05, "loss": 2.1824, "step": 905 }, { "epoch": 1.4911986588432522, "grad_norm": 11.570133209228516, "learning_rate": 1.0234965166723003e-05, "loss": 2.1925, "step": 906 }, { "epoch": 1.492875104777871, "grad_norm": 9.793863296508789, "learning_rate": 1.0216893876024567e-05, "loss": 2.4608, "step": 907 }, { "epoch": 1.4945515507124896, "grad_norm": 58.09648895263672, "learning_rate": 1.0198821876650702e-05, "loss": 2.2604, "step": 908 }, { "epoch": 1.4962279966471081, "grad_norm": 6.563573837280273, "learning_rate": 1.0180749227649564e-05, "loss": 2.1474, "step": 909 }, { "epoch": 1.4979044425817267, "grad_norm": 6.815333843231201, "learning_rate": 1.0162675988071419e-05, "loss": 2.3145, "step": 910 }, { "epoch": 1.4995808885163453, "grad_norm": 9.559165000915527, "learning_rate": 1.014460221696847e-05, "loss": 2.4443, "step": 911 }, { "epoch": 1.501257334450964, "grad_norm": 10.48674201965332, "learning_rate": 1.0126527973394653e-05, "loss": 2.1422, "step": 912 }, { "epoch": 1.5029337803855825, "grad_norm": 6.076246738433838, "learning_rate": 1.0108453316405456e-05, "loss": 2.2119, "step": 913 }, { "epoch": 1.5046102263202012, "grad_norm": 3.87929368019104, "learning_rate": 1.0090378305057702e-05, "loss": 2.0507, "step": 914 }, { "epoch": 1.5062866722548198, "grad_norm": 12.247063636779785, "learning_rate": 1.0072302998409384e-05, "loss": 2.211, "step": 915 }, { "epoch": 1.5079631181894384, "grad_norm": 5.461767673492432, "learning_rate": 1.0054227455519459e-05, "loss": 2.2925, "step": 916 }, { "epoch": 1.509639564124057, "grad_norm": 9.779470443725586, "learning_rate": 1.0036151735447652e-05, "loss": 2.3825, "step": 917 }, { "epoch": 1.5113160100586756, "grad_norm": 5.849466800689697, "learning_rate": 1.0018075897254269e-05, "loss": 2.1729, "step": 918 }, { "epoch": 1.5129924559932943, "grad_norm": 10.019554138183594, "learning_rate": 1e-05, "loss": 2.1547, "step": 919 }, { "epoch": 1.5146689019279127, "grad_norm": 6.485411167144775, "learning_rate": 9.981924102745734e-06, "loss": 2.3309, "step": 920 }, { "epoch": 1.5163453478625315, "grad_norm": 9.050286293029785, "learning_rate": 9.963848264552351e-06, "loss": 2.5027, "step": 921 }, { "epoch": 1.51802179379715, "grad_norm": 16.020944595336914, "learning_rate": 9.945772544480543e-06, "loss": 2.4543, "step": 922 }, { "epoch": 1.5196982397317687, "grad_norm": 5.056370258331299, "learning_rate": 9.927697001590618e-06, "loss": 2.1948, "step": 923 }, { "epoch": 1.5213746856663872, "grad_norm": 9.948664665222168, "learning_rate": 9.909621694942301e-06, "loss": 2.3463, "step": 924 }, { "epoch": 1.5230511316010058, "grad_norm": 15.54350471496582, "learning_rate": 9.891546683594548e-06, "loss": 2.0823, "step": 925 }, { "epoch": 1.5247275775356246, "grad_norm": 8.221317291259766, "learning_rate": 9.873472026605347e-06, "loss": 2.1233, "step": 926 }, { "epoch": 1.526404023470243, "grad_norm": 25.38336944580078, "learning_rate": 9.855397783031531e-06, "loss": 2.4209, "step": 927 }, { "epoch": 1.5280804694048618, "grad_norm": 5.437260627746582, "learning_rate": 9.837324011928584e-06, "loss": 2.2921, "step": 928 }, { "epoch": 1.5297569153394803, "grad_norm": 4.227839946746826, "learning_rate": 9.819250772350441e-06, "loss": 2.3951, "step": 929 }, { "epoch": 1.531433361274099, "grad_norm": 7.979526996612549, "learning_rate": 9.801178123349298e-06, "loss": 2.1476, "step": 930 }, { "epoch": 1.5331098072087175, "grad_norm": 7.771272659301758, "learning_rate": 9.783106123975436e-06, "loss": 2.0426, "step": 931 }, { "epoch": 1.534786253143336, "grad_norm": 7.528907299041748, "learning_rate": 9.765034833277e-06, "loss": 2.4708, "step": 932 }, { "epoch": 1.5364626990779549, "grad_norm": 5.879626274108887, "learning_rate": 9.74696431029982e-06, "loss": 2.395, "step": 933 }, { "epoch": 1.5381391450125732, "grad_norm": 8.686258316040039, "learning_rate": 9.728894614087221e-06, "loss": 2.329, "step": 934 }, { "epoch": 1.539815590947192, "grad_norm": 11.73071575164795, "learning_rate": 9.710825803679831e-06, "loss": 2.1707, "step": 935 }, { "epoch": 1.5414920368818106, "grad_norm": 11.081551551818848, "learning_rate": 9.692757938115376e-06, "loss": 2.0183, "step": 936 }, { "epoch": 1.5431684828164292, "grad_norm": 4.939816951751709, "learning_rate": 9.674691076428499e-06, "loss": 2.0046, "step": 937 }, { "epoch": 1.5448449287510477, "grad_norm": 6.710269927978516, "learning_rate": 9.656625277650558e-06, "loss": 2.4054, "step": 938 }, { "epoch": 1.5465213746856663, "grad_norm": 8.460982322692871, "learning_rate": 9.638560600809448e-06, "loss": 2.3256, "step": 939 }, { "epoch": 1.548197820620285, "grad_norm": 7.40871524810791, "learning_rate": 9.62049710492939e-06, "loss": 2.4702, "step": 940 }, { "epoch": 1.5498742665549035, "grad_norm": 4.766067028045654, "learning_rate": 9.602434849030747e-06, "loss": 2.5357, "step": 941 }, { "epoch": 1.5515507124895223, "grad_norm": 7.146002292633057, "learning_rate": 9.584373892129829e-06, "loss": 2.2425, "step": 942 }, { "epoch": 1.5532271584241408, "grad_norm": 10.299982070922852, "learning_rate": 9.566314293238712e-06, "loss": 2.1741, "step": 943 }, { "epoch": 1.5549036043587594, "grad_norm": 6.528247356414795, "learning_rate": 9.548256111365025e-06, "loss": 2.5344, "step": 944 }, { "epoch": 1.5565800502933782, "grad_norm": 7.139346599578857, "learning_rate": 9.530199405511766e-06, "loss": 2.478, "step": 945 }, { "epoch": 1.5582564962279966, "grad_norm": 7.574531078338623, "learning_rate": 9.512144234677116e-06, "loss": 2.3816, "step": 946 }, { "epoch": 1.5599329421626154, "grad_norm": 6.39193058013916, "learning_rate": 9.494090657854241e-06, "loss": 2.4685, "step": 947 }, { "epoch": 1.5616093880972337, "grad_norm": 11.183760643005371, "learning_rate": 9.476038734031093e-06, "loss": 2.0295, "step": 948 }, { "epoch": 1.5632858340318525, "grad_norm": 6.316678524017334, "learning_rate": 9.457988522190227e-06, "loss": 2.2214, "step": 949 }, { "epoch": 1.564962279966471, "grad_norm": 6.389228820800781, "learning_rate": 9.439940081308602e-06, "loss": 2.4774, "step": 950 }, { "epoch": 1.5666387259010897, "grad_norm": 7.7148847579956055, "learning_rate": 9.421893470357397e-06, "loss": 2.2774, "step": 951 }, { "epoch": 1.5683151718357085, "grad_norm": 8.3518705368042, "learning_rate": 9.403848748301802e-06, "loss": 2.4449, "step": 952 }, { "epoch": 1.5699916177703268, "grad_norm": 22.124832153320312, "learning_rate": 9.385805974100843e-06, "loss": 2.3098, "step": 953 }, { "epoch": 1.5716680637049456, "grad_norm": 19.433311462402344, "learning_rate": 9.367765206707174e-06, "loss": 2.2241, "step": 954 }, { "epoch": 1.573344509639564, "grad_norm": 18.221141815185547, "learning_rate": 9.349726505066904e-06, "loss": 2.2718, "step": 955 }, { "epoch": 1.5750209555741828, "grad_norm": 8.636380195617676, "learning_rate": 9.331689928119382e-06, "loss": 2.0337, "step": 956 }, { "epoch": 1.5766974015088013, "grad_norm": 15.445240020751953, "learning_rate": 9.313655534797017e-06, "loss": 2.2257, "step": 957 }, { "epoch": 1.57837384744342, "grad_norm": 10.828184127807617, "learning_rate": 9.295623384025081e-06, "loss": 2.0014, "step": 958 }, { "epoch": 1.5800502933780387, "grad_norm": 5.609714984893799, "learning_rate": 9.27759353472153e-06, "loss": 1.9006, "step": 959 }, { "epoch": 1.581726739312657, "grad_norm": 5.516435623168945, "learning_rate": 9.259566045796787e-06, "loss": 2.2504, "step": 960 }, { "epoch": 1.5834031852472759, "grad_norm": 9.04311466217041, "learning_rate": 9.24154097615357e-06, "loss": 2.2467, "step": 961 }, { "epoch": 1.5850796311818944, "grad_norm": 6.324374198913574, "learning_rate": 9.223518384686688e-06, "loss": 2.3547, "step": 962 }, { "epoch": 1.586756077116513, "grad_norm": 6.383605003356934, "learning_rate": 9.205498330282857e-06, "loss": 2.3789, "step": 963 }, { "epoch": 1.5884325230511316, "grad_norm": 5.347365379333496, "learning_rate": 9.187480871820502e-06, "loss": 2.2165, "step": 964 }, { "epoch": 1.5901089689857502, "grad_norm": 7.827714920043945, "learning_rate": 9.169466068169565e-06, "loss": 2.2948, "step": 965 }, { "epoch": 1.591785414920369, "grad_norm": 14.313446044921875, "learning_rate": 9.151453978191312e-06, "loss": 2.2106, "step": 966 }, { "epoch": 1.5934618608549873, "grad_norm": 6.769836902618408, "learning_rate": 9.133444660738147e-06, "loss": 2.6148, "step": 967 }, { "epoch": 1.595138306789606, "grad_norm": 5.148403644561768, "learning_rate": 9.115438174653415e-06, "loss": 2.4168, "step": 968 }, { "epoch": 1.5968147527242247, "grad_norm": 7.450311183929443, "learning_rate": 9.097434578771204e-06, "loss": 2.0438, "step": 969 }, { "epoch": 1.5984911986588433, "grad_norm": 5.682309150695801, "learning_rate": 9.07943393191616e-06, "loss": 2.6788, "step": 970 }, { "epoch": 1.6001676445934618, "grad_norm": 15.186368942260742, "learning_rate": 9.061436292903303e-06, "loss": 2.2623, "step": 971 }, { "epoch": 1.6018440905280804, "grad_norm": 15.18022346496582, "learning_rate": 9.043441720537814e-06, "loss": 2.3517, "step": 972 }, { "epoch": 1.6035205364626992, "grad_norm": 4.450839042663574, "learning_rate": 9.025450273614858e-06, "loss": 2.4281, "step": 973 }, { "epoch": 1.6051969823973176, "grad_norm": 5.214892387390137, "learning_rate": 9.007462010919387e-06, "loss": 2.262, "step": 974 }, { "epoch": 1.6068734283319364, "grad_norm": 7.075205326080322, "learning_rate": 8.989476991225948e-06, "loss": 2.249, "step": 975 }, { "epoch": 1.608549874266555, "grad_norm": 7.326975345611572, "learning_rate": 8.9714952732985e-06, "loss": 2.0278, "step": 976 }, { "epoch": 1.6102263202011735, "grad_norm": 4.590035915374756, "learning_rate": 8.953516915890202e-06, "loss": 2.3434, "step": 977 }, { "epoch": 1.611902766135792, "grad_norm": 8.010977745056152, "learning_rate": 8.935541977743243e-06, "loss": 2.2695, "step": 978 }, { "epoch": 1.6135792120704107, "grad_norm": 13.463695526123047, "learning_rate": 8.917570517588629e-06, "loss": 2.1467, "step": 979 }, { "epoch": 1.6152556580050295, "grad_norm": 5.092221260070801, "learning_rate": 8.899602594146013e-06, "loss": 2.0806, "step": 980 }, { "epoch": 1.6169321039396478, "grad_norm": 16.650634765625, "learning_rate": 8.88163826612349e-06, "loss": 2.3448, "step": 981 }, { "epoch": 1.6186085498742666, "grad_norm": 8.627533912658691, "learning_rate": 8.863677592217402e-06, "loss": 2.0594, "step": 982 }, { "epoch": 1.6202849958088852, "grad_norm": 10.417862892150879, "learning_rate": 8.845720631112152e-06, "loss": 2.0984, "step": 983 }, { "epoch": 1.6219614417435038, "grad_norm": 5.652635097503662, "learning_rate": 8.82776744148002e-06, "loss": 2.283, "step": 984 }, { "epoch": 1.6236378876781223, "grad_norm": 14.651529312133789, "learning_rate": 8.809818081980954e-06, "loss": 2.4419, "step": 985 }, { "epoch": 1.625314333612741, "grad_norm": 43.85502243041992, "learning_rate": 8.791872611262393e-06, "loss": 2.2221, "step": 986 }, { "epoch": 1.6269907795473597, "grad_norm": 4.894194602966309, "learning_rate": 8.773931087959066e-06, "loss": 2.1858, "step": 987 }, { "epoch": 1.628667225481978, "grad_norm": 8.100835800170898, "learning_rate": 8.755993570692808e-06, "loss": 2.2965, "step": 988 }, { "epoch": 1.6303436714165969, "grad_norm": 20.580204010009766, "learning_rate": 8.738060118072365e-06, "loss": 2.0304, "step": 989 }, { "epoch": 1.6320201173512154, "grad_norm": 8.8585844039917, "learning_rate": 8.720130788693197e-06, "loss": 2.5962, "step": 990 }, { "epoch": 1.633696563285834, "grad_norm": 6.588524341583252, "learning_rate": 8.702205641137293e-06, "loss": 2.5359, "step": 991 }, { "epoch": 1.6353730092204526, "grad_norm": 6.456886291503906, "learning_rate": 8.684284733972989e-06, "loss": 2.0713, "step": 992 }, { "epoch": 1.6370494551550712, "grad_norm": 5.424596309661865, "learning_rate": 8.66636812575475e-06, "loss": 2.0886, "step": 993 }, { "epoch": 1.63872590108969, "grad_norm": 6.081263542175293, "learning_rate": 8.648455875023006e-06, "loss": 2.0848, "step": 994 }, { "epoch": 1.6404023470243083, "grad_norm": 8.830599784851074, "learning_rate": 8.63054804030394e-06, "loss": 2.1685, "step": 995 }, { "epoch": 1.6420787929589271, "grad_norm": 12.61053466796875, "learning_rate": 8.61264468010932e-06, "loss": 2.2989, "step": 996 }, { "epoch": 1.6437552388935457, "grad_norm": 10.2522554397583, "learning_rate": 8.59474585293628e-06, "loss": 2.0865, "step": 997 }, { "epoch": 1.6454316848281643, "grad_norm": 8.868592262268066, "learning_rate": 8.576851617267151e-06, "loss": 2.4715, "step": 998 }, { "epoch": 1.647108130762783, "grad_norm": 8.336810111999512, "learning_rate": 8.558962031569257e-06, "loss": 2.2661, "step": 999 }, { "epoch": 1.6487845766974014, "grad_norm": 9.012283325195312, "learning_rate": 8.541077154294735e-06, "loss": 1.8368, "step": 1000 }, { "epoch": 1.6504610226320202, "grad_norm": 10.004655838012695, "learning_rate": 8.523197043880334e-06, "loss": 2.0076, "step": 1001 }, { "epoch": 1.6521374685666386, "grad_norm": 6.365155220031738, "learning_rate": 8.505321758747224e-06, "loss": 2.0785, "step": 1002 }, { "epoch": 1.6538139145012574, "grad_norm": 7.260687828063965, "learning_rate": 8.487451357300814e-06, "loss": 2.3836, "step": 1003 }, { "epoch": 1.655490360435876, "grad_norm": 9.959872245788574, "learning_rate": 8.469585897930557e-06, "loss": 2.2012, "step": 1004 }, { "epoch": 1.6571668063704945, "grad_norm": 8.26469898223877, "learning_rate": 8.451725439009757e-06, "loss": 2.2201, "step": 1005 }, { "epoch": 1.6588432523051133, "grad_norm": 10.560108184814453, "learning_rate": 8.433870038895377e-06, "loss": 2.1521, "step": 1006 }, { "epoch": 1.6605196982397317, "grad_norm": 10.366450309753418, "learning_rate": 8.416019755927851e-06, "loss": 2.1748, "step": 1007 }, { "epoch": 1.6621961441743505, "grad_norm": 6.520101070404053, "learning_rate": 8.398174648430898e-06, "loss": 2.2714, "step": 1008 }, { "epoch": 1.663872590108969, "grad_norm": 6.552917957305908, "learning_rate": 8.380334774711326e-06, "loss": 2.4878, "step": 1009 }, { "epoch": 1.6655490360435876, "grad_norm": 6.475761890411377, "learning_rate": 8.362500193058836e-06, "loss": 2.1404, "step": 1010 }, { "epoch": 1.6672254819782062, "grad_norm": 4.359820365905762, "learning_rate": 8.34467096174584e-06, "loss": 2.093, "step": 1011 }, { "epoch": 1.6689019279128248, "grad_norm": 6.3148417472839355, "learning_rate": 8.326847139027278e-06, "loss": 2.2588, "step": 1012 }, { "epoch": 1.6705783738474436, "grad_norm": 10.42726993560791, "learning_rate": 8.309028783140404e-06, "loss": 2.0811, "step": 1013 }, { "epoch": 1.672254819782062, "grad_norm": 4.301555633544922, "learning_rate": 8.291215952304622e-06, "loss": 1.9573, "step": 1014 }, { "epoch": 1.6739312657166807, "grad_norm": 15.535539627075195, "learning_rate": 8.27340870472127e-06, "loss": 2.316, "step": 1015 }, { "epoch": 1.6756077116512993, "grad_norm": 11.388089179992676, "learning_rate": 8.255607098573454e-06, "loss": 2.1923, "step": 1016 }, { "epoch": 1.6772841575859179, "grad_norm": 6.459679126739502, "learning_rate": 8.23781119202585e-06, "loss": 2.2816, "step": 1017 }, { "epoch": 1.6789606035205364, "grad_norm": 6.557635307312012, "learning_rate": 8.2200210432245e-06, "loss": 2.5434, "step": 1018 }, { "epoch": 1.680637049455155, "grad_norm": 14.87962532043457, "learning_rate": 8.202236710296636e-06, "loss": 1.9257, "step": 1019 }, { "epoch": 1.6823134953897738, "grad_norm": 10.075231552124023, "learning_rate": 8.1844582513505e-06, "loss": 1.8412, "step": 1020 }, { "epoch": 1.6839899413243922, "grad_norm": 13.415687561035156, "learning_rate": 8.166685724475123e-06, "loss": 2.21, "step": 1021 }, { "epoch": 1.685666387259011, "grad_norm": 5.891720771789551, "learning_rate": 8.148919187740165e-06, "loss": 2.3421, "step": 1022 }, { "epoch": 1.6873428331936295, "grad_norm": 19.35289192199707, "learning_rate": 8.13115869919571e-06, "loss": 2.2966, "step": 1023 }, { "epoch": 1.6890192791282481, "grad_norm": 6.343852996826172, "learning_rate": 8.113404316872083e-06, "loss": 2.0441, "step": 1024 }, { "epoch": 1.6906957250628667, "grad_norm": 15.842345237731934, "learning_rate": 8.095656098779657e-06, "loss": 2.1886, "step": 1025 }, { "epoch": 1.6923721709974853, "grad_norm": 5.255537986755371, "learning_rate": 8.077914102908661e-06, "loss": 2.3907, "step": 1026 }, { "epoch": 1.694048616932104, "grad_norm": 19.217512130737305, "learning_rate": 8.060178387228997e-06, "loss": 1.9004, "step": 1027 }, { "epoch": 1.6957250628667224, "grad_norm": 5.641807556152344, "learning_rate": 8.042449009690044e-06, "loss": 2.4428, "step": 1028 }, { "epoch": 1.6974015088013412, "grad_norm": 9.170493125915527, "learning_rate": 8.024726028220474e-06, "loss": 2.1193, "step": 1029 }, { "epoch": 1.6990779547359598, "grad_norm": 6.460337162017822, "learning_rate": 8.007009500728065e-06, "loss": 2.2255, "step": 1030 }, { "epoch": 1.7007544006705784, "grad_norm": 10.930974960327148, "learning_rate": 7.989299485099498e-06, "loss": 2.1062, "step": 1031 }, { "epoch": 1.702430846605197, "grad_norm": 9.958852767944336, "learning_rate": 7.97159603920018e-06, "loss": 2.5083, "step": 1032 }, { "epoch": 1.7041072925398155, "grad_norm": 9.895286560058594, "learning_rate": 7.95389922087406e-06, "loss": 2.1561, "step": 1033 }, { "epoch": 1.7057837384744343, "grad_norm": 10.789249420166016, "learning_rate": 7.936209087943424e-06, "loss": 2.1883, "step": 1034 }, { "epoch": 1.7074601844090527, "grad_norm": 8.60191822052002, "learning_rate": 7.918525698208719e-06, "loss": 2.1805, "step": 1035 }, { "epoch": 1.7091366303436715, "grad_norm": 7.604182720184326, "learning_rate": 7.90084910944835e-06, "loss": 2.1348, "step": 1036 }, { "epoch": 1.71081307627829, "grad_norm": 8.246973991394043, "learning_rate": 7.883179379418516e-06, "loss": 2.3316, "step": 1037 }, { "epoch": 1.7124895222129086, "grad_norm": 9.872467994689941, "learning_rate": 7.865516565852993e-06, "loss": 2.4325, "step": 1038 }, { "epoch": 1.7141659681475272, "grad_norm": 9.95871353149414, "learning_rate": 7.847860726462964e-06, "loss": 1.9019, "step": 1039 }, { "epoch": 1.7158424140821458, "grad_norm": 5.651617050170898, "learning_rate": 7.83021191893682e-06, "loss": 2.3187, "step": 1040 }, { "epoch": 1.7175188600167646, "grad_norm": 5.1338605880737305, "learning_rate": 7.812570200939985e-06, "loss": 2.5131, "step": 1041 }, { "epoch": 1.719195305951383, "grad_norm": 5.415092468261719, "learning_rate": 7.794935630114712e-06, "loss": 2.263, "step": 1042 }, { "epoch": 1.7208717518860017, "grad_norm": 7.658169269561768, "learning_rate": 7.777308264079902e-06, "loss": 2.4664, "step": 1043 }, { "epoch": 1.7208717518860017, "eval_loss": 2.2645041942596436, "eval_runtime": 170.5639, "eval_samples_per_second": 3.078, "eval_steps_per_second": 1.542, "step": 1043 }, { "epoch": 1.7225481978206203, "grad_norm": 6.024785995483398, "learning_rate": 7.759688160430913e-06, "loss": 2.1216, "step": 1044 }, { "epoch": 1.7242246437552389, "grad_norm": 7.298198223114014, "learning_rate": 7.742075376739386e-06, "loss": 2.3594, "step": 1045 }, { "epoch": 1.7259010896898574, "grad_norm": 4.409038543701172, "learning_rate": 7.724469970553028e-06, "loss": 2.3114, "step": 1046 }, { "epoch": 1.727577535624476, "grad_norm": 26.98436737060547, "learning_rate": 7.706871999395451e-06, "loss": 2.6305, "step": 1047 }, { "epoch": 1.7292539815590948, "grad_norm": 21.230642318725586, "learning_rate": 7.68928152076597e-06, "loss": 1.6229, "step": 1048 }, { "epoch": 1.7309304274937132, "grad_norm": 6.471439838409424, "learning_rate": 7.671698592139426e-06, "loss": 2.2796, "step": 1049 }, { "epoch": 1.732606873428332, "grad_norm": 12.904387474060059, "learning_rate": 7.654123270965984e-06, "loss": 2.2036, "step": 1050 }, { "epoch": 1.7342833193629505, "grad_norm": 9.128398895263672, "learning_rate": 7.636555614670953e-06, "loss": 2.19, "step": 1051 }, { "epoch": 1.7359597652975691, "grad_norm": 7.86855936050415, "learning_rate": 7.618995680654599e-06, "loss": 2.1663, "step": 1052 }, { "epoch": 1.737636211232188, "grad_norm": 8.717936515808105, "learning_rate": 7.601443526291958e-06, "loss": 2.1837, "step": 1053 }, { "epoch": 1.7393126571668063, "grad_norm": 8.208657264709473, "learning_rate": 7.583899208932648e-06, "loss": 2.3838, "step": 1054 }, { "epoch": 1.740989103101425, "grad_norm": 9.249015808105469, "learning_rate": 7.566362785900675e-06, "loss": 1.977, "step": 1055 }, { "epoch": 1.7426655490360434, "grad_norm": 6.25834846496582, "learning_rate": 7.548834314494254e-06, "loss": 1.9191, "step": 1056 }, { "epoch": 1.7443419949706622, "grad_norm": 22.7048282623291, "learning_rate": 7.531313851985623e-06, "loss": 2.3832, "step": 1057 }, { "epoch": 1.7460184409052808, "grad_norm": 6.0677924156188965, "learning_rate": 7.513801455620847e-06, "loss": 2.1563, "step": 1058 }, { "epoch": 1.7476948868398994, "grad_norm": 9.725539207458496, "learning_rate": 7.496297182619637e-06, "loss": 2.4701, "step": 1059 }, { "epoch": 1.7493713327745182, "grad_norm": 9.37392807006836, "learning_rate": 7.478801090175159e-06, "loss": 2.5401, "step": 1060 }, { "epoch": 1.7510477787091365, "grad_norm": 5.6621904373168945, "learning_rate": 7.461313235453858e-06, "loss": 2.2832, "step": 1061 }, { "epoch": 1.7527242246437553, "grad_norm": 14.174243927001953, "learning_rate": 7.443833675595254e-06, "loss": 2.0525, "step": 1062 }, { "epoch": 1.754400670578374, "grad_norm": 8.08659839630127, "learning_rate": 7.426362467711773e-06, "loss": 2.3987, "step": 1063 }, { "epoch": 1.7560771165129925, "grad_norm": 5.601348876953125, "learning_rate": 7.408899668888541e-06, "loss": 2.1891, "step": 1064 }, { "epoch": 1.757753562447611, "grad_norm": 5.893629550933838, "learning_rate": 7.391445336183223e-06, "loss": 2.1811, "step": 1065 }, { "epoch": 1.7594300083822296, "grad_norm": 33.441280364990234, "learning_rate": 7.37399952662581e-06, "loss": 2.0866, "step": 1066 }, { "epoch": 1.7611064543168484, "grad_norm": 9.334148406982422, "learning_rate": 7.356562297218448e-06, "loss": 2.2446, "step": 1067 }, { "epoch": 1.7627829002514668, "grad_norm": 7.684276103973389, "learning_rate": 7.339133704935246e-06, "loss": 2.0276, "step": 1068 }, { "epoch": 1.7644593461860856, "grad_norm": 48.19441223144531, "learning_rate": 7.321713806722102e-06, "loss": 2.2558, "step": 1069 }, { "epoch": 1.7661357921207042, "grad_norm": 26.02524757385254, "learning_rate": 7.304302659496499e-06, "loss": 2.1062, "step": 1070 }, { "epoch": 1.7678122380553227, "grad_norm": 11.773177146911621, "learning_rate": 7.2869003201473235e-06, "loss": 2.3763, "step": 1071 }, { "epoch": 1.7694886839899413, "grad_norm": 5.439338684082031, "learning_rate": 7.2695068455346885e-06, "loss": 2.3191, "step": 1072 }, { "epoch": 1.7711651299245599, "grad_norm": 5.517753601074219, "learning_rate": 7.252122292489747e-06, "loss": 2.0986, "step": 1073 }, { "epoch": 1.7728415758591787, "grad_norm": 9.649155616760254, "learning_rate": 7.234746717814496e-06, "loss": 2.2701, "step": 1074 }, { "epoch": 1.774518021793797, "grad_norm": 7.149664878845215, "learning_rate": 7.217380178281596e-06, "loss": 2.3595, "step": 1075 }, { "epoch": 1.7761944677284158, "grad_norm": 7.706420421600342, "learning_rate": 7.200022730634188e-06, "loss": 2.049, "step": 1076 }, { "epoch": 1.7778709136630344, "grad_norm": 5.772740840911865, "learning_rate": 7.182674431585703e-06, "loss": 1.8452, "step": 1077 }, { "epoch": 1.779547359597653, "grad_norm": 4.753730297088623, "learning_rate": 7.165335337819692e-06, "loss": 2.3813, "step": 1078 }, { "epoch": 1.7812238055322716, "grad_norm": 7.576776027679443, "learning_rate": 7.148005505989612e-06, "loss": 2.4679, "step": 1079 }, { "epoch": 1.7829002514668901, "grad_norm": 5.049241542816162, "learning_rate": 7.130684992718672e-06, "loss": 2.0306, "step": 1080 }, { "epoch": 1.784576697401509, "grad_norm": 11.073182106018066, "learning_rate": 7.113373854599621e-06, "loss": 2.1904, "step": 1081 }, { "epoch": 1.7862531433361273, "grad_norm": 7.758405685424805, "learning_rate": 7.096072148194588e-06, "loss": 2.2661, "step": 1082 }, { "epoch": 1.787929589270746, "grad_norm": 7.006621837615967, "learning_rate": 7.078779930034877e-06, "loss": 2.224, "step": 1083 }, { "epoch": 1.7896060352053647, "grad_norm": 7.297264575958252, "learning_rate": 7.061497256620793e-06, "loss": 2.2321, "step": 1084 }, { "epoch": 1.7912824811399832, "grad_norm": 15.760753631591797, "learning_rate": 7.044224184421452e-06, "loss": 2.2102, "step": 1085 }, { "epoch": 1.7929589270746018, "grad_norm": 11.469801902770996, "learning_rate": 7.026960769874605e-06, "loss": 2.1495, "step": 1086 }, { "epoch": 1.7946353730092204, "grad_norm": 6.294549942016602, "learning_rate": 7.009707069386442e-06, "loss": 2.3341, "step": 1087 }, { "epoch": 1.7963118189438392, "grad_norm": 6.376000881195068, "learning_rate": 6.992463139331418e-06, "loss": 2.1656, "step": 1088 }, { "epoch": 1.7979882648784575, "grad_norm": 4.4377946853637695, "learning_rate": 6.975229036052056e-06, "loss": 2.1987, "step": 1089 }, { "epoch": 1.7996647108130763, "grad_norm": 4.860551357269287, "learning_rate": 6.958004815858783e-06, "loss": 2.2562, "step": 1090 }, { "epoch": 1.801341156747695, "grad_norm": 4.674697399139404, "learning_rate": 6.940790535029726e-06, "loss": 2.3898, "step": 1091 }, { "epoch": 1.8030176026823135, "grad_norm": 6.539408206939697, "learning_rate": 6.923586249810539e-06, "loss": 2.2545, "step": 1092 }, { "epoch": 1.804694048616932, "grad_norm": 4.220900535583496, "learning_rate": 6.906392016414212e-06, "loss": 2.2343, "step": 1093 }, { "epoch": 1.8063704945515506, "grad_norm": 5.962541580200195, "learning_rate": 6.889207891020901e-06, "loss": 2.3388, "step": 1094 }, { "epoch": 1.8080469404861694, "grad_norm": 5.308079719543457, "learning_rate": 6.872033929777731e-06, "loss": 1.9542, "step": 1095 }, { "epoch": 1.8097233864207878, "grad_norm": 5.22299861907959, "learning_rate": 6.85487018879861e-06, "loss": 2.1694, "step": 1096 }, { "epoch": 1.8113998323554066, "grad_norm": 15.883075714111328, "learning_rate": 6.837716724164061e-06, "loss": 2.5534, "step": 1097 }, { "epoch": 1.8130762782900252, "grad_norm": 6.803228855133057, "learning_rate": 6.820573591921029e-06, "loss": 2.4624, "step": 1098 }, { "epoch": 1.8147527242246437, "grad_norm": 6.329739570617676, "learning_rate": 6.803440848082698e-06, "loss": 2.3427, "step": 1099 }, { "epoch": 1.8164291701592625, "grad_norm": 40.00912857055664, "learning_rate": 6.786318548628307e-06, "loss": 2.438, "step": 1100 }, { "epoch": 1.8181056160938809, "grad_norm": 4.516688823699951, "learning_rate": 6.76920674950297e-06, "loss": 2.0016, "step": 1101 }, { "epoch": 1.8197820620284997, "grad_norm": 10.062124252319336, "learning_rate": 6.752105506617497e-06, "loss": 2.3252, "step": 1102 }, { "epoch": 1.821458507963118, "grad_norm": 44.352874755859375, "learning_rate": 6.735014875848201e-06, "loss": 2.0923, "step": 1103 }, { "epoch": 1.8231349538977368, "grad_norm": 7.247190952301025, "learning_rate": 6.7179349130367235e-06, "loss": 2.1868, "step": 1104 }, { "epoch": 1.8248113998323554, "grad_norm": 12.149118423461914, "learning_rate": 6.700865673989847e-06, "loss": 2.3446, "step": 1105 }, { "epoch": 1.826487845766974, "grad_norm": 4.366675853729248, "learning_rate": 6.683807214479323e-06, "loss": 2.0674, "step": 1106 }, { "epoch": 1.8281642917015928, "grad_norm": 4.6648149490356445, "learning_rate": 6.666759590241673e-06, "loss": 2.0906, "step": 1107 }, { "epoch": 1.8298407376362111, "grad_norm": 12.673425674438477, "learning_rate": 6.649722856978022e-06, "loss": 2.3083, "step": 1108 }, { "epoch": 1.83151718357083, "grad_norm": 6.555782794952393, "learning_rate": 6.632697070353906e-06, "loss": 2.1852, "step": 1109 }, { "epoch": 1.8331936295054483, "grad_norm": 11.261723518371582, "learning_rate": 6.615682285999096e-06, "loss": 1.7571, "step": 1110 }, { "epoch": 1.834870075440067, "grad_norm": 5.449329376220703, "learning_rate": 6.598678559507414e-06, "loss": 2.1372, "step": 1111 }, { "epoch": 1.8365465213746857, "grad_norm": 9.50792407989502, "learning_rate": 6.581685946436551e-06, "loss": 2.2881, "step": 1112 }, { "epoch": 1.8382229673093042, "grad_norm": 9.747804641723633, "learning_rate": 6.564704502307886e-06, "loss": 2.1115, "step": 1113 }, { "epoch": 1.839899413243923, "grad_norm": 10.825417518615723, "learning_rate": 6.547734282606311e-06, "loss": 2.2342, "step": 1114 }, { "epoch": 1.8415758591785414, "grad_norm": 7.077234268188477, "learning_rate": 6.530775342780032e-06, "loss": 2.2541, "step": 1115 }, { "epoch": 1.8432523051131602, "grad_norm": 9.605854988098145, "learning_rate": 6.513827738240408e-06, "loss": 1.9631, "step": 1116 }, { "epoch": 1.8449287510477788, "grad_norm": 4.980114936828613, "learning_rate": 6.496891524361757e-06, "loss": 2.1376, "step": 1117 }, { "epoch": 1.8466051969823973, "grad_norm": 11.652900695800781, "learning_rate": 6.479966756481187e-06, "loss": 2.2842, "step": 1118 }, { "epoch": 1.848281642917016, "grad_norm": 13.679494857788086, "learning_rate": 6.463053489898395e-06, "loss": 2.1547, "step": 1119 }, { "epoch": 1.8499580888516345, "grad_norm": 6.347150802612305, "learning_rate": 6.446151779875507e-06, "loss": 2.2538, "step": 1120 }, { "epoch": 1.8516345347862533, "grad_norm": 7.642538070678711, "learning_rate": 6.429261681636889e-06, "loss": 2.3493, "step": 1121 }, { "epoch": 1.8533109807208716, "grad_norm": 4.232964038848877, "learning_rate": 6.4123832503689665e-06, "loss": 2.0883, "step": 1122 }, { "epoch": 1.8549874266554904, "grad_norm": 15.709921836853027, "learning_rate": 6.395516541220044e-06, "loss": 2.2556, "step": 1123 }, { "epoch": 1.856663872590109, "grad_norm": 3.9246747493743896, "learning_rate": 6.378661609300122e-06, "loss": 2.5295, "step": 1124 }, { "epoch": 1.8583403185247276, "grad_norm": 3.4677414894104004, "learning_rate": 6.361818509680725e-06, "loss": 1.8925, "step": 1125 }, { "epoch": 1.8600167644593462, "grad_norm": 6.76437520980835, "learning_rate": 6.344987297394713e-06, "loss": 2.0996, "step": 1126 }, { "epoch": 1.8616932103939647, "grad_norm": 5.159142971038818, "learning_rate": 6.3281680274361135e-06, "loss": 2.3012, "step": 1127 }, { "epoch": 1.8633696563285835, "grad_norm": 7.648279666900635, "learning_rate": 6.311360754759923e-06, "loss": 2.2632, "step": 1128 }, { "epoch": 1.8650461022632019, "grad_norm": 8.155828475952148, "learning_rate": 6.294565534281945e-06, "loss": 2.2258, "step": 1129 }, { "epoch": 1.8667225481978207, "grad_norm": 6.460453987121582, "learning_rate": 6.2777824208786e-06, "loss": 2.2516, "step": 1130 }, { "epoch": 1.8683989941324393, "grad_norm": 9.944646835327148, "learning_rate": 6.261011469386755e-06, "loss": 2.2504, "step": 1131 }, { "epoch": 1.8700754400670578, "grad_norm": 6.516200065612793, "learning_rate": 6.244252734603538e-06, "loss": 2.3209, "step": 1132 }, { "epoch": 1.8717518860016764, "grad_norm": 6.515983581542969, "learning_rate": 6.2275062712861545e-06, "loss": 2.0608, "step": 1133 }, { "epoch": 1.873428331936295, "grad_norm": 26.509382247924805, "learning_rate": 6.210772134151719e-06, "loss": 2.0832, "step": 1134 }, { "epoch": 1.8751047778709138, "grad_norm": 5.853835105895996, "learning_rate": 6.194050377877074e-06, "loss": 2.1142, "step": 1135 }, { "epoch": 1.8767812238055321, "grad_norm": 7.032966136932373, "learning_rate": 6.177341057098607e-06, "loss": 2.4475, "step": 1136 }, { "epoch": 1.878457669740151, "grad_norm": 8.253186225891113, "learning_rate": 6.160644226412072e-06, "loss": 2.0488, "step": 1137 }, { "epoch": 1.8801341156747695, "grad_norm": 7.693860054016113, "learning_rate": 6.143959940372412e-06, "loss": 2.3109, "step": 1138 }, { "epoch": 1.881810561609388, "grad_norm": 7.1489973068237305, "learning_rate": 6.127288253493591e-06, "loss": 2.0233, "step": 1139 }, { "epoch": 1.8834870075440067, "grad_norm": 5.263674736022949, "learning_rate": 6.110629220248394e-06, "loss": 2.2143, "step": 1140 }, { "epoch": 1.8851634534786252, "grad_norm": 6.076548099517822, "learning_rate": 6.09398289506827e-06, "loss": 2.2446, "step": 1141 }, { "epoch": 1.886839899413244, "grad_norm": 5.693360328674316, "learning_rate": 6.077349332343141e-06, "loss": 2.3735, "step": 1142 }, { "epoch": 1.8885163453478624, "grad_norm": 6.941556930541992, "learning_rate": 6.060728586421236e-06, "loss": 2.171, "step": 1143 }, { "epoch": 1.8901927912824812, "grad_norm": 15.569869041442871, "learning_rate": 6.0441207116089e-06, "loss": 2.2099, "step": 1144 }, { "epoch": 1.8918692372170998, "grad_norm": 4.877901554107666, "learning_rate": 6.0275257621704285e-06, "loss": 2.1303, "step": 1145 }, { "epoch": 1.8935456831517183, "grad_norm": 8.144440650939941, "learning_rate": 6.010943792327875e-06, "loss": 2.616, "step": 1146 }, { "epoch": 1.895222129086337, "grad_norm": 24.216936111450195, "learning_rate": 5.994374856260898e-06, "loss": 2.3298, "step": 1147 }, { "epoch": 1.8968985750209555, "grad_norm": 5.97416353225708, "learning_rate": 5.97781900810656e-06, "loss": 2.1778, "step": 1148 }, { "epoch": 1.8985750209555743, "grad_norm": 15.024431228637695, "learning_rate": 5.961276301959161e-06, "loss": 2.0387, "step": 1149 }, { "epoch": 1.9002514668901926, "grad_norm": 5.835839748382568, "learning_rate": 5.944746791870062e-06, "loss": 2.4754, "step": 1150 }, { "epoch": 1.9019279128248114, "grad_norm": 38.862701416015625, "learning_rate": 5.92823053184751e-06, "loss": 2.0012, "step": 1151 }, { "epoch": 1.90360435875943, "grad_norm": 5.240647315979004, "learning_rate": 5.911727575856457e-06, "loss": 2.3046, "step": 1152 }, { "epoch": 1.9052808046940486, "grad_norm": 28.80046272277832, "learning_rate": 5.8952379778183845e-06, "loss": 2.2732, "step": 1153 }, { "epoch": 1.9069572506286674, "grad_norm": 7.8639326095581055, "learning_rate": 5.878761791611129e-06, "loss": 2.3288, "step": 1154 }, { "epoch": 1.9086336965632857, "grad_norm": 12.526535034179688, "learning_rate": 5.8622990710687065e-06, "loss": 2.0136, "step": 1155 }, { "epoch": 1.9103101424979045, "grad_norm": 23.384218215942383, "learning_rate": 5.845849869981137e-06, "loss": 2.121, "step": 1156 }, { "epoch": 1.911986588432523, "grad_norm": 4.202671527862549, "learning_rate": 5.829414242094262e-06, "loss": 2.1663, "step": 1157 }, { "epoch": 1.9136630343671417, "grad_norm": 8.441536903381348, "learning_rate": 5.812992241109578e-06, "loss": 2.0716, "step": 1158 }, { "epoch": 1.9153394803017603, "grad_norm": 7.15108585357666, "learning_rate": 5.796583920684056e-06, "loss": 2.1374, "step": 1159 }, { "epoch": 1.9170159262363788, "grad_norm": 5.923423767089844, "learning_rate": 5.780189334429973e-06, "loss": 2.4179, "step": 1160 }, { "epoch": 1.9186923721709976, "grad_norm": 6.724905014038086, "learning_rate": 5.7638085359147235e-06, "loss": 2.1162, "step": 1161 }, { "epoch": 1.920368818105616, "grad_norm": 11.778375625610352, "learning_rate": 5.747441578660644e-06, "loss": 2.1733, "step": 1162 }, { "epoch": 1.9220452640402348, "grad_norm": 8.26591968536377, "learning_rate": 5.731088516144875e-06, "loss": 1.7997, "step": 1163 }, { "epoch": 1.9237217099748531, "grad_norm": 8.323285102844238, "learning_rate": 5.71474940179913e-06, "loss": 2.4237, "step": 1164 }, { "epoch": 1.925398155909472, "grad_norm": 4.375385284423828, "learning_rate": 5.698424289009556e-06, "loss": 2.1891, "step": 1165 }, { "epoch": 1.9270746018440905, "grad_norm": 7.225880146026611, "learning_rate": 5.682113231116557e-06, "loss": 2.2102, "step": 1166 }, { "epoch": 1.928751047778709, "grad_norm": 4.167888641357422, "learning_rate": 5.66581628141461e-06, "loss": 2.5383, "step": 1167 }, { "epoch": 1.930427493713328, "grad_norm": 8.296483993530273, "learning_rate": 5.649533493152101e-06, "loss": 2.3281, "step": 1168 }, { "epoch": 1.9321039396479462, "grad_norm": 16.285106658935547, "learning_rate": 5.6332649195311365e-06, "loss": 2.4103, "step": 1169 }, { "epoch": 1.933780385582565, "grad_norm": 15.525874137878418, "learning_rate": 5.6170106137073725e-06, "loss": 2.0458, "step": 1170 }, { "epoch": 1.9354568315171836, "grad_norm": 16.159217834472656, "learning_rate": 5.600770628789872e-06, "loss": 2.2247, "step": 1171 }, { "epoch": 1.9371332774518022, "grad_norm": 10.77514362335205, "learning_rate": 5.584545017840886e-06, "loss": 2.427, "step": 1172 }, { "epoch": 1.9388097233864208, "grad_norm": 8.69184398651123, "learning_rate": 5.568333833875696e-06, "loss": 2.3373, "step": 1173 }, { "epoch": 1.9404861693210393, "grad_norm": 5.5508036613464355, "learning_rate": 5.552137129862458e-06, "loss": 2.1463, "step": 1174 }, { "epoch": 1.9421626152556581, "grad_norm": 8.996875762939453, "learning_rate": 5.535954958722016e-06, "loss": 2.2568, "step": 1175 }, { "epoch": 1.9438390611902765, "grad_norm": 5.983624458312988, "learning_rate": 5.519787373327725e-06, "loss": 2.1321, "step": 1176 }, { "epoch": 1.9455155071248953, "grad_norm": 12.689400672912598, "learning_rate": 5.5036344265052825e-06, "loss": 2.5095, "step": 1177 }, { "epoch": 1.9471919530595139, "grad_norm": 6.534908294677734, "learning_rate": 5.48749617103255e-06, "loss": 2.2237, "step": 1178 }, { "epoch": 1.9488683989941324, "grad_norm": 6.437363624572754, "learning_rate": 5.4713726596394005e-06, "loss": 2.0786, "step": 1179 }, { "epoch": 1.950544844928751, "grad_norm": 6.881913661956787, "learning_rate": 5.4552639450075226e-06, "loss": 2.4198, "step": 1180 }, { "epoch": 1.9522212908633696, "grad_norm": 6.208714485168457, "learning_rate": 5.439170079770269e-06, "loss": 1.8166, "step": 1181 }, { "epoch": 1.9538977367979884, "grad_norm": 21.176307678222656, "learning_rate": 5.423091116512458e-06, "loss": 2.3336, "step": 1182 }, { "epoch": 1.9555741827326067, "grad_norm": 5.8681960105896, "learning_rate": 5.40702710777022e-06, "loss": 2.2693, "step": 1183 }, { "epoch": 1.9572506286672255, "grad_norm": 8.790966987609863, "learning_rate": 5.390978106030843e-06, "loss": 2.1443, "step": 1184 }, { "epoch": 1.9589270746018441, "grad_norm": 8.432100296020508, "learning_rate": 5.374944163732561e-06, "loss": 2.0352, "step": 1185 }, { "epoch": 1.9606035205364627, "grad_norm": 5.639806747436523, "learning_rate": 5.358925333264403e-06, "loss": 2.1605, "step": 1186 }, { "epoch": 1.9622799664710813, "grad_norm": 15.98481273651123, "learning_rate": 5.342921666966033e-06, "loss": 2.3602, "step": 1187 }, { "epoch": 1.9639564124056998, "grad_norm": 30.443532943725586, "learning_rate": 5.3269332171275635e-06, "loss": 2.2108, "step": 1188 }, { "epoch": 1.9656328583403186, "grad_norm": 9.014719009399414, "learning_rate": 5.31096003598939e-06, "loss": 2.5361, "step": 1189 }, { "epoch": 1.967309304274937, "grad_norm": 31.22583770751953, "learning_rate": 5.295002175742017e-06, "loss": 1.9084, "step": 1190 }, { "epoch": 1.9689857502095558, "grad_norm": 13.885859489440918, "learning_rate": 5.279059688525878e-06, "loss": 2.578, "step": 1191 }, { "epoch": 1.9706621961441744, "grad_norm": 12.779547691345215, "learning_rate": 5.263132626431208e-06, "loss": 2.1021, "step": 1192 }, { "epoch": 1.9706621961441744, "eval_loss": 2.2453856468200684, "eval_runtime": 170.1493, "eval_samples_per_second": 3.086, "eval_steps_per_second": 1.546, "step": 1192 }, { "epoch": 1.972338642078793, "grad_norm": 4.851776599884033, "learning_rate": 5.2472210414978156e-06, "loss": 2.4652, "step": 1193 }, { "epoch": 1.9740150880134115, "grad_norm": 24.775318145751953, "learning_rate": 5.231324985714942e-06, "loss": 2.1521, "step": 1194 }, { "epoch": 1.97569153394803, "grad_norm": 6.5739359855651855, "learning_rate": 5.215444511021099e-06, "loss": 2.0707, "step": 1195 }, { "epoch": 1.977367979882649, "grad_norm": 12.756904602050781, "learning_rate": 5.199579669303885e-06, "loss": 2.2409, "step": 1196 }, { "epoch": 1.9790444258172672, "grad_norm": 10.48883056640625, "learning_rate": 5.1837305123998205e-06, "loss": 2.3598, "step": 1197 }, { "epoch": 1.980720871751886, "grad_norm": 6.538893699645996, "learning_rate": 5.167897092094173e-06, "loss": 2.0839, "step": 1198 }, { "epoch": 1.9823973176865046, "grad_norm": 4.959535598754883, "learning_rate": 5.152079460120787e-06, "loss": 1.9232, "step": 1199 }, { "epoch": 1.9840737636211232, "grad_norm": 8.06222152709961, "learning_rate": 5.136277668161946e-06, "loss": 2.2869, "step": 1200 }, { "epoch": 1.9857502095557418, "grad_norm": 5.546164512634277, "learning_rate": 5.1204917678481525e-06, "loss": 2.2664, "step": 1201 }, { "epoch": 1.9874266554903603, "grad_norm": 5.100728511810303, "learning_rate": 5.104721810757989e-06, "loss": 2.2692, "step": 1202 }, { "epoch": 1.9891031014249791, "grad_norm": 4.7023186683654785, "learning_rate": 5.0889678484179525e-06, "loss": 2.2996, "step": 1203 }, { "epoch": 1.9907795473595975, "grad_norm": 6.995988368988037, "learning_rate": 5.073229932302277e-06, "loss": 2.3977, "step": 1204 }, { "epoch": 1.9924559932942163, "grad_norm": 5.1511993408203125, "learning_rate": 5.057508113832772e-06, "loss": 1.9203, "step": 1205 }, { "epoch": 1.9941324392288349, "grad_norm": 6.9583563804626465, "learning_rate": 5.0418024443786395e-06, "loss": 2.4477, "step": 1206 }, { "epoch": 1.9958088851634534, "grad_norm": 4.900050640106201, "learning_rate": 5.026112975256312e-06, "loss": 2.1678, "step": 1207 }, { "epoch": 1.9974853310980722, "grad_norm": 7.841214179992676, "learning_rate": 5.010439757729317e-06, "loss": 2.3142, "step": 1208 }, { "epoch": 1.9991617770326906, "grad_norm": 9.979947090148926, "learning_rate": 4.994782843008053e-06, "loss": 2.3608, "step": 1209 }, { "epoch": 2.0008382229673094, "grad_norm": 7.047790050506592, "learning_rate": 4.97914228224966e-06, "loss": 2.1716, "step": 1210 }, { "epoch": 2.002514668901928, "grad_norm": 7.367396354675293, "learning_rate": 4.963518126557846e-06, "loss": 2.3512, "step": 1211 }, { "epoch": 2.0041911148365466, "grad_norm": 5.9742841720581055, "learning_rate": 4.947910426982718e-06, "loss": 2.1804, "step": 1212 }, { "epoch": 2.005867560771165, "grad_norm": 23.638952255249023, "learning_rate": 4.932319234520616e-06, "loss": 2.2882, "step": 1213 }, { "epoch": 2.0075440067057837, "grad_norm": 10.937363624572754, "learning_rate": 4.9167446001139385e-06, "loss": 2.2866, "step": 1214 }, { "epoch": 2.0092204526404025, "grad_norm": 8.216862678527832, "learning_rate": 4.9011865746509766e-06, "loss": 2.3837, "step": 1215 }, { "epoch": 2.010896898575021, "grad_norm": 9.21924114227295, "learning_rate": 4.885645208965779e-06, "loss": 2.4004, "step": 1216 }, { "epoch": 2.0125733445096397, "grad_norm": 8.511059761047363, "learning_rate": 4.870120553837936e-06, "loss": 2.4868, "step": 1217 }, { "epoch": 2.014249790444258, "grad_norm": 5.596872806549072, "learning_rate": 4.854612659992443e-06, "loss": 2.2454, "step": 1218 }, { "epoch": 2.015926236378877, "grad_norm": 5.222379684448242, "learning_rate": 4.839121578099536e-06, "loss": 2.4379, "step": 1219 }, { "epoch": 2.0176026823134956, "grad_norm": 7.564155101776123, "learning_rate": 4.823647358774518e-06, "loss": 2.2867, "step": 1220 }, { "epoch": 2.019279128248114, "grad_norm": 30.321897506713867, "learning_rate": 4.8081900525775984e-06, "loss": 2.055, "step": 1221 }, { "epoch": 2.0209555741827323, "grad_norm": 8.739989280700684, "learning_rate": 4.792749710013717e-06, "loss": 1.8506, "step": 1222 }, { "epoch": 2.0004191114836547, "grad_norm": 4.6824212074279785, "learning_rate": 4.777326381532382e-06, "loss": 2.2967, "step": 1223 }, { "epoch": 2.002095557418273, "grad_norm": 6.725395202636719, "learning_rate": 4.7619201175275365e-06, "loss": 2.1834, "step": 1224 }, { "epoch": 2.003772003352892, "grad_norm": 6.484198093414307, "learning_rate": 4.746530968337342e-06, "loss": 2.4209, "step": 1225 }, { "epoch": 2.0054484492875106, "grad_norm": 4.450620174407959, "learning_rate": 4.731158984244042e-06, "loss": 2.2121, "step": 1226 }, { "epoch": 2.007124895222129, "grad_norm": 6.4583845138549805, "learning_rate": 4.7158042154738094e-06, "loss": 2.3898, "step": 1227 }, { "epoch": 2.008801341156748, "grad_norm": 10.216773986816406, "learning_rate": 4.700466712196546e-06, "loss": 2.2019, "step": 1228 }, { "epoch": 2.010477787091366, "grad_norm": 5.428000450134277, "learning_rate": 4.685146524525771e-06, "loss": 2.3826, "step": 1229 }, { "epoch": 2.012154233025985, "grad_norm": 11.409854888916016, "learning_rate": 4.6698437025184e-06, "loss": 2.5916, "step": 1230 }, { "epoch": 2.0138306789606033, "grad_norm": 10.307641983032227, "learning_rate": 4.654558296174617e-06, "loss": 2.299, "step": 1231 }, { "epoch": 2.015507124895222, "grad_norm": 13.394296646118164, "learning_rate": 4.639290355437703e-06, "loss": 2.0325, "step": 1232 }, { "epoch": 2.017183570829841, "grad_norm": 32.09248733520508, "learning_rate": 4.624039930193876e-06, "loss": 2.4044, "step": 1233 }, { "epoch": 2.0188600167644593, "grad_norm": 5.458365440368652, "learning_rate": 4.6088070702721215e-06, "loss": 2.5126, "step": 1234 }, { "epoch": 2.020536462699078, "grad_norm": 4.540204048156738, "learning_rate": 4.593591825444028e-06, "loss": 2.1358, "step": 1235 }, { "epoch": 2.0222129086336964, "grad_norm": 7.029388427734375, "learning_rate": 4.578394245423626e-06, "loss": 2.3082, "step": 1236 }, { "epoch": 2.023889354568315, "grad_norm": 5.638110637664795, "learning_rate": 4.563214379867248e-06, "loss": 2.1867, "step": 1237 }, { "epoch": 2.025565800502934, "grad_norm": 12.348304748535156, "learning_rate": 4.548052278373327e-06, "loss": 2.2004, "step": 1238 }, { "epoch": 2.0272422464375524, "grad_norm": 12.328714370727539, "learning_rate": 4.532907990482255e-06, "loss": 1.9823, "step": 1239 }, { "epoch": 2.028918692372171, "grad_norm": 8.03898811340332, "learning_rate": 4.517781565676229e-06, "loss": 2.2488, "step": 1240 }, { "epoch": 2.0305951383067895, "grad_norm": 7.206671714782715, "learning_rate": 4.502673053379077e-06, "loss": 2.3345, "step": 1241 }, { "epoch": 2.0322715842414083, "grad_norm": 5.277784824371338, "learning_rate": 4.487582502956104e-06, "loss": 2.1939, "step": 1242 }, { "epoch": 2.0339480301760267, "grad_norm": 8.063956260681152, "learning_rate": 4.472509963713917e-06, "loss": 2.3669, "step": 1243 }, { "epoch": 2.0356244761106455, "grad_norm": 4.986489772796631, "learning_rate": 4.4574554849002715e-06, "loss": 2.1272, "step": 1244 }, { "epoch": 2.0373009220452643, "grad_norm": 38.20222473144531, "learning_rate": 4.442419115703936e-06, "loss": 2.2284, "step": 1245 }, { "epoch": 2.0389773679798826, "grad_norm": 21.551639556884766, "learning_rate": 4.427400905254483e-06, "loss": 2.1649, "step": 1246 }, { "epoch": 2.0406538139145014, "grad_norm": 11.212272644042969, "learning_rate": 4.41240090262216e-06, "loss": 2.254, "step": 1247 }, { "epoch": 2.0423302598491198, "grad_norm": 5.011099338531494, "learning_rate": 4.397419156817727e-06, "loss": 2.5565, "step": 1248 }, { "epoch": 2.0440067057837386, "grad_norm": 44.877105712890625, "learning_rate": 4.382455716792291e-06, "loss": 2.1956, "step": 1249 }, { "epoch": 2.045683151718357, "grad_norm": 13.658645629882812, "learning_rate": 4.367510631437149e-06, "loss": 2.3583, "step": 1250 }, { "epoch": 2.0473595976529757, "grad_norm": 4.71381950378418, "learning_rate": 4.352583949583619e-06, "loss": 2.3225, "step": 1251 }, { "epoch": 2.0490360435875945, "grad_norm": 8.717616081237793, "learning_rate": 4.337675720002885e-06, "loss": 2.1865, "step": 1252 }, { "epoch": 2.050712489522213, "grad_norm": 9.512639045715332, "learning_rate": 4.322785991405863e-06, "loss": 2.1042, "step": 1253 }, { "epoch": 2.0523889354568317, "grad_norm": 7.821987628936768, "learning_rate": 4.307914812442993e-06, "loss": 2.0881, "step": 1254 }, { "epoch": 2.05406538139145, "grad_norm": 12.316287994384766, "learning_rate": 4.293062231704115e-06, "loss": 2.3137, "step": 1255 }, { "epoch": 2.055741827326069, "grad_norm": 8.3701810836792, "learning_rate": 4.278228297718307e-06, "loss": 2.3773, "step": 1256 }, { "epoch": 2.057418273260687, "grad_norm": 8.464999198913574, "learning_rate": 4.263413058953716e-06, "loss": 2.1083, "step": 1257 }, { "epoch": 2.059094719195306, "grad_norm": 7.7581586837768555, "learning_rate": 4.248616563817409e-06, "loss": 2.3429, "step": 1258 }, { "epoch": 2.0607711651299248, "grad_norm": 4.331634521484375, "learning_rate": 4.233838860655205e-06, "loss": 2.2203, "step": 1259 }, { "epoch": 2.062447611064543, "grad_norm": 10.138313293457031, "learning_rate": 4.219079997751515e-06, "loss": 2.3022, "step": 1260 }, { "epoch": 2.064124056999162, "grad_norm": 7.5774760246276855, "learning_rate": 4.204340023329215e-06, "loss": 2.5286, "step": 1261 }, { "epoch": 2.0658005029337803, "grad_norm": 9.023489952087402, "learning_rate": 4.189618985549446e-06, "loss": 2.2223, "step": 1262 }, { "epoch": 2.067476948868399, "grad_norm": 21.10108184814453, "learning_rate": 4.174916932511477e-06, "loss": 2.2781, "step": 1263 }, { "epoch": 2.0691533948030174, "grad_norm": 6.682534217834473, "learning_rate": 4.160233912252551e-06, "loss": 2.0834, "step": 1264 }, { "epoch": 2.070829840737636, "grad_norm": 6.4588541984558105, "learning_rate": 4.1455699727477265e-06, "loss": 2.3847, "step": 1265 }, { "epoch": 2.072506286672255, "grad_norm": 6.047457695007324, "learning_rate": 4.130925161909716e-06, "loss": 1.9221, "step": 1266 }, { "epoch": 2.0741827326068734, "grad_norm": 7.597789764404297, "learning_rate": 4.116299527588726e-06, "loss": 2.1919, "step": 1267 }, { "epoch": 2.075859178541492, "grad_norm": 9.464275360107422, "learning_rate": 4.101693117572304e-06, "loss": 2.2842, "step": 1268 }, { "epoch": 2.0775356244761105, "grad_norm": 3.945188045501709, "learning_rate": 4.087105979585203e-06, "loss": 2.2042, "step": 1269 }, { "epoch": 2.0792120704107293, "grad_norm": 11.845580101013184, "learning_rate": 4.072538161289186e-06, "loss": 2.0507, "step": 1270 }, { "epoch": 2.0808885163453477, "grad_norm": 12.141922950744629, "learning_rate": 4.057989710282897e-06, "loss": 2.0213, "step": 1271 }, { "epoch": 2.0825649622799665, "grad_norm": 19.84447479248047, "learning_rate": 4.043460674101704e-06, "loss": 2.1826, "step": 1272 }, { "epoch": 2.0842414082145853, "grad_norm": 6.200555324554443, "learning_rate": 4.0289511002175375e-06, "loss": 2.4647, "step": 1273 }, { "epoch": 2.0859178541492036, "grad_norm": 12.250397682189941, "learning_rate": 4.014461036038739e-06, "loss": 2.2364, "step": 1274 }, { "epoch": 2.0875943000838224, "grad_norm": 8.463107109069824, "learning_rate": 3.999990528909902e-06, "loss": 2.0801, "step": 1275 }, { "epoch": 2.0892707460184408, "grad_norm": 7.2163896560668945, "learning_rate": 3.985539626111708e-06, "loss": 2.2082, "step": 1276 }, { "epoch": 2.0909471919530596, "grad_norm": 5.574299335479736, "learning_rate": 3.9711083748608126e-06, "loss": 2.0143, "step": 1277 }, { "epoch": 2.092623637887678, "grad_norm": 8.328110694885254, "learning_rate": 3.956696822309632e-06, "loss": 2.0825, "step": 1278 }, { "epoch": 2.0943000838222967, "grad_norm": 8.31409740447998, "learning_rate": 3.942305015546242e-06, "loss": 2.2214, "step": 1279 }, { "epoch": 2.0959765297569155, "grad_norm": 5.020392894744873, "learning_rate": 3.927933001594185e-06, "loss": 2.6608, "step": 1280 }, { "epoch": 2.097652975691534, "grad_norm": 4.7579755783081055, "learning_rate": 3.913580827412334e-06, "loss": 2.4305, "step": 1281 }, { "epoch": 2.0993294216261527, "grad_norm": 9.670133590698242, "learning_rate": 3.899248539894756e-06, "loss": 2.1238, "step": 1282 }, { "epoch": 2.101005867560771, "grad_norm": 6.502074718475342, "learning_rate": 3.884936185870522e-06, "loss": 2.4698, "step": 1283 }, { "epoch": 2.10268231349539, "grad_norm": 8.494933128356934, "learning_rate": 3.870643812103574e-06, "loss": 2.1554, "step": 1284 }, { "epoch": 2.104358759430008, "grad_norm": 7.273034572601318, "learning_rate": 3.856371465292579e-06, "loss": 2.2675, "step": 1285 }, { "epoch": 2.106035205364627, "grad_norm": 5.748294830322266, "learning_rate": 3.842119192070762e-06, "loss": 2.1754, "step": 1286 }, { "epoch": 2.1077116512992458, "grad_norm": 44.848793029785156, "learning_rate": 3.827887039005769e-06, "loss": 1.879, "step": 1287 }, { "epoch": 2.109388097233864, "grad_norm": 5.074281215667725, "learning_rate": 3.813675052599495e-06, "loss": 2.2287, "step": 1288 }, { "epoch": 2.111064543168483, "grad_norm": 4.818614959716797, "learning_rate": 3.7994832792879376e-06, "loss": 2.0011, "step": 1289 }, { "epoch": 2.1127409891031013, "grad_norm": 9.798919677734375, "learning_rate": 3.7853117654410753e-06, "loss": 2.0911, "step": 1290 }, { "epoch": 2.11441743503772, "grad_norm": 11.435412406921387, "learning_rate": 3.77116055736267e-06, "loss": 1.8566, "step": 1291 }, { "epoch": 2.116093880972339, "grad_norm": 18.63894271850586, "learning_rate": 3.7570297012901357e-06, "loss": 2.0245, "step": 1292 }, { "epoch": 2.117770326906957, "grad_norm": 8.416255950927734, "learning_rate": 3.7429192433944016e-06, "loss": 2.2666, "step": 1293 }, { "epoch": 2.119446772841576, "grad_norm": 5.542328357696533, "learning_rate": 3.7288292297797402e-06, "loss": 2.1097, "step": 1294 }, { "epoch": 2.1211232187761944, "grad_norm": 7.464775562286377, "learning_rate": 3.7147597064836306e-06, "loss": 1.986, "step": 1295 }, { "epoch": 2.122799664710813, "grad_norm": 6.234290599822998, "learning_rate": 3.7007107194765945e-06, "loss": 2.0715, "step": 1296 }, { "epoch": 2.1244761106454315, "grad_norm": 10.061214447021484, "learning_rate": 3.6866823146620513e-06, "loss": 2.2687, "step": 1297 }, { "epoch": 2.1261525565800503, "grad_norm": 10.62147045135498, "learning_rate": 3.6726745378761885e-06, "loss": 2.1829, "step": 1298 }, { "epoch": 2.127829002514669, "grad_norm": 13.778946876525879, "learning_rate": 3.6586874348877767e-06, "loss": 2.3452, "step": 1299 }, { "epoch": 2.1295054484492875, "grad_norm": 3.8549985885620117, "learning_rate": 3.6447210513980367e-06, "loss": 1.9992, "step": 1300 }, { "epoch": 2.1311818943839063, "grad_norm": 10.212150573730469, "learning_rate": 3.630775433040502e-06, "loss": 2.3456, "step": 1301 }, { "epoch": 2.1328583403185246, "grad_norm": 8.225541114807129, "learning_rate": 3.616850625380851e-06, "loss": 2.5998, "step": 1302 }, { "epoch": 2.1345347862531434, "grad_norm": 31.58371353149414, "learning_rate": 3.602946673916773e-06, "loss": 1.8099, "step": 1303 }, { "epoch": 2.1362112321877618, "grad_norm": 6.1769843101501465, "learning_rate": 3.589063624077802e-06, "loss": 2.126, "step": 1304 }, { "epoch": 2.1378876781223806, "grad_norm": 13.941624641418457, "learning_rate": 3.575201521225177e-06, "loss": 2.1587, "step": 1305 }, { "epoch": 2.1395641240569994, "grad_norm": 6.619443893432617, "learning_rate": 3.561360410651713e-06, "loss": 2.2289, "step": 1306 }, { "epoch": 2.1412405699916177, "grad_norm": 7.619211196899414, "learning_rate": 3.5475403375816177e-06, "loss": 2.222, "step": 1307 }, { "epoch": 2.1429170159262365, "grad_norm": 16.830177307128906, "learning_rate": 3.5337413471703628e-06, "loss": 2.4003, "step": 1308 }, { "epoch": 2.144593461860855, "grad_norm": 12.820639610290527, "learning_rate": 3.5199634845045448e-06, "loss": 2.1341, "step": 1309 }, { "epoch": 2.1462699077954737, "grad_norm": 16.68798828125, "learning_rate": 3.5062067946017185e-06, "loss": 1.79, "step": 1310 }, { "epoch": 2.147946353730092, "grad_norm": 22.891008377075195, "learning_rate": 3.492471322410268e-06, "loss": 2.2808, "step": 1311 }, { "epoch": 2.149622799664711, "grad_norm": 10.050169944763184, "learning_rate": 3.478757112809241e-06, "loss": 2.4637, "step": 1312 }, { "epoch": 2.1512992455993296, "grad_norm": 5.088534832000732, "learning_rate": 3.4650642106082112e-06, "loss": 2.0537, "step": 1313 }, { "epoch": 2.152975691533948, "grad_norm": 4.907692909240723, "learning_rate": 3.4513926605471504e-06, "loss": 2.1865, "step": 1314 }, { "epoch": 2.1546521374685668, "grad_norm": 6.531007766723633, "learning_rate": 3.4377425072962467e-06, "loss": 2.1118, "step": 1315 }, { "epoch": 2.156328583403185, "grad_norm": 7.38433313369751, "learning_rate": 3.4241137954557792e-06, "loss": 2.5795, "step": 1316 }, { "epoch": 2.158005029337804, "grad_norm": 8.534085273742676, "learning_rate": 3.410506569555977e-06, "loss": 2.33, "step": 1317 }, { "epoch": 2.1596814752724223, "grad_norm": 8.002737045288086, "learning_rate": 3.3969208740568628e-06, "loss": 2.3218, "step": 1318 }, { "epoch": 2.161357921207041, "grad_norm": 7.097446918487549, "learning_rate": 3.3833567533481126e-06, "loss": 2.1682, "step": 1319 }, { "epoch": 2.16303436714166, "grad_norm": 5.984406471252441, "learning_rate": 3.3698142517489063e-06, "loss": 2.13, "step": 1320 }, { "epoch": 2.164710813076278, "grad_norm": 19.71725845336914, "learning_rate": 3.3562934135077794e-06, "loss": 2.3915, "step": 1321 }, { "epoch": 2.166387259010897, "grad_norm": 4.020336151123047, "learning_rate": 3.342794282802507e-06, "loss": 2.2242, "step": 1322 }, { "epoch": 2.1680637049455154, "grad_norm": 4.430262088775635, "learning_rate": 3.3293169037399155e-06, "loss": 2.1006, "step": 1323 }, { "epoch": 2.169740150880134, "grad_norm": 4.853217601776123, "learning_rate": 3.315861320355764e-06, "loss": 1.8745, "step": 1324 }, { "epoch": 2.1714165968147525, "grad_norm": 8.008112907409668, "learning_rate": 3.3024275766146063e-06, "loss": 2.6229, "step": 1325 }, { "epoch": 2.1730930427493713, "grad_norm": 7.5876851081848145, "learning_rate": 3.2890157164096315e-06, "loss": 2.1892, "step": 1326 }, { "epoch": 2.17476948868399, "grad_norm": 6.744850158691406, "learning_rate": 3.2756257835625295e-06, "loss": 2.2864, "step": 1327 }, { "epoch": 2.1764459346186085, "grad_norm": 9.34411907196045, "learning_rate": 3.2622578218233405e-06, "loss": 2.2471, "step": 1328 }, { "epoch": 2.1781223805532273, "grad_norm": 4.918528079986572, "learning_rate": 3.2489118748703154e-06, "loss": 2.1541, "step": 1329 }, { "epoch": 2.1797988264878456, "grad_norm": 7.374917030334473, "learning_rate": 3.235587986309782e-06, "loss": 2.2008, "step": 1330 }, { "epoch": 2.1814752724224644, "grad_norm": 4.484757900238037, "learning_rate": 3.2222861996759912e-06, "loss": 2.3445, "step": 1331 }, { "epoch": 2.1831517183570828, "grad_norm": 6.88983154296875, "learning_rate": 3.2090065584309813e-06, "loss": 2.2523, "step": 1332 }, { "epoch": 2.1848281642917016, "grad_norm": 12.558144569396973, "learning_rate": 3.1957491059644274e-06, "loss": 2.0281, "step": 1333 }, { "epoch": 2.1865046102263204, "grad_norm": 7.2450690269470215, "learning_rate": 3.182513885593501e-06, "loss": 2.3113, "step": 1334 }, { "epoch": 2.1881810561609387, "grad_norm": 26.816823959350586, "learning_rate": 3.1693009405627538e-06, "loss": 2.4343, "step": 1335 }, { "epoch": 2.1898575020955575, "grad_norm": 16.565248489379883, "learning_rate": 3.156110314043933e-06, "loss": 2.1688, "step": 1336 }, { "epoch": 2.191533948030176, "grad_norm": 5.443547248840332, "learning_rate": 3.1429420491358696e-06, "loss": 2.1053, "step": 1337 }, { "epoch": 2.1932103939647947, "grad_norm": 7.466479301452637, "learning_rate": 3.129796188864336e-06, "loss": 2.4287, "step": 1338 }, { "epoch": 2.1948868398994135, "grad_norm": 10.327070236206055, "learning_rate": 3.1166727761818936e-06, "loss": 2.3776, "step": 1339 }, { "epoch": 2.196563285834032, "grad_norm": 5.39819860458374, "learning_rate": 3.1035718539677683e-06, "loss": 2.0732, "step": 1340 }, { "epoch": 2.1982397317686506, "grad_norm": 6.212865829467773, "learning_rate": 3.0904934650276897e-06, "loss": 2.1009, "step": 1341 }, { "epoch": 2.1982397317686506, "eval_loss": 2.235383987426758, "eval_runtime": 170.594, "eval_samples_per_second": 3.077, "eval_steps_per_second": 1.542, "step": 1341 }, { "epoch": 2.199916177703269, "grad_norm": 9.951416969299316, "learning_rate": 3.07743765209376e-06, "loss": 2.1172, "step": 1342 }, { "epoch": 2.2015926236378878, "grad_norm": 5.6082305908203125, "learning_rate": 3.064404457824337e-06, "loss": 2.1653, "step": 1343 }, { "epoch": 2.203269069572506, "grad_norm": 25.695249557495117, "learning_rate": 3.0513939248038572e-06, "loss": 1.9704, "step": 1344 }, { "epoch": 2.204945515507125, "grad_norm": 6.777249813079834, "learning_rate": 3.038406095542713e-06, "loss": 2.1125, "step": 1345 }, { "epoch": 2.2066219614417433, "grad_norm": 16.44721031188965, "learning_rate": 3.025441012477126e-06, "loss": 2.243, "step": 1346 }, { "epoch": 2.208298407376362, "grad_norm": 5.672937870025635, "learning_rate": 3.0124987179689903e-06, "loss": 2.1498, "step": 1347 }, { "epoch": 2.209974853310981, "grad_norm": 5.7788848876953125, "learning_rate": 2.999579254305748e-06, "loss": 2.4373, "step": 1348 }, { "epoch": 2.2116512992455992, "grad_norm": 7.267642498016357, "learning_rate": 2.986682663700232e-06, "loss": 2.0362, "step": 1349 }, { "epoch": 2.213327745180218, "grad_norm": 10.069255828857422, "learning_rate": 2.973808988290544e-06, "loss": 2.236, "step": 1350 }, { "epoch": 2.2150041911148364, "grad_norm": 4.0618696212768555, "learning_rate": 2.9609582701399266e-06, "loss": 1.9876, "step": 1351 }, { "epoch": 2.216680637049455, "grad_norm": 4.883275508880615, "learning_rate": 2.9481305512365964e-06, "loss": 2.1696, "step": 1352 }, { "epoch": 2.218357082984074, "grad_norm": 5.943734645843506, "learning_rate": 2.935325873493623e-06, "loss": 2.0801, "step": 1353 }, { "epoch": 2.2200335289186923, "grad_norm": 8.83841323852539, "learning_rate": 2.922544278748801e-06, "loss": 1.9798, "step": 1354 }, { "epoch": 2.221709974853311, "grad_norm": 7.799284934997559, "learning_rate": 2.9097858087644983e-06, "loss": 2.209, "step": 1355 }, { "epoch": 2.2233864207879295, "grad_norm": 4.826918601989746, "learning_rate": 2.89705050522753e-06, "loss": 2.2354, "step": 1356 }, { "epoch": 2.2250628667225483, "grad_norm": 5.494199275970459, "learning_rate": 2.8843384097490113e-06, "loss": 2.2727, "step": 1357 }, { "epoch": 2.2267393126571666, "grad_norm": 10.839118957519531, "learning_rate": 2.871649563864224e-06, "loss": 2.2366, "step": 1358 }, { "epoch": 2.2284157585917854, "grad_norm": 5.281680107116699, "learning_rate": 2.8589840090325028e-06, "loss": 2.071, "step": 1359 }, { "epoch": 2.230092204526404, "grad_norm": 5.377181053161621, "learning_rate": 2.8463417866370657e-06, "loss": 2.15, "step": 1360 }, { "epoch": 2.2317686504610226, "grad_norm": 4.584048271179199, "learning_rate": 2.833722937984893e-06, "loss": 2.1011, "step": 1361 }, { "epoch": 2.2334450963956414, "grad_norm": 9.0023775100708, "learning_rate": 2.8211275043066045e-06, "loss": 2.2702, "step": 1362 }, { "epoch": 2.2351215423302597, "grad_norm": 10.387222290039062, "learning_rate": 2.8085555267563093e-06, "loss": 2.0993, "step": 1363 }, { "epoch": 2.2367979882648785, "grad_norm": 16.555574417114258, "learning_rate": 2.7960070464114796e-06, "loss": 2.0898, "step": 1364 }, { "epoch": 2.238474434199497, "grad_norm": 4.0740966796875, "learning_rate": 2.7834821042728077e-06, "loss": 2.2147, "step": 1365 }, { "epoch": 2.2401508801341157, "grad_norm": 8.56203842163086, "learning_rate": 2.7709807412640745e-06, "loss": 2.2396, "step": 1366 }, { "epoch": 2.2418273260687345, "grad_norm": 10.104968070983887, "learning_rate": 2.7585029982320354e-06, "loss": 2.0067, "step": 1367 }, { "epoch": 2.243503772003353, "grad_norm": 10.658915519714355, "learning_rate": 2.7460489159462545e-06, "loss": 2.2904, "step": 1368 }, { "epoch": 2.2451802179379716, "grad_norm": 9.786938667297363, "learning_rate": 2.7336185350989885e-06, "loss": 2.2836, "step": 1369 }, { "epoch": 2.24685666387259, "grad_norm": 18.35841941833496, "learning_rate": 2.721211896305059e-06, "loss": 1.894, "step": 1370 }, { "epoch": 2.2485331098072088, "grad_norm": 8.9628267288208, "learning_rate": 2.70882904010171e-06, "loss": 2.4271, "step": 1371 }, { "epoch": 2.2502095557418276, "grad_norm": 6.923974990844727, "learning_rate": 2.6964700069484827e-06, "loss": 2.3662, "step": 1372 }, { "epoch": 2.251886001676446, "grad_norm": 6.602287292480469, "learning_rate": 2.6841348372270725e-06, "loss": 2.2167, "step": 1373 }, { "epoch": 2.2535624476110647, "grad_norm": 13.797829627990723, "learning_rate": 2.671823571241199e-06, "loss": 2.3762, "step": 1374 }, { "epoch": 2.255238893545683, "grad_norm": 6.3822922706604, "learning_rate": 2.659536249216501e-06, "loss": 2.1108, "step": 1375 }, { "epoch": 2.256915339480302, "grad_norm": 7.4859299659729, "learning_rate": 2.6472729113003614e-06, "loss": 2.0934, "step": 1376 }, { "epoch": 2.2585917854149202, "grad_norm": 5.307763576507568, "learning_rate": 2.6350335975618035e-06, "loss": 2.1377, "step": 1377 }, { "epoch": 2.260268231349539, "grad_norm": 7.142258644104004, "learning_rate": 2.622818347991359e-06, "loss": 2.0624, "step": 1378 }, { "epoch": 2.2619446772841574, "grad_norm": 7.365501403808594, "learning_rate": 2.6106272025009317e-06, "loss": 2.0939, "step": 1379 }, { "epoch": 2.263621123218776, "grad_norm": 6.25426721572876, "learning_rate": 2.5984602009236672e-06, "loss": 2.0953, "step": 1380 }, { "epoch": 2.265297569153395, "grad_norm": 20.978071212768555, "learning_rate": 2.5863173830138212e-06, "loss": 1.9874, "step": 1381 }, { "epoch": 2.2669740150880133, "grad_norm": 4.320618152618408, "learning_rate": 2.5741987884466313e-06, "loss": 2.5172, "step": 1382 }, { "epoch": 2.268650461022632, "grad_norm": 13.529948234558105, "learning_rate": 2.562104456818193e-06, "loss": 2.3618, "step": 1383 }, { "epoch": 2.2703269069572505, "grad_norm": 6.987939357757568, "learning_rate": 2.5500344276453237e-06, "loss": 2.5367, "step": 1384 }, { "epoch": 2.2720033528918693, "grad_norm": 13.568084716796875, "learning_rate": 2.537988740365438e-06, "loss": 2.2878, "step": 1385 }, { "epoch": 2.273679798826488, "grad_norm": 6.971397876739502, "learning_rate": 2.5259674343364104e-06, "loss": 2.2381, "step": 1386 }, { "epoch": 2.2753562447611064, "grad_norm": 10.580013275146484, "learning_rate": 2.5139705488364487e-06, "loss": 2.3562, "step": 1387 }, { "epoch": 2.2770326906957252, "grad_norm": 10.957854270935059, "learning_rate": 2.501998123063989e-06, "loss": 2.5301, "step": 1388 }, { "epoch": 2.2787091366303436, "grad_norm": 6.093613147735596, "learning_rate": 2.490050196137528e-06, "loss": 2.3545, "step": 1389 }, { "epoch": 2.2803855825649624, "grad_norm": 8.123588562011719, "learning_rate": 2.47812680709552e-06, "loss": 2.2263, "step": 1390 }, { "epoch": 2.2820620284995807, "grad_norm": 9.231768608093262, "learning_rate": 2.4662279948962497e-06, "loss": 2.1541, "step": 1391 }, { "epoch": 2.2837384744341995, "grad_norm": 6.772127628326416, "learning_rate": 2.454353798417698e-06, "loss": 2.1219, "step": 1392 }, { "epoch": 2.285414920368818, "grad_norm": 12.371326446533203, "learning_rate": 2.4425042564574186e-06, "loss": 2.1754, "step": 1393 }, { "epoch": 2.2870913663034367, "grad_norm": 9.332115173339844, "learning_rate": 2.4306794077324025e-06, "loss": 2.3706, "step": 1394 }, { "epoch": 2.2887678122380555, "grad_norm": 4.564769744873047, "learning_rate": 2.418879290878959e-06, "loss": 2.372, "step": 1395 }, { "epoch": 2.290444258172674, "grad_norm": 8.537875175476074, "learning_rate": 2.4071039444526046e-06, "loss": 2.024, "step": 1396 }, { "epoch": 2.2921207041072926, "grad_norm": 9.414278984069824, "learning_rate": 2.3953534069279006e-06, "loss": 2.3969, "step": 1397 }, { "epoch": 2.293797150041911, "grad_norm": 9.292947769165039, "learning_rate": 2.3836277166983567e-06, "loss": 2.1119, "step": 1398 }, { "epoch": 2.29547359597653, "grad_norm": 9.540177345275879, "learning_rate": 2.371926912076299e-06, "loss": 2.1613, "step": 1399 }, { "epoch": 2.2971500419111486, "grad_norm": 5.375885486602783, "learning_rate": 2.3602510312927405e-06, "loss": 2.3057, "step": 1400 }, { "epoch": 2.298826487845767, "grad_norm": 7.281787395477295, "learning_rate": 2.3486001124972636e-06, "loss": 2.206, "step": 1401 }, { "epoch": 2.3005029337803857, "grad_norm": 10.502132415771484, "learning_rate": 2.3369741937578803e-06, "loss": 2.218, "step": 1402 }, { "epoch": 2.302179379715004, "grad_norm": 5.155184745788574, "learning_rate": 2.325373313060919e-06, "loss": 2.1152, "step": 1403 }, { "epoch": 2.303855825649623, "grad_norm": 6.232965469360352, "learning_rate": 2.3137975083109153e-06, "loss": 2.3069, "step": 1404 }, { "epoch": 2.3055322715842412, "grad_norm": 12.726558685302734, "learning_rate": 2.3022468173304526e-06, "loss": 2.5572, "step": 1405 }, { "epoch": 2.30720871751886, "grad_norm": 7.728231906890869, "learning_rate": 2.290721277860064e-06, "loss": 2.0625, "step": 1406 }, { "epoch": 2.3088851634534784, "grad_norm": 7.336434841156006, "learning_rate": 2.279220927558107e-06, "loss": 2.0206, "step": 1407 }, { "epoch": 2.310561609388097, "grad_norm": 9.828569412231445, "learning_rate": 2.267745804000634e-06, "loss": 2.3912, "step": 1408 }, { "epoch": 2.312238055322716, "grad_norm": 8.977592468261719, "learning_rate": 2.256295944681275e-06, "loss": 2.1803, "step": 1409 }, { "epoch": 2.3139145012573343, "grad_norm": 7.798287868499756, "learning_rate": 2.244871387011105e-06, "loss": 2.2114, "step": 1410 }, { "epoch": 2.315590947191953, "grad_norm": 33.64244079589844, "learning_rate": 2.233472168318529e-06, "loss": 1.9737, "step": 1411 }, { "epoch": 2.3172673931265715, "grad_norm": 5.0558977127075195, "learning_rate": 2.2220983258491726e-06, "loss": 2.3031, "step": 1412 }, { "epoch": 2.3189438390611903, "grad_norm": 6.966290473937988, "learning_rate": 2.2107498967657347e-06, "loss": 2.371, "step": 1413 }, { "epoch": 2.320620284995809, "grad_norm": 27.856409072875977, "learning_rate": 2.19942691814788e-06, "loss": 2.2999, "step": 1414 }, { "epoch": 2.3222967309304274, "grad_norm": 12.30173110961914, "learning_rate": 2.1881294269921217e-06, "loss": 2.3032, "step": 1415 }, { "epoch": 2.3239731768650462, "grad_norm": 5.013786792755127, "learning_rate": 2.176857460211693e-06, "loss": 2.0642, "step": 1416 }, { "epoch": 2.3256496227996646, "grad_norm": 9.071167945861816, "learning_rate": 2.165611054636434e-06, "loss": 2.1122, "step": 1417 }, { "epoch": 2.3273260687342834, "grad_norm": 8.3031005859375, "learning_rate": 2.1543902470126576e-06, "loss": 2.1381, "step": 1418 }, { "epoch": 2.3290025146689017, "grad_norm": 7.641931056976318, "learning_rate": 2.1431950740030394e-06, "loss": 2.1226, "step": 1419 }, { "epoch": 2.3306789606035205, "grad_norm": 5.272727012634277, "learning_rate": 2.13202557218651e-06, "loss": 2.0126, "step": 1420 }, { "epoch": 2.3323554065381393, "grad_norm": 5.695738315582275, "learning_rate": 2.120881778058109e-06, "loss": 2.2322, "step": 1421 }, { "epoch": 2.3340318524727577, "grad_norm": 6.745401382446289, "learning_rate": 2.1097637280288807e-06, "loss": 2.2258, "step": 1422 }, { "epoch": 2.3357082984073765, "grad_norm": 12.495540618896484, "learning_rate": 2.098671458425756e-06, "loss": 2.2928, "step": 1423 }, { "epoch": 2.337384744341995, "grad_norm": 4.469579696655273, "learning_rate": 2.0876050054914332e-06, "loss": 2.0871, "step": 1424 }, { "epoch": 2.3390611902766136, "grad_norm": 7.0336995124816895, "learning_rate": 2.0765644053842583e-06, "loss": 2.2006, "step": 1425 }, { "epoch": 2.340737636211232, "grad_norm": 5.7290568351745605, "learning_rate": 2.0655496941780993e-06, "loss": 2.2943, "step": 1426 }, { "epoch": 2.342414082145851, "grad_norm": 10.908333778381348, "learning_rate": 2.0545609078622354e-06, "loss": 2.2536, "step": 1427 }, { "epoch": 2.3440905280804696, "grad_norm": 6.3847832679748535, "learning_rate": 2.043598082341255e-06, "loss": 1.9198, "step": 1428 }, { "epoch": 2.345766974015088, "grad_norm": 6.258805274963379, "learning_rate": 2.032661253434903e-06, "loss": 2.0593, "step": 1429 }, { "epoch": 2.3474434199497067, "grad_norm": 6.0445146560668945, "learning_rate": 2.0217504568779913e-06, "loss": 1.9706, "step": 1430 }, { "epoch": 2.349119865884325, "grad_norm": 4.850488662719727, "learning_rate": 2.0108657283202783e-06, "loss": 2.3983, "step": 1431 }, { "epoch": 2.350796311818944, "grad_norm": 4.106098175048828, "learning_rate": 2.000007103326336e-06, "loss": 2.0996, "step": 1432 }, { "epoch": 2.3524727577535627, "grad_norm": 6.61715841293335, "learning_rate": 1.9891746173754644e-06, "loss": 2.3494, "step": 1433 }, { "epoch": 2.354149203688181, "grad_norm": 3.849245548248291, "learning_rate": 1.978368305861543e-06, "loss": 2.0074, "step": 1434 }, { "epoch": 2.3558256496228, "grad_norm": 5.794919013977051, "learning_rate": 1.9675882040929297e-06, "loss": 1.7891, "step": 1435 }, { "epoch": 2.357502095557418, "grad_norm": 19.595027923583984, "learning_rate": 1.9568343472923524e-06, "loss": 1.976, "step": 1436 }, { "epoch": 2.359178541492037, "grad_norm": 7.703280448913574, "learning_rate": 1.946106770596783e-06, "loss": 2.2819, "step": 1437 }, { "epoch": 2.3608549874266553, "grad_norm": 6.606808662414551, "learning_rate": 1.9354055090573277e-06, "loss": 2.0023, "step": 1438 }, { "epoch": 2.362531433361274, "grad_norm": 9.028946876525879, "learning_rate": 1.9247305976391074e-06, "loss": 2.0141, "step": 1439 }, { "epoch": 2.3642078792958925, "grad_norm": 9.077788352966309, "learning_rate": 1.914082071221145e-06, "loss": 2.2771, "step": 1440 }, { "epoch": 2.3658843252305113, "grad_norm": 8.733525276184082, "learning_rate": 1.903459964596267e-06, "loss": 2.1378, "step": 1441 }, { "epoch": 2.36756077116513, "grad_norm": 5.042154788970947, "learning_rate": 1.8928643124709622e-06, "loss": 2.262, "step": 1442 }, { "epoch": 2.3692372170997484, "grad_norm": 7.528419017791748, "learning_rate": 1.8822951494652852e-06, "loss": 2.0493, "step": 1443 }, { "epoch": 2.3709136630343672, "grad_norm": 6.6567063331604, "learning_rate": 1.8717525101127442e-06, "loss": 2.5786, "step": 1444 }, { "epoch": 2.3725901089689856, "grad_norm": 11.961003303527832, "learning_rate": 1.8612364288601836e-06, "loss": 2.2694, "step": 1445 }, { "epoch": 2.3742665549036044, "grad_norm": 5.76288366317749, "learning_rate": 1.8507469400676736e-06, "loss": 2.069, "step": 1446 }, { "epoch": 2.375943000838223, "grad_norm": 7.505050182342529, "learning_rate": 1.840284078008393e-06, "loss": 2.0601, "step": 1447 }, { "epoch": 2.3776194467728415, "grad_norm": 11.826309204101562, "learning_rate": 1.8298478768685146e-06, "loss": 2.3673, "step": 1448 }, { "epoch": 2.3792958927074603, "grad_norm": 6.623786449432373, "learning_rate": 1.819438370747121e-06, "loss": 2.2409, "step": 1449 }, { "epoch": 2.3809723386420787, "grad_norm": 5.3532891273498535, "learning_rate": 1.8090555936560517e-06, "loss": 1.9787, "step": 1450 }, { "epoch": 2.3826487845766975, "grad_norm": 12.599485397338867, "learning_rate": 1.798699579519817e-06, "loss": 1.9842, "step": 1451 }, { "epoch": 2.384325230511316, "grad_norm": 4.9249773025512695, "learning_rate": 1.7883703621754855e-06, "loss": 2.3094, "step": 1452 }, { "epoch": 2.3860016764459346, "grad_norm": 6.108479022979736, "learning_rate": 1.7780679753725716e-06, "loss": 1.9537, "step": 1453 }, { "epoch": 2.387678122380553, "grad_norm": 7.488571643829346, "learning_rate": 1.7677924527729228e-06, "loss": 2.261, "step": 1454 }, { "epoch": 2.389354568315172, "grad_norm": 9.828038215637207, "learning_rate": 1.7575438279506075e-06, "loss": 2.2053, "step": 1455 }, { "epoch": 2.3910310142497906, "grad_norm": 4.574258327484131, "learning_rate": 1.7473221343918057e-06, "loss": 2.0829, "step": 1456 }, { "epoch": 2.392707460184409, "grad_norm": 5.391171455383301, "learning_rate": 1.7371274054947196e-06, "loss": 1.8422, "step": 1457 }, { "epoch": 2.3943839061190277, "grad_norm": 16.221250534057617, "learning_rate": 1.7269596745694295e-06, "loss": 1.9086, "step": 1458 }, { "epoch": 2.396060352053646, "grad_norm": 8.319306373596191, "learning_rate": 1.7168189748378084e-06, "loss": 2.0303, "step": 1459 }, { "epoch": 2.397736797988265, "grad_norm": 32.93059539794922, "learning_rate": 1.70670533943341e-06, "loss": 1.938, "step": 1460 }, { "epoch": 2.3994132439228837, "grad_norm": 7.017632007598877, "learning_rate": 1.6966188014013574e-06, "loss": 2.1977, "step": 1461 }, { "epoch": 2.401089689857502, "grad_norm": 9.0474214553833, "learning_rate": 1.6865593936982417e-06, "loss": 2.1389, "step": 1462 }, { "epoch": 2.402766135792121, "grad_norm": 7.278559684753418, "learning_rate": 1.6765271491919977e-06, "loss": 1.8519, "step": 1463 }, { "epoch": 2.404442581726739, "grad_norm": 6.806189060211182, "learning_rate": 1.6665221006618093e-06, "loss": 2.1755, "step": 1464 }, { "epoch": 2.406119027661358, "grad_norm": 6.3243937492370605, "learning_rate": 1.656544280798017e-06, "loss": 1.8707, "step": 1465 }, { "epoch": 2.4077954735959763, "grad_norm": 7.006830215454102, "learning_rate": 1.6465937222019745e-06, "loss": 2.2122, "step": 1466 }, { "epoch": 2.409471919530595, "grad_norm": 4.942697525024414, "learning_rate": 1.6366704573859692e-06, "loss": 2.1237, "step": 1467 }, { "epoch": 2.411148365465214, "grad_norm": 4.9030327796936035, "learning_rate": 1.6267745187731142e-06, "loss": 2.103, "step": 1468 }, { "epoch": 2.4128248113998323, "grad_norm": 15.216239929199219, "learning_rate": 1.6169059386972342e-06, "loss": 2.0042, "step": 1469 }, { "epoch": 2.414501257334451, "grad_norm": 13.778632164001465, "learning_rate": 1.6070647494027624e-06, "loss": 2.2192, "step": 1470 }, { "epoch": 2.4161777032690694, "grad_norm": 5.1935296058654785, "learning_rate": 1.597250983044637e-06, "loss": 2.0851, "step": 1471 }, { "epoch": 2.4178541492036882, "grad_norm": 6.960971355438232, "learning_rate": 1.587464671688187e-06, "loss": 2.1853, "step": 1472 }, { "epoch": 2.4195305951383066, "grad_norm": 21.46315574645996, "learning_rate": 1.5777058473090534e-06, "loss": 2.2485, "step": 1473 }, { "epoch": 2.4212070410729254, "grad_norm": 5.483536720275879, "learning_rate": 1.5679745417930515e-06, "loss": 1.8533, "step": 1474 }, { "epoch": 2.422883487007544, "grad_norm": 10.926675796508789, "learning_rate": 1.5582707869360825e-06, "loss": 2.417, "step": 1475 }, { "epoch": 2.4245599329421625, "grad_norm": 5.486928462982178, "learning_rate": 1.5485946144440357e-06, "loss": 2.3654, "step": 1476 }, { "epoch": 2.4262363788767813, "grad_norm": 9.436552047729492, "learning_rate": 1.5389460559326753e-06, "loss": 2.2155, "step": 1477 }, { "epoch": 2.4279128248113997, "grad_norm": 7.809107780456543, "learning_rate": 1.5293251429275424e-06, "loss": 2.3929, "step": 1478 }, { "epoch": 2.4295892707460185, "grad_norm": 7.403750896453857, "learning_rate": 1.519731906863846e-06, "loss": 2.1066, "step": 1479 }, { "epoch": 2.4312657166806373, "grad_norm": 6.136329174041748, "learning_rate": 1.5101663790863597e-06, "loss": 2.2792, "step": 1480 }, { "epoch": 2.4329421626152556, "grad_norm": 9.608780860900879, "learning_rate": 1.500628590849339e-06, "loss": 2.3297, "step": 1481 }, { "epoch": 2.4346186085498744, "grad_norm": 9.624567985534668, "learning_rate": 1.4911185733163858e-06, "loss": 1.9665, "step": 1482 }, { "epoch": 2.436295054484493, "grad_norm": 16.837739944458008, "learning_rate": 1.4816363575603764e-06, "loss": 2.3727, "step": 1483 }, { "epoch": 2.4379715004191116, "grad_norm": 10.557210922241211, "learning_rate": 1.4721819745633448e-06, "loss": 2.1602, "step": 1484 }, { "epoch": 2.43964794635373, "grad_norm": 8.435738563537598, "learning_rate": 1.462755455216376e-06, "loss": 2.4873, "step": 1485 }, { "epoch": 2.4413243922883487, "grad_norm": 5.892087936401367, "learning_rate": 1.4533568303195333e-06, "loss": 2.6876, "step": 1486 }, { "epoch": 2.443000838222967, "grad_norm": 14.025276184082031, "learning_rate": 1.4439861305817204e-06, "loss": 2.0792, "step": 1487 }, { "epoch": 2.444677284157586, "grad_norm": 3.751133918762207, "learning_rate": 1.4346433866206044e-06, "loss": 1.9906, "step": 1488 }, { "epoch": 2.4463537300922047, "grad_norm": 13.324430465698242, "learning_rate": 1.4253286289625113e-06, "loss": 2.1824, "step": 1489 }, { "epoch": 2.448030176026823, "grad_norm": 7.326375961303711, "learning_rate": 1.4160418880423254e-06, "loss": 2.0631, "step": 1490 }, { "epoch": 2.448030176026823, "eval_loss": 2.231937885284424, "eval_runtime": 170.5921, "eval_samples_per_second": 3.078, "eval_steps_per_second": 1.542, "step": 1490 }, { "epoch": 2.449706621961442, "grad_norm": 8.523940086364746, "learning_rate": 1.4067831942033904e-06, "loss": 2.2783, "step": 1491 }, { "epoch": 2.45138306789606, "grad_norm": 5.17748498916626, "learning_rate": 1.3975525776974065e-06, "loss": 1.7464, "step": 1492 }, { "epoch": 2.453059513830679, "grad_norm": 14.52001953125, "learning_rate": 1.3883500686843309e-06, "loss": 2.1745, "step": 1493 }, { "epoch": 2.454735959765298, "grad_norm": 4.2642388343811035, "learning_rate": 1.3791756972322968e-06, "loss": 2.089, "step": 1494 }, { "epoch": 2.456412405699916, "grad_norm": 19.937469482421875, "learning_rate": 1.3700294933174874e-06, "loss": 2.2071, "step": 1495 }, { "epoch": 2.458088851634535, "grad_norm": 7.203943252563477, "learning_rate": 1.3609114868240537e-06, "loss": 2.2084, "step": 1496 }, { "epoch": 2.4597652975691533, "grad_norm": 14.399893760681152, "learning_rate": 1.3518217075440209e-06, "loss": 1.9919, "step": 1497 }, { "epoch": 2.461441743503772, "grad_norm": 6.565903186798096, "learning_rate": 1.342760185177181e-06, "loss": 2.2743, "step": 1498 }, { "epoch": 2.4631181894383904, "grad_norm": 5.243765354156494, "learning_rate": 1.3337269493310045e-06, "loss": 2.0915, "step": 1499 }, { "epoch": 2.4647946353730092, "grad_norm": 6.328683376312256, "learning_rate": 1.324722029520531e-06, "loss": 2.1317, "step": 1500 }, { "epoch": 2.4664710813076276, "grad_norm": 12.934956550598145, "learning_rate": 1.3157454551682824e-06, "loss": 2.4827, "step": 1501 }, { "epoch": 2.4681475272422464, "grad_norm": 7.548893451690674, "learning_rate": 1.3067972556041753e-06, "loss": 2.0718, "step": 1502 }, { "epoch": 2.469823973176865, "grad_norm": 8.60559368133545, "learning_rate": 1.297877460065403e-06, "loss": 1.9854, "step": 1503 }, { "epoch": 2.4715004191114835, "grad_norm": 7.519042015075684, "learning_rate": 1.2889860976963542e-06, "loss": 2.0767, "step": 1504 }, { "epoch": 2.4731768650461023, "grad_norm": 11.122941970825195, "learning_rate": 1.2801231975485208e-06, "loss": 2.3641, "step": 1505 }, { "epoch": 2.4748533109807207, "grad_norm": 17.30807876586914, "learning_rate": 1.2712887885803937e-06, "loss": 2.1318, "step": 1506 }, { "epoch": 2.4765297569153395, "grad_norm": 12.969422340393066, "learning_rate": 1.2624828996573756e-06, "loss": 2.3816, "step": 1507 }, { "epoch": 2.4782062028499583, "grad_norm": 20.083126068115234, "learning_rate": 1.2537055595516778e-06, "loss": 2.073, "step": 1508 }, { "epoch": 2.4798826487845766, "grad_norm": 8.024612426757812, "learning_rate": 1.2449567969422315e-06, "loss": 2.0068, "step": 1509 }, { "epoch": 2.4815590947191954, "grad_norm": 4.503040790557861, "learning_rate": 1.2362366404146053e-06, "loss": 2.4259, "step": 1510 }, { "epoch": 2.483235540653814, "grad_norm": 9.55201244354248, "learning_rate": 1.227545118460889e-06, "loss": 2.1942, "step": 1511 }, { "epoch": 2.4849119865884326, "grad_norm": 7.338901996612549, "learning_rate": 1.2188822594796146e-06, "loss": 2.1969, "step": 1512 }, { "epoch": 2.486588432523051, "grad_norm": 11.002367973327637, "learning_rate": 1.2102480917756632e-06, "loss": 2.4189, "step": 1513 }, { "epoch": 2.4882648784576697, "grad_norm": 5.696709632873535, "learning_rate": 1.2016426435601703e-06, "loss": 2.4543, "step": 1514 }, { "epoch": 2.489941324392288, "grad_norm": 21.405759811401367, "learning_rate": 1.1930659429504376e-06, "loss": 2.2616, "step": 1515 }, { "epoch": 2.491617770326907, "grad_norm": 9.35830020904541, "learning_rate": 1.184518017969829e-06, "loss": 2.5454, "step": 1516 }, { "epoch": 2.4932942162615257, "grad_norm": 5.365878582000732, "learning_rate": 1.1759988965476888e-06, "loss": 2.2917, "step": 1517 }, { "epoch": 2.494970662196144, "grad_norm": 13.6166353225708, "learning_rate": 1.1675086065192586e-06, "loss": 1.9612, "step": 1518 }, { "epoch": 2.496647108130763, "grad_norm": 5.654745578765869, "learning_rate": 1.1590471756255683e-06, "loss": 2.2421, "step": 1519 }, { "epoch": 2.498323554065381, "grad_norm": 7.4756178855896, "learning_rate": 1.1506146315133527e-06, "loss": 2.3553, "step": 1520 }, { "epoch": 2.5, "grad_norm": 7.051192283630371, "learning_rate": 1.1422110017349676e-06, "loss": 2.165, "step": 1521 }, { "epoch": 2.501676445934619, "grad_norm": 5.298817157745361, "learning_rate": 1.1338363137482922e-06, "loss": 2.0893, "step": 1522 }, { "epoch": 2.503352891869237, "grad_norm": 5.511155605316162, "learning_rate": 1.1254905949166438e-06, "loss": 2.108, "step": 1523 }, { "epoch": 2.505029337803856, "grad_norm": 24.017852783203125, "learning_rate": 1.1171738725086833e-06, "loss": 2.0972, "step": 1524 }, { "epoch": 2.5067057837384743, "grad_norm": 4.4245219230651855, "learning_rate": 1.108886173698326e-06, "loss": 2.2732, "step": 1525 }, { "epoch": 2.508382229673093, "grad_norm": 6.6978936195373535, "learning_rate": 1.100627525564668e-06, "loss": 1.984, "step": 1526 }, { "epoch": 2.510058675607712, "grad_norm": 5.836032867431641, "learning_rate": 1.0923979550918762e-06, "loss": 1.9825, "step": 1527 }, { "epoch": 2.5117351215423303, "grad_norm": 7.63114595413208, "learning_rate": 1.0841974891691066e-06, "loss": 2.2135, "step": 1528 }, { "epoch": 2.5134115674769486, "grad_norm": 9.540766716003418, "learning_rate": 1.0760261545904294e-06, "loss": 2.1814, "step": 1529 }, { "epoch": 2.5150880134115674, "grad_norm": 8.15484619140625, "learning_rate": 1.0678839780547246e-06, "loss": 2.2068, "step": 1530 }, { "epoch": 2.516764459346186, "grad_norm": 5.961563587188721, "learning_rate": 1.0597709861656057e-06, "loss": 2.458, "step": 1531 }, { "epoch": 2.5184409052808046, "grad_norm": 17.800792694091797, "learning_rate": 1.0516872054313264e-06, "loss": 2.2997, "step": 1532 }, { "epoch": 2.5201173512154234, "grad_norm": 9.044431686401367, "learning_rate": 1.0436326622646941e-06, "loss": 2.024, "step": 1533 }, { "epoch": 2.5217937971500417, "grad_norm": 7.718180179595947, "learning_rate": 1.0356073829829893e-06, "loss": 2.4786, "step": 1534 }, { "epoch": 2.5234702430846605, "grad_norm": 9.294529914855957, "learning_rate": 1.0276113938078768e-06, "loss": 2.1424, "step": 1535 }, { "epoch": 2.5251466890192793, "grad_norm": 8.191193580627441, "learning_rate": 1.0196447208653193e-06, "loss": 2.2498, "step": 1536 }, { "epoch": 2.5268231349538977, "grad_norm": 13.213946342468262, "learning_rate": 1.011707390185489e-06, "loss": 2.231, "step": 1537 }, { "epoch": 2.5284995808885165, "grad_norm": 6.670695781707764, "learning_rate": 1.003799427702684e-06, "loss": 2.6511, "step": 1538 }, { "epoch": 2.530176026823135, "grad_norm": 4.2688140869140625, "learning_rate": 9.959208592552549e-07, "loss": 2.2819, "step": 1539 }, { "epoch": 2.5318524727577536, "grad_norm": 16.4216365814209, "learning_rate": 9.88071710585503e-07, "loss": 2.0831, "step": 1540 }, { "epoch": 2.5335289186923724, "grad_norm": 6.223483562469482, "learning_rate": 9.802520073396016e-07, "loss": 2.3864, "step": 1541 }, { "epoch": 2.5352053646269908, "grad_norm": 11.680347442626953, "learning_rate": 9.724617750675214e-07, "loss": 2.2259, "step": 1542 }, { "epoch": 2.5368818105616096, "grad_norm": 6.158958435058594, "learning_rate": 9.647010392229373e-07, "loss": 2.3039, "step": 1543 }, { "epoch": 2.538558256496228, "grad_norm": 8.861401557922363, "learning_rate": 9.569698251631487e-07, "loss": 2.5107, "step": 1544 }, { "epoch": 2.5402347024308467, "grad_norm": 4.6398138999938965, "learning_rate": 9.492681581489938e-07, "loss": 1.9874, "step": 1545 }, { "epoch": 2.541911148365465, "grad_norm": 12.700160026550293, "learning_rate": 9.415960633447674e-07, "loss": 2.4402, "step": 1546 }, { "epoch": 2.543587594300084, "grad_norm": 10.315913200378418, "learning_rate": 9.339535658181497e-07, "loss": 2.1393, "step": 1547 }, { "epoch": 2.545264040234702, "grad_norm": 6.171055316925049, "learning_rate": 9.263406905401085e-07, "loss": 1.7812, "step": 1548 }, { "epoch": 2.546940486169321, "grad_norm": 8.021125793457031, "learning_rate": 9.187574623848206e-07, "loss": 2.0037, "step": 1549 }, { "epoch": 2.54861693210394, "grad_norm": 7.094779968261719, "learning_rate": 9.112039061296019e-07, "loss": 2.1047, "step": 1550 }, { "epoch": 2.550293378038558, "grad_norm": 5.408681869506836, "learning_rate": 9.036800464548157e-07, "loss": 2.2368, "step": 1551 }, { "epoch": 2.551969823973177, "grad_norm": 5.102510452270508, "learning_rate": 8.961859079437974e-07, "loss": 2.2142, "step": 1552 }, { "epoch": 2.5536462699077953, "grad_norm": 5.690503120422363, "learning_rate": 8.887215150827677e-07, "loss": 2.3129, "step": 1553 }, { "epoch": 2.555322715842414, "grad_norm": 19.20132827758789, "learning_rate": 8.812868922607565e-07, "loss": 2.2056, "step": 1554 }, { "epoch": 2.556999161777033, "grad_norm": 4.1225810050964355, "learning_rate": 8.738820637695322e-07, "loss": 2.0504, "step": 1555 }, { "epoch": 2.5586756077116513, "grad_norm": 5.147797107696533, "learning_rate": 8.665070538035037e-07, "loss": 2.0496, "step": 1556 }, { "epoch": 2.56035205364627, "grad_norm": 5.437122344970703, "learning_rate": 8.591618864596541e-07, "loss": 2.3919, "step": 1557 }, { "epoch": 2.5620284995808884, "grad_norm": 4.14401388168335, "learning_rate": 8.518465857374636e-07, "loss": 1.8546, "step": 1558 }, { "epoch": 2.563704945515507, "grad_norm": 18.75957679748535, "learning_rate": 8.445611755388205e-07, "loss": 2.1602, "step": 1559 }, { "epoch": 2.565381391450126, "grad_norm": 4.997353553771973, "learning_rate": 8.373056796679568e-07, "loss": 1.967, "step": 1560 }, { "epoch": 2.5670578373847444, "grad_norm": 7.487930774688721, "learning_rate": 8.300801218313548e-07, "loss": 2.5101, "step": 1561 }, { "epoch": 2.5687342833193627, "grad_norm": 18.071205139160156, "learning_rate": 8.228845256376794e-07, "loss": 2.2558, "step": 1562 }, { "epoch": 2.5704107292539815, "grad_norm": 7.025851249694824, "learning_rate": 8.157189145977074e-07, "loss": 2.1498, "step": 1563 }, { "epoch": 2.5720871751886003, "grad_norm": 8.819117546081543, "learning_rate": 8.085833121242326e-07, "loss": 2.1667, "step": 1564 }, { "epoch": 2.5737636211232187, "grad_norm": 9.051352500915527, "learning_rate": 8.01477741532003e-07, "loss": 2.4935, "step": 1565 }, { "epoch": 2.5754400670578375, "grad_norm": 10.156190872192383, "learning_rate": 7.944022260376416e-07, "loss": 2.2195, "step": 1566 }, { "epoch": 2.577116512992456, "grad_norm": 5.755540370941162, "learning_rate": 7.873567887595702e-07, "loss": 2.2316, "step": 1567 }, { "epoch": 2.5787929589270746, "grad_norm": 5.415092468261719, "learning_rate": 7.803414527179343e-07, "loss": 2.0278, "step": 1568 }, { "epoch": 2.5804694048616934, "grad_norm": 7.1985182762146, "learning_rate": 7.73356240834523e-07, "loss": 2.1752, "step": 1569 }, { "epoch": 2.5821458507963118, "grad_norm": 4.371781826019287, "learning_rate": 7.664011759326984e-07, "loss": 2.2419, "step": 1570 }, { "epoch": 2.5838222967309306, "grad_norm": 6.6038336753845215, "learning_rate": 7.594762807373313e-07, "loss": 2.2207, "step": 1571 }, { "epoch": 2.585498742665549, "grad_norm": 21.03390884399414, "learning_rate": 7.525815778747025e-07, "loss": 1.8768, "step": 1572 }, { "epoch": 2.5871751886001677, "grad_norm": 5.654022693634033, "learning_rate": 7.457170898724487e-07, "loss": 2.1449, "step": 1573 }, { "epoch": 2.5888516345347865, "grad_norm": 6.087095737457275, "learning_rate": 7.388828391594849e-07, "loss": 2.1637, "step": 1574 }, { "epoch": 2.590528080469405, "grad_norm": 5.222080707550049, "learning_rate": 7.320788480659269e-07, "loss": 2.3952, "step": 1575 }, { "epoch": 2.592204526404023, "grad_norm": 10.92801570892334, "learning_rate": 7.253051388230248e-07, "loss": 2.1145, "step": 1576 }, { "epoch": 2.593880972338642, "grad_norm": 6.291825294494629, "learning_rate": 7.18561733563079e-07, "loss": 2.3148, "step": 1577 }, { "epoch": 2.595557418273261, "grad_norm": 11.299331665039062, "learning_rate": 7.118486543193781e-07, "loss": 2.228, "step": 1578 }, { "epoch": 2.597233864207879, "grad_norm": 11.442245483398438, "learning_rate": 7.051659230261299e-07, "loss": 2.216, "step": 1579 }, { "epoch": 2.598910310142498, "grad_norm": 10.759445190429688, "learning_rate": 6.985135615183758e-07, "loss": 2.1715, "step": 1580 }, { "epoch": 2.6005867560771163, "grad_norm": 5.001315116882324, "learning_rate": 6.918915915319302e-07, "loss": 2.0885, "step": 1581 }, { "epoch": 2.602263202011735, "grad_norm": 20.08878517150879, "learning_rate": 6.853000347033078e-07, "loss": 2.0068, "step": 1582 }, { "epoch": 2.603939647946354, "grad_norm": 9.580506324768066, "learning_rate": 6.787389125696508e-07, "loss": 2.636, "step": 1583 }, { "epoch": 2.6056160938809723, "grad_norm": 5.075564384460449, "learning_rate": 6.72208246568663e-07, "loss": 2.0567, "step": 1584 }, { "epoch": 2.607292539815591, "grad_norm": 9.786645889282227, "learning_rate": 6.6570805803853e-07, "loss": 1.9531, "step": 1585 }, { "epoch": 2.6089689857502094, "grad_norm": 8.57758903503418, "learning_rate": 6.592383682178593e-07, "loss": 2.5005, "step": 1586 }, { "epoch": 2.610645431684828, "grad_norm": 15.163758277893066, "learning_rate": 6.527991982456072e-07, "loss": 2.1622, "step": 1587 }, { "epoch": 2.612321877619447, "grad_norm": 14.070173263549805, "learning_rate": 6.463905691610117e-07, "loss": 2.438, "step": 1588 }, { "epoch": 2.6139983235540654, "grad_norm": 7.3514509201049805, "learning_rate": 6.400125019035208e-07, "loss": 1.9967, "step": 1589 }, { "epoch": 2.6156747694886837, "grad_norm": 9.949421882629395, "learning_rate": 6.336650173127224e-07, "loss": 1.9762, "step": 1590 }, { "epoch": 2.6173512154233025, "grad_norm": 38.16314697265625, "learning_rate": 6.273481361282807e-07, "loss": 2.1642, "step": 1591 }, { "epoch": 2.6190276613579213, "grad_norm": 9.410321235656738, "learning_rate": 6.210618789898726e-07, "loss": 2.1192, "step": 1592 }, { "epoch": 2.6207041072925397, "grad_norm": 5.9524641036987305, "learning_rate": 6.148062664371068e-07, "loss": 1.8401, "step": 1593 }, { "epoch": 2.6223805532271585, "grad_norm": 6.7698235511779785, "learning_rate": 6.085813189094658e-07, "loss": 1.8851, "step": 1594 }, { "epoch": 2.624056999161777, "grad_norm": 8.685763359069824, "learning_rate": 6.023870567462397e-07, "loss": 2.3776, "step": 1595 }, { "epoch": 2.6257334450963956, "grad_norm": 8.412744522094727, "learning_rate": 5.96223500186458e-07, "loss": 2.067, "step": 1596 }, { "epoch": 2.6274098910310144, "grad_norm": 7.181644916534424, "learning_rate": 5.900906693688236e-07, "loss": 2.3699, "step": 1597 }, { "epoch": 2.6290863369656328, "grad_norm": 8.961746215820312, "learning_rate": 5.839885843316439e-07, "loss": 2.2746, "step": 1598 }, { "epoch": 2.6307627829002516, "grad_norm": 7.9360785484313965, "learning_rate": 5.779172650127674e-07, "loss": 2.087, "step": 1599 }, { "epoch": 2.63243922883487, "grad_norm": 8.271594047546387, "learning_rate": 5.718767312495255e-07, "loss": 2.3263, "step": 1600 }, { "epoch": 2.6341156747694887, "grad_norm": 22.34787368774414, "learning_rate": 5.658670027786561e-07, "loss": 2.0452, "step": 1601 }, { "epoch": 2.6357921207041075, "grad_norm": 18.66659164428711, "learning_rate": 5.598880992362432e-07, "loss": 2.102, "step": 1602 }, { "epoch": 2.637468566638726, "grad_norm": 4.952098846435547, "learning_rate": 5.539400401576578e-07, "loss": 2.1685, "step": 1603 }, { "epoch": 2.6391450125733447, "grad_norm": 10.171954154968262, "learning_rate": 5.480228449774882e-07, "loss": 2.2068, "step": 1604 }, { "epoch": 2.640821458507963, "grad_norm": 8.009418487548828, "learning_rate": 5.421365330294814e-07, "loss": 2.0208, "step": 1605 }, { "epoch": 2.642497904442582, "grad_norm": 6.748593330383301, "learning_rate": 5.362811235464727e-07, "loss": 2.1862, "step": 1606 }, { "epoch": 2.6441743503772006, "grad_norm": 7.552306652069092, "learning_rate": 5.30456635660327e-07, "loss": 2.4543, "step": 1607 }, { "epoch": 2.645850796311819, "grad_norm": 10.333016395568848, "learning_rate": 5.246630884018844e-07, "loss": 2.2099, "step": 1608 }, { "epoch": 2.6475272422464373, "grad_norm": 5.712170600891113, "learning_rate": 5.189005007008796e-07, "loss": 2.1597, "step": 1609 }, { "epoch": 2.649203688181056, "grad_norm": 4.704019069671631, "learning_rate": 5.131688913858956e-07, "loss": 1.9806, "step": 1610 }, { "epoch": 2.650880134115675, "grad_norm": 6.9373345375061035, "learning_rate": 5.074682791842988e-07, "loss": 2.0318, "step": 1611 }, { "epoch": 2.6525565800502933, "grad_norm": 11.29400634765625, "learning_rate": 5.017986827221733e-07, "loss": 2.2923, "step": 1612 }, { "epoch": 2.654233025984912, "grad_norm": 9.595968246459961, "learning_rate": 4.96160120524265e-07, "loss": 2.4366, "step": 1613 }, { "epoch": 2.6559094719195304, "grad_norm": 4.494987964630127, "learning_rate": 4.905526110139159e-07, "loss": 2.0897, "step": 1614 }, { "epoch": 2.657585917854149, "grad_norm": 5.816176891326904, "learning_rate": 4.849761725130076e-07, "loss": 1.9899, "step": 1615 }, { "epoch": 2.659262363788768, "grad_norm": 13.64667797088623, "learning_rate": 4.794308232419065e-07, "loss": 2.172, "step": 1616 }, { "epoch": 2.6609388097233864, "grad_norm": 4.845463275909424, "learning_rate": 4.7391658131939244e-07, "loss": 2.0132, "step": 1617 }, { "epoch": 2.662615255658005, "grad_norm": 18.137544631958008, "learning_rate": 4.6843346476260434e-07, "loss": 2.1996, "step": 1618 }, { "epoch": 2.6642917015926235, "grad_norm": 14.117281913757324, "learning_rate": 4.629814914869879e-07, "loss": 2.1581, "step": 1619 }, { "epoch": 2.6659681475272423, "grad_norm": 9.738118171691895, "learning_rate": 4.5756067930623037e-07, "loss": 2.1434, "step": 1620 }, { "epoch": 2.667644593461861, "grad_norm": 5.329063415527344, "learning_rate": 4.521710459322015e-07, "loss": 2.4944, "step": 1621 }, { "epoch": 2.6693210393964795, "grad_norm": 7.685181140899658, "learning_rate": 4.468126089749003e-07, "loss": 2.7036, "step": 1622 }, { "epoch": 2.670997485331098, "grad_norm": 5.141359806060791, "learning_rate": 4.4148538594239176e-07, "loss": 2.3699, "step": 1623 }, { "epoch": 2.6726739312657166, "grad_norm": 6.516773223876953, "learning_rate": 4.36189394240758e-07, "loss": 2.5436, "step": 1624 }, { "epoch": 2.6743503772003354, "grad_norm": 6.696847438812256, "learning_rate": 4.309246511740339e-07, "loss": 1.9162, "step": 1625 }, { "epoch": 2.6760268231349538, "grad_norm": 4.967213153839111, "learning_rate": 4.256911739441505e-07, "loss": 2.1733, "step": 1626 }, { "epoch": 2.6777032690695726, "grad_norm": 14.924220085144043, "learning_rate": 4.2048897965088487e-07, "loss": 2.1359, "step": 1627 }, { "epoch": 2.679379715004191, "grad_norm": 6.377952575683594, "learning_rate": 4.153180852918004e-07, "loss": 2.3113, "step": 1628 }, { "epoch": 2.6810561609388097, "grad_norm": 7.470493793487549, "learning_rate": 4.1017850776219224e-07, "loss": 2.2714, "step": 1629 }, { "epoch": 2.6827326068734285, "grad_norm": 6.198121070861816, "learning_rate": 4.0507026385502747e-07, "loss": 2.3614, "step": 1630 }, { "epoch": 2.684409052808047, "grad_norm": 5.405788421630859, "learning_rate": 3.999933702608949e-07, "loss": 2.0414, "step": 1631 }, { "epoch": 2.6860854987426657, "grad_norm": 10.905454635620117, "learning_rate": 3.949478435679577e-07, "loss": 1.9939, "step": 1632 }, { "epoch": 2.687761944677284, "grad_norm": 9.610267639160156, "learning_rate": 3.89933700261883e-07, "loss": 2.2963, "step": 1633 }, { "epoch": 2.689438390611903, "grad_norm": 9.685226440429688, "learning_rate": 3.8495095672579584e-07, "loss": 1.867, "step": 1634 }, { "epoch": 2.6911148365465216, "grad_norm": 5.048326015472412, "learning_rate": 3.7999962924023304e-07, "loss": 2.2468, "step": 1635 }, { "epoch": 2.69279128248114, "grad_norm": 6.6950836181640625, "learning_rate": 3.7507973398307584e-07, "loss": 2.4554, "step": 1636 }, { "epoch": 2.6944677284157583, "grad_norm": 4.78188419342041, "learning_rate": 3.7019128702951213e-07, "loss": 2.1269, "step": 1637 }, { "epoch": 2.696144174350377, "grad_norm": 5.873968601226807, "learning_rate": 3.653343043519686e-07, "loss": 1.9546, "step": 1638 }, { "epoch": 2.697820620284996, "grad_norm": 3.986678123474121, "learning_rate": 3.6050880182006864e-07, "loss": 1.9768, "step": 1639 }, { "epoch": 2.697820620284996, "eval_loss": 2.2314939498901367, "eval_runtime": 171.6601, "eval_samples_per_second": 3.058, "eval_steps_per_second": 1.532, "step": 1639 }, { "epoch": 2.6994970662196143, "grad_norm": 6.747722148895264, "learning_rate": 3.557147952005813e-07, "loss": 2.4473, "step": 1640 }, { "epoch": 2.701173512154233, "grad_norm": 8.736268997192383, "learning_rate": 3.5095230015736113e-07, "loss": 2.098, "step": 1641 }, { "epoch": 2.7028499580888514, "grad_norm": 16.439146041870117, "learning_rate": 3.462213322513075e-07, "loss": 2.1514, "step": 1642 }, { "epoch": 2.70452640402347, "grad_norm": 6.314731597900391, "learning_rate": 3.415219069403042e-07, "loss": 2.0459, "step": 1643 }, { "epoch": 2.706202849958089, "grad_norm": 15.164846420288086, "learning_rate": 3.3685403957917195e-07, "loss": 2.0335, "step": 1644 }, { "epoch": 2.7078792958927074, "grad_norm": 7.011516571044922, "learning_rate": 3.322177454196285e-07, "loss": 2.2174, "step": 1645 }, { "epoch": 2.709555741827326, "grad_norm": 7.147578716278076, "learning_rate": 3.276130396102217e-07, "loss": 2.1493, "step": 1646 }, { "epoch": 2.7112321877619445, "grad_norm": 9.694156646728516, "learning_rate": 3.230399371962878e-07, "loss": 2.0447, "step": 1647 }, { "epoch": 2.7129086336965633, "grad_norm": 7.1735100746154785, "learning_rate": 3.1849845311990757e-07, "loss": 2.0439, "step": 1648 }, { "epoch": 2.714585079631182, "grad_norm": 10.528395652770996, "learning_rate": 3.139886022198502e-07, "loss": 2.2992, "step": 1649 }, { "epoch": 2.7162615255658005, "grad_norm": 9.272653579711914, "learning_rate": 3.095103992315263e-07, "loss": 2.2504, "step": 1650 }, { "epoch": 2.7179379715004193, "grad_norm": 6.080860614776611, "learning_rate": 3.0506385878694255e-07, "loss": 1.9448, "step": 1651 }, { "epoch": 2.7196144174350376, "grad_norm": 6.053015232086182, "learning_rate": 3.0064899541464856e-07, "loss": 2.2458, "step": 1652 }, { "epoch": 2.7212908633696564, "grad_norm": 11.146768569946289, "learning_rate": 2.9626582353969756e-07, "loss": 2.2337, "step": 1653 }, { "epoch": 2.7229673093042748, "grad_norm": 6.148212432861328, "learning_rate": 2.9191435748359144e-07, "loss": 2.0337, "step": 1654 }, { "epoch": 2.7246437552388936, "grad_norm": 11.22092056274414, "learning_rate": 2.8759461146423694e-07, "loss": 2.2422, "step": 1655 }, { "epoch": 2.726320201173512, "grad_norm": 5.4355316162109375, "learning_rate": 2.8330659959589944e-07, "loss": 2.2251, "step": 1656 }, { "epoch": 2.7279966471081307, "grad_norm": 6.184296131134033, "learning_rate": 2.7905033588915944e-07, "loss": 2.4166, "step": 1657 }, { "epoch": 2.7296730930427495, "grad_norm": 5.829388618469238, "learning_rate": 2.748258342508614e-07, "loss": 2.1479, "step": 1658 }, { "epoch": 2.731349538977368, "grad_norm": 4.974775791168213, "learning_rate": 2.706331084840697e-07, "loss": 2.2435, "step": 1659 }, { "epoch": 2.7330259849119867, "grad_norm": 5.624755382537842, "learning_rate": 2.664721722880259e-07, "loss": 1.7193, "step": 1660 }, { "epoch": 2.734702430846605, "grad_norm": 7.836221694946289, "learning_rate": 2.623430392581061e-07, "loss": 1.9878, "step": 1661 }, { "epoch": 2.736378876781224, "grad_norm": 5.058412075042725, "learning_rate": 2.5824572288577043e-07, "loss": 2.1475, "step": 1662 }, { "epoch": 2.7380553227158426, "grad_norm": 11.342087745666504, "learning_rate": 2.541802365585189e-07, "loss": 2.0362, "step": 1663 }, { "epoch": 2.739731768650461, "grad_norm": 8.350056648254395, "learning_rate": 2.501465935598557e-07, "loss": 2.1253, "step": 1664 }, { "epoch": 2.7414082145850798, "grad_norm": 7.950875282287598, "learning_rate": 2.461448070692374e-07, "loss": 2.1074, "step": 1665 }, { "epoch": 2.743084660519698, "grad_norm": 4.998746395111084, "learning_rate": 2.421748901620369e-07, "loss": 2.3507, "step": 1666 }, { "epoch": 2.744761106454317, "grad_norm": 7.144976615905762, "learning_rate": 2.3823685580949273e-07, "loss": 2.2031, "step": 1667 }, { "epoch": 2.7464375523889357, "grad_norm": 5.609632968902588, "learning_rate": 2.3433071687867014e-07, "loss": 2.1873, "step": 1668 }, { "epoch": 2.748113998323554, "grad_norm": 4.145407199859619, "learning_rate": 2.3045648613242545e-07, "loss": 2.2014, "step": 1669 }, { "epoch": 2.7497904442581724, "grad_norm": 4.666073799133301, "learning_rate": 2.26614176229355e-07, "loss": 1.8582, "step": 1670 }, { "epoch": 2.7514668901927912, "grad_norm": 5.224541664123535, "learning_rate": 2.2280379972375643e-07, "loss": 2.1606, "step": 1671 }, { "epoch": 2.75314333612741, "grad_norm": 7.115670680999756, "learning_rate": 2.190253690655919e-07, "loss": 2.3746, "step": 1672 }, { "epoch": 2.7548197820620284, "grad_norm": 5.890516757965088, "learning_rate": 2.1527889660044043e-07, "loss": 2.1739, "step": 1673 }, { "epoch": 2.756496227996647, "grad_norm": 7.22526741027832, "learning_rate": 2.1156439456946676e-07, "loss": 1.9828, "step": 1674 }, { "epoch": 2.7581726739312655, "grad_norm": 5.419872760772705, "learning_rate": 2.078818751093703e-07, "loss": 2.1167, "step": 1675 }, { "epoch": 2.7598491198658843, "grad_norm": 10.352044105529785, "learning_rate": 2.0423135025235298e-07, "loss": 2.4593, "step": 1676 }, { "epoch": 2.761525565800503, "grad_norm": 7.381417751312256, "learning_rate": 2.0061283192608028e-07, "loss": 2.2362, "step": 1677 }, { "epoch": 2.7632020117351215, "grad_norm": 5.139676570892334, "learning_rate": 1.9702633195363918e-07, "loss": 2.1607, "step": 1678 }, { "epoch": 2.7648784576697403, "grad_norm": 7.026803493499756, "learning_rate": 1.934718620534981e-07, "loss": 2.0865, "step": 1679 }, { "epoch": 2.7665549036043586, "grad_norm": 21.078304290771484, "learning_rate": 1.899494338394725e-07, "loss": 2.0511, "step": 1680 }, { "epoch": 2.7682313495389774, "grad_norm": 6.046341896057129, "learning_rate": 1.8645905882068605e-07, "loss": 2.0225, "step": 1681 }, { "epoch": 2.7699077954735962, "grad_norm": 9.598211288452148, "learning_rate": 1.8300074840153282e-07, "loss": 2.4605, "step": 1682 }, { "epoch": 2.7715842414082146, "grad_norm": 5.533355712890625, "learning_rate": 1.7957451388163628e-07, "loss": 2.3401, "step": 1683 }, { "epoch": 2.773260687342833, "grad_norm": 6.748833656311035, "learning_rate": 1.7618036645581817e-07, "loss": 2.263, "step": 1684 }, { "epoch": 2.7749371332774517, "grad_norm": 7.263243198394775, "learning_rate": 1.7281831721406184e-07, "loss": 2.3331, "step": 1685 }, { "epoch": 2.7766135792120705, "grad_norm": 11.478459358215332, "learning_rate": 1.6948837714146793e-07, "loss": 2.3995, "step": 1686 }, { "epoch": 2.778290025146689, "grad_norm": 8.58930778503418, "learning_rate": 1.6619055711822874e-07, "loss": 2.1968, "step": 1687 }, { "epoch": 2.7799664710813077, "grad_norm": 6.965145587921143, "learning_rate": 1.6292486791958495e-07, "loss": 2.0981, "step": 1688 }, { "epoch": 2.781642917015926, "grad_norm": 12.417163848876953, "learning_rate": 1.5969132021579347e-07, "loss": 1.9477, "step": 1689 }, { "epoch": 2.783319362950545, "grad_norm": 6.506493091583252, "learning_rate": 1.5648992457209744e-07, "loss": 2.4737, "step": 1690 }, { "epoch": 2.7849958088851636, "grad_norm": 6.64824914932251, "learning_rate": 1.533206914486818e-07, "loss": 1.9737, "step": 1691 }, { "epoch": 2.786672254819782, "grad_norm": 7.992014408111572, "learning_rate": 1.5018363120064548e-07, "loss": 2.1973, "step": 1692 }, { "epoch": 2.7883487007544008, "grad_norm": 8.56637954711914, "learning_rate": 1.4707875407796724e-07, "loss": 2.1347, "step": 1693 }, { "epoch": 2.790025146689019, "grad_norm": 11.73257064819336, "learning_rate": 1.4400607022546976e-07, "loss": 2.3428, "step": 1694 }, { "epoch": 2.791701592623638, "grad_norm": 12.196760177612305, "learning_rate": 1.4096558968279216e-07, "loss": 2.2212, "step": 1695 }, { "epoch": 2.7933780385582567, "grad_norm": 4.418464183807373, "learning_rate": 1.3795732238434557e-07, "loss": 2.2901, "step": 1696 }, { "epoch": 2.795054484492875, "grad_norm": 5.917961597442627, "learning_rate": 1.3498127815929295e-07, "loss": 2.1123, "step": 1697 }, { "epoch": 2.796730930427494, "grad_norm": 9.734413146972656, "learning_rate": 1.3203746673151497e-07, "loss": 2.1387, "step": 1698 }, { "epoch": 2.7984073763621122, "grad_norm": 8.743748664855957, "learning_rate": 1.2912589771956863e-07, "loss": 2.4509, "step": 1699 }, { "epoch": 2.800083822296731, "grad_norm": 7.576177597045898, "learning_rate": 1.262465806366664e-07, "loss": 2.0774, "step": 1700 }, { "epoch": 2.8017602682313494, "grad_norm": 13.9898042678833, "learning_rate": 1.2339952489064056e-07, "loss": 2.1904, "step": 1701 }, { "epoch": 2.803436714165968, "grad_norm": 5.648590564727783, "learning_rate": 1.2058473978391438e-07, "loss": 2.3304, "step": 1702 }, { "epoch": 2.8051131601005865, "grad_norm": 4.651821613311768, "learning_rate": 1.1780223451346994e-07, "loss": 2.046, "step": 1703 }, { "epoch": 2.8067896060352053, "grad_norm": 5.326381683349609, "learning_rate": 1.1505201817081702e-07, "loss": 2.3392, "step": 1704 }, { "epoch": 2.808466051969824, "grad_norm": 7.358956336975098, "learning_rate": 1.123340997419664e-07, "loss": 2.2677, "step": 1705 }, { "epoch": 2.8101424979044425, "grad_norm": 10.38466739654541, "learning_rate": 1.0964848810740114e-07, "loss": 2.0176, "step": 1706 }, { "epoch": 2.8118189438390613, "grad_norm": 13.03023624420166, "learning_rate": 1.0699519204204422e-07, "loss": 2.0403, "step": 1707 }, { "epoch": 2.8134953897736796, "grad_norm": 4.97235107421875, "learning_rate": 1.0437422021522759e-07, "loss": 2.3487, "step": 1708 }, { "epoch": 2.8151718357082984, "grad_norm": 7.588263988494873, "learning_rate": 1.0178558119067316e-07, "loss": 1.8305, "step": 1709 }, { "epoch": 2.8168482816429172, "grad_norm": 9.938261985778809, "learning_rate": 9.922928342645522e-08, "loss": 2.1789, "step": 1710 }, { "epoch": 2.8185247275775356, "grad_norm": 7.735471725463867, "learning_rate": 9.670533527498139e-08, "loss": 2.2703, "step": 1711 }, { "epoch": 2.8202011735121544, "grad_norm": 13.126014709472656, "learning_rate": 9.421374498295498e-08, "loss": 2.2732, "step": 1712 }, { "epoch": 2.8218776194467727, "grad_norm": 11.316969871520996, "learning_rate": 9.175452069135615e-08, "loss": 2.1832, "step": 1713 }, { "epoch": 2.8235540653813915, "grad_norm": 5.266726970672607, "learning_rate": 8.932767043541624e-08, "loss": 2.4669, "step": 1714 }, { "epoch": 2.8252305113160103, "grad_norm": 7.458639621734619, "learning_rate": 8.693320214458234e-08, "loss": 2.0844, "step": 1715 }, { "epoch": 2.8269069572506287, "grad_norm": 7.3576555252075195, "learning_rate": 8.457112364250064e-08, "loss": 2.2435, "step": 1716 }, { "epoch": 2.828583403185247, "grad_norm": 10.169660568237305, "learning_rate": 8.224144264698864e-08, "loss": 1.9385, "step": 1717 }, { "epoch": 2.830259849119866, "grad_norm": 10.92442512512207, "learning_rate": 7.994416677000405e-08, "loss": 2.3116, "step": 1718 }, { "epoch": 2.8319362950544846, "grad_norm": 7.840557098388672, "learning_rate": 7.767930351763043e-08, "loss": 2.2313, "step": 1719 }, { "epoch": 2.833612740989103, "grad_norm": 5.196908473968506, "learning_rate": 7.544686029004378e-08, "loss": 2.3076, "step": 1720 }, { "epoch": 2.835289186923722, "grad_norm": 8.68769645690918, "learning_rate": 7.324684438148933e-08, "loss": 2.4478, "step": 1721 }, { "epoch": 2.83696563285834, "grad_norm": 7.743781566619873, "learning_rate": 7.10792629802659e-08, "loss": 2.3941, "step": 1722 }, { "epoch": 2.838642078792959, "grad_norm": 4.173694610595703, "learning_rate": 6.894412316869048e-08, "loss": 2.0683, "step": 1723 }, { "epoch": 2.8403185247275777, "grad_norm": 6.068975448608398, "learning_rate": 6.684143192308368e-08, "loss": 2.3673, "step": 1724 }, { "epoch": 2.841994970662196, "grad_norm": 4.265227317810059, "learning_rate": 6.477119611374316e-08, "loss": 2.1415, "step": 1725 }, { "epoch": 2.843671416596815, "grad_norm": 8.569744110107422, "learning_rate": 6.273342250492254e-08, "loss": 1.8601, "step": 1726 }, { "epoch": 2.8453478625314332, "grad_norm": 5.244567394256592, "learning_rate": 6.072811775481135e-08, "loss": 2.2043, "step": 1727 }, { "epoch": 2.847024308466052, "grad_norm": 8.898619651794434, "learning_rate": 5.875528841550737e-08, "loss": 2.2055, "step": 1728 }, { "epoch": 2.848700754400671, "grad_norm": 6.68065881729126, "learning_rate": 5.6814940932999886e-08, "loss": 2.1443, "step": 1729 }, { "epoch": 2.850377200335289, "grad_norm": 5.27171516418457, "learning_rate": 5.490708164714864e-08, "loss": 2.0189, "step": 1730 }, { "epoch": 2.8520536462699075, "grad_norm": 7.6161417961120605, "learning_rate": 5.303171679166053e-08, "loss": 2.0958, "step": 1731 }, { "epoch": 2.8537300922045263, "grad_norm": 4.6871562004089355, "learning_rate": 5.118885249407179e-08, "loss": 2.2328, "step": 1732 }, { "epoch": 2.855406538139145, "grad_norm": 7.863831996917725, "learning_rate": 4.937849477572587e-08, "loss": 1.945, "step": 1733 }, { "epoch": 2.8570829840737635, "grad_norm": 12.705326080322266, "learning_rate": 4.760064955175558e-08, "loss": 2.1768, "step": 1734 }, { "epoch": 2.8587594300083823, "grad_norm": 5.357565879821777, "learning_rate": 4.585532263106318e-08, "loss": 1.9204, "step": 1735 }, { "epoch": 2.8604358759430006, "grad_norm": 11.520817756652832, "learning_rate": 4.414251971630035e-08, "loss": 2.2237, "step": 1736 }, { "epoch": 2.8621123218776194, "grad_norm": 5.635189056396484, "learning_rate": 4.246224640385044e-08, "loss": 2.3528, "step": 1737 }, { "epoch": 2.8637887678122382, "grad_norm": 23.543516159057617, "learning_rate": 4.0814508183810716e-08, "loss": 2.1781, "step": 1738 }, { "epoch": 2.8654652137468566, "grad_norm": 6.292739391326904, "learning_rate": 3.9199310439972384e-08, "loss": 2.1205, "step": 1739 }, { "epoch": 2.8671416596814754, "grad_norm": 5.19038200378418, "learning_rate": 3.761665844980833e-08, "loss": 1.9758, "step": 1740 }, { "epoch": 2.8688181056160937, "grad_norm": 8.495299339294434, "learning_rate": 3.606655738444653e-08, "loss": 2.1386, "step": 1741 }, { "epoch": 2.8704945515507125, "grad_norm": 6.778533935546875, "learning_rate": 3.4549012308661146e-08, "loss": 2.1292, "step": 1742 }, { "epoch": 2.8721709974853313, "grad_norm": 11.879091262817383, "learning_rate": 3.3064028180855854e-08, "loss": 1.9339, "step": 1743 }, { "epoch": 2.8738474434199497, "grad_norm": 9.468332290649414, "learning_rate": 3.161160985304168e-08, "loss": 2.1338, "step": 1744 }, { "epoch": 2.875523889354568, "grad_norm": 7.453269958496094, "learning_rate": 3.019176207082586e-08, "loss": 2.1449, "step": 1745 }, { "epoch": 2.877200335289187, "grad_norm": 8.21602725982666, "learning_rate": 2.8804489473395205e-08, "loss": 2.0623, "step": 1746 }, { "epoch": 2.8788767812238056, "grad_norm": 15.724867820739746, "learning_rate": 2.7449796593501665e-08, "loss": 2.2227, "step": 1747 }, { "epoch": 2.880553227158424, "grad_norm": 8.88268756866455, "learning_rate": 2.6127687857445682e-08, "loss": 2.2417, "step": 1748 }, { "epoch": 2.882229673093043, "grad_norm": 6.835209846496582, "learning_rate": 2.4838167585063965e-08, "loss": 2.4532, "step": 1749 }, { "epoch": 2.883906119027661, "grad_norm": 9.754383087158203, "learning_rate": 2.3581239989711734e-08, "loss": 1.9046, "step": 1750 }, { "epoch": 2.88558256496228, "grad_norm": 7.3027496337890625, "learning_rate": 2.235690917825384e-08, "loss": 2.4733, "step": 1751 }, { "epoch": 2.8872590108968987, "grad_norm": 22.672439575195312, "learning_rate": 2.116517915105032e-08, "loss": 2.1045, "step": 1752 }, { "epoch": 2.888935456831517, "grad_norm": 6.756922245025635, "learning_rate": 2.0006053801937543e-08, "loss": 2.2625, "step": 1753 }, { "epoch": 2.890611902766136, "grad_norm": 6.469045639038086, "learning_rate": 1.8879536918223752e-08, "loss": 2.2362, "step": 1754 }, { "epoch": 2.8922883487007542, "grad_norm": 5.1020731925964355, "learning_rate": 1.77856321806702e-08, "loss": 2.3511, "step": 1755 }, { "epoch": 2.893964794635373, "grad_norm": 5.348669052124023, "learning_rate": 1.672434316348337e-08, "loss": 2.1033, "step": 1756 }, { "epoch": 2.895641240569992, "grad_norm": 5.0972723960876465, "learning_rate": 1.5695673334301665e-08, "loss": 2.2644, "step": 1757 }, { "epoch": 2.89731768650461, "grad_norm": 6.626637935638428, "learning_rate": 1.469962605418318e-08, "loss": 2.3634, "step": 1758 }, { "epoch": 2.898994132439229, "grad_norm": 3.3540515899658203, "learning_rate": 1.3736204577596834e-08, "loss": 2.0988, "step": 1759 }, { "epoch": 2.9006705783738473, "grad_norm": 13.148756980895996, "learning_rate": 1.2805412052409039e-08, "loss": 2.3569, "step": 1760 }, { "epoch": 2.902347024308466, "grad_norm": 4.383535385131836, "learning_rate": 1.1907251519877039e-08, "loss": 2.056, "step": 1761 }, { "epoch": 2.904023470243085, "grad_norm": 4.678471088409424, "learning_rate": 1.1041725914633372e-08, "loss": 2.0393, "step": 1762 }, { "epoch": 2.9056999161777033, "grad_norm": 5.9528069496154785, "learning_rate": 1.0208838064683646e-08, "loss": 2.4556, "step": 1763 }, { "epoch": 2.9073763621123216, "grad_norm": 8.404833793640137, "learning_rate": 9.408590691392106e-09, "loss": 2.6487, "step": 1764 }, { "epoch": 2.9090528080469404, "grad_norm": 15.023886680603027, "learning_rate": 8.640986409471642e-09, "loss": 2.2012, "step": 1765 }, { "epoch": 2.9107292539815592, "grad_norm": 6.09271764755249, "learning_rate": 7.906027726981568e-09, "loss": 2.101, "step": 1766 }, { "epoch": 2.9124056999161776, "grad_norm": 4.989724636077881, "learning_rate": 7.2037170453120865e-09, "loss": 2.2726, "step": 1767 }, { "epoch": 2.9140821458507964, "grad_norm": 17.875017166137695, "learning_rate": 6.534056659183163e-09, "loss": 1.8577, "step": 1768 }, { "epoch": 2.9157585917854147, "grad_norm": 7.415724277496338, "learning_rate": 5.897048756630108e-09, "loss": 2.1932, "step": 1769 }, { "epoch": 2.9174350377200335, "grad_norm": 4.532334327697754, "learning_rate": 5.292695419002458e-09, "loss": 1.9691, "step": 1770 }, { "epoch": 2.9191114836546523, "grad_norm": 8.778444290161133, "learning_rate": 4.720998620955097e-09, "loss": 2.2092, "step": 1771 }, { "epoch": 2.9207879295892707, "grad_norm": 9.400038719177246, "learning_rate": 4.181960230439375e-09, "loss": 2.1021, "step": 1772 }, { "epoch": 2.9224643755238895, "grad_norm": 8.5486478805542, "learning_rate": 3.675582008700884e-09, "loss": 2.445, "step": 1773 }, { "epoch": 2.924140821458508, "grad_norm": 6.419625759124756, "learning_rate": 3.2018656102716928e-09, "loss": 1.717, "step": 1774 }, { "epoch": 2.9258172673931266, "grad_norm": 5.932398796081543, "learning_rate": 2.760812582964789e-09, "loss": 2.5388, "step": 1775 }, { "epoch": 2.9274937133277454, "grad_norm": 4.722435474395752, "learning_rate": 2.3524243678685333e-09, "loss": 2.2076, "step": 1776 }, { "epoch": 2.929170159262364, "grad_norm": 7.867659091949463, "learning_rate": 1.9767022993444353e-09, "loss": 2.1944, "step": 1777 }, { "epoch": 2.930846605196982, "grad_norm": 17.070358276367188, "learning_rate": 1.633647605020494e-09, "loss": 2.2172, "step": 1778 }, { "epoch": 2.932523051131601, "grad_norm": 13.998858451843262, "learning_rate": 1.3232614057878678e-09, "loss": 2.0388, "step": 1779 }, { "epoch": 2.9341994970662197, "grad_norm": 7.206970691680908, "learning_rate": 1.0455447157975418e-09, "loss": 2.3386, "step": 1780 }, { "epoch": 2.935875943000838, "grad_norm": 5.423552989959717, "learning_rate": 8.004984424558881e-10, "loss": 2.1649, "step": 1781 }, { "epoch": 2.937552388935457, "grad_norm": 7.113008975982666, "learning_rate": 5.881233864224456e-10, "loss": 1.9516, "step": 1782 }, { "epoch": 2.9392288348700752, "grad_norm": 7.968682765960693, "learning_rate": 4.0842024160991925e-10, "loss": 2.1864, "step": 1783 }, { "epoch": 2.940905280804694, "grad_norm": 9.597124099731445, "learning_rate": 2.613895951752987e-10, "loss": 2.3531, "step": 1784 }, { "epoch": 2.942581726739313, "grad_norm": 7.044503688812256, "learning_rate": 1.470319275254095e-10, "loss": 2.3195, "step": 1785 }, { "epoch": 2.944258172673931, "grad_norm": 6.170265197753906, "learning_rate": 6.534761230914122e-11, "loss": 2.1304, "step": 1786 }, { "epoch": 2.94593461860855, "grad_norm": 8.598637580871582, "learning_rate": 1.6336916421888503e-11, "loss": 2.178, "step": 1787 }, { "epoch": 2.9476110645431683, "grad_norm": 5.382966041564941, "learning_rate": 0.0, "loss": 2.5498, "step": 1788 }, { "epoch": 2.9476110645431683, "eval_loss": 2.231407880783081, "eval_runtime": 170.8858, "eval_samples_per_second": 3.072, "eval_steps_per_second": 1.539, "step": 1788 } ], "logging_steps": 1, "max_steps": 1788, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 298, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.169549558265938e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }