diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,81608 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 11654, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.855755805969238, + "learning_rate": 2.9997425776557404e-05, + "loss": 6.0374, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.109065532684326, + "learning_rate": 2.999485155311481e-05, + "loss": 5.84, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 4.335865497589111, + "learning_rate": 2.9992277329672214e-05, + "loss": 5.6446, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.389212608337402, + "learning_rate": 2.998970310622962e-05, + "loss": 5.5062, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 4.956500053405762, + "learning_rate": 2.9987128882787027e-05, + "loss": 5.2977, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 5.355259895324707, + "learning_rate": 2.9984554659344434e-05, + "loss": 5.0181, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 5.3592987060546875, + "learning_rate": 2.9981980435901837e-05, + "loss": 4.8524, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 5.215114593505859, + "learning_rate": 2.9979406212459244e-05, + "loss": 4.7503, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 6.257913589477539, + "learning_rate": 2.9976831989016647e-05, + "loss": 4.5238, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 5.579934597015381, + "learning_rate": 2.9974257765574054e-05, + "loss": 4.353, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 5.536452293395996, + "learning_rate": 2.9971683542131457e-05, + "loss": 4.2689, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 5.942359447479248, + "learning_rate": 2.996910931868886e-05, + "loss": 4.1416, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 6.103983402252197, + "learning_rate": 2.9966535095246267e-05, + "loss": 3.9087, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 8.696576118469238, + "learning_rate": 2.996396087180367e-05, + "loss": 3.8292, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 8.177977561950684, + "learning_rate": 2.996138664836108e-05, + "loss": 3.5344, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 7.678316593170166, + "learning_rate": 2.9958812424918484e-05, + "loss": 3.3705, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 14.075394630432129, + "learning_rate": 2.995623820147589e-05, + "loss": 3.7041, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 10.564874649047852, + "learning_rate": 2.9953663978033294e-05, + "loss": 2.9878, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 22.051210403442383, + "learning_rate": 2.99510897545907e-05, + "loss": 3.2233, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 13.445047378540039, + "learning_rate": 2.9948515531148104e-05, + "loss": 3.0284, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 13.75167179107666, + "learning_rate": 2.9945941307705507e-05, + "loss": 3.2145, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 35.302085876464844, + "learning_rate": 2.9943367084262914e-05, + "loss": 3.022, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 20.62187385559082, + "learning_rate": 2.9940792860820317e-05, + "loss": 2.8868, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 20.765596389770508, + "learning_rate": 2.9938218637377727e-05, + "loss": 2.7575, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 17.492706298828125, + "learning_rate": 2.993564441393513e-05, + "loss": 2.9801, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 11.508520126342773, + "learning_rate": 2.9933070190492537e-05, + "loss": 2.9401, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 11.079510688781738, + "learning_rate": 2.993049596704994e-05, + "loss": 2.8228, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 16.55045509338379, + "learning_rate": 2.9927921743607347e-05, + "loss": 2.5193, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 9.096270561218262, + "learning_rate": 2.992534752016475e-05, + "loss": 2.612, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 14.282428741455078, + "learning_rate": 2.9922773296722157e-05, + "loss": 2.8728, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 12.487578392028809, + "learning_rate": 2.992019907327956e-05, + "loss": 2.8666, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 11.594644546508789, + "learning_rate": 2.9917624849836964e-05, + "loss": 2.4412, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 9.025802612304688, + "learning_rate": 2.991505062639437e-05, + "loss": 2.6776, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 9.0786771774292, + "learning_rate": 2.9912476402951777e-05, + "loss": 2.0032, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 13.964434623718262, + "learning_rate": 2.9909902179509184e-05, + "loss": 2.7171, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 11.710619926452637, + "learning_rate": 2.9907327956066587e-05, + "loss": 2.3159, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 9.789759635925293, + "learning_rate": 2.9904753732623994e-05, + "loss": 2.3495, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 12.272398948669434, + "learning_rate": 2.9902179509181397e-05, + "loss": 2.4484, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 12.568193435668945, + "learning_rate": 2.9899605285738804e-05, + "loss": 2.3863, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 9.755990028381348, + "learning_rate": 2.9897031062296207e-05, + "loss": 2.1977, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 9.962193489074707, + "learning_rate": 2.9894456838853614e-05, + "loss": 2.4596, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 10.54276180267334, + "learning_rate": 2.9891882615411017e-05, + "loss": 2.0344, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 11.461871147155762, + "learning_rate": 2.9889308391968424e-05, + "loss": 2.5114, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 10.544355392456055, + "learning_rate": 2.988673416852583e-05, + "loss": 2.2582, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 8.856222152709961, + "learning_rate": 2.9884159945083234e-05, + "loss": 2.5919, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 9.033227920532227, + "learning_rate": 2.988158572164064e-05, + "loss": 2.117, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 15.805606842041016, + "learning_rate": 2.9879011498198044e-05, + "loss": 2.5082, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 14.737322807312012, + "learning_rate": 2.987643727475545e-05, + "loss": 2.3978, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 10.552938461303711, + "learning_rate": 2.9873863051312854e-05, + "loss": 2.1329, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 14.492465019226074, + "learning_rate": 2.987128882787026e-05, + "loss": 2.2769, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 13.515401840209961, + "learning_rate": 2.9868714604427664e-05, + "loss": 2.6346, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 9.540853500366211, + "learning_rate": 2.986614038098507e-05, + "loss": 2.3172, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 12.621674537658691, + "learning_rate": 2.9863566157542477e-05, + "loss": 2.3729, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 10.57535457611084, + "learning_rate": 2.986099193409988e-05, + "loss": 1.7151, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 9.841571807861328, + "learning_rate": 2.9858417710657287e-05, + "loss": 2.0466, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 11.51070499420166, + "learning_rate": 2.985584348721469e-05, + "loss": 2.4203, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 9.48379135131836, + "learning_rate": 2.9853269263772097e-05, + "loss": 2.1071, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 12.393445014953613, + "learning_rate": 2.98506950403295e-05, + "loss": 2.014, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 10.898478507995605, + "learning_rate": 2.9848120816886907e-05, + "loss": 2.4063, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 11.223072052001953, + "learning_rate": 2.984554659344431e-05, + "loss": 2.1862, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 11.349495887756348, + "learning_rate": 2.9842972370001717e-05, + "loss": 1.829, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 11.530231475830078, + "learning_rate": 2.9840398146559124e-05, + "loss": 2.2932, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 11.20637321472168, + "learning_rate": 2.9837823923116527e-05, + "loss": 1.7466, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 10.135061264038086, + "learning_rate": 2.9835249699673934e-05, + "loss": 1.9535, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 13.84786319732666, + "learning_rate": 2.9832675476231337e-05, + "loss": 2.3064, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 10.094093322753906, + "learning_rate": 2.9830101252788743e-05, + "loss": 1.9594, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 16.481359481811523, + "learning_rate": 2.9827527029346147e-05, + "loss": 2.3698, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 10.151596069335938, + "learning_rate": 2.9824952805903553e-05, + "loss": 2.1976, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 9.80447769165039, + "learning_rate": 2.9822378582460957e-05, + "loss": 2.2426, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 11.807653427124023, + "learning_rate": 2.9819804359018363e-05, + "loss": 1.8431, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 9.624991416931152, + "learning_rate": 2.981723013557577e-05, + "loss": 1.9657, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 9.18333911895752, + "learning_rate": 2.9814655912133177e-05, + "loss": 1.9263, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 15.231605529785156, + "learning_rate": 2.981208168869058e-05, + "loss": 2.0695, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 14.867193222045898, + "learning_rate": 2.9809507465247983e-05, + "loss": 1.9182, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 10.365927696228027, + "learning_rate": 2.980693324180539e-05, + "loss": 1.9339, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 9.1305570602417, + "learning_rate": 2.9804359018362793e-05, + "loss": 2.1178, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 13.789942741394043, + "learning_rate": 2.98017847949202e-05, + "loss": 1.8076, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 10.080610275268555, + "learning_rate": 2.9799210571477603e-05, + "loss": 2.0405, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 10.912185668945312, + "learning_rate": 2.979663634803501e-05, + "loss": 1.9494, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 12.001837730407715, + "learning_rate": 2.9794062124592413e-05, + "loss": 2.0811, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 9.438093185424805, + "learning_rate": 2.9791487901149823e-05, + "loss": 2.081, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 11.761951446533203, + "learning_rate": 2.9788913677707227e-05, + "loss": 1.9545, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 9.435627937316895, + "learning_rate": 2.9786339454264633e-05, + "loss": 1.9778, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 11.932748794555664, + "learning_rate": 2.9783765230822037e-05, + "loss": 2.2337, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 11.060193061828613, + "learning_rate": 2.978119100737944e-05, + "loss": 1.9319, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 10.534597396850586, + "learning_rate": 2.9778616783936847e-05, + "loss": 2.0977, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 11.02622127532959, + "learning_rate": 2.977604256049425e-05, + "loss": 2.1778, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 11.81395149230957, + "learning_rate": 2.9773468337051657e-05, + "loss": 1.8546, + "step": 88 + }, + { + "epoch": 0.02, + "grad_norm": 10.430489540100098, + "learning_rate": 2.977089411360906e-05, + "loss": 2.142, + "step": 89 + }, + { + "epoch": 0.02, + "grad_norm": 13.661596298217773, + "learning_rate": 2.976831989016647e-05, + "loss": 1.9116, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 9.51032829284668, + "learning_rate": 2.9765745666723873e-05, + "loss": 1.8959, + "step": 91 + }, + { + "epoch": 0.02, + "grad_norm": 9.853291511535645, + "learning_rate": 2.976317144328128e-05, + "loss": 1.9983, + "step": 92 + }, + { + "epoch": 0.02, + "grad_norm": 8.778831481933594, + "learning_rate": 2.9760597219838683e-05, + "loss": 1.8415, + "step": 93 + }, + { + "epoch": 0.02, + "grad_norm": 11.720138549804688, + "learning_rate": 2.9758022996396087e-05, + "loss": 2.322, + "step": 94 + }, + { + "epoch": 0.02, + "grad_norm": 10.475862503051758, + "learning_rate": 2.9755448772953493e-05, + "loss": 2.1695, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 9.582047462463379, + "learning_rate": 2.9752874549510896e-05, + "loss": 2.1263, + "step": 96 + }, + { + "epoch": 0.02, + "grad_norm": 12.227659225463867, + "learning_rate": 2.9750300326068303e-05, + "loss": 1.7871, + "step": 97 + }, + { + "epoch": 0.02, + "grad_norm": 8.419367790222168, + "learning_rate": 2.9747726102625706e-05, + "loss": 1.8047, + "step": 98 + }, + { + "epoch": 0.02, + "grad_norm": 8.549893379211426, + "learning_rate": 2.9745151879183113e-05, + "loss": 2.0386, + "step": 99 + }, + { + "epoch": 0.02, + "grad_norm": 16.57759666442871, + "learning_rate": 2.974257765574052e-05, + "loss": 1.9337, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 10.29601764678955, + "learning_rate": 2.9740003432297927e-05, + "loss": 1.6766, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 13.312577247619629, + "learning_rate": 2.973742920885533e-05, + "loss": 2.0605, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 9.877716064453125, + "learning_rate": 2.9734854985412736e-05, + "loss": 2.0258, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 12.774568557739258, + "learning_rate": 2.973228076197014e-05, + "loss": 1.8771, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 13.98751449584961, + "learning_rate": 2.9729706538527543e-05, + "loss": 2.1093, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 11.158930778503418, + "learning_rate": 2.972713231508495e-05, + "loss": 2.0506, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 8.581437110900879, + "learning_rate": 2.9724558091642353e-05, + "loss": 1.721, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 9.814128875732422, + "learning_rate": 2.972198386819976e-05, + "loss": 1.9709, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 8.382211685180664, + "learning_rate": 2.9719409644757166e-05, + "loss": 1.759, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 9.167340278625488, + "learning_rate": 2.9716835421314573e-05, + "loss": 1.9925, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 11.374336242675781, + "learning_rate": 2.9714261197871976e-05, + "loss": 2.5769, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 10.552855491638184, + "learning_rate": 2.9711686974429383e-05, + "loss": 2.045, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 9.009696006774902, + "learning_rate": 2.9709112750986786e-05, + "loss": 2.0438, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 11.150276184082031, + "learning_rate": 2.9706538527544193e-05, + "loss": 1.7308, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 10.456452369689941, + "learning_rate": 2.9703964304101596e-05, + "loss": 1.8647, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 9.520078659057617, + "learning_rate": 2.9701390080659e-05, + "loss": 1.7663, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 11.088191032409668, + "learning_rate": 2.9698815857216406e-05, + "loss": 1.6428, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 12.780815124511719, + "learning_rate": 2.969624163377381e-05, + "loss": 2.1887, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 13.029000282287598, + "learning_rate": 2.969366741033122e-05, + "loss": 1.9132, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 9.509160995483398, + "learning_rate": 2.9691093186888623e-05, + "loss": 1.9838, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 10.37309741973877, + "learning_rate": 2.968851896344603e-05, + "loss": 1.9265, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 9.40943717956543, + "learning_rate": 2.9685944740003433e-05, + "loss": 1.9344, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 11.383122444152832, + "learning_rate": 2.968337051656084e-05, + "loss": 2.1259, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 9.58685302734375, + "learning_rate": 2.9680796293118243e-05, + "loss": 1.7329, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 8.93775749206543, + "learning_rate": 2.9678222069675646e-05, + "loss": 1.7628, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 10.446601867675781, + "learning_rate": 2.9675647846233053e-05, + "loss": 1.9658, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 9.44230842590332, + "learning_rate": 2.9673073622790456e-05, + "loss": 2.2834, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 8.857770919799805, + "learning_rate": 2.9670499399347866e-05, + "loss": 1.8792, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 10.195531845092773, + "learning_rate": 2.966792517590527e-05, + "loss": 1.8233, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 12.215664863586426, + "learning_rate": 2.9665350952462676e-05, + "loss": 1.6713, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 12.225882530212402, + "learning_rate": 2.966277672902008e-05, + "loss": 2.454, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 10.208149909973145, + "learning_rate": 2.9660202505577486e-05, + "loss": 1.9905, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 10.276273727416992, + "learning_rate": 2.965762828213489e-05, + "loss": 1.9253, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 9.66161823272705, + "learning_rate": 2.9655054058692296e-05, + "loss": 1.4329, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 9.97610855102539, + "learning_rate": 2.96524798352497e-05, + "loss": 1.9271, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 8.867330551147461, + "learning_rate": 2.9649905611807103e-05, + "loss": 1.4213, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 9.628447532653809, + "learning_rate": 2.964733138836451e-05, + "loss": 2.0446, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 9.068493843078613, + "learning_rate": 2.9644757164921916e-05, + "loss": 1.8949, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 10.425756454467773, + "learning_rate": 2.9642182941479323e-05, + "loss": 1.8883, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 9.106772422790527, + "learning_rate": 2.9639608718036726e-05, + "loss": 1.6391, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 9.371277809143066, + "learning_rate": 2.9637034494594133e-05, + "loss": 1.836, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 9.329122543334961, + "learning_rate": 2.9634460271151536e-05, + "loss": 1.7353, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 8.681215286254883, + "learning_rate": 2.9631886047708943e-05, + "loss": 1.5368, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 10.243221282958984, + "learning_rate": 2.9629311824266346e-05, + "loss": 2.144, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 7.58792781829834, + "learning_rate": 2.9626737600823753e-05, + "loss": 1.6091, + "step": 145 + }, + { + "epoch": 0.03, + "grad_norm": 10.012331008911133, + "learning_rate": 2.9624163377381156e-05, + "loss": 1.7524, + "step": 146 + }, + { + "epoch": 0.03, + "grad_norm": 10.462698936462402, + "learning_rate": 2.9621589153938563e-05, + "loss": 2.0098, + "step": 147 + }, + { + "epoch": 0.03, + "grad_norm": 12.41120433807373, + "learning_rate": 2.961901493049597e-05, + "loss": 2.0257, + "step": 148 + }, + { + "epoch": 0.03, + "grad_norm": 13.883111000061035, + "learning_rate": 2.9616440707053373e-05, + "loss": 1.8638, + "step": 149 + }, + { + "epoch": 0.03, + "grad_norm": 10.62372875213623, + "learning_rate": 2.961386648361078e-05, + "loss": 2.052, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 10.91905403137207, + "learning_rate": 2.9611292260168183e-05, + "loss": 2.2381, + "step": 151 + }, + { + "epoch": 0.03, + "grad_norm": 8.769111633300781, + "learning_rate": 2.960871803672559e-05, + "loss": 1.7367, + "step": 152 + }, + { + "epoch": 0.03, + "grad_norm": 9.132026672363281, + "learning_rate": 2.9606143813282993e-05, + "loss": 1.9155, + "step": 153 + }, + { + "epoch": 0.03, + "grad_norm": 9.776375770568848, + "learning_rate": 2.96035695898404e-05, + "loss": 2.0896, + "step": 154 + }, + { + "epoch": 0.03, + "grad_norm": 8.961363792419434, + "learning_rate": 2.9600995366397803e-05, + "loss": 2.0811, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 8.618287086486816, + "learning_rate": 2.959842114295521e-05, + "loss": 2.0352, + "step": 156 + }, + { + "epoch": 0.03, + "grad_norm": 8.396384239196777, + "learning_rate": 2.9595846919512616e-05, + "loss": 1.677, + "step": 157 + }, + { + "epoch": 0.03, + "grad_norm": 9.013529777526855, + "learning_rate": 2.959327269607002e-05, + "loss": 1.8562, + "step": 158 + }, + { + "epoch": 0.03, + "grad_norm": 8.963462829589844, + "learning_rate": 2.9590698472627426e-05, + "loss": 1.9417, + "step": 159 + }, + { + "epoch": 0.03, + "grad_norm": 9.495522499084473, + "learning_rate": 2.958812424918483e-05, + "loss": 1.5038, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 8.643668174743652, + "learning_rate": 2.9585550025742236e-05, + "loss": 1.6444, + "step": 161 + }, + { + "epoch": 0.03, + "grad_norm": 9.734110832214355, + "learning_rate": 2.958297580229964e-05, + "loss": 1.7968, + "step": 162 + }, + { + "epoch": 0.03, + "grad_norm": 10.823417663574219, + "learning_rate": 2.9580401578857046e-05, + "loss": 1.6856, + "step": 163 + }, + { + "epoch": 0.03, + "grad_norm": 11.06626033782959, + "learning_rate": 2.957782735541445e-05, + "loss": 1.6747, + "step": 164 + }, + { + "epoch": 0.03, + "grad_norm": 9.7233304977417, + "learning_rate": 2.9575253131971856e-05, + "loss": 1.5999, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 9.944986343383789, + "learning_rate": 2.9572678908529263e-05, + "loss": 1.79, + "step": 166 + }, + { + "epoch": 0.03, + "grad_norm": 10.763249397277832, + "learning_rate": 2.9570104685086666e-05, + "loss": 1.6137, + "step": 167 + }, + { + "epoch": 0.03, + "grad_norm": 10.439078330993652, + "learning_rate": 2.9567530461644073e-05, + "loss": 1.5438, + "step": 168 + }, + { + "epoch": 0.03, + "grad_norm": 11.983046531677246, + "learning_rate": 2.9564956238201476e-05, + "loss": 1.9339, + "step": 169 + }, + { + "epoch": 0.03, + "grad_norm": 8.30870532989502, + "learning_rate": 2.9562382014758882e-05, + "loss": 1.6122, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 11.260065078735352, + "learning_rate": 2.9559807791316286e-05, + "loss": 2.0262, + "step": 171 + }, + { + "epoch": 0.03, + "grad_norm": 10.255437850952148, + "learning_rate": 2.9557233567873692e-05, + "loss": 1.545, + "step": 172 + }, + { + "epoch": 0.03, + "grad_norm": 10.572342872619629, + "learning_rate": 2.9554659344431096e-05, + "loss": 1.9368, + "step": 173 + }, + { + "epoch": 0.03, + "grad_norm": 10.593486785888672, + "learning_rate": 2.9552085120988502e-05, + "loss": 1.9177, + "step": 174 + }, + { + "epoch": 0.03, + "grad_norm": 10.95742416381836, + "learning_rate": 2.954951089754591e-05, + "loss": 2.0184, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 11.451391220092773, + "learning_rate": 2.9546936674103316e-05, + "loss": 1.7796, + "step": 176 + }, + { + "epoch": 0.03, + "grad_norm": 9.525118827819824, + "learning_rate": 2.954436245066072e-05, + "loss": 1.854, + "step": 177 + }, + { + "epoch": 0.03, + "grad_norm": 10.262989044189453, + "learning_rate": 2.9541788227218122e-05, + "loss": 1.7874, + "step": 178 + }, + { + "epoch": 0.03, + "grad_norm": 9.921760559082031, + "learning_rate": 2.953921400377553e-05, + "loss": 1.959, + "step": 179 + }, + { + "epoch": 0.03, + "grad_norm": 12.572196006774902, + "learning_rate": 2.9536639780332932e-05, + "loss": 1.523, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 11.206779479980469, + "learning_rate": 2.953406555689034e-05, + "loss": 2.1411, + "step": 181 + }, + { + "epoch": 0.03, + "grad_norm": 10.821024894714355, + "learning_rate": 2.9531491333447742e-05, + "loss": 2.0197, + "step": 182 + }, + { + "epoch": 0.03, + "grad_norm": 9.028032302856445, + "learning_rate": 2.952891711000515e-05, + "loss": 1.457, + "step": 183 + }, + { + "epoch": 0.03, + "grad_norm": 10.02061653137207, + "learning_rate": 2.9526342886562552e-05, + "loss": 1.7391, + "step": 184 + }, + { + "epoch": 0.03, + "grad_norm": 9.09890079498291, + "learning_rate": 2.9523768663119962e-05, + "loss": 1.9001, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 8.106844902038574, + "learning_rate": 2.9521194439677366e-05, + "loss": 1.7622, + "step": 186 + }, + { + "epoch": 0.03, + "grad_norm": 8.670572280883789, + "learning_rate": 2.9518620216234772e-05, + "loss": 2.1797, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 9.153185844421387, + "learning_rate": 2.9516045992792176e-05, + "loss": 1.986, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 11.073293685913086, + "learning_rate": 2.951347176934958e-05, + "loss": 1.8458, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 10.061956405639648, + "learning_rate": 2.9510897545906986e-05, + "loss": 1.8076, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 8.938712120056152, + "learning_rate": 2.950832332246439e-05, + "loss": 1.7217, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 10.107938766479492, + "learning_rate": 2.9505749099021796e-05, + "loss": 1.8765, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 10.666096687316895, + "learning_rate": 2.95031748755792e-05, + "loss": 1.7959, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 11.081636428833008, + "learning_rate": 2.950060065213661e-05, + "loss": 1.705, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 8.058525085449219, + "learning_rate": 2.9498026428694012e-05, + "loss": 1.236, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 8.916040420532227, + "learning_rate": 2.949545220525142e-05, + "loss": 1.6955, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 7.877468109130859, + "learning_rate": 2.9492877981808822e-05, + "loss": 1.6584, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 10.352563858032227, + "learning_rate": 2.9490303758366226e-05, + "loss": 1.9344, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 8.303910255432129, + "learning_rate": 2.9487729534923632e-05, + "loss": 1.662, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 8.003342628479004, + "learning_rate": 2.9485155311481035e-05, + "loss": 1.9813, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 8.470848083496094, + "learning_rate": 2.9482581088038442e-05, + "loss": 2.0612, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 8.331122398376465, + "learning_rate": 2.9480006864595845e-05, + "loss": 1.6463, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 9.50797176361084, + "learning_rate": 2.9477432641153252e-05, + "loss": 1.3334, + "step": 203 + }, + { + "epoch": 0.04, + "grad_norm": 8.60113525390625, + "learning_rate": 2.947485841771066e-05, + "loss": 1.7248, + "step": 204 + }, + { + "epoch": 0.04, + "grad_norm": 8.610932350158691, + "learning_rate": 2.9472284194268066e-05, + "loss": 1.6779, + "step": 205 + }, + { + "epoch": 0.04, + "grad_norm": 9.794546127319336, + "learning_rate": 2.946970997082547e-05, + "loss": 1.7341, + "step": 206 + }, + { + "epoch": 0.04, + "grad_norm": 10.368892669677734, + "learning_rate": 2.9467135747382875e-05, + "loss": 1.9062, + "step": 207 + }, + { + "epoch": 0.04, + "grad_norm": 12.087629318237305, + "learning_rate": 2.946456152394028e-05, + "loss": 1.9048, + "step": 208 + }, + { + "epoch": 0.04, + "grad_norm": 9.668496131896973, + "learning_rate": 2.9461987300497682e-05, + "loss": 1.7576, + "step": 209 + }, + { + "epoch": 0.04, + "grad_norm": 10.209294319152832, + "learning_rate": 2.945941307705509e-05, + "loss": 1.9772, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 11.424263954162598, + "learning_rate": 2.9456838853612492e-05, + "loss": 1.6316, + "step": 211 + }, + { + "epoch": 0.04, + "grad_norm": 10.347501754760742, + "learning_rate": 2.94542646301699e-05, + "loss": 1.6466, + "step": 212 + }, + { + "epoch": 0.04, + "grad_norm": 9.764115333557129, + "learning_rate": 2.9451690406727305e-05, + "loss": 1.7222, + "step": 213 + }, + { + "epoch": 0.04, + "grad_norm": 11.490241050720215, + "learning_rate": 2.9449116183284712e-05, + "loss": 2.0773, + "step": 214 + }, + { + "epoch": 0.04, + "grad_norm": 12.803098678588867, + "learning_rate": 2.9446541959842115e-05, + "loss": 1.6786, + "step": 215 + }, + { + "epoch": 0.04, + "grad_norm": 9.36636734008789, + "learning_rate": 2.9443967736399522e-05, + "loss": 1.9252, + "step": 216 + }, + { + "epoch": 0.04, + "grad_norm": 10.605925559997559, + "learning_rate": 2.9441393512956925e-05, + "loss": 1.5526, + "step": 217 + }, + { + "epoch": 0.04, + "grad_norm": 11.705082893371582, + "learning_rate": 2.9438819289514332e-05, + "loss": 1.4581, + "step": 218 + }, + { + "epoch": 0.04, + "grad_norm": 10.624661445617676, + "learning_rate": 2.9436245066071735e-05, + "loss": 2.0116, + "step": 219 + }, + { + "epoch": 0.04, + "grad_norm": 8.22999382019043, + "learning_rate": 2.943367084262914e-05, + "loss": 1.5265, + "step": 220 + }, + { + "epoch": 0.04, + "grad_norm": 8.914277076721191, + "learning_rate": 2.9431096619186545e-05, + "loss": 1.633, + "step": 221 + }, + { + "epoch": 0.04, + "grad_norm": 8.711101531982422, + "learning_rate": 2.942852239574395e-05, + "loss": 1.7177, + "step": 222 + }, + { + "epoch": 0.04, + "grad_norm": 11.378776550292969, + "learning_rate": 2.942594817230136e-05, + "loss": 1.7416, + "step": 223 + }, + { + "epoch": 0.04, + "grad_norm": 9.701340675354004, + "learning_rate": 2.9423373948858762e-05, + "loss": 1.9424, + "step": 224 + }, + { + "epoch": 0.04, + "grad_norm": 8.773192405700684, + "learning_rate": 2.942079972541617e-05, + "loss": 1.6253, + "step": 225 + }, + { + "epoch": 0.04, + "grad_norm": 9.05998706817627, + "learning_rate": 2.9418225501973572e-05, + "loss": 1.4659, + "step": 226 + }, + { + "epoch": 0.04, + "grad_norm": 9.306830406188965, + "learning_rate": 2.941565127853098e-05, + "loss": 1.7184, + "step": 227 + }, + { + "epoch": 0.04, + "grad_norm": 9.687745094299316, + "learning_rate": 2.9413077055088382e-05, + "loss": 1.5897, + "step": 228 + }, + { + "epoch": 0.04, + "grad_norm": 10.609406471252441, + "learning_rate": 2.9410502831645785e-05, + "loss": 2.0312, + "step": 229 + }, + { + "epoch": 0.04, + "grad_norm": 11.43237018585205, + "learning_rate": 2.9407928608203192e-05, + "loss": 1.9321, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 10.16401195526123, + "learning_rate": 2.9405354384760595e-05, + "loss": 1.7838, + "step": 231 + }, + { + "epoch": 0.04, + "grad_norm": 8.92502212524414, + "learning_rate": 2.9402780161318005e-05, + "loss": 1.3907, + "step": 232 + }, + { + "epoch": 0.04, + "grad_norm": 9.83435344696045, + "learning_rate": 2.940020593787541e-05, + "loss": 1.6566, + "step": 233 + }, + { + "epoch": 0.04, + "grad_norm": 10.208948135375977, + "learning_rate": 2.9397631714432815e-05, + "loss": 2.1311, + "step": 234 + }, + { + "epoch": 0.04, + "grad_norm": 9.743586540222168, + "learning_rate": 2.939505749099022e-05, + "loss": 1.9852, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 8.744789123535156, + "learning_rate": 2.9392483267547625e-05, + "loss": 1.4427, + "step": 236 + }, + { + "epoch": 0.04, + "grad_norm": 9.780157089233398, + "learning_rate": 2.938990904410503e-05, + "loss": 1.4846, + "step": 237 + }, + { + "epoch": 0.04, + "grad_norm": 8.985099792480469, + "learning_rate": 2.9387334820662435e-05, + "loss": 1.3918, + "step": 238 + }, + { + "epoch": 0.04, + "grad_norm": 7.792673587799072, + "learning_rate": 2.938476059721984e-05, + "loss": 1.5816, + "step": 239 + }, + { + "epoch": 0.04, + "grad_norm": 8.847929954528809, + "learning_rate": 2.9382186373777242e-05, + "loss": 1.9881, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 8.681547164916992, + "learning_rate": 2.937961215033465e-05, + "loss": 1.5669, + "step": 241 + }, + { + "epoch": 0.04, + "grad_norm": 9.897395133972168, + "learning_rate": 2.9377037926892055e-05, + "loss": 2.1757, + "step": 242 + }, + { + "epoch": 0.04, + "grad_norm": 10.938973426818848, + "learning_rate": 2.9374463703449462e-05, + "loss": 1.4375, + "step": 243 + }, + { + "epoch": 0.04, + "grad_norm": 10.108646392822266, + "learning_rate": 2.9371889480006865e-05, + "loss": 1.6926, + "step": 244 + }, + { + "epoch": 0.04, + "grad_norm": 9.904474258422852, + "learning_rate": 2.9369315256564272e-05, + "loss": 1.3446, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 9.004955291748047, + "learning_rate": 2.9366741033121675e-05, + "loss": 1.4969, + "step": 246 + }, + { + "epoch": 0.04, + "grad_norm": 10.158055305480957, + "learning_rate": 2.9364166809679082e-05, + "loss": 2.0364, + "step": 247 + }, + { + "epoch": 0.04, + "grad_norm": 9.525121688842773, + "learning_rate": 2.9361592586236485e-05, + "loss": 1.7299, + "step": 248 + }, + { + "epoch": 0.04, + "grad_norm": 11.56649398803711, + "learning_rate": 2.9359018362793892e-05, + "loss": 1.7901, + "step": 249 + }, + { + "epoch": 0.04, + "grad_norm": 15.06497573852539, + "learning_rate": 2.9356444139351295e-05, + "loss": 2.3591, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 12.082416534423828, + "learning_rate": 2.9353869915908702e-05, + "loss": 1.9583, + "step": 251 + }, + { + "epoch": 0.04, + "grad_norm": 7.90018892288208, + "learning_rate": 2.935129569246611e-05, + "loss": 1.4908, + "step": 252 + }, + { + "epoch": 0.04, + "grad_norm": 12.350253105163574, + "learning_rate": 2.934872146902351e-05, + "loss": 1.7432, + "step": 253 + }, + { + "epoch": 0.04, + "grad_norm": 8.411694526672363, + "learning_rate": 2.934614724558092e-05, + "loss": 1.7825, + "step": 254 + }, + { + "epoch": 0.04, + "grad_norm": 9.787676811218262, + "learning_rate": 2.934357302213832e-05, + "loss": 1.9696, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 7.568073749542236, + "learning_rate": 2.934099879869573e-05, + "loss": 1.5693, + "step": 256 + }, + { + "epoch": 0.04, + "grad_norm": 7.90737247467041, + "learning_rate": 2.933842457525313e-05, + "loss": 1.56, + "step": 257 + }, + { + "epoch": 0.04, + "grad_norm": 8.960163116455078, + "learning_rate": 2.933585035181054e-05, + "loss": 1.6016, + "step": 258 + }, + { + "epoch": 0.04, + "grad_norm": 8.698493957519531, + "learning_rate": 2.933327612836794e-05, + "loss": 1.5217, + "step": 259 + }, + { + "epoch": 0.04, + "grad_norm": 10.947373390197754, + "learning_rate": 2.9330701904925348e-05, + "loss": 1.9468, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 9.384953498840332, + "learning_rate": 2.9328127681482755e-05, + "loss": 1.3466, + "step": 261 + }, + { + "epoch": 0.04, + "grad_norm": 11.936210632324219, + "learning_rate": 2.9325553458040158e-05, + "loss": 2.0577, + "step": 262 + }, + { + "epoch": 0.05, + "grad_norm": 17.217910766601562, + "learning_rate": 2.9322979234597565e-05, + "loss": 1.7381, + "step": 263 + }, + { + "epoch": 0.05, + "grad_norm": 10.802103996276855, + "learning_rate": 2.9320405011154968e-05, + "loss": 1.6338, + "step": 264 + }, + { + "epoch": 0.05, + "grad_norm": 9.834925651550293, + "learning_rate": 2.9317830787712375e-05, + "loss": 1.7873, + "step": 265 + }, + { + "epoch": 0.05, + "grad_norm": 9.63886547088623, + "learning_rate": 2.9315256564269778e-05, + "loss": 1.5055, + "step": 266 + }, + { + "epoch": 0.05, + "grad_norm": 11.555371284484863, + "learning_rate": 2.9312682340827185e-05, + "loss": 1.7096, + "step": 267 + }, + { + "epoch": 0.05, + "grad_norm": 12.70984172821045, + "learning_rate": 2.9310108117384588e-05, + "loss": 1.76, + "step": 268 + }, + { + "epoch": 0.05, + "grad_norm": 9.787165641784668, + "learning_rate": 2.9307533893941995e-05, + "loss": 1.5348, + "step": 269 + }, + { + "epoch": 0.05, + "grad_norm": 9.777557373046875, + "learning_rate": 2.93049596704994e-05, + "loss": 1.8856, + "step": 270 + }, + { + "epoch": 0.05, + "grad_norm": 10.017600059509277, + "learning_rate": 2.9302385447056805e-05, + "loss": 1.8646, + "step": 271 + }, + { + "epoch": 0.05, + "grad_norm": 8.701334953308105, + "learning_rate": 2.929981122361421e-05, + "loss": 1.531, + "step": 272 + }, + { + "epoch": 0.05, + "grad_norm": 10.639413833618164, + "learning_rate": 2.9297237000171615e-05, + "loss": 1.962, + "step": 273 + }, + { + "epoch": 0.05, + "grad_norm": 8.560542106628418, + "learning_rate": 2.929466277672902e-05, + "loss": 1.7777, + "step": 274 + }, + { + "epoch": 0.05, + "grad_norm": 8.680726051330566, + "learning_rate": 2.9292088553286425e-05, + "loss": 1.5241, + "step": 275 + }, + { + "epoch": 0.05, + "grad_norm": 8.093884468078613, + "learning_rate": 2.928951432984383e-05, + "loss": 1.6674, + "step": 276 + }, + { + "epoch": 0.05, + "grad_norm": 7.894911289215088, + "learning_rate": 2.9286940106401235e-05, + "loss": 1.5643, + "step": 277 + }, + { + "epoch": 0.05, + "grad_norm": 8.18982982635498, + "learning_rate": 2.928436588295864e-05, + "loss": 1.4667, + "step": 278 + }, + { + "epoch": 0.05, + "grad_norm": 8.561273574829102, + "learning_rate": 2.9281791659516045e-05, + "loss": 1.617, + "step": 279 + }, + { + "epoch": 0.05, + "grad_norm": 9.593453407287598, + "learning_rate": 2.9279217436073455e-05, + "loss": 1.6699, + "step": 280 + }, + { + "epoch": 0.05, + "grad_norm": 9.71127986907959, + "learning_rate": 2.9276643212630858e-05, + "loss": 1.6506, + "step": 281 + }, + { + "epoch": 0.05, + "grad_norm": 8.987764358520508, + "learning_rate": 2.927406898918826e-05, + "loss": 1.4503, + "step": 282 + }, + { + "epoch": 0.05, + "grad_norm": 10.146617889404297, + "learning_rate": 2.9271494765745668e-05, + "loss": 1.686, + "step": 283 + }, + { + "epoch": 0.05, + "grad_norm": 10.64239501953125, + "learning_rate": 2.926892054230307e-05, + "loss": 1.9325, + "step": 284 + }, + { + "epoch": 0.05, + "grad_norm": 8.928953170776367, + "learning_rate": 2.9266346318860478e-05, + "loss": 1.6489, + "step": 285 + }, + { + "epoch": 0.05, + "grad_norm": 9.513096809387207, + "learning_rate": 2.926377209541788e-05, + "loss": 1.7969, + "step": 286 + }, + { + "epoch": 0.05, + "grad_norm": 9.037224769592285, + "learning_rate": 2.9261197871975288e-05, + "loss": 1.6694, + "step": 287 + }, + { + "epoch": 0.05, + "grad_norm": 11.013139724731445, + "learning_rate": 2.925862364853269e-05, + "loss": 1.7193, + "step": 288 + }, + { + "epoch": 0.05, + "grad_norm": 8.478838920593262, + "learning_rate": 2.92560494250901e-05, + "loss": 1.4296, + "step": 289 + }, + { + "epoch": 0.05, + "grad_norm": 8.771774291992188, + "learning_rate": 2.9253475201647505e-05, + "loss": 1.8696, + "step": 290 + }, + { + "epoch": 0.05, + "grad_norm": 7.5445556640625, + "learning_rate": 2.925090097820491e-05, + "loss": 1.4061, + "step": 291 + }, + { + "epoch": 0.05, + "grad_norm": 10.020146369934082, + "learning_rate": 2.9248326754762315e-05, + "loss": 1.6945, + "step": 292 + }, + { + "epoch": 0.05, + "grad_norm": 9.573962211608887, + "learning_rate": 2.9245752531319718e-05, + "loss": 1.6359, + "step": 293 + }, + { + "epoch": 0.05, + "grad_norm": 8.782485008239746, + "learning_rate": 2.9243178307877125e-05, + "loss": 1.6955, + "step": 294 + }, + { + "epoch": 0.05, + "grad_norm": 9.277192115783691, + "learning_rate": 2.9240604084434528e-05, + "loss": 1.5417, + "step": 295 + }, + { + "epoch": 0.05, + "grad_norm": 9.727743148803711, + "learning_rate": 2.9238029860991935e-05, + "loss": 1.4512, + "step": 296 + }, + { + "epoch": 0.05, + "grad_norm": 9.91127872467041, + "learning_rate": 2.9235455637549338e-05, + "loss": 1.7006, + "step": 297 + }, + { + "epoch": 0.05, + "grad_norm": 8.304641723632812, + "learning_rate": 2.9232881414106748e-05, + "loss": 1.4382, + "step": 298 + }, + { + "epoch": 0.05, + "grad_norm": 8.38481616973877, + "learning_rate": 2.923030719066415e-05, + "loss": 1.5406, + "step": 299 + }, + { + "epoch": 0.05, + "grad_norm": 10.092000007629395, + "learning_rate": 2.9227732967221558e-05, + "loss": 1.5547, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 12.8936767578125, + "learning_rate": 2.922515874377896e-05, + "loss": 1.9512, + "step": 301 + }, + { + "epoch": 0.05, + "grad_norm": 9.751150131225586, + "learning_rate": 2.9222584520336365e-05, + "loss": 1.6481, + "step": 302 + }, + { + "epoch": 0.05, + "grad_norm": 10.386673927307129, + "learning_rate": 2.922001029689377e-05, + "loss": 1.7778, + "step": 303 + }, + { + "epoch": 0.05, + "grad_norm": 9.260998725891113, + "learning_rate": 2.9217436073451175e-05, + "loss": 1.4244, + "step": 304 + }, + { + "epoch": 0.05, + "grad_norm": 9.2503080368042, + "learning_rate": 2.921486185000858e-05, + "loss": 1.575, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 8.5574369430542, + "learning_rate": 2.9212287626565984e-05, + "loss": 1.6031, + "step": 306 + }, + { + "epoch": 0.05, + "grad_norm": 9.495893478393555, + "learning_rate": 2.920971340312339e-05, + "loss": 1.7797, + "step": 307 + }, + { + "epoch": 0.05, + "grad_norm": 8.136427879333496, + "learning_rate": 2.9207139179680798e-05, + "loss": 1.4174, + "step": 308 + }, + { + "epoch": 0.05, + "grad_norm": 8.032553672790527, + "learning_rate": 2.9204564956238205e-05, + "loss": 1.6757, + "step": 309 + }, + { + "epoch": 0.05, + "grad_norm": 11.711871147155762, + "learning_rate": 2.9201990732795608e-05, + "loss": 1.8516, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 8.710490226745605, + "learning_rate": 2.9199416509353015e-05, + "loss": 1.6837, + "step": 311 + }, + { + "epoch": 0.05, + "grad_norm": 8.25143814086914, + "learning_rate": 2.9196842285910418e-05, + "loss": 1.5361, + "step": 312 + }, + { + "epoch": 0.05, + "grad_norm": 10.08812427520752, + "learning_rate": 2.919426806246782e-05, + "loss": 1.3348, + "step": 313 + }, + { + "epoch": 0.05, + "grad_norm": 8.16710090637207, + "learning_rate": 2.9191693839025228e-05, + "loss": 1.3417, + "step": 314 + }, + { + "epoch": 0.05, + "grad_norm": 9.723457336425781, + "learning_rate": 2.918911961558263e-05, + "loss": 1.7711, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 9.169995307922363, + "learning_rate": 2.9186545392140038e-05, + "loss": 1.6362, + "step": 316 + }, + { + "epoch": 0.05, + "grad_norm": 12.744826316833496, + "learning_rate": 2.9183971168697444e-05, + "loss": 2.0765, + "step": 317 + }, + { + "epoch": 0.05, + "grad_norm": 9.706212997436523, + "learning_rate": 2.918139694525485e-05, + "loss": 1.6454, + "step": 318 + }, + { + "epoch": 0.05, + "grad_norm": 9.484731674194336, + "learning_rate": 2.9178822721812254e-05, + "loss": 1.7635, + "step": 319 + }, + { + "epoch": 0.05, + "grad_norm": 9.310269355773926, + "learning_rate": 2.917624849836966e-05, + "loss": 1.519, + "step": 320 + }, + { + "epoch": 0.06, + "grad_norm": 10.98282527923584, + "learning_rate": 2.9173674274927064e-05, + "loss": 2.0391, + "step": 321 + }, + { + "epoch": 0.06, + "grad_norm": 10.362037658691406, + "learning_rate": 2.917110005148447e-05, + "loss": 1.8951, + "step": 322 + }, + { + "epoch": 0.06, + "grad_norm": 9.465320587158203, + "learning_rate": 2.9168525828041874e-05, + "loss": 1.6288, + "step": 323 + }, + { + "epoch": 0.06, + "grad_norm": 9.646321296691895, + "learning_rate": 2.9165951604599278e-05, + "loss": 1.6041, + "step": 324 + }, + { + "epoch": 0.06, + "grad_norm": 8.398006439208984, + "learning_rate": 2.9163377381156684e-05, + "loss": 1.7292, + "step": 325 + }, + { + "epoch": 0.06, + "grad_norm": 9.411280632019043, + "learning_rate": 2.9160803157714088e-05, + "loss": 1.7537, + "step": 326 + }, + { + "epoch": 0.06, + "grad_norm": 9.08098316192627, + "learning_rate": 2.9158228934271498e-05, + "loss": 2.0472, + "step": 327 + }, + { + "epoch": 0.06, + "grad_norm": 9.14523983001709, + "learning_rate": 2.91556547108289e-05, + "loss": 1.6499, + "step": 328 + }, + { + "epoch": 0.06, + "grad_norm": 8.763137817382812, + "learning_rate": 2.9153080487386308e-05, + "loss": 1.5005, + "step": 329 + }, + { + "epoch": 0.06, + "grad_norm": 7.802804946899414, + "learning_rate": 2.915050626394371e-05, + "loss": 1.2987, + "step": 330 + }, + { + "epoch": 0.06, + "grad_norm": 9.917118072509766, + "learning_rate": 2.9147932040501118e-05, + "loss": 1.7462, + "step": 331 + }, + { + "epoch": 0.06, + "grad_norm": 10.044041633605957, + "learning_rate": 2.914535781705852e-05, + "loss": 1.9712, + "step": 332 + }, + { + "epoch": 0.06, + "grad_norm": 9.21104621887207, + "learning_rate": 2.9142783593615928e-05, + "loss": 1.5639, + "step": 333 + }, + { + "epoch": 0.06, + "grad_norm": 8.971760749816895, + "learning_rate": 2.914020937017333e-05, + "loss": 1.4879, + "step": 334 + }, + { + "epoch": 0.06, + "grad_norm": 9.787034034729004, + "learning_rate": 2.9137635146730734e-05, + "loss": 1.481, + "step": 335 + }, + { + "epoch": 0.06, + "grad_norm": 7.895040512084961, + "learning_rate": 2.9135060923288144e-05, + "loss": 1.4038, + "step": 336 + }, + { + "epoch": 0.06, + "grad_norm": 10.115073204040527, + "learning_rate": 2.9132486699845548e-05, + "loss": 1.4964, + "step": 337 + }, + { + "epoch": 0.06, + "grad_norm": 8.226001739501953, + "learning_rate": 2.9129912476402954e-05, + "loss": 1.5353, + "step": 338 + }, + { + "epoch": 0.06, + "grad_norm": 8.958303451538086, + "learning_rate": 2.9127338252960358e-05, + "loss": 1.6507, + "step": 339 + }, + { + "epoch": 0.06, + "grad_norm": 9.76336669921875, + "learning_rate": 2.9124764029517764e-05, + "loss": 1.7615, + "step": 340 + }, + { + "epoch": 0.06, + "grad_norm": 8.854474067687988, + "learning_rate": 2.9122189806075168e-05, + "loss": 1.675, + "step": 341 + }, + { + "epoch": 0.06, + "grad_norm": 8.663541793823242, + "learning_rate": 2.9119615582632574e-05, + "loss": 1.5917, + "step": 342 + }, + { + "epoch": 0.06, + "grad_norm": 7.632105350494385, + "learning_rate": 2.9117041359189977e-05, + "loss": 1.8109, + "step": 343 + }, + { + "epoch": 0.06, + "grad_norm": 8.303193092346191, + "learning_rate": 2.911446713574738e-05, + "loss": 1.5036, + "step": 344 + }, + { + "epoch": 0.06, + "grad_norm": 8.947858810424805, + "learning_rate": 2.9111892912304787e-05, + "loss": 1.5195, + "step": 345 + }, + { + "epoch": 0.06, + "grad_norm": 9.142110824584961, + "learning_rate": 2.9109318688862194e-05, + "loss": 1.5374, + "step": 346 + }, + { + "epoch": 0.06, + "grad_norm": 7.125122547149658, + "learning_rate": 2.91067444654196e-05, + "loss": 1.1923, + "step": 347 + }, + { + "epoch": 0.06, + "grad_norm": 9.325124740600586, + "learning_rate": 2.9104170241977004e-05, + "loss": 1.8373, + "step": 348 + }, + { + "epoch": 0.06, + "grad_norm": 9.541301727294922, + "learning_rate": 2.910159601853441e-05, + "loss": 1.428, + "step": 349 + }, + { + "epoch": 0.06, + "grad_norm": 8.802016258239746, + "learning_rate": 2.9099021795091814e-05, + "loss": 1.5451, + "step": 350 + }, + { + "epoch": 0.06, + "grad_norm": 9.415572166442871, + "learning_rate": 2.909644757164922e-05, + "loss": 1.5812, + "step": 351 + }, + { + "epoch": 0.06, + "grad_norm": 9.448362350463867, + "learning_rate": 2.9093873348206624e-05, + "loss": 1.7371, + "step": 352 + }, + { + "epoch": 0.06, + "grad_norm": 8.94411849975586, + "learning_rate": 2.909129912476403e-05, + "loss": 1.5187, + "step": 353 + }, + { + "epoch": 0.06, + "grad_norm": 9.400604248046875, + "learning_rate": 2.9088724901321434e-05, + "loss": 1.6639, + "step": 354 + }, + { + "epoch": 0.06, + "grad_norm": 9.481334686279297, + "learning_rate": 2.908615067787884e-05, + "loss": 1.515, + "step": 355 + }, + { + "epoch": 0.06, + "grad_norm": 10.073720932006836, + "learning_rate": 2.9083576454436247e-05, + "loss": 1.6674, + "step": 356 + }, + { + "epoch": 0.06, + "grad_norm": 10.867945671081543, + "learning_rate": 2.908100223099365e-05, + "loss": 1.9646, + "step": 357 + }, + { + "epoch": 0.06, + "grad_norm": 10.329514503479004, + "learning_rate": 2.9078428007551057e-05, + "loss": 1.8233, + "step": 358 + }, + { + "epoch": 0.06, + "grad_norm": 9.168272972106934, + "learning_rate": 2.907585378410846e-05, + "loss": 1.6868, + "step": 359 + }, + { + "epoch": 0.06, + "grad_norm": 9.724711418151855, + "learning_rate": 2.9073279560665867e-05, + "loss": 1.644, + "step": 360 + }, + { + "epoch": 0.06, + "grad_norm": 8.938602447509766, + "learning_rate": 2.907070533722327e-05, + "loss": 1.5774, + "step": 361 + }, + { + "epoch": 0.06, + "grad_norm": 8.528223037719727, + "learning_rate": 2.9068131113780677e-05, + "loss": 1.4291, + "step": 362 + }, + { + "epoch": 0.06, + "grad_norm": 9.6717529296875, + "learning_rate": 2.906555689033808e-05, + "loss": 1.5272, + "step": 363 + }, + { + "epoch": 0.06, + "grad_norm": 8.576579093933105, + "learning_rate": 2.9062982666895487e-05, + "loss": 1.3728, + "step": 364 + }, + { + "epoch": 0.06, + "grad_norm": 9.873361587524414, + "learning_rate": 2.9060408443452894e-05, + "loss": 1.8679, + "step": 365 + }, + { + "epoch": 0.06, + "grad_norm": 10.140089988708496, + "learning_rate": 2.9057834220010297e-05, + "loss": 1.4819, + "step": 366 + }, + { + "epoch": 0.06, + "grad_norm": 13.3444185256958, + "learning_rate": 2.9055259996567704e-05, + "loss": 1.7938, + "step": 367 + }, + { + "epoch": 0.06, + "grad_norm": 9.555766105651855, + "learning_rate": 2.9052685773125107e-05, + "loss": 1.586, + "step": 368 + }, + { + "epoch": 0.06, + "grad_norm": 9.24820613861084, + "learning_rate": 2.9050111549682514e-05, + "loss": 1.4957, + "step": 369 + }, + { + "epoch": 0.06, + "grad_norm": 11.031588554382324, + "learning_rate": 2.9047537326239917e-05, + "loss": 1.7064, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 9.380867958068848, + "learning_rate": 2.9044963102797324e-05, + "loss": 1.4169, + "step": 371 + }, + { + "epoch": 0.06, + "grad_norm": 11.075194358825684, + "learning_rate": 2.9042388879354727e-05, + "loss": 1.5741, + "step": 372 + }, + { + "epoch": 0.06, + "grad_norm": 8.459732055664062, + "learning_rate": 2.9039814655912134e-05, + "loss": 1.3803, + "step": 373 + }, + { + "epoch": 0.06, + "grad_norm": 9.930313110351562, + "learning_rate": 2.903724043246954e-05, + "loss": 1.6806, + "step": 374 + }, + { + "epoch": 0.06, + "grad_norm": 9.009334564208984, + "learning_rate": 2.9034666209026944e-05, + "loss": 1.8382, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 9.107048988342285, + "learning_rate": 2.903209198558435e-05, + "loss": 1.706, + "step": 376 + }, + { + "epoch": 0.06, + "grad_norm": 9.72223949432373, + "learning_rate": 2.9029517762141754e-05, + "loss": 1.5575, + "step": 377 + }, + { + "epoch": 0.06, + "grad_norm": 11.12093448638916, + "learning_rate": 2.902694353869916e-05, + "loss": 1.8506, + "step": 378 + }, + { + "epoch": 0.07, + "grad_norm": 9.984782218933105, + "learning_rate": 2.9024369315256564e-05, + "loss": 1.6462, + "step": 379 + }, + { + "epoch": 0.07, + "grad_norm": 8.998955726623535, + "learning_rate": 2.902179509181397e-05, + "loss": 1.5175, + "step": 380 + }, + { + "epoch": 0.07, + "grad_norm": 8.353002548217773, + "learning_rate": 2.9019220868371374e-05, + "loss": 1.3723, + "step": 381 + }, + { + "epoch": 0.07, + "grad_norm": 10.063586235046387, + "learning_rate": 2.901664664492878e-05, + "loss": 1.5693, + "step": 382 + }, + { + "epoch": 0.07, + "grad_norm": 8.760736465454102, + "learning_rate": 2.9014072421486184e-05, + "loss": 1.4859, + "step": 383 + }, + { + "epoch": 0.07, + "grad_norm": 11.812806129455566, + "learning_rate": 2.9011498198043594e-05, + "loss": 1.4088, + "step": 384 + }, + { + "epoch": 0.07, + "grad_norm": 9.350480079650879, + "learning_rate": 2.9008923974600997e-05, + "loss": 1.5996, + "step": 385 + }, + { + "epoch": 0.07, + "grad_norm": 8.718733787536621, + "learning_rate": 2.90063497511584e-05, + "loss": 1.232, + "step": 386 + }, + { + "epoch": 0.07, + "grad_norm": 7.795390605926514, + "learning_rate": 2.9003775527715807e-05, + "loss": 1.2631, + "step": 387 + }, + { + "epoch": 0.07, + "grad_norm": 9.643641471862793, + "learning_rate": 2.900120130427321e-05, + "loss": 1.6076, + "step": 388 + }, + { + "epoch": 0.07, + "grad_norm": 9.053388595581055, + "learning_rate": 2.8998627080830617e-05, + "loss": 1.6208, + "step": 389 + }, + { + "epoch": 0.07, + "grad_norm": 9.285635948181152, + "learning_rate": 2.899605285738802e-05, + "loss": 1.9027, + "step": 390 + }, + { + "epoch": 0.07, + "grad_norm": 8.512655258178711, + "learning_rate": 2.8993478633945427e-05, + "loss": 1.6828, + "step": 391 + }, + { + "epoch": 0.07, + "grad_norm": 8.464376449584961, + "learning_rate": 2.899090441050283e-05, + "loss": 1.2634, + "step": 392 + }, + { + "epoch": 0.07, + "grad_norm": 9.082260131835938, + "learning_rate": 2.898833018706024e-05, + "loss": 1.3558, + "step": 393 + }, + { + "epoch": 0.07, + "grad_norm": 8.868090629577637, + "learning_rate": 2.8985755963617644e-05, + "loss": 1.8094, + "step": 394 + }, + { + "epoch": 0.07, + "grad_norm": 8.074893951416016, + "learning_rate": 2.898318174017505e-05, + "loss": 1.5454, + "step": 395 + }, + { + "epoch": 0.07, + "grad_norm": 8.83586597442627, + "learning_rate": 2.8980607516732454e-05, + "loss": 1.6831, + "step": 396 + }, + { + "epoch": 0.07, + "grad_norm": 8.7890043258667, + "learning_rate": 2.8978033293289857e-05, + "loss": 1.3461, + "step": 397 + }, + { + "epoch": 0.07, + "grad_norm": 9.520912170410156, + "learning_rate": 2.8975459069847264e-05, + "loss": 1.6014, + "step": 398 + }, + { + "epoch": 0.07, + "grad_norm": 10.111466407775879, + "learning_rate": 2.8972884846404667e-05, + "loss": 1.8845, + "step": 399 + }, + { + "epoch": 0.07, + "grad_norm": 10.061944007873535, + "learning_rate": 2.8970310622962074e-05, + "loss": 1.7318, + "step": 400 + }, + { + "epoch": 0.07, + "grad_norm": 8.755029678344727, + "learning_rate": 2.8967736399519477e-05, + "loss": 1.425, + "step": 401 + }, + { + "epoch": 0.07, + "grad_norm": 7.505181789398193, + "learning_rate": 2.8965162176076884e-05, + "loss": 1.4305, + "step": 402 + }, + { + "epoch": 0.07, + "grad_norm": 10.357077598571777, + "learning_rate": 2.896258795263429e-05, + "loss": 1.6816, + "step": 403 + }, + { + "epoch": 0.07, + "grad_norm": 8.416088104248047, + "learning_rate": 2.8960013729191697e-05, + "loss": 1.7239, + "step": 404 + }, + { + "epoch": 0.07, + "grad_norm": 8.75820541381836, + "learning_rate": 2.89574395057491e-05, + "loss": 1.6036, + "step": 405 + }, + { + "epoch": 0.07, + "grad_norm": 8.285774230957031, + "learning_rate": 2.8954865282306504e-05, + "loss": 1.2902, + "step": 406 + }, + { + "epoch": 0.07, + "grad_norm": 8.048407554626465, + "learning_rate": 2.895229105886391e-05, + "loss": 1.5071, + "step": 407 + }, + { + "epoch": 0.07, + "grad_norm": 7.721714019775391, + "learning_rate": 2.8949716835421314e-05, + "loss": 1.7365, + "step": 408 + }, + { + "epoch": 0.07, + "grad_norm": 8.985815048217773, + "learning_rate": 2.894714261197872e-05, + "loss": 1.3089, + "step": 409 + }, + { + "epoch": 0.07, + "grad_norm": 8.920326232910156, + "learning_rate": 2.8944568388536123e-05, + "loss": 1.8378, + "step": 410 + }, + { + "epoch": 0.07, + "grad_norm": 8.613027572631836, + "learning_rate": 2.894199416509353e-05, + "loss": 1.2882, + "step": 411 + }, + { + "epoch": 0.07, + "grad_norm": 10.50846004486084, + "learning_rate": 2.8939419941650937e-05, + "loss": 1.7531, + "step": 412 + }, + { + "epoch": 0.07, + "grad_norm": 8.982809066772461, + "learning_rate": 2.8936845718208344e-05, + "loss": 1.6533, + "step": 413 + }, + { + "epoch": 0.07, + "grad_norm": 8.472440719604492, + "learning_rate": 2.8934271494765747e-05, + "loss": 1.4576, + "step": 414 + }, + { + "epoch": 0.07, + "grad_norm": 7.862814903259277, + "learning_rate": 2.8931697271323154e-05, + "loss": 1.2296, + "step": 415 + }, + { + "epoch": 0.07, + "grad_norm": 9.792984962463379, + "learning_rate": 2.8929123047880557e-05, + "loss": 1.4784, + "step": 416 + }, + { + "epoch": 0.07, + "grad_norm": 7.533136367797852, + "learning_rate": 2.892654882443796e-05, + "loss": 1.3715, + "step": 417 + }, + { + "epoch": 0.07, + "grad_norm": 8.728937149047852, + "learning_rate": 2.8923974600995367e-05, + "loss": 1.2633, + "step": 418 + }, + { + "epoch": 0.07, + "grad_norm": 8.582226753234863, + "learning_rate": 2.892140037755277e-05, + "loss": 1.4999, + "step": 419 + }, + { + "epoch": 0.07, + "grad_norm": 9.515429496765137, + "learning_rate": 2.8918826154110177e-05, + "loss": 1.8531, + "step": 420 + }, + { + "epoch": 0.07, + "grad_norm": 11.933034896850586, + "learning_rate": 2.8916251930667583e-05, + "loss": 2.1722, + "step": 421 + }, + { + "epoch": 0.07, + "grad_norm": 11.477798461914062, + "learning_rate": 2.891367770722499e-05, + "loss": 1.4052, + "step": 422 + }, + { + "epoch": 0.07, + "grad_norm": 10.258628845214844, + "learning_rate": 2.8911103483782393e-05, + "loss": 1.707, + "step": 423 + }, + { + "epoch": 0.07, + "grad_norm": 10.19088077545166, + "learning_rate": 2.89085292603398e-05, + "loss": 1.489, + "step": 424 + }, + { + "epoch": 0.07, + "grad_norm": 9.1199312210083, + "learning_rate": 2.8905955036897203e-05, + "loss": 1.4829, + "step": 425 + }, + { + "epoch": 0.07, + "grad_norm": 9.810800552368164, + "learning_rate": 2.890338081345461e-05, + "loss": 1.5978, + "step": 426 + }, + { + "epoch": 0.07, + "grad_norm": 10.57884407043457, + "learning_rate": 2.8900806590012013e-05, + "loss": 1.4805, + "step": 427 + }, + { + "epoch": 0.07, + "grad_norm": 9.866064071655273, + "learning_rate": 2.8898232366569417e-05, + "loss": 1.7046, + "step": 428 + }, + { + "epoch": 0.07, + "grad_norm": 7.306712627410889, + "learning_rate": 2.8895658143126823e-05, + "loss": 1.1738, + "step": 429 + }, + { + "epoch": 0.07, + "grad_norm": 8.592244148254395, + "learning_rate": 2.8893083919684227e-05, + "loss": 1.5762, + "step": 430 + }, + { + "epoch": 0.07, + "grad_norm": 9.549958229064941, + "learning_rate": 2.8890509696241637e-05, + "loss": 1.4738, + "step": 431 + }, + { + "epoch": 0.07, + "grad_norm": 8.925586700439453, + "learning_rate": 2.888793547279904e-05, + "loss": 1.6632, + "step": 432 + }, + { + "epoch": 0.07, + "grad_norm": 9.961637496948242, + "learning_rate": 2.8885361249356447e-05, + "loss": 1.6991, + "step": 433 + }, + { + "epoch": 0.07, + "grad_norm": 10.438910484313965, + "learning_rate": 2.888278702591385e-05, + "loss": 1.7705, + "step": 434 + }, + { + "epoch": 0.07, + "grad_norm": 8.384795188903809, + "learning_rate": 2.8880212802471257e-05, + "loss": 1.411, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 10.856956481933594, + "learning_rate": 2.887763857902866e-05, + "loss": 2.1184, + "step": 436 + }, + { + "epoch": 0.07, + "grad_norm": 8.427003860473633, + "learning_rate": 2.8875064355586067e-05, + "loss": 1.4459, + "step": 437 + }, + { + "epoch": 0.08, + "grad_norm": 8.227859497070312, + "learning_rate": 2.887249013214347e-05, + "loss": 1.6471, + "step": 438 + }, + { + "epoch": 0.08, + "grad_norm": 8.303044319152832, + "learning_rate": 2.8869915908700873e-05, + "loss": 1.3036, + "step": 439 + }, + { + "epoch": 0.08, + "grad_norm": 8.696854591369629, + "learning_rate": 2.8867341685258283e-05, + "loss": 1.5718, + "step": 440 + }, + { + "epoch": 0.08, + "grad_norm": 9.387533187866211, + "learning_rate": 2.8864767461815687e-05, + "loss": 1.596, + "step": 441 + }, + { + "epoch": 0.08, + "grad_norm": 7.539954662322998, + "learning_rate": 2.8862193238373093e-05, + "loss": 1.1893, + "step": 442 + }, + { + "epoch": 0.08, + "grad_norm": 7.909122467041016, + "learning_rate": 2.8859619014930497e-05, + "loss": 1.4737, + "step": 443 + }, + { + "epoch": 0.08, + "grad_norm": 9.117277145385742, + "learning_rate": 2.8857044791487903e-05, + "loss": 1.3799, + "step": 444 + }, + { + "epoch": 0.08, + "grad_norm": 10.359807968139648, + "learning_rate": 2.8854470568045307e-05, + "loss": 1.7931, + "step": 445 + }, + { + "epoch": 0.08, + "grad_norm": 9.885112762451172, + "learning_rate": 2.8851896344602713e-05, + "loss": 1.5852, + "step": 446 + }, + { + "epoch": 0.08, + "grad_norm": 8.114673614501953, + "learning_rate": 2.8849322121160116e-05, + "loss": 1.4774, + "step": 447 + }, + { + "epoch": 0.08, + "grad_norm": 8.589859962463379, + "learning_rate": 2.884674789771752e-05, + "loss": 1.6173, + "step": 448 + }, + { + "epoch": 0.08, + "grad_norm": 9.234646797180176, + "learning_rate": 2.8844173674274926e-05, + "loss": 1.2883, + "step": 449 + }, + { + "epoch": 0.08, + "grad_norm": 9.6058931350708, + "learning_rate": 2.8841599450832333e-05, + "loss": 1.2674, + "step": 450 + }, + { + "epoch": 0.08, + "grad_norm": 11.516396522521973, + "learning_rate": 2.883902522738974e-05, + "loss": 1.7395, + "step": 451 + }, + { + "epoch": 0.08, + "grad_norm": 9.401927947998047, + "learning_rate": 2.8836451003947143e-05, + "loss": 1.6409, + "step": 452 + }, + { + "epoch": 0.08, + "grad_norm": 9.46969223022461, + "learning_rate": 2.883387678050455e-05, + "loss": 0.9971, + "step": 453 + }, + { + "epoch": 0.08, + "grad_norm": 8.885191917419434, + "learning_rate": 2.8831302557061953e-05, + "loss": 1.2535, + "step": 454 + }, + { + "epoch": 0.08, + "grad_norm": 10.642657279968262, + "learning_rate": 2.882872833361936e-05, + "loss": 1.7177, + "step": 455 + }, + { + "epoch": 0.08, + "grad_norm": 10.326385498046875, + "learning_rate": 2.8826154110176763e-05, + "loss": 1.5923, + "step": 456 + }, + { + "epoch": 0.08, + "grad_norm": 8.094305992126465, + "learning_rate": 2.882357988673417e-05, + "loss": 1.5446, + "step": 457 + }, + { + "epoch": 0.08, + "grad_norm": 9.640613555908203, + "learning_rate": 2.8821005663291573e-05, + "loss": 1.4332, + "step": 458 + }, + { + "epoch": 0.08, + "grad_norm": 8.753673553466797, + "learning_rate": 2.881843143984898e-05, + "loss": 1.7025, + "step": 459 + }, + { + "epoch": 0.08, + "grad_norm": 10.737837791442871, + "learning_rate": 2.8815857216406386e-05, + "loss": 2.0308, + "step": 460 + }, + { + "epoch": 0.08, + "grad_norm": 8.723050117492676, + "learning_rate": 2.881328299296379e-05, + "loss": 1.5147, + "step": 461 + }, + { + "epoch": 0.08, + "grad_norm": 8.3440523147583, + "learning_rate": 2.8810708769521196e-05, + "loss": 1.5351, + "step": 462 + }, + { + "epoch": 0.08, + "grad_norm": 9.674872398376465, + "learning_rate": 2.88081345460786e-05, + "loss": 1.7827, + "step": 463 + }, + { + "epoch": 0.08, + "grad_norm": 8.43867301940918, + "learning_rate": 2.8805560322636006e-05, + "loss": 1.6664, + "step": 464 + }, + { + "epoch": 0.08, + "grad_norm": 10.447357177734375, + "learning_rate": 2.880298609919341e-05, + "loss": 1.5267, + "step": 465 + }, + { + "epoch": 0.08, + "grad_norm": 9.519311904907227, + "learning_rate": 2.8800411875750816e-05, + "loss": 1.6896, + "step": 466 + }, + { + "epoch": 0.08, + "grad_norm": 8.392447471618652, + "learning_rate": 2.879783765230822e-05, + "loss": 1.2946, + "step": 467 + }, + { + "epoch": 0.08, + "grad_norm": 7.106021881103516, + "learning_rate": 2.8795263428865626e-05, + "loss": 1.5775, + "step": 468 + }, + { + "epoch": 0.08, + "grad_norm": 7.461575508117676, + "learning_rate": 2.8792689205423033e-05, + "loss": 1.204, + "step": 469 + }, + { + "epoch": 0.08, + "grad_norm": 7.577383518218994, + "learning_rate": 2.8790114981980436e-05, + "loss": 1.5391, + "step": 470 + }, + { + "epoch": 0.08, + "grad_norm": 10.269767761230469, + "learning_rate": 2.8787540758537843e-05, + "loss": 2.0634, + "step": 471 + }, + { + "epoch": 0.08, + "grad_norm": 8.887826919555664, + "learning_rate": 2.8784966535095246e-05, + "loss": 1.4488, + "step": 472 + }, + { + "epoch": 0.08, + "grad_norm": 8.82689094543457, + "learning_rate": 2.8782392311652653e-05, + "loss": 2.0102, + "step": 473 + }, + { + "epoch": 0.08, + "grad_norm": 7.734695911407471, + "learning_rate": 2.8779818088210056e-05, + "loss": 1.5332, + "step": 474 + }, + { + "epoch": 0.08, + "grad_norm": 8.526495933532715, + "learning_rate": 2.8777243864767463e-05, + "loss": 1.5298, + "step": 475 + }, + { + "epoch": 0.08, + "grad_norm": 9.68381118774414, + "learning_rate": 2.8774669641324866e-05, + "loss": 1.8056, + "step": 476 + }, + { + "epoch": 0.08, + "grad_norm": 8.388910293579102, + "learning_rate": 2.8772095417882273e-05, + "loss": 1.6498, + "step": 477 + }, + { + "epoch": 0.08, + "grad_norm": 9.723257064819336, + "learning_rate": 2.876952119443968e-05, + "loss": 1.5671, + "step": 478 + }, + { + "epoch": 0.08, + "grad_norm": 9.094700813293457, + "learning_rate": 2.8766946970997083e-05, + "loss": 1.5728, + "step": 479 + }, + { + "epoch": 0.08, + "grad_norm": 7.833842754364014, + "learning_rate": 2.876437274755449e-05, + "loss": 1.2576, + "step": 480 + }, + { + "epoch": 0.08, + "grad_norm": 10.016430854797363, + "learning_rate": 2.8761798524111893e-05, + "loss": 1.7658, + "step": 481 + }, + { + "epoch": 0.08, + "grad_norm": 9.015495300292969, + "learning_rate": 2.87592243006693e-05, + "loss": 1.6063, + "step": 482 + }, + { + "epoch": 0.08, + "grad_norm": 8.249187469482422, + "learning_rate": 2.8756650077226703e-05, + "loss": 1.4659, + "step": 483 + }, + { + "epoch": 0.08, + "grad_norm": 9.637657165527344, + "learning_rate": 2.875407585378411e-05, + "loss": 1.4088, + "step": 484 + }, + { + "epoch": 0.08, + "grad_norm": 9.466156005859375, + "learning_rate": 2.8751501630341513e-05, + "loss": 1.4845, + "step": 485 + }, + { + "epoch": 0.08, + "grad_norm": 8.58089828491211, + "learning_rate": 2.874892740689892e-05, + "loss": 1.5108, + "step": 486 + }, + { + "epoch": 0.08, + "grad_norm": 8.993913650512695, + "learning_rate": 2.8746353183456323e-05, + "loss": 1.3799, + "step": 487 + }, + { + "epoch": 0.08, + "grad_norm": 10.493663787841797, + "learning_rate": 2.8743778960013733e-05, + "loss": 1.7722, + "step": 488 + }, + { + "epoch": 0.08, + "grad_norm": 8.064780235290527, + "learning_rate": 2.8741204736571136e-05, + "loss": 1.5231, + "step": 489 + }, + { + "epoch": 0.08, + "grad_norm": 8.072488784790039, + "learning_rate": 2.873863051312854e-05, + "loss": 1.385, + "step": 490 + }, + { + "epoch": 0.08, + "grad_norm": 9.39823055267334, + "learning_rate": 2.8736056289685946e-05, + "loss": 1.6999, + "step": 491 + }, + { + "epoch": 0.08, + "grad_norm": 8.070072174072266, + "learning_rate": 2.873348206624335e-05, + "loss": 1.579, + "step": 492 + }, + { + "epoch": 0.08, + "grad_norm": 9.037577629089355, + "learning_rate": 2.8730907842800756e-05, + "loss": 1.7168, + "step": 493 + }, + { + "epoch": 0.08, + "grad_norm": 8.199870109558105, + "learning_rate": 2.872833361935816e-05, + "loss": 1.4653, + "step": 494 + }, + { + "epoch": 0.08, + "grad_norm": 8.702022552490234, + "learning_rate": 2.8725759395915566e-05, + "loss": 1.2119, + "step": 495 + }, + { + "epoch": 0.09, + "grad_norm": 7.548079013824463, + "learning_rate": 2.872318517247297e-05, + "loss": 1.398, + "step": 496 + }, + { + "epoch": 0.09, + "grad_norm": 8.74194622039795, + "learning_rate": 2.872061094903038e-05, + "loss": 1.3153, + "step": 497 + }, + { + "epoch": 0.09, + "grad_norm": 8.39633560180664, + "learning_rate": 2.8718036725587783e-05, + "loss": 1.1622, + "step": 498 + }, + { + "epoch": 0.09, + "grad_norm": 8.131392478942871, + "learning_rate": 2.871546250214519e-05, + "loss": 1.4024, + "step": 499 + }, + { + "epoch": 0.09, + "grad_norm": 8.805859565734863, + "learning_rate": 2.8712888278702593e-05, + "loss": 1.2898, + "step": 500 + }, + { + "epoch": 0.09, + "grad_norm": 9.168538093566895, + "learning_rate": 2.8710314055259996e-05, + "loss": 1.7268, + "step": 501 + }, + { + "epoch": 0.09, + "grad_norm": 9.682873725891113, + "learning_rate": 2.8707739831817403e-05, + "loss": 1.4385, + "step": 502 + }, + { + "epoch": 0.09, + "grad_norm": 10.96394157409668, + "learning_rate": 2.8705165608374806e-05, + "loss": 1.3478, + "step": 503 + }, + { + "epoch": 0.09, + "grad_norm": 10.533995628356934, + "learning_rate": 2.8702591384932213e-05, + "loss": 1.664, + "step": 504 + }, + { + "epoch": 0.09, + "grad_norm": 9.12846851348877, + "learning_rate": 2.8700017161489616e-05, + "loss": 1.2708, + "step": 505 + }, + { + "epoch": 0.09, + "grad_norm": 10.074721336364746, + "learning_rate": 2.8697442938047023e-05, + "loss": 1.283, + "step": 506 + }, + { + "epoch": 0.09, + "grad_norm": 8.145303726196289, + "learning_rate": 2.869486871460443e-05, + "loss": 0.968, + "step": 507 + }, + { + "epoch": 0.09, + "grad_norm": 9.491410255432129, + "learning_rate": 2.8692294491161836e-05, + "loss": 1.7349, + "step": 508 + }, + { + "epoch": 0.09, + "grad_norm": 9.441530227661133, + "learning_rate": 2.868972026771924e-05, + "loss": 1.3327, + "step": 509 + }, + { + "epoch": 0.09, + "grad_norm": 9.813334465026855, + "learning_rate": 2.8687146044276643e-05, + "loss": 1.4541, + "step": 510 + }, + { + "epoch": 0.09, + "grad_norm": 9.257591247558594, + "learning_rate": 2.868457182083405e-05, + "loss": 1.3805, + "step": 511 + }, + { + "epoch": 0.09, + "grad_norm": 9.843669891357422, + "learning_rate": 2.8681997597391453e-05, + "loss": 1.6476, + "step": 512 + }, + { + "epoch": 0.09, + "grad_norm": 9.05841064453125, + "learning_rate": 2.867942337394886e-05, + "loss": 1.5867, + "step": 513 + }, + { + "epoch": 0.09, + "grad_norm": 7.566347122192383, + "learning_rate": 2.8676849150506263e-05, + "loss": 1.1905, + "step": 514 + }, + { + "epoch": 0.09, + "grad_norm": 8.416635513305664, + "learning_rate": 2.867427492706367e-05, + "loss": 1.5511, + "step": 515 + }, + { + "epoch": 0.09, + "grad_norm": 9.213944435119629, + "learning_rate": 2.8671700703621076e-05, + "loss": 1.5191, + "step": 516 + }, + { + "epoch": 0.09, + "grad_norm": 8.978853225708008, + "learning_rate": 2.8669126480178483e-05, + "loss": 1.5472, + "step": 517 + }, + { + "epoch": 0.09, + "grad_norm": 8.227170944213867, + "learning_rate": 2.8666552256735886e-05, + "loss": 1.5105, + "step": 518 + }, + { + "epoch": 0.09, + "grad_norm": 7.5068464279174805, + "learning_rate": 2.8663978033293293e-05, + "loss": 1.5845, + "step": 519 + }, + { + "epoch": 0.09, + "grad_norm": 9.044169425964355, + "learning_rate": 2.8661403809850696e-05, + "loss": 1.2448, + "step": 520 + }, + { + "epoch": 0.09, + "grad_norm": 9.292195320129395, + "learning_rate": 2.86588295864081e-05, + "loss": 1.5076, + "step": 521 + }, + { + "epoch": 0.09, + "grad_norm": 7.377120018005371, + "learning_rate": 2.8656255362965506e-05, + "loss": 1.4297, + "step": 522 + }, + { + "epoch": 0.09, + "grad_norm": 8.932923316955566, + "learning_rate": 2.865368113952291e-05, + "loss": 1.3458, + "step": 523 + }, + { + "epoch": 0.09, + "grad_norm": 8.574246406555176, + "learning_rate": 2.8651106916080316e-05, + "loss": 1.2264, + "step": 524 + }, + { + "epoch": 0.09, + "grad_norm": 8.966174125671387, + "learning_rate": 2.8648532692637722e-05, + "loss": 1.4927, + "step": 525 + }, + { + "epoch": 0.09, + "grad_norm": 10.199288368225098, + "learning_rate": 2.864595846919513e-05, + "loss": 1.7801, + "step": 526 + }, + { + "epoch": 0.09, + "grad_norm": 9.957565307617188, + "learning_rate": 2.8643384245752532e-05, + "loss": 1.4717, + "step": 527 + }, + { + "epoch": 0.09, + "grad_norm": 9.930829048156738, + "learning_rate": 2.864081002230994e-05, + "loss": 1.345, + "step": 528 + }, + { + "epoch": 0.09, + "grad_norm": 8.972787857055664, + "learning_rate": 2.8638235798867342e-05, + "loss": 1.5369, + "step": 529 + }, + { + "epoch": 0.09, + "grad_norm": 10.50515365600586, + "learning_rate": 2.863566157542475e-05, + "loss": 1.5552, + "step": 530 + }, + { + "epoch": 0.09, + "grad_norm": 8.620457649230957, + "learning_rate": 2.8633087351982152e-05, + "loss": 1.2706, + "step": 531 + }, + { + "epoch": 0.09, + "grad_norm": 9.689730644226074, + "learning_rate": 2.8630513128539556e-05, + "loss": 1.4852, + "step": 532 + }, + { + "epoch": 0.09, + "grad_norm": 9.077995300292969, + "learning_rate": 2.8627938905096962e-05, + "loss": 1.68, + "step": 533 + }, + { + "epoch": 0.09, + "grad_norm": 7.704209804534912, + "learning_rate": 2.8625364681654366e-05, + "loss": 1.5412, + "step": 534 + }, + { + "epoch": 0.09, + "grad_norm": 9.958584785461426, + "learning_rate": 2.8622790458211776e-05, + "loss": 1.772, + "step": 535 + }, + { + "epoch": 0.09, + "grad_norm": 8.211577415466309, + "learning_rate": 2.862021623476918e-05, + "loss": 1.6424, + "step": 536 + }, + { + "epoch": 0.09, + "grad_norm": 8.039546966552734, + "learning_rate": 2.8617642011326586e-05, + "loss": 1.4005, + "step": 537 + }, + { + "epoch": 0.09, + "grad_norm": 8.478930473327637, + "learning_rate": 2.861506778788399e-05, + "loss": 1.602, + "step": 538 + }, + { + "epoch": 0.09, + "grad_norm": 9.348405838012695, + "learning_rate": 2.8612493564441396e-05, + "loss": 1.4291, + "step": 539 + }, + { + "epoch": 0.09, + "grad_norm": 9.137516975402832, + "learning_rate": 2.86099193409988e-05, + "loss": 1.5088, + "step": 540 + }, + { + "epoch": 0.09, + "grad_norm": 8.610368728637695, + "learning_rate": 2.8607345117556206e-05, + "loss": 1.6462, + "step": 541 + }, + { + "epoch": 0.09, + "grad_norm": 10.408897399902344, + "learning_rate": 2.860477089411361e-05, + "loss": 1.5357, + "step": 542 + }, + { + "epoch": 0.09, + "grad_norm": 8.426880836486816, + "learning_rate": 2.8602196670671012e-05, + "loss": 1.4384, + "step": 543 + }, + { + "epoch": 0.09, + "grad_norm": 9.32236099243164, + "learning_rate": 2.8599622447228422e-05, + "loss": 1.5944, + "step": 544 + }, + { + "epoch": 0.09, + "grad_norm": 8.563417434692383, + "learning_rate": 2.8597048223785826e-05, + "loss": 1.4292, + "step": 545 + }, + { + "epoch": 0.09, + "grad_norm": 10.363273620605469, + "learning_rate": 2.8594474000343232e-05, + "loss": 1.3923, + "step": 546 + }, + { + "epoch": 0.09, + "grad_norm": 9.25059700012207, + "learning_rate": 2.8591899776900636e-05, + "loss": 1.7713, + "step": 547 + }, + { + "epoch": 0.09, + "grad_norm": 10.885030746459961, + "learning_rate": 2.8589325553458042e-05, + "loss": 1.7659, + "step": 548 + }, + { + "epoch": 0.09, + "grad_norm": 9.823013305664062, + "learning_rate": 2.8586751330015446e-05, + "loss": 1.6639, + "step": 549 + }, + { + "epoch": 0.09, + "grad_norm": 9.12160873413086, + "learning_rate": 2.8584177106572852e-05, + "loss": 1.5859, + "step": 550 + }, + { + "epoch": 0.09, + "grad_norm": 9.695314407348633, + "learning_rate": 2.8581602883130256e-05, + "loss": 1.666, + "step": 551 + }, + { + "epoch": 0.09, + "grad_norm": 7.679184913635254, + "learning_rate": 2.857902865968766e-05, + "loss": 1.3786, + "step": 552 + }, + { + "epoch": 0.09, + "grad_norm": 8.873235702514648, + "learning_rate": 2.8576454436245065e-05, + "loss": 1.3557, + "step": 553 + }, + { + "epoch": 0.1, + "grad_norm": 9.05200481414795, + "learning_rate": 2.8573880212802472e-05, + "loss": 1.8113, + "step": 554 + }, + { + "epoch": 0.1, + "grad_norm": 8.330602645874023, + "learning_rate": 2.857130598935988e-05, + "loss": 1.405, + "step": 555 + }, + { + "epoch": 0.1, + "grad_norm": 8.589398384094238, + "learning_rate": 2.8568731765917282e-05, + "loss": 1.4438, + "step": 556 + }, + { + "epoch": 0.1, + "grad_norm": 8.285298347473145, + "learning_rate": 2.856615754247469e-05, + "loss": 1.4108, + "step": 557 + }, + { + "epoch": 0.1, + "grad_norm": 7.805575847625732, + "learning_rate": 2.8563583319032092e-05, + "loss": 1.339, + "step": 558 + }, + { + "epoch": 0.1, + "grad_norm": 7.822475433349609, + "learning_rate": 2.85610090955895e-05, + "loss": 1.2177, + "step": 559 + }, + { + "epoch": 0.1, + "grad_norm": 8.784849166870117, + "learning_rate": 2.8558434872146902e-05, + "loss": 1.4748, + "step": 560 + }, + { + "epoch": 0.1, + "grad_norm": 8.376548767089844, + "learning_rate": 2.855586064870431e-05, + "loss": 1.126, + "step": 561 + }, + { + "epoch": 0.1, + "grad_norm": 7.582365989685059, + "learning_rate": 2.8553286425261712e-05, + "loss": 1.3181, + "step": 562 + }, + { + "epoch": 0.1, + "grad_norm": 9.676347732543945, + "learning_rate": 2.855071220181912e-05, + "loss": 1.5478, + "step": 563 + }, + { + "epoch": 0.1, + "grad_norm": 9.534523010253906, + "learning_rate": 2.8548137978376525e-05, + "loss": 1.3806, + "step": 564 + }, + { + "epoch": 0.1, + "grad_norm": 10.204734802246094, + "learning_rate": 2.854556375493393e-05, + "loss": 1.6416, + "step": 565 + }, + { + "epoch": 0.1, + "grad_norm": 10.165093421936035, + "learning_rate": 2.8542989531491335e-05, + "loss": 1.8188, + "step": 566 + }, + { + "epoch": 0.1, + "grad_norm": 8.71523380279541, + "learning_rate": 2.854041530804874e-05, + "loss": 1.5907, + "step": 567 + }, + { + "epoch": 0.1, + "grad_norm": 8.430357933044434, + "learning_rate": 2.8537841084606145e-05, + "loss": 1.1473, + "step": 568 + }, + { + "epoch": 0.1, + "grad_norm": 8.172737121582031, + "learning_rate": 2.853526686116355e-05, + "loss": 1.3425, + "step": 569 + }, + { + "epoch": 0.1, + "grad_norm": 8.882026672363281, + "learning_rate": 2.8532692637720955e-05, + "loss": 1.394, + "step": 570 + }, + { + "epoch": 0.1, + "grad_norm": 9.96976375579834, + "learning_rate": 2.853011841427836e-05, + "loss": 1.7068, + "step": 571 + }, + { + "epoch": 0.1, + "grad_norm": 9.174710273742676, + "learning_rate": 2.8527544190835765e-05, + "loss": 1.5241, + "step": 572 + }, + { + "epoch": 0.1, + "grad_norm": 9.107884407043457, + "learning_rate": 2.8524969967393172e-05, + "loss": 1.2171, + "step": 573 + }, + { + "epoch": 0.1, + "grad_norm": 9.180644035339355, + "learning_rate": 2.8522395743950575e-05, + "loss": 1.522, + "step": 574 + }, + { + "epoch": 0.1, + "grad_norm": 8.132983207702637, + "learning_rate": 2.8519821520507982e-05, + "loss": 1.273, + "step": 575 + }, + { + "epoch": 0.1, + "grad_norm": 7.64364767074585, + "learning_rate": 2.8517247297065385e-05, + "loss": 1.2571, + "step": 576 + }, + { + "epoch": 0.1, + "grad_norm": 9.831807136535645, + "learning_rate": 2.8514673073622792e-05, + "loss": 1.7455, + "step": 577 + }, + { + "epoch": 0.1, + "grad_norm": 8.58155345916748, + "learning_rate": 2.8512098850180195e-05, + "loss": 1.4143, + "step": 578 + }, + { + "epoch": 0.1, + "grad_norm": 9.167181015014648, + "learning_rate": 2.8509524626737602e-05, + "loss": 1.4688, + "step": 579 + }, + { + "epoch": 0.1, + "grad_norm": 9.433991432189941, + "learning_rate": 2.8506950403295005e-05, + "loss": 1.4494, + "step": 580 + }, + { + "epoch": 0.1, + "grad_norm": 9.626629829406738, + "learning_rate": 2.8504376179852412e-05, + "loss": 1.4249, + "step": 581 + }, + { + "epoch": 0.1, + "grad_norm": 9.549328804016113, + "learning_rate": 2.850180195640982e-05, + "loss": 1.7137, + "step": 582 + }, + { + "epoch": 0.1, + "grad_norm": 8.940505981445312, + "learning_rate": 2.8499227732967222e-05, + "loss": 1.3205, + "step": 583 + }, + { + "epoch": 0.1, + "grad_norm": 11.76657485961914, + "learning_rate": 2.849665350952463e-05, + "loss": 1.4745, + "step": 584 + }, + { + "epoch": 0.1, + "grad_norm": 10.353049278259277, + "learning_rate": 2.8494079286082032e-05, + "loss": 1.6701, + "step": 585 + }, + { + "epoch": 0.1, + "grad_norm": 9.746253967285156, + "learning_rate": 2.849150506263944e-05, + "loss": 1.3818, + "step": 586 + }, + { + "epoch": 0.1, + "grad_norm": 10.59052562713623, + "learning_rate": 2.8488930839196842e-05, + "loss": 1.7664, + "step": 587 + }, + { + "epoch": 0.1, + "grad_norm": 8.615872383117676, + "learning_rate": 2.848635661575425e-05, + "loss": 1.5378, + "step": 588 + }, + { + "epoch": 0.1, + "grad_norm": 8.799748420715332, + "learning_rate": 2.8483782392311652e-05, + "loss": 1.5329, + "step": 589 + }, + { + "epoch": 0.1, + "grad_norm": 9.83292293548584, + "learning_rate": 2.848120816886906e-05, + "loss": 1.3462, + "step": 590 + }, + { + "epoch": 0.1, + "grad_norm": 8.456145286560059, + "learning_rate": 2.8478633945426462e-05, + "loss": 1.4992, + "step": 591 + }, + { + "epoch": 0.1, + "grad_norm": 8.450884819030762, + "learning_rate": 2.8476059721983872e-05, + "loss": 1.2677, + "step": 592 + }, + { + "epoch": 0.1, + "grad_norm": 7.532412528991699, + "learning_rate": 2.8473485498541275e-05, + "loss": 1.5072, + "step": 593 + }, + { + "epoch": 0.1, + "grad_norm": 7.4922709465026855, + "learning_rate": 2.847091127509868e-05, + "loss": 1.2154, + "step": 594 + }, + { + "epoch": 0.1, + "grad_norm": 8.205540657043457, + "learning_rate": 2.8468337051656085e-05, + "loss": 1.3805, + "step": 595 + }, + { + "epoch": 0.1, + "grad_norm": 9.42839241027832, + "learning_rate": 2.846576282821349e-05, + "loss": 1.3325, + "step": 596 + }, + { + "epoch": 0.1, + "grad_norm": 8.487515449523926, + "learning_rate": 2.8463188604770895e-05, + "loss": 1.2221, + "step": 597 + }, + { + "epoch": 0.1, + "grad_norm": 9.484041213989258, + "learning_rate": 2.84606143813283e-05, + "loss": 1.8827, + "step": 598 + }, + { + "epoch": 0.1, + "grad_norm": 8.353645324707031, + "learning_rate": 2.8458040157885705e-05, + "loss": 1.607, + "step": 599 + }, + { + "epoch": 0.1, + "grad_norm": 8.193730354309082, + "learning_rate": 2.845546593444311e-05, + "loss": 1.2073, + "step": 600 + }, + { + "epoch": 0.1, + "grad_norm": 9.85962963104248, + "learning_rate": 2.845289171100052e-05, + "loss": 1.4088, + "step": 601 + }, + { + "epoch": 0.1, + "grad_norm": 8.017230987548828, + "learning_rate": 2.8450317487557922e-05, + "loss": 1.4605, + "step": 602 + }, + { + "epoch": 0.1, + "grad_norm": 7.847349643707275, + "learning_rate": 2.844774326411533e-05, + "loss": 1.1767, + "step": 603 + }, + { + "epoch": 0.1, + "grad_norm": 9.358426094055176, + "learning_rate": 2.8445169040672732e-05, + "loss": 1.5359, + "step": 604 + }, + { + "epoch": 0.1, + "grad_norm": 9.367965698242188, + "learning_rate": 2.8442594817230135e-05, + "loss": 1.5171, + "step": 605 + }, + { + "epoch": 0.1, + "grad_norm": 8.827372550964355, + "learning_rate": 2.844002059378754e-05, + "loss": 1.3408, + "step": 606 + }, + { + "epoch": 0.1, + "grad_norm": 7.997843265533447, + "learning_rate": 2.8437446370344945e-05, + "loss": 1.3004, + "step": 607 + }, + { + "epoch": 0.1, + "grad_norm": 8.360730171203613, + "learning_rate": 2.843487214690235e-05, + "loss": 1.48, + "step": 608 + }, + { + "epoch": 0.1, + "grad_norm": 7.979563236236572, + "learning_rate": 2.8432297923459755e-05, + "loss": 1.3397, + "step": 609 + }, + { + "epoch": 0.1, + "grad_norm": 10.846831321716309, + "learning_rate": 2.842972370001716e-05, + "loss": 1.2568, + "step": 610 + }, + { + "epoch": 0.1, + "grad_norm": 9.71766185760498, + "learning_rate": 2.8427149476574568e-05, + "loss": 1.5845, + "step": 611 + }, + { + "epoch": 0.11, + "grad_norm": 8.213266372680664, + "learning_rate": 2.8424575253131975e-05, + "loss": 1.2543, + "step": 612 + }, + { + "epoch": 0.11, + "grad_norm": 9.464890480041504, + "learning_rate": 2.8422001029689378e-05, + "loss": 1.2864, + "step": 613 + }, + { + "epoch": 0.11, + "grad_norm": 8.840841293334961, + "learning_rate": 2.841942680624678e-05, + "loss": 1.521, + "step": 614 + }, + { + "epoch": 0.11, + "grad_norm": 10.37797737121582, + "learning_rate": 2.8416852582804188e-05, + "loss": 1.5883, + "step": 615 + }, + { + "epoch": 0.11, + "grad_norm": 7.5304765701293945, + "learning_rate": 2.841427835936159e-05, + "loss": 1.089, + "step": 616 + }, + { + "epoch": 0.11, + "grad_norm": 8.18742847442627, + "learning_rate": 2.8411704135918998e-05, + "loss": 1.2754, + "step": 617 + }, + { + "epoch": 0.11, + "grad_norm": 9.496501922607422, + "learning_rate": 2.84091299124764e-05, + "loss": 1.7009, + "step": 618 + }, + { + "epoch": 0.11, + "grad_norm": 8.113269805908203, + "learning_rate": 2.8406555689033808e-05, + "loss": 1.5481, + "step": 619 + }, + { + "epoch": 0.11, + "grad_norm": 10.157917022705078, + "learning_rate": 2.8403981465591215e-05, + "loss": 1.4719, + "step": 620 + }, + { + "epoch": 0.11, + "grad_norm": 8.711053848266602, + "learning_rate": 2.840140724214862e-05, + "loss": 1.5226, + "step": 621 + }, + { + "epoch": 0.11, + "grad_norm": 7.997692108154297, + "learning_rate": 2.8398833018706025e-05, + "loss": 1.3378, + "step": 622 + }, + { + "epoch": 0.11, + "grad_norm": 7.8751749992370605, + "learning_rate": 2.839625879526343e-05, + "loss": 1.4806, + "step": 623 + }, + { + "epoch": 0.11, + "grad_norm": 9.335694313049316, + "learning_rate": 2.8393684571820835e-05, + "loss": 1.5422, + "step": 624 + }, + { + "epoch": 0.11, + "grad_norm": 8.491147994995117, + "learning_rate": 2.8391110348378238e-05, + "loss": 1.6276, + "step": 625 + }, + { + "epoch": 0.11, + "grad_norm": 9.00670337677002, + "learning_rate": 2.8388536124935645e-05, + "loss": 1.2147, + "step": 626 + }, + { + "epoch": 0.11, + "grad_norm": 8.42430305480957, + "learning_rate": 2.8385961901493048e-05, + "loss": 1.3524, + "step": 627 + }, + { + "epoch": 0.11, + "grad_norm": 8.35117244720459, + "learning_rate": 2.8383387678050455e-05, + "loss": 1.2318, + "step": 628 + }, + { + "epoch": 0.11, + "grad_norm": 8.867499351501465, + "learning_rate": 2.8380813454607858e-05, + "loss": 1.5234, + "step": 629 + }, + { + "epoch": 0.11, + "grad_norm": 11.456783294677734, + "learning_rate": 2.8378239231165268e-05, + "loss": 1.4583, + "step": 630 + }, + { + "epoch": 0.11, + "grad_norm": 9.594684600830078, + "learning_rate": 2.837566500772267e-05, + "loss": 1.7668, + "step": 631 + }, + { + "epoch": 0.11, + "grad_norm": 9.295198440551758, + "learning_rate": 2.8373090784280078e-05, + "loss": 1.3842, + "step": 632 + }, + { + "epoch": 0.11, + "grad_norm": 9.227025985717773, + "learning_rate": 2.837051656083748e-05, + "loss": 1.3941, + "step": 633 + }, + { + "epoch": 0.11, + "grad_norm": 9.663003921508789, + "learning_rate": 2.8367942337394888e-05, + "loss": 1.7119, + "step": 634 + }, + { + "epoch": 0.11, + "grad_norm": 9.245357513427734, + "learning_rate": 2.836536811395229e-05, + "loss": 1.3433, + "step": 635 + }, + { + "epoch": 0.11, + "grad_norm": 9.337903022766113, + "learning_rate": 2.8362793890509695e-05, + "loss": 1.3921, + "step": 636 + }, + { + "epoch": 0.11, + "grad_norm": 9.937525749206543, + "learning_rate": 2.83602196670671e-05, + "loss": 1.6586, + "step": 637 + }, + { + "epoch": 0.11, + "grad_norm": 9.402658462524414, + "learning_rate": 2.8357645443624505e-05, + "loss": 1.2618, + "step": 638 + }, + { + "epoch": 0.11, + "grad_norm": 9.879486083984375, + "learning_rate": 2.8355071220181915e-05, + "loss": 1.6364, + "step": 639 + }, + { + "epoch": 0.11, + "grad_norm": 9.75072956085205, + "learning_rate": 2.8352496996739318e-05, + "loss": 1.7883, + "step": 640 + }, + { + "epoch": 0.11, + "grad_norm": 8.510144233703613, + "learning_rate": 2.8349922773296725e-05, + "loss": 1.5349, + "step": 641 + }, + { + "epoch": 0.11, + "grad_norm": 9.616477966308594, + "learning_rate": 2.8347348549854128e-05, + "loss": 1.7636, + "step": 642 + }, + { + "epoch": 0.11, + "grad_norm": 8.672359466552734, + "learning_rate": 2.8344774326411535e-05, + "loss": 1.5705, + "step": 643 + }, + { + "epoch": 0.11, + "grad_norm": 9.690023422241211, + "learning_rate": 2.8342200102968938e-05, + "loss": 1.615, + "step": 644 + }, + { + "epoch": 0.11, + "grad_norm": 9.147051811218262, + "learning_rate": 2.8339625879526345e-05, + "loss": 1.3188, + "step": 645 + }, + { + "epoch": 0.11, + "grad_norm": 8.338275909423828, + "learning_rate": 2.8337051656083748e-05, + "loss": 1.4246, + "step": 646 + }, + { + "epoch": 0.11, + "grad_norm": 7.594513416290283, + "learning_rate": 2.833447743264115e-05, + "loss": 1.5287, + "step": 647 + }, + { + "epoch": 0.11, + "grad_norm": 8.40947151184082, + "learning_rate": 2.833190320919856e-05, + "loss": 1.4517, + "step": 648 + }, + { + "epoch": 0.11, + "grad_norm": 8.413890838623047, + "learning_rate": 2.8329328985755965e-05, + "loss": 1.5831, + "step": 649 + }, + { + "epoch": 0.11, + "grad_norm": 8.007394790649414, + "learning_rate": 2.832675476231337e-05, + "loss": 1.4636, + "step": 650 + }, + { + "epoch": 0.11, + "grad_norm": 7.903453350067139, + "learning_rate": 2.8324180538870775e-05, + "loss": 1.1872, + "step": 651 + }, + { + "epoch": 0.11, + "grad_norm": 7.605274200439453, + "learning_rate": 2.832160631542818e-05, + "loss": 1.1718, + "step": 652 + }, + { + "epoch": 0.11, + "grad_norm": 7.102076053619385, + "learning_rate": 2.8319032091985585e-05, + "loss": 1.1408, + "step": 653 + }, + { + "epoch": 0.11, + "grad_norm": 9.305570602416992, + "learning_rate": 2.831645786854299e-05, + "loss": 1.6147, + "step": 654 + }, + { + "epoch": 0.11, + "grad_norm": 9.515800476074219, + "learning_rate": 2.8313883645100395e-05, + "loss": 1.5238, + "step": 655 + }, + { + "epoch": 0.11, + "grad_norm": 9.747093200683594, + "learning_rate": 2.8311309421657798e-05, + "loss": 1.2978, + "step": 656 + }, + { + "epoch": 0.11, + "grad_norm": 11.113810539245605, + "learning_rate": 2.8308735198215204e-05, + "loss": 1.8871, + "step": 657 + }, + { + "epoch": 0.11, + "grad_norm": 11.422607421875, + "learning_rate": 2.830616097477261e-05, + "loss": 1.7156, + "step": 658 + }, + { + "epoch": 0.11, + "grad_norm": 10.17238998413086, + "learning_rate": 2.8303586751330018e-05, + "loss": 1.7009, + "step": 659 + }, + { + "epoch": 0.11, + "grad_norm": 10.30120849609375, + "learning_rate": 2.830101252788742e-05, + "loss": 1.8961, + "step": 660 + }, + { + "epoch": 0.11, + "grad_norm": 7.465618133544922, + "learning_rate": 2.8298438304444828e-05, + "loss": 1.2713, + "step": 661 + }, + { + "epoch": 0.11, + "grad_norm": 8.053711891174316, + "learning_rate": 2.829586408100223e-05, + "loss": 1.3993, + "step": 662 + }, + { + "epoch": 0.11, + "grad_norm": 8.97900104522705, + "learning_rate": 2.8293289857559638e-05, + "loss": 1.736, + "step": 663 + }, + { + "epoch": 0.11, + "grad_norm": 8.041558265686035, + "learning_rate": 2.829071563411704e-05, + "loss": 1.4241, + "step": 664 + }, + { + "epoch": 0.11, + "grad_norm": 8.299877166748047, + "learning_rate": 2.8288141410674448e-05, + "loss": 1.4024, + "step": 665 + }, + { + "epoch": 0.11, + "grad_norm": 7.503842353820801, + "learning_rate": 2.828556718723185e-05, + "loss": 1.2853, + "step": 666 + }, + { + "epoch": 0.11, + "grad_norm": 8.113901138305664, + "learning_rate": 2.8282992963789258e-05, + "loss": 1.6806, + "step": 667 + }, + { + "epoch": 0.11, + "grad_norm": 7.860781669616699, + "learning_rate": 2.8280418740346664e-05, + "loss": 1.3881, + "step": 668 + }, + { + "epoch": 0.11, + "grad_norm": 6.9158501625061035, + "learning_rate": 2.8277844516904068e-05, + "loss": 1.2087, + "step": 669 + }, + { + "epoch": 0.11, + "grad_norm": 8.270143508911133, + "learning_rate": 2.8275270293461474e-05, + "loss": 1.2132, + "step": 670 + }, + { + "epoch": 0.12, + "grad_norm": 8.712434768676758, + "learning_rate": 2.8272696070018878e-05, + "loss": 1.2317, + "step": 671 + }, + { + "epoch": 0.12, + "grad_norm": 9.592971801757812, + "learning_rate": 2.8270121846576284e-05, + "loss": 1.8483, + "step": 672 + }, + { + "epoch": 0.12, + "grad_norm": 8.637419700622559, + "learning_rate": 2.8267547623133688e-05, + "loss": 1.2322, + "step": 673 + }, + { + "epoch": 0.12, + "grad_norm": 9.837871551513672, + "learning_rate": 2.8264973399691094e-05, + "loss": 1.562, + "step": 674 + }, + { + "epoch": 0.12, + "grad_norm": 9.74756908416748, + "learning_rate": 2.8262399176248498e-05, + "loss": 1.522, + "step": 675 + }, + { + "epoch": 0.12, + "grad_norm": 8.316798210144043, + "learning_rate": 2.8259824952805904e-05, + "loss": 1.3224, + "step": 676 + }, + { + "epoch": 0.12, + "grad_norm": 7.852266788482666, + "learning_rate": 2.825725072936331e-05, + "loss": 1.4292, + "step": 677 + }, + { + "epoch": 0.12, + "grad_norm": 8.70324993133545, + "learning_rate": 2.8254676505920714e-05, + "loss": 1.3933, + "step": 678 + }, + { + "epoch": 0.12, + "grad_norm": 9.499561309814453, + "learning_rate": 2.825210228247812e-05, + "loss": 1.5622, + "step": 679 + }, + { + "epoch": 0.12, + "grad_norm": 9.421582221984863, + "learning_rate": 2.8249528059035524e-05, + "loss": 1.3405, + "step": 680 + }, + { + "epoch": 0.12, + "grad_norm": 8.83010196685791, + "learning_rate": 2.824695383559293e-05, + "loss": 1.3413, + "step": 681 + }, + { + "epoch": 0.12, + "grad_norm": 8.317943572998047, + "learning_rate": 2.8244379612150334e-05, + "loss": 1.5046, + "step": 682 + }, + { + "epoch": 0.12, + "grad_norm": 8.324114799499512, + "learning_rate": 2.824180538870774e-05, + "loss": 1.2111, + "step": 683 + }, + { + "epoch": 0.12, + "grad_norm": 8.69671630859375, + "learning_rate": 2.8239231165265144e-05, + "loss": 1.6075, + "step": 684 + }, + { + "epoch": 0.12, + "grad_norm": 7.922121047973633, + "learning_rate": 2.823665694182255e-05, + "loss": 1.401, + "step": 685 + }, + { + "epoch": 0.12, + "grad_norm": 7.156432151794434, + "learning_rate": 2.8234082718379958e-05, + "loss": 1.0052, + "step": 686 + }, + { + "epoch": 0.12, + "grad_norm": 8.44300365447998, + "learning_rate": 2.823150849493736e-05, + "loss": 1.5392, + "step": 687 + }, + { + "epoch": 0.12, + "grad_norm": 10.195891380310059, + "learning_rate": 2.8228934271494768e-05, + "loss": 1.3723, + "step": 688 + }, + { + "epoch": 0.12, + "grad_norm": 9.78988265991211, + "learning_rate": 2.822636004805217e-05, + "loss": 1.5812, + "step": 689 + }, + { + "epoch": 0.12, + "grad_norm": 9.798638343811035, + "learning_rate": 2.8223785824609578e-05, + "loss": 1.7575, + "step": 690 + }, + { + "epoch": 0.12, + "grad_norm": 9.687625885009766, + "learning_rate": 2.822121160116698e-05, + "loss": 1.1519, + "step": 691 + }, + { + "epoch": 0.12, + "grad_norm": 12.919822692871094, + "learning_rate": 2.8218637377724388e-05, + "loss": 1.331, + "step": 692 + }, + { + "epoch": 0.12, + "grad_norm": 11.755447387695312, + "learning_rate": 2.821606315428179e-05, + "loss": 1.4046, + "step": 693 + }, + { + "epoch": 0.12, + "grad_norm": 9.944352149963379, + "learning_rate": 2.8213488930839197e-05, + "loss": 1.5211, + "step": 694 + }, + { + "epoch": 0.12, + "grad_norm": 9.388495445251465, + "learning_rate": 2.82109147073966e-05, + "loss": 1.4173, + "step": 695 + }, + { + "epoch": 0.12, + "grad_norm": 8.420978546142578, + "learning_rate": 2.820834048395401e-05, + "loss": 1.1827, + "step": 696 + }, + { + "epoch": 0.12, + "grad_norm": 9.561078071594238, + "learning_rate": 2.8205766260511414e-05, + "loss": 1.5684, + "step": 697 + }, + { + "epoch": 0.12, + "grad_norm": 9.48249340057373, + "learning_rate": 2.8203192037068817e-05, + "loss": 1.533, + "step": 698 + }, + { + "epoch": 0.12, + "grad_norm": 8.367238998413086, + "learning_rate": 2.8200617813626224e-05, + "loss": 1.4733, + "step": 699 + }, + { + "epoch": 0.12, + "grad_norm": 9.245125770568848, + "learning_rate": 2.8198043590183627e-05, + "loss": 1.4795, + "step": 700 + }, + { + "epoch": 0.12, + "grad_norm": 9.23082160949707, + "learning_rate": 2.8195469366741034e-05, + "loss": 1.5353, + "step": 701 + }, + { + "epoch": 0.12, + "grad_norm": 7.436493396759033, + "learning_rate": 2.8192895143298437e-05, + "loss": 1.2041, + "step": 702 + }, + { + "epoch": 0.12, + "grad_norm": 7.462575435638428, + "learning_rate": 2.8190320919855844e-05, + "loss": 1.2031, + "step": 703 + }, + { + "epoch": 0.12, + "grad_norm": 7.718941688537598, + "learning_rate": 2.8187746696413247e-05, + "loss": 1.2735, + "step": 704 + }, + { + "epoch": 0.12, + "grad_norm": 8.902457237243652, + "learning_rate": 2.8185172472970657e-05, + "loss": 1.3001, + "step": 705 + }, + { + "epoch": 0.12, + "grad_norm": 8.40603256225586, + "learning_rate": 2.818259824952806e-05, + "loss": 1.3974, + "step": 706 + }, + { + "epoch": 0.12, + "grad_norm": 8.115034103393555, + "learning_rate": 2.8180024026085467e-05, + "loss": 1.3445, + "step": 707 + }, + { + "epoch": 0.12, + "grad_norm": 9.21794605255127, + "learning_rate": 2.817744980264287e-05, + "loss": 1.5452, + "step": 708 + }, + { + "epoch": 0.12, + "grad_norm": 8.31137752532959, + "learning_rate": 2.8174875579200274e-05, + "loss": 1.4253, + "step": 709 + }, + { + "epoch": 0.12, + "grad_norm": 9.634665489196777, + "learning_rate": 2.817230135575768e-05, + "loss": 1.789, + "step": 710 + }, + { + "epoch": 0.12, + "grad_norm": 9.329280853271484, + "learning_rate": 2.8169727132315084e-05, + "loss": 1.4738, + "step": 711 + }, + { + "epoch": 0.12, + "grad_norm": 8.350824356079102, + "learning_rate": 2.816715290887249e-05, + "loss": 1.4322, + "step": 712 + }, + { + "epoch": 0.12, + "grad_norm": 7.817314624786377, + "learning_rate": 2.8164578685429894e-05, + "loss": 1.3692, + "step": 713 + }, + { + "epoch": 0.12, + "grad_norm": 9.686982154846191, + "learning_rate": 2.81620044619873e-05, + "loss": 1.4641, + "step": 714 + }, + { + "epoch": 0.12, + "grad_norm": 9.947505950927734, + "learning_rate": 2.8159430238544707e-05, + "loss": 1.5755, + "step": 715 + }, + { + "epoch": 0.12, + "grad_norm": 9.84663200378418, + "learning_rate": 2.8156856015102114e-05, + "loss": 1.5036, + "step": 716 + }, + { + "epoch": 0.12, + "grad_norm": 9.863555908203125, + "learning_rate": 2.8154281791659517e-05, + "loss": 1.6515, + "step": 717 + }, + { + "epoch": 0.12, + "grad_norm": 9.05031967163086, + "learning_rate": 2.815170756821692e-05, + "loss": 1.6212, + "step": 718 + }, + { + "epoch": 0.12, + "grad_norm": 8.781850814819336, + "learning_rate": 2.8149133344774327e-05, + "loss": 1.5527, + "step": 719 + }, + { + "epoch": 0.12, + "grad_norm": 11.855267524719238, + "learning_rate": 2.814655912133173e-05, + "loss": 1.5061, + "step": 720 + }, + { + "epoch": 0.12, + "grad_norm": 8.520151138305664, + "learning_rate": 2.8143984897889137e-05, + "loss": 1.2851, + "step": 721 + }, + { + "epoch": 0.12, + "grad_norm": 9.402628898620605, + "learning_rate": 2.814141067444654e-05, + "loss": 1.458, + "step": 722 + }, + { + "epoch": 0.12, + "grad_norm": 7.520021438598633, + "learning_rate": 2.8138836451003947e-05, + "loss": 1.3035, + "step": 723 + }, + { + "epoch": 0.12, + "grad_norm": 8.891894340515137, + "learning_rate": 2.8136262227561354e-05, + "loss": 1.5014, + "step": 724 + }, + { + "epoch": 0.12, + "grad_norm": 8.058806419372559, + "learning_rate": 2.813368800411876e-05, + "loss": 1.4258, + "step": 725 + }, + { + "epoch": 0.12, + "grad_norm": 8.435503959655762, + "learning_rate": 2.8131113780676164e-05, + "loss": 1.0866, + "step": 726 + }, + { + "epoch": 0.12, + "grad_norm": 9.342641830444336, + "learning_rate": 2.812853955723357e-05, + "loss": 1.8102, + "step": 727 + }, + { + "epoch": 0.12, + "grad_norm": 9.731269836425781, + "learning_rate": 2.8125965333790974e-05, + "loss": 1.5781, + "step": 728 + }, + { + "epoch": 0.13, + "grad_norm": 9.534306526184082, + "learning_rate": 2.8123391110348377e-05, + "loss": 1.5534, + "step": 729 + }, + { + "epoch": 0.13, + "grad_norm": 9.31511402130127, + "learning_rate": 2.8120816886905784e-05, + "loss": 0.9475, + "step": 730 + }, + { + "epoch": 0.13, + "grad_norm": 8.601847648620605, + "learning_rate": 2.8118242663463187e-05, + "loss": 1.2074, + "step": 731 + }, + { + "epoch": 0.13, + "grad_norm": 9.2041015625, + "learning_rate": 2.8115668440020594e-05, + "loss": 1.5293, + "step": 732 + }, + { + "epoch": 0.13, + "grad_norm": 9.243498802185059, + "learning_rate": 2.8113094216577997e-05, + "loss": 1.8844, + "step": 733 + }, + { + "epoch": 0.13, + "grad_norm": 10.502090454101562, + "learning_rate": 2.8110519993135407e-05, + "loss": 1.4478, + "step": 734 + }, + { + "epoch": 0.13, + "grad_norm": 10.382709503173828, + "learning_rate": 2.810794576969281e-05, + "loss": 1.3572, + "step": 735 + }, + { + "epoch": 0.13, + "grad_norm": 7.930559158325195, + "learning_rate": 2.8105371546250217e-05, + "loss": 1.0489, + "step": 736 + }, + { + "epoch": 0.13, + "grad_norm": 9.150694847106934, + "learning_rate": 2.810279732280762e-05, + "loss": 1.4539, + "step": 737 + }, + { + "epoch": 0.13, + "grad_norm": 9.979594230651855, + "learning_rate": 2.8100223099365027e-05, + "loss": 1.6143, + "step": 738 + }, + { + "epoch": 0.13, + "grad_norm": 8.314689636230469, + "learning_rate": 2.809764887592243e-05, + "loss": 1.5458, + "step": 739 + }, + { + "epoch": 0.13, + "grad_norm": 8.491994857788086, + "learning_rate": 2.8095074652479834e-05, + "loss": 1.3965, + "step": 740 + }, + { + "epoch": 0.13, + "grad_norm": 8.279489517211914, + "learning_rate": 2.809250042903724e-05, + "loss": 1.1503, + "step": 741 + }, + { + "epoch": 0.13, + "grad_norm": 8.607050895690918, + "learning_rate": 2.8089926205594644e-05, + "loss": 1.2468, + "step": 742 + }, + { + "epoch": 0.13, + "grad_norm": 8.813472747802734, + "learning_rate": 2.8087351982152054e-05, + "loss": 1.2145, + "step": 743 + }, + { + "epoch": 0.13, + "grad_norm": 8.898530960083008, + "learning_rate": 2.8084777758709457e-05, + "loss": 1.3393, + "step": 744 + }, + { + "epoch": 0.13, + "grad_norm": 9.19594955444336, + "learning_rate": 2.8082203535266864e-05, + "loss": 1.5255, + "step": 745 + }, + { + "epoch": 0.13, + "grad_norm": 9.30061149597168, + "learning_rate": 2.8079629311824267e-05, + "loss": 1.2362, + "step": 746 + }, + { + "epoch": 0.13, + "grad_norm": 8.08763313293457, + "learning_rate": 2.8077055088381674e-05, + "loss": 1.2331, + "step": 747 + }, + { + "epoch": 0.13, + "grad_norm": 8.105565071105957, + "learning_rate": 2.8074480864939077e-05, + "loss": 1.1464, + "step": 748 + }, + { + "epoch": 0.13, + "grad_norm": 10.09921646118164, + "learning_rate": 2.8071906641496484e-05, + "loss": 1.5649, + "step": 749 + }, + { + "epoch": 0.13, + "grad_norm": 10.6478853225708, + "learning_rate": 2.8069332418053887e-05, + "loss": 1.4615, + "step": 750 + }, + { + "epoch": 0.13, + "grad_norm": 10.303277969360352, + "learning_rate": 2.806675819461129e-05, + "loss": 1.3727, + "step": 751 + }, + { + "epoch": 0.13, + "grad_norm": 9.733452796936035, + "learning_rate": 2.8064183971168697e-05, + "loss": 1.4761, + "step": 752 + }, + { + "epoch": 0.13, + "grad_norm": 10.374414443969727, + "learning_rate": 2.8061609747726104e-05, + "loss": 1.5373, + "step": 753 + }, + { + "epoch": 0.13, + "grad_norm": 9.646998405456543, + "learning_rate": 2.805903552428351e-05, + "loss": 1.4541, + "step": 754 + }, + { + "epoch": 0.13, + "grad_norm": 8.969915390014648, + "learning_rate": 2.8056461300840914e-05, + "loss": 1.4849, + "step": 755 + }, + { + "epoch": 0.13, + "grad_norm": 8.019808769226074, + "learning_rate": 2.805388707739832e-05, + "loss": 1.1678, + "step": 756 + }, + { + "epoch": 0.13, + "grad_norm": 9.369609832763672, + "learning_rate": 2.8051312853955724e-05, + "loss": 1.2665, + "step": 757 + }, + { + "epoch": 0.13, + "grad_norm": 11.37863826751709, + "learning_rate": 2.804873863051313e-05, + "loss": 1.4877, + "step": 758 + }, + { + "epoch": 0.13, + "grad_norm": 10.627493858337402, + "learning_rate": 2.8046164407070534e-05, + "loss": 1.5047, + "step": 759 + }, + { + "epoch": 0.13, + "grad_norm": 9.57814884185791, + "learning_rate": 2.8043590183627937e-05, + "loss": 1.356, + "step": 760 + }, + { + "epoch": 0.13, + "grad_norm": 7.7247514724731445, + "learning_rate": 2.8041015960185344e-05, + "loss": 1.6163, + "step": 761 + }, + { + "epoch": 0.13, + "grad_norm": 8.711459159851074, + "learning_rate": 2.803844173674275e-05, + "loss": 1.4718, + "step": 762 + }, + { + "epoch": 0.13, + "grad_norm": 8.438310623168945, + "learning_rate": 2.8035867513300157e-05, + "loss": 1.4829, + "step": 763 + }, + { + "epoch": 0.13, + "grad_norm": 8.3248929977417, + "learning_rate": 2.803329328985756e-05, + "loss": 1.5657, + "step": 764 + }, + { + "epoch": 0.13, + "grad_norm": 9.526896476745605, + "learning_rate": 2.8030719066414967e-05, + "loss": 1.3642, + "step": 765 + }, + { + "epoch": 0.13, + "grad_norm": 8.918802261352539, + "learning_rate": 2.802814484297237e-05, + "loss": 1.489, + "step": 766 + }, + { + "epoch": 0.13, + "grad_norm": 8.489181518554688, + "learning_rate": 2.8025570619529777e-05, + "loss": 1.5363, + "step": 767 + }, + { + "epoch": 0.13, + "grad_norm": 8.939236640930176, + "learning_rate": 2.802299639608718e-05, + "loss": 1.5765, + "step": 768 + }, + { + "epoch": 0.13, + "grad_norm": 8.207650184631348, + "learning_rate": 2.8020422172644587e-05, + "loss": 1.4685, + "step": 769 + }, + { + "epoch": 0.13, + "grad_norm": 9.779074668884277, + "learning_rate": 2.801784794920199e-05, + "loss": 1.5115, + "step": 770 + }, + { + "epoch": 0.13, + "grad_norm": 8.131409645080566, + "learning_rate": 2.8015273725759397e-05, + "loss": 1.466, + "step": 771 + }, + { + "epoch": 0.13, + "grad_norm": 8.781936645507812, + "learning_rate": 2.8012699502316803e-05, + "loss": 1.561, + "step": 772 + }, + { + "epoch": 0.13, + "grad_norm": 8.795979499816895, + "learning_rate": 2.8010125278874207e-05, + "loss": 1.5802, + "step": 773 + }, + { + "epoch": 0.13, + "grad_norm": 9.044179916381836, + "learning_rate": 2.8007551055431613e-05, + "loss": 1.5705, + "step": 774 + }, + { + "epoch": 0.13, + "grad_norm": 8.744950294494629, + "learning_rate": 2.8004976831989017e-05, + "loss": 1.3674, + "step": 775 + }, + { + "epoch": 0.13, + "grad_norm": 7.9183454513549805, + "learning_rate": 2.8002402608546423e-05, + "loss": 1.1358, + "step": 776 + }, + { + "epoch": 0.13, + "grad_norm": 10.269308090209961, + "learning_rate": 2.7999828385103827e-05, + "loss": 1.5533, + "step": 777 + }, + { + "epoch": 0.13, + "grad_norm": 9.909619331359863, + "learning_rate": 2.7997254161661233e-05, + "loss": 1.4723, + "step": 778 + }, + { + "epoch": 0.13, + "grad_norm": 9.315204620361328, + "learning_rate": 2.7994679938218637e-05, + "loss": 1.2354, + "step": 779 + }, + { + "epoch": 0.13, + "grad_norm": 9.495931625366211, + "learning_rate": 2.7992105714776043e-05, + "loss": 1.5534, + "step": 780 + }, + { + "epoch": 0.13, + "grad_norm": 9.21084213256836, + "learning_rate": 2.798953149133345e-05, + "loss": 1.4437, + "step": 781 + }, + { + "epoch": 0.13, + "grad_norm": 9.249410629272461, + "learning_rate": 2.7986957267890853e-05, + "loss": 1.6794, + "step": 782 + }, + { + "epoch": 0.13, + "grad_norm": 10.520964622497559, + "learning_rate": 2.798438304444826e-05, + "loss": 1.5549, + "step": 783 + }, + { + "epoch": 0.13, + "grad_norm": 8.072175979614258, + "learning_rate": 2.7981808821005663e-05, + "loss": 1.2691, + "step": 784 + }, + { + "epoch": 0.13, + "grad_norm": 9.172987937927246, + "learning_rate": 2.797923459756307e-05, + "loss": 1.1972, + "step": 785 + }, + { + "epoch": 0.13, + "grad_norm": 8.787796020507812, + "learning_rate": 2.7976660374120473e-05, + "loss": 1.3193, + "step": 786 + }, + { + "epoch": 0.14, + "grad_norm": 7.934823036193848, + "learning_rate": 2.797408615067788e-05, + "loss": 1.0993, + "step": 787 + }, + { + "epoch": 0.14, + "grad_norm": 8.232659339904785, + "learning_rate": 2.7971511927235283e-05, + "loss": 1.3508, + "step": 788 + }, + { + "epoch": 0.14, + "grad_norm": 8.57980728149414, + "learning_rate": 2.796893770379269e-05, + "loss": 1.1617, + "step": 789 + }, + { + "epoch": 0.14, + "grad_norm": 9.604145050048828, + "learning_rate": 2.7966363480350097e-05, + "loss": 1.7157, + "step": 790 + }, + { + "epoch": 0.14, + "grad_norm": 8.965630531311035, + "learning_rate": 2.79637892569075e-05, + "loss": 1.3628, + "step": 791 + }, + { + "epoch": 0.14, + "grad_norm": 10.321697235107422, + "learning_rate": 2.7961215033464907e-05, + "loss": 1.4101, + "step": 792 + }, + { + "epoch": 0.14, + "grad_norm": 8.196749687194824, + "learning_rate": 2.795864081002231e-05, + "loss": 1.0958, + "step": 793 + }, + { + "epoch": 0.14, + "grad_norm": 8.979670524597168, + "learning_rate": 2.7956066586579717e-05, + "loss": 1.225, + "step": 794 + }, + { + "epoch": 0.14, + "grad_norm": 8.82540225982666, + "learning_rate": 2.795349236313712e-05, + "loss": 1.5192, + "step": 795 + }, + { + "epoch": 0.14, + "grad_norm": 8.932717323303223, + "learning_rate": 2.7950918139694527e-05, + "loss": 1.0531, + "step": 796 + }, + { + "epoch": 0.14, + "grad_norm": 8.442691802978516, + "learning_rate": 2.794834391625193e-05, + "loss": 1.261, + "step": 797 + }, + { + "epoch": 0.14, + "grad_norm": 11.675254821777344, + "learning_rate": 2.7945769692809337e-05, + "loss": 1.2304, + "step": 798 + }, + { + "epoch": 0.14, + "grad_norm": 9.10240650177002, + "learning_rate": 2.794319546936674e-05, + "loss": 1.4201, + "step": 799 + }, + { + "epoch": 0.14, + "grad_norm": 9.222650527954102, + "learning_rate": 2.794062124592415e-05, + "loss": 1.245, + "step": 800 + }, + { + "epoch": 0.14, + "grad_norm": 8.896738052368164, + "learning_rate": 2.7938047022481553e-05, + "loss": 1.4258, + "step": 801 + }, + { + "epoch": 0.14, + "grad_norm": 8.333304405212402, + "learning_rate": 2.7935472799038956e-05, + "loss": 1.4705, + "step": 802 + }, + { + "epoch": 0.14, + "grad_norm": 8.350672721862793, + "learning_rate": 2.7932898575596363e-05, + "loss": 1.659, + "step": 803 + }, + { + "epoch": 0.14, + "grad_norm": 8.64991283416748, + "learning_rate": 2.7930324352153766e-05, + "loss": 1.1396, + "step": 804 + }, + { + "epoch": 0.14, + "grad_norm": 7.883172988891602, + "learning_rate": 2.7927750128711173e-05, + "loss": 1.3913, + "step": 805 + }, + { + "epoch": 0.14, + "grad_norm": 10.036734580993652, + "learning_rate": 2.7925175905268576e-05, + "loss": 1.8256, + "step": 806 + }, + { + "epoch": 0.14, + "grad_norm": 9.359436988830566, + "learning_rate": 2.7922601681825983e-05, + "loss": 1.5282, + "step": 807 + }, + { + "epoch": 0.14, + "grad_norm": 9.453700065612793, + "learning_rate": 2.7920027458383386e-05, + "loss": 1.657, + "step": 808 + }, + { + "epoch": 0.14, + "grad_norm": 9.343161582946777, + "learning_rate": 2.7917453234940796e-05, + "loss": 1.5153, + "step": 809 + }, + { + "epoch": 0.14, + "grad_norm": 8.08825969696045, + "learning_rate": 2.79148790114982e-05, + "loss": 1.165, + "step": 810 + }, + { + "epoch": 0.14, + "grad_norm": 10.013175010681152, + "learning_rate": 2.7912304788055606e-05, + "loss": 1.6793, + "step": 811 + }, + { + "epoch": 0.14, + "grad_norm": 8.553650856018066, + "learning_rate": 2.790973056461301e-05, + "loss": 1.4495, + "step": 812 + }, + { + "epoch": 0.14, + "grad_norm": 9.575993537902832, + "learning_rate": 2.7907156341170413e-05, + "loss": 1.6111, + "step": 813 + }, + { + "epoch": 0.14, + "grad_norm": 8.336498260498047, + "learning_rate": 2.790458211772782e-05, + "loss": 1.2413, + "step": 814 + }, + { + "epoch": 0.14, + "grad_norm": 8.19493579864502, + "learning_rate": 2.7902007894285223e-05, + "loss": 1.5179, + "step": 815 + }, + { + "epoch": 0.14, + "grad_norm": 9.755159378051758, + "learning_rate": 2.789943367084263e-05, + "loss": 1.2592, + "step": 816 + }, + { + "epoch": 0.14, + "grad_norm": 9.641608238220215, + "learning_rate": 2.7896859447400033e-05, + "loss": 1.1667, + "step": 817 + }, + { + "epoch": 0.14, + "grad_norm": 8.296409606933594, + "learning_rate": 2.789428522395744e-05, + "loss": 1.2261, + "step": 818 + }, + { + "epoch": 0.14, + "grad_norm": 13.423913955688477, + "learning_rate": 2.7891711000514846e-05, + "loss": 2.0305, + "step": 819 + }, + { + "epoch": 0.14, + "grad_norm": 9.940492630004883, + "learning_rate": 2.7889136777072253e-05, + "loss": 1.6239, + "step": 820 + }, + { + "epoch": 0.14, + "grad_norm": 8.727612495422363, + "learning_rate": 2.7886562553629656e-05, + "loss": 1.3816, + "step": 821 + }, + { + "epoch": 0.14, + "grad_norm": 9.049920082092285, + "learning_rate": 2.788398833018706e-05, + "loss": 1.2178, + "step": 822 + }, + { + "epoch": 0.14, + "grad_norm": 8.606457710266113, + "learning_rate": 2.7881414106744466e-05, + "loss": 1.4069, + "step": 823 + }, + { + "epoch": 0.14, + "grad_norm": 9.72132682800293, + "learning_rate": 2.787883988330187e-05, + "loss": 1.4682, + "step": 824 + }, + { + "epoch": 0.14, + "grad_norm": 9.023664474487305, + "learning_rate": 2.7876265659859276e-05, + "loss": 1.4439, + "step": 825 + }, + { + "epoch": 0.14, + "grad_norm": 8.507688522338867, + "learning_rate": 2.787369143641668e-05, + "loss": 1.184, + "step": 826 + }, + { + "epoch": 0.14, + "grad_norm": 9.646373748779297, + "learning_rate": 2.7871117212974086e-05, + "loss": 1.5025, + "step": 827 + }, + { + "epoch": 0.14, + "grad_norm": 8.352978706359863, + "learning_rate": 2.7868542989531493e-05, + "loss": 1.3281, + "step": 828 + }, + { + "epoch": 0.14, + "grad_norm": 9.009771347045898, + "learning_rate": 2.78659687660889e-05, + "loss": 1.5252, + "step": 829 + }, + { + "epoch": 0.14, + "grad_norm": 8.628283500671387, + "learning_rate": 2.7863394542646303e-05, + "loss": 1.3438, + "step": 830 + }, + { + "epoch": 0.14, + "grad_norm": 9.474054336547852, + "learning_rate": 2.786082031920371e-05, + "loss": 1.7123, + "step": 831 + }, + { + "epoch": 0.14, + "grad_norm": 10.158101081848145, + "learning_rate": 2.7858246095761113e-05, + "loss": 1.6228, + "step": 832 + }, + { + "epoch": 0.14, + "grad_norm": 8.310938835144043, + "learning_rate": 2.7855671872318516e-05, + "loss": 1.3533, + "step": 833 + }, + { + "epoch": 0.14, + "grad_norm": 8.569671630859375, + "learning_rate": 2.7853097648875923e-05, + "loss": 1.3526, + "step": 834 + }, + { + "epoch": 0.14, + "grad_norm": 8.98193359375, + "learning_rate": 2.7850523425433326e-05, + "loss": 1.1168, + "step": 835 + }, + { + "epoch": 0.14, + "grad_norm": 8.391763687133789, + "learning_rate": 2.7847949201990733e-05, + "loss": 1.3365, + "step": 836 + }, + { + "epoch": 0.14, + "grad_norm": 9.450783729553223, + "learning_rate": 2.7845374978548136e-05, + "loss": 1.6182, + "step": 837 + }, + { + "epoch": 0.14, + "grad_norm": 9.776142120361328, + "learning_rate": 2.7842800755105546e-05, + "loss": 1.6554, + "step": 838 + }, + { + "epoch": 0.14, + "grad_norm": 9.17233943939209, + "learning_rate": 2.784022653166295e-05, + "loss": 1.3684, + "step": 839 + }, + { + "epoch": 0.14, + "grad_norm": 8.352310180664062, + "learning_rate": 2.7837652308220356e-05, + "loss": 1.2635, + "step": 840 + }, + { + "epoch": 0.14, + "grad_norm": 8.076722145080566, + "learning_rate": 2.783507808477776e-05, + "loss": 1.344, + "step": 841 + }, + { + "epoch": 0.14, + "grad_norm": 9.454001426696777, + "learning_rate": 2.7832503861335166e-05, + "loss": 1.5776, + "step": 842 + }, + { + "epoch": 0.14, + "grad_norm": 9.781198501586914, + "learning_rate": 2.782992963789257e-05, + "loss": 1.4146, + "step": 843 + }, + { + "epoch": 0.14, + "grad_norm": 8.85221004486084, + "learning_rate": 2.7827355414449973e-05, + "loss": 1.1327, + "step": 844 + }, + { + "epoch": 0.15, + "grad_norm": 8.658119201660156, + "learning_rate": 2.782478119100738e-05, + "loss": 1.5778, + "step": 845 + }, + { + "epoch": 0.15, + "grad_norm": 7.849745750427246, + "learning_rate": 2.7822206967564783e-05, + "loss": 1.3338, + "step": 846 + }, + { + "epoch": 0.15, + "grad_norm": 8.33973503112793, + "learning_rate": 2.7819632744122193e-05, + "loss": 1.4931, + "step": 847 + }, + { + "epoch": 0.15, + "grad_norm": 7.998106002807617, + "learning_rate": 2.7817058520679596e-05, + "loss": 1.3042, + "step": 848 + }, + { + "epoch": 0.15, + "grad_norm": 8.173836708068848, + "learning_rate": 2.7814484297237003e-05, + "loss": 1.2464, + "step": 849 + }, + { + "epoch": 0.15, + "grad_norm": 9.28589916229248, + "learning_rate": 2.7811910073794406e-05, + "loss": 1.884, + "step": 850 + }, + { + "epoch": 0.15, + "grad_norm": 8.372815132141113, + "learning_rate": 2.7809335850351813e-05, + "loss": 1.2398, + "step": 851 + }, + { + "epoch": 0.15, + "grad_norm": 10.412333488464355, + "learning_rate": 2.7806761626909216e-05, + "loss": 1.5797, + "step": 852 + }, + { + "epoch": 0.15, + "grad_norm": 8.245325088500977, + "learning_rate": 2.7804187403466623e-05, + "loss": 1.3753, + "step": 853 + }, + { + "epoch": 0.15, + "grad_norm": 8.252384185791016, + "learning_rate": 2.7801613180024026e-05, + "loss": 1.2435, + "step": 854 + }, + { + "epoch": 0.15, + "grad_norm": 10.715044975280762, + "learning_rate": 2.779903895658143e-05, + "loss": 1.9103, + "step": 855 + }, + { + "epoch": 0.15, + "grad_norm": 7.821426868438721, + "learning_rate": 2.7796464733138836e-05, + "loss": 1.2186, + "step": 856 + }, + { + "epoch": 0.15, + "grad_norm": 8.929902076721191, + "learning_rate": 2.7793890509696243e-05, + "loss": 1.4858, + "step": 857 + }, + { + "epoch": 0.15, + "grad_norm": 10.203032493591309, + "learning_rate": 2.779131628625365e-05, + "loss": 1.4556, + "step": 858 + }, + { + "epoch": 0.15, + "grad_norm": 10.393985748291016, + "learning_rate": 2.7788742062811053e-05, + "loss": 1.5507, + "step": 859 + }, + { + "epoch": 0.15, + "grad_norm": 8.583320617675781, + "learning_rate": 2.778616783936846e-05, + "loss": 1.2571, + "step": 860 + }, + { + "epoch": 0.15, + "grad_norm": 7.82917594909668, + "learning_rate": 2.7783593615925863e-05, + "loss": 1.2594, + "step": 861 + }, + { + "epoch": 0.15, + "grad_norm": 10.128918647766113, + "learning_rate": 2.778101939248327e-05, + "loss": 1.4016, + "step": 862 + }, + { + "epoch": 0.15, + "grad_norm": 9.211956977844238, + "learning_rate": 2.7778445169040673e-05, + "loss": 1.4521, + "step": 863 + }, + { + "epoch": 0.15, + "grad_norm": 7.162777900695801, + "learning_rate": 2.7775870945598076e-05, + "loss": 1.1528, + "step": 864 + }, + { + "epoch": 0.15, + "grad_norm": 11.554145812988281, + "learning_rate": 2.7773296722155483e-05, + "loss": 1.02, + "step": 865 + }, + { + "epoch": 0.15, + "grad_norm": 7.833945274353027, + "learning_rate": 2.777072249871289e-05, + "loss": 1.167, + "step": 866 + }, + { + "epoch": 0.15, + "grad_norm": 8.881366729736328, + "learning_rate": 2.7768148275270296e-05, + "loss": 1.4133, + "step": 867 + }, + { + "epoch": 0.15, + "grad_norm": 7.62943172454834, + "learning_rate": 2.77655740518277e-05, + "loss": 1.2017, + "step": 868 + }, + { + "epoch": 0.15, + "grad_norm": 8.090350151062012, + "learning_rate": 2.7762999828385106e-05, + "loss": 1.022, + "step": 869 + }, + { + "epoch": 0.15, + "grad_norm": 9.169464111328125, + "learning_rate": 2.776042560494251e-05, + "loss": 1.5532, + "step": 870 + }, + { + "epoch": 0.15, + "grad_norm": 9.9035005569458, + "learning_rate": 2.7757851381499916e-05, + "loss": 1.5007, + "step": 871 + }, + { + "epoch": 0.15, + "grad_norm": 9.399003028869629, + "learning_rate": 2.775527715805732e-05, + "loss": 1.3543, + "step": 872 + }, + { + "epoch": 0.15, + "grad_norm": 8.287327766418457, + "learning_rate": 2.7752702934614726e-05, + "loss": 1.3413, + "step": 873 + }, + { + "epoch": 0.15, + "grad_norm": 7.973354339599609, + "learning_rate": 2.775012871117213e-05, + "loss": 1.2202, + "step": 874 + }, + { + "epoch": 0.15, + "grad_norm": 11.43429946899414, + "learning_rate": 2.7747554487729536e-05, + "loss": 1.2334, + "step": 875 + }, + { + "epoch": 0.15, + "grad_norm": 9.133187294006348, + "learning_rate": 2.7744980264286942e-05, + "loss": 1.505, + "step": 876 + }, + { + "epoch": 0.15, + "grad_norm": 8.597518920898438, + "learning_rate": 2.7742406040844346e-05, + "loss": 1.3289, + "step": 877 + }, + { + "epoch": 0.15, + "grad_norm": 9.42212963104248, + "learning_rate": 2.7739831817401752e-05, + "loss": 1.3873, + "step": 878 + }, + { + "epoch": 0.15, + "grad_norm": 10.565329551696777, + "learning_rate": 2.7737257593959156e-05, + "loss": 1.2618, + "step": 879 + }, + { + "epoch": 0.15, + "grad_norm": 9.424479484558105, + "learning_rate": 2.7734683370516562e-05, + "loss": 1.3611, + "step": 880 + }, + { + "epoch": 0.15, + "grad_norm": 9.964366912841797, + "learning_rate": 2.7732109147073966e-05, + "loss": 1.4917, + "step": 881 + }, + { + "epoch": 0.15, + "grad_norm": 8.830155372619629, + "learning_rate": 2.7729534923631372e-05, + "loss": 1.6664, + "step": 882 + }, + { + "epoch": 0.15, + "grad_norm": 7.698051929473877, + "learning_rate": 2.7726960700188776e-05, + "loss": 1.2856, + "step": 883 + }, + { + "epoch": 0.15, + "grad_norm": 8.606985092163086, + "learning_rate": 2.7724386476746182e-05, + "loss": 1.0351, + "step": 884 + }, + { + "epoch": 0.15, + "grad_norm": 8.696028709411621, + "learning_rate": 2.772181225330359e-05, + "loss": 1.3133, + "step": 885 + }, + { + "epoch": 0.15, + "grad_norm": 8.831123352050781, + "learning_rate": 2.7719238029860992e-05, + "loss": 1.4761, + "step": 886 + }, + { + "epoch": 0.15, + "grad_norm": 9.008023262023926, + "learning_rate": 2.77166638064184e-05, + "loss": 1.4388, + "step": 887 + }, + { + "epoch": 0.15, + "grad_norm": 9.56207275390625, + "learning_rate": 2.7714089582975802e-05, + "loss": 1.5946, + "step": 888 + }, + { + "epoch": 0.15, + "grad_norm": 8.514052391052246, + "learning_rate": 2.771151535953321e-05, + "loss": 1.4256, + "step": 889 + }, + { + "epoch": 0.15, + "grad_norm": 10.17920970916748, + "learning_rate": 2.7708941136090612e-05, + "loss": 1.1741, + "step": 890 + }, + { + "epoch": 0.15, + "grad_norm": 8.985677719116211, + "learning_rate": 2.770636691264802e-05, + "loss": 1.3246, + "step": 891 + }, + { + "epoch": 0.15, + "grad_norm": 8.475797653198242, + "learning_rate": 2.7703792689205422e-05, + "loss": 1.4087, + "step": 892 + }, + { + "epoch": 0.15, + "grad_norm": 8.537306785583496, + "learning_rate": 2.770121846576283e-05, + "loss": 1.2527, + "step": 893 + }, + { + "epoch": 0.15, + "grad_norm": 9.671489715576172, + "learning_rate": 2.7698644242320236e-05, + "loss": 1.3167, + "step": 894 + }, + { + "epoch": 0.15, + "grad_norm": 9.936307907104492, + "learning_rate": 2.769607001887764e-05, + "loss": 1.576, + "step": 895 + }, + { + "epoch": 0.15, + "grad_norm": 8.451406478881836, + "learning_rate": 2.7693495795435046e-05, + "loss": 1.4639, + "step": 896 + }, + { + "epoch": 0.15, + "grad_norm": 8.476738929748535, + "learning_rate": 2.769092157199245e-05, + "loss": 1.3466, + "step": 897 + }, + { + "epoch": 0.15, + "grad_norm": 8.354731559753418, + "learning_rate": 2.7688347348549856e-05, + "loss": 1.3035, + "step": 898 + }, + { + "epoch": 0.15, + "grad_norm": 9.596505165100098, + "learning_rate": 2.768577312510726e-05, + "loss": 1.6087, + "step": 899 + }, + { + "epoch": 0.15, + "grad_norm": 9.208419799804688, + "learning_rate": 2.7683198901664666e-05, + "loss": 1.2007, + "step": 900 + }, + { + "epoch": 0.15, + "grad_norm": 9.790543556213379, + "learning_rate": 2.768062467822207e-05, + "loss": 1.3654, + "step": 901 + }, + { + "epoch": 0.15, + "grad_norm": 7.729399681091309, + "learning_rate": 2.7678050454779476e-05, + "loss": 1.1288, + "step": 902 + }, + { + "epoch": 0.15, + "grad_norm": 9.655479431152344, + "learning_rate": 2.767547623133688e-05, + "loss": 1.4537, + "step": 903 + }, + { + "epoch": 0.16, + "grad_norm": 8.013383865356445, + "learning_rate": 2.767290200789429e-05, + "loss": 1.302, + "step": 904 + }, + { + "epoch": 0.16, + "grad_norm": 10.9959077835083, + "learning_rate": 2.7670327784451692e-05, + "loss": 1.4639, + "step": 905 + }, + { + "epoch": 0.16, + "grad_norm": 8.561423301696777, + "learning_rate": 2.7667753561009095e-05, + "loss": 1.1788, + "step": 906 + }, + { + "epoch": 0.16, + "grad_norm": 9.05864429473877, + "learning_rate": 2.7665179337566502e-05, + "loss": 1.217, + "step": 907 + }, + { + "epoch": 0.16, + "grad_norm": 9.962160110473633, + "learning_rate": 2.7662605114123905e-05, + "loss": 1.3522, + "step": 908 + }, + { + "epoch": 0.16, + "grad_norm": 9.301092147827148, + "learning_rate": 2.7660030890681312e-05, + "loss": 1.3572, + "step": 909 + }, + { + "epoch": 0.16, + "grad_norm": 10.414505004882812, + "learning_rate": 2.7657456667238715e-05, + "loss": 1.8806, + "step": 910 + }, + { + "epoch": 0.16, + "grad_norm": 8.82504653930664, + "learning_rate": 2.7654882443796122e-05, + "loss": 1.4387, + "step": 911 + }, + { + "epoch": 0.16, + "grad_norm": 7.63746452331543, + "learning_rate": 2.7652308220353525e-05, + "loss": 1.1837, + "step": 912 + }, + { + "epoch": 0.16, + "grad_norm": 7.7293620109558105, + "learning_rate": 2.7649733996910935e-05, + "loss": 1.3675, + "step": 913 + }, + { + "epoch": 0.16, + "grad_norm": 9.78594970703125, + "learning_rate": 2.764715977346834e-05, + "loss": 1.345, + "step": 914 + }, + { + "epoch": 0.16, + "grad_norm": 7.400517463684082, + "learning_rate": 2.7644585550025745e-05, + "loss": 1.3403, + "step": 915 + }, + { + "epoch": 0.16, + "grad_norm": 7.140812873840332, + "learning_rate": 2.764201132658315e-05, + "loss": 1.3672, + "step": 916 + }, + { + "epoch": 0.16, + "grad_norm": 7.761661052703857, + "learning_rate": 2.7639437103140552e-05, + "loss": 1.2969, + "step": 917 + }, + { + "epoch": 0.16, + "grad_norm": 8.45772933959961, + "learning_rate": 2.763686287969796e-05, + "loss": 1.5179, + "step": 918 + }, + { + "epoch": 0.16, + "grad_norm": 7.847336769104004, + "learning_rate": 2.7634288656255362e-05, + "loss": 1.2688, + "step": 919 + }, + { + "epoch": 0.16, + "grad_norm": 7.579736709594727, + "learning_rate": 2.763171443281277e-05, + "loss": 1.5179, + "step": 920 + }, + { + "epoch": 0.16, + "grad_norm": 6.5482964515686035, + "learning_rate": 2.7629140209370172e-05, + "loss": 0.9854, + "step": 921 + }, + { + "epoch": 0.16, + "grad_norm": 10.172941207885742, + "learning_rate": 2.762656598592758e-05, + "loss": 1.5952, + "step": 922 + }, + { + "epoch": 0.16, + "grad_norm": 8.809013366699219, + "learning_rate": 2.7623991762484985e-05, + "loss": 1.326, + "step": 923 + }, + { + "epoch": 0.16, + "grad_norm": 9.597607612609863, + "learning_rate": 2.7621417539042392e-05, + "loss": 1.5772, + "step": 924 + }, + { + "epoch": 0.16, + "grad_norm": 9.929927825927734, + "learning_rate": 2.7618843315599795e-05, + "loss": 1.5638, + "step": 925 + }, + { + "epoch": 0.16, + "grad_norm": 9.811972618103027, + "learning_rate": 2.7616269092157202e-05, + "loss": 1.2806, + "step": 926 + }, + { + "epoch": 0.16, + "grad_norm": 9.242873191833496, + "learning_rate": 2.7613694868714605e-05, + "loss": 1.7581, + "step": 927 + }, + { + "epoch": 0.16, + "grad_norm": 8.977294921875, + "learning_rate": 2.761112064527201e-05, + "loss": 1.4192, + "step": 928 + }, + { + "epoch": 0.16, + "grad_norm": 7.7886962890625, + "learning_rate": 2.7608546421829415e-05, + "loss": 1.6531, + "step": 929 + }, + { + "epoch": 0.16, + "grad_norm": 9.794466972351074, + "learning_rate": 2.760597219838682e-05, + "loss": 1.662, + "step": 930 + }, + { + "epoch": 0.16, + "grad_norm": 8.725576400756836, + "learning_rate": 2.7603397974944225e-05, + "loss": 1.4014, + "step": 931 + }, + { + "epoch": 0.16, + "grad_norm": 8.456843376159668, + "learning_rate": 2.7600823751501632e-05, + "loss": 1.5779, + "step": 932 + }, + { + "epoch": 0.16, + "grad_norm": 8.219736099243164, + "learning_rate": 2.759824952805904e-05, + "loss": 1.3011, + "step": 933 + }, + { + "epoch": 0.16, + "grad_norm": 7.357527732849121, + "learning_rate": 2.7595675304616442e-05, + "loss": 1.3028, + "step": 934 + }, + { + "epoch": 0.16, + "grad_norm": 8.564726829528809, + "learning_rate": 2.759310108117385e-05, + "loss": 1.6339, + "step": 935 + }, + { + "epoch": 0.16, + "grad_norm": 7.923804759979248, + "learning_rate": 2.7590526857731252e-05, + "loss": 1.463, + "step": 936 + }, + { + "epoch": 0.16, + "grad_norm": 8.143631935119629, + "learning_rate": 2.7587952634288655e-05, + "loss": 1.2058, + "step": 937 + }, + { + "epoch": 0.16, + "grad_norm": 8.371817588806152, + "learning_rate": 2.7585378410846062e-05, + "loss": 1.4673, + "step": 938 + }, + { + "epoch": 0.16, + "grad_norm": 7.691073417663574, + "learning_rate": 2.7582804187403465e-05, + "loss": 1.1621, + "step": 939 + }, + { + "epoch": 0.16, + "grad_norm": 9.7957763671875, + "learning_rate": 2.7580229963960872e-05, + "loss": 1.3863, + "step": 940 + }, + { + "epoch": 0.16, + "grad_norm": 10.425039291381836, + "learning_rate": 2.7577655740518275e-05, + "loss": 1.4881, + "step": 941 + }, + { + "epoch": 0.16, + "grad_norm": 8.161550521850586, + "learning_rate": 2.7575081517075685e-05, + "loss": 1.3773, + "step": 942 + }, + { + "epoch": 0.16, + "grad_norm": 8.596711158752441, + "learning_rate": 2.757250729363309e-05, + "loss": 1.1722, + "step": 943 + }, + { + "epoch": 0.16, + "grad_norm": 8.607233047485352, + "learning_rate": 2.7569933070190495e-05, + "loss": 1.3246, + "step": 944 + }, + { + "epoch": 0.16, + "grad_norm": 8.256855010986328, + "learning_rate": 2.75673588467479e-05, + "loss": 1.4224, + "step": 945 + }, + { + "epoch": 0.16, + "grad_norm": 8.440996170043945, + "learning_rate": 2.7564784623305305e-05, + "loss": 1.0807, + "step": 946 + }, + { + "epoch": 0.16, + "grad_norm": 9.548016548156738, + "learning_rate": 2.756221039986271e-05, + "loss": 1.3143, + "step": 947 + }, + { + "epoch": 0.16, + "grad_norm": 8.569645881652832, + "learning_rate": 2.7559636176420112e-05, + "loss": 1.1236, + "step": 948 + }, + { + "epoch": 0.16, + "grad_norm": 11.226103782653809, + "learning_rate": 2.755706195297752e-05, + "loss": 1.0959, + "step": 949 + }, + { + "epoch": 0.16, + "grad_norm": 7.283565521240234, + "learning_rate": 2.755448772953492e-05, + "loss": 1.0641, + "step": 950 + }, + { + "epoch": 0.16, + "grad_norm": 11.367616653442383, + "learning_rate": 2.7551913506092332e-05, + "loss": 1.6593, + "step": 951 + }, + { + "epoch": 0.16, + "grad_norm": 8.412775039672852, + "learning_rate": 2.7549339282649735e-05, + "loss": 1.2759, + "step": 952 + }, + { + "epoch": 0.16, + "grad_norm": 7.963075160980225, + "learning_rate": 2.7546765059207142e-05, + "loss": 1.4865, + "step": 953 + }, + { + "epoch": 0.16, + "grad_norm": 10.61840534210205, + "learning_rate": 2.7544190835764545e-05, + "loss": 1.5278, + "step": 954 + }, + { + "epoch": 0.16, + "grad_norm": 10.823490142822266, + "learning_rate": 2.7541616612321952e-05, + "loss": 1.5876, + "step": 955 + }, + { + "epoch": 0.16, + "grad_norm": 9.646615028381348, + "learning_rate": 2.7539042388879355e-05, + "loss": 1.6518, + "step": 956 + }, + { + "epoch": 0.16, + "grad_norm": 8.296534538269043, + "learning_rate": 2.753646816543676e-05, + "loss": 1.5283, + "step": 957 + }, + { + "epoch": 0.16, + "grad_norm": 8.436152458190918, + "learning_rate": 2.7533893941994165e-05, + "loss": 1.2958, + "step": 958 + }, + { + "epoch": 0.16, + "grad_norm": 7.631742000579834, + "learning_rate": 2.7531319718551568e-05, + "loss": 1.1314, + "step": 959 + }, + { + "epoch": 0.16, + "grad_norm": 9.731194496154785, + "learning_rate": 2.7528745495108975e-05, + "loss": 1.7553, + "step": 960 + }, + { + "epoch": 0.16, + "grad_norm": 7.916111946105957, + "learning_rate": 2.752617127166638e-05, + "loss": 1.3605, + "step": 961 + }, + { + "epoch": 0.17, + "grad_norm": 7.284040451049805, + "learning_rate": 2.752359704822379e-05, + "loss": 1.361, + "step": 962 + }, + { + "epoch": 0.17, + "grad_norm": 9.29102611541748, + "learning_rate": 2.752102282478119e-05, + "loss": 1.5038, + "step": 963 + }, + { + "epoch": 0.17, + "grad_norm": 8.322530746459961, + "learning_rate": 2.7518448601338598e-05, + "loss": 1.3679, + "step": 964 + }, + { + "epoch": 0.17, + "grad_norm": 9.11369800567627, + "learning_rate": 2.7515874377896e-05, + "loss": 1.5415, + "step": 965 + }, + { + "epoch": 0.17, + "grad_norm": 7.6457438468933105, + "learning_rate": 2.7513300154453408e-05, + "loss": 1.2653, + "step": 966 + }, + { + "epoch": 0.17, + "grad_norm": 9.037080764770508, + "learning_rate": 2.751072593101081e-05, + "loss": 0.9422, + "step": 967 + }, + { + "epoch": 0.17, + "grad_norm": 9.72690486907959, + "learning_rate": 2.7508151707568215e-05, + "loss": 1.3183, + "step": 968 + }, + { + "epoch": 0.17, + "grad_norm": 9.159808158874512, + "learning_rate": 2.750557748412562e-05, + "loss": 1.3254, + "step": 969 + }, + { + "epoch": 0.17, + "grad_norm": 8.896674156188965, + "learning_rate": 2.7503003260683028e-05, + "loss": 1.3447, + "step": 970 + }, + { + "epoch": 0.17, + "grad_norm": 9.429150581359863, + "learning_rate": 2.7500429037240435e-05, + "loss": 1.3015, + "step": 971 + }, + { + "epoch": 0.17, + "grad_norm": 8.94772720336914, + "learning_rate": 2.7497854813797838e-05, + "loss": 1.3144, + "step": 972 + }, + { + "epoch": 0.17, + "grad_norm": 9.137228012084961, + "learning_rate": 2.7495280590355245e-05, + "loss": 1.0446, + "step": 973 + }, + { + "epoch": 0.17, + "grad_norm": 9.031691551208496, + "learning_rate": 2.7492706366912648e-05, + "loss": 0.9954, + "step": 974 + }, + { + "epoch": 0.17, + "grad_norm": 8.93708324432373, + "learning_rate": 2.7490132143470055e-05, + "loss": 1.2037, + "step": 975 + }, + { + "epoch": 0.17, + "grad_norm": 10.269675254821777, + "learning_rate": 2.7487557920027458e-05, + "loss": 1.4372, + "step": 976 + }, + { + "epoch": 0.17, + "grad_norm": 9.201359748840332, + "learning_rate": 2.7484983696584865e-05, + "loss": 1.1643, + "step": 977 + }, + { + "epoch": 0.17, + "grad_norm": 14.586709976196289, + "learning_rate": 2.7482409473142268e-05, + "loss": 1.5917, + "step": 978 + }, + { + "epoch": 0.17, + "grad_norm": 11.20718002319336, + "learning_rate": 2.747983524969967e-05, + "loss": 1.4842, + "step": 979 + }, + { + "epoch": 0.17, + "grad_norm": 10.088601112365723, + "learning_rate": 2.747726102625708e-05, + "loss": 1.4403, + "step": 980 + }, + { + "epoch": 0.17, + "grad_norm": 8.692402839660645, + "learning_rate": 2.7474686802814485e-05, + "loss": 1.2648, + "step": 981 + }, + { + "epoch": 0.17, + "grad_norm": 9.76282787322998, + "learning_rate": 2.747211257937189e-05, + "loss": 1.4759, + "step": 982 + }, + { + "epoch": 0.17, + "grad_norm": 9.340919494628906, + "learning_rate": 2.7469538355929295e-05, + "loss": 1.4664, + "step": 983 + }, + { + "epoch": 0.17, + "grad_norm": 10.029942512512207, + "learning_rate": 2.74669641324867e-05, + "loss": 1.4747, + "step": 984 + }, + { + "epoch": 0.17, + "grad_norm": 10.550951957702637, + "learning_rate": 2.7464389909044105e-05, + "loss": 1.8357, + "step": 985 + }, + { + "epoch": 0.17, + "grad_norm": 9.380844116210938, + "learning_rate": 2.746181568560151e-05, + "loss": 1.2763, + "step": 986 + }, + { + "epoch": 0.17, + "grad_norm": 8.784636497497559, + "learning_rate": 2.7459241462158915e-05, + "loss": 1.2095, + "step": 987 + }, + { + "epoch": 0.17, + "grad_norm": 8.628052711486816, + "learning_rate": 2.745666723871632e-05, + "loss": 1.3114, + "step": 988 + }, + { + "epoch": 0.17, + "grad_norm": 8.085103988647461, + "learning_rate": 2.7454093015273728e-05, + "loss": 1.2096, + "step": 989 + }, + { + "epoch": 0.17, + "grad_norm": 9.746047973632812, + "learning_rate": 2.745151879183113e-05, + "loss": 1.7489, + "step": 990 + }, + { + "epoch": 0.17, + "grad_norm": 9.280878067016602, + "learning_rate": 2.7448944568388538e-05, + "loss": 1.3324, + "step": 991 + }, + { + "epoch": 0.17, + "grad_norm": 7.545253753662109, + "learning_rate": 2.744637034494594e-05, + "loss": 1.3374, + "step": 992 + }, + { + "epoch": 0.17, + "grad_norm": 8.984289169311523, + "learning_rate": 2.7443796121503348e-05, + "loss": 1.4667, + "step": 993 + }, + { + "epoch": 0.17, + "grad_norm": 9.125088691711426, + "learning_rate": 2.744122189806075e-05, + "loss": 1.2929, + "step": 994 + }, + { + "epoch": 0.17, + "grad_norm": 6.825432777404785, + "learning_rate": 2.7438647674618158e-05, + "loss": 1.2813, + "step": 995 + }, + { + "epoch": 0.17, + "grad_norm": 9.433895111083984, + "learning_rate": 2.743607345117556e-05, + "loss": 1.3678, + "step": 996 + }, + { + "epoch": 0.17, + "grad_norm": 8.616389274597168, + "learning_rate": 2.7433499227732968e-05, + "loss": 1.4376, + "step": 997 + }, + { + "epoch": 0.17, + "grad_norm": 10.003437042236328, + "learning_rate": 2.7430925004290375e-05, + "loss": 1.6973, + "step": 998 + }, + { + "epoch": 0.17, + "grad_norm": 10.557788848876953, + "learning_rate": 2.7428350780847778e-05, + "loss": 1.4694, + "step": 999 + }, + { + "epoch": 0.17, + "grad_norm": 10.035713195800781, + "learning_rate": 2.7425776557405185e-05, + "loss": 1.4349, + "step": 1000 + }, + { + "epoch": 0.17, + "grad_norm": 9.870453834533691, + "learning_rate": 2.7423202333962588e-05, + "loss": 1.3485, + "step": 1001 + }, + { + "epoch": 0.17, + "grad_norm": 8.851469993591309, + "learning_rate": 2.7420628110519995e-05, + "loss": 1.3463, + "step": 1002 + }, + { + "epoch": 0.17, + "grad_norm": 9.11250114440918, + "learning_rate": 2.7418053887077398e-05, + "loss": 1.8346, + "step": 1003 + }, + { + "epoch": 0.17, + "grad_norm": 8.808446884155273, + "learning_rate": 2.7415479663634805e-05, + "loss": 1.4318, + "step": 1004 + }, + { + "epoch": 0.17, + "grad_norm": 8.631911277770996, + "learning_rate": 2.7412905440192208e-05, + "loss": 1.1731, + "step": 1005 + }, + { + "epoch": 0.17, + "grad_norm": 6.999469757080078, + "learning_rate": 2.7410331216749615e-05, + "loss": 1.2488, + "step": 1006 + }, + { + "epoch": 0.17, + "grad_norm": 8.889474868774414, + "learning_rate": 2.7407756993307018e-05, + "loss": 1.5142, + "step": 1007 + }, + { + "epoch": 0.17, + "grad_norm": 7.910511016845703, + "learning_rate": 2.7405182769864428e-05, + "loss": 1.2691, + "step": 1008 + }, + { + "epoch": 0.17, + "grad_norm": 8.317187309265137, + "learning_rate": 2.740260854642183e-05, + "loss": 1.14, + "step": 1009 + }, + { + "epoch": 0.17, + "grad_norm": 8.15993881225586, + "learning_rate": 2.7400034322979234e-05, + "loss": 1.1762, + "step": 1010 + }, + { + "epoch": 0.17, + "grad_norm": 7.653911590576172, + "learning_rate": 2.739746009953664e-05, + "loss": 1.2432, + "step": 1011 + }, + { + "epoch": 0.17, + "grad_norm": 9.123514175415039, + "learning_rate": 2.7394885876094044e-05, + "loss": 1.0913, + "step": 1012 + }, + { + "epoch": 0.17, + "grad_norm": 9.117807388305664, + "learning_rate": 2.739231165265145e-05, + "loss": 1.3455, + "step": 1013 + }, + { + "epoch": 0.17, + "grad_norm": 9.247596740722656, + "learning_rate": 2.7389737429208854e-05, + "loss": 1.2397, + "step": 1014 + }, + { + "epoch": 0.17, + "grad_norm": 9.6008939743042, + "learning_rate": 2.738716320576626e-05, + "loss": 1.3644, + "step": 1015 + }, + { + "epoch": 0.17, + "grad_norm": 10.20018482208252, + "learning_rate": 2.7384588982323664e-05, + "loss": 1.4907, + "step": 1016 + }, + { + "epoch": 0.17, + "grad_norm": 8.3839693069458, + "learning_rate": 2.7382014758881074e-05, + "loss": 1.3098, + "step": 1017 + }, + { + "epoch": 0.17, + "grad_norm": 8.762544631958008, + "learning_rate": 2.7379440535438478e-05, + "loss": 1.2336, + "step": 1018 + }, + { + "epoch": 0.17, + "grad_norm": 9.597867965698242, + "learning_rate": 2.7376866311995884e-05, + "loss": 1.428, + "step": 1019 + }, + { + "epoch": 0.18, + "grad_norm": 9.393948554992676, + "learning_rate": 2.7374292088553288e-05, + "loss": 1.2987, + "step": 1020 + }, + { + "epoch": 0.18, + "grad_norm": 8.504260063171387, + "learning_rate": 2.737171786511069e-05, + "loss": 1.1499, + "step": 1021 + }, + { + "epoch": 0.18, + "grad_norm": 9.080394744873047, + "learning_rate": 2.7369143641668098e-05, + "loss": 1.3871, + "step": 1022 + }, + { + "epoch": 0.18, + "grad_norm": 8.832898139953613, + "learning_rate": 2.73665694182255e-05, + "loss": 1.1188, + "step": 1023 + }, + { + "epoch": 0.18, + "grad_norm": 9.266690254211426, + "learning_rate": 2.7363995194782908e-05, + "loss": 1.4037, + "step": 1024 + }, + { + "epoch": 0.18, + "grad_norm": 9.53109359741211, + "learning_rate": 2.736142097134031e-05, + "loss": 1.3645, + "step": 1025 + }, + { + "epoch": 0.18, + "grad_norm": 9.360075950622559, + "learning_rate": 2.7358846747897718e-05, + "loss": 1.4885, + "step": 1026 + }, + { + "epoch": 0.18, + "grad_norm": 9.781167030334473, + "learning_rate": 2.7356272524455124e-05, + "loss": 1.673, + "step": 1027 + }, + { + "epoch": 0.18, + "grad_norm": 11.556243896484375, + "learning_rate": 2.735369830101253e-05, + "loss": 1.4541, + "step": 1028 + }, + { + "epoch": 0.18, + "grad_norm": 9.666065216064453, + "learning_rate": 2.7351124077569934e-05, + "loss": 1.5686, + "step": 1029 + }, + { + "epoch": 0.18, + "grad_norm": 8.713845252990723, + "learning_rate": 2.734854985412734e-05, + "loss": 1.2677, + "step": 1030 + }, + { + "epoch": 0.18, + "grad_norm": 9.593574523925781, + "learning_rate": 2.7345975630684744e-05, + "loss": 1.4152, + "step": 1031 + }, + { + "epoch": 0.18, + "grad_norm": 9.018454551696777, + "learning_rate": 2.7343401407242148e-05, + "loss": 1.4421, + "step": 1032 + }, + { + "epoch": 0.18, + "grad_norm": 8.887699127197266, + "learning_rate": 2.7340827183799554e-05, + "loss": 1.085, + "step": 1033 + }, + { + "epoch": 0.18, + "grad_norm": 8.83116626739502, + "learning_rate": 2.7338252960356958e-05, + "loss": 1.101, + "step": 1034 + }, + { + "epoch": 0.18, + "grad_norm": 8.687261581420898, + "learning_rate": 2.7335678736914364e-05, + "loss": 1.158, + "step": 1035 + }, + { + "epoch": 0.18, + "grad_norm": 8.233701705932617, + "learning_rate": 2.733310451347177e-05, + "loss": 1.1735, + "step": 1036 + }, + { + "epoch": 0.18, + "grad_norm": 10.780478477478027, + "learning_rate": 2.7330530290029178e-05, + "loss": 1.4419, + "step": 1037 + }, + { + "epoch": 0.18, + "grad_norm": 9.909387588500977, + "learning_rate": 2.732795606658658e-05, + "loss": 1.2245, + "step": 1038 + }, + { + "epoch": 0.18, + "grad_norm": 8.03641128540039, + "learning_rate": 2.7325381843143988e-05, + "loss": 1.5364, + "step": 1039 + }, + { + "epoch": 0.18, + "grad_norm": 7.964493274688721, + "learning_rate": 2.732280761970139e-05, + "loss": 1.0429, + "step": 1040 + }, + { + "epoch": 0.18, + "grad_norm": 10.043766021728516, + "learning_rate": 2.7320233396258794e-05, + "loss": 1.6543, + "step": 1041 + }, + { + "epoch": 0.18, + "grad_norm": 8.70139217376709, + "learning_rate": 2.73176591728162e-05, + "loss": 0.9705, + "step": 1042 + }, + { + "epoch": 0.18, + "grad_norm": 8.825154304504395, + "learning_rate": 2.7315084949373604e-05, + "loss": 1.4729, + "step": 1043 + }, + { + "epoch": 0.18, + "grad_norm": 7.802508354187012, + "learning_rate": 2.731251072593101e-05, + "loss": 1.2117, + "step": 1044 + }, + { + "epoch": 0.18, + "grad_norm": 9.77186393737793, + "learning_rate": 2.7309936502488414e-05, + "loss": 1.604, + "step": 1045 + }, + { + "epoch": 0.18, + "grad_norm": 10.28923225402832, + "learning_rate": 2.7307362279045824e-05, + "loss": 1.3641, + "step": 1046 + }, + { + "epoch": 0.18, + "grad_norm": 8.963648796081543, + "learning_rate": 2.7304788055603227e-05, + "loss": 1.1925, + "step": 1047 + }, + { + "epoch": 0.18, + "grad_norm": 8.663580894470215, + "learning_rate": 2.7302213832160634e-05, + "loss": 1.2647, + "step": 1048 + }, + { + "epoch": 0.18, + "grad_norm": 8.14822006225586, + "learning_rate": 2.7299639608718037e-05, + "loss": 1.2401, + "step": 1049 + }, + { + "epoch": 0.18, + "grad_norm": 10.369800567626953, + "learning_rate": 2.7297065385275444e-05, + "loss": 1.3942, + "step": 1050 + }, + { + "epoch": 0.18, + "grad_norm": 9.311555862426758, + "learning_rate": 2.7294491161832847e-05, + "loss": 1.2202, + "step": 1051 + }, + { + "epoch": 0.18, + "grad_norm": 8.417490005493164, + "learning_rate": 2.729191693839025e-05, + "loss": 1.0955, + "step": 1052 + }, + { + "epoch": 0.18, + "grad_norm": 10.044454574584961, + "learning_rate": 2.7289342714947657e-05, + "loss": 1.2672, + "step": 1053 + }, + { + "epoch": 0.18, + "grad_norm": 9.748496055603027, + "learning_rate": 2.728676849150506e-05, + "loss": 1.1653, + "step": 1054 + }, + { + "epoch": 0.18, + "grad_norm": 9.002110481262207, + "learning_rate": 2.728419426806247e-05, + "loss": 1.1476, + "step": 1055 + }, + { + "epoch": 0.18, + "grad_norm": 8.822497367858887, + "learning_rate": 2.7281620044619874e-05, + "loss": 1.367, + "step": 1056 + }, + { + "epoch": 0.18, + "grad_norm": 9.774259567260742, + "learning_rate": 2.727904582117728e-05, + "loss": 1.242, + "step": 1057 + }, + { + "epoch": 0.18, + "grad_norm": 9.742218017578125, + "learning_rate": 2.7276471597734684e-05, + "loss": 1.5533, + "step": 1058 + }, + { + "epoch": 0.18, + "grad_norm": 8.503838539123535, + "learning_rate": 2.727389737429209e-05, + "loss": 1.2968, + "step": 1059 + }, + { + "epoch": 0.18, + "grad_norm": 9.95584774017334, + "learning_rate": 2.7271323150849494e-05, + "loss": 1.3682, + "step": 1060 + }, + { + "epoch": 0.18, + "grad_norm": 9.960376739501953, + "learning_rate": 2.72687489274069e-05, + "loss": 1.4881, + "step": 1061 + }, + { + "epoch": 0.18, + "grad_norm": 7.651063919067383, + "learning_rate": 2.7266174703964304e-05, + "loss": 1.1785, + "step": 1062 + }, + { + "epoch": 0.18, + "grad_norm": 8.194541931152344, + "learning_rate": 2.7263600480521707e-05, + "loss": 1.172, + "step": 1063 + }, + { + "epoch": 0.18, + "grad_norm": 8.579036712646484, + "learning_rate": 2.7261026257079114e-05, + "loss": 1.563, + "step": 1064 + }, + { + "epoch": 0.18, + "grad_norm": 6.894886493682861, + "learning_rate": 2.725845203363652e-05, + "loss": 1.2226, + "step": 1065 + }, + { + "epoch": 0.18, + "grad_norm": 9.038219451904297, + "learning_rate": 2.7255877810193927e-05, + "loss": 1.1319, + "step": 1066 + }, + { + "epoch": 0.18, + "grad_norm": 7.268056869506836, + "learning_rate": 2.725330358675133e-05, + "loss": 1.0023, + "step": 1067 + }, + { + "epoch": 0.18, + "grad_norm": 8.772714614868164, + "learning_rate": 2.7250729363308737e-05, + "loss": 1.3525, + "step": 1068 + }, + { + "epoch": 0.18, + "grad_norm": 8.555374145507812, + "learning_rate": 2.724815513986614e-05, + "loss": 1.1983, + "step": 1069 + }, + { + "epoch": 0.18, + "grad_norm": 9.176958084106445, + "learning_rate": 2.7245580916423547e-05, + "loss": 1.3963, + "step": 1070 + }, + { + "epoch": 0.18, + "grad_norm": 10.416862487792969, + "learning_rate": 2.724300669298095e-05, + "loss": 1.2034, + "step": 1071 + }, + { + "epoch": 0.18, + "grad_norm": 8.942066192626953, + "learning_rate": 2.7240432469538354e-05, + "loss": 1.3266, + "step": 1072 + }, + { + "epoch": 0.18, + "grad_norm": 10.036736488342285, + "learning_rate": 2.723785824609576e-05, + "loss": 1.4365, + "step": 1073 + }, + { + "epoch": 0.18, + "grad_norm": 10.972512245178223, + "learning_rate": 2.7235284022653167e-05, + "loss": 1.4161, + "step": 1074 + }, + { + "epoch": 0.18, + "grad_norm": 11.268491744995117, + "learning_rate": 2.7232709799210574e-05, + "loss": 1.2924, + "step": 1075 + }, + { + "epoch": 0.18, + "grad_norm": 8.488449096679688, + "learning_rate": 2.7230135575767977e-05, + "loss": 1.0855, + "step": 1076 + }, + { + "epoch": 0.18, + "grad_norm": 9.027210235595703, + "learning_rate": 2.7227561352325384e-05, + "loss": 1.3078, + "step": 1077 + }, + { + "epoch": 0.19, + "grad_norm": 9.640469551086426, + "learning_rate": 2.7224987128882787e-05, + "loss": 1.3335, + "step": 1078 + }, + { + "epoch": 0.19, + "grad_norm": 6.25822114944458, + "learning_rate": 2.7222412905440194e-05, + "loss": 0.8048, + "step": 1079 + }, + { + "epoch": 0.19, + "grad_norm": 8.746109962463379, + "learning_rate": 2.7219838681997597e-05, + "loss": 1.454, + "step": 1080 + }, + { + "epoch": 0.19, + "grad_norm": 9.298506736755371, + "learning_rate": 2.7217264458555004e-05, + "loss": 1.1325, + "step": 1081 + }, + { + "epoch": 0.19, + "grad_norm": 10.089698791503906, + "learning_rate": 2.7214690235112407e-05, + "loss": 1.4946, + "step": 1082 + }, + { + "epoch": 0.19, + "grad_norm": 9.399145126342773, + "learning_rate": 2.721211601166981e-05, + "loss": 1.3796, + "step": 1083 + }, + { + "epoch": 0.19, + "grad_norm": 10.360230445861816, + "learning_rate": 2.720954178822722e-05, + "loss": 1.7057, + "step": 1084 + }, + { + "epoch": 0.19, + "grad_norm": 9.048979759216309, + "learning_rate": 2.7206967564784624e-05, + "loss": 1.2736, + "step": 1085 + }, + { + "epoch": 0.19, + "grad_norm": 9.581949234008789, + "learning_rate": 2.720439334134203e-05, + "loss": 1.2437, + "step": 1086 + }, + { + "epoch": 0.19, + "grad_norm": 8.216153144836426, + "learning_rate": 2.7201819117899434e-05, + "loss": 1.1414, + "step": 1087 + }, + { + "epoch": 0.19, + "grad_norm": 11.193462371826172, + "learning_rate": 2.719924489445684e-05, + "loss": 1.5273, + "step": 1088 + }, + { + "epoch": 0.19, + "grad_norm": 9.907831192016602, + "learning_rate": 2.7196670671014244e-05, + "loss": 1.4671, + "step": 1089 + }, + { + "epoch": 0.19, + "grad_norm": 8.600893020629883, + "learning_rate": 2.719409644757165e-05, + "loss": 1.3231, + "step": 1090 + }, + { + "epoch": 0.19, + "grad_norm": 9.562541961669922, + "learning_rate": 2.7191522224129054e-05, + "loss": 1.4549, + "step": 1091 + }, + { + "epoch": 0.19, + "grad_norm": 8.27472972869873, + "learning_rate": 2.718894800068646e-05, + "loss": 1.2719, + "step": 1092 + }, + { + "epoch": 0.19, + "grad_norm": 8.14704418182373, + "learning_rate": 2.7186373777243867e-05, + "loss": 1.0528, + "step": 1093 + }, + { + "epoch": 0.19, + "grad_norm": 9.050322532653809, + "learning_rate": 2.718379955380127e-05, + "loss": 1.5463, + "step": 1094 + }, + { + "epoch": 0.19, + "grad_norm": 7.295217514038086, + "learning_rate": 2.7181225330358677e-05, + "loss": 1.1819, + "step": 1095 + }, + { + "epoch": 0.19, + "grad_norm": 8.834641456604004, + "learning_rate": 2.717865110691608e-05, + "loss": 1.4474, + "step": 1096 + }, + { + "epoch": 0.19, + "grad_norm": 8.026323318481445, + "learning_rate": 2.7176076883473487e-05, + "loss": 1.1722, + "step": 1097 + }, + { + "epoch": 0.19, + "grad_norm": 10.28368854522705, + "learning_rate": 2.717350266003089e-05, + "loss": 1.2893, + "step": 1098 + }, + { + "epoch": 0.19, + "grad_norm": 9.067283630371094, + "learning_rate": 2.7170928436588297e-05, + "loss": 1.0889, + "step": 1099 + }, + { + "epoch": 0.19, + "grad_norm": 9.283278465270996, + "learning_rate": 2.71683542131457e-05, + "loss": 1.1066, + "step": 1100 + }, + { + "epoch": 0.19, + "grad_norm": 9.266766548156738, + "learning_rate": 2.7165779989703107e-05, + "loss": 1.5371, + "step": 1101 + }, + { + "epoch": 0.19, + "grad_norm": 9.837309837341309, + "learning_rate": 2.716320576626051e-05, + "loss": 1.2524, + "step": 1102 + }, + { + "epoch": 0.19, + "grad_norm": 11.806564331054688, + "learning_rate": 2.7160631542817917e-05, + "loss": 1.7349, + "step": 1103 + }, + { + "epoch": 0.19, + "grad_norm": 10.308426856994629, + "learning_rate": 2.7158057319375324e-05, + "loss": 1.8968, + "step": 1104 + }, + { + "epoch": 0.19, + "grad_norm": 8.350577354431152, + "learning_rate": 2.7155483095932727e-05, + "loss": 1.2429, + "step": 1105 + }, + { + "epoch": 0.19, + "grad_norm": 9.256535530090332, + "learning_rate": 2.7152908872490134e-05, + "loss": 1.2191, + "step": 1106 + }, + { + "epoch": 0.19, + "grad_norm": 10.345229148864746, + "learning_rate": 2.7150334649047537e-05, + "loss": 1.1011, + "step": 1107 + }, + { + "epoch": 0.19, + "grad_norm": 8.522645950317383, + "learning_rate": 2.7147760425604944e-05, + "loss": 1.2915, + "step": 1108 + }, + { + "epoch": 0.19, + "grad_norm": 10.150025367736816, + "learning_rate": 2.7145186202162347e-05, + "loss": 1.2009, + "step": 1109 + }, + { + "epoch": 0.19, + "grad_norm": 10.376709938049316, + "learning_rate": 2.7142611978719754e-05, + "loss": 1.5594, + "step": 1110 + }, + { + "epoch": 0.19, + "grad_norm": 8.121232032775879, + "learning_rate": 2.7140037755277157e-05, + "loss": 1.3129, + "step": 1111 + }, + { + "epoch": 0.19, + "grad_norm": 9.032523155212402, + "learning_rate": 2.7137463531834567e-05, + "loss": 1.1689, + "step": 1112 + }, + { + "epoch": 0.19, + "grad_norm": 7.9547905921936035, + "learning_rate": 2.713488930839197e-05, + "loss": 1.2583, + "step": 1113 + }, + { + "epoch": 0.19, + "grad_norm": 7.238072395324707, + "learning_rate": 2.7132315084949373e-05, + "loss": 1.1958, + "step": 1114 + }, + { + "epoch": 0.19, + "grad_norm": 9.16339111328125, + "learning_rate": 2.712974086150678e-05, + "loss": 1.2236, + "step": 1115 + }, + { + "epoch": 0.19, + "grad_norm": 9.557994842529297, + "learning_rate": 2.7127166638064183e-05, + "loss": 1.1426, + "step": 1116 + }, + { + "epoch": 0.19, + "grad_norm": 8.909612655639648, + "learning_rate": 2.712459241462159e-05, + "loss": 1.2599, + "step": 1117 + }, + { + "epoch": 0.19, + "grad_norm": 10.249550819396973, + "learning_rate": 2.7122018191178993e-05, + "loss": 1.4662, + "step": 1118 + }, + { + "epoch": 0.19, + "grad_norm": 9.27196979522705, + "learning_rate": 2.71194439677364e-05, + "loss": 1.472, + "step": 1119 + }, + { + "epoch": 0.19, + "grad_norm": 11.640380859375, + "learning_rate": 2.7116869744293803e-05, + "loss": 1.4584, + "step": 1120 + }, + { + "epoch": 0.19, + "grad_norm": 9.803701400756836, + "learning_rate": 2.7114295520851213e-05, + "loss": 1.0802, + "step": 1121 + }, + { + "epoch": 0.19, + "grad_norm": 8.972593307495117, + "learning_rate": 2.7111721297408617e-05, + "loss": 1.3308, + "step": 1122 + }, + { + "epoch": 0.19, + "grad_norm": 8.79111385345459, + "learning_rate": 2.7109147073966023e-05, + "loss": 1.2226, + "step": 1123 + }, + { + "epoch": 0.19, + "grad_norm": 11.142602920532227, + "learning_rate": 2.7106572850523427e-05, + "loss": 1.5817, + "step": 1124 + }, + { + "epoch": 0.19, + "grad_norm": 7.296048641204834, + "learning_rate": 2.710399862708083e-05, + "loss": 1.0926, + "step": 1125 + }, + { + "epoch": 0.19, + "grad_norm": 9.158304214477539, + "learning_rate": 2.7101424403638237e-05, + "loss": 1.4636, + "step": 1126 + }, + { + "epoch": 0.19, + "grad_norm": 9.532355308532715, + "learning_rate": 2.709885018019564e-05, + "loss": 1.4941, + "step": 1127 + }, + { + "epoch": 0.19, + "grad_norm": 7.275026798248291, + "learning_rate": 2.7096275956753047e-05, + "loss": 0.9197, + "step": 1128 + }, + { + "epoch": 0.19, + "grad_norm": 10.894339561462402, + "learning_rate": 2.709370173331045e-05, + "loss": 1.5351, + "step": 1129 + }, + { + "epoch": 0.19, + "grad_norm": 7.644191265106201, + "learning_rate": 2.7091127509867857e-05, + "loss": 1.028, + "step": 1130 + }, + { + "epoch": 0.19, + "grad_norm": 9.101517677307129, + "learning_rate": 2.7088553286425263e-05, + "loss": 1.4001, + "step": 1131 + }, + { + "epoch": 0.19, + "grad_norm": 7.999380588531494, + "learning_rate": 2.708597906298267e-05, + "loss": 1.015, + "step": 1132 + }, + { + "epoch": 0.19, + "grad_norm": 8.905345916748047, + "learning_rate": 2.7083404839540073e-05, + "loss": 1.6007, + "step": 1133 + }, + { + "epoch": 0.19, + "grad_norm": 8.128106117248535, + "learning_rate": 2.708083061609748e-05, + "loss": 1.4566, + "step": 1134 + }, + { + "epoch": 0.19, + "grad_norm": 8.673796653747559, + "learning_rate": 2.7078256392654883e-05, + "loss": 1.6084, + "step": 1135 + }, + { + "epoch": 0.19, + "grad_norm": 7.865784168243408, + "learning_rate": 2.7075682169212287e-05, + "loss": 1.2059, + "step": 1136 + }, + { + "epoch": 0.2, + "grad_norm": 8.854172706604004, + "learning_rate": 2.7073107945769693e-05, + "loss": 1.2551, + "step": 1137 + }, + { + "epoch": 0.2, + "grad_norm": 7.900066375732422, + "learning_rate": 2.7070533722327097e-05, + "loss": 1.1971, + "step": 1138 + }, + { + "epoch": 0.2, + "grad_norm": 8.985684394836426, + "learning_rate": 2.7067959498884503e-05, + "loss": 1.4171, + "step": 1139 + }, + { + "epoch": 0.2, + "grad_norm": 9.223388671875, + "learning_rate": 2.706538527544191e-05, + "loss": 1.502, + "step": 1140 + }, + { + "epoch": 0.2, + "grad_norm": 9.067261695861816, + "learning_rate": 2.7062811051999317e-05, + "loss": 1.5211, + "step": 1141 + }, + { + "epoch": 0.2, + "grad_norm": 8.43513298034668, + "learning_rate": 2.706023682855672e-05, + "loss": 1.1767, + "step": 1142 + }, + { + "epoch": 0.2, + "grad_norm": 8.506795883178711, + "learning_rate": 2.7057662605114127e-05, + "loss": 1.4128, + "step": 1143 + }, + { + "epoch": 0.2, + "grad_norm": 7.250001430511475, + "learning_rate": 2.705508838167153e-05, + "loss": 1.2489, + "step": 1144 + }, + { + "epoch": 0.2, + "grad_norm": 7.432572841644287, + "learning_rate": 2.7052514158228933e-05, + "loss": 1.1841, + "step": 1145 + }, + { + "epoch": 0.2, + "grad_norm": 9.109137535095215, + "learning_rate": 2.704993993478634e-05, + "loss": 1.2777, + "step": 1146 + }, + { + "epoch": 0.2, + "grad_norm": 8.922881126403809, + "learning_rate": 2.7047365711343743e-05, + "loss": 1.4648, + "step": 1147 + }, + { + "epoch": 0.2, + "grad_norm": 8.688481330871582, + "learning_rate": 2.704479148790115e-05, + "loss": 1.358, + "step": 1148 + }, + { + "epoch": 0.2, + "grad_norm": 8.354059219360352, + "learning_rate": 2.7042217264458553e-05, + "loss": 0.9972, + "step": 1149 + }, + { + "epoch": 0.2, + "grad_norm": 8.341496467590332, + "learning_rate": 2.7039643041015963e-05, + "loss": 1.2205, + "step": 1150 + }, + { + "epoch": 0.2, + "grad_norm": 8.882163047790527, + "learning_rate": 2.7037068817573366e-05, + "loss": 1.2021, + "step": 1151 + }, + { + "epoch": 0.2, + "grad_norm": 8.504749298095703, + "learning_rate": 2.7034494594130773e-05, + "loss": 1.3326, + "step": 1152 + }, + { + "epoch": 0.2, + "grad_norm": 9.217513084411621, + "learning_rate": 2.7031920370688176e-05, + "loss": 1.3535, + "step": 1153 + }, + { + "epoch": 0.2, + "grad_norm": 9.810968399047852, + "learning_rate": 2.7029346147245583e-05, + "loss": 1.2546, + "step": 1154 + }, + { + "epoch": 0.2, + "grad_norm": 8.84786319732666, + "learning_rate": 2.7026771923802986e-05, + "loss": 1.4116, + "step": 1155 + }, + { + "epoch": 0.2, + "grad_norm": 9.833187103271484, + "learning_rate": 2.702419770036039e-05, + "loss": 1.4681, + "step": 1156 + }, + { + "epoch": 0.2, + "grad_norm": 7.529823303222656, + "learning_rate": 2.7021623476917796e-05, + "loss": 1.2409, + "step": 1157 + }, + { + "epoch": 0.2, + "grad_norm": 8.504322052001953, + "learning_rate": 2.70190492534752e-05, + "loss": 1.1238, + "step": 1158 + }, + { + "epoch": 0.2, + "grad_norm": 8.834338188171387, + "learning_rate": 2.701647503003261e-05, + "loss": 1.3571, + "step": 1159 + }, + { + "epoch": 0.2, + "grad_norm": 9.878161430358887, + "learning_rate": 2.7013900806590013e-05, + "loss": 1.19, + "step": 1160 + }, + { + "epoch": 0.2, + "grad_norm": 8.034382820129395, + "learning_rate": 2.701132658314742e-05, + "loss": 1.0617, + "step": 1161 + }, + { + "epoch": 0.2, + "grad_norm": 7.727060317993164, + "learning_rate": 2.7008752359704823e-05, + "loss": 1.061, + "step": 1162 + }, + { + "epoch": 0.2, + "grad_norm": 8.908223152160645, + "learning_rate": 2.700617813626223e-05, + "loss": 0.8979, + "step": 1163 + }, + { + "epoch": 0.2, + "grad_norm": 8.940628051757812, + "learning_rate": 2.7003603912819633e-05, + "loss": 1.3193, + "step": 1164 + }, + { + "epoch": 0.2, + "grad_norm": 10.10629653930664, + "learning_rate": 2.700102968937704e-05, + "loss": 1.7784, + "step": 1165 + }, + { + "epoch": 0.2, + "grad_norm": 7.900351524353027, + "learning_rate": 2.6998455465934443e-05, + "loss": 1.2081, + "step": 1166 + }, + { + "epoch": 0.2, + "grad_norm": 10.000555038452148, + "learning_rate": 2.6995881242491846e-05, + "loss": 1.3022, + "step": 1167 + }, + { + "epoch": 0.2, + "grad_norm": 9.327181816101074, + "learning_rate": 2.6993307019049253e-05, + "loss": 1.2732, + "step": 1168 + }, + { + "epoch": 0.2, + "grad_norm": 9.021303176879883, + "learning_rate": 2.699073279560666e-05, + "loss": 1.4867, + "step": 1169 + }, + { + "epoch": 0.2, + "grad_norm": 11.396635055541992, + "learning_rate": 2.6988158572164066e-05, + "loss": 1.8524, + "step": 1170 + }, + { + "epoch": 0.2, + "grad_norm": 8.448280334472656, + "learning_rate": 2.698558434872147e-05, + "loss": 1.3967, + "step": 1171 + }, + { + "epoch": 0.2, + "grad_norm": 9.023969650268555, + "learning_rate": 2.6983010125278876e-05, + "loss": 1.1958, + "step": 1172 + }, + { + "epoch": 0.2, + "grad_norm": 8.381837844848633, + "learning_rate": 2.698043590183628e-05, + "loss": 1.2659, + "step": 1173 + }, + { + "epoch": 0.2, + "grad_norm": 7.306741714477539, + "learning_rate": 2.6977861678393686e-05, + "loss": 1.1591, + "step": 1174 + }, + { + "epoch": 0.2, + "grad_norm": 6.948760032653809, + "learning_rate": 2.697528745495109e-05, + "loss": 1.0862, + "step": 1175 + }, + { + "epoch": 0.2, + "grad_norm": 8.335007667541504, + "learning_rate": 2.6972713231508493e-05, + "loss": 0.8766, + "step": 1176 + }, + { + "epoch": 0.2, + "grad_norm": 8.226988792419434, + "learning_rate": 2.69701390080659e-05, + "loss": 1.2587, + "step": 1177 + }, + { + "epoch": 0.2, + "grad_norm": 7.792378902435303, + "learning_rate": 2.6967564784623306e-05, + "loss": 1.2425, + "step": 1178 + }, + { + "epoch": 0.2, + "grad_norm": 9.548218727111816, + "learning_rate": 2.6964990561180713e-05, + "loss": 1.2892, + "step": 1179 + }, + { + "epoch": 0.2, + "grad_norm": 10.905377388000488, + "learning_rate": 2.6962416337738116e-05, + "loss": 1.5361, + "step": 1180 + }, + { + "epoch": 0.2, + "grad_norm": 8.688945770263672, + "learning_rate": 2.6959842114295523e-05, + "loss": 1.3123, + "step": 1181 + }, + { + "epoch": 0.2, + "grad_norm": 7.875213146209717, + "learning_rate": 2.6957267890852926e-05, + "loss": 0.9015, + "step": 1182 + }, + { + "epoch": 0.2, + "grad_norm": 9.691976547241211, + "learning_rate": 2.6954693667410333e-05, + "loss": 1.4515, + "step": 1183 + }, + { + "epoch": 0.2, + "grad_norm": 11.65355396270752, + "learning_rate": 2.6952119443967736e-05, + "loss": 1.9028, + "step": 1184 + }, + { + "epoch": 0.2, + "grad_norm": 8.477837562561035, + "learning_rate": 2.6949545220525143e-05, + "loss": 1.2233, + "step": 1185 + }, + { + "epoch": 0.2, + "grad_norm": 8.239665985107422, + "learning_rate": 2.6946970997082546e-05, + "loss": 1.2302, + "step": 1186 + }, + { + "epoch": 0.2, + "grad_norm": 9.567435264587402, + "learning_rate": 2.694439677363995e-05, + "loss": 1.456, + "step": 1187 + }, + { + "epoch": 0.2, + "grad_norm": 7.737056732177734, + "learning_rate": 2.694182255019736e-05, + "loss": 1.0246, + "step": 1188 + }, + { + "epoch": 0.2, + "grad_norm": 8.440693855285645, + "learning_rate": 2.6939248326754763e-05, + "loss": 1.1833, + "step": 1189 + }, + { + "epoch": 0.2, + "grad_norm": 8.651070594787598, + "learning_rate": 2.693667410331217e-05, + "loss": 1.2408, + "step": 1190 + }, + { + "epoch": 0.2, + "grad_norm": 9.181726455688477, + "learning_rate": 2.6934099879869573e-05, + "loss": 1.4955, + "step": 1191 + }, + { + "epoch": 0.2, + "grad_norm": 9.472039222717285, + "learning_rate": 2.693152565642698e-05, + "loss": 1.4052, + "step": 1192 + }, + { + "epoch": 0.2, + "grad_norm": 9.136848449707031, + "learning_rate": 2.6928951432984383e-05, + "loss": 1.0829, + "step": 1193 + }, + { + "epoch": 0.2, + "grad_norm": 7.888769626617432, + "learning_rate": 2.692637720954179e-05, + "loss": 1.1699, + "step": 1194 + }, + { + "epoch": 0.21, + "grad_norm": 8.18743896484375, + "learning_rate": 2.6923802986099193e-05, + "loss": 1.0537, + "step": 1195 + }, + { + "epoch": 0.21, + "grad_norm": 9.397427558898926, + "learning_rate": 2.69212287626566e-05, + "loss": 1.2303, + "step": 1196 + }, + { + "epoch": 0.21, + "grad_norm": 8.45164966583252, + "learning_rate": 2.6918654539214006e-05, + "loss": 1.5722, + "step": 1197 + }, + { + "epoch": 0.21, + "grad_norm": 10.499549865722656, + "learning_rate": 2.691608031577141e-05, + "loss": 1.4703, + "step": 1198 + }, + { + "epoch": 0.21, + "grad_norm": 9.246573448181152, + "learning_rate": 2.6913506092328816e-05, + "loss": 1.5224, + "step": 1199 + }, + { + "epoch": 0.21, + "grad_norm": 9.132889747619629, + "learning_rate": 2.691093186888622e-05, + "loss": 1.2977, + "step": 1200 + }, + { + "epoch": 0.21, + "grad_norm": 8.937135696411133, + "learning_rate": 2.6908357645443626e-05, + "loss": 1.3967, + "step": 1201 + }, + { + "epoch": 0.21, + "grad_norm": 9.2399263381958, + "learning_rate": 2.690578342200103e-05, + "loss": 1.0836, + "step": 1202 + }, + { + "epoch": 0.21, + "grad_norm": 9.38232135772705, + "learning_rate": 2.6903209198558436e-05, + "loss": 1.2621, + "step": 1203 + }, + { + "epoch": 0.21, + "grad_norm": 9.263805389404297, + "learning_rate": 2.690063497511584e-05, + "loss": 1.382, + "step": 1204 + }, + { + "epoch": 0.21, + "grad_norm": 8.532333374023438, + "learning_rate": 2.6898060751673246e-05, + "loss": 1.0092, + "step": 1205 + }, + { + "epoch": 0.21, + "grad_norm": 10.846914291381836, + "learning_rate": 2.689548652823065e-05, + "loss": 1.2845, + "step": 1206 + }, + { + "epoch": 0.21, + "grad_norm": 7.730926990509033, + "learning_rate": 2.6892912304788056e-05, + "loss": 0.9868, + "step": 1207 + }, + { + "epoch": 0.21, + "grad_norm": 9.525662422180176, + "learning_rate": 2.6890338081345463e-05, + "loss": 1.4252, + "step": 1208 + }, + { + "epoch": 0.21, + "grad_norm": 9.073384284973145, + "learning_rate": 2.6887763857902866e-05, + "loss": 1.4963, + "step": 1209 + }, + { + "epoch": 0.21, + "grad_norm": 10.478918075561523, + "learning_rate": 2.6885189634460273e-05, + "loss": 1.3453, + "step": 1210 + }, + { + "epoch": 0.21, + "grad_norm": 10.732980728149414, + "learning_rate": 2.6882615411017676e-05, + "loss": 1.4029, + "step": 1211 + }, + { + "epoch": 0.21, + "grad_norm": 8.388533592224121, + "learning_rate": 2.6880041187575083e-05, + "loss": 1.1371, + "step": 1212 + }, + { + "epoch": 0.21, + "grad_norm": 8.94727611541748, + "learning_rate": 2.6877466964132486e-05, + "loss": 1.2883, + "step": 1213 + }, + { + "epoch": 0.21, + "grad_norm": 9.437952995300293, + "learning_rate": 2.6874892740689893e-05, + "loss": 1.311, + "step": 1214 + }, + { + "epoch": 0.21, + "grad_norm": 10.140856742858887, + "learning_rate": 2.6872318517247296e-05, + "loss": 1.1271, + "step": 1215 + }, + { + "epoch": 0.21, + "grad_norm": 8.91845417022705, + "learning_rate": 2.6869744293804706e-05, + "loss": 1.1375, + "step": 1216 + }, + { + "epoch": 0.21, + "grad_norm": 8.56423568725586, + "learning_rate": 2.686717007036211e-05, + "loss": 1.4121, + "step": 1217 + }, + { + "epoch": 0.21, + "grad_norm": 9.286993026733398, + "learning_rate": 2.6864595846919513e-05, + "loss": 1.304, + "step": 1218 + }, + { + "epoch": 0.21, + "grad_norm": 9.56916332244873, + "learning_rate": 2.686202162347692e-05, + "loss": 1.4004, + "step": 1219 + }, + { + "epoch": 0.21, + "grad_norm": 8.579012870788574, + "learning_rate": 2.6859447400034322e-05, + "loss": 1.4633, + "step": 1220 + }, + { + "epoch": 0.21, + "grad_norm": 7.460388660430908, + "learning_rate": 2.685687317659173e-05, + "loss": 0.9385, + "step": 1221 + }, + { + "epoch": 0.21, + "grad_norm": 10.638516426086426, + "learning_rate": 2.6854298953149132e-05, + "loss": 1.6112, + "step": 1222 + }, + { + "epoch": 0.21, + "grad_norm": 9.712013244628906, + "learning_rate": 2.685172472970654e-05, + "loss": 1.4429, + "step": 1223 + }, + { + "epoch": 0.21, + "grad_norm": 8.578434944152832, + "learning_rate": 2.6849150506263942e-05, + "loss": 1.4887, + "step": 1224 + }, + { + "epoch": 0.21, + "grad_norm": 9.140835762023926, + "learning_rate": 2.6846576282821352e-05, + "loss": 1.1822, + "step": 1225 + }, + { + "epoch": 0.21, + "grad_norm": 9.051060676574707, + "learning_rate": 2.6844002059378756e-05, + "loss": 1.2473, + "step": 1226 + }, + { + "epoch": 0.21, + "grad_norm": 9.668966293334961, + "learning_rate": 2.6841427835936162e-05, + "loss": 1.2553, + "step": 1227 + }, + { + "epoch": 0.21, + "grad_norm": 8.400277137756348, + "learning_rate": 2.6838853612493566e-05, + "loss": 1.2029, + "step": 1228 + }, + { + "epoch": 0.21, + "grad_norm": 8.655241012573242, + "learning_rate": 2.683627938905097e-05, + "loss": 1.5658, + "step": 1229 + }, + { + "epoch": 0.21, + "grad_norm": 8.049247741699219, + "learning_rate": 2.6833705165608376e-05, + "loss": 1.293, + "step": 1230 + }, + { + "epoch": 0.21, + "grad_norm": 9.07247543334961, + "learning_rate": 2.683113094216578e-05, + "loss": 1.3902, + "step": 1231 + }, + { + "epoch": 0.21, + "grad_norm": 8.127625465393066, + "learning_rate": 2.6828556718723186e-05, + "loss": 1.2842, + "step": 1232 + }, + { + "epoch": 0.21, + "grad_norm": 7.649299144744873, + "learning_rate": 2.682598249528059e-05, + "loss": 0.8758, + "step": 1233 + }, + { + "epoch": 0.21, + "grad_norm": 10.730408668518066, + "learning_rate": 2.6823408271837996e-05, + "loss": 1.4391, + "step": 1234 + }, + { + "epoch": 0.21, + "grad_norm": 10.480378150939941, + "learning_rate": 2.6820834048395402e-05, + "loss": 1.5429, + "step": 1235 + }, + { + "epoch": 0.21, + "grad_norm": 9.526721954345703, + "learning_rate": 2.681825982495281e-05, + "loss": 1.2032, + "step": 1236 + }, + { + "epoch": 0.21, + "grad_norm": 10.1389799118042, + "learning_rate": 2.6815685601510212e-05, + "loss": 1.0992, + "step": 1237 + }, + { + "epoch": 0.21, + "grad_norm": 10.146998405456543, + "learning_rate": 2.681311137806762e-05, + "loss": 1.4844, + "step": 1238 + }, + { + "epoch": 0.21, + "grad_norm": 11.229116439819336, + "learning_rate": 2.6810537154625022e-05, + "loss": 1.2092, + "step": 1239 + }, + { + "epoch": 0.21, + "grad_norm": 9.03791332244873, + "learning_rate": 2.6807962931182426e-05, + "loss": 1.2726, + "step": 1240 + }, + { + "epoch": 0.21, + "grad_norm": 9.248305320739746, + "learning_rate": 2.6805388707739832e-05, + "loss": 1.4621, + "step": 1241 + }, + { + "epoch": 0.21, + "grad_norm": 9.154080390930176, + "learning_rate": 2.6802814484297236e-05, + "loss": 1.3475, + "step": 1242 + }, + { + "epoch": 0.21, + "grad_norm": 9.43741512298584, + "learning_rate": 2.6800240260854642e-05, + "loss": 1.3909, + "step": 1243 + }, + { + "epoch": 0.21, + "grad_norm": 8.604585647583008, + "learning_rate": 2.679766603741205e-05, + "loss": 1.4298, + "step": 1244 + }, + { + "epoch": 0.21, + "grad_norm": 8.446846961975098, + "learning_rate": 2.6795091813969456e-05, + "loss": 1.1632, + "step": 1245 + }, + { + "epoch": 0.21, + "grad_norm": 7.354076862335205, + "learning_rate": 2.679251759052686e-05, + "loss": 1.257, + "step": 1246 + }, + { + "epoch": 0.21, + "grad_norm": 9.221611976623535, + "learning_rate": 2.6789943367084266e-05, + "loss": 1.4602, + "step": 1247 + }, + { + "epoch": 0.21, + "grad_norm": 8.221695899963379, + "learning_rate": 2.678736914364167e-05, + "loss": 1.3385, + "step": 1248 + }, + { + "epoch": 0.21, + "grad_norm": 8.068194389343262, + "learning_rate": 2.6784794920199072e-05, + "loss": 1.226, + "step": 1249 + }, + { + "epoch": 0.21, + "grad_norm": 8.30330753326416, + "learning_rate": 2.678222069675648e-05, + "loss": 1.3546, + "step": 1250 + }, + { + "epoch": 0.21, + "grad_norm": 9.008672714233398, + "learning_rate": 2.6779646473313882e-05, + "loss": 1.4961, + "step": 1251 + }, + { + "epoch": 0.21, + "grad_norm": 9.799113273620605, + "learning_rate": 2.677707224987129e-05, + "loss": 1.419, + "step": 1252 + }, + { + "epoch": 0.22, + "grad_norm": 8.145889282226562, + "learning_rate": 2.6774498026428692e-05, + "loss": 0.9806, + "step": 1253 + }, + { + "epoch": 0.22, + "grad_norm": 8.050820350646973, + "learning_rate": 2.6771923802986102e-05, + "loss": 1.3412, + "step": 1254 + }, + { + "epoch": 0.22, + "grad_norm": 8.461099624633789, + "learning_rate": 2.6769349579543506e-05, + "loss": 1.1918, + "step": 1255 + }, + { + "epoch": 0.22, + "grad_norm": 8.203243255615234, + "learning_rate": 2.6766775356100912e-05, + "loss": 1.2476, + "step": 1256 + }, + { + "epoch": 0.22, + "grad_norm": 8.571125984191895, + "learning_rate": 2.6764201132658315e-05, + "loss": 1.1585, + "step": 1257 + }, + { + "epoch": 0.22, + "grad_norm": 9.301223754882812, + "learning_rate": 2.6761626909215722e-05, + "loss": 1.5106, + "step": 1258 + }, + { + "epoch": 0.22, + "grad_norm": 9.19857406616211, + "learning_rate": 2.6759052685773125e-05, + "loss": 1.2659, + "step": 1259 + }, + { + "epoch": 0.22, + "grad_norm": 9.036355018615723, + "learning_rate": 2.675647846233053e-05, + "loss": 1.137, + "step": 1260 + }, + { + "epoch": 0.22, + "grad_norm": 10.456526756286621, + "learning_rate": 2.6753904238887935e-05, + "loss": 1.4946, + "step": 1261 + }, + { + "epoch": 0.22, + "grad_norm": 8.323201179504395, + "learning_rate": 2.675133001544534e-05, + "loss": 1.1494, + "step": 1262 + }, + { + "epoch": 0.22, + "grad_norm": 10.239011764526367, + "learning_rate": 2.674875579200275e-05, + "loss": 1.3713, + "step": 1263 + }, + { + "epoch": 0.22, + "grad_norm": 8.527006149291992, + "learning_rate": 2.6746181568560152e-05, + "loss": 1.1823, + "step": 1264 + }, + { + "epoch": 0.22, + "grad_norm": 7.94394063949585, + "learning_rate": 2.674360734511756e-05, + "loss": 1.322, + "step": 1265 + }, + { + "epoch": 0.22, + "grad_norm": 8.312918663024902, + "learning_rate": 2.6741033121674962e-05, + "loss": 1.1758, + "step": 1266 + }, + { + "epoch": 0.22, + "grad_norm": 9.257575988769531, + "learning_rate": 2.673845889823237e-05, + "loss": 1.2889, + "step": 1267 + }, + { + "epoch": 0.22, + "grad_norm": 10.413891792297363, + "learning_rate": 2.6735884674789772e-05, + "loss": 1.515, + "step": 1268 + }, + { + "epoch": 0.22, + "grad_norm": 7.223206043243408, + "learning_rate": 2.673331045134718e-05, + "loss": 1.0707, + "step": 1269 + }, + { + "epoch": 0.22, + "grad_norm": 9.151848793029785, + "learning_rate": 2.6730736227904582e-05, + "loss": 1.0929, + "step": 1270 + }, + { + "epoch": 0.22, + "grad_norm": 8.26773452758789, + "learning_rate": 2.6728162004461985e-05, + "loss": 1.1119, + "step": 1271 + }, + { + "epoch": 0.22, + "grad_norm": 11.067112922668457, + "learning_rate": 2.6725587781019392e-05, + "loss": 1.4869, + "step": 1272 + }, + { + "epoch": 0.22, + "grad_norm": 9.439190864562988, + "learning_rate": 2.67230135575768e-05, + "loss": 1.4908, + "step": 1273 + }, + { + "epoch": 0.22, + "grad_norm": 8.8590669631958, + "learning_rate": 2.6720439334134205e-05, + "loss": 1.3083, + "step": 1274 + }, + { + "epoch": 0.22, + "grad_norm": 8.895794868469238, + "learning_rate": 2.671786511069161e-05, + "loss": 1.2789, + "step": 1275 + }, + { + "epoch": 0.22, + "grad_norm": 10.065464973449707, + "learning_rate": 2.6715290887249015e-05, + "loss": 1.3152, + "step": 1276 + }, + { + "epoch": 0.22, + "grad_norm": 8.821928977966309, + "learning_rate": 2.671271666380642e-05, + "loss": 1.3512, + "step": 1277 + }, + { + "epoch": 0.22, + "grad_norm": 8.471129417419434, + "learning_rate": 2.6710142440363825e-05, + "loss": 1.0972, + "step": 1278 + }, + { + "epoch": 0.22, + "grad_norm": 9.04776668548584, + "learning_rate": 2.670756821692123e-05, + "loss": 1.248, + "step": 1279 + }, + { + "epoch": 0.22, + "grad_norm": 8.509843826293945, + "learning_rate": 2.6704993993478632e-05, + "loss": 1.2877, + "step": 1280 + }, + { + "epoch": 0.22, + "grad_norm": 9.483633041381836, + "learning_rate": 2.670241977003604e-05, + "loss": 1.217, + "step": 1281 + }, + { + "epoch": 0.22, + "grad_norm": 8.813414573669434, + "learning_rate": 2.6699845546593445e-05, + "loss": 1.2106, + "step": 1282 + }, + { + "epoch": 0.22, + "grad_norm": 8.751136779785156, + "learning_rate": 2.6697271323150852e-05, + "loss": 1.1097, + "step": 1283 + }, + { + "epoch": 0.22, + "grad_norm": 8.629862785339355, + "learning_rate": 2.6694697099708255e-05, + "loss": 1.1052, + "step": 1284 + }, + { + "epoch": 0.22, + "grad_norm": 7.738077640533447, + "learning_rate": 2.6692122876265662e-05, + "loss": 0.9371, + "step": 1285 + }, + { + "epoch": 0.22, + "grad_norm": 9.272834777832031, + "learning_rate": 2.6689548652823065e-05, + "loss": 1.178, + "step": 1286 + }, + { + "epoch": 0.22, + "grad_norm": 8.922650337219238, + "learning_rate": 2.6686974429380472e-05, + "loss": 1.1148, + "step": 1287 + }, + { + "epoch": 0.22, + "grad_norm": 10.045205116271973, + "learning_rate": 2.6684400205937875e-05, + "loss": 1.4301, + "step": 1288 + }, + { + "epoch": 0.22, + "grad_norm": 9.679380416870117, + "learning_rate": 2.6681825982495282e-05, + "loss": 1.3469, + "step": 1289 + }, + { + "epoch": 0.22, + "grad_norm": 8.931136131286621, + "learning_rate": 2.6679251759052685e-05, + "loss": 1.4528, + "step": 1290 + }, + { + "epoch": 0.22, + "grad_norm": 7.9316534996032715, + "learning_rate": 2.667667753561009e-05, + "loss": 0.8932, + "step": 1291 + }, + { + "epoch": 0.22, + "grad_norm": 8.66825008392334, + "learning_rate": 2.66741033121675e-05, + "loss": 1.2996, + "step": 1292 + }, + { + "epoch": 0.22, + "grad_norm": 7.578680515289307, + "learning_rate": 2.6671529088724902e-05, + "loss": 0.929, + "step": 1293 + }, + { + "epoch": 0.22, + "grad_norm": 9.219220161437988, + "learning_rate": 2.666895486528231e-05, + "loss": 1.1454, + "step": 1294 + }, + { + "epoch": 0.22, + "grad_norm": 7.822579860687256, + "learning_rate": 2.6666380641839712e-05, + "loss": 1.1635, + "step": 1295 + }, + { + "epoch": 0.22, + "grad_norm": 8.75918197631836, + "learning_rate": 2.666380641839712e-05, + "loss": 1.3699, + "step": 1296 + }, + { + "epoch": 0.22, + "grad_norm": 9.776763916015625, + "learning_rate": 2.6661232194954522e-05, + "loss": 1.425, + "step": 1297 + }, + { + "epoch": 0.22, + "grad_norm": 7.903858184814453, + "learning_rate": 2.665865797151193e-05, + "loss": 1.2982, + "step": 1298 + }, + { + "epoch": 0.22, + "grad_norm": 7.8899312019348145, + "learning_rate": 2.6656083748069332e-05, + "loss": 1.0836, + "step": 1299 + }, + { + "epoch": 0.22, + "grad_norm": 9.65578556060791, + "learning_rate": 2.665350952462674e-05, + "loss": 1.2516, + "step": 1300 + }, + { + "epoch": 0.22, + "grad_norm": 11.424769401550293, + "learning_rate": 2.6650935301184145e-05, + "loss": 1.6553, + "step": 1301 + }, + { + "epoch": 0.22, + "grad_norm": 9.16710090637207, + "learning_rate": 2.664836107774155e-05, + "loss": 1.1495, + "step": 1302 + }, + { + "epoch": 0.22, + "grad_norm": 9.541897773742676, + "learning_rate": 2.6645786854298955e-05, + "loss": 1.2282, + "step": 1303 + }, + { + "epoch": 0.22, + "grad_norm": 9.607604026794434, + "learning_rate": 2.664321263085636e-05, + "loss": 1.4688, + "step": 1304 + }, + { + "epoch": 0.22, + "grad_norm": 8.082883834838867, + "learning_rate": 2.6640638407413765e-05, + "loss": 0.9886, + "step": 1305 + }, + { + "epoch": 0.22, + "grad_norm": 10.696791648864746, + "learning_rate": 2.663806418397117e-05, + "loss": 1.0391, + "step": 1306 + }, + { + "epoch": 0.22, + "grad_norm": 9.231827735900879, + "learning_rate": 2.6635489960528575e-05, + "loss": 1.4162, + "step": 1307 + }, + { + "epoch": 0.22, + "grad_norm": 9.852787971496582, + "learning_rate": 2.6632915737085978e-05, + "loss": 1.0936, + "step": 1308 + }, + { + "epoch": 0.22, + "grad_norm": 8.198777198791504, + "learning_rate": 2.6630341513643385e-05, + "loss": 1.2517, + "step": 1309 + }, + { + "epoch": 0.22, + "grad_norm": 9.10767650604248, + "learning_rate": 2.6627767290200788e-05, + "loss": 1.2609, + "step": 1310 + }, + { + "epoch": 0.22, + "grad_norm": 9.866680145263672, + "learning_rate": 2.6625193066758195e-05, + "loss": 1.2264, + "step": 1311 + }, + { + "epoch": 0.23, + "grad_norm": 7.617739677429199, + "learning_rate": 2.66226188433156e-05, + "loss": 1.0467, + "step": 1312 + }, + { + "epoch": 0.23, + "grad_norm": 7.443436622619629, + "learning_rate": 2.6620044619873005e-05, + "loss": 0.9794, + "step": 1313 + }, + { + "epoch": 0.23, + "grad_norm": 8.602325439453125, + "learning_rate": 2.661747039643041e-05, + "loss": 1.3684, + "step": 1314 + }, + { + "epoch": 0.23, + "grad_norm": 8.663492202758789, + "learning_rate": 2.6614896172987815e-05, + "loss": 1.2373, + "step": 1315 + }, + { + "epoch": 0.23, + "grad_norm": 11.530584335327148, + "learning_rate": 2.661232194954522e-05, + "loss": 1.4406, + "step": 1316 + }, + { + "epoch": 0.23, + "grad_norm": 8.34512996673584, + "learning_rate": 2.6609747726102625e-05, + "loss": 1.0497, + "step": 1317 + }, + { + "epoch": 0.23, + "grad_norm": 7.65249490737915, + "learning_rate": 2.660717350266003e-05, + "loss": 1.0519, + "step": 1318 + }, + { + "epoch": 0.23, + "grad_norm": 9.023550987243652, + "learning_rate": 2.6604599279217435e-05, + "loss": 1.3027, + "step": 1319 + }, + { + "epoch": 0.23, + "grad_norm": 10.111237525939941, + "learning_rate": 2.6602025055774845e-05, + "loss": 1.4954, + "step": 1320 + }, + { + "epoch": 0.23, + "grad_norm": 10.713436126708984, + "learning_rate": 2.6599450832332248e-05, + "loss": 1.6837, + "step": 1321 + }, + { + "epoch": 0.23, + "grad_norm": 9.139344215393066, + "learning_rate": 2.659687660888965e-05, + "loss": 1.4476, + "step": 1322 + }, + { + "epoch": 0.23, + "grad_norm": 8.681056022644043, + "learning_rate": 2.6594302385447058e-05, + "loss": 1.2565, + "step": 1323 + }, + { + "epoch": 0.23, + "grad_norm": 8.738880157470703, + "learning_rate": 2.659172816200446e-05, + "loss": 1.2536, + "step": 1324 + }, + { + "epoch": 0.23, + "grad_norm": 8.696887969970703, + "learning_rate": 2.6589153938561868e-05, + "loss": 1.6081, + "step": 1325 + }, + { + "epoch": 0.23, + "grad_norm": 8.451643943786621, + "learning_rate": 2.658657971511927e-05, + "loss": 1.1937, + "step": 1326 + }, + { + "epoch": 0.23, + "grad_norm": 9.157116889953613, + "learning_rate": 2.6584005491676678e-05, + "loss": 1.1614, + "step": 1327 + }, + { + "epoch": 0.23, + "grad_norm": 10.9005765914917, + "learning_rate": 2.658143126823408e-05, + "loss": 1.3988, + "step": 1328 + }, + { + "epoch": 0.23, + "grad_norm": 7.618021011352539, + "learning_rate": 2.6578857044791488e-05, + "loss": 1.0125, + "step": 1329 + }, + { + "epoch": 0.23, + "grad_norm": 8.41938591003418, + "learning_rate": 2.6576282821348895e-05, + "loss": 1.1074, + "step": 1330 + }, + { + "epoch": 0.23, + "grad_norm": 9.549199104309082, + "learning_rate": 2.65737085979063e-05, + "loss": 1.3725, + "step": 1331 + }, + { + "epoch": 0.23, + "grad_norm": 7.936274528503418, + "learning_rate": 2.6571134374463705e-05, + "loss": 1.4184, + "step": 1332 + }, + { + "epoch": 0.23, + "grad_norm": 9.33743667602539, + "learning_rate": 2.6568560151021108e-05, + "loss": 1.3609, + "step": 1333 + }, + { + "epoch": 0.23, + "grad_norm": 9.436013221740723, + "learning_rate": 2.6565985927578515e-05, + "loss": 1.3234, + "step": 1334 + }, + { + "epoch": 0.23, + "grad_norm": 10.227049827575684, + "learning_rate": 2.6563411704135918e-05, + "loss": 1.455, + "step": 1335 + }, + { + "epoch": 0.23, + "grad_norm": 8.859749794006348, + "learning_rate": 2.6560837480693325e-05, + "loss": 1.381, + "step": 1336 + }, + { + "epoch": 0.23, + "grad_norm": 9.653039932250977, + "learning_rate": 2.6558263257250728e-05, + "loss": 1.2519, + "step": 1337 + }, + { + "epoch": 0.23, + "grad_norm": 10.58700942993164, + "learning_rate": 2.6555689033808135e-05, + "loss": 1.5164, + "step": 1338 + }, + { + "epoch": 0.23, + "grad_norm": 8.98537826538086, + "learning_rate": 2.655311481036554e-05, + "loss": 1.0812, + "step": 1339 + }, + { + "epoch": 0.23, + "grad_norm": 9.802996635437012, + "learning_rate": 2.6550540586922948e-05, + "loss": 1.4409, + "step": 1340 + }, + { + "epoch": 0.23, + "grad_norm": 7.917903900146484, + "learning_rate": 2.654796636348035e-05, + "loss": 0.9767, + "step": 1341 + }, + { + "epoch": 0.23, + "grad_norm": 10.221455574035645, + "learning_rate": 2.6545392140037758e-05, + "loss": 1.2497, + "step": 1342 + }, + { + "epoch": 0.23, + "grad_norm": 8.750977516174316, + "learning_rate": 2.654281791659516e-05, + "loss": 1.1992, + "step": 1343 + }, + { + "epoch": 0.23, + "grad_norm": 9.34138298034668, + "learning_rate": 2.6540243693152565e-05, + "loss": 1.3582, + "step": 1344 + }, + { + "epoch": 0.23, + "grad_norm": 10.008773803710938, + "learning_rate": 2.653766946970997e-05, + "loss": 1.5271, + "step": 1345 + }, + { + "epoch": 0.23, + "grad_norm": 8.252571105957031, + "learning_rate": 2.6535095246267375e-05, + "loss": 1.1353, + "step": 1346 + }, + { + "epoch": 0.23, + "grad_norm": 9.196686744689941, + "learning_rate": 2.653252102282478e-05, + "loss": 1.2903, + "step": 1347 + }, + { + "epoch": 0.23, + "grad_norm": 10.113208770751953, + "learning_rate": 2.6529946799382188e-05, + "loss": 1.1973, + "step": 1348 + }, + { + "epoch": 0.23, + "grad_norm": 9.13182258605957, + "learning_rate": 2.6527372575939595e-05, + "loss": 1.1939, + "step": 1349 + }, + { + "epoch": 0.23, + "grad_norm": 9.174057960510254, + "learning_rate": 2.6524798352496998e-05, + "loss": 1.3718, + "step": 1350 + }, + { + "epoch": 0.23, + "grad_norm": 8.287139892578125, + "learning_rate": 2.6522224129054405e-05, + "loss": 1.4532, + "step": 1351 + }, + { + "epoch": 0.23, + "grad_norm": 8.905390739440918, + "learning_rate": 2.6519649905611808e-05, + "loss": 1.2902, + "step": 1352 + }, + { + "epoch": 0.23, + "grad_norm": 9.484794616699219, + "learning_rate": 2.651707568216921e-05, + "loss": 1.2279, + "step": 1353 + }, + { + "epoch": 0.23, + "grad_norm": 8.344484329223633, + "learning_rate": 2.6514501458726618e-05, + "loss": 1.0599, + "step": 1354 + }, + { + "epoch": 0.23, + "grad_norm": 8.926910400390625, + "learning_rate": 2.651192723528402e-05, + "loss": 1.246, + "step": 1355 + }, + { + "epoch": 0.23, + "grad_norm": 8.935715675354004, + "learning_rate": 2.6509353011841428e-05, + "loss": 1.2552, + "step": 1356 + }, + { + "epoch": 0.23, + "grad_norm": 8.152778625488281, + "learning_rate": 2.650677878839883e-05, + "loss": 1.1893, + "step": 1357 + }, + { + "epoch": 0.23, + "grad_norm": 8.60451602935791, + "learning_rate": 2.650420456495624e-05, + "loss": 1.1447, + "step": 1358 + }, + { + "epoch": 0.23, + "grad_norm": 9.021504402160645, + "learning_rate": 2.6501630341513645e-05, + "loss": 1.0769, + "step": 1359 + }, + { + "epoch": 0.23, + "grad_norm": 8.891911506652832, + "learning_rate": 2.649905611807105e-05, + "loss": 0.9558, + "step": 1360 + }, + { + "epoch": 0.23, + "grad_norm": 7.601040363311768, + "learning_rate": 2.6496481894628454e-05, + "loss": 1.0276, + "step": 1361 + }, + { + "epoch": 0.23, + "grad_norm": 8.832829475402832, + "learning_rate": 2.649390767118586e-05, + "loss": 1.287, + "step": 1362 + }, + { + "epoch": 0.23, + "grad_norm": 9.218730926513672, + "learning_rate": 2.6491333447743264e-05, + "loss": 1.0422, + "step": 1363 + }, + { + "epoch": 0.23, + "grad_norm": 9.534462928771973, + "learning_rate": 2.6488759224300668e-05, + "loss": 1.4117, + "step": 1364 + }, + { + "epoch": 0.23, + "grad_norm": 8.883268356323242, + "learning_rate": 2.6486185000858074e-05, + "loss": 1.2663, + "step": 1365 + }, + { + "epoch": 0.23, + "grad_norm": 10.62038803100586, + "learning_rate": 2.6483610777415478e-05, + "loss": 1.1101, + "step": 1366 + }, + { + "epoch": 0.23, + "grad_norm": 10.019786834716797, + "learning_rate": 2.6481036553972888e-05, + "loss": 1.3157, + "step": 1367 + }, + { + "epoch": 0.23, + "grad_norm": 8.632616996765137, + "learning_rate": 2.647846233053029e-05, + "loss": 1.1233, + "step": 1368 + }, + { + "epoch": 0.23, + "grad_norm": 10.388335227966309, + "learning_rate": 2.6475888107087698e-05, + "loss": 1.3165, + "step": 1369 + }, + { + "epoch": 0.24, + "grad_norm": 7.935334205627441, + "learning_rate": 2.64733138836451e-05, + "loss": 1.1324, + "step": 1370 + }, + { + "epoch": 0.24, + "grad_norm": 8.972235679626465, + "learning_rate": 2.6470739660202508e-05, + "loss": 1.2516, + "step": 1371 + }, + { + "epoch": 0.24, + "grad_norm": 8.370040893554688, + "learning_rate": 2.646816543675991e-05, + "loss": 1.1454, + "step": 1372 + }, + { + "epoch": 0.24, + "grad_norm": 7.459895610809326, + "learning_rate": 2.6465591213317318e-05, + "loss": 1.247, + "step": 1373 + }, + { + "epoch": 0.24, + "grad_norm": 9.179786682128906, + "learning_rate": 2.646301698987472e-05, + "loss": 1.3015, + "step": 1374 + }, + { + "epoch": 0.24, + "grad_norm": 9.019957542419434, + "learning_rate": 2.6460442766432124e-05, + "loss": 1.12, + "step": 1375 + }, + { + "epoch": 0.24, + "grad_norm": 8.15821647644043, + "learning_rate": 2.645786854298953e-05, + "loss": 1.3423, + "step": 1376 + }, + { + "epoch": 0.24, + "grad_norm": 9.62690258026123, + "learning_rate": 2.6455294319546938e-05, + "loss": 1.3008, + "step": 1377 + }, + { + "epoch": 0.24, + "grad_norm": 10.900343894958496, + "learning_rate": 2.6452720096104344e-05, + "loss": 1.6182, + "step": 1378 + }, + { + "epoch": 0.24, + "grad_norm": 9.850994110107422, + "learning_rate": 2.6450145872661748e-05, + "loss": 1.2545, + "step": 1379 + }, + { + "epoch": 0.24, + "grad_norm": 7.256598472595215, + "learning_rate": 2.6447571649219154e-05, + "loss": 0.8649, + "step": 1380 + }, + { + "epoch": 0.24, + "grad_norm": 8.804770469665527, + "learning_rate": 2.6444997425776558e-05, + "loss": 1.3885, + "step": 1381 + }, + { + "epoch": 0.24, + "grad_norm": 9.699951171875, + "learning_rate": 2.6442423202333964e-05, + "loss": 1.0952, + "step": 1382 + }, + { + "epoch": 0.24, + "grad_norm": 12.45407485961914, + "learning_rate": 2.6439848978891368e-05, + "loss": 1.3435, + "step": 1383 + }, + { + "epoch": 0.24, + "grad_norm": 10.335962295532227, + "learning_rate": 2.6437274755448774e-05, + "loss": 1.4324, + "step": 1384 + }, + { + "epoch": 0.24, + "grad_norm": 10.221283912658691, + "learning_rate": 2.6434700532006178e-05, + "loss": 1.1905, + "step": 1385 + }, + { + "epoch": 0.24, + "grad_norm": 9.06550407409668, + "learning_rate": 2.6432126308563584e-05, + "loss": 1.444, + "step": 1386 + }, + { + "epoch": 0.24, + "grad_norm": 10.470441818237305, + "learning_rate": 2.642955208512099e-05, + "loss": 1.2508, + "step": 1387 + }, + { + "epoch": 0.24, + "grad_norm": 9.260307312011719, + "learning_rate": 2.6426977861678394e-05, + "loss": 0.9873, + "step": 1388 + }, + { + "epoch": 0.24, + "grad_norm": 8.299884796142578, + "learning_rate": 2.64244036382358e-05, + "loss": 1.4163, + "step": 1389 + }, + { + "epoch": 0.24, + "grad_norm": 8.187042236328125, + "learning_rate": 2.6421829414793204e-05, + "loss": 1.1896, + "step": 1390 + }, + { + "epoch": 0.24, + "grad_norm": 9.056967735290527, + "learning_rate": 2.641925519135061e-05, + "loss": 0.9773, + "step": 1391 + }, + { + "epoch": 0.24, + "grad_norm": 10.281644821166992, + "learning_rate": 2.6416680967908014e-05, + "loss": 1.4031, + "step": 1392 + }, + { + "epoch": 0.24, + "grad_norm": 9.758089065551758, + "learning_rate": 2.641410674446542e-05, + "loss": 1.3926, + "step": 1393 + }, + { + "epoch": 0.24, + "grad_norm": 9.50987720489502, + "learning_rate": 2.6411532521022824e-05, + "loss": 1.2747, + "step": 1394 + }, + { + "epoch": 0.24, + "grad_norm": 9.121760368347168, + "learning_rate": 2.6408958297580227e-05, + "loss": 1.3597, + "step": 1395 + }, + { + "epoch": 0.24, + "grad_norm": 9.474566459655762, + "learning_rate": 2.6406384074137638e-05, + "loss": 1.4541, + "step": 1396 + }, + { + "epoch": 0.24, + "grad_norm": 8.610142707824707, + "learning_rate": 2.640380985069504e-05, + "loss": 1.3285, + "step": 1397 + }, + { + "epoch": 0.24, + "grad_norm": 10.047139167785645, + "learning_rate": 2.6401235627252447e-05, + "loss": 1.5605, + "step": 1398 + }, + { + "epoch": 0.24, + "grad_norm": 9.22934341430664, + "learning_rate": 2.639866140380985e-05, + "loss": 1.3207, + "step": 1399 + }, + { + "epoch": 0.24, + "grad_norm": 9.720802307128906, + "learning_rate": 2.6396087180367257e-05, + "loss": 1.3976, + "step": 1400 + }, + { + "epoch": 0.24, + "grad_norm": 9.501425743103027, + "learning_rate": 2.639351295692466e-05, + "loss": 1.3132, + "step": 1401 + }, + { + "epoch": 0.24, + "grad_norm": 11.377949714660645, + "learning_rate": 2.6390938733482067e-05, + "loss": 1.3306, + "step": 1402 + }, + { + "epoch": 0.24, + "grad_norm": 9.295886039733887, + "learning_rate": 2.638836451003947e-05, + "loss": 1.4345, + "step": 1403 + }, + { + "epoch": 0.24, + "grad_norm": 7.783322811126709, + "learning_rate": 2.6385790286596877e-05, + "loss": 1.2414, + "step": 1404 + }, + { + "epoch": 0.24, + "grad_norm": 9.674589157104492, + "learning_rate": 2.6383216063154284e-05, + "loss": 1.4647, + "step": 1405 + }, + { + "epoch": 0.24, + "grad_norm": 8.309123039245605, + "learning_rate": 2.6380641839711687e-05, + "loss": 1.1609, + "step": 1406 + }, + { + "epoch": 0.24, + "grad_norm": 9.010190963745117, + "learning_rate": 2.6378067616269094e-05, + "loss": 1.2418, + "step": 1407 + }, + { + "epoch": 0.24, + "grad_norm": 10.536715507507324, + "learning_rate": 2.6375493392826497e-05, + "loss": 1.3751, + "step": 1408 + }, + { + "epoch": 0.24, + "grad_norm": 8.624482154846191, + "learning_rate": 2.6372919169383904e-05, + "loss": 1.3539, + "step": 1409 + }, + { + "epoch": 0.24, + "grad_norm": 8.700296401977539, + "learning_rate": 2.6370344945941307e-05, + "loss": 1.249, + "step": 1410 + }, + { + "epoch": 0.24, + "grad_norm": 8.026456832885742, + "learning_rate": 2.6367770722498714e-05, + "loss": 0.8828, + "step": 1411 + }, + { + "epoch": 0.24, + "grad_norm": 9.169084548950195, + "learning_rate": 2.6365196499056117e-05, + "loss": 1.102, + "step": 1412 + }, + { + "epoch": 0.24, + "grad_norm": 9.534589767456055, + "learning_rate": 2.6362622275613524e-05, + "loss": 1.3815, + "step": 1413 + }, + { + "epoch": 0.24, + "grad_norm": 10.347875595092773, + "learning_rate": 2.6360048052170927e-05, + "loss": 1.5063, + "step": 1414 + }, + { + "epoch": 0.24, + "grad_norm": 8.11861515045166, + "learning_rate": 2.6357473828728334e-05, + "loss": 1.1653, + "step": 1415 + }, + { + "epoch": 0.24, + "grad_norm": 10.822431564331055, + "learning_rate": 2.635489960528574e-05, + "loss": 1.5637, + "step": 1416 + }, + { + "epoch": 0.24, + "grad_norm": 9.840667724609375, + "learning_rate": 2.6352325381843144e-05, + "loss": 1.0735, + "step": 1417 + }, + { + "epoch": 0.24, + "grad_norm": 11.409327507019043, + "learning_rate": 2.634975115840055e-05, + "loss": 1.3696, + "step": 1418 + }, + { + "epoch": 0.24, + "grad_norm": 9.566048622131348, + "learning_rate": 2.6347176934957954e-05, + "loss": 1.4983, + "step": 1419 + }, + { + "epoch": 0.24, + "grad_norm": 10.011872291564941, + "learning_rate": 2.634460271151536e-05, + "loss": 1.4012, + "step": 1420 + }, + { + "epoch": 0.24, + "grad_norm": 7.427055358886719, + "learning_rate": 2.6342028488072764e-05, + "loss": 0.9848, + "step": 1421 + }, + { + "epoch": 0.24, + "grad_norm": 8.475969314575195, + "learning_rate": 2.633945426463017e-05, + "loss": 1.1438, + "step": 1422 + }, + { + "epoch": 0.24, + "grad_norm": 7.266942501068115, + "learning_rate": 2.6336880041187574e-05, + "loss": 1.326, + "step": 1423 + }, + { + "epoch": 0.24, + "grad_norm": 7.881961822509766, + "learning_rate": 2.6334305817744984e-05, + "loss": 1.2736, + "step": 1424 + }, + { + "epoch": 0.24, + "grad_norm": 7.330427646636963, + "learning_rate": 2.6331731594302387e-05, + "loss": 1.2111, + "step": 1425 + }, + { + "epoch": 0.24, + "grad_norm": 9.54010009765625, + "learning_rate": 2.632915737085979e-05, + "loss": 1.7535, + "step": 1426 + }, + { + "epoch": 0.24, + "grad_norm": 8.168648719787598, + "learning_rate": 2.6326583147417197e-05, + "loss": 1.1077, + "step": 1427 + }, + { + "epoch": 0.25, + "grad_norm": 9.138731956481934, + "learning_rate": 2.63240089239746e-05, + "loss": 1.2385, + "step": 1428 + }, + { + "epoch": 0.25, + "grad_norm": 8.944056510925293, + "learning_rate": 2.6321434700532007e-05, + "loss": 1.1677, + "step": 1429 + }, + { + "epoch": 0.25, + "grad_norm": 8.671627044677734, + "learning_rate": 2.631886047708941e-05, + "loss": 1.2365, + "step": 1430 + }, + { + "epoch": 0.25, + "grad_norm": 9.128754615783691, + "learning_rate": 2.6316286253646817e-05, + "loss": 1.1243, + "step": 1431 + }, + { + "epoch": 0.25, + "grad_norm": 9.254434585571289, + "learning_rate": 2.631371203020422e-05, + "loss": 1.4588, + "step": 1432 + }, + { + "epoch": 0.25, + "grad_norm": 9.757545471191406, + "learning_rate": 2.6311137806761627e-05, + "loss": 1.2052, + "step": 1433 + }, + { + "epoch": 0.25, + "grad_norm": 9.631169319152832, + "learning_rate": 2.6308563583319034e-05, + "loss": 1.4379, + "step": 1434 + }, + { + "epoch": 0.25, + "grad_norm": 9.045636177062988, + "learning_rate": 2.630598935987644e-05, + "loss": 1.083, + "step": 1435 + }, + { + "epoch": 0.25, + "grad_norm": 11.049459457397461, + "learning_rate": 2.6303415136433844e-05, + "loss": 1.5538, + "step": 1436 + }, + { + "epoch": 0.25, + "grad_norm": 10.972939491271973, + "learning_rate": 2.6300840912991247e-05, + "loss": 1.223, + "step": 1437 + }, + { + "epoch": 0.25, + "grad_norm": 9.741581916809082, + "learning_rate": 2.6298266689548654e-05, + "loss": 1.4703, + "step": 1438 + }, + { + "epoch": 0.25, + "grad_norm": 8.267253875732422, + "learning_rate": 2.6295692466106057e-05, + "loss": 1.1549, + "step": 1439 + }, + { + "epoch": 0.25, + "grad_norm": 11.986956596374512, + "learning_rate": 2.6293118242663464e-05, + "loss": 1.5899, + "step": 1440 + }, + { + "epoch": 0.25, + "grad_norm": 8.246567726135254, + "learning_rate": 2.6290544019220867e-05, + "loss": 1.2799, + "step": 1441 + }, + { + "epoch": 0.25, + "grad_norm": 9.742122650146484, + "learning_rate": 2.6287969795778274e-05, + "loss": 1.2074, + "step": 1442 + }, + { + "epoch": 0.25, + "grad_norm": 8.831069946289062, + "learning_rate": 2.628539557233568e-05, + "loss": 1.1914, + "step": 1443 + }, + { + "epoch": 0.25, + "grad_norm": 8.123552322387695, + "learning_rate": 2.6282821348893087e-05, + "loss": 1.1744, + "step": 1444 + }, + { + "epoch": 0.25, + "grad_norm": 8.420841217041016, + "learning_rate": 2.628024712545049e-05, + "loss": 1.1882, + "step": 1445 + }, + { + "epoch": 0.25, + "grad_norm": 9.566805839538574, + "learning_rate": 2.6277672902007897e-05, + "loss": 1.1515, + "step": 1446 + }, + { + "epoch": 0.25, + "grad_norm": 8.217146873474121, + "learning_rate": 2.62750986785653e-05, + "loss": 1.154, + "step": 1447 + }, + { + "epoch": 0.25, + "grad_norm": 7.855069637298584, + "learning_rate": 2.6272524455122704e-05, + "loss": 0.9249, + "step": 1448 + }, + { + "epoch": 0.25, + "grad_norm": 8.529651641845703, + "learning_rate": 2.626995023168011e-05, + "loss": 0.9164, + "step": 1449 + }, + { + "epoch": 0.25, + "grad_norm": 10.14615249633789, + "learning_rate": 2.6267376008237514e-05, + "loss": 1.1964, + "step": 1450 + }, + { + "epoch": 0.25, + "grad_norm": 8.80463981628418, + "learning_rate": 2.626480178479492e-05, + "loss": 1.1795, + "step": 1451 + }, + { + "epoch": 0.25, + "grad_norm": 9.780580520629883, + "learning_rate": 2.6262227561352324e-05, + "loss": 1.3118, + "step": 1452 + }, + { + "epoch": 0.25, + "grad_norm": 8.311175346374512, + "learning_rate": 2.6259653337909734e-05, + "loss": 1.2455, + "step": 1453 + }, + { + "epoch": 0.25, + "grad_norm": 10.386007308959961, + "learning_rate": 2.6257079114467137e-05, + "loss": 1.4528, + "step": 1454 + }, + { + "epoch": 0.25, + "grad_norm": 9.25583553314209, + "learning_rate": 2.6254504891024544e-05, + "loss": 1.3275, + "step": 1455 + }, + { + "epoch": 0.25, + "grad_norm": 9.80235767364502, + "learning_rate": 2.6251930667581947e-05, + "loss": 1.5025, + "step": 1456 + }, + { + "epoch": 0.25, + "grad_norm": 8.165590286254883, + "learning_rate": 2.624935644413935e-05, + "loss": 1.1918, + "step": 1457 + }, + { + "epoch": 0.25, + "grad_norm": 10.783446311950684, + "learning_rate": 2.6246782220696757e-05, + "loss": 1.5329, + "step": 1458 + }, + { + "epoch": 0.25, + "grad_norm": 7.697442054748535, + "learning_rate": 2.624420799725416e-05, + "loss": 1.1324, + "step": 1459 + }, + { + "epoch": 0.25, + "grad_norm": 8.200699806213379, + "learning_rate": 2.6241633773811567e-05, + "loss": 1.028, + "step": 1460 + }, + { + "epoch": 0.25, + "grad_norm": 8.565408706665039, + "learning_rate": 2.623905955036897e-05, + "loss": 0.9422, + "step": 1461 + }, + { + "epoch": 0.25, + "grad_norm": 7.901795864105225, + "learning_rate": 2.623648532692638e-05, + "loss": 1.1371, + "step": 1462 + }, + { + "epoch": 0.25, + "grad_norm": 8.764126777648926, + "learning_rate": 2.6233911103483784e-05, + "loss": 1.1865, + "step": 1463 + }, + { + "epoch": 0.25, + "grad_norm": 8.938124656677246, + "learning_rate": 2.623133688004119e-05, + "loss": 1.2255, + "step": 1464 + }, + { + "epoch": 0.25, + "grad_norm": 8.876240730285645, + "learning_rate": 2.6228762656598594e-05, + "loss": 1.1229, + "step": 1465 + }, + { + "epoch": 0.25, + "grad_norm": 9.972127914428711, + "learning_rate": 2.6226188433156e-05, + "loss": 1.4471, + "step": 1466 + }, + { + "epoch": 0.25, + "grad_norm": 8.517899513244629, + "learning_rate": 2.6223614209713403e-05, + "loss": 1.3616, + "step": 1467 + }, + { + "epoch": 0.25, + "grad_norm": 8.41218376159668, + "learning_rate": 2.6221039986270807e-05, + "loss": 1.1269, + "step": 1468 + }, + { + "epoch": 0.25, + "grad_norm": 8.990455627441406, + "learning_rate": 2.6218465762828213e-05, + "loss": 1.179, + "step": 1469 + }, + { + "epoch": 0.25, + "grad_norm": 8.675251960754395, + "learning_rate": 2.6215891539385617e-05, + "loss": 1.1038, + "step": 1470 + }, + { + "epoch": 0.25, + "grad_norm": 9.660655975341797, + "learning_rate": 2.6213317315943027e-05, + "loss": 0.9937, + "step": 1471 + }, + { + "epoch": 0.25, + "grad_norm": 8.12274169921875, + "learning_rate": 2.621074309250043e-05, + "loss": 1.1449, + "step": 1472 + }, + { + "epoch": 0.25, + "grad_norm": 7.534292697906494, + "learning_rate": 2.6208168869057837e-05, + "loss": 0.9945, + "step": 1473 + }, + { + "epoch": 0.25, + "grad_norm": 8.044416427612305, + "learning_rate": 2.620559464561524e-05, + "loss": 1.1414, + "step": 1474 + }, + { + "epoch": 0.25, + "grad_norm": 9.75588321685791, + "learning_rate": 2.6203020422172647e-05, + "loss": 1.5593, + "step": 1475 + }, + { + "epoch": 0.25, + "grad_norm": 9.109524726867676, + "learning_rate": 2.620044619873005e-05, + "loss": 1.3625, + "step": 1476 + }, + { + "epoch": 0.25, + "grad_norm": 9.501923561096191, + "learning_rate": 2.6197871975287457e-05, + "loss": 1.2569, + "step": 1477 + }, + { + "epoch": 0.25, + "grad_norm": 8.203754425048828, + "learning_rate": 2.619529775184486e-05, + "loss": 1.412, + "step": 1478 + }, + { + "epoch": 0.25, + "grad_norm": 8.245170593261719, + "learning_rate": 2.6192723528402263e-05, + "loss": 1.1217, + "step": 1479 + }, + { + "epoch": 0.25, + "grad_norm": 8.047924995422363, + "learning_rate": 2.619014930495967e-05, + "loss": 0.9712, + "step": 1480 + }, + { + "epoch": 0.25, + "grad_norm": 8.3880033493042, + "learning_rate": 2.6187575081517077e-05, + "loss": 1.1291, + "step": 1481 + }, + { + "epoch": 0.25, + "grad_norm": 9.66402530670166, + "learning_rate": 2.6185000858074483e-05, + "loss": 1.4886, + "step": 1482 + }, + { + "epoch": 0.25, + "grad_norm": 8.70718002319336, + "learning_rate": 2.6182426634631887e-05, + "loss": 1.0842, + "step": 1483 + }, + { + "epoch": 0.25, + "grad_norm": 8.335410118103027, + "learning_rate": 2.6179852411189293e-05, + "loss": 1.4253, + "step": 1484 + }, + { + "epoch": 0.25, + "grad_norm": 8.54088020324707, + "learning_rate": 2.6177278187746697e-05, + "loss": 1.2456, + "step": 1485 + }, + { + "epoch": 0.26, + "grad_norm": 9.075895309448242, + "learning_rate": 2.6174703964304103e-05, + "loss": 1.403, + "step": 1486 + }, + { + "epoch": 0.26, + "grad_norm": 8.078447341918945, + "learning_rate": 2.6172129740861507e-05, + "loss": 1.0031, + "step": 1487 + }, + { + "epoch": 0.26, + "grad_norm": 9.30174446105957, + "learning_rate": 2.6169555517418913e-05, + "loss": 1.1542, + "step": 1488 + }, + { + "epoch": 0.26, + "grad_norm": 9.88626480102539, + "learning_rate": 2.6166981293976317e-05, + "loss": 1.6623, + "step": 1489 + }, + { + "epoch": 0.26, + "grad_norm": 8.433023452758789, + "learning_rate": 2.6164407070533723e-05, + "loss": 0.9823, + "step": 1490 + }, + { + "epoch": 0.26, + "grad_norm": 9.406853675842285, + "learning_rate": 2.616183284709113e-05, + "loss": 1.1836, + "step": 1491 + }, + { + "epoch": 0.26, + "grad_norm": 10.292494773864746, + "learning_rate": 2.6159258623648533e-05, + "loss": 1.2547, + "step": 1492 + }, + { + "epoch": 0.26, + "grad_norm": 7.658616065979004, + "learning_rate": 2.615668440020594e-05, + "loss": 1.0207, + "step": 1493 + }, + { + "epoch": 0.26, + "grad_norm": 7.94169807434082, + "learning_rate": 2.6154110176763343e-05, + "loss": 1.0582, + "step": 1494 + }, + { + "epoch": 0.26, + "grad_norm": 10.028471946716309, + "learning_rate": 2.615153595332075e-05, + "loss": 1.3083, + "step": 1495 + }, + { + "epoch": 0.26, + "grad_norm": 9.022738456726074, + "learning_rate": 2.6148961729878153e-05, + "loss": 1.2612, + "step": 1496 + }, + { + "epoch": 0.26, + "grad_norm": 9.101866722106934, + "learning_rate": 2.614638750643556e-05, + "loss": 1.2579, + "step": 1497 + }, + { + "epoch": 0.26, + "grad_norm": 9.523536682128906, + "learning_rate": 2.6143813282992963e-05, + "loss": 1.1771, + "step": 1498 + }, + { + "epoch": 0.26, + "grad_norm": 9.217952728271484, + "learning_rate": 2.6141239059550366e-05, + "loss": 1.1969, + "step": 1499 + }, + { + "epoch": 0.26, + "grad_norm": 10.444476127624512, + "learning_rate": 2.6138664836107777e-05, + "loss": 1.7069, + "step": 1500 + }, + { + "epoch": 0.26, + "grad_norm": 9.153836250305176, + "learning_rate": 2.613609061266518e-05, + "loss": 1.3156, + "step": 1501 + }, + { + "epoch": 0.26, + "grad_norm": 8.480145454406738, + "learning_rate": 2.6133516389222587e-05, + "loss": 1.0281, + "step": 1502 + }, + { + "epoch": 0.26, + "grad_norm": 9.882759094238281, + "learning_rate": 2.613094216577999e-05, + "loss": 1.6403, + "step": 1503 + }, + { + "epoch": 0.26, + "grad_norm": 9.847168922424316, + "learning_rate": 2.6128367942337396e-05, + "loss": 1.1885, + "step": 1504 + }, + { + "epoch": 0.26, + "grad_norm": 8.725817680358887, + "learning_rate": 2.61257937188948e-05, + "loss": 1.132, + "step": 1505 + }, + { + "epoch": 0.26, + "grad_norm": 10.618316650390625, + "learning_rate": 2.6123219495452206e-05, + "loss": 1.1898, + "step": 1506 + }, + { + "epoch": 0.26, + "grad_norm": 8.863554000854492, + "learning_rate": 2.612064527200961e-05, + "loss": 1.3372, + "step": 1507 + }, + { + "epoch": 0.26, + "grad_norm": 8.243571281433105, + "learning_rate": 2.6118071048567016e-05, + "loss": 1.1867, + "step": 1508 + }, + { + "epoch": 0.26, + "grad_norm": 10.022723197937012, + "learning_rate": 2.6115496825124423e-05, + "loss": 1.3467, + "step": 1509 + }, + { + "epoch": 0.26, + "grad_norm": 9.887679100036621, + "learning_rate": 2.6112922601681826e-05, + "loss": 1.2277, + "step": 1510 + }, + { + "epoch": 0.26, + "grad_norm": 7.539658069610596, + "learning_rate": 2.6110348378239233e-05, + "loss": 1.161, + "step": 1511 + }, + { + "epoch": 0.26, + "grad_norm": 11.810113906860352, + "learning_rate": 2.6107774154796636e-05, + "loss": 1.3282, + "step": 1512 + }, + { + "epoch": 0.26, + "grad_norm": 7.964231967926025, + "learning_rate": 2.6105199931354043e-05, + "loss": 1.3472, + "step": 1513 + }, + { + "epoch": 0.26, + "grad_norm": 6.931862831115723, + "learning_rate": 2.6102625707911446e-05, + "loss": 1.1696, + "step": 1514 + }, + { + "epoch": 0.26, + "grad_norm": 8.483572959899902, + "learning_rate": 2.6100051484468853e-05, + "loss": 1.3194, + "step": 1515 + }, + { + "epoch": 0.26, + "grad_norm": 8.26704216003418, + "learning_rate": 2.6097477261026256e-05, + "loss": 1.0045, + "step": 1516 + }, + { + "epoch": 0.26, + "grad_norm": 11.326842308044434, + "learning_rate": 2.6094903037583663e-05, + "loss": 1.4534, + "step": 1517 + }, + { + "epoch": 0.26, + "grad_norm": 11.104940414428711, + "learning_rate": 2.6092328814141066e-05, + "loss": 1.2621, + "step": 1518 + }, + { + "epoch": 0.26, + "grad_norm": 9.792030334472656, + "learning_rate": 2.6089754590698476e-05, + "loss": 1.1254, + "step": 1519 + }, + { + "epoch": 0.26, + "grad_norm": 10.367633819580078, + "learning_rate": 2.608718036725588e-05, + "loss": 1.1915, + "step": 1520 + }, + { + "epoch": 0.26, + "grad_norm": 11.074049949645996, + "learning_rate": 2.6084606143813283e-05, + "loss": 1.2102, + "step": 1521 + }, + { + "epoch": 0.26, + "grad_norm": 9.285543441772461, + "learning_rate": 2.608203192037069e-05, + "loss": 1.3275, + "step": 1522 + }, + { + "epoch": 0.26, + "grad_norm": 9.799038887023926, + "learning_rate": 2.6079457696928093e-05, + "loss": 1.3434, + "step": 1523 + }, + { + "epoch": 0.26, + "grad_norm": 10.813976287841797, + "learning_rate": 2.60768834734855e-05, + "loss": 1.3586, + "step": 1524 + }, + { + "epoch": 0.26, + "grad_norm": 8.80359172821045, + "learning_rate": 2.6074309250042903e-05, + "loss": 1.1103, + "step": 1525 + }, + { + "epoch": 0.26, + "grad_norm": 8.167686462402344, + "learning_rate": 2.607173502660031e-05, + "loss": 1.1062, + "step": 1526 + }, + { + "epoch": 0.26, + "grad_norm": 8.026318550109863, + "learning_rate": 2.6069160803157713e-05, + "loss": 0.9971, + "step": 1527 + }, + { + "epoch": 0.26, + "grad_norm": 8.773980140686035, + "learning_rate": 2.6066586579715123e-05, + "loss": 1.4267, + "step": 1528 + }, + { + "epoch": 0.26, + "grad_norm": 9.633464813232422, + "learning_rate": 2.6064012356272526e-05, + "loss": 1.5509, + "step": 1529 + }, + { + "epoch": 0.26, + "grad_norm": 8.131473541259766, + "learning_rate": 2.606143813282993e-05, + "loss": 1.0474, + "step": 1530 + }, + { + "epoch": 0.26, + "grad_norm": 9.265179634094238, + "learning_rate": 2.6058863909387336e-05, + "loss": 1.294, + "step": 1531 + }, + { + "epoch": 0.26, + "grad_norm": 9.328397750854492, + "learning_rate": 2.605628968594474e-05, + "loss": 1.2807, + "step": 1532 + }, + { + "epoch": 0.26, + "grad_norm": 8.06914234161377, + "learning_rate": 2.6053715462502146e-05, + "loss": 1.2215, + "step": 1533 + }, + { + "epoch": 0.26, + "grad_norm": 9.479385375976562, + "learning_rate": 2.605114123905955e-05, + "loss": 1.2997, + "step": 1534 + }, + { + "epoch": 0.26, + "grad_norm": 9.318657875061035, + "learning_rate": 2.6048567015616956e-05, + "loss": 1.342, + "step": 1535 + }, + { + "epoch": 0.26, + "grad_norm": 8.627128601074219, + "learning_rate": 2.604599279217436e-05, + "loss": 1.2197, + "step": 1536 + }, + { + "epoch": 0.26, + "grad_norm": 9.470630645751953, + "learning_rate": 2.6043418568731766e-05, + "loss": 1.3896, + "step": 1537 + }, + { + "epoch": 0.26, + "grad_norm": 8.276989936828613, + "learning_rate": 2.6040844345289173e-05, + "loss": 1.3584, + "step": 1538 + }, + { + "epoch": 0.26, + "grad_norm": 7.971053600311279, + "learning_rate": 2.603827012184658e-05, + "loss": 1.0443, + "step": 1539 + }, + { + "epoch": 0.26, + "grad_norm": 8.825150489807129, + "learning_rate": 2.6035695898403983e-05, + "loss": 1.3234, + "step": 1540 + }, + { + "epoch": 0.26, + "grad_norm": 9.27294921875, + "learning_rate": 2.6033121674961386e-05, + "loss": 1.2559, + "step": 1541 + }, + { + "epoch": 0.26, + "grad_norm": 9.06420612335205, + "learning_rate": 2.6030547451518793e-05, + "loss": 1.1736, + "step": 1542 + }, + { + "epoch": 0.26, + "grad_norm": 9.466202735900879, + "learning_rate": 2.6027973228076196e-05, + "loss": 1.3382, + "step": 1543 + }, + { + "epoch": 0.26, + "grad_norm": 8.974349975585938, + "learning_rate": 2.6025399004633603e-05, + "loss": 1.451, + "step": 1544 + }, + { + "epoch": 0.27, + "grad_norm": 8.396931648254395, + "learning_rate": 2.6022824781191006e-05, + "loss": 1.2475, + "step": 1545 + }, + { + "epoch": 0.27, + "grad_norm": 8.682100296020508, + "learning_rate": 2.6020250557748413e-05, + "loss": 0.9821, + "step": 1546 + }, + { + "epoch": 0.27, + "grad_norm": 9.19713306427002, + "learning_rate": 2.601767633430582e-05, + "loss": 1.1506, + "step": 1547 + }, + { + "epoch": 0.27, + "grad_norm": 9.293886184692383, + "learning_rate": 2.6015102110863226e-05, + "loss": 1.0679, + "step": 1548 + }, + { + "epoch": 0.27, + "grad_norm": 10.456722259521484, + "learning_rate": 2.601252788742063e-05, + "loss": 1.0798, + "step": 1549 + }, + { + "epoch": 0.27, + "grad_norm": 10.706494331359863, + "learning_rate": 2.6009953663978036e-05, + "loss": 1.4401, + "step": 1550 + }, + { + "epoch": 0.27, + "grad_norm": 10.215545654296875, + "learning_rate": 2.600737944053544e-05, + "loss": 0.8901, + "step": 1551 + }, + { + "epoch": 0.27, + "grad_norm": 10.596031188964844, + "learning_rate": 2.6004805217092843e-05, + "loss": 1.2893, + "step": 1552 + }, + { + "epoch": 0.27, + "grad_norm": 9.22983455657959, + "learning_rate": 2.600223099365025e-05, + "loss": 1.275, + "step": 1553 + }, + { + "epoch": 0.27, + "grad_norm": 8.633913040161133, + "learning_rate": 2.5999656770207653e-05, + "loss": 0.9564, + "step": 1554 + }, + { + "epoch": 0.27, + "grad_norm": 9.765681266784668, + "learning_rate": 2.599708254676506e-05, + "loss": 1.1862, + "step": 1555 + }, + { + "epoch": 0.27, + "grad_norm": 8.760149955749512, + "learning_rate": 2.5994508323322463e-05, + "loss": 1.1313, + "step": 1556 + }, + { + "epoch": 0.27, + "grad_norm": 9.333000183105469, + "learning_rate": 2.5991934099879873e-05, + "loss": 1.1004, + "step": 1557 + }, + { + "epoch": 0.27, + "grad_norm": 9.956636428833008, + "learning_rate": 2.5989359876437276e-05, + "loss": 1.691, + "step": 1558 + }, + { + "epoch": 0.27, + "grad_norm": 9.976466178894043, + "learning_rate": 2.5986785652994683e-05, + "loss": 1.0736, + "step": 1559 + }, + { + "epoch": 0.27, + "grad_norm": 7.870991230010986, + "learning_rate": 2.5984211429552086e-05, + "loss": 0.9734, + "step": 1560 + }, + { + "epoch": 0.27, + "grad_norm": 9.663421630859375, + "learning_rate": 2.598163720610949e-05, + "loss": 1.0836, + "step": 1561 + }, + { + "epoch": 0.27, + "grad_norm": 10.073293685913086, + "learning_rate": 2.5979062982666896e-05, + "loss": 1.2648, + "step": 1562 + }, + { + "epoch": 0.27, + "grad_norm": 9.528361320495605, + "learning_rate": 2.59764887592243e-05, + "loss": 1.0944, + "step": 1563 + }, + { + "epoch": 0.27, + "grad_norm": 10.58749008178711, + "learning_rate": 2.5973914535781706e-05, + "loss": 1.2068, + "step": 1564 + }, + { + "epoch": 0.27, + "grad_norm": 8.19024658203125, + "learning_rate": 2.597134031233911e-05, + "loss": 1.0406, + "step": 1565 + }, + { + "epoch": 0.27, + "grad_norm": 10.314001083374023, + "learning_rate": 2.596876608889652e-05, + "loss": 1.1504, + "step": 1566 + }, + { + "epoch": 0.27, + "grad_norm": 8.663982391357422, + "learning_rate": 2.5966191865453923e-05, + "loss": 0.9644, + "step": 1567 + }, + { + "epoch": 0.27, + "grad_norm": 8.71829605102539, + "learning_rate": 2.596361764201133e-05, + "loss": 0.8445, + "step": 1568 + }, + { + "epoch": 0.27, + "grad_norm": 10.616984367370605, + "learning_rate": 2.5961043418568733e-05, + "loss": 1.2033, + "step": 1569 + }, + { + "epoch": 0.27, + "grad_norm": 9.283010482788086, + "learning_rate": 2.595846919512614e-05, + "loss": 1.1312, + "step": 1570 + }, + { + "epoch": 0.27, + "grad_norm": 9.120360374450684, + "learning_rate": 2.5955894971683542e-05, + "loss": 1.0336, + "step": 1571 + }, + { + "epoch": 0.27, + "grad_norm": 8.295063972473145, + "learning_rate": 2.5953320748240946e-05, + "loss": 1.0758, + "step": 1572 + }, + { + "epoch": 0.27, + "grad_norm": 9.565741539001465, + "learning_rate": 2.5950746524798352e-05, + "loss": 1.3195, + "step": 1573 + }, + { + "epoch": 0.27, + "grad_norm": 9.618343353271484, + "learning_rate": 2.5948172301355756e-05, + "loss": 1.3803, + "step": 1574 + }, + { + "epoch": 0.27, + "grad_norm": 9.500362396240234, + "learning_rate": 2.5945598077913166e-05, + "loss": 1.1424, + "step": 1575 + }, + { + "epoch": 0.27, + "grad_norm": 9.92062759399414, + "learning_rate": 2.594302385447057e-05, + "loss": 1.189, + "step": 1576 + }, + { + "epoch": 0.27, + "grad_norm": 9.251409530639648, + "learning_rate": 2.5940449631027976e-05, + "loss": 1.1997, + "step": 1577 + }, + { + "epoch": 0.27, + "grad_norm": 9.031352043151855, + "learning_rate": 2.593787540758538e-05, + "loss": 1.1386, + "step": 1578 + }, + { + "epoch": 0.27, + "grad_norm": 9.570923805236816, + "learning_rate": 2.5935301184142786e-05, + "loss": 1.2136, + "step": 1579 + }, + { + "epoch": 0.27, + "grad_norm": 8.8291654586792, + "learning_rate": 2.593272696070019e-05, + "loss": 1.0956, + "step": 1580 + }, + { + "epoch": 0.27, + "grad_norm": 10.069900512695312, + "learning_rate": 2.5930152737257596e-05, + "loss": 1.3491, + "step": 1581 + }, + { + "epoch": 0.27, + "grad_norm": 10.534960746765137, + "learning_rate": 2.5927578513815e-05, + "loss": 1.3644, + "step": 1582 + }, + { + "epoch": 0.27, + "grad_norm": 10.055986404418945, + "learning_rate": 2.5925004290372402e-05, + "loss": 1.1657, + "step": 1583 + }, + { + "epoch": 0.27, + "grad_norm": 9.447620391845703, + "learning_rate": 2.592243006692981e-05, + "loss": 1.1843, + "step": 1584 + }, + { + "epoch": 0.27, + "grad_norm": 8.631956100463867, + "learning_rate": 2.5919855843487216e-05, + "loss": 1.0351, + "step": 1585 + }, + { + "epoch": 0.27, + "grad_norm": 8.020038604736328, + "learning_rate": 2.5917281620044622e-05, + "loss": 0.9871, + "step": 1586 + }, + { + "epoch": 0.27, + "grad_norm": 9.377992630004883, + "learning_rate": 2.5914707396602026e-05, + "loss": 1.4819, + "step": 1587 + }, + { + "epoch": 0.27, + "grad_norm": 9.236485481262207, + "learning_rate": 2.5912133173159432e-05, + "loss": 1.2268, + "step": 1588 + }, + { + "epoch": 0.27, + "grad_norm": 8.214728355407715, + "learning_rate": 2.5909558949716836e-05, + "loss": 1.075, + "step": 1589 + }, + { + "epoch": 0.27, + "grad_norm": 8.975008964538574, + "learning_rate": 2.5906984726274242e-05, + "loss": 0.9712, + "step": 1590 + }, + { + "epoch": 0.27, + "grad_norm": 9.329899787902832, + "learning_rate": 2.5904410502831646e-05, + "loss": 1.2087, + "step": 1591 + }, + { + "epoch": 0.27, + "grad_norm": 9.313892364501953, + "learning_rate": 2.5901836279389052e-05, + "loss": 1.2299, + "step": 1592 + }, + { + "epoch": 0.27, + "grad_norm": 8.25578784942627, + "learning_rate": 2.5899262055946456e-05, + "loss": 1.1505, + "step": 1593 + }, + { + "epoch": 0.27, + "grad_norm": 8.274794578552246, + "learning_rate": 2.5896687832503862e-05, + "loss": 0.9779, + "step": 1594 + }, + { + "epoch": 0.27, + "grad_norm": 8.605783462524414, + "learning_rate": 2.589411360906127e-05, + "loss": 1.0434, + "step": 1595 + }, + { + "epoch": 0.27, + "grad_norm": 9.504106521606445, + "learning_rate": 2.5891539385618672e-05, + "loss": 1.0292, + "step": 1596 + }, + { + "epoch": 0.27, + "grad_norm": 9.182815551757812, + "learning_rate": 2.588896516217608e-05, + "loss": 1.0919, + "step": 1597 + }, + { + "epoch": 0.27, + "grad_norm": 10.316807746887207, + "learning_rate": 2.5886390938733482e-05, + "loss": 1.0491, + "step": 1598 + }, + { + "epoch": 0.27, + "grad_norm": 7.9618000984191895, + "learning_rate": 2.588381671529089e-05, + "loss": 1.0251, + "step": 1599 + }, + { + "epoch": 0.27, + "grad_norm": 9.987043380737305, + "learning_rate": 2.5881242491848292e-05, + "loss": 1.3196, + "step": 1600 + }, + { + "epoch": 0.27, + "grad_norm": 8.570785522460938, + "learning_rate": 2.58786682684057e-05, + "loss": 1.1278, + "step": 1601 + }, + { + "epoch": 0.27, + "grad_norm": 9.415283203125, + "learning_rate": 2.5876094044963102e-05, + "loss": 1.3667, + "step": 1602 + }, + { + "epoch": 0.28, + "grad_norm": 8.775483131408691, + "learning_rate": 2.5873519821520505e-05, + "loss": 1.266, + "step": 1603 + }, + { + "epoch": 0.28, + "grad_norm": 10.52068042755127, + "learning_rate": 2.5870945598077916e-05, + "loss": 1.5216, + "step": 1604 + }, + { + "epoch": 0.28, + "grad_norm": 8.699344635009766, + "learning_rate": 2.586837137463532e-05, + "loss": 1.1651, + "step": 1605 + }, + { + "epoch": 0.28, + "grad_norm": 8.356231689453125, + "learning_rate": 2.5865797151192726e-05, + "loss": 1.2763, + "step": 1606 + }, + { + "epoch": 0.28, + "grad_norm": 9.245226860046387, + "learning_rate": 2.586322292775013e-05, + "loss": 1.2878, + "step": 1607 + }, + { + "epoch": 0.28, + "grad_norm": 7.9049530029296875, + "learning_rate": 2.5860648704307535e-05, + "loss": 1.4374, + "step": 1608 + }, + { + "epoch": 0.28, + "grad_norm": 7.493533611297607, + "learning_rate": 2.585807448086494e-05, + "loss": 1.0598, + "step": 1609 + }, + { + "epoch": 0.28, + "grad_norm": 8.12313175201416, + "learning_rate": 2.5855500257422345e-05, + "loss": 1.3791, + "step": 1610 + }, + { + "epoch": 0.28, + "grad_norm": 6.579049587249756, + "learning_rate": 2.585292603397975e-05, + "loss": 1.0437, + "step": 1611 + }, + { + "epoch": 0.28, + "grad_norm": 9.188210487365723, + "learning_rate": 2.5850351810537155e-05, + "loss": 1.2966, + "step": 1612 + }, + { + "epoch": 0.28, + "grad_norm": 8.327075004577637, + "learning_rate": 2.5847777587094562e-05, + "loss": 1.2624, + "step": 1613 + }, + { + "epoch": 0.28, + "grad_norm": 8.03768539428711, + "learning_rate": 2.5845203363651965e-05, + "loss": 1.2136, + "step": 1614 + }, + { + "epoch": 0.28, + "grad_norm": 8.479057312011719, + "learning_rate": 2.5842629140209372e-05, + "loss": 1.2015, + "step": 1615 + }, + { + "epoch": 0.28, + "grad_norm": 8.179889678955078, + "learning_rate": 2.5840054916766775e-05, + "loss": 1.265, + "step": 1616 + }, + { + "epoch": 0.28, + "grad_norm": 8.410205841064453, + "learning_rate": 2.5837480693324182e-05, + "loss": 1.1442, + "step": 1617 + }, + { + "epoch": 0.28, + "grad_norm": 9.869892120361328, + "learning_rate": 2.5834906469881585e-05, + "loss": 1.3213, + "step": 1618 + }, + { + "epoch": 0.28, + "grad_norm": 8.214839935302734, + "learning_rate": 2.5832332246438992e-05, + "loss": 0.9812, + "step": 1619 + }, + { + "epoch": 0.28, + "grad_norm": 10.514745712280273, + "learning_rate": 2.5829758022996395e-05, + "loss": 1.3735, + "step": 1620 + }, + { + "epoch": 0.28, + "grad_norm": 7.812685489654541, + "learning_rate": 2.5827183799553802e-05, + "loss": 0.9847, + "step": 1621 + }, + { + "epoch": 0.28, + "grad_norm": 7.392012119293213, + "learning_rate": 2.5824609576111205e-05, + "loss": 1.0244, + "step": 1622 + }, + { + "epoch": 0.28, + "grad_norm": 9.04334545135498, + "learning_rate": 2.5822035352668615e-05, + "loss": 1.0548, + "step": 1623 + }, + { + "epoch": 0.28, + "grad_norm": 9.246118545532227, + "learning_rate": 2.581946112922602e-05, + "loss": 1.3623, + "step": 1624 + }, + { + "epoch": 0.28, + "grad_norm": 8.836997985839844, + "learning_rate": 2.5816886905783422e-05, + "loss": 1.1976, + "step": 1625 + }, + { + "epoch": 0.28, + "grad_norm": 11.115344047546387, + "learning_rate": 2.581431268234083e-05, + "loss": 1.3798, + "step": 1626 + }, + { + "epoch": 0.28, + "grad_norm": 8.662017822265625, + "learning_rate": 2.5811738458898232e-05, + "loss": 1.1289, + "step": 1627 + }, + { + "epoch": 0.28, + "grad_norm": 8.208451271057129, + "learning_rate": 2.580916423545564e-05, + "loss": 1.1622, + "step": 1628 + }, + { + "epoch": 0.28, + "grad_norm": 8.290624618530273, + "learning_rate": 2.5806590012013042e-05, + "loss": 1.1739, + "step": 1629 + }, + { + "epoch": 0.28, + "grad_norm": 8.374139785766602, + "learning_rate": 2.580401578857045e-05, + "loss": 1.1181, + "step": 1630 + }, + { + "epoch": 0.28, + "grad_norm": 8.209418296813965, + "learning_rate": 2.5801441565127852e-05, + "loss": 1.1703, + "step": 1631 + }, + { + "epoch": 0.28, + "grad_norm": 9.328374862670898, + "learning_rate": 2.5798867341685262e-05, + "loss": 1.2326, + "step": 1632 + }, + { + "epoch": 0.28, + "grad_norm": 8.27452278137207, + "learning_rate": 2.5796293118242665e-05, + "loss": 1.2253, + "step": 1633 + }, + { + "epoch": 0.28, + "grad_norm": 8.137526512145996, + "learning_rate": 2.579371889480007e-05, + "loss": 1.0306, + "step": 1634 + }, + { + "epoch": 0.28, + "grad_norm": 8.130615234375, + "learning_rate": 2.5791144671357475e-05, + "loss": 1.0341, + "step": 1635 + }, + { + "epoch": 0.28, + "grad_norm": 9.105877876281738, + "learning_rate": 2.578857044791488e-05, + "loss": 1.3181, + "step": 1636 + }, + { + "epoch": 0.28, + "grad_norm": 8.853985786437988, + "learning_rate": 2.5785996224472285e-05, + "loss": 1.3231, + "step": 1637 + }, + { + "epoch": 0.28, + "grad_norm": 10.044734954833984, + "learning_rate": 2.578342200102969e-05, + "loss": 1.1382, + "step": 1638 + }, + { + "epoch": 0.28, + "grad_norm": 7.615414142608643, + "learning_rate": 2.5780847777587095e-05, + "loss": 0.9365, + "step": 1639 + }, + { + "epoch": 0.28, + "grad_norm": 7.614706039428711, + "learning_rate": 2.57782735541445e-05, + "loss": 0.9821, + "step": 1640 + }, + { + "epoch": 0.28, + "grad_norm": 7.8402838706970215, + "learning_rate": 2.5775699330701905e-05, + "loss": 0.92, + "step": 1641 + }, + { + "epoch": 0.28, + "grad_norm": 8.08004093170166, + "learning_rate": 2.5773125107259312e-05, + "loss": 1.236, + "step": 1642 + }, + { + "epoch": 0.28, + "grad_norm": 9.8212308883667, + "learning_rate": 2.577055088381672e-05, + "loss": 1.1995, + "step": 1643 + }, + { + "epoch": 0.28, + "grad_norm": 8.835196495056152, + "learning_rate": 2.5767976660374122e-05, + "loss": 1.1684, + "step": 1644 + }, + { + "epoch": 0.28, + "grad_norm": 9.536199569702148, + "learning_rate": 2.5765402436931525e-05, + "loss": 1.1275, + "step": 1645 + }, + { + "epoch": 0.28, + "grad_norm": 9.259613990783691, + "learning_rate": 2.5762828213488932e-05, + "loss": 0.9788, + "step": 1646 + }, + { + "epoch": 0.28, + "grad_norm": 10.785974502563477, + "learning_rate": 2.5760253990046335e-05, + "loss": 1.0851, + "step": 1647 + }, + { + "epoch": 0.28, + "grad_norm": 9.26859188079834, + "learning_rate": 2.5757679766603742e-05, + "loss": 1.1746, + "step": 1648 + }, + { + "epoch": 0.28, + "grad_norm": 10.363548278808594, + "learning_rate": 2.5755105543161145e-05, + "loss": 1.0277, + "step": 1649 + }, + { + "epoch": 0.28, + "grad_norm": 8.938871383666992, + "learning_rate": 2.5752531319718552e-05, + "loss": 1.199, + "step": 1650 + }, + { + "epoch": 0.28, + "grad_norm": 7.887062072753906, + "learning_rate": 2.574995709627596e-05, + "loss": 0.7206, + "step": 1651 + }, + { + "epoch": 0.28, + "grad_norm": 9.923095703125, + "learning_rate": 2.5747382872833365e-05, + "loss": 1.2589, + "step": 1652 + }, + { + "epoch": 0.28, + "grad_norm": 10.889364242553711, + "learning_rate": 2.574480864939077e-05, + "loss": 1.1392, + "step": 1653 + }, + { + "epoch": 0.28, + "grad_norm": 10.161477088928223, + "learning_rate": 2.5742234425948175e-05, + "loss": 1.312, + "step": 1654 + }, + { + "epoch": 0.28, + "grad_norm": 10.382858276367188, + "learning_rate": 2.573966020250558e-05, + "loss": 1.2759, + "step": 1655 + }, + { + "epoch": 0.28, + "grad_norm": 8.420166015625, + "learning_rate": 2.573708597906298e-05, + "loss": 1.1455, + "step": 1656 + }, + { + "epoch": 0.28, + "grad_norm": 8.9227294921875, + "learning_rate": 2.573451175562039e-05, + "loss": 1.2758, + "step": 1657 + }, + { + "epoch": 0.28, + "grad_norm": 7.996256351470947, + "learning_rate": 2.573193753217779e-05, + "loss": 1.275, + "step": 1658 + }, + { + "epoch": 0.28, + "grad_norm": 7.737109184265137, + "learning_rate": 2.57293633087352e-05, + "loss": 1.4071, + "step": 1659 + }, + { + "epoch": 0.28, + "grad_norm": 7.454028129577637, + "learning_rate": 2.57267890852926e-05, + "loss": 0.8918, + "step": 1660 + }, + { + "epoch": 0.29, + "grad_norm": 8.383586883544922, + "learning_rate": 2.572421486185001e-05, + "loss": 1.1086, + "step": 1661 + }, + { + "epoch": 0.29, + "grad_norm": 9.172550201416016, + "learning_rate": 2.5721640638407415e-05, + "loss": 1.3437, + "step": 1662 + }, + { + "epoch": 0.29, + "grad_norm": 8.78988265991211, + "learning_rate": 2.571906641496482e-05, + "loss": 1.4695, + "step": 1663 + }, + { + "epoch": 0.29, + "grad_norm": 9.235658645629883, + "learning_rate": 2.5716492191522225e-05, + "loss": 1.36, + "step": 1664 + }, + { + "epoch": 0.29, + "grad_norm": 7.791393756866455, + "learning_rate": 2.5713917968079628e-05, + "loss": 1.2821, + "step": 1665 + }, + { + "epoch": 0.29, + "grad_norm": 8.439427375793457, + "learning_rate": 2.5711343744637035e-05, + "loss": 1.2101, + "step": 1666 + }, + { + "epoch": 0.29, + "grad_norm": 7.998271942138672, + "learning_rate": 2.5708769521194438e-05, + "loss": 1.2884, + "step": 1667 + }, + { + "epoch": 0.29, + "grad_norm": 8.0204439163208, + "learning_rate": 2.5706195297751845e-05, + "loss": 1.1096, + "step": 1668 + }, + { + "epoch": 0.29, + "grad_norm": 8.507468223571777, + "learning_rate": 2.5703621074309248e-05, + "loss": 1.2128, + "step": 1669 + }, + { + "epoch": 0.29, + "grad_norm": 8.155778884887695, + "learning_rate": 2.5701046850866658e-05, + "loss": 1.1895, + "step": 1670 + }, + { + "epoch": 0.29, + "grad_norm": 9.207258224487305, + "learning_rate": 2.569847262742406e-05, + "loss": 1.2311, + "step": 1671 + }, + { + "epoch": 0.29, + "grad_norm": 8.249384880065918, + "learning_rate": 2.5695898403981468e-05, + "loss": 1.2064, + "step": 1672 + }, + { + "epoch": 0.29, + "grad_norm": 7.2927937507629395, + "learning_rate": 2.569332418053887e-05, + "loss": 0.9132, + "step": 1673 + }, + { + "epoch": 0.29, + "grad_norm": 9.482983589172363, + "learning_rate": 2.5690749957096278e-05, + "loss": 1.0854, + "step": 1674 + }, + { + "epoch": 0.29, + "grad_norm": 10.173653602600098, + "learning_rate": 2.568817573365368e-05, + "loss": 1.2891, + "step": 1675 + }, + { + "epoch": 0.29, + "grad_norm": 7.37519645690918, + "learning_rate": 2.5685601510211085e-05, + "loss": 0.9443, + "step": 1676 + }, + { + "epoch": 0.29, + "grad_norm": 8.192808151245117, + "learning_rate": 2.568302728676849e-05, + "loss": 1.1677, + "step": 1677 + }, + { + "epoch": 0.29, + "grad_norm": 7.935910701751709, + "learning_rate": 2.5680453063325895e-05, + "loss": 1.0965, + "step": 1678 + }, + { + "epoch": 0.29, + "grad_norm": 8.470190048217773, + "learning_rate": 2.56778788398833e-05, + "loss": 1.2907, + "step": 1679 + }, + { + "epoch": 0.29, + "grad_norm": 9.00965690612793, + "learning_rate": 2.5675304616440708e-05, + "loss": 1.3334, + "step": 1680 + }, + { + "epoch": 0.29, + "grad_norm": 8.875039100646973, + "learning_rate": 2.5672730392998115e-05, + "loss": 1.1551, + "step": 1681 + }, + { + "epoch": 0.29, + "grad_norm": 7.871654033660889, + "learning_rate": 2.5670156169555518e-05, + "loss": 1.0475, + "step": 1682 + }, + { + "epoch": 0.29, + "grad_norm": 9.366538047790527, + "learning_rate": 2.5667581946112925e-05, + "loss": 1.1854, + "step": 1683 + }, + { + "epoch": 0.29, + "grad_norm": 7.781619548797607, + "learning_rate": 2.5665007722670328e-05, + "loss": 1.0515, + "step": 1684 + }, + { + "epoch": 0.29, + "grad_norm": 9.574727058410645, + "learning_rate": 2.5662433499227735e-05, + "loss": 1.1163, + "step": 1685 + }, + { + "epoch": 0.29, + "grad_norm": 9.076858520507812, + "learning_rate": 2.5659859275785138e-05, + "loss": 1.151, + "step": 1686 + }, + { + "epoch": 0.29, + "grad_norm": 8.47282600402832, + "learning_rate": 2.565728505234254e-05, + "loss": 1.091, + "step": 1687 + }, + { + "epoch": 0.29, + "grad_norm": 8.974745750427246, + "learning_rate": 2.5654710828899948e-05, + "loss": 1.1764, + "step": 1688 + }, + { + "epoch": 0.29, + "grad_norm": 10.930011749267578, + "learning_rate": 2.5652136605457355e-05, + "loss": 1.2829, + "step": 1689 + }, + { + "epoch": 0.29, + "grad_norm": 9.917031288146973, + "learning_rate": 2.564956238201476e-05, + "loss": 1.5465, + "step": 1690 + }, + { + "epoch": 0.29, + "grad_norm": 9.354314804077148, + "learning_rate": 2.5646988158572165e-05, + "loss": 1.4153, + "step": 1691 + }, + { + "epoch": 0.29, + "grad_norm": 6.695298671722412, + "learning_rate": 2.564441393512957e-05, + "loss": 0.8676, + "step": 1692 + }, + { + "epoch": 0.29, + "grad_norm": 8.762816429138184, + "learning_rate": 2.5641839711686975e-05, + "loss": 1.0837, + "step": 1693 + }, + { + "epoch": 0.29, + "grad_norm": 9.785277366638184, + "learning_rate": 2.563926548824438e-05, + "loss": 1.0689, + "step": 1694 + }, + { + "epoch": 0.29, + "grad_norm": 8.864838600158691, + "learning_rate": 2.5636691264801785e-05, + "loss": 1.176, + "step": 1695 + }, + { + "epoch": 0.29, + "grad_norm": 8.101361274719238, + "learning_rate": 2.563411704135919e-05, + "loss": 1.0014, + "step": 1696 + }, + { + "epoch": 0.29, + "grad_norm": 9.913434028625488, + "learning_rate": 2.5631542817916595e-05, + "loss": 1.0631, + "step": 1697 + }, + { + "epoch": 0.29, + "grad_norm": 9.819299697875977, + "learning_rate": 2.5628968594474e-05, + "loss": 1.2241, + "step": 1698 + }, + { + "epoch": 0.29, + "grad_norm": 10.142266273498535, + "learning_rate": 2.5626394371031408e-05, + "loss": 0.9767, + "step": 1699 + }, + { + "epoch": 0.29, + "grad_norm": 10.407238960266113, + "learning_rate": 2.562382014758881e-05, + "loss": 1.2434, + "step": 1700 + }, + { + "epoch": 0.29, + "grad_norm": 10.397313117980957, + "learning_rate": 2.5621245924146218e-05, + "loss": 1.0554, + "step": 1701 + }, + { + "epoch": 0.29, + "grad_norm": 8.991972923278809, + "learning_rate": 2.561867170070362e-05, + "loss": 1.0965, + "step": 1702 + }, + { + "epoch": 0.29, + "grad_norm": 9.07584285736084, + "learning_rate": 2.5616097477261028e-05, + "loss": 0.9142, + "step": 1703 + }, + { + "epoch": 0.29, + "grad_norm": 9.226771354675293, + "learning_rate": 2.561352325381843e-05, + "loss": 0.9517, + "step": 1704 + }, + { + "epoch": 0.29, + "grad_norm": 10.419251441955566, + "learning_rate": 2.5610949030375838e-05, + "loss": 1.5665, + "step": 1705 + }, + { + "epoch": 0.29, + "grad_norm": 10.380773544311523, + "learning_rate": 2.560837480693324e-05, + "loss": 1.1231, + "step": 1706 + }, + { + "epoch": 0.29, + "grad_norm": 10.461137771606445, + "learning_rate": 2.5605800583490644e-05, + "loss": 1.1659, + "step": 1707 + }, + { + "epoch": 0.29, + "grad_norm": 9.826033592224121, + "learning_rate": 2.5603226360048055e-05, + "loss": 1.2141, + "step": 1708 + }, + { + "epoch": 0.29, + "grad_norm": 9.870662689208984, + "learning_rate": 2.5600652136605458e-05, + "loss": 1.3264, + "step": 1709 + }, + { + "epoch": 0.29, + "grad_norm": 10.239055633544922, + "learning_rate": 2.5598077913162865e-05, + "loss": 1.2444, + "step": 1710 + }, + { + "epoch": 0.29, + "grad_norm": 9.364582061767578, + "learning_rate": 2.5595503689720268e-05, + "loss": 1.0615, + "step": 1711 + }, + { + "epoch": 0.29, + "grad_norm": 7.995655059814453, + "learning_rate": 2.5592929466277675e-05, + "loss": 1.1462, + "step": 1712 + }, + { + "epoch": 0.29, + "grad_norm": 8.26231575012207, + "learning_rate": 2.5590355242835078e-05, + "loss": 0.9843, + "step": 1713 + }, + { + "epoch": 0.29, + "grad_norm": 11.783554077148438, + "learning_rate": 2.5587781019392484e-05, + "loss": 1.1091, + "step": 1714 + }, + { + "epoch": 0.29, + "grad_norm": 9.424436569213867, + "learning_rate": 2.5585206795949888e-05, + "loss": 1.2304, + "step": 1715 + }, + { + "epoch": 0.29, + "grad_norm": 8.603520393371582, + "learning_rate": 2.5582632572507294e-05, + "loss": 1.3525, + "step": 1716 + }, + { + "epoch": 0.29, + "grad_norm": 9.819580078125, + "learning_rate": 2.55800583490647e-05, + "loss": 1.5046, + "step": 1717 + }, + { + "epoch": 0.29, + "grad_norm": 10.068258285522461, + "learning_rate": 2.5577484125622104e-05, + "loss": 1.2893, + "step": 1718 + }, + { + "epoch": 0.3, + "grad_norm": 7.862461090087891, + "learning_rate": 2.557490990217951e-05, + "loss": 1.0952, + "step": 1719 + }, + { + "epoch": 0.3, + "grad_norm": 10.446170806884766, + "learning_rate": 2.5572335678736914e-05, + "loss": 1.3306, + "step": 1720 + }, + { + "epoch": 0.3, + "grad_norm": 8.586013793945312, + "learning_rate": 2.556976145529432e-05, + "loss": 1.1739, + "step": 1721 + }, + { + "epoch": 0.3, + "grad_norm": 8.612980842590332, + "learning_rate": 2.5567187231851724e-05, + "loss": 1.215, + "step": 1722 + }, + { + "epoch": 0.3, + "grad_norm": 8.444388389587402, + "learning_rate": 2.556461300840913e-05, + "loss": 1.1309, + "step": 1723 + }, + { + "epoch": 0.3, + "grad_norm": 8.63848876953125, + "learning_rate": 2.5562038784966534e-05, + "loss": 1.3445, + "step": 1724 + }, + { + "epoch": 0.3, + "grad_norm": 8.80753231048584, + "learning_rate": 2.555946456152394e-05, + "loss": 1.0968, + "step": 1725 + }, + { + "epoch": 0.3, + "grad_norm": 7.618228435516357, + "learning_rate": 2.5556890338081344e-05, + "loss": 0.9463, + "step": 1726 + }, + { + "epoch": 0.3, + "grad_norm": 9.364997863769531, + "learning_rate": 2.5554316114638754e-05, + "loss": 1.1908, + "step": 1727 + }, + { + "epoch": 0.3, + "grad_norm": 8.8157958984375, + "learning_rate": 2.5551741891196158e-05, + "loss": 1.0125, + "step": 1728 + }, + { + "epoch": 0.3, + "grad_norm": 8.717795372009277, + "learning_rate": 2.554916766775356e-05, + "loss": 1.2761, + "step": 1729 + }, + { + "epoch": 0.3, + "grad_norm": 8.9950590133667, + "learning_rate": 2.5546593444310968e-05, + "loss": 1.376, + "step": 1730 + }, + { + "epoch": 0.3, + "grad_norm": 7.8765716552734375, + "learning_rate": 2.554401922086837e-05, + "loss": 1.1086, + "step": 1731 + }, + { + "epoch": 0.3, + "grad_norm": 9.276371002197266, + "learning_rate": 2.5541444997425778e-05, + "loss": 1.3216, + "step": 1732 + }, + { + "epoch": 0.3, + "grad_norm": 8.059179306030273, + "learning_rate": 2.553887077398318e-05, + "loss": 0.9557, + "step": 1733 + }, + { + "epoch": 0.3, + "grad_norm": 9.337965965270996, + "learning_rate": 2.5536296550540588e-05, + "loss": 0.8447, + "step": 1734 + }, + { + "epoch": 0.3, + "grad_norm": 10.218817710876465, + "learning_rate": 2.553372232709799e-05, + "loss": 1.3292, + "step": 1735 + }, + { + "epoch": 0.3, + "grad_norm": 8.164855003356934, + "learning_rate": 2.55311481036554e-05, + "loss": 0.9875, + "step": 1736 + }, + { + "epoch": 0.3, + "grad_norm": 9.308812141418457, + "learning_rate": 2.5528573880212804e-05, + "loss": 1.1649, + "step": 1737 + }, + { + "epoch": 0.3, + "grad_norm": 10.199872970581055, + "learning_rate": 2.5525999656770208e-05, + "loss": 1.5198, + "step": 1738 + }, + { + "epoch": 0.3, + "grad_norm": 6.293773174285889, + "learning_rate": 2.5523425433327614e-05, + "loss": 0.9806, + "step": 1739 + }, + { + "epoch": 0.3, + "grad_norm": 9.174756050109863, + "learning_rate": 2.5520851209885018e-05, + "loss": 1.1593, + "step": 1740 + }, + { + "epoch": 0.3, + "grad_norm": 8.54172420501709, + "learning_rate": 2.5518276986442424e-05, + "loss": 1.1001, + "step": 1741 + }, + { + "epoch": 0.3, + "grad_norm": 10.815516471862793, + "learning_rate": 2.5515702762999828e-05, + "loss": 1.2876, + "step": 1742 + }, + { + "epoch": 0.3, + "grad_norm": 8.003947257995605, + "learning_rate": 2.5513128539557234e-05, + "loss": 1.1743, + "step": 1743 + }, + { + "epoch": 0.3, + "grad_norm": 8.989374160766602, + "learning_rate": 2.5510554316114637e-05, + "loss": 1.1333, + "step": 1744 + }, + { + "epoch": 0.3, + "grad_norm": 8.674088478088379, + "learning_rate": 2.5507980092672044e-05, + "loss": 1.0615, + "step": 1745 + }, + { + "epoch": 0.3, + "grad_norm": 8.114914894104004, + "learning_rate": 2.550540586922945e-05, + "loss": 0.9093, + "step": 1746 + }, + { + "epoch": 0.3, + "grad_norm": 8.078839302062988, + "learning_rate": 2.5502831645786858e-05, + "loss": 1.2257, + "step": 1747 + }, + { + "epoch": 0.3, + "grad_norm": 9.610944747924805, + "learning_rate": 2.550025742234426e-05, + "loss": 1.3549, + "step": 1748 + }, + { + "epoch": 0.3, + "grad_norm": 9.474281311035156, + "learning_rate": 2.5497683198901664e-05, + "loss": 1.3019, + "step": 1749 + }, + { + "epoch": 0.3, + "grad_norm": 8.139394760131836, + "learning_rate": 2.549510897545907e-05, + "loss": 1.0164, + "step": 1750 + }, + { + "epoch": 0.3, + "grad_norm": 8.196832656860352, + "learning_rate": 2.5492534752016474e-05, + "loss": 0.9659, + "step": 1751 + }, + { + "epoch": 0.3, + "grad_norm": 11.259092330932617, + "learning_rate": 2.548996052857388e-05, + "loss": 1.2302, + "step": 1752 + }, + { + "epoch": 0.3, + "grad_norm": 9.978081703186035, + "learning_rate": 2.5487386305131284e-05, + "loss": 1.5391, + "step": 1753 + }, + { + "epoch": 0.3, + "grad_norm": 9.210874557495117, + "learning_rate": 2.548481208168869e-05, + "loss": 1.0691, + "step": 1754 + }, + { + "epoch": 0.3, + "grad_norm": 10.688666343688965, + "learning_rate": 2.5482237858246097e-05, + "loss": 1.291, + "step": 1755 + }, + { + "epoch": 0.3, + "grad_norm": 9.124794960021973, + "learning_rate": 2.5479663634803504e-05, + "loss": 1.1149, + "step": 1756 + }, + { + "epoch": 0.3, + "grad_norm": 9.819000244140625, + "learning_rate": 2.5477089411360907e-05, + "loss": 1.4636, + "step": 1757 + }, + { + "epoch": 0.3, + "grad_norm": 9.510807991027832, + "learning_rate": 2.5474515187918314e-05, + "loss": 1.155, + "step": 1758 + }, + { + "epoch": 0.3, + "grad_norm": 9.002891540527344, + "learning_rate": 2.5471940964475717e-05, + "loss": 1.1307, + "step": 1759 + }, + { + "epoch": 0.3, + "grad_norm": 6.713222026824951, + "learning_rate": 2.546936674103312e-05, + "loss": 0.823, + "step": 1760 + }, + { + "epoch": 0.3, + "grad_norm": 8.20630168914795, + "learning_rate": 2.5466792517590527e-05, + "loss": 1.0278, + "step": 1761 + }, + { + "epoch": 0.3, + "grad_norm": 8.881501197814941, + "learning_rate": 2.546421829414793e-05, + "loss": 1.1759, + "step": 1762 + }, + { + "epoch": 0.3, + "grad_norm": 8.865151405334473, + "learning_rate": 2.5461644070705337e-05, + "loss": 1.0436, + "step": 1763 + }, + { + "epoch": 0.3, + "grad_norm": 9.163095474243164, + "learning_rate": 2.545906984726274e-05, + "loss": 1.1672, + "step": 1764 + }, + { + "epoch": 0.3, + "grad_norm": 8.597827911376953, + "learning_rate": 2.545649562382015e-05, + "loss": 0.9883, + "step": 1765 + }, + { + "epoch": 0.3, + "grad_norm": 8.688282012939453, + "learning_rate": 2.5453921400377554e-05, + "loss": 1.1817, + "step": 1766 + }, + { + "epoch": 0.3, + "grad_norm": 9.824360847473145, + "learning_rate": 2.545134717693496e-05, + "loss": 1.0302, + "step": 1767 + }, + { + "epoch": 0.3, + "grad_norm": 9.243977546691895, + "learning_rate": 2.5448772953492364e-05, + "loss": 1.3014, + "step": 1768 + }, + { + "epoch": 0.3, + "grad_norm": 9.205143928527832, + "learning_rate": 2.5446198730049767e-05, + "loss": 1.1799, + "step": 1769 + }, + { + "epoch": 0.3, + "grad_norm": 8.626049995422363, + "learning_rate": 2.5443624506607174e-05, + "loss": 1.2875, + "step": 1770 + }, + { + "epoch": 0.3, + "grad_norm": 7.9079484939575195, + "learning_rate": 2.5441050283164577e-05, + "loss": 0.9806, + "step": 1771 + }, + { + "epoch": 0.3, + "grad_norm": 9.699675559997559, + "learning_rate": 2.5438476059721984e-05, + "loss": 1.2925, + "step": 1772 + }, + { + "epoch": 0.3, + "grad_norm": 8.091042518615723, + "learning_rate": 2.5435901836279387e-05, + "loss": 1.1246, + "step": 1773 + }, + { + "epoch": 0.3, + "grad_norm": 10.246540069580078, + "learning_rate": 2.5433327612836797e-05, + "loss": 1.3969, + "step": 1774 + }, + { + "epoch": 0.3, + "grad_norm": 9.48653507232666, + "learning_rate": 2.54307533893942e-05, + "loss": 1.5656, + "step": 1775 + }, + { + "epoch": 0.3, + "grad_norm": 8.698553085327148, + "learning_rate": 2.5428179165951607e-05, + "loss": 1.3264, + "step": 1776 + }, + { + "epoch": 0.3, + "grad_norm": 8.113031387329102, + "learning_rate": 2.542560494250901e-05, + "loss": 1.349, + "step": 1777 + }, + { + "epoch": 0.31, + "grad_norm": 9.13812255859375, + "learning_rate": 2.5423030719066417e-05, + "loss": 1.2943, + "step": 1778 + }, + { + "epoch": 0.31, + "grad_norm": 9.228736877441406, + "learning_rate": 2.542045649562382e-05, + "loss": 1.1007, + "step": 1779 + }, + { + "epoch": 0.31, + "grad_norm": 8.414812088012695, + "learning_rate": 2.5417882272181224e-05, + "loss": 1.3554, + "step": 1780 + }, + { + "epoch": 0.31, + "grad_norm": 8.727490425109863, + "learning_rate": 2.541530804873863e-05, + "loss": 1.0629, + "step": 1781 + }, + { + "epoch": 0.31, + "grad_norm": 9.751081466674805, + "learning_rate": 2.5412733825296034e-05, + "loss": 1.0592, + "step": 1782 + }, + { + "epoch": 0.31, + "grad_norm": 7.771555423736572, + "learning_rate": 2.541015960185344e-05, + "loss": 1.2257, + "step": 1783 + }, + { + "epoch": 0.31, + "grad_norm": 8.105414390563965, + "learning_rate": 2.5407585378410847e-05, + "loss": 1.0661, + "step": 1784 + }, + { + "epoch": 0.31, + "grad_norm": 8.473969459533691, + "learning_rate": 2.5405011154968254e-05, + "loss": 1.3305, + "step": 1785 + }, + { + "epoch": 0.31, + "grad_norm": 9.59005355834961, + "learning_rate": 2.5402436931525657e-05, + "loss": 1.2249, + "step": 1786 + }, + { + "epoch": 0.31, + "grad_norm": 8.714675903320312, + "learning_rate": 2.5399862708083064e-05, + "loss": 1.05, + "step": 1787 + }, + { + "epoch": 0.31, + "grad_norm": 9.160584449768066, + "learning_rate": 2.5397288484640467e-05, + "loss": 1.1466, + "step": 1788 + }, + { + "epoch": 0.31, + "grad_norm": 10.179819107055664, + "learning_rate": 2.5394714261197874e-05, + "loss": 1.1973, + "step": 1789 + }, + { + "epoch": 0.31, + "grad_norm": 11.159802436828613, + "learning_rate": 2.5392140037755277e-05, + "loss": 1.049, + "step": 1790 + }, + { + "epoch": 0.31, + "grad_norm": 9.245061874389648, + "learning_rate": 2.538956581431268e-05, + "loss": 1.1937, + "step": 1791 + }, + { + "epoch": 0.31, + "grad_norm": 8.878827095031738, + "learning_rate": 2.5386991590870087e-05, + "loss": 1.2525, + "step": 1792 + }, + { + "epoch": 0.31, + "grad_norm": 9.605012893676758, + "learning_rate": 2.5384417367427494e-05, + "loss": 1.3157, + "step": 1793 + }, + { + "epoch": 0.31, + "grad_norm": 8.687931060791016, + "learning_rate": 2.53818431439849e-05, + "loss": 1.1184, + "step": 1794 + }, + { + "epoch": 0.31, + "grad_norm": 8.726444244384766, + "learning_rate": 2.5379268920542304e-05, + "loss": 1.0972, + "step": 1795 + }, + { + "epoch": 0.31, + "grad_norm": 8.861693382263184, + "learning_rate": 2.537669469709971e-05, + "loss": 1.4293, + "step": 1796 + }, + { + "epoch": 0.31, + "grad_norm": 9.042244911193848, + "learning_rate": 2.5374120473657114e-05, + "loss": 1.2176, + "step": 1797 + }, + { + "epoch": 0.31, + "grad_norm": 9.745487213134766, + "learning_rate": 2.537154625021452e-05, + "loss": 1.0754, + "step": 1798 + }, + { + "epoch": 0.31, + "grad_norm": 9.054213523864746, + "learning_rate": 2.5368972026771924e-05, + "loss": 1.2724, + "step": 1799 + }, + { + "epoch": 0.31, + "grad_norm": 8.261024475097656, + "learning_rate": 2.536639780332933e-05, + "loss": 1.2302, + "step": 1800 + }, + { + "epoch": 0.31, + "grad_norm": 9.732303619384766, + "learning_rate": 2.5363823579886734e-05, + "loss": 1.4895, + "step": 1801 + }, + { + "epoch": 0.31, + "grad_norm": 7.90101957321167, + "learning_rate": 2.5361249356444137e-05, + "loss": 0.7279, + "step": 1802 + }, + { + "epoch": 0.31, + "grad_norm": 9.161076545715332, + "learning_rate": 2.5358675133001547e-05, + "loss": 1.0414, + "step": 1803 + }, + { + "epoch": 0.31, + "grad_norm": 7.7752532958984375, + "learning_rate": 2.535610090955895e-05, + "loss": 1.0087, + "step": 1804 + }, + { + "epoch": 0.31, + "grad_norm": 9.58970832824707, + "learning_rate": 2.5353526686116357e-05, + "loss": 1.0037, + "step": 1805 + }, + { + "epoch": 0.31, + "grad_norm": 10.076090812683105, + "learning_rate": 2.535095246267376e-05, + "loss": 1.2012, + "step": 1806 + }, + { + "epoch": 0.31, + "grad_norm": 10.196894645690918, + "learning_rate": 2.5348378239231167e-05, + "loss": 1.153, + "step": 1807 + }, + { + "epoch": 0.31, + "grad_norm": 7.644584655761719, + "learning_rate": 2.534580401578857e-05, + "loss": 0.9725, + "step": 1808 + }, + { + "epoch": 0.31, + "grad_norm": 9.094841003417969, + "learning_rate": 2.5343229792345977e-05, + "loss": 1.504, + "step": 1809 + }, + { + "epoch": 0.31, + "grad_norm": 8.794973373413086, + "learning_rate": 2.534065556890338e-05, + "loss": 1.2477, + "step": 1810 + }, + { + "epoch": 0.31, + "grad_norm": 9.357816696166992, + "learning_rate": 2.5338081345460783e-05, + "loss": 1.0704, + "step": 1811 + }, + { + "epoch": 0.31, + "grad_norm": 7.8945417404174805, + "learning_rate": 2.5335507122018194e-05, + "loss": 1.0943, + "step": 1812 + }, + { + "epoch": 0.31, + "grad_norm": 8.366178512573242, + "learning_rate": 2.5332932898575597e-05, + "loss": 1.0265, + "step": 1813 + }, + { + "epoch": 0.31, + "grad_norm": 9.090495109558105, + "learning_rate": 2.5330358675133004e-05, + "loss": 1.2477, + "step": 1814 + }, + { + "epoch": 0.31, + "grad_norm": 7.481625556945801, + "learning_rate": 2.5327784451690407e-05, + "loss": 1.0535, + "step": 1815 + }, + { + "epoch": 0.31, + "grad_norm": 9.137459754943848, + "learning_rate": 2.5325210228247814e-05, + "loss": 1.3704, + "step": 1816 + }, + { + "epoch": 0.31, + "grad_norm": 9.223336219787598, + "learning_rate": 2.5322636004805217e-05, + "loss": 1.2863, + "step": 1817 + }, + { + "epoch": 0.31, + "grad_norm": 8.916875839233398, + "learning_rate": 2.5320061781362623e-05, + "loss": 1.3063, + "step": 1818 + }, + { + "epoch": 0.31, + "grad_norm": 9.86544418334961, + "learning_rate": 2.5317487557920027e-05, + "loss": 1.1277, + "step": 1819 + }, + { + "epoch": 0.31, + "grad_norm": 8.277205467224121, + "learning_rate": 2.5314913334477433e-05, + "loss": 1.3575, + "step": 1820 + }, + { + "epoch": 0.31, + "grad_norm": 10.45832347869873, + "learning_rate": 2.531233911103484e-05, + "loss": 1.416, + "step": 1821 + }, + { + "epoch": 0.31, + "grad_norm": 9.004668235778809, + "learning_rate": 2.5309764887592243e-05, + "loss": 1.2249, + "step": 1822 + }, + { + "epoch": 0.31, + "grad_norm": 7.491814136505127, + "learning_rate": 2.530719066414965e-05, + "loss": 0.8599, + "step": 1823 + }, + { + "epoch": 0.31, + "grad_norm": 8.60008430480957, + "learning_rate": 2.5304616440707053e-05, + "loss": 1.1542, + "step": 1824 + }, + { + "epoch": 0.31, + "grad_norm": 7.817724227905273, + "learning_rate": 2.530204221726446e-05, + "loss": 1.3333, + "step": 1825 + }, + { + "epoch": 0.31, + "grad_norm": 8.774344444274902, + "learning_rate": 2.5299467993821863e-05, + "loss": 1.4904, + "step": 1826 + }, + { + "epoch": 0.31, + "grad_norm": 7.157302379608154, + "learning_rate": 2.529689377037927e-05, + "loss": 1.0191, + "step": 1827 + }, + { + "epoch": 0.31, + "grad_norm": 9.208282470703125, + "learning_rate": 2.5294319546936673e-05, + "loss": 1.0935, + "step": 1828 + }, + { + "epoch": 0.31, + "grad_norm": 9.650165557861328, + "learning_rate": 2.529174532349408e-05, + "loss": 1.1832, + "step": 1829 + }, + { + "epoch": 0.31, + "grad_norm": 9.321013450622559, + "learning_rate": 2.5289171100051483e-05, + "loss": 1.3029, + "step": 1830 + }, + { + "epoch": 0.31, + "grad_norm": 9.568893432617188, + "learning_rate": 2.5286596876608893e-05, + "loss": 0.9535, + "step": 1831 + }, + { + "epoch": 0.31, + "grad_norm": 7.806543350219727, + "learning_rate": 2.5284022653166297e-05, + "loss": 0.7877, + "step": 1832 + }, + { + "epoch": 0.31, + "grad_norm": 8.580751419067383, + "learning_rate": 2.52814484297237e-05, + "loss": 1.1122, + "step": 1833 + }, + { + "epoch": 0.31, + "grad_norm": 9.052247047424316, + "learning_rate": 2.5278874206281107e-05, + "loss": 1.1541, + "step": 1834 + }, + { + "epoch": 0.31, + "grad_norm": 11.40207290649414, + "learning_rate": 2.527629998283851e-05, + "loss": 1.3442, + "step": 1835 + }, + { + "epoch": 0.32, + "grad_norm": 10.848437309265137, + "learning_rate": 2.5273725759395917e-05, + "loss": 1.3851, + "step": 1836 + }, + { + "epoch": 0.32, + "grad_norm": 8.450674057006836, + "learning_rate": 2.527115153595332e-05, + "loss": 1.0922, + "step": 1837 + }, + { + "epoch": 0.32, + "grad_norm": 10.358359336853027, + "learning_rate": 2.5268577312510727e-05, + "loss": 1.1509, + "step": 1838 + }, + { + "epoch": 0.32, + "grad_norm": 9.33609390258789, + "learning_rate": 2.526600308906813e-05, + "loss": 1.2894, + "step": 1839 + }, + { + "epoch": 0.32, + "grad_norm": 8.63636589050293, + "learning_rate": 2.526342886562554e-05, + "loss": 0.9445, + "step": 1840 + }, + { + "epoch": 0.32, + "grad_norm": 10.046621322631836, + "learning_rate": 2.5260854642182943e-05, + "loss": 1.2915, + "step": 1841 + }, + { + "epoch": 0.32, + "grad_norm": 8.787140846252441, + "learning_rate": 2.5258280418740347e-05, + "loss": 1.0923, + "step": 1842 + }, + { + "epoch": 0.32, + "grad_norm": 11.000146865844727, + "learning_rate": 2.5255706195297753e-05, + "loss": 1.2291, + "step": 1843 + }, + { + "epoch": 0.32, + "grad_norm": 8.815464973449707, + "learning_rate": 2.5253131971855157e-05, + "loss": 1.1863, + "step": 1844 + }, + { + "epoch": 0.32, + "grad_norm": 9.100621223449707, + "learning_rate": 2.5250557748412563e-05, + "loss": 1.186, + "step": 1845 + }, + { + "epoch": 0.32, + "grad_norm": 9.029321670532227, + "learning_rate": 2.5247983524969967e-05, + "loss": 1.1971, + "step": 1846 + }, + { + "epoch": 0.32, + "grad_norm": 7.647441864013672, + "learning_rate": 2.5245409301527373e-05, + "loss": 0.9327, + "step": 1847 + }, + { + "epoch": 0.32, + "grad_norm": 7.917750358581543, + "learning_rate": 2.5242835078084776e-05, + "loss": 1.3352, + "step": 1848 + }, + { + "epoch": 0.32, + "grad_norm": 9.372392654418945, + "learning_rate": 2.5240260854642183e-05, + "loss": 1.2961, + "step": 1849 + }, + { + "epoch": 0.32, + "grad_norm": 8.865127563476562, + "learning_rate": 2.523768663119959e-05, + "loss": 1.1148, + "step": 1850 + }, + { + "epoch": 0.32, + "grad_norm": 10.053229331970215, + "learning_rate": 2.5235112407756997e-05, + "loss": 1.089, + "step": 1851 + }, + { + "epoch": 0.32, + "grad_norm": 6.55601692199707, + "learning_rate": 2.52325381843144e-05, + "loss": 0.9104, + "step": 1852 + }, + { + "epoch": 0.32, + "grad_norm": 7.748366355895996, + "learning_rate": 2.5229963960871803e-05, + "loss": 1.2221, + "step": 1853 + }, + { + "epoch": 0.32, + "grad_norm": 7.148165702819824, + "learning_rate": 2.522738973742921e-05, + "loss": 0.9479, + "step": 1854 + }, + { + "epoch": 0.32, + "grad_norm": 8.006891250610352, + "learning_rate": 2.5224815513986613e-05, + "loss": 1.0907, + "step": 1855 + }, + { + "epoch": 0.32, + "grad_norm": 8.19072437286377, + "learning_rate": 2.522224129054402e-05, + "loss": 1.0549, + "step": 1856 + }, + { + "epoch": 0.32, + "grad_norm": 9.527332305908203, + "learning_rate": 2.5219667067101423e-05, + "loss": 1.3011, + "step": 1857 + }, + { + "epoch": 0.32, + "grad_norm": 8.543940544128418, + "learning_rate": 2.521709284365883e-05, + "loss": 1.231, + "step": 1858 + }, + { + "epoch": 0.32, + "grad_norm": 7.198790550231934, + "learning_rate": 2.5214518620216236e-05, + "loss": 0.9201, + "step": 1859 + }, + { + "epoch": 0.32, + "grad_norm": 9.62169361114502, + "learning_rate": 2.5211944396773643e-05, + "loss": 1.3424, + "step": 1860 + }, + { + "epoch": 0.32, + "grad_norm": 10.031617164611816, + "learning_rate": 2.5209370173331046e-05, + "loss": 1.4437, + "step": 1861 + }, + { + "epoch": 0.32, + "grad_norm": 9.64932632446289, + "learning_rate": 2.5206795949888453e-05, + "loss": 1.3224, + "step": 1862 + }, + { + "epoch": 0.32, + "grad_norm": 8.811517715454102, + "learning_rate": 2.5204221726445856e-05, + "loss": 1.0686, + "step": 1863 + }, + { + "epoch": 0.32, + "grad_norm": 8.884221076965332, + "learning_rate": 2.520164750300326e-05, + "loss": 1.07, + "step": 1864 + }, + { + "epoch": 0.32, + "grad_norm": 7.0543694496154785, + "learning_rate": 2.5199073279560666e-05, + "loss": 0.9218, + "step": 1865 + }, + { + "epoch": 0.32, + "grad_norm": 9.33708381652832, + "learning_rate": 2.519649905611807e-05, + "loss": 1.3307, + "step": 1866 + }, + { + "epoch": 0.32, + "grad_norm": 9.017557144165039, + "learning_rate": 2.5193924832675476e-05, + "loss": 1.0409, + "step": 1867 + }, + { + "epoch": 0.32, + "grad_norm": 9.777966499328613, + "learning_rate": 2.519135060923288e-05, + "loss": 1.1855, + "step": 1868 + }, + { + "epoch": 0.32, + "grad_norm": 10.000447273254395, + "learning_rate": 2.518877638579029e-05, + "loss": 0.8718, + "step": 1869 + }, + { + "epoch": 0.32, + "grad_norm": 10.26295280456543, + "learning_rate": 2.5186202162347693e-05, + "loss": 1.3592, + "step": 1870 + }, + { + "epoch": 0.32, + "grad_norm": 8.32453727722168, + "learning_rate": 2.51836279389051e-05, + "loss": 1.0087, + "step": 1871 + }, + { + "epoch": 0.32, + "grad_norm": 8.395368576049805, + "learning_rate": 2.5181053715462503e-05, + "loss": 0.9962, + "step": 1872 + }, + { + "epoch": 0.32, + "grad_norm": 10.196331977844238, + "learning_rate": 2.5178479492019906e-05, + "loss": 1.1135, + "step": 1873 + }, + { + "epoch": 0.32, + "grad_norm": 9.52117919921875, + "learning_rate": 2.5175905268577313e-05, + "loss": 1.1214, + "step": 1874 + }, + { + "epoch": 0.32, + "grad_norm": 10.943233489990234, + "learning_rate": 2.5173331045134716e-05, + "loss": 1.4006, + "step": 1875 + }, + { + "epoch": 0.32, + "grad_norm": 9.03986930847168, + "learning_rate": 2.5170756821692123e-05, + "loss": 0.9299, + "step": 1876 + }, + { + "epoch": 0.32, + "grad_norm": 10.859893798828125, + "learning_rate": 2.5168182598249526e-05, + "loss": 1.413, + "step": 1877 + }, + { + "epoch": 0.32, + "grad_norm": 9.069256782531738, + "learning_rate": 2.5165608374806936e-05, + "loss": 1.2064, + "step": 1878 + }, + { + "epoch": 0.32, + "grad_norm": 9.76870346069336, + "learning_rate": 2.516303415136434e-05, + "loss": 1.116, + "step": 1879 + }, + { + "epoch": 0.32, + "grad_norm": 9.318020820617676, + "learning_rate": 2.5160459927921746e-05, + "loss": 1.0757, + "step": 1880 + }, + { + "epoch": 0.32, + "grad_norm": 8.963784217834473, + "learning_rate": 2.515788570447915e-05, + "loss": 1.2655, + "step": 1881 + }, + { + "epoch": 0.32, + "grad_norm": 8.540380477905273, + "learning_rate": 2.5155311481036556e-05, + "loss": 1.0673, + "step": 1882 + }, + { + "epoch": 0.32, + "grad_norm": 9.368696212768555, + "learning_rate": 2.515273725759396e-05, + "loss": 1.072, + "step": 1883 + }, + { + "epoch": 0.32, + "grad_norm": 8.880315780639648, + "learning_rate": 2.5150163034151363e-05, + "loss": 1.3764, + "step": 1884 + }, + { + "epoch": 0.32, + "grad_norm": 9.173974990844727, + "learning_rate": 2.514758881070877e-05, + "loss": 1.2389, + "step": 1885 + }, + { + "epoch": 0.32, + "grad_norm": 10.06794261932373, + "learning_rate": 2.5145014587266173e-05, + "loss": 1.4193, + "step": 1886 + }, + { + "epoch": 0.32, + "grad_norm": 7.367919921875, + "learning_rate": 2.514244036382358e-05, + "loss": 1.0134, + "step": 1887 + }, + { + "epoch": 0.32, + "grad_norm": 8.085611343383789, + "learning_rate": 2.5139866140380986e-05, + "loss": 1.2374, + "step": 1888 + }, + { + "epoch": 0.32, + "grad_norm": 8.164103507995605, + "learning_rate": 2.5137291916938393e-05, + "loss": 0.9791, + "step": 1889 + }, + { + "epoch": 0.32, + "grad_norm": 8.86156940460205, + "learning_rate": 2.5134717693495796e-05, + "loss": 1.2038, + "step": 1890 + }, + { + "epoch": 0.32, + "grad_norm": 7.802731513977051, + "learning_rate": 2.5132143470053203e-05, + "loss": 0.9789, + "step": 1891 + }, + { + "epoch": 0.32, + "grad_norm": 7.810247898101807, + "learning_rate": 2.5129569246610606e-05, + "loss": 1.0637, + "step": 1892 + }, + { + "epoch": 0.32, + "grad_norm": 8.733325004577637, + "learning_rate": 2.5126995023168013e-05, + "loss": 1.2844, + "step": 1893 + }, + { + "epoch": 0.33, + "grad_norm": 7.678900718688965, + "learning_rate": 2.5124420799725416e-05, + "loss": 0.9974, + "step": 1894 + }, + { + "epoch": 0.33, + "grad_norm": 9.847447395324707, + "learning_rate": 2.512184657628282e-05, + "loss": 1.1089, + "step": 1895 + }, + { + "epoch": 0.33, + "grad_norm": 9.557967185974121, + "learning_rate": 2.5119272352840226e-05, + "loss": 1.1357, + "step": 1896 + }, + { + "epoch": 0.33, + "grad_norm": 9.160346984863281, + "learning_rate": 2.5116698129397633e-05, + "loss": 1.2265, + "step": 1897 + }, + { + "epoch": 0.33, + "grad_norm": 10.898780822753906, + "learning_rate": 2.511412390595504e-05, + "loss": 1.3529, + "step": 1898 + }, + { + "epoch": 0.33, + "grad_norm": 8.820026397705078, + "learning_rate": 2.5111549682512443e-05, + "loss": 1.1643, + "step": 1899 + }, + { + "epoch": 0.33, + "grad_norm": 8.765425682067871, + "learning_rate": 2.510897545906985e-05, + "loss": 0.9822, + "step": 1900 + }, + { + "epoch": 0.33, + "grad_norm": 9.185986518859863, + "learning_rate": 2.5106401235627253e-05, + "loss": 1.3487, + "step": 1901 + }, + { + "epoch": 0.33, + "grad_norm": 9.548151969909668, + "learning_rate": 2.510382701218466e-05, + "loss": 1.4098, + "step": 1902 + }, + { + "epoch": 0.33, + "grad_norm": 9.726716041564941, + "learning_rate": 2.5101252788742063e-05, + "loss": 1.2139, + "step": 1903 + }, + { + "epoch": 0.33, + "grad_norm": 6.740248680114746, + "learning_rate": 2.509867856529947e-05, + "loss": 0.8818, + "step": 1904 + }, + { + "epoch": 0.33, + "grad_norm": 8.98962688446045, + "learning_rate": 2.5096104341856873e-05, + "loss": 1.2039, + "step": 1905 + }, + { + "epoch": 0.33, + "grad_norm": 9.169604301452637, + "learning_rate": 2.5093530118414276e-05, + "loss": 1.1749, + "step": 1906 + }, + { + "epoch": 0.33, + "grad_norm": 8.317733764648438, + "learning_rate": 2.5090955894971686e-05, + "loss": 1.0668, + "step": 1907 + }, + { + "epoch": 0.33, + "grad_norm": 8.362645149230957, + "learning_rate": 2.508838167152909e-05, + "loss": 0.8885, + "step": 1908 + }, + { + "epoch": 0.33, + "grad_norm": 7.150195121765137, + "learning_rate": 2.5085807448086496e-05, + "loss": 1.0103, + "step": 1909 + }, + { + "epoch": 0.33, + "grad_norm": 8.825246810913086, + "learning_rate": 2.50832332246439e-05, + "loss": 0.9991, + "step": 1910 + }, + { + "epoch": 0.33, + "grad_norm": 8.028517723083496, + "learning_rate": 2.5080659001201306e-05, + "loss": 1.1897, + "step": 1911 + }, + { + "epoch": 0.33, + "grad_norm": 8.390515327453613, + "learning_rate": 2.507808477775871e-05, + "loss": 1.0032, + "step": 1912 + }, + { + "epoch": 0.33, + "grad_norm": 10.067804336547852, + "learning_rate": 2.5075510554316116e-05, + "loss": 1.1421, + "step": 1913 + }, + { + "epoch": 0.33, + "grad_norm": 8.462785720825195, + "learning_rate": 2.507293633087352e-05, + "loss": 1.1792, + "step": 1914 + }, + { + "epoch": 0.33, + "grad_norm": 9.050780296325684, + "learning_rate": 2.5070362107430923e-05, + "loss": 1.0274, + "step": 1915 + }, + { + "epoch": 0.33, + "grad_norm": 8.930495262145996, + "learning_rate": 2.5067787883988333e-05, + "loss": 1.1926, + "step": 1916 + }, + { + "epoch": 0.33, + "grad_norm": 9.409160614013672, + "learning_rate": 2.5065213660545736e-05, + "loss": 1.2423, + "step": 1917 + }, + { + "epoch": 0.33, + "grad_norm": 10.221443176269531, + "learning_rate": 2.5062639437103143e-05, + "loss": 1.0732, + "step": 1918 + }, + { + "epoch": 0.33, + "grad_norm": 8.196868896484375, + "learning_rate": 2.5060065213660546e-05, + "loss": 0.9071, + "step": 1919 + }, + { + "epoch": 0.33, + "grad_norm": 9.157271385192871, + "learning_rate": 2.5057490990217953e-05, + "loss": 1.0358, + "step": 1920 + }, + { + "epoch": 0.33, + "grad_norm": 9.6808500289917, + "learning_rate": 2.5054916766775356e-05, + "loss": 1.1404, + "step": 1921 + }, + { + "epoch": 0.33, + "grad_norm": 10.573707580566406, + "learning_rate": 2.5052342543332763e-05, + "loss": 1.2259, + "step": 1922 + }, + { + "epoch": 0.33, + "grad_norm": 8.4099702835083, + "learning_rate": 2.5049768319890166e-05, + "loss": 0.9453, + "step": 1923 + }, + { + "epoch": 0.33, + "grad_norm": 9.807967185974121, + "learning_rate": 2.5047194096447572e-05, + "loss": 1.1408, + "step": 1924 + }, + { + "epoch": 0.33, + "grad_norm": 9.258343696594238, + "learning_rate": 2.504461987300498e-05, + "loss": 1.0049, + "step": 1925 + }, + { + "epoch": 0.33, + "grad_norm": 9.263693809509277, + "learning_rate": 2.5042045649562382e-05, + "loss": 1.1866, + "step": 1926 + }, + { + "epoch": 0.33, + "grad_norm": 10.111815452575684, + "learning_rate": 2.503947142611979e-05, + "loss": 1.3251, + "step": 1927 + }, + { + "epoch": 0.33, + "grad_norm": 8.936982154846191, + "learning_rate": 2.5036897202677192e-05, + "loss": 1.181, + "step": 1928 + }, + { + "epoch": 0.33, + "grad_norm": 9.280904769897461, + "learning_rate": 2.50343229792346e-05, + "loss": 0.9024, + "step": 1929 + }, + { + "epoch": 0.33, + "grad_norm": 9.838895797729492, + "learning_rate": 2.5031748755792002e-05, + "loss": 1.4286, + "step": 1930 + }, + { + "epoch": 0.33, + "grad_norm": 9.120624542236328, + "learning_rate": 2.502917453234941e-05, + "loss": 1.2885, + "step": 1931 + }, + { + "epoch": 0.33, + "grad_norm": 8.158381462097168, + "learning_rate": 2.5026600308906812e-05, + "loss": 1.1287, + "step": 1932 + }, + { + "epoch": 0.33, + "grad_norm": 9.586956024169922, + "learning_rate": 2.502402608546422e-05, + "loss": 1.1301, + "step": 1933 + }, + { + "epoch": 0.33, + "grad_norm": 9.403854370117188, + "learning_rate": 2.5021451862021622e-05, + "loss": 1.3198, + "step": 1934 + }, + { + "epoch": 0.33, + "grad_norm": 7.893551349639893, + "learning_rate": 2.5018877638579032e-05, + "loss": 1.3, + "step": 1935 + }, + { + "epoch": 0.33, + "grad_norm": 9.152052879333496, + "learning_rate": 2.5016303415136436e-05, + "loss": 1.2528, + "step": 1936 + }, + { + "epoch": 0.33, + "grad_norm": 9.34033203125, + "learning_rate": 2.501372919169384e-05, + "loss": 1.2822, + "step": 1937 + }, + { + "epoch": 0.33, + "grad_norm": 8.420156478881836, + "learning_rate": 2.5011154968251246e-05, + "loss": 1.0191, + "step": 1938 + }, + { + "epoch": 0.33, + "grad_norm": 10.021442413330078, + "learning_rate": 2.500858074480865e-05, + "loss": 1.2212, + "step": 1939 + }, + { + "epoch": 0.33, + "grad_norm": 9.50693130493164, + "learning_rate": 2.5006006521366056e-05, + "loss": 1.3093, + "step": 1940 + }, + { + "epoch": 0.33, + "grad_norm": 9.51550579071045, + "learning_rate": 2.500343229792346e-05, + "loss": 1.0646, + "step": 1941 + }, + { + "epoch": 0.33, + "grad_norm": 9.394984245300293, + "learning_rate": 2.5000858074480866e-05, + "loss": 1.1632, + "step": 1942 + }, + { + "epoch": 0.33, + "grad_norm": 8.698156356811523, + "learning_rate": 2.499828385103827e-05, + "loss": 1.3627, + "step": 1943 + }, + { + "epoch": 0.33, + "grad_norm": 10.20472240447998, + "learning_rate": 2.499570962759568e-05, + "loss": 1.2292, + "step": 1944 + }, + { + "epoch": 0.33, + "grad_norm": 10.285568237304688, + "learning_rate": 2.4993135404153082e-05, + "loss": 1.4758, + "step": 1945 + }, + { + "epoch": 0.33, + "grad_norm": 9.318853378295898, + "learning_rate": 2.4990561180710486e-05, + "loss": 1.2371, + "step": 1946 + }, + { + "epoch": 0.33, + "grad_norm": 7.862863540649414, + "learning_rate": 2.4987986957267892e-05, + "loss": 1.0279, + "step": 1947 + }, + { + "epoch": 0.33, + "grad_norm": 9.340022087097168, + "learning_rate": 2.4985412733825296e-05, + "loss": 1.1188, + "step": 1948 + }, + { + "epoch": 0.33, + "grad_norm": 7.183253765106201, + "learning_rate": 2.4982838510382702e-05, + "loss": 0.9651, + "step": 1949 + }, + { + "epoch": 0.33, + "grad_norm": 8.86628532409668, + "learning_rate": 2.4980264286940106e-05, + "loss": 1.2317, + "step": 1950 + }, + { + "epoch": 0.33, + "grad_norm": 12.645370483398438, + "learning_rate": 2.4977690063497512e-05, + "loss": 1.209, + "step": 1951 + }, + { + "epoch": 0.33, + "grad_norm": 8.665984153747559, + "learning_rate": 2.4975115840054916e-05, + "loss": 0.9434, + "step": 1952 + }, + { + "epoch": 0.34, + "grad_norm": 9.246528625488281, + "learning_rate": 2.4972541616612322e-05, + "loss": 1.4181, + "step": 1953 + }, + { + "epoch": 0.34, + "grad_norm": 8.156112670898438, + "learning_rate": 2.496996739316973e-05, + "loss": 1.1267, + "step": 1954 + }, + { + "epoch": 0.34, + "grad_norm": 9.022880554199219, + "learning_rate": 2.4967393169727136e-05, + "loss": 1.1695, + "step": 1955 + }, + { + "epoch": 0.34, + "grad_norm": 9.308635711669922, + "learning_rate": 2.496481894628454e-05, + "loss": 1.2813, + "step": 1956 + }, + { + "epoch": 0.34, + "grad_norm": 8.928953170776367, + "learning_rate": 2.4962244722841942e-05, + "loss": 1.0572, + "step": 1957 + }, + { + "epoch": 0.34, + "grad_norm": 10.274496078491211, + "learning_rate": 2.495967049939935e-05, + "loss": 0.9021, + "step": 1958 + }, + { + "epoch": 0.34, + "grad_norm": 9.08324909210205, + "learning_rate": 2.4957096275956752e-05, + "loss": 1.0692, + "step": 1959 + }, + { + "epoch": 0.34, + "grad_norm": 8.38923454284668, + "learning_rate": 2.495452205251416e-05, + "loss": 1.103, + "step": 1960 + }, + { + "epoch": 0.34, + "grad_norm": 9.054323196411133, + "learning_rate": 2.4951947829071562e-05, + "loss": 1.2061, + "step": 1961 + }, + { + "epoch": 0.34, + "grad_norm": 9.091486930847168, + "learning_rate": 2.494937360562897e-05, + "loss": 1.1214, + "step": 1962 + }, + { + "epoch": 0.34, + "grad_norm": 6.852002143859863, + "learning_rate": 2.4946799382186375e-05, + "loss": 0.8592, + "step": 1963 + }, + { + "epoch": 0.34, + "grad_norm": 8.171123504638672, + "learning_rate": 2.4944225158743782e-05, + "loss": 0.8868, + "step": 1964 + }, + { + "epoch": 0.34, + "grad_norm": 9.343501091003418, + "learning_rate": 2.4941650935301185e-05, + "loss": 1.3639, + "step": 1965 + }, + { + "epoch": 0.34, + "grad_norm": 7.870195388793945, + "learning_rate": 2.4939076711858592e-05, + "loss": 1.1631, + "step": 1966 + }, + { + "epoch": 0.34, + "grad_norm": 8.815661430358887, + "learning_rate": 2.4936502488415995e-05, + "loss": 1.1268, + "step": 1967 + }, + { + "epoch": 0.34, + "grad_norm": 8.165377616882324, + "learning_rate": 2.49339282649734e-05, + "loss": 0.9369, + "step": 1968 + }, + { + "epoch": 0.34, + "grad_norm": 9.087328910827637, + "learning_rate": 2.4931354041530805e-05, + "loss": 1.0966, + "step": 1969 + }, + { + "epoch": 0.34, + "grad_norm": 10.71306324005127, + "learning_rate": 2.492877981808821e-05, + "loss": 1.432, + "step": 1970 + }, + { + "epoch": 0.34, + "grad_norm": 9.951461791992188, + "learning_rate": 2.4926205594645615e-05, + "loss": 1.2412, + "step": 1971 + }, + { + "epoch": 0.34, + "grad_norm": 7.967318534851074, + "learning_rate": 2.492363137120302e-05, + "loss": 1.2419, + "step": 1972 + }, + { + "epoch": 0.34, + "grad_norm": 10.283565521240234, + "learning_rate": 2.492105714776043e-05, + "loss": 1.1954, + "step": 1973 + }, + { + "epoch": 0.34, + "grad_norm": 9.887789726257324, + "learning_rate": 2.4918482924317832e-05, + "loss": 1.6527, + "step": 1974 + }, + { + "epoch": 0.34, + "grad_norm": 9.543041229248047, + "learning_rate": 2.491590870087524e-05, + "loss": 1.1091, + "step": 1975 + }, + { + "epoch": 0.34, + "grad_norm": 10.84188461303711, + "learning_rate": 2.4913334477432642e-05, + "loss": 1.2852, + "step": 1976 + }, + { + "epoch": 0.34, + "grad_norm": 10.968687057495117, + "learning_rate": 2.491076025399005e-05, + "loss": 1.5774, + "step": 1977 + }, + { + "epoch": 0.34, + "grad_norm": 9.033377647399902, + "learning_rate": 2.4908186030547452e-05, + "loss": 0.9287, + "step": 1978 + }, + { + "epoch": 0.34, + "grad_norm": 10.477993965148926, + "learning_rate": 2.4905611807104855e-05, + "loss": 1.3667, + "step": 1979 + }, + { + "epoch": 0.34, + "grad_norm": 9.788491249084473, + "learning_rate": 2.4903037583662262e-05, + "loss": 1.4238, + "step": 1980 + }, + { + "epoch": 0.34, + "grad_norm": 8.339115142822266, + "learning_rate": 2.4900463360219665e-05, + "loss": 1.0565, + "step": 1981 + }, + { + "epoch": 0.34, + "grad_norm": 8.48612117767334, + "learning_rate": 2.4897889136777075e-05, + "loss": 1.3312, + "step": 1982 + }, + { + "epoch": 0.34, + "grad_norm": 8.537652015686035, + "learning_rate": 2.489531491333448e-05, + "loss": 1.2015, + "step": 1983 + }, + { + "epoch": 0.34, + "grad_norm": 7.066079139709473, + "learning_rate": 2.4892740689891885e-05, + "loss": 1.1343, + "step": 1984 + }, + { + "epoch": 0.34, + "grad_norm": 7.819148063659668, + "learning_rate": 2.489016646644929e-05, + "loss": 1.1123, + "step": 1985 + }, + { + "epoch": 0.34, + "grad_norm": 8.840709686279297, + "learning_rate": 2.4887592243006695e-05, + "loss": 1.1617, + "step": 1986 + }, + { + "epoch": 0.34, + "grad_norm": 9.1939115524292, + "learning_rate": 2.48850180195641e-05, + "loss": 1.3644, + "step": 1987 + }, + { + "epoch": 0.34, + "grad_norm": 10.358109474182129, + "learning_rate": 2.4882443796121502e-05, + "loss": 1.3645, + "step": 1988 + }, + { + "epoch": 0.34, + "grad_norm": 9.732720375061035, + "learning_rate": 2.487986957267891e-05, + "loss": 1.033, + "step": 1989 + }, + { + "epoch": 0.34, + "grad_norm": 8.227747917175293, + "learning_rate": 2.4877295349236312e-05, + "loss": 1.1426, + "step": 1990 + }, + { + "epoch": 0.34, + "grad_norm": 8.248335838317871, + "learning_rate": 2.487472112579372e-05, + "loss": 1.0489, + "step": 1991 + }, + { + "epoch": 0.34, + "grad_norm": 8.124897956848145, + "learning_rate": 2.4872146902351125e-05, + "loss": 0.9307, + "step": 1992 + }, + { + "epoch": 0.34, + "grad_norm": 10.019804954528809, + "learning_rate": 2.4869572678908532e-05, + "loss": 1.4164, + "step": 1993 + }, + { + "epoch": 0.34, + "grad_norm": 8.667664527893066, + "learning_rate": 2.4866998455465935e-05, + "loss": 1.0404, + "step": 1994 + }, + { + "epoch": 0.34, + "grad_norm": 9.261445045471191, + "learning_rate": 2.4864424232023342e-05, + "loss": 1.0281, + "step": 1995 + }, + { + "epoch": 0.34, + "grad_norm": 7.9775824546813965, + "learning_rate": 2.4861850008580745e-05, + "loss": 0.8719, + "step": 1996 + }, + { + "epoch": 0.34, + "grad_norm": 9.73702621459961, + "learning_rate": 2.4859275785138152e-05, + "loss": 1.1269, + "step": 1997 + }, + { + "epoch": 0.34, + "grad_norm": 7.8519062995910645, + "learning_rate": 2.4856701561695555e-05, + "loss": 1.0876, + "step": 1998 + }, + { + "epoch": 0.34, + "grad_norm": 8.00592041015625, + "learning_rate": 2.485412733825296e-05, + "loss": 0.7832, + "step": 1999 + }, + { + "epoch": 0.34, + "grad_norm": 7.4707441329956055, + "learning_rate": 2.4851553114810365e-05, + "loss": 1.2687, + "step": 2000 + }, + { + "epoch": 0.34, + "grad_norm": 8.627842903137207, + "learning_rate": 2.4848978891367772e-05, + "loss": 1.1093, + "step": 2001 + }, + { + "epoch": 0.34, + "grad_norm": 9.832741737365723, + "learning_rate": 2.484640466792518e-05, + "loss": 1.2518, + "step": 2002 + }, + { + "epoch": 0.34, + "grad_norm": 9.39503288269043, + "learning_rate": 2.4843830444482582e-05, + "loss": 1.2994, + "step": 2003 + }, + { + "epoch": 0.34, + "grad_norm": 8.167266845703125, + "learning_rate": 2.484125622103999e-05, + "loss": 0.9416, + "step": 2004 + }, + { + "epoch": 0.34, + "grad_norm": 7.453071117401123, + "learning_rate": 2.4838681997597392e-05, + "loss": 0.8271, + "step": 2005 + }, + { + "epoch": 0.34, + "grad_norm": 7.479333400726318, + "learning_rate": 2.48361077741548e-05, + "loss": 1.0426, + "step": 2006 + }, + { + "epoch": 0.34, + "grad_norm": 8.295331001281738, + "learning_rate": 2.48335335507122e-05, + "loss": 1.0965, + "step": 2007 + }, + { + "epoch": 0.34, + "grad_norm": 10.796544075012207, + "learning_rate": 2.483095932726961e-05, + "loss": 1.4362, + "step": 2008 + }, + { + "epoch": 0.34, + "grad_norm": 11.850000381469727, + "learning_rate": 2.482838510382701e-05, + "loss": 1.3482, + "step": 2009 + }, + { + "epoch": 0.34, + "grad_norm": 8.764659881591797, + "learning_rate": 2.4825810880384415e-05, + "loss": 1.2939, + "step": 2010 + }, + { + "epoch": 0.35, + "grad_norm": 9.835600852966309, + "learning_rate": 2.4823236656941825e-05, + "loss": 1.2095, + "step": 2011 + }, + { + "epoch": 0.35, + "grad_norm": 9.962176322937012, + "learning_rate": 2.4820662433499228e-05, + "loss": 1.0814, + "step": 2012 + }, + { + "epoch": 0.35, + "grad_norm": 9.528059959411621, + "learning_rate": 2.4818088210056635e-05, + "loss": 0.8885, + "step": 2013 + }, + { + "epoch": 0.35, + "grad_norm": 7.620846748352051, + "learning_rate": 2.4815513986614038e-05, + "loss": 1.1178, + "step": 2014 + }, + { + "epoch": 0.35, + "grad_norm": 7.787505626678467, + "learning_rate": 2.4812939763171445e-05, + "loss": 1.0418, + "step": 2015 + }, + { + "epoch": 0.35, + "grad_norm": 10.11175537109375, + "learning_rate": 2.4810365539728848e-05, + "loss": 1.3541, + "step": 2016 + }, + { + "epoch": 0.35, + "grad_norm": 11.326554298400879, + "learning_rate": 2.4807791316286255e-05, + "loss": 1.4547, + "step": 2017 + }, + { + "epoch": 0.35, + "grad_norm": 10.435914039611816, + "learning_rate": 2.4805217092843658e-05, + "loss": 1.3384, + "step": 2018 + }, + { + "epoch": 0.35, + "grad_norm": 8.599921226501465, + "learning_rate": 2.480264286940106e-05, + "loss": 0.9495, + "step": 2019 + }, + { + "epoch": 0.35, + "grad_norm": 10.479714393615723, + "learning_rate": 2.480006864595847e-05, + "loss": 1.3211, + "step": 2020 + }, + { + "epoch": 0.35, + "grad_norm": 9.65898609161377, + "learning_rate": 2.4797494422515875e-05, + "loss": 1.1568, + "step": 2021 + }, + { + "epoch": 0.35, + "grad_norm": 8.756569862365723, + "learning_rate": 2.479492019907328e-05, + "loss": 1.0404, + "step": 2022 + }, + { + "epoch": 0.35, + "grad_norm": 8.057429313659668, + "learning_rate": 2.4792345975630685e-05, + "loss": 1.0557, + "step": 2023 + }, + { + "epoch": 0.35, + "grad_norm": 11.154457092285156, + "learning_rate": 2.478977175218809e-05, + "loss": 1.3369, + "step": 2024 + }, + { + "epoch": 0.35, + "grad_norm": 8.76572322845459, + "learning_rate": 2.4787197528745495e-05, + "loss": 1.0762, + "step": 2025 + }, + { + "epoch": 0.35, + "grad_norm": 11.778600692749023, + "learning_rate": 2.47846233053029e-05, + "loss": 1.3628, + "step": 2026 + }, + { + "epoch": 0.35, + "grad_norm": 8.858481407165527, + "learning_rate": 2.4782049081860305e-05, + "loss": 1.1526, + "step": 2027 + }, + { + "epoch": 0.35, + "grad_norm": 10.333586692810059, + "learning_rate": 2.477947485841771e-05, + "loss": 1.0982, + "step": 2028 + }, + { + "epoch": 0.35, + "grad_norm": 8.965689659118652, + "learning_rate": 2.4776900634975115e-05, + "loss": 0.9712, + "step": 2029 + }, + { + "epoch": 0.35, + "grad_norm": 10.018839836120605, + "learning_rate": 2.477432641153252e-05, + "loss": 1.3437, + "step": 2030 + }, + { + "epoch": 0.35, + "grad_norm": 9.67626953125, + "learning_rate": 2.4771752188089928e-05, + "loss": 1.0218, + "step": 2031 + }, + { + "epoch": 0.35, + "grad_norm": 9.6708984375, + "learning_rate": 2.476917796464733e-05, + "loss": 1.0562, + "step": 2032 + }, + { + "epoch": 0.35, + "grad_norm": 8.617552757263184, + "learning_rate": 2.4766603741204738e-05, + "loss": 0.9068, + "step": 2033 + }, + { + "epoch": 0.35, + "grad_norm": 9.860594749450684, + "learning_rate": 2.476402951776214e-05, + "loss": 1.108, + "step": 2034 + }, + { + "epoch": 0.35, + "grad_norm": 8.098282814025879, + "learning_rate": 2.4761455294319548e-05, + "loss": 0.9153, + "step": 2035 + }, + { + "epoch": 0.35, + "grad_norm": 11.727445602416992, + "learning_rate": 2.475888107087695e-05, + "loss": 1.2464, + "step": 2036 + }, + { + "epoch": 0.35, + "grad_norm": 9.118781089782715, + "learning_rate": 2.4756306847434358e-05, + "loss": 1.2765, + "step": 2037 + }, + { + "epoch": 0.35, + "grad_norm": 8.542216300964355, + "learning_rate": 2.475373262399176e-05, + "loss": 0.8026, + "step": 2038 + }, + { + "epoch": 0.35, + "grad_norm": 9.281174659729004, + "learning_rate": 2.475115840054917e-05, + "loss": 0.855, + "step": 2039 + }, + { + "epoch": 0.35, + "grad_norm": 8.210792541503906, + "learning_rate": 2.4748584177106575e-05, + "loss": 1.0354, + "step": 2040 + }, + { + "epoch": 0.35, + "grad_norm": 7.555898666381836, + "learning_rate": 2.4746009953663978e-05, + "loss": 0.6123, + "step": 2041 + }, + { + "epoch": 0.35, + "grad_norm": 9.490297317504883, + "learning_rate": 2.4743435730221385e-05, + "loss": 1.1734, + "step": 2042 + }, + { + "epoch": 0.35, + "grad_norm": 8.4724702835083, + "learning_rate": 2.4740861506778788e-05, + "loss": 1.2586, + "step": 2043 + }, + { + "epoch": 0.35, + "grad_norm": 10.335028648376465, + "learning_rate": 2.4738287283336195e-05, + "loss": 1.2043, + "step": 2044 + }, + { + "epoch": 0.35, + "grad_norm": 10.547687530517578, + "learning_rate": 2.4735713059893598e-05, + "loss": 1.2341, + "step": 2045 + }, + { + "epoch": 0.35, + "grad_norm": 9.621874809265137, + "learning_rate": 2.4733138836451005e-05, + "loss": 1.0011, + "step": 2046 + }, + { + "epoch": 0.35, + "grad_norm": 9.322978019714355, + "learning_rate": 2.4730564613008408e-05, + "loss": 1.2744, + "step": 2047 + }, + { + "epoch": 0.35, + "grad_norm": 9.387273788452148, + "learning_rate": 2.4727990389565818e-05, + "loss": 1.2313, + "step": 2048 + }, + { + "epoch": 0.35, + "grad_norm": 10.519750595092773, + "learning_rate": 2.472541616612322e-05, + "loss": 1.2329, + "step": 2049 + }, + { + "epoch": 0.35, + "grad_norm": 8.675507545471191, + "learning_rate": 2.4722841942680625e-05, + "loss": 1.3132, + "step": 2050 + }, + { + "epoch": 0.35, + "grad_norm": 9.72492504119873, + "learning_rate": 2.472026771923803e-05, + "loss": 1.1005, + "step": 2051 + }, + { + "epoch": 0.35, + "grad_norm": 8.680103302001953, + "learning_rate": 2.4717693495795435e-05, + "loss": 1.2857, + "step": 2052 + }, + { + "epoch": 0.35, + "grad_norm": 7.613232135772705, + "learning_rate": 2.471511927235284e-05, + "loss": 1.1919, + "step": 2053 + }, + { + "epoch": 0.35, + "grad_norm": 8.367504119873047, + "learning_rate": 2.4712545048910245e-05, + "loss": 1.0231, + "step": 2054 + }, + { + "epoch": 0.35, + "grad_norm": 7.910067081451416, + "learning_rate": 2.470997082546765e-05, + "loss": 0.9046, + "step": 2055 + }, + { + "epoch": 0.35, + "grad_norm": 8.293525695800781, + "learning_rate": 2.4707396602025055e-05, + "loss": 1.2671, + "step": 2056 + }, + { + "epoch": 0.35, + "grad_norm": 8.532302856445312, + "learning_rate": 2.470482237858246e-05, + "loss": 1.4923, + "step": 2057 + }, + { + "epoch": 0.35, + "grad_norm": 8.526620864868164, + "learning_rate": 2.4702248155139868e-05, + "loss": 1.2102, + "step": 2058 + }, + { + "epoch": 0.35, + "grad_norm": 10.08618450164795, + "learning_rate": 2.4699673931697275e-05, + "loss": 1.1245, + "step": 2059 + }, + { + "epoch": 0.35, + "grad_norm": 9.348907470703125, + "learning_rate": 2.4697099708254678e-05, + "loss": 1.041, + "step": 2060 + }, + { + "epoch": 0.35, + "grad_norm": 11.697951316833496, + "learning_rate": 2.469452548481208e-05, + "loss": 1.3327, + "step": 2061 + }, + { + "epoch": 0.35, + "grad_norm": 7.185370922088623, + "learning_rate": 2.4691951261369488e-05, + "loss": 0.9218, + "step": 2062 + }, + { + "epoch": 0.35, + "grad_norm": 7.989435195922852, + "learning_rate": 2.468937703792689e-05, + "loss": 0.8377, + "step": 2063 + }, + { + "epoch": 0.35, + "grad_norm": 9.306659698486328, + "learning_rate": 2.4686802814484298e-05, + "loss": 0.9904, + "step": 2064 + }, + { + "epoch": 0.35, + "grad_norm": 9.163015365600586, + "learning_rate": 2.46842285910417e-05, + "loss": 1.2342, + "step": 2065 + }, + { + "epoch": 0.35, + "grad_norm": 10.481640815734863, + "learning_rate": 2.4681654367599108e-05, + "loss": 1.0154, + "step": 2066 + }, + { + "epoch": 0.35, + "grad_norm": 8.87024211883545, + "learning_rate": 2.4679080144156514e-05, + "loss": 1.123, + "step": 2067 + }, + { + "epoch": 0.35, + "grad_norm": 11.469578742980957, + "learning_rate": 2.467650592071392e-05, + "loss": 1.4519, + "step": 2068 + }, + { + "epoch": 0.36, + "grad_norm": 8.79355239868164, + "learning_rate": 2.4673931697271324e-05, + "loss": 1.1126, + "step": 2069 + }, + { + "epoch": 0.36, + "grad_norm": 10.070310592651367, + "learning_rate": 2.467135747382873e-05, + "loss": 1.2017, + "step": 2070 + }, + { + "epoch": 0.36, + "grad_norm": 10.575870513916016, + "learning_rate": 2.4668783250386134e-05, + "loss": 1.3892, + "step": 2071 + }, + { + "epoch": 0.36, + "grad_norm": 8.681638717651367, + "learning_rate": 2.4666209026943538e-05, + "loss": 1.4273, + "step": 2072 + }, + { + "epoch": 0.36, + "grad_norm": 9.094696044921875, + "learning_rate": 2.4663634803500944e-05, + "loss": 1.1297, + "step": 2073 + }, + { + "epoch": 0.36, + "grad_norm": 8.590570449829102, + "learning_rate": 2.4661060580058348e-05, + "loss": 1.0709, + "step": 2074 + }, + { + "epoch": 0.36, + "grad_norm": 9.218464851379395, + "learning_rate": 2.4658486356615754e-05, + "loss": 1.2096, + "step": 2075 + }, + { + "epoch": 0.36, + "grad_norm": 10.500626564025879, + "learning_rate": 2.4655912133173158e-05, + "loss": 1.3741, + "step": 2076 + }, + { + "epoch": 0.36, + "grad_norm": 8.936196327209473, + "learning_rate": 2.4653337909730568e-05, + "loss": 1.0692, + "step": 2077 + }, + { + "epoch": 0.36, + "grad_norm": 9.131011009216309, + "learning_rate": 2.465076368628797e-05, + "loss": 1.255, + "step": 2078 + }, + { + "epoch": 0.36, + "grad_norm": 7.559833526611328, + "learning_rate": 2.4648189462845378e-05, + "loss": 1.0649, + "step": 2079 + }, + { + "epoch": 0.36, + "grad_norm": 7.875581741333008, + "learning_rate": 2.464561523940278e-05, + "loss": 1.0418, + "step": 2080 + }, + { + "epoch": 0.36, + "grad_norm": 7.681763172149658, + "learning_rate": 2.4643041015960188e-05, + "loss": 1.1694, + "step": 2081 + }, + { + "epoch": 0.36, + "grad_norm": 8.073424339294434, + "learning_rate": 2.464046679251759e-05, + "loss": 1.0565, + "step": 2082 + }, + { + "epoch": 0.36, + "grad_norm": 9.330310821533203, + "learning_rate": 2.4637892569074994e-05, + "loss": 1.1167, + "step": 2083 + }, + { + "epoch": 0.36, + "grad_norm": 9.346527099609375, + "learning_rate": 2.46353183456324e-05, + "loss": 1.3122, + "step": 2084 + }, + { + "epoch": 0.36, + "grad_norm": 9.09740924835205, + "learning_rate": 2.4632744122189804e-05, + "loss": 1.0779, + "step": 2085 + }, + { + "epoch": 0.36, + "grad_norm": 7.799961566925049, + "learning_rate": 2.4630169898747214e-05, + "loss": 1.0795, + "step": 2086 + }, + { + "epoch": 0.36, + "grad_norm": 9.641633987426758, + "learning_rate": 2.4627595675304618e-05, + "loss": 1.0811, + "step": 2087 + }, + { + "epoch": 0.36, + "grad_norm": 9.578268051147461, + "learning_rate": 2.4625021451862024e-05, + "loss": 1.0085, + "step": 2088 + }, + { + "epoch": 0.36, + "grad_norm": 8.661127090454102, + "learning_rate": 2.4622447228419428e-05, + "loss": 1.0115, + "step": 2089 + }, + { + "epoch": 0.36, + "grad_norm": 10.416635513305664, + "learning_rate": 2.4619873004976834e-05, + "loss": 1.3297, + "step": 2090 + }, + { + "epoch": 0.36, + "grad_norm": 10.808404922485352, + "learning_rate": 2.4617298781534238e-05, + "loss": 1.2156, + "step": 2091 + }, + { + "epoch": 0.36, + "grad_norm": 8.63053035736084, + "learning_rate": 2.461472455809164e-05, + "loss": 1.1493, + "step": 2092 + }, + { + "epoch": 0.36, + "grad_norm": 9.865976333618164, + "learning_rate": 2.4612150334649048e-05, + "loss": 1.2221, + "step": 2093 + }, + { + "epoch": 0.36, + "grad_norm": 8.930879592895508, + "learning_rate": 2.460957611120645e-05, + "loss": 1.1561, + "step": 2094 + }, + { + "epoch": 0.36, + "grad_norm": 10.664885520935059, + "learning_rate": 2.4607001887763857e-05, + "loss": 1.2096, + "step": 2095 + }, + { + "epoch": 0.36, + "grad_norm": 7.464540481567383, + "learning_rate": 2.4604427664321264e-05, + "loss": 0.7703, + "step": 2096 + }, + { + "epoch": 0.36, + "grad_norm": 8.401766777038574, + "learning_rate": 2.460185344087867e-05, + "loss": 1.0532, + "step": 2097 + }, + { + "epoch": 0.36, + "grad_norm": 8.557998657226562, + "learning_rate": 2.4599279217436074e-05, + "loss": 1.0615, + "step": 2098 + }, + { + "epoch": 0.36, + "grad_norm": 9.52142333984375, + "learning_rate": 2.459670499399348e-05, + "loss": 1.373, + "step": 2099 + }, + { + "epoch": 0.36, + "grad_norm": 9.780794143676758, + "learning_rate": 2.4594130770550884e-05, + "loss": 1.3402, + "step": 2100 + }, + { + "epoch": 0.36, + "grad_norm": 8.523842811584473, + "learning_rate": 2.459155654710829e-05, + "loss": 1.0245, + "step": 2101 + }, + { + "epoch": 0.36, + "grad_norm": 7.324873924255371, + "learning_rate": 2.4588982323665694e-05, + "loss": 0.7332, + "step": 2102 + }, + { + "epoch": 0.36, + "grad_norm": 8.999865531921387, + "learning_rate": 2.4586408100223097e-05, + "loss": 1.238, + "step": 2103 + }, + { + "epoch": 0.36, + "grad_norm": 10.391741752624512, + "learning_rate": 2.4583833876780504e-05, + "loss": 0.992, + "step": 2104 + }, + { + "epoch": 0.36, + "grad_norm": 8.214604377746582, + "learning_rate": 2.458125965333791e-05, + "loss": 0.9706, + "step": 2105 + }, + { + "epoch": 0.36, + "grad_norm": 10.142857551574707, + "learning_rate": 2.4578685429895317e-05, + "loss": 1.2553, + "step": 2106 + }, + { + "epoch": 0.36, + "grad_norm": 8.071646690368652, + "learning_rate": 2.457611120645272e-05, + "loss": 0.8326, + "step": 2107 + }, + { + "epoch": 0.36, + "grad_norm": 8.613232612609863, + "learning_rate": 2.4573536983010127e-05, + "loss": 0.7947, + "step": 2108 + }, + { + "epoch": 0.36, + "grad_norm": 9.005168914794922, + "learning_rate": 2.457096275956753e-05, + "loss": 1.0563, + "step": 2109 + }, + { + "epoch": 0.36, + "grad_norm": 9.119216918945312, + "learning_rate": 2.4568388536124937e-05, + "loss": 0.992, + "step": 2110 + }, + { + "epoch": 0.36, + "grad_norm": 8.742559432983398, + "learning_rate": 2.456581431268234e-05, + "loss": 1.1163, + "step": 2111 + }, + { + "epoch": 0.36, + "grad_norm": 9.28771686553955, + "learning_rate": 2.4563240089239747e-05, + "loss": 1.2394, + "step": 2112 + }, + { + "epoch": 0.36, + "grad_norm": 10.60543155670166, + "learning_rate": 2.456066586579715e-05, + "loss": 1.2264, + "step": 2113 + }, + { + "epoch": 0.36, + "grad_norm": 11.486471176147461, + "learning_rate": 2.4558091642354554e-05, + "loss": 1.3191, + "step": 2114 + }, + { + "epoch": 0.36, + "grad_norm": 10.7002534866333, + "learning_rate": 2.4555517418911964e-05, + "loss": 1.0788, + "step": 2115 + }, + { + "epoch": 0.36, + "grad_norm": 8.65157413482666, + "learning_rate": 2.4552943195469367e-05, + "loss": 1.0665, + "step": 2116 + }, + { + "epoch": 0.36, + "grad_norm": 9.475146293640137, + "learning_rate": 2.4550368972026774e-05, + "loss": 1.131, + "step": 2117 + }, + { + "epoch": 0.36, + "grad_norm": 8.699040412902832, + "learning_rate": 2.4547794748584177e-05, + "loss": 1.0983, + "step": 2118 + }, + { + "epoch": 0.36, + "grad_norm": 8.982903480529785, + "learning_rate": 2.4545220525141584e-05, + "loss": 1.1224, + "step": 2119 + }, + { + "epoch": 0.36, + "grad_norm": 8.772966384887695, + "learning_rate": 2.4542646301698987e-05, + "loss": 1.167, + "step": 2120 + }, + { + "epoch": 0.36, + "grad_norm": 9.30195426940918, + "learning_rate": 2.4540072078256394e-05, + "loss": 1.1701, + "step": 2121 + }, + { + "epoch": 0.36, + "grad_norm": 9.401288032531738, + "learning_rate": 2.4537497854813797e-05, + "loss": 1.1435, + "step": 2122 + }, + { + "epoch": 0.36, + "grad_norm": 10.261945724487305, + "learning_rate": 2.45349236313712e-05, + "loss": 1.1808, + "step": 2123 + }, + { + "epoch": 0.36, + "grad_norm": 9.20970630645752, + "learning_rate": 2.453234940792861e-05, + "loss": 1.2177, + "step": 2124 + }, + { + "epoch": 0.36, + "grad_norm": 9.664337158203125, + "learning_rate": 2.4529775184486014e-05, + "loss": 1.0365, + "step": 2125 + }, + { + "epoch": 0.36, + "grad_norm": 8.273855209350586, + "learning_rate": 2.452720096104342e-05, + "loss": 1.1114, + "step": 2126 + }, + { + "epoch": 0.37, + "grad_norm": 9.142509460449219, + "learning_rate": 2.4524626737600824e-05, + "loss": 1.1827, + "step": 2127 + }, + { + "epoch": 0.37, + "grad_norm": 9.21065902709961, + "learning_rate": 2.452205251415823e-05, + "loss": 1.1838, + "step": 2128 + }, + { + "epoch": 0.37, + "grad_norm": 8.823779106140137, + "learning_rate": 2.4519478290715634e-05, + "loss": 1.3231, + "step": 2129 + }, + { + "epoch": 0.37, + "grad_norm": 8.876160621643066, + "learning_rate": 2.451690406727304e-05, + "loss": 1.041, + "step": 2130 + }, + { + "epoch": 0.37, + "grad_norm": 10.582056999206543, + "learning_rate": 2.4514329843830444e-05, + "loss": 1.3357, + "step": 2131 + }, + { + "epoch": 0.37, + "grad_norm": 8.97353744506836, + "learning_rate": 2.451175562038785e-05, + "loss": 0.9429, + "step": 2132 + }, + { + "epoch": 0.37, + "grad_norm": 8.752605438232422, + "learning_rate": 2.4509181396945254e-05, + "loss": 1.0655, + "step": 2133 + }, + { + "epoch": 0.37, + "grad_norm": 11.494879722595215, + "learning_rate": 2.450660717350266e-05, + "loss": 1.2365, + "step": 2134 + }, + { + "epoch": 0.37, + "grad_norm": 9.78968620300293, + "learning_rate": 2.4504032950060067e-05, + "loss": 1.3128, + "step": 2135 + }, + { + "epoch": 0.37, + "grad_norm": 8.948899269104004, + "learning_rate": 2.450145872661747e-05, + "loss": 1.0771, + "step": 2136 + }, + { + "epoch": 0.37, + "grad_norm": 8.892141342163086, + "learning_rate": 2.4498884503174877e-05, + "loss": 1.4076, + "step": 2137 + }, + { + "epoch": 0.37, + "grad_norm": 8.983065605163574, + "learning_rate": 2.449631027973228e-05, + "loss": 1.0138, + "step": 2138 + }, + { + "epoch": 0.37, + "grad_norm": 7.955356121063232, + "learning_rate": 2.4493736056289687e-05, + "loss": 0.8654, + "step": 2139 + }, + { + "epoch": 0.37, + "grad_norm": 8.503843307495117, + "learning_rate": 2.449116183284709e-05, + "loss": 1.051, + "step": 2140 + }, + { + "epoch": 0.37, + "grad_norm": 9.763766288757324, + "learning_rate": 2.4488587609404497e-05, + "loss": 1.3045, + "step": 2141 + }, + { + "epoch": 0.37, + "grad_norm": 8.153258323669434, + "learning_rate": 2.44860133859619e-05, + "loss": 1.0647, + "step": 2142 + }, + { + "epoch": 0.37, + "grad_norm": 10.2325439453125, + "learning_rate": 2.448343916251931e-05, + "loss": 1.2145, + "step": 2143 + }, + { + "epoch": 0.37, + "grad_norm": 9.163873672485352, + "learning_rate": 2.4480864939076714e-05, + "loss": 1.0033, + "step": 2144 + }, + { + "epoch": 0.37, + "grad_norm": 11.364749908447266, + "learning_rate": 2.4478290715634117e-05, + "loss": 1.3655, + "step": 2145 + }, + { + "epoch": 0.37, + "grad_norm": 10.249857902526855, + "learning_rate": 2.4475716492191524e-05, + "loss": 1.2007, + "step": 2146 + }, + { + "epoch": 0.37, + "grad_norm": 8.793172836303711, + "learning_rate": 2.4473142268748927e-05, + "loss": 0.8409, + "step": 2147 + }, + { + "epoch": 0.37, + "grad_norm": 9.970074653625488, + "learning_rate": 2.4470568045306334e-05, + "loss": 1.1618, + "step": 2148 + }, + { + "epoch": 0.37, + "grad_norm": 9.811893463134766, + "learning_rate": 2.4467993821863737e-05, + "loss": 0.9153, + "step": 2149 + }, + { + "epoch": 0.37, + "grad_norm": 9.616089820861816, + "learning_rate": 2.4465419598421144e-05, + "loss": 1.1202, + "step": 2150 + }, + { + "epoch": 0.37, + "grad_norm": 11.34910774230957, + "learning_rate": 2.4462845374978547e-05, + "loss": 1.6053, + "step": 2151 + }, + { + "epoch": 0.37, + "grad_norm": 9.412336349487305, + "learning_rate": 2.4460271151535954e-05, + "loss": 1.3831, + "step": 2152 + }, + { + "epoch": 0.37, + "grad_norm": 8.091730117797852, + "learning_rate": 2.445769692809336e-05, + "loss": 1.0023, + "step": 2153 + }, + { + "epoch": 0.37, + "grad_norm": 9.222784042358398, + "learning_rate": 2.4455122704650764e-05, + "loss": 1.1603, + "step": 2154 + }, + { + "epoch": 0.37, + "grad_norm": 9.691291809082031, + "learning_rate": 2.445254848120817e-05, + "loss": 1.3121, + "step": 2155 + }, + { + "epoch": 0.37, + "grad_norm": 10.616677284240723, + "learning_rate": 2.4449974257765574e-05, + "loss": 1.2645, + "step": 2156 + }, + { + "epoch": 0.37, + "grad_norm": 8.994897842407227, + "learning_rate": 2.444740003432298e-05, + "loss": 1.1881, + "step": 2157 + }, + { + "epoch": 0.37, + "grad_norm": 8.053680419921875, + "learning_rate": 2.4444825810880384e-05, + "loss": 1.1504, + "step": 2158 + }, + { + "epoch": 0.37, + "grad_norm": 8.763884544372559, + "learning_rate": 2.444225158743779e-05, + "loss": 1.1584, + "step": 2159 + }, + { + "epoch": 0.37, + "grad_norm": 9.906607627868652, + "learning_rate": 2.4439677363995194e-05, + "loss": 1.4657, + "step": 2160 + }, + { + "epoch": 0.37, + "grad_norm": 6.504964351654053, + "learning_rate": 2.44371031405526e-05, + "loss": 0.8196, + "step": 2161 + }, + { + "epoch": 0.37, + "grad_norm": 7.369091510772705, + "learning_rate": 2.4434528917110007e-05, + "loss": 0.7597, + "step": 2162 + }, + { + "epoch": 0.37, + "grad_norm": 9.04252815246582, + "learning_rate": 2.4431954693667414e-05, + "loss": 1.0252, + "step": 2163 + }, + { + "epoch": 0.37, + "grad_norm": 8.198497772216797, + "learning_rate": 2.4429380470224817e-05, + "loss": 1.0809, + "step": 2164 + }, + { + "epoch": 0.37, + "grad_norm": 8.258764266967773, + "learning_rate": 2.442680624678222e-05, + "loss": 0.9795, + "step": 2165 + }, + { + "epoch": 0.37, + "grad_norm": 8.062975883483887, + "learning_rate": 2.4424232023339627e-05, + "loss": 1.0093, + "step": 2166 + }, + { + "epoch": 0.37, + "grad_norm": 8.119672775268555, + "learning_rate": 2.442165779989703e-05, + "loss": 0.8311, + "step": 2167 + }, + { + "epoch": 0.37, + "grad_norm": 9.630653381347656, + "learning_rate": 2.4419083576454437e-05, + "loss": 1.0723, + "step": 2168 + }, + { + "epoch": 0.37, + "grad_norm": 11.558615684509277, + "learning_rate": 2.441650935301184e-05, + "loss": 1.5513, + "step": 2169 + }, + { + "epoch": 0.37, + "grad_norm": 10.386838912963867, + "learning_rate": 2.4413935129569247e-05, + "loss": 1.3053, + "step": 2170 + }, + { + "epoch": 0.37, + "grad_norm": 10.350480079650879, + "learning_rate": 2.4411360906126653e-05, + "loss": 1.13, + "step": 2171 + }, + { + "epoch": 0.37, + "grad_norm": 9.781171798706055, + "learning_rate": 2.440878668268406e-05, + "loss": 0.9033, + "step": 2172 + }, + { + "epoch": 0.37, + "grad_norm": 8.367735862731934, + "learning_rate": 2.4406212459241463e-05, + "loss": 1.1802, + "step": 2173 + }, + { + "epoch": 0.37, + "grad_norm": 7.530876636505127, + "learning_rate": 2.440363823579887e-05, + "loss": 0.9118, + "step": 2174 + }, + { + "epoch": 0.37, + "grad_norm": 9.17416000366211, + "learning_rate": 2.4401064012356273e-05, + "loss": 1.0697, + "step": 2175 + }, + { + "epoch": 0.37, + "grad_norm": 9.217011451721191, + "learning_rate": 2.4398489788913677e-05, + "loss": 1.2457, + "step": 2176 + }, + { + "epoch": 0.37, + "grad_norm": 8.692070007324219, + "learning_rate": 2.4395915565471083e-05, + "loss": 1.2613, + "step": 2177 + }, + { + "epoch": 0.37, + "grad_norm": 9.750065803527832, + "learning_rate": 2.4393341342028487e-05, + "loss": 1.2005, + "step": 2178 + }, + { + "epoch": 0.37, + "grad_norm": 9.901527404785156, + "learning_rate": 2.4390767118585893e-05, + "loss": 1.3635, + "step": 2179 + }, + { + "epoch": 0.37, + "grad_norm": 12.055781364440918, + "learning_rate": 2.4388192895143297e-05, + "loss": 1.1935, + "step": 2180 + }, + { + "epoch": 0.37, + "grad_norm": 8.499872207641602, + "learning_rate": 2.4385618671700707e-05, + "loss": 0.9131, + "step": 2181 + }, + { + "epoch": 0.37, + "grad_norm": 10.323253631591797, + "learning_rate": 2.438304444825811e-05, + "loss": 1.2284, + "step": 2182 + }, + { + "epoch": 0.37, + "grad_norm": 9.72081470489502, + "learning_rate": 2.4380470224815517e-05, + "loss": 1.2344, + "step": 2183 + }, + { + "epoch": 0.37, + "grad_norm": 10.034542083740234, + "learning_rate": 2.437789600137292e-05, + "loss": 1.2939, + "step": 2184 + }, + { + "epoch": 0.37, + "grad_norm": 8.844457626342773, + "learning_rate": 2.4375321777930327e-05, + "loss": 0.8724, + "step": 2185 + }, + { + "epoch": 0.38, + "grad_norm": 8.399674415588379, + "learning_rate": 2.437274755448773e-05, + "loss": 1.0371, + "step": 2186 + }, + { + "epoch": 0.38, + "grad_norm": 8.00466537475586, + "learning_rate": 2.4370173331045133e-05, + "loss": 1.0131, + "step": 2187 + }, + { + "epoch": 0.38, + "grad_norm": 8.96268081665039, + "learning_rate": 2.436759910760254e-05, + "loss": 1.0904, + "step": 2188 + }, + { + "epoch": 0.38, + "grad_norm": 9.096373558044434, + "learning_rate": 2.4365024884159943e-05, + "loss": 1.0727, + "step": 2189 + }, + { + "epoch": 0.38, + "grad_norm": 9.763750076293945, + "learning_rate": 2.4362450660717353e-05, + "loss": 1.1843, + "step": 2190 + }, + { + "epoch": 0.38, + "grad_norm": 10.026836395263672, + "learning_rate": 2.4359876437274757e-05, + "loss": 1.0918, + "step": 2191 + }, + { + "epoch": 0.38, + "grad_norm": 8.820173263549805, + "learning_rate": 2.4357302213832163e-05, + "loss": 1.0206, + "step": 2192 + }, + { + "epoch": 0.38, + "grad_norm": 10.105895042419434, + "learning_rate": 2.4354727990389567e-05, + "loss": 1.1331, + "step": 2193 + }, + { + "epoch": 0.38, + "grad_norm": 8.734973907470703, + "learning_rate": 2.4352153766946973e-05, + "loss": 1.0124, + "step": 2194 + }, + { + "epoch": 0.38, + "grad_norm": 9.105067253112793, + "learning_rate": 2.4349579543504377e-05, + "loss": 1.0347, + "step": 2195 + }, + { + "epoch": 0.38, + "grad_norm": 10.364143371582031, + "learning_rate": 2.434700532006178e-05, + "loss": 1.2429, + "step": 2196 + }, + { + "epoch": 0.38, + "grad_norm": 8.511385917663574, + "learning_rate": 2.4344431096619187e-05, + "loss": 1.1924, + "step": 2197 + }, + { + "epoch": 0.38, + "grad_norm": 9.025053024291992, + "learning_rate": 2.434185687317659e-05, + "loss": 1.1453, + "step": 2198 + }, + { + "epoch": 0.38, + "grad_norm": 8.77366828918457, + "learning_rate": 2.4339282649733997e-05, + "loss": 1.1927, + "step": 2199 + }, + { + "epoch": 0.38, + "grad_norm": 8.097921371459961, + "learning_rate": 2.4336708426291403e-05, + "loss": 1.1981, + "step": 2200 + }, + { + "epoch": 0.38, + "grad_norm": 9.000970840454102, + "learning_rate": 2.433413420284881e-05, + "loss": 1.0992, + "step": 2201 + }, + { + "epoch": 0.38, + "grad_norm": 10.089954376220703, + "learning_rate": 2.4331559979406213e-05, + "loss": 1.192, + "step": 2202 + }, + { + "epoch": 0.38, + "grad_norm": 9.446324348449707, + "learning_rate": 2.432898575596362e-05, + "loss": 1.102, + "step": 2203 + }, + { + "epoch": 0.38, + "grad_norm": 9.706451416015625, + "learning_rate": 2.4326411532521023e-05, + "loss": 1.1809, + "step": 2204 + }, + { + "epoch": 0.38, + "grad_norm": 10.350419998168945, + "learning_rate": 2.432383730907843e-05, + "loss": 1.292, + "step": 2205 + }, + { + "epoch": 0.38, + "grad_norm": 9.564776420593262, + "learning_rate": 2.4321263085635833e-05, + "loss": 0.9112, + "step": 2206 + }, + { + "epoch": 0.38, + "grad_norm": 8.59047794342041, + "learning_rate": 2.4318688862193236e-05, + "loss": 0.85, + "step": 2207 + }, + { + "epoch": 0.38, + "grad_norm": 8.712336540222168, + "learning_rate": 2.4316114638750643e-05, + "loss": 1.1457, + "step": 2208 + }, + { + "epoch": 0.38, + "grad_norm": 8.856526374816895, + "learning_rate": 2.431354041530805e-05, + "loss": 1.2405, + "step": 2209 + }, + { + "epoch": 0.38, + "grad_norm": 9.182084083557129, + "learning_rate": 2.4310966191865456e-05, + "loss": 0.9503, + "step": 2210 + }, + { + "epoch": 0.38, + "grad_norm": 10.732983589172363, + "learning_rate": 2.430839196842286e-05, + "loss": 1.1711, + "step": 2211 + }, + { + "epoch": 0.38, + "grad_norm": 10.850709915161133, + "learning_rate": 2.4305817744980266e-05, + "loss": 1.2098, + "step": 2212 + }, + { + "epoch": 0.38, + "grad_norm": 8.524785995483398, + "learning_rate": 2.430324352153767e-05, + "loss": 0.9561, + "step": 2213 + }, + { + "epoch": 0.38, + "grad_norm": 10.560460090637207, + "learning_rate": 2.4300669298095076e-05, + "loss": 1.1777, + "step": 2214 + }, + { + "epoch": 0.38, + "grad_norm": 10.62714958190918, + "learning_rate": 2.429809507465248e-05, + "loss": 1.1986, + "step": 2215 + }, + { + "epoch": 0.38, + "grad_norm": 13.657073020935059, + "learning_rate": 2.4295520851209886e-05, + "loss": 1.5808, + "step": 2216 + }, + { + "epoch": 0.38, + "grad_norm": 10.352283477783203, + "learning_rate": 2.429294662776729e-05, + "loss": 1.2082, + "step": 2217 + }, + { + "epoch": 0.38, + "grad_norm": 9.29774284362793, + "learning_rate": 2.4290372404324693e-05, + "loss": 1.0112, + "step": 2218 + }, + { + "epoch": 0.38, + "grad_norm": 10.176529884338379, + "learning_rate": 2.4287798180882103e-05, + "loss": 1.2053, + "step": 2219 + }, + { + "epoch": 0.38, + "grad_norm": 10.629195213317871, + "learning_rate": 2.4285223957439506e-05, + "loss": 1.4169, + "step": 2220 + }, + { + "epoch": 0.38, + "grad_norm": 9.850541114807129, + "learning_rate": 2.4282649733996913e-05, + "loss": 1.3332, + "step": 2221 + }, + { + "epoch": 0.38, + "grad_norm": 8.906332015991211, + "learning_rate": 2.4280075510554316e-05, + "loss": 0.9488, + "step": 2222 + }, + { + "epoch": 0.38, + "grad_norm": 8.36665153503418, + "learning_rate": 2.4277501287111723e-05, + "loss": 0.9325, + "step": 2223 + }, + { + "epoch": 0.38, + "grad_norm": 8.38780689239502, + "learning_rate": 2.4274927063669126e-05, + "loss": 1.0467, + "step": 2224 + }, + { + "epoch": 0.38, + "grad_norm": 7.8286662101745605, + "learning_rate": 2.4272352840226533e-05, + "loss": 0.9573, + "step": 2225 + }, + { + "epoch": 0.38, + "grad_norm": 7.623621940612793, + "learning_rate": 2.4269778616783936e-05, + "loss": 0.9291, + "step": 2226 + }, + { + "epoch": 0.38, + "grad_norm": 7.773752212524414, + "learning_rate": 2.426720439334134e-05, + "loss": 1.127, + "step": 2227 + }, + { + "epoch": 0.38, + "grad_norm": 9.005531311035156, + "learning_rate": 2.426463016989875e-05, + "loss": 1.2232, + "step": 2228 + }, + { + "epoch": 0.38, + "grad_norm": 9.158402442932129, + "learning_rate": 2.4262055946456153e-05, + "loss": 1.0502, + "step": 2229 + }, + { + "epoch": 0.38, + "grad_norm": 8.19765853881836, + "learning_rate": 2.425948172301356e-05, + "loss": 0.9553, + "step": 2230 + }, + { + "epoch": 0.38, + "grad_norm": 8.267332077026367, + "learning_rate": 2.4256907499570963e-05, + "loss": 0.8448, + "step": 2231 + }, + { + "epoch": 0.38, + "grad_norm": 8.809307098388672, + "learning_rate": 2.425433327612837e-05, + "loss": 0.9016, + "step": 2232 + }, + { + "epoch": 0.38, + "grad_norm": 9.3159818649292, + "learning_rate": 2.4251759052685773e-05, + "loss": 1.0381, + "step": 2233 + }, + { + "epoch": 0.38, + "grad_norm": 9.401824951171875, + "learning_rate": 2.424918482924318e-05, + "loss": 1.15, + "step": 2234 + }, + { + "epoch": 0.38, + "grad_norm": 11.003650665283203, + "learning_rate": 2.4246610605800583e-05, + "loss": 0.8876, + "step": 2235 + }, + { + "epoch": 0.38, + "grad_norm": 10.029595375061035, + "learning_rate": 2.424403638235799e-05, + "loss": 0.981, + "step": 2236 + }, + { + "epoch": 0.38, + "grad_norm": 10.966934204101562, + "learning_rate": 2.4241462158915393e-05, + "loss": 1.181, + "step": 2237 + }, + { + "epoch": 0.38, + "grad_norm": 11.154629707336426, + "learning_rate": 2.42388879354728e-05, + "loss": 1.3361, + "step": 2238 + }, + { + "epoch": 0.38, + "grad_norm": 8.800402641296387, + "learning_rate": 2.4236313712030206e-05, + "loss": 1.1646, + "step": 2239 + }, + { + "epoch": 0.38, + "grad_norm": 8.770148277282715, + "learning_rate": 2.423373948858761e-05, + "loss": 1.1305, + "step": 2240 + }, + { + "epoch": 0.38, + "grad_norm": 9.895112037658691, + "learning_rate": 2.4231165265145016e-05, + "loss": 1.1084, + "step": 2241 + }, + { + "epoch": 0.38, + "grad_norm": 7.0063605308532715, + "learning_rate": 2.422859104170242e-05, + "loss": 0.6602, + "step": 2242 + }, + { + "epoch": 0.38, + "grad_norm": 10.963147163391113, + "learning_rate": 2.4226016818259826e-05, + "loss": 0.9376, + "step": 2243 + }, + { + "epoch": 0.39, + "grad_norm": 8.689924240112305, + "learning_rate": 2.422344259481723e-05, + "loss": 0.9693, + "step": 2244 + }, + { + "epoch": 0.39, + "grad_norm": 8.47618579864502, + "learning_rate": 2.4220868371374636e-05, + "loss": 1.0734, + "step": 2245 + }, + { + "epoch": 0.39, + "grad_norm": 9.367137908935547, + "learning_rate": 2.421829414793204e-05, + "loss": 1.1059, + "step": 2246 + }, + { + "epoch": 0.39, + "grad_norm": 9.199468612670898, + "learning_rate": 2.421571992448945e-05, + "loss": 1.0812, + "step": 2247 + }, + { + "epoch": 0.39, + "grad_norm": 8.501741409301758, + "learning_rate": 2.4213145701046853e-05, + "loss": 1.1653, + "step": 2248 + }, + { + "epoch": 0.39, + "grad_norm": 9.0991792678833, + "learning_rate": 2.4210571477604256e-05, + "loss": 1.2308, + "step": 2249 + }, + { + "epoch": 0.39, + "grad_norm": 11.76620864868164, + "learning_rate": 2.4207997254161663e-05, + "loss": 1.2191, + "step": 2250 + }, + { + "epoch": 0.39, + "grad_norm": 9.74355697631836, + "learning_rate": 2.4205423030719066e-05, + "loss": 1.1048, + "step": 2251 + }, + { + "epoch": 0.39, + "grad_norm": 9.557761192321777, + "learning_rate": 2.4202848807276473e-05, + "loss": 1.1524, + "step": 2252 + }, + { + "epoch": 0.39, + "grad_norm": 9.952515602111816, + "learning_rate": 2.4200274583833876e-05, + "loss": 1.2135, + "step": 2253 + }, + { + "epoch": 0.39, + "grad_norm": 8.356393814086914, + "learning_rate": 2.4197700360391283e-05, + "loss": 0.8388, + "step": 2254 + }, + { + "epoch": 0.39, + "grad_norm": 9.512103080749512, + "learning_rate": 2.4195126136948686e-05, + "loss": 0.9181, + "step": 2255 + }, + { + "epoch": 0.39, + "grad_norm": 9.448094367980957, + "learning_rate": 2.4192551913506093e-05, + "loss": 0.994, + "step": 2256 + }, + { + "epoch": 0.39, + "grad_norm": 11.41581916809082, + "learning_rate": 2.41899776900635e-05, + "loss": 1.4013, + "step": 2257 + }, + { + "epoch": 0.39, + "grad_norm": 8.822907447814941, + "learning_rate": 2.4187403466620903e-05, + "loss": 0.9049, + "step": 2258 + }, + { + "epoch": 0.39, + "grad_norm": 8.646187782287598, + "learning_rate": 2.418482924317831e-05, + "loss": 0.9902, + "step": 2259 + }, + { + "epoch": 0.39, + "grad_norm": 8.040304183959961, + "learning_rate": 2.4182255019735713e-05, + "loss": 0.9898, + "step": 2260 + }, + { + "epoch": 0.39, + "grad_norm": 9.171103477478027, + "learning_rate": 2.417968079629312e-05, + "loss": 1.1363, + "step": 2261 + }, + { + "epoch": 0.39, + "grad_norm": 7.7490553855896, + "learning_rate": 2.4177106572850523e-05, + "loss": 1.0245, + "step": 2262 + }, + { + "epoch": 0.39, + "grad_norm": 9.144671440124512, + "learning_rate": 2.417453234940793e-05, + "loss": 1.1563, + "step": 2263 + }, + { + "epoch": 0.39, + "grad_norm": 7.781315803527832, + "learning_rate": 2.4171958125965333e-05, + "loss": 1.0074, + "step": 2264 + }, + { + "epoch": 0.39, + "grad_norm": 10.80992603302002, + "learning_rate": 2.416938390252274e-05, + "loss": 1.0563, + "step": 2265 + }, + { + "epoch": 0.39, + "grad_norm": 9.283391952514648, + "learning_rate": 2.4166809679080146e-05, + "loss": 1.0983, + "step": 2266 + }, + { + "epoch": 0.39, + "grad_norm": 8.335240364074707, + "learning_rate": 2.4164235455637553e-05, + "loss": 0.9964, + "step": 2267 + }, + { + "epoch": 0.39, + "grad_norm": 10.979182243347168, + "learning_rate": 2.4161661232194956e-05, + "loss": 1.0521, + "step": 2268 + }, + { + "epoch": 0.39, + "grad_norm": 10.834734916687012, + "learning_rate": 2.415908700875236e-05, + "loss": 1.3008, + "step": 2269 + }, + { + "epoch": 0.39, + "grad_norm": 9.852832794189453, + "learning_rate": 2.4156512785309766e-05, + "loss": 1.1512, + "step": 2270 + }, + { + "epoch": 0.39, + "grad_norm": 9.862049102783203, + "learning_rate": 2.415393856186717e-05, + "loss": 1.3111, + "step": 2271 + }, + { + "epoch": 0.39, + "grad_norm": 8.671493530273438, + "learning_rate": 2.4151364338424576e-05, + "loss": 0.907, + "step": 2272 + }, + { + "epoch": 0.39, + "grad_norm": 10.944957733154297, + "learning_rate": 2.414879011498198e-05, + "loss": 1.2896, + "step": 2273 + }, + { + "epoch": 0.39, + "grad_norm": 10.786681175231934, + "learning_rate": 2.4146215891539386e-05, + "loss": 1.1686, + "step": 2274 + }, + { + "epoch": 0.39, + "grad_norm": 14.018754005432129, + "learning_rate": 2.4143641668096792e-05, + "loss": 1.1924, + "step": 2275 + }, + { + "epoch": 0.39, + "grad_norm": 9.193449974060059, + "learning_rate": 2.41410674446542e-05, + "loss": 1.1071, + "step": 2276 + }, + { + "epoch": 0.39, + "grad_norm": 9.369192123413086, + "learning_rate": 2.4138493221211602e-05, + "loss": 1.2097, + "step": 2277 + }, + { + "epoch": 0.39, + "grad_norm": 9.837075233459473, + "learning_rate": 2.413591899776901e-05, + "loss": 1.1033, + "step": 2278 + }, + { + "epoch": 0.39, + "grad_norm": 10.052380561828613, + "learning_rate": 2.4133344774326412e-05, + "loss": 1.0338, + "step": 2279 + }, + { + "epoch": 0.39, + "grad_norm": 9.481216430664062, + "learning_rate": 2.4130770550883816e-05, + "loss": 1.1987, + "step": 2280 + }, + { + "epoch": 0.39, + "grad_norm": 9.217461585998535, + "learning_rate": 2.4128196327441222e-05, + "loss": 1.4332, + "step": 2281 + }, + { + "epoch": 0.39, + "grad_norm": 7.972358703613281, + "learning_rate": 2.4125622103998626e-05, + "loss": 0.7925, + "step": 2282 + }, + { + "epoch": 0.39, + "grad_norm": 10.535296440124512, + "learning_rate": 2.4123047880556032e-05, + "loss": 1.5184, + "step": 2283 + }, + { + "epoch": 0.39, + "grad_norm": 8.570453643798828, + "learning_rate": 2.4120473657113436e-05, + "loss": 1.0196, + "step": 2284 + }, + { + "epoch": 0.39, + "grad_norm": 8.724305152893066, + "learning_rate": 2.4117899433670846e-05, + "loss": 1.0131, + "step": 2285 + }, + { + "epoch": 0.39, + "grad_norm": 9.26595687866211, + "learning_rate": 2.411532521022825e-05, + "loss": 1.1198, + "step": 2286 + }, + { + "epoch": 0.39, + "grad_norm": 8.108465194702148, + "learning_rate": 2.4112750986785656e-05, + "loss": 0.8719, + "step": 2287 + }, + { + "epoch": 0.39, + "grad_norm": 9.542659759521484, + "learning_rate": 2.411017676334306e-05, + "loss": 1.0413, + "step": 2288 + }, + { + "epoch": 0.39, + "grad_norm": 8.4580659866333, + "learning_rate": 2.4107602539900466e-05, + "loss": 1.186, + "step": 2289 + }, + { + "epoch": 0.39, + "grad_norm": 8.495659828186035, + "learning_rate": 2.410502831645787e-05, + "loss": 0.9637, + "step": 2290 + }, + { + "epoch": 0.39, + "grad_norm": 10.1148042678833, + "learning_rate": 2.4102454093015272e-05, + "loss": 1.3322, + "step": 2291 + }, + { + "epoch": 0.39, + "grad_norm": 9.231887817382812, + "learning_rate": 2.409987986957268e-05, + "loss": 1.1449, + "step": 2292 + }, + { + "epoch": 0.39, + "grad_norm": 7.8561177253723145, + "learning_rate": 2.4097305646130082e-05, + "loss": 1.1552, + "step": 2293 + }, + { + "epoch": 0.39, + "grad_norm": 9.66640853881836, + "learning_rate": 2.4094731422687492e-05, + "loss": 1.3027, + "step": 2294 + }, + { + "epoch": 0.39, + "grad_norm": 9.01046371459961, + "learning_rate": 2.4092157199244896e-05, + "loss": 1.2658, + "step": 2295 + }, + { + "epoch": 0.39, + "grad_norm": 10.291804313659668, + "learning_rate": 2.4089582975802302e-05, + "loss": 1.3789, + "step": 2296 + }, + { + "epoch": 0.39, + "grad_norm": 10.136520385742188, + "learning_rate": 2.4087008752359706e-05, + "loss": 1.2063, + "step": 2297 + }, + { + "epoch": 0.39, + "grad_norm": 10.098982810974121, + "learning_rate": 2.4084434528917112e-05, + "loss": 1.0684, + "step": 2298 + }, + { + "epoch": 0.39, + "grad_norm": 10.366686820983887, + "learning_rate": 2.4081860305474516e-05, + "loss": 0.8837, + "step": 2299 + }, + { + "epoch": 0.39, + "grad_norm": 9.365204811096191, + "learning_rate": 2.407928608203192e-05, + "loss": 1.1671, + "step": 2300 + }, + { + "epoch": 0.39, + "grad_norm": 8.287782669067383, + "learning_rate": 2.4076711858589326e-05, + "loss": 0.9609, + "step": 2301 + }, + { + "epoch": 0.4, + "grad_norm": 9.816483497619629, + "learning_rate": 2.407413763514673e-05, + "loss": 1.068, + "step": 2302 + }, + { + "epoch": 0.4, + "grad_norm": 8.215349197387695, + "learning_rate": 2.4071563411704136e-05, + "loss": 0.9013, + "step": 2303 + }, + { + "epoch": 0.4, + "grad_norm": 10.46806812286377, + "learning_rate": 2.4068989188261542e-05, + "loss": 1.2445, + "step": 2304 + }, + { + "epoch": 0.4, + "grad_norm": 8.204339981079102, + "learning_rate": 2.406641496481895e-05, + "loss": 1.0809, + "step": 2305 + }, + { + "epoch": 0.4, + "grad_norm": 10.090676307678223, + "learning_rate": 2.4063840741376352e-05, + "loss": 1.0355, + "step": 2306 + }, + { + "epoch": 0.4, + "grad_norm": 8.966399192810059, + "learning_rate": 2.406126651793376e-05, + "loss": 1.3129, + "step": 2307 + }, + { + "epoch": 0.4, + "grad_norm": 9.648179054260254, + "learning_rate": 2.4058692294491162e-05, + "loss": 1.1391, + "step": 2308 + }, + { + "epoch": 0.4, + "grad_norm": 9.937187194824219, + "learning_rate": 2.405611807104857e-05, + "loss": 1.1248, + "step": 2309 + }, + { + "epoch": 0.4, + "grad_norm": 8.938966751098633, + "learning_rate": 2.4053543847605972e-05, + "loss": 0.9587, + "step": 2310 + }, + { + "epoch": 0.4, + "grad_norm": 9.952705383300781, + "learning_rate": 2.4050969624163375e-05, + "loss": 1.0574, + "step": 2311 + }, + { + "epoch": 0.4, + "grad_norm": 8.699490547180176, + "learning_rate": 2.4048395400720782e-05, + "loss": 1.1071, + "step": 2312 + }, + { + "epoch": 0.4, + "grad_norm": 9.558082580566406, + "learning_rate": 2.404582117727819e-05, + "loss": 1.023, + "step": 2313 + }, + { + "epoch": 0.4, + "grad_norm": 9.829763412475586, + "learning_rate": 2.4043246953835595e-05, + "loss": 1.1763, + "step": 2314 + }, + { + "epoch": 0.4, + "grad_norm": 8.652128219604492, + "learning_rate": 2.4040672730393e-05, + "loss": 0.9157, + "step": 2315 + }, + { + "epoch": 0.4, + "grad_norm": 8.757291793823242, + "learning_rate": 2.4038098506950405e-05, + "loss": 1.2526, + "step": 2316 + }, + { + "epoch": 0.4, + "grad_norm": 7.581213474273682, + "learning_rate": 2.403552428350781e-05, + "loss": 0.9489, + "step": 2317 + }, + { + "epoch": 0.4, + "grad_norm": 9.630509376525879, + "learning_rate": 2.4032950060065215e-05, + "loss": 0.9891, + "step": 2318 + }, + { + "epoch": 0.4, + "grad_norm": 7.893701553344727, + "learning_rate": 2.403037583662262e-05, + "loss": 0.8387, + "step": 2319 + }, + { + "epoch": 0.4, + "grad_norm": 10.503652572631836, + "learning_rate": 2.4027801613180025e-05, + "loss": 1.1652, + "step": 2320 + }, + { + "epoch": 0.4, + "grad_norm": 8.663091659545898, + "learning_rate": 2.402522738973743e-05, + "loss": 1.0581, + "step": 2321 + }, + { + "epoch": 0.4, + "grad_norm": 9.012900352478027, + "learning_rate": 2.4022653166294832e-05, + "loss": 1.1648, + "step": 2322 + }, + { + "epoch": 0.4, + "grad_norm": 9.778726577758789, + "learning_rate": 2.4020078942852242e-05, + "loss": 1.1431, + "step": 2323 + }, + { + "epoch": 0.4, + "grad_norm": 9.202214241027832, + "learning_rate": 2.4017504719409645e-05, + "loss": 1.0112, + "step": 2324 + }, + { + "epoch": 0.4, + "grad_norm": 10.303224563598633, + "learning_rate": 2.4014930495967052e-05, + "loss": 1.2525, + "step": 2325 + }, + { + "epoch": 0.4, + "grad_norm": 11.540024757385254, + "learning_rate": 2.4012356272524455e-05, + "loss": 1.4424, + "step": 2326 + }, + { + "epoch": 0.4, + "grad_norm": 8.628973007202148, + "learning_rate": 2.4009782049081862e-05, + "loss": 0.8955, + "step": 2327 + }, + { + "epoch": 0.4, + "grad_norm": 7.897580623626709, + "learning_rate": 2.4007207825639265e-05, + "loss": 1.3406, + "step": 2328 + }, + { + "epoch": 0.4, + "grad_norm": 8.044696807861328, + "learning_rate": 2.4004633602196672e-05, + "loss": 1.0739, + "step": 2329 + }, + { + "epoch": 0.4, + "grad_norm": 9.221452713012695, + "learning_rate": 2.4002059378754075e-05, + "loss": 1.1278, + "step": 2330 + }, + { + "epoch": 0.4, + "grad_norm": 10.01343822479248, + "learning_rate": 2.399948515531148e-05, + "loss": 1.2545, + "step": 2331 + }, + { + "epoch": 0.4, + "grad_norm": 8.265592575073242, + "learning_rate": 2.399691093186889e-05, + "loss": 1.0236, + "step": 2332 + }, + { + "epoch": 0.4, + "grad_norm": 7.603173732757568, + "learning_rate": 2.3994336708426292e-05, + "loss": 0.8957, + "step": 2333 + }, + { + "epoch": 0.4, + "grad_norm": 10.266589164733887, + "learning_rate": 2.39917624849837e-05, + "loss": 1.4813, + "step": 2334 + }, + { + "epoch": 0.4, + "grad_norm": 9.32174015045166, + "learning_rate": 2.3989188261541102e-05, + "loss": 1.3589, + "step": 2335 + }, + { + "epoch": 0.4, + "grad_norm": 7.692406177520752, + "learning_rate": 2.398661403809851e-05, + "loss": 1.0378, + "step": 2336 + }, + { + "epoch": 0.4, + "grad_norm": 9.303297996520996, + "learning_rate": 2.3984039814655912e-05, + "loss": 1.4373, + "step": 2337 + }, + { + "epoch": 0.4, + "grad_norm": 8.728151321411133, + "learning_rate": 2.398146559121332e-05, + "loss": 1.121, + "step": 2338 + }, + { + "epoch": 0.4, + "grad_norm": 9.557869911193848, + "learning_rate": 2.3978891367770722e-05, + "loss": 1.101, + "step": 2339 + }, + { + "epoch": 0.4, + "grad_norm": 8.885689735412598, + "learning_rate": 2.397631714432813e-05, + "loss": 0.8533, + "step": 2340 + }, + { + "epoch": 0.4, + "grad_norm": 10.080730438232422, + "learning_rate": 2.3973742920885532e-05, + "loss": 1.2499, + "step": 2341 + }, + { + "epoch": 0.4, + "grad_norm": 8.192622184753418, + "learning_rate": 2.397116869744294e-05, + "loss": 0.8441, + "step": 2342 + }, + { + "epoch": 0.4, + "grad_norm": 8.07056999206543, + "learning_rate": 2.3968594474000345e-05, + "loss": 1.0561, + "step": 2343 + }, + { + "epoch": 0.4, + "grad_norm": 9.36457347869873, + "learning_rate": 2.396602025055775e-05, + "loss": 1.0302, + "step": 2344 + }, + { + "epoch": 0.4, + "grad_norm": 11.434206008911133, + "learning_rate": 2.3963446027115155e-05, + "loss": 1.065, + "step": 2345 + }, + { + "epoch": 0.4, + "grad_norm": 9.678196907043457, + "learning_rate": 2.396087180367256e-05, + "loss": 1.1172, + "step": 2346 + }, + { + "epoch": 0.4, + "grad_norm": 9.543830871582031, + "learning_rate": 2.3958297580229965e-05, + "loss": 0.9344, + "step": 2347 + }, + { + "epoch": 0.4, + "grad_norm": 10.137594223022461, + "learning_rate": 2.395572335678737e-05, + "loss": 0.9821, + "step": 2348 + }, + { + "epoch": 0.4, + "grad_norm": 9.554171562194824, + "learning_rate": 2.3953149133344775e-05, + "loss": 0.8165, + "step": 2349 + }, + { + "epoch": 0.4, + "grad_norm": 10.486687660217285, + "learning_rate": 2.395057490990218e-05, + "loss": 1.0482, + "step": 2350 + }, + { + "epoch": 0.4, + "grad_norm": 8.66871166229248, + "learning_rate": 2.394800068645959e-05, + "loss": 0.936, + "step": 2351 + }, + { + "epoch": 0.4, + "grad_norm": 8.195672035217285, + "learning_rate": 2.3945426463016992e-05, + "loss": 1.0888, + "step": 2352 + }, + { + "epoch": 0.4, + "grad_norm": 8.308737754821777, + "learning_rate": 2.3942852239574395e-05, + "loss": 1.037, + "step": 2353 + }, + { + "epoch": 0.4, + "grad_norm": 9.782885551452637, + "learning_rate": 2.3940278016131802e-05, + "loss": 1.0355, + "step": 2354 + }, + { + "epoch": 0.4, + "grad_norm": 9.690522193908691, + "learning_rate": 2.3937703792689205e-05, + "loss": 1.1222, + "step": 2355 + }, + { + "epoch": 0.4, + "grad_norm": 9.021770477294922, + "learning_rate": 2.3935129569246612e-05, + "loss": 1.0931, + "step": 2356 + }, + { + "epoch": 0.4, + "grad_norm": 10.61075210571289, + "learning_rate": 2.3932555345804015e-05, + "loss": 0.9862, + "step": 2357 + }, + { + "epoch": 0.4, + "grad_norm": 7.538852214813232, + "learning_rate": 2.392998112236142e-05, + "loss": 0.8301, + "step": 2358 + }, + { + "epoch": 0.4, + "grad_norm": 9.294318199157715, + "learning_rate": 2.3927406898918825e-05, + "loss": 1.1885, + "step": 2359 + }, + { + "epoch": 0.41, + "grad_norm": 8.791847229003906, + "learning_rate": 2.392483267547623e-05, + "loss": 0.931, + "step": 2360 + }, + { + "epoch": 0.41, + "grad_norm": 9.116369247436523, + "learning_rate": 2.392225845203364e-05, + "loss": 0.9284, + "step": 2361 + }, + { + "epoch": 0.41, + "grad_norm": 8.191892623901367, + "learning_rate": 2.391968422859104e-05, + "loss": 0.647, + "step": 2362 + }, + { + "epoch": 0.41, + "grad_norm": 8.659319877624512, + "learning_rate": 2.391711000514845e-05, + "loss": 1.1507, + "step": 2363 + }, + { + "epoch": 0.41, + "grad_norm": 8.853606224060059, + "learning_rate": 2.391453578170585e-05, + "loss": 0.8916, + "step": 2364 + }, + { + "epoch": 0.41, + "grad_norm": 10.060551643371582, + "learning_rate": 2.3911961558263258e-05, + "loss": 0.9975, + "step": 2365 + }, + { + "epoch": 0.41, + "grad_norm": 9.454354286193848, + "learning_rate": 2.390938733482066e-05, + "loss": 1.3043, + "step": 2366 + }, + { + "epoch": 0.41, + "grad_norm": 10.785755157470703, + "learning_rate": 2.3906813111378068e-05, + "loss": 1.52, + "step": 2367 + }, + { + "epoch": 0.41, + "grad_norm": 9.606592178344727, + "learning_rate": 2.390423888793547e-05, + "loss": 0.897, + "step": 2368 + }, + { + "epoch": 0.41, + "grad_norm": 9.56293773651123, + "learning_rate": 2.3901664664492878e-05, + "loss": 1.0566, + "step": 2369 + }, + { + "epoch": 0.41, + "grad_norm": 12.36727523803711, + "learning_rate": 2.3899090441050285e-05, + "loss": 1.3117, + "step": 2370 + }, + { + "epoch": 0.41, + "grad_norm": 11.051411628723145, + "learning_rate": 2.389651621760769e-05, + "loss": 1.1332, + "step": 2371 + }, + { + "epoch": 0.41, + "grad_norm": 10.34546947479248, + "learning_rate": 2.3893941994165095e-05, + "loss": 1.2682, + "step": 2372 + }, + { + "epoch": 0.41, + "grad_norm": 8.438085556030273, + "learning_rate": 2.3891367770722498e-05, + "loss": 0.9392, + "step": 2373 + }, + { + "epoch": 0.41, + "grad_norm": 9.597270011901855, + "learning_rate": 2.3888793547279905e-05, + "loss": 1.0658, + "step": 2374 + }, + { + "epoch": 0.41, + "grad_norm": 9.972052574157715, + "learning_rate": 2.3886219323837308e-05, + "loss": 1.2095, + "step": 2375 + }, + { + "epoch": 0.41, + "grad_norm": 9.842507362365723, + "learning_rate": 2.3883645100394715e-05, + "loss": 1.201, + "step": 2376 + }, + { + "epoch": 0.41, + "grad_norm": 8.305310249328613, + "learning_rate": 2.3881070876952118e-05, + "loss": 1.0585, + "step": 2377 + }, + { + "epoch": 0.41, + "grad_norm": 9.667527198791504, + "learning_rate": 2.3878496653509525e-05, + "loss": 1.1337, + "step": 2378 + }, + { + "epoch": 0.41, + "grad_norm": 9.113306045532227, + "learning_rate": 2.3875922430066928e-05, + "loss": 1.1791, + "step": 2379 + }, + { + "epoch": 0.41, + "grad_norm": 8.53714370727539, + "learning_rate": 2.3873348206624338e-05, + "loss": 1.1967, + "step": 2380 + }, + { + "epoch": 0.41, + "grad_norm": 9.78366470336914, + "learning_rate": 2.387077398318174e-05, + "loss": 1.4728, + "step": 2381 + }, + { + "epoch": 0.41, + "grad_norm": 10.642723083496094, + "learning_rate": 2.3868199759739148e-05, + "loss": 1.2941, + "step": 2382 + }, + { + "epoch": 0.41, + "grad_norm": 9.167917251586914, + "learning_rate": 2.386562553629655e-05, + "loss": 1.1403, + "step": 2383 + }, + { + "epoch": 0.41, + "grad_norm": 9.521174430847168, + "learning_rate": 2.3863051312853955e-05, + "loss": 1.1917, + "step": 2384 + }, + { + "epoch": 0.41, + "grad_norm": 9.093134880065918, + "learning_rate": 2.386047708941136e-05, + "loss": 1.3338, + "step": 2385 + }, + { + "epoch": 0.41, + "grad_norm": 8.815524101257324, + "learning_rate": 2.3857902865968765e-05, + "loss": 1.1557, + "step": 2386 + }, + { + "epoch": 0.41, + "grad_norm": 7.5319743156433105, + "learning_rate": 2.385532864252617e-05, + "loss": 0.9474, + "step": 2387 + }, + { + "epoch": 0.41, + "grad_norm": 7.921976089477539, + "learning_rate": 2.3852754419083575e-05, + "loss": 0.8523, + "step": 2388 + }, + { + "epoch": 0.41, + "grad_norm": 9.433771133422852, + "learning_rate": 2.3850180195640985e-05, + "loss": 1.0348, + "step": 2389 + }, + { + "epoch": 0.41, + "grad_norm": 7.832609176635742, + "learning_rate": 2.3847605972198388e-05, + "loss": 0.9005, + "step": 2390 + }, + { + "epoch": 0.41, + "grad_norm": 10.235001564025879, + "learning_rate": 2.3845031748755795e-05, + "loss": 1.3815, + "step": 2391 + }, + { + "epoch": 0.41, + "grad_norm": 8.518238067626953, + "learning_rate": 2.3842457525313198e-05, + "loss": 0.8948, + "step": 2392 + }, + { + "epoch": 0.41, + "grad_norm": 10.102937698364258, + "learning_rate": 2.3839883301870605e-05, + "loss": 1.2714, + "step": 2393 + }, + { + "epoch": 0.41, + "grad_norm": 9.932148933410645, + "learning_rate": 2.3837309078428008e-05, + "loss": 1.3764, + "step": 2394 + }, + { + "epoch": 0.41, + "grad_norm": 8.55225658416748, + "learning_rate": 2.383473485498541e-05, + "loss": 1.0773, + "step": 2395 + }, + { + "epoch": 0.41, + "grad_norm": 8.794912338256836, + "learning_rate": 2.3832160631542818e-05, + "loss": 1.009, + "step": 2396 + }, + { + "epoch": 0.41, + "grad_norm": 8.654611587524414, + "learning_rate": 2.382958640810022e-05, + "loss": 0.9229, + "step": 2397 + }, + { + "epoch": 0.41, + "grad_norm": 8.335205078125, + "learning_rate": 2.382701218465763e-05, + "loss": 1.0753, + "step": 2398 + }, + { + "epoch": 0.41, + "grad_norm": 8.122414588928223, + "learning_rate": 2.3824437961215035e-05, + "loss": 1.0487, + "step": 2399 + }, + { + "epoch": 0.41, + "grad_norm": 10.104586601257324, + "learning_rate": 2.382186373777244e-05, + "loss": 1.1819, + "step": 2400 + }, + { + "epoch": 0.41, + "grad_norm": 9.469476699829102, + "learning_rate": 2.3819289514329845e-05, + "loss": 1.165, + "step": 2401 + }, + { + "epoch": 0.41, + "grad_norm": 9.234419822692871, + "learning_rate": 2.381671529088725e-05, + "loss": 1.0652, + "step": 2402 + }, + { + "epoch": 0.41, + "grad_norm": 8.224913597106934, + "learning_rate": 2.3814141067444655e-05, + "loss": 1.156, + "step": 2403 + }, + { + "epoch": 0.41, + "grad_norm": 9.427159309387207, + "learning_rate": 2.3811566844002058e-05, + "loss": 1.1087, + "step": 2404 + }, + { + "epoch": 0.41, + "grad_norm": 9.388914108276367, + "learning_rate": 2.3808992620559465e-05, + "loss": 0.994, + "step": 2405 + }, + { + "epoch": 0.41, + "grad_norm": 7.564704895019531, + "learning_rate": 2.3806418397116868e-05, + "loss": 1.1732, + "step": 2406 + }, + { + "epoch": 0.41, + "grad_norm": 9.740270614624023, + "learning_rate": 2.3803844173674275e-05, + "loss": 1.0191, + "step": 2407 + }, + { + "epoch": 0.41, + "grad_norm": 7.566525459289551, + "learning_rate": 2.380126995023168e-05, + "loss": 0.9126, + "step": 2408 + }, + { + "epoch": 0.41, + "grad_norm": 7.765815258026123, + "learning_rate": 2.3798695726789088e-05, + "loss": 1.1143, + "step": 2409 + }, + { + "epoch": 0.41, + "grad_norm": 10.17686653137207, + "learning_rate": 2.379612150334649e-05, + "loss": 0.9496, + "step": 2410 + }, + { + "epoch": 0.41, + "grad_norm": 10.54204273223877, + "learning_rate": 2.3793547279903898e-05, + "loss": 1.2374, + "step": 2411 + }, + { + "epoch": 0.41, + "grad_norm": 7.409787654876709, + "learning_rate": 2.37909730564613e-05, + "loss": 1.0006, + "step": 2412 + }, + { + "epoch": 0.41, + "grad_norm": 8.912604331970215, + "learning_rate": 2.3788398833018708e-05, + "loss": 1.0432, + "step": 2413 + }, + { + "epoch": 0.41, + "grad_norm": 9.363262176513672, + "learning_rate": 2.378582460957611e-05, + "loss": 1.0378, + "step": 2414 + }, + { + "epoch": 0.41, + "grad_norm": 9.565895080566406, + "learning_rate": 2.3783250386133514e-05, + "loss": 1.1047, + "step": 2415 + }, + { + "epoch": 0.41, + "grad_norm": 12.706425666809082, + "learning_rate": 2.378067616269092e-05, + "loss": 1.3295, + "step": 2416 + }, + { + "epoch": 0.41, + "grad_norm": 8.505475997924805, + "learning_rate": 2.3778101939248328e-05, + "loss": 1.1323, + "step": 2417 + }, + { + "epoch": 0.41, + "grad_norm": 8.23418140411377, + "learning_rate": 2.3775527715805734e-05, + "loss": 1.0101, + "step": 2418 + }, + { + "epoch": 0.42, + "grad_norm": 10.298528671264648, + "learning_rate": 2.3772953492363138e-05, + "loss": 1.049, + "step": 2419 + }, + { + "epoch": 0.42, + "grad_norm": 8.873881340026855, + "learning_rate": 2.3770379268920544e-05, + "loss": 0.8916, + "step": 2420 + }, + { + "epoch": 0.42, + "grad_norm": 7.962735652923584, + "learning_rate": 2.3767805045477948e-05, + "loss": 0.8387, + "step": 2421 + }, + { + "epoch": 0.42, + "grad_norm": 9.635884284973145, + "learning_rate": 2.3765230822035354e-05, + "loss": 1.1091, + "step": 2422 + }, + { + "epoch": 0.42, + "grad_norm": 11.053528785705566, + "learning_rate": 2.3762656598592758e-05, + "loss": 1.2137, + "step": 2423 + }, + { + "epoch": 0.42, + "grad_norm": 10.316943168640137, + "learning_rate": 2.3760082375150164e-05, + "loss": 1.2033, + "step": 2424 + }, + { + "epoch": 0.42, + "grad_norm": 11.887518882751465, + "learning_rate": 2.3757508151707568e-05, + "loss": 1.2407, + "step": 2425 + }, + { + "epoch": 0.42, + "grad_norm": 10.951789855957031, + "learning_rate": 2.375493392826497e-05, + "loss": 1.1114, + "step": 2426 + }, + { + "epoch": 0.42, + "grad_norm": 9.072427749633789, + "learning_rate": 2.375235970482238e-05, + "loss": 0.9998, + "step": 2427 + }, + { + "epoch": 0.42, + "grad_norm": 9.388823509216309, + "learning_rate": 2.3749785481379784e-05, + "loss": 0.9954, + "step": 2428 + }, + { + "epoch": 0.42, + "grad_norm": 9.042737007141113, + "learning_rate": 2.374721125793719e-05, + "loss": 0.9225, + "step": 2429 + }, + { + "epoch": 0.42, + "grad_norm": 7.438971042633057, + "learning_rate": 2.3744637034494594e-05, + "loss": 0.7363, + "step": 2430 + }, + { + "epoch": 0.42, + "grad_norm": 9.267683029174805, + "learning_rate": 2.3742062811052e-05, + "loss": 0.9942, + "step": 2431 + }, + { + "epoch": 0.42, + "grad_norm": 8.633086204528809, + "learning_rate": 2.3739488587609404e-05, + "loss": 0.8846, + "step": 2432 + }, + { + "epoch": 0.42, + "grad_norm": 11.981773376464844, + "learning_rate": 2.373691436416681e-05, + "loss": 1.3067, + "step": 2433 + }, + { + "epoch": 0.42, + "grad_norm": 9.61812973022461, + "learning_rate": 2.3734340140724214e-05, + "loss": 1.0108, + "step": 2434 + }, + { + "epoch": 0.42, + "grad_norm": 11.247781753540039, + "learning_rate": 2.373176591728162e-05, + "loss": 1.3639, + "step": 2435 + }, + { + "epoch": 0.42, + "grad_norm": 9.135536193847656, + "learning_rate": 2.3729191693839028e-05, + "loss": 0.9578, + "step": 2436 + }, + { + "epoch": 0.42, + "grad_norm": 9.909148216247559, + "learning_rate": 2.372661747039643e-05, + "loss": 1.023, + "step": 2437 + }, + { + "epoch": 0.42, + "grad_norm": 11.051506996154785, + "learning_rate": 2.3724043246953838e-05, + "loss": 1.3859, + "step": 2438 + }, + { + "epoch": 0.42, + "grad_norm": 10.185151100158691, + "learning_rate": 2.372146902351124e-05, + "loss": 1.1907, + "step": 2439 + }, + { + "epoch": 0.42, + "grad_norm": 9.996024131774902, + "learning_rate": 2.3718894800068648e-05, + "loss": 1.3278, + "step": 2440 + }, + { + "epoch": 0.42, + "grad_norm": 10.550662994384766, + "learning_rate": 2.371632057662605e-05, + "loss": 1.0441, + "step": 2441 + }, + { + "epoch": 0.42, + "grad_norm": 8.213271141052246, + "learning_rate": 2.3713746353183458e-05, + "loss": 0.8978, + "step": 2442 + }, + { + "epoch": 0.42, + "grad_norm": 7.587554931640625, + "learning_rate": 2.371117212974086e-05, + "loss": 1.001, + "step": 2443 + }, + { + "epoch": 0.42, + "grad_norm": 8.72619915008545, + "learning_rate": 2.3708597906298268e-05, + "loss": 1.0332, + "step": 2444 + }, + { + "epoch": 0.42, + "grad_norm": 8.2136812210083, + "learning_rate": 2.370602368285567e-05, + "loss": 0.957, + "step": 2445 + }, + { + "epoch": 0.42, + "grad_norm": 7.504052639007568, + "learning_rate": 2.3703449459413078e-05, + "loss": 0.7831, + "step": 2446 + }, + { + "epoch": 0.42, + "grad_norm": 8.622686386108398, + "learning_rate": 2.3700875235970484e-05, + "loss": 1.1063, + "step": 2447 + }, + { + "epoch": 0.42, + "grad_norm": 9.420857429504395, + "learning_rate": 2.3698301012527887e-05, + "loss": 1.5009, + "step": 2448 + }, + { + "epoch": 0.42, + "grad_norm": 8.327728271484375, + "learning_rate": 2.3695726789085294e-05, + "loss": 1.0195, + "step": 2449 + }, + { + "epoch": 0.42, + "grad_norm": 9.8270902633667, + "learning_rate": 2.3693152565642697e-05, + "loss": 1.2884, + "step": 2450 + }, + { + "epoch": 0.42, + "grad_norm": 10.860349655151367, + "learning_rate": 2.3690578342200104e-05, + "loss": 1.3965, + "step": 2451 + }, + { + "epoch": 0.42, + "grad_norm": 7.761497497558594, + "learning_rate": 2.3688004118757507e-05, + "loss": 0.9062, + "step": 2452 + }, + { + "epoch": 0.42, + "grad_norm": 9.600850105285645, + "learning_rate": 2.3685429895314914e-05, + "loss": 1.1713, + "step": 2453 + }, + { + "epoch": 0.42, + "grad_norm": 9.563528060913086, + "learning_rate": 2.3682855671872317e-05, + "loss": 1.2687, + "step": 2454 + }, + { + "epoch": 0.42, + "grad_norm": 8.841002464294434, + "learning_rate": 2.3680281448429727e-05, + "loss": 0.7598, + "step": 2455 + }, + { + "epoch": 0.42, + "grad_norm": 8.881284713745117, + "learning_rate": 2.367770722498713e-05, + "loss": 0.9367, + "step": 2456 + }, + { + "epoch": 0.42, + "grad_norm": 7.2709784507751465, + "learning_rate": 2.3675133001544534e-05, + "loss": 0.8366, + "step": 2457 + }, + { + "epoch": 0.42, + "grad_norm": 7.911156177520752, + "learning_rate": 2.367255877810194e-05, + "loss": 0.7566, + "step": 2458 + }, + { + "epoch": 0.42, + "grad_norm": 9.620280265808105, + "learning_rate": 2.3669984554659344e-05, + "loss": 1.492, + "step": 2459 + }, + { + "epoch": 0.42, + "grad_norm": 9.352581024169922, + "learning_rate": 2.366741033121675e-05, + "loss": 0.9107, + "step": 2460 + }, + { + "epoch": 0.42, + "grad_norm": 8.401845932006836, + "learning_rate": 2.3664836107774154e-05, + "loss": 1.0071, + "step": 2461 + }, + { + "epoch": 0.42, + "grad_norm": 10.157147407531738, + "learning_rate": 2.366226188433156e-05, + "loss": 1.1496, + "step": 2462 + }, + { + "epoch": 0.42, + "grad_norm": 8.345260620117188, + "learning_rate": 2.3659687660888964e-05, + "loss": 0.8295, + "step": 2463 + }, + { + "epoch": 0.42, + "grad_norm": 7.7211527824401855, + "learning_rate": 2.365711343744637e-05, + "loss": 0.9211, + "step": 2464 + }, + { + "epoch": 0.42, + "grad_norm": 9.277822494506836, + "learning_rate": 2.3654539214003777e-05, + "loss": 1.1629, + "step": 2465 + }, + { + "epoch": 0.42, + "grad_norm": 10.909029960632324, + "learning_rate": 2.365196499056118e-05, + "loss": 1.3003, + "step": 2466 + }, + { + "epoch": 0.42, + "grad_norm": 9.170978546142578, + "learning_rate": 2.3649390767118587e-05, + "loss": 1.1131, + "step": 2467 + }, + { + "epoch": 0.42, + "grad_norm": 9.749275207519531, + "learning_rate": 2.364681654367599e-05, + "loss": 1.1058, + "step": 2468 + }, + { + "epoch": 0.42, + "grad_norm": 11.033509254455566, + "learning_rate": 2.3644242320233397e-05, + "loss": 0.9819, + "step": 2469 + }, + { + "epoch": 0.42, + "grad_norm": 8.735815048217773, + "learning_rate": 2.36416680967908e-05, + "loss": 1.0819, + "step": 2470 + }, + { + "epoch": 0.42, + "grad_norm": 9.044149398803711, + "learning_rate": 2.3639093873348207e-05, + "loss": 1.0405, + "step": 2471 + }, + { + "epoch": 0.42, + "grad_norm": 10.202775001525879, + "learning_rate": 2.363651964990561e-05, + "loss": 1.0634, + "step": 2472 + }, + { + "epoch": 0.42, + "grad_norm": 8.201421737670898, + "learning_rate": 2.3633945426463017e-05, + "loss": 0.991, + "step": 2473 + }, + { + "epoch": 0.42, + "grad_norm": 8.920056343078613, + "learning_rate": 2.3631371203020424e-05, + "loss": 0.7986, + "step": 2474 + }, + { + "epoch": 0.42, + "grad_norm": 10.732706069946289, + "learning_rate": 2.362879697957783e-05, + "loss": 1.3758, + "step": 2475 + }, + { + "epoch": 0.42, + "grad_norm": 9.223851203918457, + "learning_rate": 2.3626222756135234e-05, + "loss": 1.053, + "step": 2476 + }, + { + "epoch": 0.43, + "grad_norm": 10.358149528503418, + "learning_rate": 2.3623648532692637e-05, + "loss": 1.0756, + "step": 2477 + }, + { + "epoch": 0.43, + "grad_norm": 8.53659439086914, + "learning_rate": 2.3621074309250044e-05, + "loss": 0.8786, + "step": 2478 + }, + { + "epoch": 0.43, + "grad_norm": 10.722784042358398, + "learning_rate": 2.3618500085807447e-05, + "loss": 1.3966, + "step": 2479 + }, + { + "epoch": 0.43, + "grad_norm": 7.544040679931641, + "learning_rate": 2.3615925862364854e-05, + "loss": 0.9188, + "step": 2480 + }, + { + "epoch": 0.43, + "grad_norm": 7.28165864944458, + "learning_rate": 2.3613351638922257e-05, + "loss": 0.7604, + "step": 2481 + }, + { + "epoch": 0.43, + "grad_norm": 8.316466331481934, + "learning_rate": 2.3610777415479664e-05, + "loss": 0.9873, + "step": 2482 + }, + { + "epoch": 0.43, + "grad_norm": 9.219130516052246, + "learning_rate": 2.3608203192037067e-05, + "loss": 1.2108, + "step": 2483 + }, + { + "epoch": 0.43, + "grad_norm": 10.904192924499512, + "learning_rate": 2.3605628968594477e-05, + "loss": 1.0583, + "step": 2484 + }, + { + "epoch": 0.43, + "grad_norm": 9.60234546661377, + "learning_rate": 2.360305474515188e-05, + "loss": 1.2682, + "step": 2485 + }, + { + "epoch": 0.43, + "grad_norm": 9.382621765136719, + "learning_rate": 2.3600480521709287e-05, + "loss": 1.0676, + "step": 2486 + }, + { + "epoch": 0.43, + "grad_norm": 10.308059692382812, + "learning_rate": 2.359790629826669e-05, + "loss": 0.9831, + "step": 2487 + }, + { + "epoch": 0.43, + "grad_norm": 8.96368408203125, + "learning_rate": 2.3595332074824094e-05, + "loss": 0.9276, + "step": 2488 + }, + { + "epoch": 0.43, + "grad_norm": 9.224778175354004, + "learning_rate": 2.35927578513815e-05, + "loss": 1.1833, + "step": 2489 + }, + { + "epoch": 0.43, + "grad_norm": 9.238512992858887, + "learning_rate": 2.3590183627938904e-05, + "loss": 1.0521, + "step": 2490 + }, + { + "epoch": 0.43, + "grad_norm": 8.882194519042969, + "learning_rate": 2.358760940449631e-05, + "loss": 0.9314, + "step": 2491 + }, + { + "epoch": 0.43, + "grad_norm": 9.799914360046387, + "learning_rate": 2.3585035181053714e-05, + "loss": 1.2231, + "step": 2492 + }, + { + "epoch": 0.43, + "grad_norm": 11.082046508789062, + "learning_rate": 2.3582460957611124e-05, + "loss": 1.2219, + "step": 2493 + }, + { + "epoch": 0.43, + "grad_norm": 8.452569007873535, + "learning_rate": 2.3579886734168527e-05, + "loss": 0.8429, + "step": 2494 + }, + { + "epoch": 0.43, + "grad_norm": 11.005644798278809, + "learning_rate": 2.3577312510725934e-05, + "loss": 1.3801, + "step": 2495 + }, + { + "epoch": 0.43, + "grad_norm": 10.329168319702148, + "learning_rate": 2.3574738287283337e-05, + "loss": 1.0627, + "step": 2496 + }, + { + "epoch": 0.43, + "grad_norm": 10.755752563476562, + "learning_rate": 2.3572164063840744e-05, + "loss": 1.2499, + "step": 2497 + }, + { + "epoch": 0.43, + "grad_norm": 10.486258506774902, + "learning_rate": 2.3569589840398147e-05, + "loss": 1.0468, + "step": 2498 + }, + { + "epoch": 0.43, + "grad_norm": 7.862729549407959, + "learning_rate": 2.356701561695555e-05, + "loss": 0.9382, + "step": 2499 + }, + { + "epoch": 0.43, + "grad_norm": 8.966972351074219, + "learning_rate": 2.3564441393512957e-05, + "loss": 1.1468, + "step": 2500 + }, + { + "epoch": 0.43, + "grad_norm": 9.419432640075684, + "learning_rate": 2.356186717007036e-05, + "loss": 1.0783, + "step": 2501 + }, + { + "epoch": 0.43, + "grad_norm": 8.90982723236084, + "learning_rate": 2.3559292946627767e-05, + "loss": 1.0103, + "step": 2502 + }, + { + "epoch": 0.43, + "grad_norm": 7.468710899353027, + "learning_rate": 2.3556718723185174e-05, + "loss": 1.0959, + "step": 2503 + }, + { + "epoch": 0.43, + "grad_norm": 8.415430068969727, + "learning_rate": 2.355414449974258e-05, + "loss": 1.0021, + "step": 2504 + }, + { + "epoch": 0.43, + "grad_norm": 9.645242691040039, + "learning_rate": 2.3551570276299984e-05, + "loss": 0.9629, + "step": 2505 + }, + { + "epoch": 0.43, + "grad_norm": 6.518164157867432, + "learning_rate": 2.354899605285739e-05, + "loss": 0.7111, + "step": 2506 + }, + { + "epoch": 0.43, + "grad_norm": 10.395265579223633, + "learning_rate": 2.3546421829414794e-05, + "loss": 0.9499, + "step": 2507 + }, + { + "epoch": 0.43, + "grad_norm": 7.945225715637207, + "learning_rate": 2.3543847605972197e-05, + "loss": 0.8378, + "step": 2508 + }, + { + "epoch": 0.43, + "grad_norm": 9.525571823120117, + "learning_rate": 2.3541273382529604e-05, + "loss": 1.2957, + "step": 2509 + }, + { + "epoch": 0.43, + "grad_norm": 9.679350852966309, + "learning_rate": 2.3538699159087007e-05, + "loss": 1.0082, + "step": 2510 + }, + { + "epoch": 0.43, + "grad_norm": 8.790804862976074, + "learning_rate": 2.3536124935644414e-05, + "loss": 1.0925, + "step": 2511 + }, + { + "epoch": 0.43, + "grad_norm": 9.276101112365723, + "learning_rate": 2.353355071220182e-05, + "loss": 0.9825, + "step": 2512 + }, + { + "epoch": 0.43, + "grad_norm": 9.411559104919434, + "learning_rate": 2.3530976488759227e-05, + "loss": 0.8942, + "step": 2513 + }, + { + "epoch": 0.43, + "grad_norm": 9.060784339904785, + "learning_rate": 2.352840226531663e-05, + "loss": 1.1793, + "step": 2514 + }, + { + "epoch": 0.43, + "grad_norm": 10.921730041503906, + "learning_rate": 2.3525828041874037e-05, + "loss": 0.9212, + "step": 2515 + }, + { + "epoch": 0.43, + "grad_norm": 7.655002593994141, + "learning_rate": 2.352325381843144e-05, + "loss": 0.6529, + "step": 2516 + }, + { + "epoch": 0.43, + "grad_norm": 10.469806671142578, + "learning_rate": 2.3520679594988847e-05, + "loss": 1.0394, + "step": 2517 + }, + { + "epoch": 0.43, + "grad_norm": 9.596567153930664, + "learning_rate": 2.351810537154625e-05, + "loss": 1.147, + "step": 2518 + }, + { + "epoch": 0.43, + "grad_norm": 8.936881065368652, + "learning_rate": 2.3515531148103653e-05, + "loss": 1.1531, + "step": 2519 + }, + { + "epoch": 0.43, + "grad_norm": 9.614805221557617, + "learning_rate": 2.351295692466106e-05, + "loss": 1.0526, + "step": 2520 + }, + { + "epoch": 0.43, + "grad_norm": 9.913834571838379, + "learning_rate": 2.3510382701218467e-05, + "loss": 1.2466, + "step": 2521 + }, + { + "epoch": 0.43, + "grad_norm": 9.67934513092041, + "learning_rate": 2.3507808477775873e-05, + "loss": 1.1262, + "step": 2522 + }, + { + "epoch": 0.43, + "grad_norm": 8.341279983520508, + "learning_rate": 2.3505234254333277e-05, + "loss": 0.9325, + "step": 2523 + }, + { + "epoch": 0.43, + "grad_norm": 12.471835136413574, + "learning_rate": 2.3502660030890683e-05, + "loss": 1.223, + "step": 2524 + }, + { + "epoch": 0.43, + "grad_norm": 8.702771186828613, + "learning_rate": 2.3500085807448087e-05, + "loss": 1.1022, + "step": 2525 + }, + { + "epoch": 0.43, + "grad_norm": 8.640060424804688, + "learning_rate": 2.3497511584005493e-05, + "loss": 0.9489, + "step": 2526 + }, + { + "epoch": 0.43, + "grad_norm": 9.58908462524414, + "learning_rate": 2.3494937360562897e-05, + "loss": 1.3934, + "step": 2527 + }, + { + "epoch": 0.43, + "grad_norm": 10.46060848236084, + "learning_rate": 2.3492363137120303e-05, + "loss": 0.9935, + "step": 2528 + }, + { + "epoch": 0.43, + "grad_norm": 9.000808715820312, + "learning_rate": 2.3489788913677707e-05, + "loss": 0.8257, + "step": 2529 + }, + { + "epoch": 0.43, + "grad_norm": 8.350912094116211, + "learning_rate": 2.348721469023511e-05, + "loss": 1.2273, + "step": 2530 + }, + { + "epoch": 0.43, + "grad_norm": 8.099184036254883, + "learning_rate": 2.348464046679252e-05, + "loss": 0.9647, + "step": 2531 + }, + { + "epoch": 0.43, + "grad_norm": 8.951770782470703, + "learning_rate": 2.3482066243349923e-05, + "loss": 1.0582, + "step": 2532 + }, + { + "epoch": 0.43, + "grad_norm": 10.116839408874512, + "learning_rate": 2.347949201990733e-05, + "loss": 1.1364, + "step": 2533 + }, + { + "epoch": 0.43, + "grad_norm": 9.103583335876465, + "learning_rate": 2.3476917796464733e-05, + "loss": 1.1859, + "step": 2534 + }, + { + "epoch": 0.44, + "grad_norm": 8.171732902526855, + "learning_rate": 2.347434357302214e-05, + "loss": 1.0032, + "step": 2535 + }, + { + "epoch": 0.44, + "grad_norm": 9.168376922607422, + "learning_rate": 2.3471769349579543e-05, + "loss": 0.9084, + "step": 2536 + }, + { + "epoch": 0.44, + "grad_norm": 7.724564552307129, + "learning_rate": 2.346919512613695e-05, + "loss": 0.8318, + "step": 2537 + }, + { + "epoch": 0.44, + "grad_norm": 9.293848037719727, + "learning_rate": 2.3466620902694353e-05, + "loss": 1.2379, + "step": 2538 + }, + { + "epoch": 0.44, + "grad_norm": 8.937259674072266, + "learning_rate": 2.346404667925176e-05, + "loss": 0.8578, + "step": 2539 + }, + { + "epoch": 0.44, + "grad_norm": 9.461071014404297, + "learning_rate": 2.3461472455809167e-05, + "loss": 0.9782, + "step": 2540 + }, + { + "epoch": 0.44, + "grad_norm": 10.833611488342285, + "learning_rate": 2.345889823236657e-05, + "loss": 1.1425, + "step": 2541 + }, + { + "epoch": 0.44, + "grad_norm": 11.43184757232666, + "learning_rate": 2.3456324008923977e-05, + "loss": 0.9369, + "step": 2542 + }, + { + "epoch": 0.44, + "grad_norm": 7.954643249511719, + "learning_rate": 2.345374978548138e-05, + "loss": 0.893, + "step": 2543 + }, + { + "epoch": 0.44, + "grad_norm": 9.736577033996582, + "learning_rate": 2.3451175562038787e-05, + "loss": 0.8636, + "step": 2544 + }, + { + "epoch": 0.44, + "grad_norm": 11.804381370544434, + "learning_rate": 2.344860133859619e-05, + "loss": 1.1972, + "step": 2545 + }, + { + "epoch": 0.44, + "grad_norm": 9.188714027404785, + "learning_rate": 2.3446027115153597e-05, + "loss": 0.6603, + "step": 2546 + }, + { + "epoch": 0.44, + "grad_norm": 14.234238624572754, + "learning_rate": 2.3443452891711e-05, + "loss": 1.5362, + "step": 2547 + }, + { + "epoch": 0.44, + "grad_norm": 10.846101760864258, + "learning_rate": 2.3440878668268407e-05, + "loss": 0.9585, + "step": 2548 + }, + { + "epoch": 0.44, + "grad_norm": 9.942909240722656, + "learning_rate": 2.343830444482581e-05, + "loss": 1.2047, + "step": 2549 + }, + { + "epoch": 0.44, + "grad_norm": 8.016656875610352, + "learning_rate": 2.3435730221383217e-05, + "loss": 1.1495, + "step": 2550 + }, + { + "epoch": 0.44, + "grad_norm": 10.28046703338623, + "learning_rate": 2.3433155997940623e-05, + "loss": 1.0153, + "step": 2551 + }, + { + "epoch": 0.44, + "grad_norm": 11.70443058013916, + "learning_rate": 2.3430581774498026e-05, + "loss": 1.107, + "step": 2552 + }, + { + "epoch": 0.44, + "grad_norm": 9.147012710571289, + "learning_rate": 2.3428007551055433e-05, + "loss": 0.8575, + "step": 2553 + }, + { + "epoch": 0.44, + "grad_norm": 7.779894828796387, + "learning_rate": 2.3425433327612836e-05, + "loss": 0.9142, + "step": 2554 + }, + { + "epoch": 0.44, + "grad_norm": 8.599183082580566, + "learning_rate": 2.3422859104170243e-05, + "loss": 0.7884, + "step": 2555 + }, + { + "epoch": 0.44, + "grad_norm": 10.4769868850708, + "learning_rate": 2.3420284880727646e-05, + "loss": 1.2565, + "step": 2556 + }, + { + "epoch": 0.44, + "grad_norm": 10.201024055480957, + "learning_rate": 2.3417710657285053e-05, + "loss": 1.0752, + "step": 2557 + }, + { + "epoch": 0.44, + "grad_norm": 9.123604774475098, + "learning_rate": 2.3415136433842456e-05, + "loss": 0.8538, + "step": 2558 + }, + { + "epoch": 0.44, + "grad_norm": 7.135371208190918, + "learning_rate": 2.3412562210399866e-05, + "loss": 0.981, + "step": 2559 + }, + { + "epoch": 0.44, + "grad_norm": 7.122720241546631, + "learning_rate": 2.340998798695727e-05, + "loss": 1.1322, + "step": 2560 + }, + { + "epoch": 0.44, + "grad_norm": 9.272409439086914, + "learning_rate": 2.3407413763514673e-05, + "loss": 1.2099, + "step": 2561 + }, + { + "epoch": 0.44, + "grad_norm": 7.999232769012451, + "learning_rate": 2.340483954007208e-05, + "loss": 1.0271, + "step": 2562 + }, + { + "epoch": 0.44, + "grad_norm": 7.023630619049072, + "learning_rate": 2.3402265316629483e-05, + "loss": 0.7494, + "step": 2563 + }, + { + "epoch": 0.44, + "grad_norm": 8.72968578338623, + "learning_rate": 2.339969109318689e-05, + "loss": 1.1202, + "step": 2564 + }, + { + "epoch": 0.44, + "grad_norm": 8.302896499633789, + "learning_rate": 2.3397116869744293e-05, + "loss": 0.7018, + "step": 2565 + }, + { + "epoch": 0.44, + "grad_norm": 10.548614501953125, + "learning_rate": 2.33945426463017e-05, + "loss": 1.174, + "step": 2566 + }, + { + "epoch": 0.44, + "grad_norm": 11.122110366821289, + "learning_rate": 2.3391968422859103e-05, + "loss": 1.2374, + "step": 2567 + }, + { + "epoch": 0.44, + "grad_norm": 9.153768539428711, + "learning_rate": 2.338939419941651e-05, + "loss": 1.1996, + "step": 2568 + }, + { + "epoch": 0.44, + "grad_norm": 9.598315238952637, + "learning_rate": 2.3386819975973916e-05, + "loss": 1.1503, + "step": 2569 + }, + { + "epoch": 0.44, + "grad_norm": 9.490194320678711, + "learning_rate": 2.3384245752531323e-05, + "loss": 1.1182, + "step": 2570 + }, + { + "epoch": 0.44, + "grad_norm": 8.928894996643066, + "learning_rate": 2.3381671529088726e-05, + "loss": 0.9613, + "step": 2571 + }, + { + "epoch": 0.44, + "grad_norm": 9.012547492980957, + "learning_rate": 2.337909730564613e-05, + "loss": 1.117, + "step": 2572 + }, + { + "epoch": 0.44, + "grad_norm": 11.221946716308594, + "learning_rate": 2.3376523082203536e-05, + "loss": 1.391, + "step": 2573 + }, + { + "epoch": 0.44, + "grad_norm": 10.530134201049805, + "learning_rate": 2.337394885876094e-05, + "loss": 1.1089, + "step": 2574 + }, + { + "epoch": 0.44, + "grad_norm": 8.09863567352295, + "learning_rate": 2.3371374635318346e-05, + "loss": 0.9242, + "step": 2575 + }, + { + "epoch": 0.44, + "grad_norm": 9.62304973602295, + "learning_rate": 2.336880041187575e-05, + "loss": 0.9879, + "step": 2576 + }, + { + "epoch": 0.44, + "grad_norm": 11.571710586547852, + "learning_rate": 2.3366226188433156e-05, + "loss": 1.1416, + "step": 2577 + }, + { + "epoch": 0.44, + "grad_norm": 10.05125904083252, + "learning_rate": 2.3363651964990563e-05, + "loss": 1.3324, + "step": 2578 + }, + { + "epoch": 0.44, + "grad_norm": 9.33615779876709, + "learning_rate": 2.336107774154797e-05, + "loss": 0.8008, + "step": 2579 + }, + { + "epoch": 0.44, + "grad_norm": 9.51756763458252, + "learning_rate": 2.3358503518105373e-05, + "loss": 0.9045, + "step": 2580 + }, + { + "epoch": 0.44, + "grad_norm": 7.906956672668457, + "learning_rate": 2.3355929294662776e-05, + "loss": 0.8163, + "step": 2581 + }, + { + "epoch": 0.44, + "grad_norm": 9.578678131103516, + "learning_rate": 2.3353355071220183e-05, + "loss": 1.1397, + "step": 2582 + }, + { + "epoch": 0.44, + "grad_norm": 8.404522895812988, + "learning_rate": 2.3350780847777586e-05, + "loss": 0.8252, + "step": 2583 + }, + { + "epoch": 0.44, + "grad_norm": 8.912517547607422, + "learning_rate": 2.3348206624334993e-05, + "loss": 0.9024, + "step": 2584 + }, + { + "epoch": 0.44, + "grad_norm": 9.505443572998047, + "learning_rate": 2.3345632400892396e-05, + "loss": 1.0416, + "step": 2585 + }, + { + "epoch": 0.44, + "grad_norm": 8.26039981842041, + "learning_rate": 2.3343058177449803e-05, + "loss": 0.9666, + "step": 2586 + }, + { + "epoch": 0.44, + "grad_norm": 8.800965309143066, + "learning_rate": 2.3340483954007206e-05, + "loss": 0.759, + "step": 2587 + }, + { + "epoch": 0.44, + "grad_norm": 11.022990226745605, + "learning_rate": 2.3337909730564616e-05, + "loss": 1.054, + "step": 2588 + }, + { + "epoch": 0.44, + "grad_norm": 8.547623634338379, + "learning_rate": 2.333533550712202e-05, + "loss": 0.9252, + "step": 2589 + }, + { + "epoch": 0.44, + "grad_norm": 9.132389068603516, + "learning_rate": 2.3332761283679426e-05, + "loss": 0.9291, + "step": 2590 + }, + { + "epoch": 0.44, + "grad_norm": 8.922917366027832, + "learning_rate": 2.333018706023683e-05, + "loss": 0.9224, + "step": 2591 + }, + { + "epoch": 0.44, + "grad_norm": 10.583481788635254, + "learning_rate": 2.3327612836794233e-05, + "loss": 1.0704, + "step": 2592 + }, + { + "epoch": 0.44, + "grad_norm": 9.341734886169434, + "learning_rate": 2.332503861335164e-05, + "loss": 1.1663, + "step": 2593 + }, + { + "epoch": 0.45, + "grad_norm": 10.68163013458252, + "learning_rate": 2.3322464389909043e-05, + "loss": 1.1501, + "step": 2594 + }, + { + "epoch": 0.45, + "grad_norm": 10.966670036315918, + "learning_rate": 2.331989016646645e-05, + "loss": 1.4174, + "step": 2595 + }, + { + "epoch": 0.45, + "grad_norm": 10.94918441772461, + "learning_rate": 2.3317315943023853e-05, + "loss": 1.1149, + "step": 2596 + }, + { + "epoch": 0.45, + "grad_norm": 9.16345500946045, + "learning_rate": 2.3314741719581263e-05, + "loss": 1.2118, + "step": 2597 + }, + { + "epoch": 0.45, + "grad_norm": 7.790206432342529, + "learning_rate": 2.3312167496138666e-05, + "loss": 0.864, + "step": 2598 + }, + { + "epoch": 0.45, + "grad_norm": 9.023547172546387, + "learning_rate": 2.3309593272696073e-05, + "loss": 1.086, + "step": 2599 + }, + { + "epoch": 0.45, + "grad_norm": 7.443239688873291, + "learning_rate": 2.3307019049253476e-05, + "loss": 0.7738, + "step": 2600 + }, + { + "epoch": 0.45, + "grad_norm": 7.190574645996094, + "learning_rate": 2.3304444825810883e-05, + "loss": 1.0625, + "step": 2601 + }, + { + "epoch": 0.45, + "grad_norm": 7.739475250244141, + "learning_rate": 2.3301870602368286e-05, + "loss": 1.162, + "step": 2602 + }, + { + "epoch": 0.45, + "grad_norm": 8.268941879272461, + "learning_rate": 2.329929637892569e-05, + "loss": 1.0953, + "step": 2603 + }, + { + "epoch": 0.45, + "grad_norm": 8.179390907287598, + "learning_rate": 2.3296722155483096e-05, + "loss": 0.9851, + "step": 2604 + }, + { + "epoch": 0.45, + "grad_norm": 9.926496505737305, + "learning_rate": 2.32941479320405e-05, + "loss": 1.1368, + "step": 2605 + }, + { + "epoch": 0.45, + "grad_norm": 9.089879035949707, + "learning_rate": 2.3291573708597906e-05, + "loss": 1.1087, + "step": 2606 + }, + { + "epoch": 0.45, + "grad_norm": 8.267629623413086, + "learning_rate": 2.3288999485155313e-05, + "loss": 0.8739, + "step": 2607 + }, + { + "epoch": 0.45, + "grad_norm": 9.331788063049316, + "learning_rate": 2.328642526171272e-05, + "loss": 0.9835, + "step": 2608 + }, + { + "epoch": 0.45, + "grad_norm": 10.474777221679688, + "learning_rate": 2.3283851038270123e-05, + "loss": 1.0124, + "step": 2609 + }, + { + "epoch": 0.45, + "grad_norm": 11.057835578918457, + "learning_rate": 2.328127681482753e-05, + "loss": 1.4244, + "step": 2610 + }, + { + "epoch": 0.45, + "grad_norm": 10.820352554321289, + "learning_rate": 2.3278702591384933e-05, + "loss": 0.902, + "step": 2611 + }, + { + "epoch": 0.45, + "grad_norm": 10.378910064697266, + "learning_rate": 2.3276128367942336e-05, + "loss": 1.0497, + "step": 2612 + }, + { + "epoch": 0.45, + "grad_norm": 11.737698554992676, + "learning_rate": 2.3273554144499743e-05, + "loss": 1.1132, + "step": 2613 + }, + { + "epoch": 0.45, + "grad_norm": 9.998089790344238, + "learning_rate": 2.3270979921057146e-05, + "loss": 1.1106, + "step": 2614 + }, + { + "epoch": 0.45, + "grad_norm": 12.101094245910645, + "learning_rate": 2.3268405697614553e-05, + "loss": 1.4158, + "step": 2615 + }, + { + "epoch": 0.45, + "grad_norm": 8.936396598815918, + "learning_rate": 2.326583147417196e-05, + "loss": 0.9709, + "step": 2616 + }, + { + "epoch": 0.45, + "grad_norm": 9.512072563171387, + "learning_rate": 2.3263257250729366e-05, + "loss": 1.1137, + "step": 2617 + }, + { + "epoch": 0.45, + "grad_norm": 8.632737159729004, + "learning_rate": 2.326068302728677e-05, + "loss": 0.9908, + "step": 2618 + }, + { + "epoch": 0.45, + "grad_norm": 9.253110885620117, + "learning_rate": 2.3258108803844176e-05, + "loss": 1.0179, + "step": 2619 + }, + { + "epoch": 0.45, + "grad_norm": 6.720448017120361, + "learning_rate": 2.325553458040158e-05, + "loss": 0.8714, + "step": 2620 + }, + { + "epoch": 0.45, + "grad_norm": 9.655285835266113, + "learning_rate": 2.3252960356958986e-05, + "loss": 0.8205, + "step": 2621 + }, + { + "epoch": 0.45, + "grad_norm": 10.039182662963867, + "learning_rate": 2.325038613351639e-05, + "loss": 0.9435, + "step": 2622 + }, + { + "epoch": 0.45, + "grad_norm": 9.007562637329102, + "learning_rate": 2.3247811910073792e-05, + "loss": 0.8717, + "step": 2623 + }, + { + "epoch": 0.45, + "grad_norm": 9.206671714782715, + "learning_rate": 2.32452376866312e-05, + "loss": 1.0214, + "step": 2624 + }, + { + "epoch": 0.45, + "grad_norm": 10.041656494140625, + "learning_rate": 2.3242663463188606e-05, + "loss": 0.8245, + "step": 2625 + }, + { + "epoch": 0.45, + "grad_norm": 9.8939790725708, + "learning_rate": 2.3240089239746013e-05, + "loss": 0.9214, + "step": 2626 + }, + { + "epoch": 0.45, + "grad_norm": 9.08059310913086, + "learning_rate": 2.3237515016303416e-05, + "loss": 1.0139, + "step": 2627 + }, + { + "epoch": 0.45, + "grad_norm": 9.726590156555176, + "learning_rate": 2.3234940792860822e-05, + "loss": 0.7992, + "step": 2628 + }, + { + "epoch": 0.45, + "grad_norm": 10.939742088317871, + "learning_rate": 2.3232366569418226e-05, + "loss": 0.9317, + "step": 2629 + }, + { + "epoch": 0.45, + "grad_norm": 11.064697265625, + "learning_rate": 2.3229792345975632e-05, + "loss": 1.1421, + "step": 2630 + }, + { + "epoch": 0.45, + "grad_norm": 9.365456581115723, + "learning_rate": 2.3227218122533036e-05, + "loss": 0.8567, + "step": 2631 + }, + { + "epoch": 0.45, + "grad_norm": 10.208319664001465, + "learning_rate": 2.3224643899090442e-05, + "loss": 1.1267, + "step": 2632 + }, + { + "epoch": 0.45, + "grad_norm": 11.146499633789062, + "learning_rate": 2.3222069675647846e-05, + "loss": 1.2663, + "step": 2633 + }, + { + "epoch": 0.45, + "grad_norm": 11.190032958984375, + "learning_rate": 2.321949545220525e-05, + "loss": 1.2884, + "step": 2634 + }, + { + "epoch": 0.45, + "grad_norm": 10.034303665161133, + "learning_rate": 2.321692122876266e-05, + "loss": 1.0279, + "step": 2635 + }, + { + "epoch": 0.45, + "grad_norm": 10.336434364318848, + "learning_rate": 2.3214347005320062e-05, + "loss": 1.0477, + "step": 2636 + }, + { + "epoch": 0.45, + "grad_norm": 8.87071704864502, + "learning_rate": 2.321177278187747e-05, + "loss": 1.2337, + "step": 2637 + }, + { + "epoch": 0.45, + "grad_norm": 11.137707710266113, + "learning_rate": 2.3209198558434872e-05, + "loss": 1.255, + "step": 2638 + }, + { + "epoch": 0.45, + "grad_norm": 11.236285209655762, + "learning_rate": 2.320662433499228e-05, + "loss": 1.3126, + "step": 2639 + }, + { + "epoch": 0.45, + "grad_norm": 8.853729248046875, + "learning_rate": 2.3204050111549682e-05, + "loss": 1.1931, + "step": 2640 + }, + { + "epoch": 0.45, + "grad_norm": 9.575400352478027, + "learning_rate": 2.320147588810709e-05, + "loss": 1.0255, + "step": 2641 + }, + { + "epoch": 0.45, + "grad_norm": 8.494661331176758, + "learning_rate": 2.3198901664664492e-05, + "loss": 0.8133, + "step": 2642 + }, + { + "epoch": 0.45, + "grad_norm": 9.102679252624512, + "learning_rate": 2.31963274412219e-05, + "loss": 0.9565, + "step": 2643 + }, + { + "epoch": 0.45, + "grad_norm": 9.377016067504883, + "learning_rate": 2.3193753217779306e-05, + "loss": 1.2388, + "step": 2644 + }, + { + "epoch": 0.45, + "grad_norm": 6.880106449127197, + "learning_rate": 2.319117899433671e-05, + "loss": 0.8306, + "step": 2645 + }, + { + "epoch": 0.45, + "grad_norm": 7.2524518966674805, + "learning_rate": 2.3188604770894116e-05, + "loss": 0.9023, + "step": 2646 + }, + { + "epoch": 0.45, + "grad_norm": 8.944124221801758, + "learning_rate": 2.318603054745152e-05, + "loss": 1.0252, + "step": 2647 + }, + { + "epoch": 0.45, + "grad_norm": 9.78840160369873, + "learning_rate": 2.3183456324008926e-05, + "loss": 1.0816, + "step": 2648 + }, + { + "epoch": 0.45, + "grad_norm": 8.999737739562988, + "learning_rate": 2.318088210056633e-05, + "loss": 0.7798, + "step": 2649 + }, + { + "epoch": 0.45, + "grad_norm": 9.514599800109863, + "learning_rate": 2.3178307877123736e-05, + "loss": 1.0205, + "step": 2650 + }, + { + "epoch": 0.45, + "grad_norm": 9.377599716186523, + "learning_rate": 2.317573365368114e-05, + "loss": 1.173, + "step": 2651 + }, + { + "epoch": 0.46, + "grad_norm": 7.738985061645508, + "learning_rate": 2.3173159430238546e-05, + "loss": 1.08, + "step": 2652 + }, + { + "epoch": 0.46, + "grad_norm": 8.516234397888184, + "learning_rate": 2.317058520679595e-05, + "loss": 0.8602, + "step": 2653 + }, + { + "epoch": 0.46, + "grad_norm": 9.325765609741211, + "learning_rate": 2.3168010983353356e-05, + "loss": 1.1015, + "step": 2654 + }, + { + "epoch": 0.46, + "grad_norm": 8.299615859985352, + "learning_rate": 2.3165436759910762e-05, + "loss": 0.9153, + "step": 2655 + }, + { + "epoch": 0.46, + "grad_norm": 8.902812957763672, + "learning_rate": 2.3162862536468166e-05, + "loss": 0.8305, + "step": 2656 + }, + { + "epoch": 0.46, + "grad_norm": 8.841736793518066, + "learning_rate": 2.3160288313025572e-05, + "loss": 1.0197, + "step": 2657 + }, + { + "epoch": 0.46, + "grad_norm": 11.85616683959961, + "learning_rate": 2.3157714089582975e-05, + "loss": 1.2513, + "step": 2658 + }, + { + "epoch": 0.46, + "grad_norm": 11.059075355529785, + "learning_rate": 2.3155139866140382e-05, + "loss": 1.2462, + "step": 2659 + }, + { + "epoch": 0.46, + "grad_norm": 9.764070510864258, + "learning_rate": 2.3152565642697785e-05, + "loss": 1.0946, + "step": 2660 + }, + { + "epoch": 0.46, + "grad_norm": 9.843402862548828, + "learning_rate": 2.3149991419255192e-05, + "loss": 1.2516, + "step": 2661 + }, + { + "epoch": 0.46, + "grad_norm": 9.12812328338623, + "learning_rate": 2.3147417195812595e-05, + "loss": 0.9564, + "step": 2662 + }, + { + "epoch": 0.46, + "grad_norm": 9.824984550476074, + "learning_rate": 2.3144842972370006e-05, + "loss": 1.1269, + "step": 2663 + }, + { + "epoch": 0.46, + "grad_norm": 10.98025894165039, + "learning_rate": 2.314226874892741e-05, + "loss": 1.4424, + "step": 2664 + }, + { + "epoch": 0.46, + "grad_norm": 8.523545265197754, + "learning_rate": 2.3139694525484812e-05, + "loss": 0.9476, + "step": 2665 + }, + { + "epoch": 0.46, + "grad_norm": 9.977849006652832, + "learning_rate": 2.313712030204222e-05, + "loss": 0.8876, + "step": 2666 + }, + { + "epoch": 0.46, + "grad_norm": 10.086889266967773, + "learning_rate": 2.3134546078599622e-05, + "loss": 0.9347, + "step": 2667 + }, + { + "epoch": 0.46, + "grad_norm": 9.332056999206543, + "learning_rate": 2.313197185515703e-05, + "loss": 0.9366, + "step": 2668 + }, + { + "epoch": 0.46, + "grad_norm": 8.346678733825684, + "learning_rate": 2.3129397631714432e-05, + "loss": 1.0269, + "step": 2669 + }, + { + "epoch": 0.46, + "grad_norm": 8.478102684020996, + "learning_rate": 2.312682340827184e-05, + "loss": 1.1438, + "step": 2670 + }, + { + "epoch": 0.46, + "grad_norm": 9.203483581542969, + "learning_rate": 2.3124249184829242e-05, + "loss": 0.9656, + "step": 2671 + }, + { + "epoch": 0.46, + "grad_norm": 10.374492645263672, + "learning_rate": 2.312167496138665e-05, + "loss": 1.0915, + "step": 2672 + }, + { + "epoch": 0.46, + "grad_norm": 9.016454696655273, + "learning_rate": 2.3119100737944055e-05, + "loss": 1.0329, + "step": 2673 + }, + { + "epoch": 0.46, + "grad_norm": 9.490626335144043, + "learning_rate": 2.3116526514501462e-05, + "loss": 1.3229, + "step": 2674 + }, + { + "epoch": 0.46, + "grad_norm": 10.105101585388184, + "learning_rate": 2.3113952291058865e-05, + "loss": 0.871, + "step": 2675 + }, + { + "epoch": 0.46, + "grad_norm": 11.454776763916016, + "learning_rate": 2.311137806761627e-05, + "loss": 1.1395, + "step": 2676 + }, + { + "epoch": 0.46, + "grad_norm": 10.295034408569336, + "learning_rate": 2.3108803844173675e-05, + "loss": 0.9053, + "step": 2677 + }, + { + "epoch": 0.46, + "grad_norm": 12.13992977142334, + "learning_rate": 2.310622962073108e-05, + "loss": 1.3587, + "step": 2678 + }, + { + "epoch": 0.46, + "grad_norm": 9.177966117858887, + "learning_rate": 2.3103655397288485e-05, + "loss": 0.9886, + "step": 2679 + }, + { + "epoch": 0.46, + "grad_norm": 8.235572814941406, + "learning_rate": 2.310108117384589e-05, + "loss": 0.953, + "step": 2680 + }, + { + "epoch": 0.46, + "grad_norm": 9.598236083984375, + "learning_rate": 2.3098506950403295e-05, + "loss": 0.9892, + "step": 2681 + }, + { + "epoch": 0.46, + "grad_norm": 10.04853343963623, + "learning_rate": 2.3095932726960702e-05, + "loss": 1.2259, + "step": 2682 + }, + { + "epoch": 0.46, + "grad_norm": 9.717796325683594, + "learning_rate": 2.309335850351811e-05, + "loss": 1.0625, + "step": 2683 + }, + { + "epoch": 0.46, + "grad_norm": 9.189144134521484, + "learning_rate": 2.3090784280075512e-05, + "loss": 1.2453, + "step": 2684 + }, + { + "epoch": 0.46, + "grad_norm": 6.803874969482422, + "learning_rate": 2.3088210056632915e-05, + "loss": 0.787, + "step": 2685 + }, + { + "epoch": 0.46, + "grad_norm": 8.659193992614746, + "learning_rate": 2.3085635833190322e-05, + "loss": 0.8775, + "step": 2686 + }, + { + "epoch": 0.46, + "grad_norm": 10.041272163391113, + "learning_rate": 2.3083061609747725e-05, + "loss": 1.0143, + "step": 2687 + }, + { + "epoch": 0.46, + "grad_norm": 9.787315368652344, + "learning_rate": 2.3080487386305132e-05, + "loss": 1.1311, + "step": 2688 + }, + { + "epoch": 0.46, + "grad_norm": 8.955000877380371, + "learning_rate": 2.3077913162862535e-05, + "loss": 1.021, + "step": 2689 + }, + { + "epoch": 0.46, + "grad_norm": 10.984403610229492, + "learning_rate": 2.3075338939419942e-05, + "loss": 1.1738, + "step": 2690 + }, + { + "epoch": 0.46, + "grad_norm": 10.193453788757324, + "learning_rate": 2.3072764715977345e-05, + "loss": 0.9614, + "step": 2691 + }, + { + "epoch": 0.46, + "grad_norm": 9.679840087890625, + "learning_rate": 2.3070190492534755e-05, + "loss": 1.1948, + "step": 2692 + }, + { + "epoch": 0.46, + "grad_norm": 8.818044662475586, + "learning_rate": 2.306761626909216e-05, + "loss": 1.0516, + "step": 2693 + }, + { + "epoch": 0.46, + "grad_norm": 9.360944747924805, + "learning_rate": 2.3065042045649565e-05, + "loss": 0.7857, + "step": 2694 + }, + { + "epoch": 0.46, + "grad_norm": 9.251326560974121, + "learning_rate": 2.306246782220697e-05, + "loss": 1.0349, + "step": 2695 + }, + { + "epoch": 0.46, + "grad_norm": 8.83499526977539, + "learning_rate": 2.3059893598764372e-05, + "loss": 0.9429, + "step": 2696 + }, + { + "epoch": 0.46, + "grad_norm": 9.595775604248047, + "learning_rate": 2.305731937532178e-05, + "loss": 0.9833, + "step": 2697 + }, + { + "epoch": 0.46, + "grad_norm": 8.570494651794434, + "learning_rate": 2.3054745151879182e-05, + "loss": 1.0116, + "step": 2698 + }, + { + "epoch": 0.46, + "grad_norm": 8.598008155822754, + "learning_rate": 2.305217092843659e-05, + "loss": 0.9391, + "step": 2699 + }, + { + "epoch": 0.46, + "grad_norm": 8.88185977935791, + "learning_rate": 2.3049596704993992e-05, + "loss": 1.01, + "step": 2700 + }, + { + "epoch": 0.46, + "grad_norm": 8.411881446838379, + "learning_rate": 2.3047022481551402e-05, + "loss": 0.9149, + "step": 2701 + }, + { + "epoch": 0.46, + "grad_norm": 9.78014850616455, + "learning_rate": 2.3044448258108805e-05, + "loss": 0.8936, + "step": 2702 + }, + { + "epoch": 0.46, + "grad_norm": 6.988958835601807, + "learning_rate": 2.3041874034666212e-05, + "loss": 0.6614, + "step": 2703 + }, + { + "epoch": 0.46, + "grad_norm": 10.601667404174805, + "learning_rate": 2.3039299811223615e-05, + "loss": 1.1616, + "step": 2704 + }, + { + "epoch": 0.46, + "grad_norm": 10.891489028930664, + "learning_rate": 2.3036725587781022e-05, + "loss": 1.3806, + "step": 2705 + }, + { + "epoch": 0.46, + "grad_norm": 9.160721778869629, + "learning_rate": 2.3034151364338425e-05, + "loss": 1.0335, + "step": 2706 + }, + { + "epoch": 0.46, + "grad_norm": 6.962326526641846, + "learning_rate": 2.303157714089583e-05, + "loss": 0.7447, + "step": 2707 + }, + { + "epoch": 0.46, + "grad_norm": 9.290735244750977, + "learning_rate": 2.3029002917453235e-05, + "loss": 1.0803, + "step": 2708 + }, + { + "epoch": 0.46, + "grad_norm": 9.305471420288086, + "learning_rate": 2.302642869401064e-05, + "loss": 1.1649, + "step": 2709 + }, + { + "epoch": 0.47, + "grad_norm": 9.52600383758545, + "learning_rate": 2.3023854470568045e-05, + "loss": 1.2812, + "step": 2710 + }, + { + "epoch": 0.47, + "grad_norm": 10.067754745483398, + "learning_rate": 2.302128024712545e-05, + "loss": 1.0728, + "step": 2711 + }, + { + "epoch": 0.47, + "grad_norm": 9.478352546691895, + "learning_rate": 2.301870602368286e-05, + "loss": 0.9654, + "step": 2712 + }, + { + "epoch": 0.47, + "grad_norm": 10.699317932128906, + "learning_rate": 2.301613180024026e-05, + "loss": 0.9325, + "step": 2713 + }, + { + "epoch": 0.47, + "grad_norm": 8.604063987731934, + "learning_rate": 2.301355757679767e-05, + "loss": 1.0163, + "step": 2714 + }, + { + "epoch": 0.47, + "grad_norm": 7.880563259124756, + "learning_rate": 2.301098335335507e-05, + "loss": 0.9194, + "step": 2715 + }, + { + "epoch": 0.47, + "grad_norm": 9.372673034667969, + "learning_rate": 2.3008409129912475e-05, + "loss": 1.118, + "step": 2716 + }, + { + "epoch": 0.47, + "grad_norm": 8.623473167419434, + "learning_rate": 2.300583490646988e-05, + "loss": 0.8165, + "step": 2717 + }, + { + "epoch": 0.47, + "grad_norm": 7.650661945343018, + "learning_rate": 2.3003260683027285e-05, + "loss": 0.9933, + "step": 2718 + }, + { + "epoch": 0.47, + "grad_norm": 10.055195808410645, + "learning_rate": 2.300068645958469e-05, + "loss": 1.1341, + "step": 2719 + }, + { + "epoch": 0.47, + "grad_norm": 11.060074806213379, + "learning_rate": 2.2998112236142098e-05, + "loss": 0.9825, + "step": 2720 + }, + { + "epoch": 0.47, + "grad_norm": 10.135762214660645, + "learning_rate": 2.2995538012699505e-05, + "loss": 1.2304, + "step": 2721 + }, + { + "epoch": 0.47, + "grad_norm": 10.723862648010254, + "learning_rate": 2.2992963789256908e-05, + "loss": 1.0414, + "step": 2722 + }, + { + "epoch": 0.47, + "grad_norm": 7.84394645690918, + "learning_rate": 2.2990389565814315e-05, + "loss": 1.1302, + "step": 2723 + }, + { + "epoch": 0.47, + "grad_norm": 8.975812911987305, + "learning_rate": 2.2987815342371718e-05, + "loss": 0.8251, + "step": 2724 + }, + { + "epoch": 0.47, + "grad_norm": 9.302264213562012, + "learning_rate": 2.2985241118929125e-05, + "loss": 1.1593, + "step": 2725 + }, + { + "epoch": 0.47, + "grad_norm": 9.009163856506348, + "learning_rate": 2.2982666895486528e-05, + "loss": 1.0301, + "step": 2726 + }, + { + "epoch": 0.47, + "grad_norm": 9.274458885192871, + "learning_rate": 2.298009267204393e-05, + "loss": 1.1649, + "step": 2727 + }, + { + "epoch": 0.47, + "grad_norm": 7.925476551055908, + "learning_rate": 2.2977518448601338e-05, + "loss": 0.9727, + "step": 2728 + }, + { + "epoch": 0.47, + "grad_norm": 8.69002914428711, + "learning_rate": 2.297494422515874e-05, + "loss": 1.0547, + "step": 2729 + }, + { + "epoch": 0.47, + "grad_norm": 11.099454879760742, + "learning_rate": 2.297237000171615e-05, + "loss": 1.1742, + "step": 2730 + }, + { + "epoch": 0.47, + "grad_norm": 9.325862884521484, + "learning_rate": 2.2969795778273555e-05, + "loss": 1.0888, + "step": 2731 + }, + { + "epoch": 0.47, + "grad_norm": 10.108589172363281, + "learning_rate": 2.296722155483096e-05, + "loss": 0.9231, + "step": 2732 + }, + { + "epoch": 0.47, + "grad_norm": 9.083965301513672, + "learning_rate": 2.2964647331388365e-05, + "loss": 1.1818, + "step": 2733 + }, + { + "epoch": 0.47, + "grad_norm": 9.379091262817383, + "learning_rate": 2.296207310794577e-05, + "loss": 1.1966, + "step": 2734 + }, + { + "epoch": 0.47, + "grad_norm": 9.886136054992676, + "learning_rate": 2.2959498884503175e-05, + "loss": 1.2887, + "step": 2735 + }, + { + "epoch": 0.47, + "grad_norm": 9.736721992492676, + "learning_rate": 2.295692466106058e-05, + "loss": 1.0544, + "step": 2736 + }, + { + "epoch": 0.47, + "grad_norm": 9.164587020874023, + "learning_rate": 2.2954350437617985e-05, + "loss": 0.9271, + "step": 2737 + }, + { + "epoch": 0.47, + "grad_norm": 8.921046257019043, + "learning_rate": 2.2951776214175388e-05, + "loss": 0.8671, + "step": 2738 + }, + { + "epoch": 0.47, + "grad_norm": 8.995047569274902, + "learning_rate": 2.2949201990732798e-05, + "loss": 1.0532, + "step": 2739 + }, + { + "epoch": 0.47, + "grad_norm": 10.59412670135498, + "learning_rate": 2.29466277672902e-05, + "loss": 0.8976, + "step": 2740 + }, + { + "epoch": 0.47, + "grad_norm": 9.083028793334961, + "learning_rate": 2.2944053543847608e-05, + "loss": 1.1602, + "step": 2741 + }, + { + "epoch": 0.47, + "grad_norm": 9.787667274475098, + "learning_rate": 2.294147932040501e-05, + "loss": 1.0648, + "step": 2742 + }, + { + "epoch": 0.47, + "grad_norm": 8.504075050354004, + "learning_rate": 2.2938905096962418e-05, + "loss": 1.009, + "step": 2743 + }, + { + "epoch": 0.47, + "grad_norm": 10.056138038635254, + "learning_rate": 2.293633087351982e-05, + "loss": 1.0141, + "step": 2744 + }, + { + "epoch": 0.47, + "grad_norm": 8.839750289916992, + "learning_rate": 2.2933756650077228e-05, + "loss": 0.873, + "step": 2745 + }, + { + "epoch": 0.47, + "grad_norm": 9.619115829467773, + "learning_rate": 2.293118242663463e-05, + "loss": 0.9876, + "step": 2746 + }, + { + "epoch": 0.47, + "grad_norm": 10.747673034667969, + "learning_rate": 2.2928608203192038e-05, + "loss": 1.2646, + "step": 2747 + }, + { + "epoch": 0.47, + "grad_norm": 11.004373550415039, + "learning_rate": 2.2926033979749445e-05, + "loss": 1.4506, + "step": 2748 + }, + { + "epoch": 0.47, + "grad_norm": 9.066360473632812, + "learning_rate": 2.2923459756306848e-05, + "loss": 0.8475, + "step": 2749 + }, + { + "epoch": 0.47, + "grad_norm": 9.013704299926758, + "learning_rate": 2.2920885532864255e-05, + "loss": 1.0386, + "step": 2750 + }, + { + "epoch": 0.47, + "grad_norm": 11.597650527954102, + "learning_rate": 2.2918311309421658e-05, + "loss": 1.4272, + "step": 2751 + }, + { + "epoch": 0.47, + "grad_norm": 10.709369659423828, + "learning_rate": 2.2915737085979065e-05, + "loss": 1.3146, + "step": 2752 + }, + { + "epoch": 0.47, + "grad_norm": 7.2885942459106445, + "learning_rate": 2.2913162862536468e-05, + "loss": 0.7658, + "step": 2753 + }, + { + "epoch": 0.47, + "grad_norm": 8.996438980102539, + "learning_rate": 2.2910588639093875e-05, + "loss": 1.0493, + "step": 2754 + }, + { + "epoch": 0.47, + "grad_norm": 10.672294616699219, + "learning_rate": 2.2908014415651278e-05, + "loss": 1.4401, + "step": 2755 + }, + { + "epoch": 0.47, + "grad_norm": 10.644652366638184, + "learning_rate": 2.2905440192208685e-05, + "loss": 1.0895, + "step": 2756 + }, + { + "epoch": 0.47, + "grad_norm": 9.453187942504883, + "learning_rate": 2.2902865968766088e-05, + "loss": 1.0532, + "step": 2757 + }, + { + "epoch": 0.47, + "grad_norm": 8.38924789428711, + "learning_rate": 2.2900291745323495e-05, + "loss": 1.0041, + "step": 2758 + }, + { + "epoch": 0.47, + "grad_norm": 10.288237571716309, + "learning_rate": 2.28977175218809e-05, + "loss": 1.269, + "step": 2759 + }, + { + "epoch": 0.47, + "grad_norm": 9.722000122070312, + "learning_rate": 2.2895143298438305e-05, + "loss": 1.1034, + "step": 2760 + }, + { + "epoch": 0.47, + "grad_norm": 8.730677604675293, + "learning_rate": 2.289256907499571e-05, + "loss": 1.0287, + "step": 2761 + }, + { + "epoch": 0.47, + "grad_norm": 10.569239616394043, + "learning_rate": 2.2889994851553114e-05, + "loss": 1.1965, + "step": 2762 + }, + { + "epoch": 0.47, + "grad_norm": 9.154129028320312, + "learning_rate": 2.288742062811052e-05, + "loss": 1.194, + "step": 2763 + }, + { + "epoch": 0.47, + "grad_norm": 9.696370124816895, + "learning_rate": 2.2884846404667924e-05, + "loss": 1.3635, + "step": 2764 + }, + { + "epoch": 0.47, + "grad_norm": 8.712349891662598, + "learning_rate": 2.288227218122533e-05, + "loss": 0.8794, + "step": 2765 + }, + { + "epoch": 0.47, + "grad_norm": 9.65976619720459, + "learning_rate": 2.2879697957782734e-05, + "loss": 1.0708, + "step": 2766 + }, + { + "epoch": 0.47, + "grad_norm": 8.705913543701172, + "learning_rate": 2.2877123734340145e-05, + "loss": 0.8806, + "step": 2767 + }, + { + "epoch": 0.48, + "grad_norm": 11.749248504638672, + "learning_rate": 2.2874549510897548e-05, + "loss": 1.1793, + "step": 2768 + }, + { + "epoch": 0.48, + "grad_norm": 9.434737205505371, + "learning_rate": 2.287197528745495e-05, + "loss": 1.2681, + "step": 2769 + }, + { + "epoch": 0.48, + "grad_norm": 9.738548278808594, + "learning_rate": 2.2869401064012358e-05, + "loss": 0.9724, + "step": 2770 + }, + { + "epoch": 0.48, + "grad_norm": 8.327269554138184, + "learning_rate": 2.286682684056976e-05, + "loss": 1.1014, + "step": 2771 + }, + { + "epoch": 0.48, + "grad_norm": 9.601359367370605, + "learning_rate": 2.2864252617127168e-05, + "loss": 1.1573, + "step": 2772 + }, + { + "epoch": 0.48, + "grad_norm": 9.368884086608887, + "learning_rate": 2.286167839368457e-05, + "loss": 0.8289, + "step": 2773 + }, + { + "epoch": 0.48, + "grad_norm": 10.068943977355957, + "learning_rate": 2.2859104170241978e-05, + "loss": 1.1792, + "step": 2774 + }, + { + "epoch": 0.48, + "grad_norm": 10.891889572143555, + "learning_rate": 2.285652994679938e-05, + "loss": 1.2894, + "step": 2775 + }, + { + "epoch": 0.48, + "grad_norm": 9.454261779785156, + "learning_rate": 2.2853955723356788e-05, + "loss": 1.4133, + "step": 2776 + }, + { + "epoch": 0.48, + "grad_norm": 9.031497955322266, + "learning_rate": 2.2851381499914194e-05, + "loss": 1.0014, + "step": 2777 + }, + { + "epoch": 0.48, + "grad_norm": 7.623445987701416, + "learning_rate": 2.28488072764716e-05, + "loss": 0.78, + "step": 2778 + }, + { + "epoch": 0.48, + "grad_norm": 10.759760856628418, + "learning_rate": 2.2846233053029004e-05, + "loss": 1.2732, + "step": 2779 + }, + { + "epoch": 0.48, + "grad_norm": 8.171009063720703, + "learning_rate": 2.2843658829586408e-05, + "loss": 0.856, + "step": 2780 + }, + { + "epoch": 0.48, + "grad_norm": 8.820732116699219, + "learning_rate": 2.2841084606143814e-05, + "loss": 1.0889, + "step": 2781 + }, + { + "epoch": 0.48, + "grad_norm": 8.587847709655762, + "learning_rate": 2.2838510382701218e-05, + "loss": 1.0345, + "step": 2782 + }, + { + "epoch": 0.48, + "grad_norm": 8.194660186767578, + "learning_rate": 2.2835936159258624e-05, + "loss": 1.1592, + "step": 2783 + }, + { + "epoch": 0.48, + "grad_norm": 8.72214126586914, + "learning_rate": 2.2833361935816028e-05, + "loss": 0.9592, + "step": 2784 + }, + { + "epoch": 0.48, + "grad_norm": 10.25465202331543, + "learning_rate": 2.2830787712373434e-05, + "loss": 1.1084, + "step": 2785 + }, + { + "epoch": 0.48, + "grad_norm": 9.497455596923828, + "learning_rate": 2.282821348893084e-05, + "loss": 0.9393, + "step": 2786 + }, + { + "epoch": 0.48, + "grad_norm": 8.359732627868652, + "learning_rate": 2.2825639265488248e-05, + "loss": 1.0535, + "step": 2787 + }, + { + "epoch": 0.48, + "grad_norm": 8.600067138671875, + "learning_rate": 2.282306504204565e-05, + "loss": 0.7851, + "step": 2788 + }, + { + "epoch": 0.48, + "grad_norm": 10.811413764953613, + "learning_rate": 2.2820490818603054e-05, + "loss": 1.1524, + "step": 2789 + }, + { + "epoch": 0.48, + "grad_norm": 9.236885070800781, + "learning_rate": 2.281791659516046e-05, + "loss": 1.0283, + "step": 2790 + }, + { + "epoch": 0.48, + "grad_norm": 8.800850868225098, + "learning_rate": 2.2815342371717864e-05, + "loss": 1.1249, + "step": 2791 + }, + { + "epoch": 0.48, + "grad_norm": 7.862846851348877, + "learning_rate": 2.281276814827527e-05, + "loss": 0.7767, + "step": 2792 + }, + { + "epoch": 0.48, + "grad_norm": 8.416818618774414, + "learning_rate": 2.2810193924832674e-05, + "loss": 1.0211, + "step": 2793 + }, + { + "epoch": 0.48, + "grad_norm": 9.245564460754395, + "learning_rate": 2.280761970139008e-05, + "loss": 1.3034, + "step": 2794 + }, + { + "epoch": 0.48, + "grad_norm": 10.16881275177002, + "learning_rate": 2.2805045477947484e-05, + "loss": 1.2065, + "step": 2795 + }, + { + "epoch": 0.48, + "grad_norm": 8.259509086608887, + "learning_rate": 2.2802471254504894e-05, + "loss": 0.9956, + "step": 2796 + }, + { + "epoch": 0.48, + "grad_norm": 9.27147388458252, + "learning_rate": 2.2799897031062298e-05, + "loss": 0.9276, + "step": 2797 + }, + { + "epoch": 0.48, + "grad_norm": 8.915818214416504, + "learning_rate": 2.2797322807619704e-05, + "loss": 0.9996, + "step": 2798 + }, + { + "epoch": 0.48, + "grad_norm": 7.080011367797852, + "learning_rate": 2.2794748584177107e-05, + "loss": 0.8308, + "step": 2799 + }, + { + "epoch": 0.48, + "grad_norm": 10.386871337890625, + "learning_rate": 2.279217436073451e-05, + "loss": 1.4012, + "step": 2800 + }, + { + "epoch": 0.48, + "grad_norm": 8.126442909240723, + "learning_rate": 2.2789600137291917e-05, + "loss": 1.0585, + "step": 2801 + }, + { + "epoch": 0.48, + "grad_norm": 9.754056930541992, + "learning_rate": 2.278702591384932e-05, + "loss": 1.2098, + "step": 2802 + }, + { + "epoch": 0.48, + "grad_norm": 13.785674095153809, + "learning_rate": 2.2784451690406727e-05, + "loss": 1.4384, + "step": 2803 + }, + { + "epoch": 0.48, + "grad_norm": 8.46529483795166, + "learning_rate": 2.278187746696413e-05, + "loss": 0.7423, + "step": 2804 + }, + { + "epoch": 0.48, + "grad_norm": 10.653386116027832, + "learning_rate": 2.277930324352154e-05, + "loss": 1.4328, + "step": 2805 + }, + { + "epoch": 0.48, + "grad_norm": 8.566006660461426, + "learning_rate": 2.2776729020078944e-05, + "loss": 1.0497, + "step": 2806 + }, + { + "epoch": 0.48, + "grad_norm": 8.502777099609375, + "learning_rate": 2.277415479663635e-05, + "loss": 0.8336, + "step": 2807 + }, + { + "epoch": 0.48, + "grad_norm": 8.23631477355957, + "learning_rate": 2.2771580573193754e-05, + "loss": 0.9385, + "step": 2808 + }, + { + "epoch": 0.48, + "grad_norm": 9.036250114440918, + "learning_rate": 2.276900634975116e-05, + "loss": 0.9397, + "step": 2809 + }, + { + "epoch": 0.48, + "grad_norm": 9.418767929077148, + "learning_rate": 2.2766432126308564e-05, + "loss": 0.8866, + "step": 2810 + }, + { + "epoch": 0.48, + "grad_norm": 10.312596321105957, + "learning_rate": 2.2763857902865967e-05, + "loss": 1.059, + "step": 2811 + }, + { + "epoch": 0.48, + "grad_norm": 8.346514701843262, + "learning_rate": 2.2761283679423374e-05, + "loss": 0.8439, + "step": 2812 + }, + { + "epoch": 0.48, + "grad_norm": 8.678810119628906, + "learning_rate": 2.2758709455980777e-05, + "loss": 0.8421, + "step": 2813 + }, + { + "epoch": 0.48, + "grad_norm": 10.164229393005371, + "learning_rate": 2.2756135232538184e-05, + "loss": 1.0234, + "step": 2814 + }, + { + "epoch": 0.48, + "grad_norm": 10.911999702453613, + "learning_rate": 2.275356100909559e-05, + "loss": 1.0253, + "step": 2815 + }, + { + "epoch": 0.48, + "grad_norm": 11.797465324401855, + "learning_rate": 2.2750986785652997e-05, + "loss": 1.1644, + "step": 2816 + }, + { + "epoch": 0.48, + "grad_norm": 12.00514030456543, + "learning_rate": 2.27484125622104e-05, + "loss": 1.4585, + "step": 2817 + }, + { + "epoch": 0.48, + "grad_norm": 7.886373996734619, + "learning_rate": 2.2745838338767807e-05, + "loss": 0.9078, + "step": 2818 + }, + { + "epoch": 0.48, + "grad_norm": 10.170586585998535, + "learning_rate": 2.274326411532521e-05, + "loss": 1.3247, + "step": 2819 + }, + { + "epoch": 0.48, + "grad_norm": 9.58406925201416, + "learning_rate": 2.2740689891882614e-05, + "loss": 1.162, + "step": 2820 + }, + { + "epoch": 0.48, + "grad_norm": 10.046093940734863, + "learning_rate": 2.273811566844002e-05, + "loss": 0.9556, + "step": 2821 + }, + { + "epoch": 0.48, + "grad_norm": 10.614295959472656, + "learning_rate": 2.2735541444997424e-05, + "loss": 1.2942, + "step": 2822 + }, + { + "epoch": 0.48, + "grad_norm": 7.412017345428467, + "learning_rate": 2.273296722155483e-05, + "loss": 0.7609, + "step": 2823 + }, + { + "epoch": 0.48, + "grad_norm": 9.07819938659668, + "learning_rate": 2.2730392998112237e-05, + "loss": 0.7727, + "step": 2824 + }, + { + "epoch": 0.48, + "grad_norm": 8.380146980285645, + "learning_rate": 2.2727818774669644e-05, + "loss": 1.11, + "step": 2825 + }, + { + "epoch": 0.48, + "grad_norm": 10.714298248291016, + "learning_rate": 2.2725244551227047e-05, + "loss": 1.2154, + "step": 2826 + }, + { + "epoch": 0.49, + "grad_norm": 9.598361015319824, + "learning_rate": 2.2722670327784454e-05, + "loss": 0.9784, + "step": 2827 + }, + { + "epoch": 0.49, + "grad_norm": 8.480252265930176, + "learning_rate": 2.2720096104341857e-05, + "loss": 0.9387, + "step": 2828 + }, + { + "epoch": 0.49, + "grad_norm": 9.496916770935059, + "learning_rate": 2.2717521880899264e-05, + "loss": 1.0954, + "step": 2829 + }, + { + "epoch": 0.49, + "grad_norm": 8.717202186584473, + "learning_rate": 2.2714947657456667e-05, + "loss": 1.0192, + "step": 2830 + }, + { + "epoch": 0.49, + "grad_norm": 9.959425926208496, + "learning_rate": 2.271237343401407e-05, + "loss": 1.0376, + "step": 2831 + }, + { + "epoch": 0.49, + "grad_norm": 9.882668495178223, + "learning_rate": 2.2709799210571477e-05, + "loss": 0.9194, + "step": 2832 + }, + { + "epoch": 0.49, + "grad_norm": 7.36072301864624, + "learning_rate": 2.270722498712888e-05, + "loss": 0.7008, + "step": 2833 + }, + { + "epoch": 0.49, + "grad_norm": 9.445476531982422, + "learning_rate": 2.270465076368629e-05, + "loss": 1.0435, + "step": 2834 + }, + { + "epoch": 0.49, + "grad_norm": 8.398406982421875, + "learning_rate": 2.2702076540243694e-05, + "loss": 1.0314, + "step": 2835 + }, + { + "epoch": 0.49, + "grad_norm": 7.824779987335205, + "learning_rate": 2.26995023168011e-05, + "loss": 0.8664, + "step": 2836 + }, + { + "epoch": 0.49, + "grad_norm": 9.393730163574219, + "learning_rate": 2.2696928093358504e-05, + "loss": 0.9419, + "step": 2837 + }, + { + "epoch": 0.49, + "grad_norm": 8.884232521057129, + "learning_rate": 2.269435386991591e-05, + "loss": 0.9628, + "step": 2838 + }, + { + "epoch": 0.49, + "grad_norm": 7.899201393127441, + "learning_rate": 2.2691779646473314e-05, + "loss": 0.804, + "step": 2839 + }, + { + "epoch": 0.49, + "grad_norm": 8.749068260192871, + "learning_rate": 2.268920542303072e-05, + "loss": 1.3074, + "step": 2840 + }, + { + "epoch": 0.49, + "grad_norm": 10.67462158203125, + "learning_rate": 2.2686631199588124e-05, + "loss": 1.093, + "step": 2841 + }, + { + "epoch": 0.49, + "grad_norm": 9.70548152923584, + "learning_rate": 2.2684056976145527e-05, + "loss": 1.016, + "step": 2842 + }, + { + "epoch": 0.49, + "grad_norm": 8.614361763000488, + "learning_rate": 2.2681482752702937e-05, + "loss": 0.9766, + "step": 2843 + }, + { + "epoch": 0.49, + "grad_norm": 10.241039276123047, + "learning_rate": 2.267890852926034e-05, + "loss": 0.9755, + "step": 2844 + }, + { + "epoch": 0.49, + "grad_norm": 8.387537002563477, + "learning_rate": 2.2676334305817747e-05, + "loss": 0.8501, + "step": 2845 + }, + { + "epoch": 0.49, + "grad_norm": 10.643594741821289, + "learning_rate": 2.267376008237515e-05, + "loss": 1.5157, + "step": 2846 + }, + { + "epoch": 0.49, + "grad_norm": 9.009003639221191, + "learning_rate": 2.2671185858932557e-05, + "loss": 1.1499, + "step": 2847 + }, + { + "epoch": 0.49, + "grad_norm": 9.391432762145996, + "learning_rate": 2.266861163548996e-05, + "loss": 1.2275, + "step": 2848 + }, + { + "epoch": 0.49, + "grad_norm": 10.121082305908203, + "learning_rate": 2.2666037412047367e-05, + "loss": 1.1788, + "step": 2849 + }, + { + "epoch": 0.49, + "grad_norm": 9.744053840637207, + "learning_rate": 2.266346318860477e-05, + "loss": 0.9646, + "step": 2850 + }, + { + "epoch": 0.49, + "grad_norm": 9.824888229370117, + "learning_rate": 2.2660888965162177e-05, + "loss": 1.0316, + "step": 2851 + }, + { + "epoch": 0.49, + "grad_norm": 8.28082275390625, + "learning_rate": 2.265831474171958e-05, + "loss": 0.9807, + "step": 2852 + }, + { + "epoch": 0.49, + "grad_norm": 8.869412422180176, + "learning_rate": 2.2655740518276987e-05, + "loss": 0.9285, + "step": 2853 + }, + { + "epoch": 0.49, + "grad_norm": 7.541510105133057, + "learning_rate": 2.2653166294834394e-05, + "loss": 0.9391, + "step": 2854 + }, + { + "epoch": 0.49, + "grad_norm": 9.450916290283203, + "learning_rate": 2.2650592071391797e-05, + "loss": 0.9292, + "step": 2855 + }, + { + "epoch": 0.49, + "grad_norm": 9.403661727905273, + "learning_rate": 2.2648017847949204e-05, + "loss": 1.1493, + "step": 2856 + }, + { + "epoch": 0.49, + "grad_norm": 8.383374214172363, + "learning_rate": 2.2645443624506607e-05, + "loss": 0.9382, + "step": 2857 + }, + { + "epoch": 0.49, + "grad_norm": 7.462475299835205, + "learning_rate": 2.2642869401064014e-05, + "loss": 0.6428, + "step": 2858 + }, + { + "epoch": 0.49, + "grad_norm": 7.96205472946167, + "learning_rate": 2.2640295177621417e-05, + "loss": 0.8163, + "step": 2859 + }, + { + "epoch": 0.49, + "grad_norm": 9.264400482177734, + "learning_rate": 2.2637720954178824e-05, + "loss": 0.9747, + "step": 2860 + }, + { + "epoch": 0.49, + "grad_norm": 12.79320240020752, + "learning_rate": 2.2635146730736227e-05, + "loss": 1.5273, + "step": 2861 + }, + { + "epoch": 0.49, + "grad_norm": 9.706038475036621, + "learning_rate": 2.2632572507293634e-05, + "loss": 1.2014, + "step": 2862 + }, + { + "epoch": 0.49, + "grad_norm": 10.922988891601562, + "learning_rate": 2.262999828385104e-05, + "loss": 1.104, + "step": 2863 + }, + { + "epoch": 0.49, + "grad_norm": 10.83487606048584, + "learning_rate": 2.2627424060408444e-05, + "loss": 1.2641, + "step": 2864 + }, + { + "epoch": 0.49, + "grad_norm": 10.853885650634766, + "learning_rate": 2.262484983696585e-05, + "loss": 1.1656, + "step": 2865 + }, + { + "epoch": 0.49, + "grad_norm": 8.23297119140625, + "learning_rate": 2.2622275613523254e-05, + "loss": 0.6008, + "step": 2866 + }, + { + "epoch": 0.49, + "grad_norm": 11.741523742675781, + "learning_rate": 2.261970139008066e-05, + "loss": 1.371, + "step": 2867 + }, + { + "epoch": 0.49, + "grad_norm": 9.03989315032959, + "learning_rate": 2.2617127166638063e-05, + "loss": 1.0317, + "step": 2868 + }, + { + "epoch": 0.49, + "grad_norm": 10.759997367858887, + "learning_rate": 2.261455294319547e-05, + "loss": 1.3011, + "step": 2869 + }, + { + "epoch": 0.49, + "grad_norm": 11.567174911499023, + "learning_rate": 2.2611978719752873e-05, + "loss": 1.3222, + "step": 2870 + }, + { + "epoch": 0.49, + "grad_norm": 8.156388282775879, + "learning_rate": 2.2609404496310284e-05, + "loss": 0.7418, + "step": 2871 + }, + { + "epoch": 0.49, + "grad_norm": 9.658440589904785, + "learning_rate": 2.2606830272867687e-05, + "loss": 1.2242, + "step": 2872 + }, + { + "epoch": 0.49, + "grad_norm": 8.034728050231934, + "learning_rate": 2.260425604942509e-05, + "loss": 0.7375, + "step": 2873 + }, + { + "epoch": 0.49, + "grad_norm": 9.8517427444458, + "learning_rate": 2.2601681825982497e-05, + "loss": 1.0272, + "step": 2874 + }, + { + "epoch": 0.49, + "grad_norm": 9.360161781311035, + "learning_rate": 2.25991076025399e-05, + "loss": 0.8941, + "step": 2875 + }, + { + "epoch": 0.49, + "grad_norm": 10.058897018432617, + "learning_rate": 2.2596533379097307e-05, + "loss": 0.9598, + "step": 2876 + }, + { + "epoch": 0.49, + "grad_norm": 9.791024208068848, + "learning_rate": 2.259395915565471e-05, + "loss": 1.1843, + "step": 2877 + }, + { + "epoch": 0.49, + "grad_norm": 8.726823806762695, + "learning_rate": 2.2591384932212117e-05, + "loss": 0.8995, + "step": 2878 + }, + { + "epoch": 0.49, + "grad_norm": 8.622626304626465, + "learning_rate": 2.258881070876952e-05, + "loss": 0.896, + "step": 2879 + }, + { + "epoch": 0.49, + "grad_norm": 9.012248992919922, + "learning_rate": 2.2586236485326927e-05, + "loss": 1.0955, + "step": 2880 + }, + { + "epoch": 0.49, + "grad_norm": 8.1736478805542, + "learning_rate": 2.2583662261884333e-05, + "loss": 0.9293, + "step": 2881 + }, + { + "epoch": 0.49, + "grad_norm": 10.817082405090332, + "learning_rate": 2.258108803844174e-05, + "loss": 1.0724, + "step": 2882 + }, + { + "epoch": 0.49, + "grad_norm": 10.296951293945312, + "learning_rate": 2.2578513814999143e-05, + "loss": 1.0913, + "step": 2883 + }, + { + "epoch": 0.49, + "grad_norm": 9.798802375793457, + "learning_rate": 2.2575939591556547e-05, + "loss": 1.1309, + "step": 2884 + }, + { + "epoch": 0.5, + "grad_norm": 7.366511344909668, + "learning_rate": 2.2573365368113953e-05, + "loss": 0.8792, + "step": 2885 + }, + { + "epoch": 0.5, + "grad_norm": 9.514967918395996, + "learning_rate": 2.2570791144671357e-05, + "loss": 1.0338, + "step": 2886 + }, + { + "epoch": 0.5, + "grad_norm": 9.1610746383667, + "learning_rate": 2.2568216921228763e-05, + "loss": 1.0468, + "step": 2887 + }, + { + "epoch": 0.5, + "grad_norm": 9.265742301940918, + "learning_rate": 2.2565642697786167e-05, + "loss": 1.1355, + "step": 2888 + }, + { + "epoch": 0.5, + "grad_norm": 9.74632453918457, + "learning_rate": 2.2563068474343573e-05, + "loss": 1.0831, + "step": 2889 + }, + { + "epoch": 0.5, + "grad_norm": 13.062989234924316, + "learning_rate": 2.256049425090098e-05, + "loss": 1.3055, + "step": 2890 + }, + { + "epoch": 0.5, + "grad_norm": 10.476349830627441, + "learning_rate": 2.2557920027458387e-05, + "loss": 1.2305, + "step": 2891 + }, + { + "epoch": 0.5, + "grad_norm": 10.062787055969238, + "learning_rate": 2.255534580401579e-05, + "loss": 1.3284, + "step": 2892 + }, + { + "epoch": 0.5, + "grad_norm": 10.596938133239746, + "learning_rate": 2.2552771580573193e-05, + "loss": 1.0379, + "step": 2893 + }, + { + "epoch": 0.5, + "grad_norm": 9.208369255065918, + "learning_rate": 2.25501973571306e-05, + "loss": 0.8495, + "step": 2894 + }, + { + "epoch": 0.5, + "grad_norm": 9.72470474243164, + "learning_rate": 2.2547623133688003e-05, + "loss": 0.9906, + "step": 2895 + }, + { + "epoch": 0.5, + "grad_norm": 11.03720760345459, + "learning_rate": 2.254504891024541e-05, + "loss": 1.004, + "step": 2896 + }, + { + "epoch": 0.5, + "grad_norm": 10.569757461547852, + "learning_rate": 2.2542474686802813e-05, + "loss": 1.1559, + "step": 2897 + }, + { + "epoch": 0.5, + "grad_norm": 8.373429298400879, + "learning_rate": 2.253990046336022e-05, + "loss": 0.8029, + "step": 2898 + }, + { + "epoch": 0.5, + "grad_norm": 7.130106449127197, + "learning_rate": 2.2537326239917623e-05, + "loss": 0.8941, + "step": 2899 + }, + { + "epoch": 0.5, + "grad_norm": 7.858506202697754, + "learning_rate": 2.2534752016475033e-05, + "loss": 0.6541, + "step": 2900 + }, + { + "epoch": 0.5, + "grad_norm": 11.609339714050293, + "learning_rate": 2.2532177793032437e-05, + "loss": 1.0596, + "step": 2901 + }, + { + "epoch": 0.5, + "grad_norm": 11.439431190490723, + "learning_rate": 2.2529603569589843e-05, + "loss": 1.0829, + "step": 2902 + }, + { + "epoch": 0.5, + "grad_norm": 9.836402893066406, + "learning_rate": 2.2527029346147247e-05, + "loss": 1.3006, + "step": 2903 + }, + { + "epoch": 0.5, + "grad_norm": 9.604819297790527, + "learning_rate": 2.252445512270465e-05, + "loss": 1.1105, + "step": 2904 + }, + { + "epoch": 0.5, + "grad_norm": 11.094498634338379, + "learning_rate": 2.2521880899262056e-05, + "loss": 1.2208, + "step": 2905 + }, + { + "epoch": 0.5, + "grad_norm": 9.871305465698242, + "learning_rate": 2.251930667581946e-05, + "loss": 0.8997, + "step": 2906 + }, + { + "epoch": 0.5, + "grad_norm": 10.772424697875977, + "learning_rate": 2.2516732452376866e-05, + "loss": 0.9788, + "step": 2907 + }, + { + "epoch": 0.5, + "grad_norm": 11.608298301696777, + "learning_rate": 2.251415822893427e-05, + "loss": 1.1644, + "step": 2908 + }, + { + "epoch": 0.5, + "grad_norm": 13.14294719696045, + "learning_rate": 2.251158400549168e-05, + "loss": 1.2377, + "step": 2909 + }, + { + "epoch": 0.5, + "grad_norm": 11.051589012145996, + "learning_rate": 2.2509009782049083e-05, + "loss": 1.0112, + "step": 2910 + }, + { + "epoch": 0.5, + "grad_norm": 9.835176467895508, + "learning_rate": 2.250643555860649e-05, + "loss": 1.1839, + "step": 2911 + }, + { + "epoch": 0.5, + "grad_norm": 9.873061180114746, + "learning_rate": 2.2503861335163893e-05, + "loss": 0.8594, + "step": 2912 + }, + { + "epoch": 0.5, + "grad_norm": 7.312557697296143, + "learning_rate": 2.25012871117213e-05, + "loss": 0.8498, + "step": 2913 + }, + { + "epoch": 0.5, + "grad_norm": 9.777792930603027, + "learning_rate": 2.2498712888278703e-05, + "loss": 0.9957, + "step": 2914 + }, + { + "epoch": 0.5, + "grad_norm": 8.795259475708008, + "learning_rate": 2.2496138664836106e-05, + "loss": 1.1728, + "step": 2915 + }, + { + "epoch": 0.5, + "grad_norm": 11.050849914550781, + "learning_rate": 2.2493564441393513e-05, + "loss": 1.4074, + "step": 2916 + }, + { + "epoch": 0.5, + "grad_norm": 8.401333808898926, + "learning_rate": 2.2490990217950916e-05, + "loss": 0.826, + "step": 2917 + }, + { + "epoch": 0.5, + "grad_norm": 9.290714263916016, + "learning_rate": 2.2488415994508323e-05, + "loss": 0.9411, + "step": 2918 + }, + { + "epoch": 0.5, + "grad_norm": 9.823974609375, + "learning_rate": 2.248584177106573e-05, + "loss": 1.1492, + "step": 2919 + }, + { + "epoch": 0.5, + "grad_norm": 9.005365371704102, + "learning_rate": 2.2483267547623136e-05, + "loss": 0.8701, + "step": 2920 + }, + { + "epoch": 0.5, + "grad_norm": 8.801459312438965, + "learning_rate": 2.248069332418054e-05, + "loss": 0.8018, + "step": 2921 + }, + { + "epoch": 0.5, + "grad_norm": 8.824800491333008, + "learning_rate": 2.2478119100737946e-05, + "loss": 0.7951, + "step": 2922 + }, + { + "epoch": 0.5, + "grad_norm": 9.079216003417969, + "learning_rate": 2.247554487729535e-05, + "loss": 0.8694, + "step": 2923 + }, + { + "epoch": 0.5, + "grad_norm": 10.32170295715332, + "learning_rate": 2.2472970653852753e-05, + "loss": 0.9289, + "step": 2924 + }, + { + "epoch": 0.5, + "grad_norm": 10.865641593933105, + "learning_rate": 2.247039643041016e-05, + "loss": 0.9216, + "step": 2925 + }, + { + "epoch": 0.5, + "grad_norm": 11.041830062866211, + "learning_rate": 2.2467822206967563e-05, + "loss": 0.7588, + "step": 2926 + }, + { + "epoch": 0.5, + "grad_norm": 8.602577209472656, + "learning_rate": 2.246524798352497e-05, + "loss": 0.8429, + "step": 2927 + }, + { + "epoch": 0.5, + "grad_norm": 10.01832389831543, + "learning_rate": 2.2462673760082376e-05, + "loss": 0.9837, + "step": 2928 + }, + { + "epoch": 0.5, + "grad_norm": 11.765362739562988, + "learning_rate": 2.2460099536639783e-05, + "loss": 1.2726, + "step": 2929 + }, + { + "epoch": 0.5, + "grad_norm": 8.35781478881836, + "learning_rate": 2.2457525313197186e-05, + "loss": 0.7913, + "step": 2930 + }, + { + "epoch": 0.5, + "grad_norm": 8.378927230834961, + "learning_rate": 2.2454951089754593e-05, + "loss": 0.9463, + "step": 2931 + }, + { + "epoch": 0.5, + "grad_norm": 8.902555465698242, + "learning_rate": 2.2452376866311996e-05, + "loss": 0.8122, + "step": 2932 + }, + { + "epoch": 0.5, + "grad_norm": 8.180845260620117, + "learning_rate": 2.2449802642869403e-05, + "loss": 0.8902, + "step": 2933 + }, + { + "epoch": 0.5, + "grad_norm": 10.628933906555176, + "learning_rate": 2.2447228419426806e-05, + "loss": 1.2149, + "step": 2934 + }, + { + "epoch": 0.5, + "grad_norm": 10.003393173217773, + "learning_rate": 2.244465419598421e-05, + "loss": 0.8691, + "step": 2935 + }, + { + "epoch": 0.5, + "grad_norm": 10.806173324584961, + "learning_rate": 2.2442079972541616e-05, + "loss": 1.2196, + "step": 2936 + }, + { + "epoch": 0.5, + "grad_norm": 8.46871280670166, + "learning_rate": 2.243950574909902e-05, + "loss": 0.9806, + "step": 2937 + }, + { + "epoch": 0.5, + "grad_norm": 8.911006927490234, + "learning_rate": 2.243693152565643e-05, + "loss": 0.8281, + "step": 2938 + }, + { + "epoch": 0.5, + "grad_norm": 9.352621078491211, + "learning_rate": 2.2434357302213833e-05, + "loss": 1.0753, + "step": 2939 + }, + { + "epoch": 0.5, + "grad_norm": 9.792496681213379, + "learning_rate": 2.243178307877124e-05, + "loss": 0.963, + "step": 2940 + }, + { + "epoch": 0.5, + "grad_norm": 9.439682006835938, + "learning_rate": 2.2429208855328643e-05, + "loss": 0.9795, + "step": 2941 + }, + { + "epoch": 0.5, + "grad_norm": 9.610797882080078, + "learning_rate": 2.242663463188605e-05, + "loss": 0.8668, + "step": 2942 + }, + { + "epoch": 0.51, + "grad_norm": 9.38074016571045, + "learning_rate": 2.2424060408443453e-05, + "loss": 1.3201, + "step": 2943 + }, + { + "epoch": 0.51, + "grad_norm": 9.588486671447754, + "learning_rate": 2.242148618500086e-05, + "loss": 1.0096, + "step": 2944 + }, + { + "epoch": 0.51, + "grad_norm": 8.158686637878418, + "learning_rate": 2.2418911961558263e-05, + "loss": 0.9543, + "step": 2945 + }, + { + "epoch": 0.51, + "grad_norm": 8.06312084197998, + "learning_rate": 2.2416337738115666e-05, + "loss": 0.8876, + "step": 2946 + }, + { + "epoch": 0.51, + "grad_norm": 11.975364685058594, + "learning_rate": 2.2413763514673076e-05, + "loss": 1.313, + "step": 2947 + }, + { + "epoch": 0.51, + "grad_norm": 9.51795482635498, + "learning_rate": 2.241118929123048e-05, + "loss": 1.2252, + "step": 2948 + }, + { + "epoch": 0.51, + "grad_norm": 7.76899528503418, + "learning_rate": 2.2408615067787886e-05, + "loss": 0.8506, + "step": 2949 + }, + { + "epoch": 0.51, + "grad_norm": 8.216376304626465, + "learning_rate": 2.240604084434529e-05, + "loss": 0.9271, + "step": 2950 + }, + { + "epoch": 0.51, + "grad_norm": 8.150128364562988, + "learning_rate": 2.2403466620902696e-05, + "loss": 1.004, + "step": 2951 + }, + { + "epoch": 0.51, + "grad_norm": 10.28695297241211, + "learning_rate": 2.24008923974601e-05, + "loss": 0.9094, + "step": 2952 + }, + { + "epoch": 0.51, + "grad_norm": 8.283291816711426, + "learning_rate": 2.2398318174017506e-05, + "loss": 1.1637, + "step": 2953 + }, + { + "epoch": 0.51, + "grad_norm": 9.857816696166992, + "learning_rate": 2.239574395057491e-05, + "loss": 1.0686, + "step": 2954 + }, + { + "epoch": 0.51, + "grad_norm": 8.769723892211914, + "learning_rate": 2.2393169727132316e-05, + "loss": 1.1265, + "step": 2955 + }, + { + "epoch": 0.51, + "grad_norm": 7.864490032196045, + "learning_rate": 2.239059550368972e-05, + "loss": 0.9829, + "step": 2956 + }, + { + "epoch": 0.51, + "grad_norm": 11.275936126708984, + "learning_rate": 2.2388021280247126e-05, + "loss": 0.9762, + "step": 2957 + }, + { + "epoch": 0.51, + "grad_norm": 9.359052658081055, + "learning_rate": 2.2385447056804533e-05, + "loss": 0.8706, + "step": 2958 + }, + { + "epoch": 0.51, + "grad_norm": 9.31600570678711, + "learning_rate": 2.2382872833361936e-05, + "loss": 1.0673, + "step": 2959 + }, + { + "epoch": 0.51, + "grad_norm": 10.782825469970703, + "learning_rate": 2.2380298609919343e-05, + "loss": 1.0843, + "step": 2960 + }, + { + "epoch": 0.51, + "grad_norm": 8.320279121398926, + "learning_rate": 2.2377724386476746e-05, + "loss": 0.922, + "step": 2961 + }, + { + "epoch": 0.51, + "grad_norm": 9.702962875366211, + "learning_rate": 2.2375150163034153e-05, + "loss": 1.0546, + "step": 2962 + }, + { + "epoch": 0.51, + "grad_norm": 10.960341453552246, + "learning_rate": 2.2372575939591556e-05, + "loss": 1.0943, + "step": 2963 + }, + { + "epoch": 0.51, + "grad_norm": 9.947541236877441, + "learning_rate": 2.2370001716148963e-05, + "loss": 1.1067, + "step": 2964 + }, + { + "epoch": 0.51, + "grad_norm": 13.1266508102417, + "learning_rate": 2.2367427492706366e-05, + "loss": 0.9991, + "step": 2965 + }, + { + "epoch": 0.51, + "grad_norm": 7.976988792419434, + "learning_rate": 2.2364853269263773e-05, + "loss": 0.8191, + "step": 2966 + }, + { + "epoch": 0.51, + "grad_norm": 9.975715637207031, + "learning_rate": 2.236227904582118e-05, + "loss": 1.0675, + "step": 2967 + }, + { + "epoch": 0.51, + "grad_norm": 9.150272369384766, + "learning_rate": 2.2359704822378583e-05, + "loss": 0.9077, + "step": 2968 + }, + { + "epoch": 0.51, + "grad_norm": 10.799205780029297, + "learning_rate": 2.235713059893599e-05, + "loss": 1.3587, + "step": 2969 + }, + { + "epoch": 0.51, + "grad_norm": 11.644124031066895, + "learning_rate": 2.2354556375493393e-05, + "loss": 1.458, + "step": 2970 + }, + { + "epoch": 0.51, + "grad_norm": 11.589526176452637, + "learning_rate": 2.23519821520508e-05, + "loss": 1.245, + "step": 2971 + }, + { + "epoch": 0.51, + "grad_norm": 9.116387367248535, + "learning_rate": 2.2349407928608202e-05, + "loss": 1.0382, + "step": 2972 + }, + { + "epoch": 0.51, + "grad_norm": 9.83653450012207, + "learning_rate": 2.234683370516561e-05, + "loss": 1.156, + "step": 2973 + }, + { + "epoch": 0.51, + "grad_norm": 9.72766399383545, + "learning_rate": 2.2344259481723012e-05, + "loss": 0.9832, + "step": 2974 + }, + { + "epoch": 0.51, + "grad_norm": 8.677478790283203, + "learning_rate": 2.2341685258280423e-05, + "loss": 0.8163, + "step": 2975 + }, + { + "epoch": 0.51, + "grad_norm": 9.391899108886719, + "learning_rate": 2.2339111034837826e-05, + "loss": 0.9602, + "step": 2976 + }, + { + "epoch": 0.51, + "grad_norm": 8.288619041442871, + "learning_rate": 2.233653681139523e-05, + "loss": 0.8514, + "step": 2977 + }, + { + "epoch": 0.51, + "grad_norm": 10.410719871520996, + "learning_rate": 2.2333962587952636e-05, + "loss": 0.9772, + "step": 2978 + }, + { + "epoch": 0.51, + "grad_norm": 9.934706687927246, + "learning_rate": 2.233138836451004e-05, + "loss": 0.9284, + "step": 2979 + }, + { + "epoch": 0.51, + "grad_norm": 11.661917686462402, + "learning_rate": 2.2328814141067446e-05, + "loss": 0.9782, + "step": 2980 + }, + { + "epoch": 0.51, + "grad_norm": 10.239506721496582, + "learning_rate": 2.232623991762485e-05, + "loss": 0.7205, + "step": 2981 + }, + { + "epoch": 0.51, + "grad_norm": 12.699063301086426, + "learning_rate": 2.2323665694182256e-05, + "loss": 1.3547, + "step": 2982 + }, + { + "epoch": 0.51, + "grad_norm": 9.53893756866455, + "learning_rate": 2.232109147073966e-05, + "loss": 0.9747, + "step": 2983 + }, + { + "epoch": 0.51, + "grad_norm": 8.555697441101074, + "learning_rate": 2.2318517247297066e-05, + "loss": 0.7239, + "step": 2984 + }, + { + "epoch": 0.51, + "grad_norm": 7.867225170135498, + "learning_rate": 2.2315943023854472e-05, + "loss": 0.836, + "step": 2985 + }, + { + "epoch": 0.51, + "grad_norm": 9.933558464050293, + "learning_rate": 2.231336880041188e-05, + "loss": 0.9002, + "step": 2986 + }, + { + "epoch": 0.51, + "grad_norm": 9.113739013671875, + "learning_rate": 2.2310794576969282e-05, + "loss": 0.844, + "step": 2987 + }, + { + "epoch": 0.51, + "grad_norm": 10.667778968811035, + "learning_rate": 2.2308220353526686e-05, + "loss": 1.0439, + "step": 2988 + }, + { + "epoch": 0.51, + "grad_norm": 10.227678298950195, + "learning_rate": 2.2305646130084092e-05, + "loss": 0.9577, + "step": 2989 + }, + { + "epoch": 0.51, + "grad_norm": 10.674300193786621, + "learning_rate": 2.2303071906641496e-05, + "loss": 1.0516, + "step": 2990 + }, + { + "epoch": 0.51, + "grad_norm": 9.962779998779297, + "learning_rate": 2.2300497683198902e-05, + "loss": 1.0495, + "step": 2991 + }, + { + "epoch": 0.51, + "grad_norm": 7.408998489379883, + "learning_rate": 2.2297923459756306e-05, + "loss": 0.6937, + "step": 2992 + }, + { + "epoch": 0.51, + "grad_norm": 9.951751708984375, + "learning_rate": 2.2295349236313712e-05, + "loss": 0.9929, + "step": 2993 + }, + { + "epoch": 0.51, + "grad_norm": 11.693395614624023, + "learning_rate": 2.229277501287112e-05, + "loss": 1.1388, + "step": 2994 + }, + { + "epoch": 0.51, + "grad_norm": 12.005244255065918, + "learning_rate": 2.2290200789428526e-05, + "loss": 1.2366, + "step": 2995 + }, + { + "epoch": 0.51, + "grad_norm": 8.858135223388672, + "learning_rate": 2.228762656598593e-05, + "loss": 0.7901, + "step": 2996 + }, + { + "epoch": 0.51, + "grad_norm": 9.465733528137207, + "learning_rate": 2.2285052342543332e-05, + "loss": 1.0484, + "step": 2997 + }, + { + "epoch": 0.51, + "grad_norm": 11.89690113067627, + "learning_rate": 2.228247811910074e-05, + "loss": 1.2352, + "step": 2998 + }, + { + "epoch": 0.51, + "grad_norm": 10.465445518493652, + "learning_rate": 2.2279903895658142e-05, + "loss": 1.2596, + "step": 2999 + }, + { + "epoch": 0.51, + "grad_norm": 8.890931129455566, + "learning_rate": 2.227732967221555e-05, + "loss": 1.2102, + "step": 3000 + }, + { + "epoch": 0.52, + "grad_norm": 8.394305229187012, + "learning_rate": 2.2274755448772952e-05, + "loss": 1.0521, + "step": 3001 + }, + { + "epoch": 0.52, + "grad_norm": 8.572005271911621, + "learning_rate": 2.227218122533036e-05, + "loss": 0.915, + "step": 3002 + }, + { + "epoch": 0.52, + "grad_norm": 8.06268310546875, + "learning_rate": 2.2269607001887762e-05, + "loss": 0.8785, + "step": 3003 + }, + { + "epoch": 0.52, + "grad_norm": 8.87403392791748, + "learning_rate": 2.2267032778445172e-05, + "loss": 1.2461, + "step": 3004 + }, + { + "epoch": 0.52, + "grad_norm": 9.28782844543457, + "learning_rate": 2.2264458555002576e-05, + "loss": 0.8862, + "step": 3005 + }, + { + "epoch": 0.52, + "grad_norm": 8.314935684204102, + "learning_rate": 2.2261884331559982e-05, + "loss": 0.6601, + "step": 3006 + }, + { + "epoch": 0.52, + "grad_norm": 8.152556419372559, + "learning_rate": 2.2259310108117386e-05, + "loss": 1.1001, + "step": 3007 + }, + { + "epoch": 0.52, + "grad_norm": 8.843362808227539, + "learning_rate": 2.225673588467479e-05, + "loss": 1.2651, + "step": 3008 + }, + { + "epoch": 0.52, + "grad_norm": 10.01810359954834, + "learning_rate": 2.2254161661232195e-05, + "loss": 1.1865, + "step": 3009 + }, + { + "epoch": 0.52, + "grad_norm": 8.529011726379395, + "learning_rate": 2.22515874377896e-05, + "loss": 0.9467, + "step": 3010 + }, + { + "epoch": 0.52, + "grad_norm": 8.114763259887695, + "learning_rate": 2.2249013214347005e-05, + "loss": 0.8459, + "step": 3011 + }, + { + "epoch": 0.52, + "grad_norm": 9.628227233886719, + "learning_rate": 2.224643899090441e-05, + "loss": 1.033, + "step": 3012 + }, + { + "epoch": 0.52, + "grad_norm": 9.931683540344238, + "learning_rate": 2.224386476746182e-05, + "loss": 1.0218, + "step": 3013 + }, + { + "epoch": 0.52, + "grad_norm": 10.078429222106934, + "learning_rate": 2.2241290544019222e-05, + "loss": 1.13, + "step": 3014 + }, + { + "epoch": 0.52, + "grad_norm": 8.351057052612305, + "learning_rate": 2.223871632057663e-05, + "loss": 0.8109, + "step": 3015 + }, + { + "epoch": 0.52, + "grad_norm": 10.0247802734375, + "learning_rate": 2.2236142097134032e-05, + "loss": 1.0613, + "step": 3016 + }, + { + "epoch": 0.52, + "grad_norm": 12.838735580444336, + "learning_rate": 2.223356787369144e-05, + "loss": 1.2196, + "step": 3017 + }, + { + "epoch": 0.52, + "grad_norm": 10.077594757080078, + "learning_rate": 2.2230993650248842e-05, + "loss": 0.9813, + "step": 3018 + }, + { + "epoch": 0.52, + "grad_norm": 10.985511779785156, + "learning_rate": 2.2228419426806245e-05, + "loss": 1.0313, + "step": 3019 + }, + { + "epoch": 0.52, + "grad_norm": 8.833869934082031, + "learning_rate": 2.2225845203363652e-05, + "loss": 1.0435, + "step": 3020 + }, + { + "epoch": 0.52, + "grad_norm": 9.254064559936523, + "learning_rate": 2.2223270979921055e-05, + "loss": 1.0493, + "step": 3021 + }, + { + "epoch": 0.52, + "grad_norm": 8.48665714263916, + "learning_rate": 2.2220696756478462e-05, + "loss": 0.8993, + "step": 3022 + }, + { + "epoch": 0.52, + "grad_norm": 8.376240730285645, + "learning_rate": 2.221812253303587e-05, + "loss": 1.0113, + "step": 3023 + }, + { + "epoch": 0.52, + "grad_norm": 7.388966083526611, + "learning_rate": 2.2215548309593275e-05, + "loss": 0.6854, + "step": 3024 + }, + { + "epoch": 0.52, + "grad_norm": 9.38918399810791, + "learning_rate": 2.221297408615068e-05, + "loss": 0.946, + "step": 3025 + }, + { + "epoch": 0.52, + "grad_norm": 8.641005516052246, + "learning_rate": 2.2210399862708085e-05, + "loss": 1.0363, + "step": 3026 + }, + { + "epoch": 0.52, + "grad_norm": 9.025074005126953, + "learning_rate": 2.220782563926549e-05, + "loss": 0.8921, + "step": 3027 + }, + { + "epoch": 0.52, + "grad_norm": 9.211996078491211, + "learning_rate": 2.2205251415822892e-05, + "loss": 1.0066, + "step": 3028 + }, + { + "epoch": 0.52, + "grad_norm": 8.35977840423584, + "learning_rate": 2.22026771923803e-05, + "loss": 0.9677, + "step": 3029 + }, + { + "epoch": 0.52, + "grad_norm": 9.429361343383789, + "learning_rate": 2.2200102968937702e-05, + "loss": 1.0247, + "step": 3030 + }, + { + "epoch": 0.52, + "grad_norm": 8.093644142150879, + "learning_rate": 2.219752874549511e-05, + "loss": 0.8323, + "step": 3031 + }, + { + "epoch": 0.52, + "grad_norm": 8.507508277893066, + "learning_rate": 2.2194954522052515e-05, + "loss": 0.979, + "step": 3032 + }, + { + "epoch": 0.52, + "grad_norm": 9.718098640441895, + "learning_rate": 2.2192380298609922e-05, + "loss": 1.0606, + "step": 3033 + }, + { + "epoch": 0.52, + "grad_norm": 8.544224739074707, + "learning_rate": 2.2189806075167325e-05, + "loss": 0.9891, + "step": 3034 + }, + { + "epoch": 0.52, + "grad_norm": 8.190923690795898, + "learning_rate": 2.2187231851724732e-05, + "loss": 1.1908, + "step": 3035 + }, + { + "epoch": 0.52, + "grad_norm": 9.675702095031738, + "learning_rate": 2.2184657628282135e-05, + "loss": 0.9008, + "step": 3036 + }, + { + "epoch": 0.52, + "grad_norm": 9.517020225524902, + "learning_rate": 2.2182083404839542e-05, + "loss": 0.9227, + "step": 3037 + }, + { + "epoch": 0.52, + "grad_norm": 9.844659805297852, + "learning_rate": 2.2179509181396945e-05, + "loss": 1.1884, + "step": 3038 + }, + { + "epoch": 0.52, + "grad_norm": 7.5204596519470215, + "learning_rate": 2.217693495795435e-05, + "loss": 0.7349, + "step": 3039 + }, + { + "epoch": 0.52, + "grad_norm": 10.043553352355957, + "learning_rate": 2.2174360734511755e-05, + "loss": 1.1814, + "step": 3040 + }, + { + "epoch": 0.52, + "grad_norm": 9.549355506896973, + "learning_rate": 2.217178651106916e-05, + "loss": 1.0007, + "step": 3041 + }, + { + "epoch": 0.52, + "grad_norm": 8.698515892028809, + "learning_rate": 2.216921228762657e-05, + "loss": 0.8188, + "step": 3042 + }, + { + "epoch": 0.52, + "grad_norm": 8.914580345153809, + "learning_rate": 2.2166638064183972e-05, + "loss": 0.9664, + "step": 3043 + }, + { + "epoch": 0.52, + "grad_norm": 8.412646293640137, + "learning_rate": 2.216406384074138e-05, + "loss": 0.8891, + "step": 3044 + }, + { + "epoch": 0.52, + "grad_norm": 11.946002960205078, + "learning_rate": 2.2161489617298782e-05, + "loss": 1.3124, + "step": 3045 + }, + { + "epoch": 0.52, + "grad_norm": 8.516925811767578, + "learning_rate": 2.215891539385619e-05, + "loss": 0.9214, + "step": 3046 + }, + { + "epoch": 0.52, + "grad_norm": 10.866408348083496, + "learning_rate": 2.2156341170413592e-05, + "loss": 0.8427, + "step": 3047 + }, + { + "epoch": 0.52, + "grad_norm": 9.414539337158203, + "learning_rate": 2.2153766946971e-05, + "loss": 0.7524, + "step": 3048 + }, + { + "epoch": 0.52, + "grad_norm": 10.338263511657715, + "learning_rate": 2.2151192723528402e-05, + "loss": 0.8296, + "step": 3049 + }, + { + "epoch": 0.52, + "grad_norm": 10.56003475189209, + "learning_rate": 2.2148618500085805e-05, + "loss": 1.0169, + "step": 3050 + }, + { + "epoch": 0.52, + "grad_norm": 10.801880836486816, + "learning_rate": 2.2146044276643215e-05, + "loss": 0.9856, + "step": 3051 + }, + { + "epoch": 0.52, + "grad_norm": 8.434285163879395, + "learning_rate": 2.214347005320062e-05, + "loss": 1.2144, + "step": 3052 + }, + { + "epoch": 0.52, + "grad_norm": 11.876259803771973, + "learning_rate": 2.2140895829758025e-05, + "loss": 1.2534, + "step": 3053 + }, + { + "epoch": 0.52, + "grad_norm": 10.42397403717041, + "learning_rate": 2.213832160631543e-05, + "loss": 0.8078, + "step": 3054 + }, + { + "epoch": 0.52, + "grad_norm": 10.448081016540527, + "learning_rate": 2.2135747382872835e-05, + "loss": 0.8845, + "step": 3055 + }, + { + "epoch": 0.52, + "grad_norm": 9.044166564941406, + "learning_rate": 2.213317315943024e-05, + "loss": 0.9541, + "step": 3056 + }, + { + "epoch": 0.52, + "grad_norm": 9.51616382598877, + "learning_rate": 2.2130598935987645e-05, + "loss": 1.1335, + "step": 3057 + }, + { + "epoch": 0.52, + "grad_norm": 9.724225997924805, + "learning_rate": 2.212802471254505e-05, + "loss": 0.9727, + "step": 3058 + }, + { + "epoch": 0.52, + "grad_norm": 8.272114753723145, + "learning_rate": 2.2125450489102455e-05, + "loss": 0.9316, + "step": 3059 + }, + { + "epoch": 0.53, + "grad_norm": 9.649346351623535, + "learning_rate": 2.212287626565986e-05, + "loss": 0.8534, + "step": 3060 + }, + { + "epoch": 0.53, + "grad_norm": 9.92440414428711, + "learning_rate": 2.2120302042217265e-05, + "loss": 0.8505, + "step": 3061 + }, + { + "epoch": 0.53, + "grad_norm": 10.025346755981445, + "learning_rate": 2.211772781877467e-05, + "loss": 0.8607, + "step": 3062 + }, + { + "epoch": 0.53, + "grad_norm": 7.599574089050293, + "learning_rate": 2.2115153595332075e-05, + "loss": 0.8316, + "step": 3063 + }, + { + "epoch": 0.53, + "grad_norm": 10.571738243103027, + "learning_rate": 2.211257937188948e-05, + "loss": 0.9319, + "step": 3064 + }, + { + "epoch": 0.53, + "grad_norm": 9.115510940551758, + "learning_rate": 2.2110005148446885e-05, + "loss": 0.7957, + "step": 3065 + }, + { + "epoch": 0.53, + "grad_norm": 8.370075225830078, + "learning_rate": 2.210743092500429e-05, + "loss": 0.7935, + "step": 3066 + }, + { + "epoch": 0.53, + "grad_norm": 9.367995262145996, + "learning_rate": 2.2104856701561695e-05, + "loss": 0.7663, + "step": 3067 + }, + { + "epoch": 0.53, + "grad_norm": 8.3759126663208, + "learning_rate": 2.21022824781191e-05, + "loss": 0.8605, + "step": 3068 + }, + { + "epoch": 0.53, + "grad_norm": 9.688425064086914, + "learning_rate": 2.2099708254676505e-05, + "loss": 0.8383, + "step": 3069 + }, + { + "epoch": 0.53, + "grad_norm": 9.046122550964355, + "learning_rate": 2.209713403123391e-05, + "loss": 1.0199, + "step": 3070 + }, + { + "epoch": 0.53, + "grad_norm": 8.270439147949219, + "learning_rate": 2.2094559807791318e-05, + "loss": 1.112, + "step": 3071 + }, + { + "epoch": 0.53, + "grad_norm": 9.894763946533203, + "learning_rate": 2.209198558434872e-05, + "loss": 0.8632, + "step": 3072 + }, + { + "epoch": 0.53, + "grad_norm": 11.473116874694824, + "learning_rate": 2.2089411360906128e-05, + "loss": 1.1591, + "step": 3073 + }, + { + "epoch": 0.53, + "grad_norm": 11.77759075164795, + "learning_rate": 2.208683713746353e-05, + "loss": 1.1484, + "step": 3074 + }, + { + "epoch": 0.53, + "grad_norm": 12.56604290008545, + "learning_rate": 2.2084262914020938e-05, + "loss": 1.3491, + "step": 3075 + }, + { + "epoch": 0.53, + "grad_norm": 8.549914360046387, + "learning_rate": 2.208168869057834e-05, + "loss": 0.894, + "step": 3076 + }, + { + "epoch": 0.53, + "grad_norm": 8.53173828125, + "learning_rate": 2.2079114467135748e-05, + "loss": 1.1102, + "step": 3077 + }, + { + "epoch": 0.53, + "grad_norm": 9.941644668579102, + "learning_rate": 2.207654024369315e-05, + "loss": 1.0642, + "step": 3078 + }, + { + "epoch": 0.53, + "grad_norm": 10.05459976196289, + "learning_rate": 2.2073966020250558e-05, + "loss": 0.8512, + "step": 3079 + }, + { + "epoch": 0.53, + "grad_norm": 11.281445503234863, + "learning_rate": 2.2071391796807965e-05, + "loss": 1.159, + "step": 3080 + }, + { + "epoch": 0.53, + "grad_norm": 8.477843284606934, + "learning_rate": 2.2068817573365368e-05, + "loss": 0.8448, + "step": 3081 + }, + { + "epoch": 0.53, + "grad_norm": 8.021671295166016, + "learning_rate": 2.2066243349922775e-05, + "loss": 0.7585, + "step": 3082 + }, + { + "epoch": 0.53, + "grad_norm": 8.576374053955078, + "learning_rate": 2.2063669126480178e-05, + "loss": 0.9537, + "step": 3083 + }, + { + "epoch": 0.53, + "grad_norm": 7.444153308868408, + "learning_rate": 2.2061094903037585e-05, + "loss": 0.9178, + "step": 3084 + }, + { + "epoch": 0.53, + "grad_norm": 12.676690101623535, + "learning_rate": 2.2058520679594988e-05, + "loss": 1.1081, + "step": 3085 + }, + { + "epoch": 0.53, + "grad_norm": 10.60732650756836, + "learning_rate": 2.2055946456152395e-05, + "loss": 1.2603, + "step": 3086 + }, + { + "epoch": 0.53, + "grad_norm": 10.08698844909668, + "learning_rate": 2.2053372232709798e-05, + "loss": 0.9504, + "step": 3087 + }, + { + "epoch": 0.53, + "grad_norm": 9.596535682678223, + "learning_rate": 2.2050798009267205e-05, + "loss": 1.0693, + "step": 3088 + }, + { + "epoch": 0.53, + "grad_norm": 10.122248649597168, + "learning_rate": 2.204822378582461e-05, + "loss": 1.0534, + "step": 3089 + }, + { + "epoch": 0.53, + "grad_norm": 8.22197437286377, + "learning_rate": 2.2045649562382018e-05, + "loss": 0.8598, + "step": 3090 + }, + { + "epoch": 0.53, + "grad_norm": 8.440376281738281, + "learning_rate": 2.204307533893942e-05, + "loss": 1.1198, + "step": 3091 + }, + { + "epoch": 0.53, + "grad_norm": 9.762871742248535, + "learning_rate": 2.2040501115496825e-05, + "loss": 1.0391, + "step": 3092 + }, + { + "epoch": 0.53, + "grad_norm": 10.070852279663086, + "learning_rate": 2.203792689205423e-05, + "loss": 1.0754, + "step": 3093 + }, + { + "epoch": 0.53, + "grad_norm": 7.561709403991699, + "learning_rate": 2.2035352668611635e-05, + "loss": 0.7504, + "step": 3094 + }, + { + "epoch": 0.53, + "grad_norm": 8.179771423339844, + "learning_rate": 2.203277844516904e-05, + "loss": 0.8929, + "step": 3095 + }, + { + "epoch": 0.53, + "grad_norm": 10.326729774475098, + "learning_rate": 2.2030204221726445e-05, + "loss": 1.0587, + "step": 3096 + }, + { + "epoch": 0.53, + "grad_norm": 9.242788314819336, + "learning_rate": 2.202762999828385e-05, + "loss": 1.0423, + "step": 3097 + }, + { + "epoch": 0.53, + "grad_norm": 10.7095365524292, + "learning_rate": 2.2025055774841258e-05, + "loss": 0.9249, + "step": 3098 + }, + { + "epoch": 0.53, + "grad_norm": 10.872523307800293, + "learning_rate": 2.2022481551398665e-05, + "loss": 1.2675, + "step": 3099 + }, + { + "epoch": 0.53, + "grad_norm": 11.039074897766113, + "learning_rate": 2.2019907327956068e-05, + "loss": 0.9968, + "step": 3100 + }, + { + "epoch": 0.53, + "grad_norm": 11.447712898254395, + "learning_rate": 2.201733310451347e-05, + "loss": 0.9513, + "step": 3101 + }, + { + "epoch": 0.53, + "grad_norm": 9.730088233947754, + "learning_rate": 2.2014758881070878e-05, + "loss": 1.0489, + "step": 3102 + }, + { + "epoch": 0.53, + "grad_norm": 8.814104080200195, + "learning_rate": 2.201218465762828e-05, + "loss": 1.0052, + "step": 3103 + }, + { + "epoch": 0.53, + "grad_norm": 8.879773139953613, + "learning_rate": 2.2009610434185688e-05, + "loss": 0.799, + "step": 3104 + }, + { + "epoch": 0.53, + "grad_norm": 9.684769630432129, + "learning_rate": 2.200703621074309e-05, + "loss": 0.8019, + "step": 3105 + }, + { + "epoch": 0.53, + "grad_norm": 9.733920097351074, + "learning_rate": 2.2004461987300498e-05, + "loss": 0.8762, + "step": 3106 + }, + { + "epoch": 0.53, + "grad_norm": 10.692930221557617, + "learning_rate": 2.20018877638579e-05, + "loss": 1.3131, + "step": 3107 + }, + { + "epoch": 0.53, + "grad_norm": 8.138765335083008, + "learning_rate": 2.199931354041531e-05, + "loss": 0.8082, + "step": 3108 + }, + { + "epoch": 0.53, + "grad_norm": 10.816980361938477, + "learning_rate": 2.1996739316972715e-05, + "loss": 1.0204, + "step": 3109 + }, + { + "epoch": 0.53, + "grad_norm": 8.869331359863281, + "learning_rate": 2.199416509353012e-05, + "loss": 0.952, + "step": 3110 + }, + { + "epoch": 0.53, + "grad_norm": 11.218375205993652, + "learning_rate": 2.1991590870087525e-05, + "loss": 1.1501, + "step": 3111 + }, + { + "epoch": 0.53, + "grad_norm": 9.711645126342773, + "learning_rate": 2.1989016646644928e-05, + "loss": 1.1254, + "step": 3112 + }, + { + "epoch": 0.53, + "grad_norm": 11.05109977722168, + "learning_rate": 2.1986442423202335e-05, + "loss": 1.2439, + "step": 3113 + }, + { + "epoch": 0.53, + "grad_norm": 8.715018272399902, + "learning_rate": 2.1983868199759738e-05, + "loss": 0.9996, + "step": 3114 + }, + { + "epoch": 0.53, + "grad_norm": 9.492490768432617, + "learning_rate": 2.1981293976317144e-05, + "loss": 0.8824, + "step": 3115 + }, + { + "epoch": 0.53, + "grad_norm": 9.98580265045166, + "learning_rate": 2.1978719752874548e-05, + "loss": 0.9619, + "step": 3116 + }, + { + "epoch": 0.53, + "grad_norm": 8.428837776184082, + "learning_rate": 2.1976145529431958e-05, + "loss": 0.8728, + "step": 3117 + }, + { + "epoch": 0.54, + "grad_norm": 10.138510704040527, + "learning_rate": 2.197357130598936e-05, + "loss": 1.1656, + "step": 3118 + }, + { + "epoch": 0.54, + "grad_norm": 8.214956283569336, + "learning_rate": 2.1970997082546768e-05, + "loss": 0.8116, + "step": 3119 + }, + { + "epoch": 0.54, + "grad_norm": 9.222757339477539, + "learning_rate": 2.196842285910417e-05, + "loss": 0.9675, + "step": 3120 + }, + { + "epoch": 0.54, + "grad_norm": 7.356564998626709, + "learning_rate": 2.1965848635661578e-05, + "loss": 0.8123, + "step": 3121 + }, + { + "epoch": 0.54, + "grad_norm": 10.394486427307129, + "learning_rate": 2.196327441221898e-05, + "loss": 1.2035, + "step": 3122 + }, + { + "epoch": 0.54, + "grad_norm": 9.59044361114502, + "learning_rate": 2.1960700188776384e-05, + "loss": 1.2356, + "step": 3123 + }, + { + "epoch": 0.54, + "grad_norm": 9.820239067077637, + "learning_rate": 2.195812596533379e-05, + "loss": 1.1329, + "step": 3124 + }, + { + "epoch": 0.54, + "grad_norm": 13.050148010253906, + "learning_rate": 2.1955551741891194e-05, + "loss": 1.2123, + "step": 3125 + }, + { + "epoch": 0.54, + "grad_norm": 10.765544891357422, + "learning_rate": 2.19529775184486e-05, + "loss": 1.2961, + "step": 3126 + }, + { + "epoch": 0.54, + "grad_norm": 8.465819358825684, + "learning_rate": 2.1950403295006008e-05, + "loss": 0.8429, + "step": 3127 + }, + { + "epoch": 0.54, + "grad_norm": 9.164430618286133, + "learning_rate": 2.1947829071563414e-05, + "loss": 0.9639, + "step": 3128 + }, + { + "epoch": 0.54, + "grad_norm": 9.604117393493652, + "learning_rate": 2.1945254848120818e-05, + "loss": 1.1961, + "step": 3129 + }, + { + "epoch": 0.54, + "grad_norm": 8.781293869018555, + "learning_rate": 2.1942680624678224e-05, + "loss": 1.1247, + "step": 3130 + }, + { + "epoch": 0.54, + "grad_norm": 9.708008766174316, + "learning_rate": 2.1940106401235628e-05, + "loss": 0.839, + "step": 3131 + }, + { + "epoch": 0.54, + "grad_norm": 8.764863967895508, + "learning_rate": 2.1937532177793034e-05, + "loss": 0.9776, + "step": 3132 + }, + { + "epoch": 0.54, + "grad_norm": 8.481902122497559, + "learning_rate": 2.1934957954350438e-05, + "loss": 0.9819, + "step": 3133 + }, + { + "epoch": 0.54, + "grad_norm": 8.671924591064453, + "learning_rate": 2.193238373090784e-05, + "loss": 0.8807, + "step": 3134 + }, + { + "epoch": 0.54, + "grad_norm": 8.681611061096191, + "learning_rate": 2.1929809507465248e-05, + "loss": 0.8351, + "step": 3135 + }, + { + "epoch": 0.54, + "grad_norm": 7.935581684112549, + "learning_rate": 2.1927235284022654e-05, + "loss": 0.7053, + "step": 3136 + }, + { + "epoch": 0.54, + "grad_norm": 9.76996898651123, + "learning_rate": 2.192466106058006e-05, + "loss": 1.1438, + "step": 3137 + }, + { + "epoch": 0.54, + "grad_norm": 10.040933609008789, + "learning_rate": 2.1922086837137464e-05, + "loss": 1.1028, + "step": 3138 + }, + { + "epoch": 0.54, + "grad_norm": 10.696728706359863, + "learning_rate": 2.191951261369487e-05, + "loss": 1.2261, + "step": 3139 + }, + { + "epoch": 0.54, + "grad_norm": 10.489141464233398, + "learning_rate": 2.1916938390252274e-05, + "loss": 1.1004, + "step": 3140 + }, + { + "epoch": 0.54, + "grad_norm": 10.115700721740723, + "learning_rate": 2.191436416680968e-05, + "loss": 1.0205, + "step": 3141 + }, + { + "epoch": 0.54, + "grad_norm": 10.014881134033203, + "learning_rate": 2.1911789943367084e-05, + "loss": 1.0217, + "step": 3142 + }, + { + "epoch": 0.54, + "grad_norm": 7.941526889801025, + "learning_rate": 2.1909215719924488e-05, + "loss": 0.7474, + "step": 3143 + }, + { + "epoch": 0.54, + "grad_norm": 9.077324867248535, + "learning_rate": 2.1906641496481894e-05, + "loss": 0.7121, + "step": 3144 + }, + { + "epoch": 0.54, + "grad_norm": 10.537542343139648, + "learning_rate": 2.1904067273039297e-05, + "loss": 1.0168, + "step": 3145 + }, + { + "epoch": 0.54, + "grad_norm": 10.554915428161621, + "learning_rate": 2.1901493049596708e-05, + "loss": 0.97, + "step": 3146 + }, + { + "epoch": 0.54, + "grad_norm": 9.603519439697266, + "learning_rate": 2.189891882615411e-05, + "loss": 0.9573, + "step": 3147 + }, + { + "epoch": 0.54, + "grad_norm": 9.304115295410156, + "learning_rate": 2.1896344602711518e-05, + "loss": 1.0942, + "step": 3148 + }, + { + "epoch": 0.54, + "grad_norm": 9.811833381652832, + "learning_rate": 2.189377037926892e-05, + "loss": 0.897, + "step": 3149 + }, + { + "epoch": 0.54, + "grad_norm": 8.509538650512695, + "learning_rate": 2.1891196155826328e-05, + "loss": 0.6555, + "step": 3150 + }, + { + "epoch": 0.54, + "grad_norm": 11.038629531860352, + "learning_rate": 2.188862193238373e-05, + "loss": 1.0459, + "step": 3151 + }, + { + "epoch": 0.54, + "grad_norm": 10.320324897766113, + "learning_rate": 2.1886047708941137e-05, + "loss": 0.9769, + "step": 3152 + }, + { + "epoch": 0.54, + "grad_norm": 8.822304725646973, + "learning_rate": 2.188347348549854e-05, + "loss": 0.7505, + "step": 3153 + }, + { + "epoch": 0.54, + "grad_norm": 9.76353645324707, + "learning_rate": 2.1880899262055944e-05, + "loss": 1.0906, + "step": 3154 + }, + { + "epoch": 0.54, + "grad_norm": 12.874078750610352, + "learning_rate": 2.1878325038613354e-05, + "loss": 1.54, + "step": 3155 + }, + { + "epoch": 0.54, + "grad_norm": 11.56296443939209, + "learning_rate": 2.1875750815170757e-05, + "loss": 1.1177, + "step": 3156 + }, + { + "epoch": 0.54, + "grad_norm": 9.449142456054688, + "learning_rate": 2.1873176591728164e-05, + "loss": 0.9417, + "step": 3157 + }, + { + "epoch": 0.54, + "grad_norm": 9.855195999145508, + "learning_rate": 2.1870602368285567e-05, + "loss": 1.0936, + "step": 3158 + }, + { + "epoch": 0.54, + "grad_norm": 7.562729358673096, + "learning_rate": 2.1868028144842974e-05, + "loss": 0.7096, + "step": 3159 + }, + { + "epoch": 0.54, + "grad_norm": 9.778154373168945, + "learning_rate": 2.1865453921400377e-05, + "loss": 0.9019, + "step": 3160 + }, + { + "epoch": 0.54, + "grad_norm": 10.041037559509277, + "learning_rate": 2.1862879697957784e-05, + "loss": 0.962, + "step": 3161 + }, + { + "epoch": 0.54, + "grad_norm": 11.064740180969238, + "learning_rate": 2.1860305474515187e-05, + "loss": 1.0679, + "step": 3162 + }, + { + "epoch": 0.54, + "grad_norm": 9.525568008422852, + "learning_rate": 2.1857731251072594e-05, + "loss": 1.0027, + "step": 3163 + }, + { + "epoch": 0.54, + "grad_norm": 10.927902221679688, + "learning_rate": 2.1855157027629997e-05, + "loss": 0.9911, + "step": 3164 + }, + { + "epoch": 0.54, + "grad_norm": 10.060417175292969, + "learning_rate": 2.1852582804187404e-05, + "loss": 1.1202, + "step": 3165 + }, + { + "epoch": 0.54, + "grad_norm": 9.134319305419922, + "learning_rate": 2.185000858074481e-05, + "loss": 0.9622, + "step": 3166 + }, + { + "epoch": 0.54, + "grad_norm": 9.938206672668457, + "learning_rate": 2.1847434357302214e-05, + "loss": 0.9234, + "step": 3167 + }, + { + "epoch": 0.54, + "grad_norm": 9.985689163208008, + "learning_rate": 2.184486013385962e-05, + "loss": 1.2295, + "step": 3168 + }, + { + "epoch": 0.54, + "grad_norm": 10.237380981445312, + "learning_rate": 2.1842285910417024e-05, + "loss": 0.9057, + "step": 3169 + }, + { + "epoch": 0.54, + "grad_norm": 7.169559478759766, + "learning_rate": 2.183971168697443e-05, + "loss": 0.6954, + "step": 3170 + }, + { + "epoch": 0.54, + "grad_norm": 8.447514533996582, + "learning_rate": 2.1837137463531834e-05, + "loss": 1.0324, + "step": 3171 + }, + { + "epoch": 0.54, + "grad_norm": 9.671792984008789, + "learning_rate": 2.183456324008924e-05, + "loss": 0.8003, + "step": 3172 + }, + { + "epoch": 0.54, + "grad_norm": 8.219077110290527, + "learning_rate": 2.1831989016646644e-05, + "loss": 0.6495, + "step": 3173 + }, + { + "epoch": 0.54, + "grad_norm": 9.128629684448242, + "learning_rate": 2.182941479320405e-05, + "loss": 0.7984, + "step": 3174 + }, + { + "epoch": 0.54, + "grad_norm": 7.862685680389404, + "learning_rate": 2.1826840569761457e-05, + "loss": 0.791, + "step": 3175 + }, + { + "epoch": 0.55, + "grad_norm": 8.163856506347656, + "learning_rate": 2.182426634631886e-05, + "loss": 0.8417, + "step": 3176 + }, + { + "epoch": 0.55, + "grad_norm": 10.783720016479492, + "learning_rate": 2.1821692122876267e-05, + "loss": 1.0774, + "step": 3177 + }, + { + "epoch": 0.55, + "grad_norm": 8.313047409057617, + "learning_rate": 2.181911789943367e-05, + "loss": 0.8129, + "step": 3178 + }, + { + "epoch": 0.55, + "grad_norm": 9.134902000427246, + "learning_rate": 2.1816543675991077e-05, + "loss": 0.7819, + "step": 3179 + }, + { + "epoch": 0.55, + "grad_norm": 12.417607307434082, + "learning_rate": 2.181396945254848e-05, + "loss": 1.0525, + "step": 3180 + }, + { + "epoch": 0.55, + "grad_norm": 8.567964553833008, + "learning_rate": 2.1811395229105887e-05, + "loss": 0.7269, + "step": 3181 + }, + { + "epoch": 0.55, + "grad_norm": 8.554522514343262, + "learning_rate": 2.180882100566329e-05, + "loss": 0.7956, + "step": 3182 + }, + { + "epoch": 0.55, + "grad_norm": 8.361642837524414, + "learning_rate": 2.1806246782220697e-05, + "loss": 0.7218, + "step": 3183 + }, + { + "epoch": 0.55, + "grad_norm": 10.764366149902344, + "learning_rate": 2.1803672558778104e-05, + "loss": 1.27, + "step": 3184 + }, + { + "epoch": 0.55, + "grad_norm": 8.85014533996582, + "learning_rate": 2.1801098335335507e-05, + "loss": 0.9879, + "step": 3185 + }, + { + "epoch": 0.55, + "grad_norm": 8.578740119934082, + "learning_rate": 2.1798524111892914e-05, + "loss": 0.884, + "step": 3186 + }, + { + "epoch": 0.55, + "grad_norm": 11.373249053955078, + "learning_rate": 2.1795949888450317e-05, + "loss": 0.8658, + "step": 3187 + }, + { + "epoch": 0.55, + "grad_norm": 11.385355949401855, + "learning_rate": 2.1793375665007724e-05, + "loss": 1.0733, + "step": 3188 + }, + { + "epoch": 0.55, + "grad_norm": 10.13532829284668, + "learning_rate": 2.1790801441565127e-05, + "loss": 1.1228, + "step": 3189 + }, + { + "epoch": 0.55, + "grad_norm": 8.906203269958496, + "learning_rate": 2.1788227218122534e-05, + "loss": 1.0137, + "step": 3190 + }, + { + "epoch": 0.55, + "grad_norm": 10.440940856933594, + "learning_rate": 2.1785652994679937e-05, + "loss": 0.9276, + "step": 3191 + }, + { + "epoch": 0.55, + "grad_norm": 10.625938415527344, + "learning_rate": 2.1783078771237344e-05, + "loss": 1.3305, + "step": 3192 + }, + { + "epoch": 0.55, + "grad_norm": 10.186883926391602, + "learning_rate": 2.178050454779475e-05, + "loss": 0.9193, + "step": 3193 + }, + { + "epoch": 0.55, + "grad_norm": 8.901379585266113, + "learning_rate": 2.1777930324352157e-05, + "loss": 0.9429, + "step": 3194 + }, + { + "epoch": 0.55, + "grad_norm": 10.108570098876953, + "learning_rate": 2.177535610090956e-05, + "loss": 0.8455, + "step": 3195 + }, + { + "epoch": 0.55, + "grad_norm": 9.209932327270508, + "learning_rate": 2.1772781877466964e-05, + "loss": 0.8453, + "step": 3196 + }, + { + "epoch": 0.55, + "grad_norm": 9.383414268493652, + "learning_rate": 2.177020765402437e-05, + "loss": 0.847, + "step": 3197 + }, + { + "epoch": 0.55, + "grad_norm": 10.280423164367676, + "learning_rate": 2.1767633430581774e-05, + "loss": 0.9467, + "step": 3198 + }, + { + "epoch": 0.55, + "grad_norm": 9.35180377960205, + "learning_rate": 2.176505920713918e-05, + "loss": 1.2009, + "step": 3199 + }, + { + "epoch": 0.55, + "grad_norm": 10.709630966186523, + "learning_rate": 2.1762484983696584e-05, + "loss": 1.105, + "step": 3200 + }, + { + "epoch": 0.55, + "grad_norm": 10.566025733947754, + "learning_rate": 2.175991076025399e-05, + "loss": 0.919, + "step": 3201 + }, + { + "epoch": 0.55, + "grad_norm": 9.236313819885254, + "learning_rate": 2.1757336536811394e-05, + "loss": 1.142, + "step": 3202 + }, + { + "epoch": 0.55, + "grad_norm": 8.696102142333984, + "learning_rate": 2.1754762313368804e-05, + "loss": 0.8391, + "step": 3203 + }, + { + "epoch": 0.55, + "grad_norm": 11.705658912658691, + "learning_rate": 2.1752188089926207e-05, + "loss": 1.3478, + "step": 3204 + }, + { + "epoch": 0.55, + "grad_norm": 8.247570991516113, + "learning_rate": 2.174961386648361e-05, + "loss": 0.6832, + "step": 3205 + }, + { + "epoch": 0.55, + "grad_norm": 8.861929893493652, + "learning_rate": 2.1747039643041017e-05, + "loss": 0.6828, + "step": 3206 + }, + { + "epoch": 0.55, + "grad_norm": 8.63481616973877, + "learning_rate": 2.174446541959842e-05, + "loss": 1.0992, + "step": 3207 + }, + { + "epoch": 0.55, + "grad_norm": 9.865202903747559, + "learning_rate": 2.1741891196155827e-05, + "loss": 0.9223, + "step": 3208 + }, + { + "epoch": 0.55, + "grad_norm": 9.262078285217285, + "learning_rate": 2.173931697271323e-05, + "loss": 0.9563, + "step": 3209 + }, + { + "epoch": 0.55, + "grad_norm": 11.30843734741211, + "learning_rate": 2.1736742749270637e-05, + "loss": 1.1408, + "step": 3210 + }, + { + "epoch": 0.55, + "grad_norm": 8.31630802154541, + "learning_rate": 2.173416852582804e-05, + "loss": 0.8361, + "step": 3211 + }, + { + "epoch": 0.55, + "grad_norm": 7.973668575286865, + "learning_rate": 2.173159430238545e-05, + "loss": 0.8109, + "step": 3212 + }, + { + "epoch": 0.55, + "grad_norm": 9.872391700744629, + "learning_rate": 2.1729020078942854e-05, + "loss": 0.9446, + "step": 3213 + }, + { + "epoch": 0.55, + "grad_norm": 8.826184272766113, + "learning_rate": 2.172644585550026e-05, + "loss": 0.73, + "step": 3214 + }, + { + "epoch": 0.55, + "grad_norm": 9.253357887268066, + "learning_rate": 2.1723871632057664e-05, + "loss": 1.113, + "step": 3215 + }, + { + "epoch": 0.55, + "grad_norm": 12.210947036743164, + "learning_rate": 2.1721297408615067e-05, + "loss": 1.0653, + "step": 3216 + }, + { + "epoch": 0.55, + "grad_norm": 9.294479370117188, + "learning_rate": 2.1718723185172474e-05, + "loss": 0.8931, + "step": 3217 + }, + { + "epoch": 0.55, + "grad_norm": 7.98333740234375, + "learning_rate": 2.1716148961729877e-05, + "loss": 0.8752, + "step": 3218 + }, + { + "epoch": 0.55, + "grad_norm": 9.869829177856445, + "learning_rate": 2.1713574738287283e-05, + "loss": 1.0788, + "step": 3219 + }, + { + "epoch": 0.55, + "grad_norm": 10.192008018493652, + "learning_rate": 2.1711000514844687e-05, + "loss": 0.783, + "step": 3220 + }, + { + "epoch": 0.55, + "grad_norm": 10.066743850708008, + "learning_rate": 2.1708426291402097e-05, + "loss": 1.1949, + "step": 3221 + }, + { + "epoch": 0.55, + "grad_norm": 10.277292251586914, + "learning_rate": 2.17058520679595e-05, + "loss": 0.9494, + "step": 3222 + }, + { + "epoch": 0.55, + "grad_norm": 9.366868019104004, + "learning_rate": 2.1703277844516907e-05, + "loss": 1.0176, + "step": 3223 + }, + { + "epoch": 0.55, + "grad_norm": 9.33558464050293, + "learning_rate": 2.170070362107431e-05, + "loss": 0.7573, + "step": 3224 + }, + { + "epoch": 0.55, + "grad_norm": 8.411799430847168, + "learning_rate": 2.1698129397631717e-05, + "loss": 0.7129, + "step": 3225 + }, + { + "epoch": 0.55, + "grad_norm": 9.274582862854004, + "learning_rate": 2.169555517418912e-05, + "loss": 1.0729, + "step": 3226 + }, + { + "epoch": 0.55, + "grad_norm": 10.059366226196289, + "learning_rate": 2.1692980950746523e-05, + "loss": 0.9163, + "step": 3227 + }, + { + "epoch": 0.55, + "grad_norm": 12.27176570892334, + "learning_rate": 2.169040672730393e-05, + "loss": 1.4554, + "step": 3228 + }, + { + "epoch": 0.55, + "grad_norm": 10.528203010559082, + "learning_rate": 2.1687832503861333e-05, + "loss": 0.8814, + "step": 3229 + }, + { + "epoch": 0.55, + "grad_norm": 9.73957347869873, + "learning_rate": 2.168525828041874e-05, + "loss": 0.9085, + "step": 3230 + }, + { + "epoch": 0.55, + "grad_norm": 9.321489334106445, + "learning_rate": 2.1682684056976147e-05, + "loss": 0.9523, + "step": 3231 + }, + { + "epoch": 0.55, + "grad_norm": 10.32539176940918, + "learning_rate": 2.1680109833533553e-05, + "loss": 0.6604, + "step": 3232 + }, + { + "epoch": 0.55, + "grad_norm": 10.537572860717773, + "learning_rate": 2.1677535610090957e-05, + "loss": 1.0714, + "step": 3233 + }, + { + "epoch": 0.56, + "grad_norm": 9.214726448059082, + "learning_rate": 2.1674961386648363e-05, + "loss": 0.8779, + "step": 3234 + }, + { + "epoch": 0.56, + "grad_norm": 12.187397003173828, + "learning_rate": 2.1672387163205767e-05, + "loss": 1.4253, + "step": 3235 + }, + { + "epoch": 0.56, + "grad_norm": 9.91473388671875, + "learning_rate": 2.1669812939763173e-05, + "loss": 0.8857, + "step": 3236 + }, + { + "epoch": 0.56, + "grad_norm": 9.964056015014648, + "learning_rate": 2.1667238716320577e-05, + "loss": 0.8446, + "step": 3237 + }, + { + "epoch": 0.56, + "grad_norm": 10.088868141174316, + "learning_rate": 2.166466449287798e-05, + "loss": 0.9246, + "step": 3238 + }, + { + "epoch": 0.56, + "grad_norm": 10.25182819366455, + "learning_rate": 2.1662090269435387e-05, + "loss": 0.983, + "step": 3239 + }, + { + "epoch": 0.56, + "grad_norm": 9.780190467834473, + "learning_rate": 2.1659516045992793e-05, + "loss": 0.9359, + "step": 3240 + }, + { + "epoch": 0.56, + "grad_norm": 8.661764144897461, + "learning_rate": 2.16569418225502e-05, + "loss": 0.8701, + "step": 3241 + }, + { + "epoch": 0.56, + "grad_norm": 10.114421844482422, + "learning_rate": 2.1654367599107603e-05, + "loss": 1.2242, + "step": 3242 + }, + { + "epoch": 0.56, + "grad_norm": 9.93411636352539, + "learning_rate": 2.165179337566501e-05, + "loss": 1.0718, + "step": 3243 + }, + { + "epoch": 0.56, + "grad_norm": 10.305581092834473, + "learning_rate": 2.1649219152222413e-05, + "loss": 1.0008, + "step": 3244 + }, + { + "epoch": 0.56, + "grad_norm": 7.578404903411865, + "learning_rate": 2.164664492877982e-05, + "loss": 0.7234, + "step": 3245 + }, + { + "epoch": 0.56, + "grad_norm": 8.716707229614258, + "learning_rate": 2.1644070705337223e-05, + "loss": 0.7725, + "step": 3246 + }, + { + "epoch": 0.56, + "grad_norm": 10.51779556274414, + "learning_rate": 2.1641496481894627e-05, + "loss": 1.0431, + "step": 3247 + }, + { + "epoch": 0.56, + "grad_norm": 9.817296028137207, + "learning_rate": 2.1638922258452033e-05, + "loss": 1.0968, + "step": 3248 + }, + { + "epoch": 0.56, + "grad_norm": 9.080244064331055, + "learning_rate": 2.1636348035009437e-05, + "loss": 0.8871, + "step": 3249 + }, + { + "epoch": 0.56, + "grad_norm": 10.9187650680542, + "learning_rate": 2.1633773811566847e-05, + "loss": 1.3985, + "step": 3250 + }, + { + "epoch": 0.56, + "grad_norm": 9.561849594116211, + "learning_rate": 2.163119958812425e-05, + "loss": 1.1111, + "step": 3251 + }, + { + "epoch": 0.56, + "grad_norm": 10.092209815979004, + "learning_rate": 2.1628625364681657e-05, + "loss": 0.9586, + "step": 3252 + }, + { + "epoch": 0.56, + "grad_norm": 9.601984977722168, + "learning_rate": 2.162605114123906e-05, + "loss": 0.9554, + "step": 3253 + }, + { + "epoch": 0.56, + "grad_norm": 8.11715030670166, + "learning_rate": 2.1623476917796467e-05, + "loss": 0.8711, + "step": 3254 + }, + { + "epoch": 0.56, + "grad_norm": 10.338886260986328, + "learning_rate": 2.162090269435387e-05, + "loss": 1.079, + "step": 3255 + }, + { + "epoch": 0.56, + "grad_norm": 9.088749885559082, + "learning_rate": 2.1618328470911276e-05, + "loss": 1.0061, + "step": 3256 + }, + { + "epoch": 0.56, + "grad_norm": 7.813129901885986, + "learning_rate": 2.161575424746868e-05, + "loss": 0.9419, + "step": 3257 + }, + { + "epoch": 0.56, + "grad_norm": 9.794445037841797, + "learning_rate": 2.1613180024026083e-05, + "loss": 1.1162, + "step": 3258 + }, + { + "epoch": 0.56, + "grad_norm": 8.691902160644531, + "learning_rate": 2.1610605800583493e-05, + "loss": 0.7736, + "step": 3259 + }, + { + "epoch": 0.56, + "grad_norm": 6.716875076293945, + "learning_rate": 2.1608031577140896e-05, + "loss": 0.6622, + "step": 3260 + }, + { + "epoch": 0.56, + "grad_norm": 10.189523696899414, + "learning_rate": 2.1605457353698303e-05, + "loss": 1.0522, + "step": 3261 + }, + { + "epoch": 0.56, + "grad_norm": 8.608463287353516, + "learning_rate": 2.1602883130255706e-05, + "loss": 0.7976, + "step": 3262 + }, + { + "epoch": 0.56, + "grad_norm": 9.183985710144043, + "learning_rate": 2.1600308906813113e-05, + "loss": 1.0986, + "step": 3263 + }, + { + "epoch": 0.56, + "grad_norm": 8.639912605285645, + "learning_rate": 2.1597734683370516e-05, + "loss": 0.6988, + "step": 3264 + }, + { + "epoch": 0.56, + "grad_norm": 10.809829711914062, + "learning_rate": 2.1595160459927923e-05, + "loss": 1.5204, + "step": 3265 + }, + { + "epoch": 0.56, + "grad_norm": 8.907243728637695, + "learning_rate": 2.1592586236485326e-05, + "loss": 0.7407, + "step": 3266 + }, + { + "epoch": 0.56, + "grad_norm": 9.971426963806152, + "learning_rate": 2.1590012013042733e-05, + "loss": 1.0083, + "step": 3267 + }, + { + "epoch": 0.56, + "grad_norm": 10.05289363861084, + "learning_rate": 2.1587437789600136e-05, + "loss": 0.8034, + "step": 3268 + }, + { + "epoch": 0.56, + "grad_norm": 9.950414657592773, + "learning_rate": 2.1584863566157543e-05, + "loss": 0.8737, + "step": 3269 + }, + { + "epoch": 0.56, + "grad_norm": 8.235519409179688, + "learning_rate": 2.158228934271495e-05, + "loss": 0.7837, + "step": 3270 + }, + { + "epoch": 0.56, + "grad_norm": 10.354198455810547, + "learning_rate": 2.1579715119272353e-05, + "loss": 0.9803, + "step": 3271 + }, + { + "epoch": 0.56, + "grad_norm": 9.11938762664795, + "learning_rate": 2.157714089582976e-05, + "loss": 0.708, + "step": 3272 + }, + { + "epoch": 0.56, + "grad_norm": 10.488126754760742, + "learning_rate": 2.1574566672387163e-05, + "loss": 1.0758, + "step": 3273 + }, + { + "epoch": 0.56, + "grad_norm": 11.257621765136719, + "learning_rate": 2.157199244894457e-05, + "loss": 0.9413, + "step": 3274 + }, + { + "epoch": 0.56, + "grad_norm": 11.837946891784668, + "learning_rate": 2.1569418225501973e-05, + "loss": 0.8158, + "step": 3275 + }, + { + "epoch": 0.56, + "grad_norm": 11.066544532775879, + "learning_rate": 2.156684400205938e-05, + "loss": 1.1956, + "step": 3276 + }, + { + "epoch": 0.56, + "grad_norm": 9.441474914550781, + "learning_rate": 2.1564269778616783e-05, + "loss": 0.8086, + "step": 3277 + }, + { + "epoch": 0.56, + "grad_norm": 10.52379322052002, + "learning_rate": 2.156169555517419e-05, + "loss": 0.6603, + "step": 3278 + }, + { + "epoch": 0.56, + "grad_norm": 11.316561698913574, + "learning_rate": 2.1559121331731596e-05, + "loss": 0.8135, + "step": 3279 + }, + { + "epoch": 0.56, + "grad_norm": 9.021976470947266, + "learning_rate": 2.1556547108289e-05, + "loss": 0.8317, + "step": 3280 + }, + { + "epoch": 0.56, + "grad_norm": 11.18840503692627, + "learning_rate": 2.1553972884846406e-05, + "loss": 1.0824, + "step": 3281 + }, + { + "epoch": 0.56, + "grad_norm": 10.439772605895996, + "learning_rate": 2.155139866140381e-05, + "loss": 0.8689, + "step": 3282 + }, + { + "epoch": 0.56, + "grad_norm": 10.884522438049316, + "learning_rate": 2.1548824437961216e-05, + "loss": 0.9889, + "step": 3283 + }, + { + "epoch": 0.56, + "grad_norm": 11.708690643310547, + "learning_rate": 2.154625021451862e-05, + "loss": 0.9767, + "step": 3284 + }, + { + "epoch": 0.56, + "grad_norm": 10.476131439208984, + "learning_rate": 2.1543675991076026e-05, + "loss": 1.0251, + "step": 3285 + }, + { + "epoch": 0.56, + "grad_norm": 10.36382007598877, + "learning_rate": 2.154110176763343e-05, + "loss": 0.7405, + "step": 3286 + }, + { + "epoch": 0.56, + "grad_norm": 11.07170295715332, + "learning_rate": 2.1538527544190836e-05, + "loss": 0.9733, + "step": 3287 + }, + { + "epoch": 0.56, + "grad_norm": 10.972197532653809, + "learning_rate": 2.1535953320748243e-05, + "loss": 0.9596, + "step": 3288 + }, + { + "epoch": 0.56, + "grad_norm": 9.184207916259766, + "learning_rate": 2.1533379097305646e-05, + "loss": 0.7975, + "step": 3289 + }, + { + "epoch": 0.56, + "grad_norm": 9.137417793273926, + "learning_rate": 2.1530804873863053e-05, + "loss": 0.7493, + "step": 3290 + }, + { + "epoch": 0.56, + "grad_norm": 9.522597312927246, + "learning_rate": 2.1528230650420456e-05, + "loss": 1.1567, + "step": 3291 + }, + { + "epoch": 0.56, + "grad_norm": 10.457935333251953, + "learning_rate": 2.1525656426977863e-05, + "loss": 0.8999, + "step": 3292 + }, + { + "epoch": 0.57, + "grad_norm": 11.231308937072754, + "learning_rate": 2.1523082203535266e-05, + "loss": 1.1664, + "step": 3293 + }, + { + "epoch": 0.57, + "grad_norm": 10.125234603881836, + "learning_rate": 2.1520507980092673e-05, + "loss": 0.8378, + "step": 3294 + }, + { + "epoch": 0.57, + "grad_norm": 8.983253479003906, + "learning_rate": 2.1517933756650076e-05, + "loss": 0.8401, + "step": 3295 + }, + { + "epoch": 0.57, + "grad_norm": 11.115766525268555, + "learning_rate": 2.1515359533207483e-05, + "loss": 1.0702, + "step": 3296 + }, + { + "epoch": 0.57, + "grad_norm": 9.25951862335205, + "learning_rate": 2.151278530976489e-05, + "loss": 1.1353, + "step": 3297 + }, + { + "epoch": 0.57, + "grad_norm": 11.079184532165527, + "learning_rate": 2.1510211086322296e-05, + "loss": 0.9575, + "step": 3298 + }, + { + "epoch": 0.57, + "grad_norm": 7.612562656402588, + "learning_rate": 2.15076368628797e-05, + "loss": 0.9446, + "step": 3299 + }, + { + "epoch": 0.57, + "grad_norm": 9.672316551208496, + "learning_rate": 2.1505062639437103e-05, + "loss": 0.9791, + "step": 3300 + }, + { + "epoch": 0.57, + "grad_norm": 7.614047527313232, + "learning_rate": 2.150248841599451e-05, + "loss": 0.846, + "step": 3301 + }, + { + "epoch": 0.57, + "grad_norm": 9.674379348754883, + "learning_rate": 2.1499914192551913e-05, + "loss": 1.1468, + "step": 3302 + }, + { + "epoch": 0.57, + "grad_norm": 9.892300605773926, + "learning_rate": 2.149733996910932e-05, + "loss": 1.0865, + "step": 3303 + }, + { + "epoch": 0.57, + "grad_norm": 9.093138694763184, + "learning_rate": 2.1494765745666723e-05, + "loss": 1.0762, + "step": 3304 + }, + { + "epoch": 0.57, + "grad_norm": 9.564008712768555, + "learning_rate": 2.149219152222413e-05, + "loss": 0.9487, + "step": 3305 + }, + { + "epoch": 0.57, + "grad_norm": 9.3695707321167, + "learning_rate": 2.1489617298781533e-05, + "loss": 0.8785, + "step": 3306 + }, + { + "epoch": 0.57, + "grad_norm": 9.876216888427734, + "learning_rate": 2.1487043075338943e-05, + "loss": 0.9495, + "step": 3307 + }, + { + "epoch": 0.57, + "grad_norm": 10.334044456481934, + "learning_rate": 2.1484468851896346e-05, + "loss": 1.2583, + "step": 3308 + }, + { + "epoch": 0.57, + "grad_norm": 7.992374420166016, + "learning_rate": 2.148189462845375e-05, + "loss": 0.8078, + "step": 3309 + }, + { + "epoch": 0.57, + "grad_norm": 8.051708221435547, + "learning_rate": 2.1479320405011156e-05, + "loss": 0.8927, + "step": 3310 + }, + { + "epoch": 0.57, + "grad_norm": 9.372456550598145, + "learning_rate": 2.147674618156856e-05, + "loss": 0.8778, + "step": 3311 + }, + { + "epoch": 0.57, + "grad_norm": 7.7460761070251465, + "learning_rate": 2.1474171958125966e-05, + "loss": 0.9121, + "step": 3312 + }, + { + "epoch": 0.57, + "grad_norm": 8.750205993652344, + "learning_rate": 2.147159773468337e-05, + "loss": 0.8885, + "step": 3313 + }, + { + "epoch": 0.57, + "grad_norm": 8.668148040771484, + "learning_rate": 2.1469023511240776e-05, + "loss": 0.8839, + "step": 3314 + }, + { + "epoch": 0.57, + "grad_norm": 10.011385917663574, + "learning_rate": 2.146644928779818e-05, + "loss": 1.0066, + "step": 3315 + }, + { + "epoch": 0.57, + "grad_norm": 10.701638221740723, + "learning_rate": 2.146387506435559e-05, + "loss": 1.3349, + "step": 3316 + }, + { + "epoch": 0.57, + "grad_norm": 10.443819999694824, + "learning_rate": 2.1461300840912993e-05, + "loss": 1.265, + "step": 3317 + }, + { + "epoch": 0.57, + "grad_norm": 9.117536544799805, + "learning_rate": 2.14587266174704e-05, + "loss": 0.6747, + "step": 3318 + }, + { + "epoch": 0.57, + "grad_norm": 10.16749382019043, + "learning_rate": 2.1456152394027803e-05, + "loss": 1.0698, + "step": 3319 + }, + { + "epoch": 0.57, + "grad_norm": 10.960887908935547, + "learning_rate": 2.1453578170585206e-05, + "loss": 0.928, + "step": 3320 + }, + { + "epoch": 0.57, + "grad_norm": 8.629446983337402, + "learning_rate": 2.1451003947142613e-05, + "loss": 0.8203, + "step": 3321 + }, + { + "epoch": 0.57, + "grad_norm": 9.186299324035645, + "learning_rate": 2.1448429723700016e-05, + "loss": 0.8447, + "step": 3322 + }, + { + "epoch": 0.57, + "grad_norm": 8.769874572753906, + "learning_rate": 2.1445855500257423e-05, + "loss": 0.6914, + "step": 3323 + }, + { + "epoch": 0.57, + "grad_norm": 8.716658592224121, + "learning_rate": 2.1443281276814826e-05, + "loss": 0.7656, + "step": 3324 + }, + { + "epoch": 0.57, + "grad_norm": 10.508240699768066, + "learning_rate": 2.1440707053372236e-05, + "loss": 0.9904, + "step": 3325 + }, + { + "epoch": 0.57, + "grad_norm": 9.989377975463867, + "learning_rate": 2.143813282992964e-05, + "loss": 1.0246, + "step": 3326 + }, + { + "epoch": 0.57, + "grad_norm": 9.998805046081543, + "learning_rate": 2.1435558606487046e-05, + "loss": 0.8177, + "step": 3327 + }, + { + "epoch": 0.57, + "grad_norm": 11.800924301147461, + "learning_rate": 2.143298438304445e-05, + "loss": 1.2029, + "step": 3328 + }, + { + "epoch": 0.57, + "grad_norm": 8.99470043182373, + "learning_rate": 2.1430410159601856e-05, + "loss": 0.8444, + "step": 3329 + }, + { + "epoch": 0.57, + "grad_norm": 9.246833801269531, + "learning_rate": 2.142783593615926e-05, + "loss": 0.8711, + "step": 3330 + }, + { + "epoch": 0.57, + "grad_norm": 9.696725845336914, + "learning_rate": 2.1425261712716662e-05, + "loss": 0.8906, + "step": 3331 + }, + { + "epoch": 0.57, + "grad_norm": 10.879152297973633, + "learning_rate": 2.142268748927407e-05, + "loss": 1.0487, + "step": 3332 + }, + { + "epoch": 0.57, + "grad_norm": 8.030535697937012, + "learning_rate": 2.1420113265831472e-05, + "loss": 0.6264, + "step": 3333 + }, + { + "epoch": 0.57, + "grad_norm": 10.199321746826172, + "learning_rate": 2.141753904238888e-05, + "loss": 0.91, + "step": 3334 + }, + { + "epoch": 0.57, + "grad_norm": 9.858607292175293, + "learning_rate": 2.1414964818946286e-05, + "loss": 1.1586, + "step": 3335 + }, + { + "epoch": 0.57, + "grad_norm": 10.799261093139648, + "learning_rate": 2.1412390595503692e-05, + "loss": 1.061, + "step": 3336 + }, + { + "epoch": 0.57, + "grad_norm": 11.063508033752441, + "learning_rate": 2.1409816372061096e-05, + "loss": 0.9201, + "step": 3337 + }, + { + "epoch": 0.57, + "grad_norm": 10.47541618347168, + "learning_rate": 2.1407242148618502e-05, + "loss": 1.0478, + "step": 3338 + }, + { + "epoch": 0.57, + "grad_norm": 11.947546005249023, + "learning_rate": 2.1404667925175906e-05, + "loss": 1.1379, + "step": 3339 + }, + { + "epoch": 0.57, + "grad_norm": 9.848004341125488, + "learning_rate": 2.1402093701733312e-05, + "loss": 0.9509, + "step": 3340 + }, + { + "epoch": 0.57, + "grad_norm": 8.302692413330078, + "learning_rate": 2.1399519478290716e-05, + "loss": 0.8064, + "step": 3341 + }, + { + "epoch": 0.57, + "grad_norm": 9.407553672790527, + "learning_rate": 2.139694525484812e-05, + "loss": 0.9112, + "step": 3342 + }, + { + "epoch": 0.57, + "grad_norm": 8.795711517333984, + "learning_rate": 2.1394371031405526e-05, + "loss": 0.7802, + "step": 3343 + }, + { + "epoch": 0.57, + "grad_norm": 9.757826805114746, + "learning_rate": 2.1391796807962932e-05, + "loss": 1.0588, + "step": 3344 + }, + { + "epoch": 0.57, + "grad_norm": 10.820745468139648, + "learning_rate": 2.138922258452034e-05, + "loss": 0.8998, + "step": 3345 + }, + { + "epoch": 0.57, + "grad_norm": 11.344983100891113, + "learning_rate": 2.1386648361077742e-05, + "loss": 1.1015, + "step": 3346 + }, + { + "epoch": 0.57, + "grad_norm": 10.325364112854004, + "learning_rate": 2.138407413763515e-05, + "loss": 0.8903, + "step": 3347 + }, + { + "epoch": 0.57, + "grad_norm": 10.416016578674316, + "learning_rate": 2.1381499914192552e-05, + "loss": 1.2725, + "step": 3348 + }, + { + "epoch": 0.57, + "grad_norm": 8.414058685302734, + "learning_rate": 2.137892569074996e-05, + "loss": 0.9045, + "step": 3349 + }, + { + "epoch": 0.57, + "grad_norm": 9.113632202148438, + "learning_rate": 2.1376351467307362e-05, + "loss": 0.8869, + "step": 3350 + }, + { + "epoch": 0.58, + "grad_norm": 9.85736083984375, + "learning_rate": 2.1373777243864766e-05, + "loss": 0.9481, + "step": 3351 + }, + { + "epoch": 0.58, + "grad_norm": 9.1739501953125, + "learning_rate": 2.1371203020422172e-05, + "loss": 0.9657, + "step": 3352 + }, + { + "epoch": 0.58, + "grad_norm": 9.000965118408203, + "learning_rate": 2.1368628796979576e-05, + "loss": 0.771, + "step": 3353 + }, + { + "epoch": 0.58, + "grad_norm": 9.271477699279785, + "learning_rate": 2.1366054573536986e-05, + "loss": 0.936, + "step": 3354 + }, + { + "epoch": 0.58, + "grad_norm": 9.239330291748047, + "learning_rate": 2.136348035009439e-05, + "loss": 1.0011, + "step": 3355 + }, + { + "epoch": 0.58, + "grad_norm": 10.274884223937988, + "learning_rate": 2.1360906126651796e-05, + "loss": 1.1459, + "step": 3356 + }, + { + "epoch": 0.58, + "grad_norm": 8.972211837768555, + "learning_rate": 2.13583319032092e-05, + "loss": 0.8815, + "step": 3357 + }, + { + "epoch": 0.58, + "grad_norm": 10.910994529724121, + "learning_rate": 2.1355757679766606e-05, + "loss": 0.9931, + "step": 3358 + }, + { + "epoch": 0.58, + "grad_norm": 11.614021301269531, + "learning_rate": 2.135318345632401e-05, + "loss": 0.9974, + "step": 3359 + }, + { + "epoch": 0.58, + "grad_norm": 9.335184097290039, + "learning_rate": 2.1350609232881416e-05, + "loss": 1.1987, + "step": 3360 + }, + { + "epoch": 0.58, + "grad_norm": 7.170494556427002, + "learning_rate": 2.134803500943882e-05, + "loss": 0.8815, + "step": 3361 + }, + { + "epoch": 0.58, + "grad_norm": 7.56536340713501, + "learning_rate": 2.1345460785996222e-05, + "loss": 0.7037, + "step": 3362 + }, + { + "epoch": 0.58, + "grad_norm": 8.479547500610352, + "learning_rate": 2.1342886562553632e-05, + "loss": 0.9098, + "step": 3363 + }, + { + "epoch": 0.58, + "grad_norm": 10.641366004943848, + "learning_rate": 2.1340312339111035e-05, + "loss": 1.1193, + "step": 3364 + }, + { + "epoch": 0.58, + "grad_norm": 9.93000602722168, + "learning_rate": 2.1337738115668442e-05, + "loss": 0.8947, + "step": 3365 + }, + { + "epoch": 0.58, + "grad_norm": 7.783208847045898, + "learning_rate": 2.1335163892225845e-05, + "loss": 0.695, + "step": 3366 + }, + { + "epoch": 0.58, + "grad_norm": 10.242003440856934, + "learning_rate": 2.1332589668783252e-05, + "loss": 0.8784, + "step": 3367 + }, + { + "epoch": 0.58, + "grad_norm": 9.016057014465332, + "learning_rate": 2.1330015445340655e-05, + "loss": 0.7613, + "step": 3368 + }, + { + "epoch": 0.58, + "grad_norm": 8.449103355407715, + "learning_rate": 2.1327441221898062e-05, + "loss": 0.87, + "step": 3369 + }, + { + "epoch": 0.58, + "grad_norm": 10.125410079956055, + "learning_rate": 2.1324866998455465e-05, + "loss": 0.8774, + "step": 3370 + }, + { + "epoch": 0.58, + "grad_norm": 9.989790916442871, + "learning_rate": 2.1322292775012872e-05, + "loss": 1.0129, + "step": 3371 + }, + { + "epoch": 0.58, + "grad_norm": 8.345392227172852, + "learning_rate": 2.1319718551570275e-05, + "loss": 0.6481, + "step": 3372 + }, + { + "epoch": 0.58, + "grad_norm": 12.129225730895996, + "learning_rate": 2.1317144328127682e-05, + "loss": 0.9379, + "step": 3373 + }, + { + "epoch": 0.58, + "grad_norm": 9.49839973449707, + "learning_rate": 2.131457010468509e-05, + "loss": 0.805, + "step": 3374 + }, + { + "epoch": 0.58, + "grad_norm": 8.823487281799316, + "learning_rate": 2.1311995881242492e-05, + "loss": 0.8994, + "step": 3375 + }, + { + "epoch": 0.58, + "grad_norm": 9.875161170959473, + "learning_rate": 2.13094216577999e-05, + "loss": 1.0175, + "step": 3376 + }, + { + "epoch": 0.58, + "grad_norm": 8.919576644897461, + "learning_rate": 2.1306847434357302e-05, + "loss": 1.0558, + "step": 3377 + }, + { + "epoch": 0.58, + "grad_norm": 8.043708801269531, + "learning_rate": 2.130427321091471e-05, + "loss": 0.9635, + "step": 3378 + }, + { + "epoch": 0.58, + "grad_norm": 10.710693359375, + "learning_rate": 2.1301698987472112e-05, + "loss": 0.8354, + "step": 3379 + }, + { + "epoch": 0.58, + "grad_norm": 9.491646766662598, + "learning_rate": 2.129912476402952e-05, + "loss": 0.6101, + "step": 3380 + }, + { + "epoch": 0.58, + "grad_norm": 11.665020942687988, + "learning_rate": 2.1296550540586922e-05, + "loss": 1.1387, + "step": 3381 + }, + { + "epoch": 0.58, + "grad_norm": 10.968727111816406, + "learning_rate": 2.129397631714433e-05, + "loss": 0.8374, + "step": 3382 + }, + { + "epoch": 0.58, + "grad_norm": 9.36660385131836, + "learning_rate": 2.1291402093701735e-05, + "loss": 0.7927, + "step": 3383 + }, + { + "epoch": 0.58, + "grad_norm": 10.348801612854004, + "learning_rate": 2.128882787025914e-05, + "loss": 0.8111, + "step": 3384 + }, + { + "epoch": 0.58, + "grad_norm": 10.692757606506348, + "learning_rate": 2.1286253646816545e-05, + "loss": 0.7851, + "step": 3385 + }, + { + "epoch": 0.58, + "grad_norm": 10.30654525756836, + "learning_rate": 2.128367942337395e-05, + "loss": 0.8872, + "step": 3386 + }, + { + "epoch": 0.58, + "grad_norm": 11.270379066467285, + "learning_rate": 2.1281105199931355e-05, + "loss": 0.8711, + "step": 3387 + }, + { + "epoch": 0.58, + "grad_norm": 10.431641578674316, + "learning_rate": 2.127853097648876e-05, + "loss": 0.6263, + "step": 3388 + }, + { + "epoch": 0.58, + "grad_norm": 11.345833778381348, + "learning_rate": 2.1275956753046165e-05, + "loss": 0.8163, + "step": 3389 + }, + { + "epoch": 0.58, + "grad_norm": 10.337143898010254, + "learning_rate": 2.127338252960357e-05, + "loss": 0.8398, + "step": 3390 + }, + { + "epoch": 0.58, + "grad_norm": 11.857279777526855, + "learning_rate": 2.1270808306160975e-05, + "loss": 0.867, + "step": 3391 + }, + { + "epoch": 0.58, + "grad_norm": 9.896108627319336, + "learning_rate": 2.1268234082718382e-05, + "loss": 0.9562, + "step": 3392 + }, + { + "epoch": 0.58, + "grad_norm": 11.186572074890137, + "learning_rate": 2.1265659859275785e-05, + "loss": 0.8309, + "step": 3393 + }, + { + "epoch": 0.58, + "grad_norm": 9.086546897888184, + "learning_rate": 2.1263085635833192e-05, + "loss": 0.7872, + "step": 3394 + }, + { + "epoch": 0.58, + "grad_norm": 9.540240287780762, + "learning_rate": 2.1260511412390595e-05, + "loss": 0.7891, + "step": 3395 + }, + { + "epoch": 0.58, + "grad_norm": 11.109271049499512, + "learning_rate": 2.1257937188948002e-05, + "loss": 1.0625, + "step": 3396 + }, + { + "epoch": 0.58, + "grad_norm": 12.673296928405762, + "learning_rate": 2.1255362965505405e-05, + "loss": 1.0933, + "step": 3397 + }, + { + "epoch": 0.58, + "grad_norm": 9.194646835327148, + "learning_rate": 2.1252788742062812e-05, + "loss": 0.6912, + "step": 3398 + }, + { + "epoch": 0.58, + "grad_norm": 9.768902778625488, + "learning_rate": 2.1250214518620215e-05, + "loss": 0.8915, + "step": 3399 + }, + { + "epoch": 0.58, + "grad_norm": 10.473485946655273, + "learning_rate": 2.1247640295177622e-05, + "loss": 0.8354, + "step": 3400 + }, + { + "epoch": 0.58, + "grad_norm": 8.248024940490723, + "learning_rate": 2.124506607173503e-05, + "loss": 0.721, + "step": 3401 + }, + { + "epoch": 0.58, + "grad_norm": 12.484456062316895, + "learning_rate": 2.1242491848292435e-05, + "loss": 1.2405, + "step": 3402 + }, + { + "epoch": 0.58, + "grad_norm": 10.60517692565918, + "learning_rate": 2.123991762484984e-05, + "loss": 0.9914, + "step": 3403 + }, + { + "epoch": 0.58, + "grad_norm": 10.870101928710938, + "learning_rate": 2.1237343401407242e-05, + "loss": 1.0822, + "step": 3404 + }, + { + "epoch": 0.58, + "grad_norm": 10.580711364746094, + "learning_rate": 2.123476917796465e-05, + "loss": 0.874, + "step": 3405 + }, + { + "epoch": 0.58, + "grad_norm": 9.969808578491211, + "learning_rate": 2.1232194954522052e-05, + "loss": 0.9747, + "step": 3406 + }, + { + "epoch": 0.58, + "grad_norm": 9.010531425476074, + "learning_rate": 2.122962073107946e-05, + "loss": 0.8243, + "step": 3407 + }, + { + "epoch": 0.58, + "grad_norm": 10.558971405029297, + "learning_rate": 2.122704650763686e-05, + "loss": 0.8482, + "step": 3408 + }, + { + "epoch": 0.59, + "grad_norm": 10.554405212402344, + "learning_rate": 2.122447228419427e-05, + "loss": 0.8888, + "step": 3409 + }, + { + "epoch": 0.59, + "grad_norm": 10.682576179504395, + "learning_rate": 2.122189806075167e-05, + "loss": 1.2927, + "step": 3410 + }, + { + "epoch": 0.59, + "grad_norm": 10.798102378845215, + "learning_rate": 2.1219323837309082e-05, + "loss": 0.923, + "step": 3411 + }, + { + "epoch": 0.59, + "grad_norm": 10.334259986877441, + "learning_rate": 2.1216749613866485e-05, + "loss": 0.9114, + "step": 3412 + }, + { + "epoch": 0.59, + "grad_norm": 7.9736008644104, + "learning_rate": 2.121417539042389e-05, + "loss": 0.7468, + "step": 3413 + }, + { + "epoch": 0.59, + "grad_norm": 9.202890396118164, + "learning_rate": 2.1211601166981295e-05, + "loss": 0.9652, + "step": 3414 + }, + { + "epoch": 0.59, + "grad_norm": 7.913182258605957, + "learning_rate": 2.1209026943538698e-05, + "loss": 0.9204, + "step": 3415 + }, + { + "epoch": 0.59, + "grad_norm": 9.688068389892578, + "learning_rate": 2.1206452720096105e-05, + "loss": 0.8886, + "step": 3416 + }, + { + "epoch": 0.59, + "grad_norm": 8.632074356079102, + "learning_rate": 2.1203878496653508e-05, + "loss": 0.9347, + "step": 3417 + }, + { + "epoch": 0.59, + "grad_norm": 9.534497261047363, + "learning_rate": 2.1201304273210915e-05, + "loss": 1.0295, + "step": 3418 + }, + { + "epoch": 0.59, + "grad_norm": 9.882832527160645, + "learning_rate": 2.1198730049768318e-05, + "loss": 0.895, + "step": 3419 + }, + { + "epoch": 0.59, + "grad_norm": 8.662093162536621, + "learning_rate": 2.1196155826325728e-05, + "loss": 0.8651, + "step": 3420 + }, + { + "epoch": 0.59, + "grad_norm": 11.008186340332031, + "learning_rate": 2.119358160288313e-05, + "loss": 1.03, + "step": 3421 + }, + { + "epoch": 0.59, + "grad_norm": 10.827733993530273, + "learning_rate": 2.1191007379440538e-05, + "loss": 0.9715, + "step": 3422 + }, + { + "epoch": 0.59, + "grad_norm": 9.083715438842773, + "learning_rate": 2.118843315599794e-05, + "loss": 0.8469, + "step": 3423 + }, + { + "epoch": 0.59, + "grad_norm": 10.305743217468262, + "learning_rate": 2.1185858932555345e-05, + "loss": 0.8906, + "step": 3424 + }, + { + "epoch": 0.59, + "grad_norm": 10.867901802062988, + "learning_rate": 2.118328470911275e-05, + "loss": 0.9321, + "step": 3425 + }, + { + "epoch": 0.59, + "grad_norm": 12.041646957397461, + "learning_rate": 2.1180710485670155e-05, + "loss": 1.1969, + "step": 3426 + }, + { + "epoch": 0.59, + "grad_norm": 9.933911323547363, + "learning_rate": 2.117813626222756e-05, + "loss": 0.9315, + "step": 3427 + }, + { + "epoch": 0.59, + "grad_norm": 10.391996383666992, + "learning_rate": 2.1175562038784965e-05, + "loss": 0.8534, + "step": 3428 + }, + { + "epoch": 0.59, + "grad_norm": 9.745831489562988, + "learning_rate": 2.117298781534237e-05, + "loss": 0.8504, + "step": 3429 + }, + { + "epoch": 0.59, + "grad_norm": 10.904464721679688, + "learning_rate": 2.1170413591899778e-05, + "loss": 0.7972, + "step": 3430 + }, + { + "epoch": 0.59, + "grad_norm": 8.746960639953613, + "learning_rate": 2.1167839368457185e-05, + "loss": 0.9098, + "step": 3431 + }, + { + "epoch": 0.59, + "grad_norm": 9.606380462646484, + "learning_rate": 2.1165265145014588e-05, + "loss": 0.8763, + "step": 3432 + }, + { + "epoch": 0.59, + "grad_norm": 7.09662389755249, + "learning_rate": 2.1162690921571995e-05, + "loss": 0.8915, + "step": 3433 + }, + { + "epoch": 0.59, + "grad_norm": 7.510503768920898, + "learning_rate": 2.1160116698129398e-05, + "loss": 0.6151, + "step": 3434 + }, + { + "epoch": 0.59, + "grad_norm": 10.570076942443848, + "learning_rate": 2.11575424746868e-05, + "loss": 0.8435, + "step": 3435 + }, + { + "epoch": 0.59, + "grad_norm": 9.956568717956543, + "learning_rate": 2.1154968251244208e-05, + "loss": 0.9828, + "step": 3436 + }, + { + "epoch": 0.59, + "grad_norm": 10.706549644470215, + "learning_rate": 2.115239402780161e-05, + "loss": 0.912, + "step": 3437 + }, + { + "epoch": 0.59, + "grad_norm": 10.461691856384277, + "learning_rate": 2.1149819804359018e-05, + "loss": 0.7623, + "step": 3438 + }, + { + "epoch": 0.59, + "grad_norm": 10.87739372253418, + "learning_rate": 2.1147245580916425e-05, + "loss": 1.0354, + "step": 3439 + }, + { + "epoch": 0.59, + "grad_norm": 10.290151596069336, + "learning_rate": 2.114467135747383e-05, + "loss": 0.8056, + "step": 3440 + }, + { + "epoch": 0.59, + "grad_norm": 10.44156551361084, + "learning_rate": 2.1142097134031235e-05, + "loss": 0.7438, + "step": 3441 + }, + { + "epoch": 0.59, + "grad_norm": 10.8095121383667, + "learning_rate": 2.113952291058864e-05, + "loss": 1.038, + "step": 3442 + }, + { + "epoch": 0.59, + "grad_norm": 10.168571472167969, + "learning_rate": 2.1136948687146045e-05, + "loss": 0.6805, + "step": 3443 + }, + { + "epoch": 0.59, + "grad_norm": 10.736566543579102, + "learning_rate": 2.113437446370345e-05, + "loss": 0.7682, + "step": 3444 + }, + { + "epoch": 0.59, + "grad_norm": 8.09931755065918, + "learning_rate": 2.1131800240260855e-05, + "loss": 0.7133, + "step": 3445 + }, + { + "epoch": 0.59, + "grad_norm": 10.87078857421875, + "learning_rate": 2.1129226016818258e-05, + "loss": 0.9044, + "step": 3446 + }, + { + "epoch": 0.59, + "grad_norm": 10.737774848937988, + "learning_rate": 2.1126651793375665e-05, + "loss": 0.9536, + "step": 3447 + }, + { + "epoch": 0.59, + "grad_norm": 11.273669242858887, + "learning_rate": 2.112407756993307e-05, + "loss": 0.8887, + "step": 3448 + }, + { + "epoch": 0.59, + "grad_norm": 9.952434539794922, + "learning_rate": 2.1121503346490478e-05, + "loss": 0.9554, + "step": 3449 + }, + { + "epoch": 0.59, + "grad_norm": 9.560873985290527, + "learning_rate": 2.111892912304788e-05, + "loss": 0.959, + "step": 3450 + }, + { + "epoch": 0.59, + "grad_norm": 10.96397590637207, + "learning_rate": 2.1116354899605288e-05, + "loss": 0.917, + "step": 3451 + }, + { + "epoch": 0.59, + "grad_norm": 9.227838516235352, + "learning_rate": 2.111378067616269e-05, + "loss": 0.759, + "step": 3452 + }, + { + "epoch": 0.59, + "grad_norm": 10.357260704040527, + "learning_rate": 2.1111206452720098e-05, + "loss": 1.0566, + "step": 3453 + }, + { + "epoch": 0.59, + "grad_norm": 9.383551597595215, + "learning_rate": 2.11086322292775e-05, + "loss": 0.6032, + "step": 3454 + }, + { + "epoch": 0.59, + "grad_norm": 8.857072830200195, + "learning_rate": 2.1106058005834905e-05, + "loss": 0.8897, + "step": 3455 + }, + { + "epoch": 0.59, + "grad_norm": 10.75000286102295, + "learning_rate": 2.110348378239231e-05, + "loss": 0.9349, + "step": 3456 + }, + { + "epoch": 0.59, + "grad_norm": 9.371879577636719, + "learning_rate": 2.1100909558949715e-05, + "loss": 0.7341, + "step": 3457 + }, + { + "epoch": 0.59, + "grad_norm": 9.56074333190918, + "learning_rate": 2.1098335335507125e-05, + "loss": 0.7258, + "step": 3458 + }, + { + "epoch": 0.59, + "grad_norm": 9.784070014953613, + "learning_rate": 2.1095761112064528e-05, + "loss": 0.9596, + "step": 3459 + }, + { + "epoch": 0.59, + "grad_norm": 10.739653587341309, + "learning_rate": 2.1093186888621935e-05, + "loss": 0.7732, + "step": 3460 + }, + { + "epoch": 0.59, + "grad_norm": 9.197305679321289, + "learning_rate": 2.1090612665179338e-05, + "loss": 0.8965, + "step": 3461 + }, + { + "epoch": 0.59, + "grad_norm": 11.749576568603516, + "learning_rate": 2.1088038441736745e-05, + "loss": 0.822, + "step": 3462 + }, + { + "epoch": 0.59, + "grad_norm": 11.039815902709961, + "learning_rate": 2.1085464218294148e-05, + "loss": 1.2785, + "step": 3463 + }, + { + "epoch": 0.59, + "grad_norm": 8.522802352905273, + "learning_rate": 2.1082889994851555e-05, + "loss": 0.684, + "step": 3464 + }, + { + "epoch": 0.59, + "grad_norm": 11.451534271240234, + "learning_rate": 2.1080315771408958e-05, + "loss": 1.3313, + "step": 3465 + }, + { + "epoch": 0.59, + "grad_norm": 9.86819076538086, + "learning_rate": 2.107774154796636e-05, + "loss": 0.9297, + "step": 3466 + }, + { + "epoch": 0.59, + "grad_norm": 12.803569793701172, + "learning_rate": 2.107516732452377e-05, + "loss": 0.8183, + "step": 3467 + }, + { + "epoch": 0.6, + "grad_norm": 9.220539093017578, + "learning_rate": 2.1072593101081174e-05, + "loss": 0.7242, + "step": 3468 + }, + { + "epoch": 0.6, + "grad_norm": 11.610380172729492, + "learning_rate": 2.107001887763858e-05, + "loss": 0.933, + "step": 3469 + }, + { + "epoch": 0.6, + "grad_norm": 7.882061958312988, + "learning_rate": 2.1067444654195984e-05, + "loss": 0.562, + "step": 3470 + }, + { + "epoch": 0.6, + "grad_norm": 12.083072662353516, + "learning_rate": 2.106487043075339e-05, + "loss": 1.2563, + "step": 3471 + }, + { + "epoch": 0.6, + "grad_norm": 9.101675033569336, + "learning_rate": 2.1062296207310794e-05, + "loss": 1.0115, + "step": 3472 + }, + { + "epoch": 0.6, + "grad_norm": 9.753694534301758, + "learning_rate": 2.10597219838682e-05, + "loss": 0.8939, + "step": 3473 + }, + { + "epoch": 0.6, + "grad_norm": 10.481141090393066, + "learning_rate": 2.1057147760425604e-05, + "loss": 0.9582, + "step": 3474 + }, + { + "epoch": 0.6, + "grad_norm": 11.974809646606445, + "learning_rate": 2.105457353698301e-05, + "loss": 0.9938, + "step": 3475 + }, + { + "epoch": 0.6, + "grad_norm": 10.764436721801758, + "learning_rate": 2.1051999313540414e-05, + "loss": 1.0903, + "step": 3476 + }, + { + "epoch": 0.6, + "grad_norm": 9.915582656860352, + "learning_rate": 2.104942509009782e-05, + "loss": 0.9686, + "step": 3477 + }, + { + "epoch": 0.6, + "grad_norm": 10.737418174743652, + "learning_rate": 2.1046850866655228e-05, + "loss": 1.0, + "step": 3478 + }, + { + "epoch": 0.6, + "grad_norm": 11.196995735168457, + "learning_rate": 2.104427664321263e-05, + "loss": 0.808, + "step": 3479 + }, + { + "epoch": 0.6, + "grad_norm": 8.176024436950684, + "learning_rate": 2.1041702419770038e-05, + "loss": 0.8546, + "step": 3480 + }, + { + "epoch": 0.6, + "grad_norm": 8.725537300109863, + "learning_rate": 2.103912819632744e-05, + "loss": 1.122, + "step": 3481 + }, + { + "epoch": 0.6, + "grad_norm": 8.425695419311523, + "learning_rate": 2.1036553972884848e-05, + "loss": 0.6694, + "step": 3482 + }, + { + "epoch": 0.6, + "grad_norm": 8.48117733001709, + "learning_rate": 2.103397974944225e-05, + "loss": 0.8846, + "step": 3483 + }, + { + "epoch": 0.6, + "grad_norm": 10.184659004211426, + "learning_rate": 2.1031405525999658e-05, + "loss": 0.9249, + "step": 3484 + }, + { + "epoch": 0.6, + "grad_norm": 10.160643577575684, + "learning_rate": 2.102883130255706e-05, + "loss": 0.8291, + "step": 3485 + }, + { + "epoch": 0.6, + "grad_norm": 8.763010025024414, + "learning_rate": 2.1026257079114468e-05, + "loss": 0.9809, + "step": 3486 + }, + { + "epoch": 0.6, + "grad_norm": 8.668115615844727, + "learning_rate": 2.1023682855671874e-05, + "loss": 0.8612, + "step": 3487 + }, + { + "epoch": 0.6, + "grad_norm": 7.439923286437988, + "learning_rate": 2.1021108632229278e-05, + "loss": 0.9016, + "step": 3488 + }, + { + "epoch": 0.6, + "grad_norm": 9.359257698059082, + "learning_rate": 2.1018534408786684e-05, + "loss": 0.8972, + "step": 3489 + }, + { + "epoch": 0.6, + "grad_norm": 9.74252700805664, + "learning_rate": 2.1015960185344088e-05, + "loss": 1.0338, + "step": 3490 + }, + { + "epoch": 0.6, + "grad_norm": 7.9962005615234375, + "learning_rate": 2.1013385961901494e-05, + "loss": 0.6492, + "step": 3491 + }, + { + "epoch": 0.6, + "grad_norm": 10.262776374816895, + "learning_rate": 2.1010811738458898e-05, + "loss": 0.9953, + "step": 3492 + }, + { + "epoch": 0.6, + "grad_norm": 10.824647903442383, + "learning_rate": 2.1008237515016304e-05, + "loss": 1.1729, + "step": 3493 + }, + { + "epoch": 0.6, + "grad_norm": 12.100343704223633, + "learning_rate": 2.1005663291573708e-05, + "loss": 0.8705, + "step": 3494 + }, + { + "epoch": 0.6, + "grad_norm": 9.947909355163574, + "learning_rate": 2.1003089068131114e-05, + "loss": 0.7091, + "step": 3495 + }, + { + "epoch": 0.6, + "grad_norm": 8.15250015258789, + "learning_rate": 2.100051484468852e-05, + "loss": 0.8618, + "step": 3496 + }, + { + "epoch": 0.6, + "grad_norm": 9.630385398864746, + "learning_rate": 2.0997940621245924e-05, + "loss": 0.9365, + "step": 3497 + }, + { + "epoch": 0.6, + "grad_norm": 9.370593070983887, + "learning_rate": 2.099536639780333e-05, + "loss": 0.8769, + "step": 3498 + }, + { + "epoch": 0.6, + "grad_norm": 11.930132865905762, + "learning_rate": 2.0992792174360734e-05, + "loss": 1.1851, + "step": 3499 + }, + { + "epoch": 0.6, + "grad_norm": 10.988710403442383, + "learning_rate": 2.099021795091814e-05, + "loss": 1.0623, + "step": 3500 + }, + { + "epoch": 0.6, + "grad_norm": 9.411948204040527, + "learning_rate": 2.0987643727475544e-05, + "loss": 0.8566, + "step": 3501 + }, + { + "epoch": 0.6, + "grad_norm": 8.198237419128418, + "learning_rate": 2.098506950403295e-05, + "loss": 0.8863, + "step": 3502 + }, + { + "epoch": 0.6, + "grad_norm": 9.336523056030273, + "learning_rate": 2.0982495280590354e-05, + "loss": 0.9533, + "step": 3503 + }, + { + "epoch": 0.6, + "grad_norm": 12.431465148925781, + "learning_rate": 2.097992105714776e-05, + "loss": 1.0873, + "step": 3504 + }, + { + "epoch": 0.6, + "grad_norm": 10.55638313293457, + "learning_rate": 2.0977346833705167e-05, + "loss": 1.1952, + "step": 3505 + }, + { + "epoch": 0.6, + "grad_norm": 8.894783020019531, + "learning_rate": 2.0974772610262574e-05, + "loss": 0.7156, + "step": 3506 + }, + { + "epoch": 0.6, + "grad_norm": 10.867124557495117, + "learning_rate": 2.0972198386819977e-05, + "loss": 1.0386, + "step": 3507 + }, + { + "epoch": 0.6, + "grad_norm": 10.643655776977539, + "learning_rate": 2.096962416337738e-05, + "loss": 1.1146, + "step": 3508 + }, + { + "epoch": 0.6, + "grad_norm": 8.800186157226562, + "learning_rate": 2.0967049939934787e-05, + "loss": 1.0199, + "step": 3509 + }, + { + "epoch": 0.6, + "grad_norm": 9.355324745178223, + "learning_rate": 2.096447571649219e-05, + "loss": 0.9845, + "step": 3510 + }, + { + "epoch": 0.6, + "grad_norm": 9.809758186340332, + "learning_rate": 2.0961901493049597e-05, + "loss": 0.863, + "step": 3511 + }, + { + "epoch": 0.6, + "grad_norm": 8.43313217163086, + "learning_rate": 2.0959327269607e-05, + "loss": 0.8327, + "step": 3512 + }, + { + "epoch": 0.6, + "grad_norm": 10.357420921325684, + "learning_rate": 2.0956753046164407e-05, + "loss": 0.9441, + "step": 3513 + }, + { + "epoch": 0.6, + "grad_norm": 9.324047088623047, + "learning_rate": 2.095417882272181e-05, + "loss": 0.867, + "step": 3514 + }, + { + "epoch": 0.6, + "grad_norm": 10.328709602355957, + "learning_rate": 2.095160459927922e-05, + "loss": 1.003, + "step": 3515 + }, + { + "epoch": 0.6, + "grad_norm": 9.511649131774902, + "learning_rate": 2.0949030375836624e-05, + "loss": 1.1902, + "step": 3516 + }, + { + "epoch": 0.6, + "grad_norm": 9.947142601013184, + "learning_rate": 2.0946456152394027e-05, + "loss": 0.8807, + "step": 3517 + }, + { + "epoch": 0.6, + "grad_norm": 8.93046760559082, + "learning_rate": 2.0943881928951434e-05, + "loss": 1.0436, + "step": 3518 + }, + { + "epoch": 0.6, + "grad_norm": 8.775500297546387, + "learning_rate": 2.0941307705508837e-05, + "loss": 1.0016, + "step": 3519 + }, + { + "epoch": 0.6, + "grad_norm": 9.531303405761719, + "learning_rate": 2.0938733482066244e-05, + "loss": 0.966, + "step": 3520 + }, + { + "epoch": 0.6, + "grad_norm": 8.145430564880371, + "learning_rate": 2.0936159258623647e-05, + "loss": 0.7067, + "step": 3521 + }, + { + "epoch": 0.6, + "grad_norm": 8.787616729736328, + "learning_rate": 2.0933585035181054e-05, + "loss": 1.0376, + "step": 3522 + }, + { + "epoch": 0.6, + "grad_norm": 9.452864646911621, + "learning_rate": 2.0931010811738457e-05, + "loss": 1.0156, + "step": 3523 + }, + { + "epoch": 0.6, + "grad_norm": 10.645156860351562, + "learning_rate": 2.0928436588295867e-05, + "loss": 0.921, + "step": 3524 + }, + { + "epoch": 0.6, + "grad_norm": 9.26245403289795, + "learning_rate": 2.092586236485327e-05, + "loss": 0.9125, + "step": 3525 + }, + { + "epoch": 0.61, + "grad_norm": 8.762399673461914, + "learning_rate": 2.0923288141410677e-05, + "loss": 0.8004, + "step": 3526 + }, + { + "epoch": 0.61, + "grad_norm": 9.256109237670898, + "learning_rate": 2.092071391796808e-05, + "loss": 0.8267, + "step": 3527 + }, + { + "epoch": 0.61, + "grad_norm": 10.301924705505371, + "learning_rate": 2.0918139694525484e-05, + "loss": 0.9439, + "step": 3528 + }, + { + "epoch": 0.61, + "grad_norm": 10.991838455200195, + "learning_rate": 2.091556547108289e-05, + "loss": 1.0661, + "step": 3529 + }, + { + "epoch": 0.61, + "grad_norm": 9.099761009216309, + "learning_rate": 2.0912991247640294e-05, + "loss": 0.8532, + "step": 3530 + }, + { + "epoch": 0.61, + "grad_norm": 11.121550559997559, + "learning_rate": 2.09104170241977e-05, + "loss": 0.8051, + "step": 3531 + }, + { + "epoch": 0.61, + "grad_norm": 8.28807258605957, + "learning_rate": 2.0907842800755104e-05, + "loss": 0.7424, + "step": 3532 + }, + { + "epoch": 0.61, + "grad_norm": 8.281378746032715, + "learning_rate": 2.090526857731251e-05, + "loss": 0.7045, + "step": 3533 + }, + { + "epoch": 0.61, + "grad_norm": 10.212347030639648, + "learning_rate": 2.0902694353869917e-05, + "loss": 0.8286, + "step": 3534 + }, + { + "epoch": 0.61, + "grad_norm": 11.407116889953613, + "learning_rate": 2.0900120130427324e-05, + "loss": 1.0528, + "step": 3535 + }, + { + "epoch": 0.61, + "grad_norm": 12.708871841430664, + "learning_rate": 2.0897545906984727e-05, + "loss": 1.1231, + "step": 3536 + }, + { + "epoch": 0.61, + "grad_norm": 10.085511207580566, + "learning_rate": 2.0894971683542134e-05, + "loss": 0.7443, + "step": 3537 + }, + { + "epoch": 0.61, + "grad_norm": 11.9285888671875, + "learning_rate": 2.0892397460099537e-05, + "loss": 1.0158, + "step": 3538 + }, + { + "epoch": 0.61, + "grad_norm": 9.206541061401367, + "learning_rate": 2.088982323665694e-05, + "loss": 0.7936, + "step": 3539 + }, + { + "epoch": 0.61, + "grad_norm": 9.908160209655762, + "learning_rate": 2.0887249013214347e-05, + "loss": 0.6885, + "step": 3540 + }, + { + "epoch": 0.61, + "grad_norm": 12.243167877197266, + "learning_rate": 2.088467478977175e-05, + "loss": 1.0813, + "step": 3541 + }, + { + "epoch": 0.61, + "grad_norm": 12.120624542236328, + "learning_rate": 2.0882100566329157e-05, + "loss": 1.3071, + "step": 3542 + }, + { + "epoch": 0.61, + "grad_norm": 9.454063415527344, + "learning_rate": 2.0879526342886564e-05, + "loss": 1.0242, + "step": 3543 + }, + { + "epoch": 0.61, + "grad_norm": 10.565157890319824, + "learning_rate": 2.087695211944397e-05, + "loss": 1.0155, + "step": 3544 + }, + { + "epoch": 0.61, + "grad_norm": 9.01719856262207, + "learning_rate": 2.0874377896001374e-05, + "loss": 1.0074, + "step": 3545 + }, + { + "epoch": 0.61, + "grad_norm": 9.954242706298828, + "learning_rate": 2.087180367255878e-05, + "loss": 0.7199, + "step": 3546 + }, + { + "epoch": 0.61, + "grad_norm": 13.109999656677246, + "learning_rate": 2.0869229449116184e-05, + "loss": 0.736, + "step": 3547 + }, + { + "epoch": 0.61, + "grad_norm": 8.616226196289062, + "learning_rate": 2.086665522567359e-05, + "loss": 0.8655, + "step": 3548 + }, + { + "epoch": 0.61, + "grad_norm": 9.14283561706543, + "learning_rate": 2.0864081002230994e-05, + "loss": 0.7814, + "step": 3549 + }, + { + "epoch": 0.61, + "grad_norm": 10.235616683959961, + "learning_rate": 2.0861506778788397e-05, + "loss": 0.9135, + "step": 3550 + }, + { + "epoch": 0.61, + "grad_norm": 10.810868263244629, + "learning_rate": 2.0858932555345804e-05, + "loss": 1.1635, + "step": 3551 + }, + { + "epoch": 0.61, + "grad_norm": 8.180346488952637, + "learning_rate": 2.0856358331903207e-05, + "loss": 0.9643, + "step": 3552 + }, + { + "epoch": 0.61, + "grad_norm": 9.242701530456543, + "learning_rate": 2.0853784108460617e-05, + "loss": 0.6851, + "step": 3553 + }, + { + "epoch": 0.61, + "grad_norm": 8.4722261428833, + "learning_rate": 2.085120988501802e-05, + "loss": 0.9304, + "step": 3554 + }, + { + "epoch": 0.61, + "grad_norm": 9.554118156433105, + "learning_rate": 2.0848635661575427e-05, + "loss": 1.2391, + "step": 3555 + }, + { + "epoch": 0.61, + "grad_norm": 8.139974594116211, + "learning_rate": 2.084606143813283e-05, + "loss": 0.7742, + "step": 3556 + }, + { + "epoch": 0.61, + "grad_norm": 8.187524795532227, + "learning_rate": 2.0843487214690237e-05, + "loss": 0.818, + "step": 3557 + }, + { + "epoch": 0.61, + "grad_norm": 8.278107643127441, + "learning_rate": 2.084091299124764e-05, + "loss": 0.7832, + "step": 3558 + }, + { + "epoch": 0.61, + "grad_norm": 8.729706764221191, + "learning_rate": 2.0838338767805044e-05, + "loss": 0.8033, + "step": 3559 + }, + { + "epoch": 0.61, + "grad_norm": 9.261286735534668, + "learning_rate": 2.083576454436245e-05, + "loss": 1.0819, + "step": 3560 + }, + { + "epoch": 0.61, + "grad_norm": 8.894670486450195, + "learning_rate": 2.0833190320919854e-05, + "loss": 0.8773, + "step": 3561 + }, + { + "epoch": 0.61, + "grad_norm": 11.732855796813965, + "learning_rate": 2.0830616097477264e-05, + "loss": 1.083, + "step": 3562 + }, + { + "epoch": 0.61, + "grad_norm": 8.935416221618652, + "learning_rate": 2.0828041874034667e-05, + "loss": 0.823, + "step": 3563 + }, + { + "epoch": 0.61, + "grad_norm": 10.149697303771973, + "learning_rate": 2.0825467650592074e-05, + "loss": 1.0436, + "step": 3564 + }, + { + "epoch": 0.61, + "grad_norm": 8.783563613891602, + "learning_rate": 2.0822893427149477e-05, + "loss": 0.907, + "step": 3565 + }, + { + "epoch": 0.61, + "grad_norm": 10.004741668701172, + "learning_rate": 2.0820319203706884e-05, + "loss": 0.8219, + "step": 3566 + }, + { + "epoch": 0.61, + "grad_norm": 11.74236011505127, + "learning_rate": 2.0817744980264287e-05, + "loss": 1.1861, + "step": 3567 + }, + { + "epoch": 0.61, + "grad_norm": 10.484176635742188, + "learning_rate": 2.0815170756821694e-05, + "loss": 1.0975, + "step": 3568 + }, + { + "epoch": 0.61, + "grad_norm": 8.467572212219238, + "learning_rate": 2.0812596533379097e-05, + "loss": 0.7448, + "step": 3569 + }, + { + "epoch": 0.61, + "grad_norm": 11.28938102722168, + "learning_rate": 2.08100223099365e-05, + "loss": 1.1065, + "step": 3570 + }, + { + "epoch": 0.61, + "grad_norm": 11.573892593383789, + "learning_rate": 2.080744808649391e-05, + "loss": 1.1743, + "step": 3571 + }, + { + "epoch": 0.61, + "grad_norm": 8.535249710083008, + "learning_rate": 2.0804873863051313e-05, + "loss": 0.8175, + "step": 3572 + }, + { + "epoch": 0.61, + "grad_norm": 9.04575252532959, + "learning_rate": 2.080229963960872e-05, + "loss": 0.7422, + "step": 3573 + }, + { + "epoch": 0.61, + "grad_norm": 9.565536499023438, + "learning_rate": 2.0799725416166123e-05, + "loss": 0.9624, + "step": 3574 + }, + { + "epoch": 0.61, + "grad_norm": 8.618603706359863, + "learning_rate": 2.079715119272353e-05, + "loss": 0.9904, + "step": 3575 + }, + { + "epoch": 0.61, + "grad_norm": 10.872036933898926, + "learning_rate": 2.0794576969280933e-05, + "loss": 0.9718, + "step": 3576 + }, + { + "epoch": 0.61, + "grad_norm": 9.641082763671875, + "learning_rate": 2.079200274583834e-05, + "loss": 1.0681, + "step": 3577 + }, + { + "epoch": 0.61, + "grad_norm": 11.628007888793945, + "learning_rate": 2.0789428522395743e-05, + "loss": 1.004, + "step": 3578 + }, + { + "epoch": 0.61, + "grad_norm": 12.607502937316895, + "learning_rate": 2.078685429895315e-05, + "loss": 1.1708, + "step": 3579 + }, + { + "epoch": 0.61, + "grad_norm": 9.71158218383789, + "learning_rate": 2.0784280075510553e-05, + "loss": 0.9265, + "step": 3580 + }, + { + "epoch": 0.61, + "grad_norm": 8.65300464630127, + "learning_rate": 2.078170585206796e-05, + "loss": 0.6915, + "step": 3581 + }, + { + "epoch": 0.61, + "grad_norm": 9.42484188079834, + "learning_rate": 2.0779131628625367e-05, + "loss": 0.9448, + "step": 3582 + }, + { + "epoch": 0.61, + "grad_norm": 9.00444221496582, + "learning_rate": 2.077655740518277e-05, + "loss": 0.9024, + "step": 3583 + }, + { + "epoch": 0.62, + "grad_norm": 9.023063659667969, + "learning_rate": 2.0773983181740177e-05, + "loss": 0.8951, + "step": 3584 + }, + { + "epoch": 0.62, + "grad_norm": 11.056589126586914, + "learning_rate": 2.077140895829758e-05, + "loss": 0.6078, + "step": 3585 + }, + { + "epoch": 0.62, + "grad_norm": 10.09450912475586, + "learning_rate": 2.0768834734854987e-05, + "loss": 1.0143, + "step": 3586 + }, + { + "epoch": 0.62, + "grad_norm": 10.711925506591797, + "learning_rate": 2.076626051141239e-05, + "loss": 1.0569, + "step": 3587 + }, + { + "epoch": 0.62, + "grad_norm": 11.987104415893555, + "learning_rate": 2.0763686287969797e-05, + "loss": 0.9301, + "step": 3588 + }, + { + "epoch": 0.62, + "grad_norm": 10.281723022460938, + "learning_rate": 2.07611120645272e-05, + "loss": 0.8745, + "step": 3589 + }, + { + "epoch": 0.62, + "grad_norm": 11.499015808105469, + "learning_rate": 2.0758537841084607e-05, + "loss": 0.9749, + "step": 3590 + }, + { + "epoch": 0.62, + "grad_norm": 9.553611755371094, + "learning_rate": 2.0755963617642013e-05, + "loss": 0.8495, + "step": 3591 + }, + { + "epoch": 0.62, + "grad_norm": 11.892396926879883, + "learning_rate": 2.0753389394199417e-05, + "loss": 0.9487, + "step": 3592 + }, + { + "epoch": 0.62, + "grad_norm": 9.20673942565918, + "learning_rate": 2.0750815170756823e-05, + "loss": 0.7373, + "step": 3593 + }, + { + "epoch": 0.62, + "grad_norm": 10.146307945251465, + "learning_rate": 2.0748240947314227e-05, + "loss": 0.637, + "step": 3594 + }, + { + "epoch": 0.62, + "grad_norm": 8.793533325195312, + "learning_rate": 2.0745666723871633e-05, + "loss": 0.7771, + "step": 3595 + }, + { + "epoch": 0.62, + "grad_norm": 7.788816928863525, + "learning_rate": 2.0743092500429037e-05, + "loss": 0.7002, + "step": 3596 + }, + { + "epoch": 0.62, + "grad_norm": 9.618098258972168, + "learning_rate": 2.0740518276986443e-05, + "loss": 0.8879, + "step": 3597 + }, + { + "epoch": 0.62, + "grad_norm": 11.012236595153809, + "learning_rate": 2.0737944053543847e-05, + "loss": 0.8965, + "step": 3598 + }, + { + "epoch": 0.62, + "grad_norm": 11.48543643951416, + "learning_rate": 2.0735369830101253e-05, + "loss": 0.9293, + "step": 3599 + }, + { + "epoch": 0.62, + "grad_norm": 11.59442138671875, + "learning_rate": 2.073279560665866e-05, + "loss": 1.0291, + "step": 3600 + }, + { + "epoch": 0.62, + "grad_norm": 8.945700645446777, + "learning_rate": 2.0730221383216063e-05, + "loss": 1.0973, + "step": 3601 + }, + { + "epoch": 0.62, + "grad_norm": 9.3099365234375, + "learning_rate": 2.072764715977347e-05, + "loss": 0.5768, + "step": 3602 + }, + { + "epoch": 0.62, + "grad_norm": 10.856770515441895, + "learning_rate": 2.0725072936330873e-05, + "loss": 0.8684, + "step": 3603 + }, + { + "epoch": 0.62, + "grad_norm": 10.140678405761719, + "learning_rate": 2.072249871288828e-05, + "loss": 0.8551, + "step": 3604 + }, + { + "epoch": 0.62, + "grad_norm": 8.962964057922363, + "learning_rate": 2.0719924489445683e-05, + "loss": 0.8009, + "step": 3605 + }, + { + "epoch": 0.62, + "grad_norm": 10.85222053527832, + "learning_rate": 2.071735026600309e-05, + "loss": 1.1754, + "step": 3606 + }, + { + "epoch": 0.62, + "grad_norm": 8.708244323730469, + "learning_rate": 2.0714776042560493e-05, + "loss": 1.0253, + "step": 3607 + }, + { + "epoch": 0.62, + "grad_norm": 10.539422988891602, + "learning_rate": 2.07122018191179e-05, + "loss": 1.039, + "step": 3608 + }, + { + "epoch": 0.62, + "grad_norm": 11.20456314086914, + "learning_rate": 2.0709627595675306e-05, + "loss": 1.0313, + "step": 3609 + }, + { + "epoch": 0.62, + "grad_norm": 9.100445747375488, + "learning_rate": 2.0707053372232713e-05, + "loss": 0.7912, + "step": 3610 + }, + { + "epoch": 0.62, + "grad_norm": 11.814589500427246, + "learning_rate": 2.0704479148790116e-05, + "loss": 0.9934, + "step": 3611 + }, + { + "epoch": 0.62, + "grad_norm": 9.972722053527832, + "learning_rate": 2.070190492534752e-05, + "loss": 1.1131, + "step": 3612 + }, + { + "epoch": 0.62, + "grad_norm": 11.813591003417969, + "learning_rate": 2.0699330701904926e-05, + "loss": 0.8968, + "step": 3613 + }, + { + "epoch": 0.62, + "grad_norm": 8.923067092895508, + "learning_rate": 2.069675647846233e-05, + "loss": 0.9494, + "step": 3614 + }, + { + "epoch": 0.62, + "grad_norm": 7.696853160858154, + "learning_rate": 2.0694182255019736e-05, + "loss": 0.6798, + "step": 3615 + }, + { + "epoch": 0.62, + "grad_norm": 9.308184623718262, + "learning_rate": 2.069160803157714e-05, + "loss": 0.8866, + "step": 3616 + }, + { + "epoch": 0.62, + "grad_norm": 7.513751029968262, + "learning_rate": 2.0689033808134546e-05, + "loss": 1.0054, + "step": 3617 + }, + { + "epoch": 0.62, + "grad_norm": 10.617974281311035, + "learning_rate": 2.068645958469195e-05, + "loss": 0.9725, + "step": 3618 + }, + { + "epoch": 0.62, + "grad_norm": 7.4360198974609375, + "learning_rate": 2.068388536124936e-05, + "loss": 0.8346, + "step": 3619 + }, + { + "epoch": 0.62, + "grad_norm": 8.895474433898926, + "learning_rate": 2.0681311137806763e-05, + "loss": 0.883, + "step": 3620 + }, + { + "epoch": 0.62, + "grad_norm": 9.972665786743164, + "learning_rate": 2.0678736914364166e-05, + "loss": 0.9801, + "step": 3621 + }, + { + "epoch": 0.62, + "grad_norm": 8.942977905273438, + "learning_rate": 2.0676162690921573e-05, + "loss": 0.7673, + "step": 3622 + }, + { + "epoch": 0.62, + "grad_norm": 11.423605918884277, + "learning_rate": 2.0673588467478976e-05, + "loss": 0.9278, + "step": 3623 + }, + { + "epoch": 0.62, + "grad_norm": 8.923066139221191, + "learning_rate": 2.0671014244036383e-05, + "loss": 0.825, + "step": 3624 + }, + { + "epoch": 0.62, + "grad_norm": 11.148574829101562, + "learning_rate": 2.0668440020593786e-05, + "loss": 0.8397, + "step": 3625 + }, + { + "epoch": 0.62, + "grad_norm": 9.27463436126709, + "learning_rate": 2.0665865797151193e-05, + "loss": 0.7673, + "step": 3626 + }, + { + "epoch": 0.62, + "grad_norm": 10.153227806091309, + "learning_rate": 2.0663291573708596e-05, + "loss": 1.045, + "step": 3627 + }, + { + "epoch": 0.62, + "grad_norm": 10.231895446777344, + "learning_rate": 2.0660717350266006e-05, + "loss": 0.9806, + "step": 3628 + }, + { + "epoch": 0.62, + "grad_norm": 8.98068904876709, + "learning_rate": 2.065814312682341e-05, + "loss": 0.9754, + "step": 3629 + }, + { + "epoch": 0.62, + "grad_norm": 10.418566703796387, + "learning_rate": 2.0655568903380816e-05, + "loss": 0.8759, + "step": 3630 + }, + { + "epoch": 0.62, + "grad_norm": 10.500130653381348, + "learning_rate": 2.065299467993822e-05, + "loss": 0.9201, + "step": 3631 + }, + { + "epoch": 0.62, + "grad_norm": 10.282633781433105, + "learning_rate": 2.0650420456495623e-05, + "loss": 1.0528, + "step": 3632 + }, + { + "epoch": 0.62, + "grad_norm": 9.78018569946289, + "learning_rate": 2.064784623305303e-05, + "loss": 1.0521, + "step": 3633 + }, + { + "epoch": 0.62, + "grad_norm": 10.125130653381348, + "learning_rate": 2.0645272009610433e-05, + "loss": 0.7093, + "step": 3634 + }, + { + "epoch": 0.62, + "grad_norm": 9.378721237182617, + "learning_rate": 2.064269778616784e-05, + "loss": 1.1703, + "step": 3635 + }, + { + "epoch": 0.62, + "grad_norm": 11.150736808776855, + "learning_rate": 2.0640123562725243e-05, + "loss": 1.0198, + "step": 3636 + }, + { + "epoch": 0.62, + "grad_norm": 10.465408325195312, + "learning_rate": 2.063754933928265e-05, + "loss": 0.8568, + "step": 3637 + }, + { + "epoch": 0.62, + "grad_norm": 10.055120468139648, + "learning_rate": 2.0634975115840056e-05, + "loss": 0.7264, + "step": 3638 + }, + { + "epoch": 0.62, + "grad_norm": 8.949440002441406, + "learning_rate": 2.0632400892397463e-05, + "loss": 0.7122, + "step": 3639 + }, + { + "epoch": 0.62, + "grad_norm": 10.54444694519043, + "learning_rate": 2.0629826668954866e-05, + "loss": 0.8656, + "step": 3640 + }, + { + "epoch": 0.62, + "grad_norm": 12.26147174835205, + "learning_rate": 2.0627252445512273e-05, + "loss": 1.0589, + "step": 3641 + }, + { + "epoch": 0.63, + "grad_norm": 10.616227149963379, + "learning_rate": 2.0624678222069676e-05, + "loss": 1.0893, + "step": 3642 + }, + { + "epoch": 0.63, + "grad_norm": 10.967999458312988, + "learning_rate": 2.062210399862708e-05, + "loss": 0.8218, + "step": 3643 + }, + { + "epoch": 0.63, + "grad_norm": 10.495426177978516, + "learning_rate": 2.0619529775184486e-05, + "loss": 0.8297, + "step": 3644 + }, + { + "epoch": 0.63, + "grad_norm": 9.690934181213379, + "learning_rate": 2.061695555174189e-05, + "loss": 0.8979, + "step": 3645 + }, + { + "epoch": 0.63, + "grad_norm": 11.918691635131836, + "learning_rate": 2.0614381328299296e-05, + "loss": 1.068, + "step": 3646 + }, + { + "epoch": 0.63, + "grad_norm": 13.609515190124512, + "learning_rate": 2.0611807104856703e-05, + "loss": 1.0839, + "step": 3647 + }, + { + "epoch": 0.63, + "grad_norm": 9.08926010131836, + "learning_rate": 2.060923288141411e-05, + "loss": 0.6693, + "step": 3648 + }, + { + "epoch": 0.63, + "grad_norm": 10.391222953796387, + "learning_rate": 2.0606658657971513e-05, + "loss": 0.7525, + "step": 3649 + }, + { + "epoch": 0.63, + "grad_norm": 9.984003067016602, + "learning_rate": 2.060408443452892e-05, + "loss": 1.16, + "step": 3650 + }, + { + "epoch": 0.63, + "grad_norm": 11.724226951599121, + "learning_rate": 2.0601510211086323e-05, + "loss": 1.1775, + "step": 3651 + }, + { + "epoch": 0.63, + "grad_norm": 9.231865882873535, + "learning_rate": 2.059893598764373e-05, + "loss": 1.1622, + "step": 3652 + }, + { + "epoch": 0.63, + "grad_norm": 9.671733856201172, + "learning_rate": 2.0596361764201133e-05, + "loss": 0.8171, + "step": 3653 + }, + { + "epoch": 0.63, + "grad_norm": 9.980101585388184, + "learning_rate": 2.0593787540758536e-05, + "loss": 0.9338, + "step": 3654 + }, + { + "epoch": 0.63, + "grad_norm": 9.749847412109375, + "learning_rate": 2.0591213317315943e-05, + "loss": 0.8241, + "step": 3655 + }, + { + "epoch": 0.63, + "grad_norm": 10.234421730041504, + "learning_rate": 2.0588639093873346e-05, + "loss": 0.8507, + "step": 3656 + }, + { + "epoch": 0.63, + "grad_norm": 10.617901802062988, + "learning_rate": 2.0586064870430756e-05, + "loss": 1.1041, + "step": 3657 + }, + { + "epoch": 0.63, + "grad_norm": 9.370726585388184, + "learning_rate": 2.058349064698816e-05, + "loss": 1.0995, + "step": 3658 + }, + { + "epoch": 0.63, + "grad_norm": 11.447821617126465, + "learning_rate": 2.0580916423545566e-05, + "loss": 0.6795, + "step": 3659 + }, + { + "epoch": 0.63, + "grad_norm": 9.15230941772461, + "learning_rate": 2.057834220010297e-05, + "loss": 0.849, + "step": 3660 + }, + { + "epoch": 0.63, + "grad_norm": 9.862015724182129, + "learning_rate": 2.0575767976660376e-05, + "loss": 0.9088, + "step": 3661 + }, + { + "epoch": 0.63, + "grad_norm": 9.513533592224121, + "learning_rate": 2.057319375321778e-05, + "loss": 0.8844, + "step": 3662 + }, + { + "epoch": 0.63, + "grad_norm": 11.859278678894043, + "learning_rate": 2.0570619529775183e-05, + "loss": 1.1208, + "step": 3663 + }, + { + "epoch": 0.63, + "grad_norm": 9.01164722442627, + "learning_rate": 2.056804530633259e-05, + "loss": 0.8136, + "step": 3664 + }, + { + "epoch": 0.63, + "grad_norm": 9.088756561279297, + "learning_rate": 2.0565471082889993e-05, + "loss": 0.9166, + "step": 3665 + }, + { + "epoch": 0.63, + "grad_norm": 9.779842376708984, + "learning_rate": 2.0562896859447403e-05, + "loss": 0.8252, + "step": 3666 + }, + { + "epoch": 0.63, + "grad_norm": 9.122698783874512, + "learning_rate": 2.0560322636004806e-05, + "loss": 1.1255, + "step": 3667 + }, + { + "epoch": 0.63, + "grad_norm": 9.587461471557617, + "learning_rate": 2.0557748412562213e-05, + "loss": 0.9872, + "step": 3668 + }, + { + "epoch": 0.63, + "grad_norm": 11.257893562316895, + "learning_rate": 2.0555174189119616e-05, + "loss": 0.9899, + "step": 3669 + }, + { + "epoch": 0.63, + "grad_norm": 9.810105323791504, + "learning_rate": 2.0552599965677023e-05, + "loss": 0.8676, + "step": 3670 + }, + { + "epoch": 0.63, + "grad_norm": 10.048532485961914, + "learning_rate": 2.0550025742234426e-05, + "loss": 0.8357, + "step": 3671 + }, + { + "epoch": 0.63, + "grad_norm": 9.374921798706055, + "learning_rate": 2.0547451518791833e-05, + "loss": 0.805, + "step": 3672 + }, + { + "epoch": 0.63, + "grad_norm": 9.545283317565918, + "learning_rate": 2.0544877295349236e-05, + "loss": 0.7126, + "step": 3673 + }, + { + "epoch": 0.63, + "grad_norm": 11.644498825073242, + "learning_rate": 2.054230307190664e-05, + "loss": 1.0556, + "step": 3674 + }, + { + "epoch": 0.63, + "grad_norm": 10.367122650146484, + "learning_rate": 2.053972884846405e-05, + "loss": 0.7204, + "step": 3675 + }, + { + "epoch": 0.63, + "grad_norm": 11.233128547668457, + "learning_rate": 2.0537154625021452e-05, + "loss": 0.9228, + "step": 3676 + }, + { + "epoch": 0.63, + "grad_norm": 10.225369453430176, + "learning_rate": 2.053458040157886e-05, + "loss": 1.1144, + "step": 3677 + }, + { + "epoch": 0.63, + "grad_norm": 10.940107345581055, + "learning_rate": 2.0532006178136262e-05, + "loss": 1.012, + "step": 3678 + }, + { + "epoch": 0.63, + "grad_norm": 8.932011604309082, + "learning_rate": 2.052943195469367e-05, + "loss": 0.7457, + "step": 3679 + }, + { + "epoch": 0.63, + "grad_norm": 11.4862060546875, + "learning_rate": 2.0526857731251072e-05, + "loss": 1.4854, + "step": 3680 + }, + { + "epoch": 0.63, + "grad_norm": 10.594715118408203, + "learning_rate": 2.052428350780848e-05, + "loss": 1.0158, + "step": 3681 + }, + { + "epoch": 0.63, + "grad_norm": 7.786680698394775, + "learning_rate": 2.0521709284365882e-05, + "loss": 0.6139, + "step": 3682 + }, + { + "epoch": 0.63, + "grad_norm": 8.549161911010742, + "learning_rate": 2.051913506092329e-05, + "loss": 0.7108, + "step": 3683 + }, + { + "epoch": 0.63, + "grad_norm": 11.391383171081543, + "learning_rate": 2.0516560837480692e-05, + "loss": 1.3806, + "step": 3684 + }, + { + "epoch": 0.63, + "grad_norm": 9.751867294311523, + "learning_rate": 2.05139866140381e-05, + "loss": 0.7695, + "step": 3685 + }, + { + "epoch": 0.63, + "grad_norm": 9.59591007232666, + "learning_rate": 2.0511412390595506e-05, + "loss": 1.2076, + "step": 3686 + }, + { + "epoch": 0.63, + "grad_norm": 7.624353885650635, + "learning_rate": 2.050883816715291e-05, + "loss": 0.9371, + "step": 3687 + }, + { + "epoch": 0.63, + "grad_norm": 9.224845886230469, + "learning_rate": 2.0506263943710316e-05, + "loss": 0.8086, + "step": 3688 + }, + { + "epoch": 0.63, + "grad_norm": 8.501087188720703, + "learning_rate": 2.050368972026772e-05, + "loss": 0.9888, + "step": 3689 + }, + { + "epoch": 0.63, + "grad_norm": 9.994588851928711, + "learning_rate": 2.0501115496825126e-05, + "loss": 1.1196, + "step": 3690 + }, + { + "epoch": 0.63, + "grad_norm": 9.628186225891113, + "learning_rate": 2.049854127338253e-05, + "loss": 0.8487, + "step": 3691 + }, + { + "epoch": 0.63, + "grad_norm": 10.290560722351074, + "learning_rate": 2.0495967049939936e-05, + "loss": 0.8929, + "step": 3692 + }, + { + "epoch": 0.63, + "grad_norm": 10.528678894042969, + "learning_rate": 2.049339282649734e-05, + "loss": 0.8967, + "step": 3693 + }, + { + "epoch": 0.63, + "grad_norm": 10.490278244018555, + "learning_rate": 2.0490818603054746e-05, + "loss": 0.8183, + "step": 3694 + }, + { + "epoch": 0.63, + "grad_norm": 10.31004810333252, + "learning_rate": 2.0488244379612152e-05, + "loss": 0.8496, + "step": 3695 + }, + { + "epoch": 0.63, + "grad_norm": 8.87453842163086, + "learning_rate": 2.0485670156169556e-05, + "loss": 0.6821, + "step": 3696 + }, + { + "epoch": 0.63, + "grad_norm": 12.041169166564941, + "learning_rate": 2.0483095932726962e-05, + "loss": 1.1253, + "step": 3697 + }, + { + "epoch": 0.63, + "grad_norm": 7.960317134857178, + "learning_rate": 2.0480521709284366e-05, + "loss": 0.7579, + "step": 3698 + }, + { + "epoch": 0.63, + "grad_norm": 10.513557434082031, + "learning_rate": 2.0477947485841772e-05, + "loss": 0.7965, + "step": 3699 + }, + { + "epoch": 0.63, + "grad_norm": 11.026494026184082, + "learning_rate": 2.0475373262399176e-05, + "loss": 0.6793, + "step": 3700 + }, + { + "epoch": 0.64, + "grad_norm": 9.285836219787598, + "learning_rate": 2.0472799038956582e-05, + "loss": 0.7486, + "step": 3701 + }, + { + "epoch": 0.64, + "grad_norm": 11.21915054321289, + "learning_rate": 2.0470224815513986e-05, + "loss": 0.8206, + "step": 3702 + }, + { + "epoch": 0.64, + "grad_norm": 10.627345085144043, + "learning_rate": 2.0467650592071392e-05, + "loss": 1.1697, + "step": 3703 + }, + { + "epoch": 0.64, + "grad_norm": 11.660696029663086, + "learning_rate": 2.04650763686288e-05, + "loss": 1.0654, + "step": 3704 + }, + { + "epoch": 0.64, + "grad_norm": 11.604650497436523, + "learning_rate": 2.0462502145186202e-05, + "loss": 0.9215, + "step": 3705 + }, + { + "epoch": 0.64, + "grad_norm": 8.867598533630371, + "learning_rate": 2.045992792174361e-05, + "loss": 0.854, + "step": 3706 + }, + { + "epoch": 0.64, + "grad_norm": 10.247719764709473, + "learning_rate": 2.0457353698301012e-05, + "loss": 0.8712, + "step": 3707 + }, + { + "epoch": 0.64, + "grad_norm": 10.443865776062012, + "learning_rate": 2.045477947485842e-05, + "loss": 0.9615, + "step": 3708 + }, + { + "epoch": 0.64, + "grad_norm": 9.16934871673584, + "learning_rate": 2.0452205251415822e-05, + "loss": 0.9865, + "step": 3709 + }, + { + "epoch": 0.64, + "grad_norm": 10.650999069213867, + "learning_rate": 2.044963102797323e-05, + "loss": 0.9038, + "step": 3710 + }, + { + "epoch": 0.64, + "grad_norm": 10.862171173095703, + "learning_rate": 2.0447056804530632e-05, + "loss": 0.8635, + "step": 3711 + }, + { + "epoch": 0.64, + "grad_norm": 8.971321105957031, + "learning_rate": 2.044448258108804e-05, + "loss": 0.7518, + "step": 3712 + }, + { + "epoch": 0.64, + "grad_norm": 8.484099388122559, + "learning_rate": 2.0441908357645445e-05, + "loss": 0.888, + "step": 3713 + }, + { + "epoch": 0.64, + "grad_norm": 9.152271270751953, + "learning_rate": 2.0439334134202852e-05, + "loss": 0.856, + "step": 3714 + }, + { + "epoch": 0.64, + "grad_norm": 8.827516555786133, + "learning_rate": 2.0436759910760255e-05, + "loss": 0.7075, + "step": 3715 + }, + { + "epoch": 0.64, + "grad_norm": 9.254996299743652, + "learning_rate": 2.043418568731766e-05, + "loss": 0.7263, + "step": 3716 + }, + { + "epoch": 0.64, + "grad_norm": 9.175752639770508, + "learning_rate": 2.0431611463875065e-05, + "loss": 0.9317, + "step": 3717 + }, + { + "epoch": 0.64, + "grad_norm": 9.29155158996582, + "learning_rate": 2.042903724043247e-05, + "loss": 0.7879, + "step": 3718 + }, + { + "epoch": 0.64, + "grad_norm": 11.067744255065918, + "learning_rate": 2.0426463016989875e-05, + "loss": 0.9092, + "step": 3719 + }, + { + "epoch": 0.64, + "grad_norm": 10.410298347473145, + "learning_rate": 2.042388879354728e-05, + "loss": 0.803, + "step": 3720 + }, + { + "epoch": 0.64, + "grad_norm": 9.558835983276367, + "learning_rate": 2.0421314570104685e-05, + "loss": 1.0914, + "step": 3721 + }, + { + "epoch": 0.64, + "grad_norm": 9.75089168548584, + "learning_rate": 2.041874034666209e-05, + "loss": 0.7516, + "step": 3722 + }, + { + "epoch": 0.64, + "grad_norm": 10.924942970275879, + "learning_rate": 2.04161661232195e-05, + "loss": 1.0185, + "step": 3723 + }, + { + "epoch": 0.64, + "grad_norm": 8.148600578308105, + "learning_rate": 2.0413591899776902e-05, + "loss": 0.6848, + "step": 3724 + }, + { + "epoch": 0.64, + "grad_norm": 8.697454452514648, + "learning_rate": 2.041101767633431e-05, + "loss": 0.7205, + "step": 3725 + }, + { + "epoch": 0.64, + "grad_norm": 11.940282821655273, + "learning_rate": 2.0408443452891712e-05, + "loss": 1.0038, + "step": 3726 + }, + { + "epoch": 0.64, + "grad_norm": 12.05156421661377, + "learning_rate": 2.0405869229449115e-05, + "loss": 0.7903, + "step": 3727 + }, + { + "epoch": 0.64, + "grad_norm": 9.532210350036621, + "learning_rate": 2.0403295006006522e-05, + "loss": 0.9322, + "step": 3728 + }, + { + "epoch": 0.64, + "grad_norm": 11.266775131225586, + "learning_rate": 2.0400720782563925e-05, + "loss": 1.1588, + "step": 3729 + }, + { + "epoch": 0.64, + "grad_norm": 11.365345001220703, + "learning_rate": 2.0398146559121332e-05, + "loss": 1.3292, + "step": 3730 + }, + { + "epoch": 0.64, + "grad_norm": 10.505094528198242, + "learning_rate": 2.0395572335678735e-05, + "loss": 0.9166, + "step": 3731 + }, + { + "epoch": 0.64, + "grad_norm": 8.55389404296875, + "learning_rate": 2.0392998112236145e-05, + "loss": 0.9683, + "step": 3732 + }, + { + "epoch": 0.64, + "grad_norm": 8.614384651184082, + "learning_rate": 2.039042388879355e-05, + "loss": 0.6841, + "step": 3733 + }, + { + "epoch": 0.64, + "grad_norm": 8.463835716247559, + "learning_rate": 2.0387849665350955e-05, + "loss": 0.7642, + "step": 3734 + }, + { + "epoch": 0.64, + "grad_norm": 11.269347190856934, + "learning_rate": 2.038527544190836e-05, + "loss": 1.3927, + "step": 3735 + }, + { + "epoch": 0.64, + "grad_norm": 10.42772388458252, + "learning_rate": 2.0382701218465762e-05, + "loss": 0.8991, + "step": 3736 + }, + { + "epoch": 0.64, + "grad_norm": 12.168792724609375, + "learning_rate": 2.038012699502317e-05, + "loss": 1.0906, + "step": 3737 + }, + { + "epoch": 0.64, + "grad_norm": 10.61593246459961, + "learning_rate": 2.0377552771580572e-05, + "loss": 0.9604, + "step": 3738 + }, + { + "epoch": 0.64, + "grad_norm": 8.775289535522461, + "learning_rate": 2.037497854813798e-05, + "loss": 0.7586, + "step": 3739 + }, + { + "epoch": 0.64, + "grad_norm": 11.232149124145508, + "learning_rate": 2.0372404324695382e-05, + "loss": 0.9341, + "step": 3740 + }, + { + "epoch": 0.64, + "grad_norm": 9.806316375732422, + "learning_rate": 2.036983010125279e-05, + "loss": 0.9023, + "step": 3741 + }, + { + "epoch": 0.64, + "grad_norm": 9.716303825378418, + "learning_rate": 2.0367255877810195e-05, + "loss": 0.764, + "step": 3742 + }, + { + "epoch": 0.64, + "grad_norm": 8.610280990600586, + "learning_rate": 2.0364681654367602e-05, + "loss": 0.7535, + "step": 3743 + }, + { + "epoch": 0.64, + "grad_norm": 9.894516944885254, + "learning_rate": 2.0362107430925005e-05, + "loss": 0.8253, + "step": 3744 + }, + { + "epoch": 0.64, + "grad_norm": 10.50018310546875, + "learning_rate": 2.0359533207482412e-05, + "loss": 0.7683, + "step": 3745 + }, + { + "epoch": 0.64, + "grad_norm": 9.97812557220459, + "learning_rate": 2.0356958984039815e-05, + "loss": 0.8469, + "step": 3746 + }, + { + "epoch": 0.64, + "grad_norm": 9.179431915283203, + "learning_rate": 2.035438476059722e-05, + "loss": 1.0417, + "step": 3747 + }, + { + "epoch": 0.64, + "grad_norm": 8.565556526184082, + "learning_rate": 2.0351810537154625e-05, + "loss": 0.9344, + "step": 3748 + }, + { + "epoch": 0.64, + "grad_norm": 11.892313957214355, + "learning_rate": 2.034923631371203e-05, + "loss": 0.8471, + "step": 3749 + }, + { + "epoch": 0.64, + "grad_norm": 9.469517707824707, + "learning_rate": 2.0346662090269435e-05, + "loss": 0.6998, + "step": 3750 + }, + { + "epoch": 0.64, + "grad_norm": 8.924263954162598, + "learning_rate": 2.0344087866826842e-05, + "loss": 0.9014, + "step": 3751 + }, + { + "epoch": 0.64, + "grad_norm": 8.163795471191406, + "learning_rate": 2.034151364338425e-05, + "loss": 0.724, + "step": 3752 + }, + { + "epoch": 0.64, + "grad_norm": 7.019913196563721, + "learning_rate": 2.0338939419941652e-05, + "loss": 0.7146, + "step": 3753 + }, + { + "epoch": 0.64, + "grad_norm": 8.92927074432373, + "learning_rate": 2.033636519649906e-05, + "loss": 0.7491, + "step": 3754 + }, + { + "epoch": 0.64, + "grad_norm": 10.876102447509766, + "learning_rate": 2.0333790973056462e-05, + "loss": 0.9295, + "step": 3755 + }, + { + "epoch": 0.64, + "grad_norm": 10.042825698852539, + "learning_rate": 2.033121674961387e-05, + "loss": 1.0661, + "step": 3756 + }, + { + "epoch": 0.64, + "grad_norm": 7.4817795753479, + "learning_rate": 2.0328642526171272e-05, + "loss": 0.7045, + "step": 3757 + }, + { + "epoch": 0.64, + "grad_norm": 11.642267227172852, + "learning_rate": 2.0326068302728675e-05, + "loss": 0.8738, + "step": 3758 + }, + { + "epoch": 0.65, + "grad_norm": 8.736170768737793, + "learning_rate": 2.032349407928608e-05, + "loss": 0.7063, + "step": 3759 + }, + { + "epoch": 0.65, + "grad_norm": 11.759143829345703, + "learning_rate": 2.0320919855843485e-05, + "loss": 0.884, + "step": 3760 + }, + { + "epoch": 0.65, + "grad_norm": 12.837376594543457, + "learning_rate": 2.0318345632400895e-05, + "loss": 1.0298, + "step": 3761 + }, + { + "epoch": 0.65, + "grad_norm": 10.604540824890137, + "learning_rate": 2.03157714089583e-05, + "loss": 0.8982, + "step": 3762 + }, + { + "epoch": 0.65, + "grad_norm": 11.09512710571289, + "learning_rate": 2.0313197185515705e-05, + "loss": 0.8747, + "step": 3763 + }, + { + "epoch": 0.65, + "grad_norm": 10.5975923538208, + "learning_rate": 2.031062296207311e-05, + "loss": 0.9215, + "step": 3764 + }, + { + "epoch": 0.65, + "grad_norm": 11.992774963378906, + "learning_rate": 2.0308048738630515e-05, + "loss": 1.066, + "step": 3765 + }, + { + "epoch": 0.65, + "grad_norm": 11.00985050201416, + "learning_rate": 2.0305474515187918e-05, + "loss": 1.0393, + "step": 3766 + }, + { + "epoch": 0.65, + "grad_norm": 8.537951469421387, + "learning_rate": 2.030290029174532e-05, + "loss": 0.8109, + "step": 3767 + }, + { + "epoch": 0.65, + "grad_norm": 9.143230438232422, + "learning_rate": 2.0300326068302728e-05, + "loss": 0.9042, + "step": 3768 + }, + { + "epoch": 0.65, + "grad_norm": 8.912277221679688, + "learning_rate": 2.029775184486013e-05, + "loss": 0.89, + "step": 3769 + }, + { + "epoch": 0.65, + "grad_norm": 9.441620826721191, + "learning_rate": 2.029517762141754e-05, + "loss": 0.7692, + "step": 3770 + }, + { + "epoch": 0.65, + "grad_norm": 8.406990051269531, + "learning_rate": 2.0292603397974945e-05, + "loss": 0.6933, + "step": 3771 + }, + { + "epoch": 0.65, + "grad_norm": 9.918211936950684, + "learning_rate": 2.029002917453235e-05, + "loss": 0.8019, + "step": 3772 + }, + { + "epoch": 0.65, + "grad_norm": 7.296481132507324, + "learning_rate": 2.0287454951089755e-05, + "loss": 0.7531, + "step": 3773 + }, + { + "epoch": 0.65, + "grad_norm": 10.972326278686523, + "learning_rate": 2.028488072764716e-05, + "loss": 0.9363, + "step": 3774 + }, + { + "epoch": 0.65, + "grad_norm": 7.636072158813477, + "learning_rate": 2.0282306504204565e-05, + "loss": 0.7106, + "step": 3775 + }, + { + "epoch": 0.65, + "grad_norm": 9.79617691040039, + "learning_rate": 2.027973228076197e-05, + "loss": 1.0762, + "step": 3776 + }, + { + "epoch": 0.65, + "grad_norm": 8.521615982055664, + "learning_rate": 2.0277158057319375e-05, + "loss": 0.6283, + "step": 3777 + }, + { + "epoch": 0.65, + "grad_norm": 9.016718864440918, + "learning_rate": 2.0274583833876778e-05, + "loss": 0.7252, + "step": 3778 + }, + { + "epoch": 0.65, + "grad_norm": 9.147590637207031, + "learning_rate": 2.0272009610434185e-05, + "loss": 0.7279, + "step": 3779 + }, + { + "epoch": 0.65, + "grad_norm": 11.012465476989746, + "learning_rate": 2.026943538699159e-05, + "loss": 0.9239, + "step": 3780 + }, + { + "epoch": 0.65, + "grad_norm": 12.803175926208496, + "learning_rate": 2.0266861163548998e-05, + "loss": 0.7739, + "step": 3781 + }, + { + "epoch": 0.65, + "grad_norm": 11.089409828186035, + "learning_rate": 2.02642869401064e-05, + "loss": 0.9459, + "step": 3782 + }, + { + "epoch": 0.65, + "grad_norm": 10.030314445495605, + "learning_rate": 2.0261712716663808e-05, + "loss": 1.0363, + "step": 3783 + }, + { + "epoch": 0.65, + "grad_norm": 8.39607048034668, + "learning_rate": 2.025913849322121e-05, + "loss": 0.7716, + "step": 3784 + }, + { + "epoch": 0.65, + "grad_norm": 10.431344032287598, + "learning_rate": 2.0256564269778618e-05, + "loss": 1.1317, + "step": 3785 + }, + { + "epoch": 0.65, + "grad_norm": 8.764358520507812, + "learning_rate": 2.025399004633602e-05, + "loss": 0.819, + "step": 3786 + }, + { + "epoch": 0.65, + "grad_norm": 9.26692008972168, + "learning_rate": 2.0251415822893428e-05, + "loss": 0.8458, + "step": 3787 + }, + { + "epoch": 0.65, + "grad_norm": 8.75743293762207, + "learning_rate": 2.024884159945083e-05, + "loss": 0.6654, + "step": 3788 + }, + { + "epoch": 0.65, + "grad_norm": 9.401911735534668, + "learning_rate": 2.0246267376008238e-05, + "loss": 0.7295, + "step": 3789 + }, + { + "epoch": 0.65, + "grad_norm": 9.62918758392334, + "learning_rate": 2.0243693152565645e-05, + "loss": 0.7644, + "step": 3790 + }, + { + "epoch": 0.65, + "grad_norm": 9.303372383117676, + "learning_rate": 2.0241118929123048e-05, + "loss": 0.8104, + "step": 3791 + }, + { + "epoch": 0.65, + "grad_norm": 9.16428279876709, + "learning_rate": 2.0238544705680455e-05, + "loss": 0.6437, + "step": 3792 + }, + { + "epoch": 0.65, + "grad_norm": 10.093573570251465, + "learning_rate": 2.0235970482237858e-05, + "loss": 0.8171, + "step": 3793 + }, + { + "epoch": 0.65, + "grad_norm": 9.73033618927002, + "learning_rate": 2.0233396258795265e-05, + "loss": 0.8519, + "step": 3794 + }, + { + "epoch": 0.65, + "grad_norm": 9.583580017089844, + "learning_rate": 2.0230822035352668e-05, + "loss": 0.8865, + "step": 3795 + }, + { + "epoch": 0.65, + "grad_norm": 11.221098899841309, + "learning_rate": 2.0228247811910075e-05, + "loss": 0.8021, + "step": 3796 + }, + { + "epoch": 0.65, + "grad_norm": 9.95055866241455, + "learning_rate": 2.0225673588467478e-05, + "loss": 0.6669, + "step": 3797 + }, + { + "epoch": 0.65, + "grad_norm": 9.826077461242676, + "learning_rate": 2.0223099365024885e-05, + "loss": 0.8876, + "step": 3798 + }, + { + "epoch": 0.65, + "grad_norm": 12.529410362243652, + "learning_rate": 2.022052514158229e-05, + "loss": 0.7734, + "step": 3799 + }, + { + "epoch": 0.65, + "grad_norm": 10.645349502563477, + "learning_rate": 2.0217950918139695e-05, + "loss": 0.9514, + "step": 3800 + }, + { + "epoch": 0.65, + "grad_norm": 10.191333770751953, + "learning_rate": 2.02153766946971e-05, + "loss": 0.9086, + "step": 3801 + }, + { + "epoch": 0.65, + "grad_norm": 8.085418701171875, + "learning_rate": 2.0212802471254505e-05, + "loss": 0.7162, + "step": 3802 + }, + { + "epoch": 0.65, + "grad_norm": 11.082696914672852, + "learning_rate": 2.021022824781191e-05, + "loss": 0.9013, + "step": 3803 + }, + { + "epoch": 0.65, + "grad_norm": 8.440038681030273, + "learning_rate": 2.0207654024369315e-05, + "loss": 0.9378, + "step": 3804 + }, + { + "epoch": 0.65, + "grad_norm": 8.477656364440918, + "learning_rate": 2.020507980092672e-05, + "loss": 0.9534, + "step": 3805 + }, + { + "epoch": 0.65, + "grad_norm": 10.915143013000488, + "learning_rate": 2.0202505577484125e-05, + "loss": 0.8992, + "step": 3806 + }, + { + "epoch": 0.65, + "grad_norm": 9.547405242919922, + "learning_rate": 2.019993135404153e-05, + "loss": 0.6859, + "step": 3807 + }, + { + "epoch": 0.65, + "grad_norm": 10.993768692016602, + "learning_rate": 2.0197357130598938e-05, + "loss": 0.9447, + "step": 3808 + }, + { + "epoch": 0.65, + "grad_norm": 9.867547035217285, + "learning_rate": 2.019478290715634e-05, + "loss": 0.7099, + "step": 3809 + }, + { + "epoch": 0.65, + "grad_norm": 10.236151695251465, + "learning_rate": 2.0192208683713748e-05, + "loss": 0.8131, + "step": 3810 + }, + { + "epoch": 0.65, + "grad_norm": 12.449265480041504, + "learning_rate": 2.018963446027115e-05, + "loss": 1.2849, + "step": 3811 + }, + { + "epoch": 0.65, + "grad_norm": 9.297179222106934, + "learning_rate": 2.0187060236828558e-05, + "loss": 0.8853, + "step": 3812 + }, + { + "epoch": 0.65, + "grad_norm": 11.278464317321777, + "learning_rate": 2.018448601338596e-05, + "loss": 1.0639, + "step": 3813 + }, + { + "epoch": 0.65, + "grad_norm": 11.30207633972168, + "learning_rate": 2.0181911789943368e-05, + "loss": 1.0393, + "step": 3814 + }, + { + "epoch": 0.65, + "grad_norm": 9.025157928466797, + "learning_rate": 2.017933756650077e-05, + "loss": 0.8197, + "step": 3815 + }, + { + "epoch": 0.65, + "grad_norm": 9.642430305480957, + "learning_rate": 2.0176763343058178e-05, + "loss": 0.7759, + "step": 3816 + }, + { + "epoch": 0.66, + "grad_norm": 8.751938819885254, + "learning_rate": 2.0174189119615585e-05, + "loss": 0.7446, + "step": 3817 + }, + { + "epoch": 0.66, + "grad_norm": 9.939157485961914, + "learning_rate": 2.017161489617299e-05, + "loss": 0.9189, + "step": 3818 + }, + { + "epoch": 0.66, + "grad_norm": 8.48426628112793, + "learning_rate": 2.0169040672730394e-05, + "loss": 1.0649, + "step": 3819 + }, + { + "epoch": 0.66, + "grad_norm": 9.230118751525879, + "learning_rate": 2.0166466449287798e-05, + "loss": 0.6961, + "step": 3820 + }, + { + "epoch": 0.66, + "grad_norm": 8.982062339782715, + "learning_rate": 2.0163892225845204e-05, + "loss": 0.7987, + "step": 3821 + }, + { + "epoch": 0.66, + "grad_norm": 8.723980903625488, + "learning_rate": 2.0161318002402608e-05, + "loss": 0.8645, + "step": 3822 + }, + { + "epoch": 0.66, + "grad_norm": 9.927157402038574, + "learning_rate": 2.0158743778960014e-05, + "loss": 0.9157, + "step": 3823 + }, + { + "epoch": 0.66, + "grad_norm": 9.691457748413086, + "learning_rate": 2.0156169555517418e-05, + "loss": 1.1289, + "step": 3824 + }, + { + "epoch": 0.66, + "grad_norm": 9.984818458557129, + "learning_rate": 2.0153595332074824e-05, + "loss": 0.9488, + "step": 3825 + }, + { + "epoch": 0.66, + "grad_norm": 9.21733570098877, + "learning_rate": 2.0151021108632228e-05, + "loss": 1.0839, + "step": 3826 + }, + { + "epoch": 0.66, + "grad_norm": 8.890050888061523, + "learning_rate": 2.0148446885189638e-05, + "loss": 1.0869, + "step": 3827 + }, + { + "epoch": 0.66, + "grad_norm": 10.107467651367188, + "learning_rate": 2.014587266174704e-05, + "loss": 0.9242, + "step": 3828 + }, + { + "epoch": 0.66, + "grad_norm": 8.55062198638916, + "learning_rate": 2.0143298438304448e-05, + "loss": 0.8801, + "step": 3829 + }, + { + "epoch": 0.66, + "grad_norm": 11.245901107788086, + "learning_rate": 2.014072421486185e-05, + "loss": 1.0323, + "step": 3830 + }, + { + "epoch": 0.66, + "grad_norm": 9.906715393066406, + "learning_rate": 2.0138149991419254e-05, + "loss": 0.8801, + "step": 3831 + }, + { + "epoch": 0.66, + "grad_norm": 9.886914253234863, + "learning_rate": 2.013557576797666e-05, + "loss": 1.0558, + "step": 3832 + }, + { + "epoch": 0.66, + "grad_norm": 10.154519081115723, + "learning_rate": 2.0133001544534064e-05, + "loss": 0.9072, + "step": 3833 + }, + { + "epoch": 0.66, + "grad_norm": 9.438261032104492, + "learning_rate": 2.013042732109147e-05, + "loss": 0.7267, + "step": 3834 + }, + { + "epoch": 0.66, + "grad_norm": 9.867873191833496, + "learning_rate": 2.0127853097648874e-05, + "loss": 1.0893, + "step": 3835 + }, + { + "epoch": 0.66, + "grad_norm": 10.160661697387695, + "learning_rate": 2.0125278874206284e-05, + "loss": 1.0172, + "step": 3836 + }, + { + "epoch": 0.66, + "grad_norm": 10.30505084991455, + "learning_rate": 2.0122704650763688e-05, + "loss": 0.9757, + "step": 3837 + }, + { + "epoch": 0.66, + "grad_norm": 10.593942642211914, + "learning_rate": 2.0120130427321094e-05, + "loss": 0.8271, + "step": 3838 + }, + { + "epoch": 0.66, + "grad_norm": 12.254300117492676, + "learning_rate": 2.0117556203878498e-05, + "loss": 1.0351, + "step": 3839 + }, + { + "epoch": 0.66, + "grad_norm": 9.483967781066895, + "learning_rate": 2.01149819804359e-05, + "loss": 0.8084, + "step": 3840 + }, + { + "epoch": 0.66, + "grad_norm": 10.560583114624023, + "learning_rate": 2.0112407756993308e-05, + "loss": 1.077, + "step": 3841 + }, + { + "epoch": 0.66, + "grad_norm": 10.414691925048828, + "learning_rate": 2.010983353355071e-05, + "loss": 0.9408, + "step": 3842 + }, + { + "epoch": 0.66, + "grad_norm": 9.46604061126709, + "learning_rate": 2.0107259310108118e-05, + "loss": 0.8097, + "step": 3843 + }, + { + "epoch": 0.66, + "grad_norm": 11.858001708984375, + "learning_rate": 2.010468508666552e-05, + "loss": 0.9786, + "step": 3844 + }, + { + "epoch": 0.66, + "grad_norm": 13.043100357055664, + "learning_rate": 2.0102110863222928e-05, + "loss": 1.1017, + "step": 3845 + }, + { + "epoch": 0.66, + "grad_norm": 10.875686645507812, + "learning_rate": 2.0099536639780334e-05, + "loss": 1.1055, + "step": 3846 + }, + { + "epoch": 0.66, + "grad_norm": 10.703948974609375, + "learning_rate": 2.009696241633774e-05, + "loss": 0.8754, + "step": 3847 + }, + { + "epoch": 0.66, + "grad_norm": 8.572092056274414, + "learning_rate": 2.0094388192895144e-05, + "loss": 0.7629, + "step": 3848 + }, + { + "epoch": 0.66, + "grad_norm": 8.499730110168457, + "learning_rate": 2.009181396945255e-05, + "loss": 0.7391, + "step": 3849 + }, + { + "epoch": 0.66, + "grad_norm": 14.371528625488281, + "learning_rate": 2.0089239746009954e-05, + "loss": 1.2184, + "step": 3850 + }, + { + "epoch": 0.66, + "grad_norm": 9.71910285949707, + "learning_rate": 2.0086665522567357e-05, + "loss": 0.9444, + "step": 3851 + }, + { + "epoch": 0.66, + "grad_norm": 9.443791389465332, + "learning_rate": 2.0084091299124764e-05, + "loss": 0.8411, + "step": 3852 + }, + { + "epoch": 0.66, + "grad_norm": 9.28685474395752, + "learning_rate": 2.0081517075682167e-05, + "loss": 1.0332, + "step": 3853 + }, + { + "epoch": 0.66, + "grad_norm": 9.836456298828125, + "learning_rate": 2.0078942852239574e-05, + "loss": 0.8048, + "step": 3854 + }, + { + "epoch": 0.66, + "grad_norm": 11.466194152832031, + "learning_rate": 2.007636862879698e-05, + "loss": 0.9095, + "step": 3855 + }, + { + "epoch": 0.66, + "grad_norm": 10.917708396911621, + "learning_rate": 2.0073794405354387e-05, + "loss": 1.3415, + "step": 3856 + }, + { + "epoch": 0.66, + "grad_norm": 8.729721069335938, + "learning_rate": 2.007122018191179e-05, + "loss": 1.0201, + "step": 3857 + }, + { + "epoch": 0.66, + "grad_norm": 10.793363571166992, + "learning_rate": 2.0068645958469197e-05, + "loss": 1.1044, + "step": 3858 + }, + { + "epoch": 0.66, + "grad_norm": 9.919968605041504, + "learning_rate": 2.00660717350266e-05, + "loss": 0.706, + "step": 3859 + }, + { + "epoch": 0.66, + "grad_norm": 8.555896759033203, + "learning_rate": 2.0063497511584007e-05, + "loss": 0.6672, + "step": 3860 + }, + { + "epoch": 0.66, + "grad_norm": 8.051361083984375, + "learning_rate": 2.006092328814141e-05, + "loss": 0.9201, + "step": 3861 + }, + { + "epoch": 0.66, + "grad_norm": 11.385688781738281, + "learning_rate": 2.0058349064698814e-05, + "loss": 0.9987, + "step": 3862 + }, + { + "epoch": 0.66, + "grad_norm": 10.015984535217285, + "learning_rate": 2.005577484125622e-05, + "loss": 0.9494, + "step": 3863 + }, + { + "epoch": 0.66, + "grad_norm": 10.236820220947266, + "learning_rate": 2.0053200617813624e-05, + "loss": 0.9098, + "step": 3864 + }, + { + "epoch": 0.66, + "grad_norm": 9.320958137512207, + "learning_rate": 2.0050626394371034e-05, + "loss": 0.8217, + "step": 3865 + }, + { + "epoch": 0.66, + "grad_norm": 9.640387535095215, + "learning_rate": 2.0048052170928437e-05, + "loss": 0.8337, + "step": 3866 + }, + { + "epoch": 0.66, + "grad_norm": 10.85649299621582, + "learning_rate": 2.0045477947485844e-05, + "loss": 0.8443, + "step": 3867 + }, + { + "epoch": 0.66, + "grad_norm": 8.217432975769043, + "learning_rate": 2.0042903724043247e-05, + "loss": 0.9053, + "step": 3868 + }, + { + "epoch": 0.66, + "grad_norm": 10.322803497314453, + "learning_rate": 2.0040329500600654e-05, + "loss": 0.9291, + "step": 3869 + }, + { + "epoch": 0.66, + "grad_norm": 9.638202667236328, + "learning_rate": 2.0037755277158057e-05, + "loss": 0.7854, + "step": 3870 + }, + { + "epoch": 0.66, + "grad_norm": 13.8377103805542, + "learning_rate": 2.003518105371546e-05, + "loss": 1.0823, + "step": 3871 + }, + { + "epoch": 0.66, + "grad_norm": 10.107596397399902, + "learning_rate": 2.0032606830272867e-05, + "loss": 0.817, + "step": 3872 + }, + { + "epoch": 0.66, + "grad_norm": 11.954251289367676, + "learning_rate": 2.003003260683027e-05, + "loss": 1.035, + "step": 3873 + }, + { + "epoch": 0.66, + "grad_norm": 9.055277824401855, + "learning_rate": 2.002745838338768e-05, + "loss": 0.629, + "step": 3874 + }, + { + "epoch": 0.67, + "grad_norm": 11.547800064086914, + "learning_rate": 2.0024884159945084e-05, + "loss": 0.864, + "step": 3875 + }, + { + "epoch": 0.67, + "grad_norm": 11.284165382385254, + "learning_rate": 2.002230993650249e-05, + "loss": 0.9818, + "step": 3876 + }, + { + "epoch": 0.67, + "grad_norm": 9.371208190917969, + "learning_rate": 2.0019735713059894e-05, + "loss": 0.883, + "step": 3877 + }, + { + "epoch": 0.67, + "grad_norm": 10.45896053314209, + "learning_rate": 2.00171614896173e-05, + "loss": 0.7917, + "step": 3878 + }, + { + "epoch": 0.67, + "grad_norm": 9.150925636291504, + "learning_rate": 2.0014587266174704e-05, + "loss": 0.8472, + "step": 3879 + }, + { + "epoch": 0.67, + "grad_norm": 8.101434707641602, + "learning_rate": 2.001201304273211e-05, + "loss": 0.6148, + "step": 3880 + }, + { + "epoch": 0.67, + "grad_norm": 8.535784721374512, + "learning_rate": 2.0009438819289514e-05, + "loss": 0.8225, + "step": 3881 + }, + { + "epoch": 0.67, + "grad_norm": 9.274863243103027, + "learning_rate": 2.0006864595846917e-05, + "loss": 0.6738, + "step": 3882 + }, + { + "epoch": 0.67, + "grad_norm": 12.1603422164917, + "learning_rate": 2.0004290372404324e-05, + "loss": 0.9445, + "step": 3883 + }, + { + "epoch": 0.67, + "grad_norm": 8.697280883789062, + "learning_rate": 2.000171614896173e-05, + "loss": 0.7578, + "step": 3884 + }, + { + "epoch": 0.67, + "grad_norm": 10.526755332946777, + "learning_rate": 1.9999141925519137e-05, + "loss": 0.8279, + "step": 3885 + }, + { + "epoch": 0.67, + "grad_norm": 12.127225875854492, + "learning_rate": 1.999656770207654e-05, + "loss": 1.3112, + "step": 3886 + }, + { + "epoch": 0.67, + "grad_norm": 10.579535484313965, + "learning_rate": 1.9993993478633947e-05, + "loss": 0.9747, + "step": 3887 + }, + { + "epoch": 0.67, + "grad_norm": 7.592826843261719, + "learning_rate": 1.999141925519135e-05, + "loss": 0.7073, + "step": 3888 + }, + { + "epoch": 0.67, + "grad_norm": 8.447958946228027, + "learning_rate": 1.9988845031748757e-05, + "loss": 0.6782, + "step": 3889 + }, + { + "epoch": 0.67, + "grad_norm": 10.861957550048828, + "learning_rate": 1.998627080830616e-05, + "loss": 0.908, + "step": 3890 + }, + { + "epoch": 0.67, + "grad_norm": 8.820218086242676, + "learning_rate": 1.9983696584863567e-05, + "loss": 0.8226, + "step": 3891 + }, + { + "epoch": 0.67, + "grad_norm": 9.044750213623047, + "learning_rate": 1.998112236142097e-05, + "loss": 0.7616, + "step": 3892 + }, + { + "epoch": 0.67, + "grad_norm": 8.752326011657715, + "learning_rate": 1.9978548137978377e-05, + "loss": 0.6696, + "step": 3893 + }, + { + "epoch": 0.67, + "grad_norm": 9.505807876586914, + "learning_rate": 1.9975973914535784e-05, + "loss": 0.916, + "step": 3894 + }, + { + "epoch": 0.67, + "grad_norm": 10.49383544921875, + "learning_rate": 1.9973399691093187e-05, + "loss": 0.8205, + "step": 3895 + }, + { + "epoch": 0.67, + "grad_norm": 11.455961227416992, + "learning_rate": 1.9970825467650594e-05, + "loss": 0.7455, + "step": 3896 + }, + { + "epoch": 0.67, + "grad_norm": 10.989890098571777, + "learning_rate": 1.9968251244207997e-05, + "loss": 0.7624, + "step": 3897 + }, + { + "epoch": 0.67, + "grad_norm": 9.868483543395996, + "learning_rate": 1.9965677020765404e-05, + "loss": 1.0551, + "step": 3898 + }, + { + "epoch": 0.67, + "grad_norm": 9.663898468017578, + "learning_rate": 1.9963102797322807e-05, + "loss": 0.7522, + "step": 3899 + }, + { + "epoch": 0.67, + "grad_norm": 9.32986068725586, + "learning_rate": 1.9960528573880214e-05, + "loss": 0.6805, + "step": 3900 + }, + { + "epoch": 0.67, + "grad_norm": 10.079046249389648, + "learning_rate": 1.9957954350437617e-05, + "loss": 0.9041, + "step": 3901 + }, + { + "epoch": 0.67, + "grad_norm": 9.203667640686035, + "learning_rate": 1.9955380126995024e-05, + "loss": 0.8628, + "step": 3902 + }, + { + "epoch": 0.67, + "grad_norm": 11.46178913116455, + "learning_rate": 1.995280590355243e-05, + "loss": 1.0608, + "step": 3903 + }, + { + "epoch": 0.67, + "grad_norm": 10.199135780334473, + "learning_rate": 1.9950231680109834e-05, + "loss": 0.7819, + "step": 3904 + }, + { + "epoch": 0.67, + "grad_norm": 7.815032005310059, + "learning_rate": 1.994765745666724e-05, + "loss": 0.8539, + "step": 3905 + }, + { + "epoch": 0.67, + "grad_norm": 11.437705039978027, + "learning_rate": 1.9945083233224644e-05, + "loss": 0.8162, + "step": 3906 + }, + { + "epoch": 0.67, + "grad_norm": 8.97113037109375, + "learning_rate": 1.994250900978205e-05, + "loss": 0.7135, + "step": 3907 + }, + { + "epoch": 0.67, + "grad_norm": 9.980389595031738, + "learning_rate": 1.9939934786339454e-05, + "loss": 0.9366, + "step": 3908 + }, + { + "epoch": 0.67, + "grad_norm": 10.487383842468262, + "learning_rate": 1.993736056289686e-05, + "loss": 0.7006, + "step": 3909 + }, + { + "epoch": 0.67, + "grad_norm": 10.992122650146484, + "learning_rate": 1.9934786339454264e-05, + "loss": 0.7282, + "step": 3910 + }, + { + "epoch": 0.67, + "grad_norm": 8.490836143493652, + "learning_rate": 1.993221211601167e-05, + "loss": 0.7352, + "step": 3911 + }, + { + "epoch": 0.67, + "grad_norm": 10.785042762756348, + "learning_rate": 1.9929637892569077e-05, + "loss": 1.1043, + "step": 3912 + }, + { + "epoch": 0.67, + "grad_norm": 12.121511459350586, + "learning_rate": 1.992706366912648e-05, + "loss": 1.0134, + "step": 3913 + }, + { + "epoch": 0.67, + "grad_norm": 10.977069854736328, + "learning_rate": 1.9924489445683887e-05, + "loss": 0.8898, + "step": 3914 + }, + { + "epoch": 0.67, + "grad_norm": 9.827605247497559, + "learning_rate": 1.992191522224129e-05, + "loss": 0.7977, + "step": 3915 + }, + { + "epoch": 0.67, + "grad_norm": 11.312994956970215, + "learning_rate": 1.9919340998798697e-05, + "loss": 0.8871, + "step": 3916 + }, + { + "epoch": 0.67, + "grad_norm": 11.360579490661621, + "learning_rate": 1.99167667753561e-05, + "loss": 0.7374, + "step": 3917 + }, + { + "epoch": 0.67, + "grad_norm": 9.107073783874512, + "learning_rate": 1.9914192551913507e-05, + "loss": 0.7401, + "step": 3918 + }, + { + "epoch": 0.67, + "grad_norm": 10.83326530456543, + "learning_rate": 1.991161832847091e-05, + "loss": 0.8274, + "step": 3919 + }, + { + "epoch": 0.67, + "grad_norm": 9.00495719909668, + "learning_rate": 1.9909044105028317e-05, + "loss": 0.8952, + "step": 3920 + }, + { + "epoch": 0.67, + "grad_norm": 9.67477798461914, + "learning_rate": 1.9906469881585724e-05, + "loss": 0.873, + "step": 3921 + }, + { + "epoch": 0.67, + "grad_norm": 8.246710777282715, + "learning_rate": 1.990389565814313e-05, + "loss": 0.6519, + "step": 3922 + }, + { + "epoch": 0.67, + "grad_norm": 9.416906356811523, + "learning_rate": 1.9901321434700533e-05, + "loss": 0.8081, + "step": 3923 + }, + { + "epoch": 0.67, + "grad_norm": 9.19828987121582, + "learning_rate": 1.9898747211257937e-05, + "loss": 0.9664, + "step": 3924 + }, + { + "epoch": 0.67, + "grad_norm": 10.778648376464844, + "learning_rate": 1.9896172987815343e-05, + "loss": 0.9061, + "step": 3925 + }, + { + "epoch": 0.67, + "grad_norm": 9.403685569763184, + "learning_rate": 1.9893598764372747e-05, + "loss": 0.8616, + "step": 3926 + }, + { + "epoch": 0.67, + "grad_norm": 11.282853126525879, + "learning_rate": 1.9891024540930153e-05, + "loss": 1.2102, + "step": 3927 + }, + { + "epoch": 0.67, + "grad_norm": 9.931404113769531, + "learning_rate": 1.9888450317487557e-05, + "loss": 0.8525, + "step": 3928 + }, + { + "epoch": 0.67, + "grad_norm": 10.738006591796875, + "learning_rate": 1.9885876094044963e-05, + "loss": 0.9559, + "step": 3929 + }, + { + "epoch": 0.67, + "grad_norm": 12.042181015014648, + "learning_rate": 1.9883301870602367e-05, + "loss": 0.9463, + "step": 3930 + }, + { + "epoch": 0.67, + "grad_norm": 8.851289749145508, + "learning_rate": 1.9880727647159777e-05, + "loss": 0.7099, + "step": 3931 + }, + { + "epoch": 0.67, + "grad_norm": 7.815483093261719, + "learning_rate": 1.987815342371718e-05, + "loss": 0.5558, + "step": 3932 + }, + { + "epoch": 0.67, + "grad_norm": 9.740373611450195, + "learning_rate": 1.9875579200274587e-05, + "loss": 1.0212, + "step": 3933 + }, + { + "epoch": 0.68, + "grad_norm": 9.694733619689941, + "learning_rate": 1.987300497683199e-05, + "loss": 1.0334, + "step": 3934 + }, + { + "epoch": 0.68, + "grad_norm": 9.542588233947754, + "learning_rate": 1.9870430753389393e-05, + "loss": 0.6258, + "step": 3935 + }, + { + "epoch": 0.68, + "grad_norm": 9.80351448059082, + "learning_rate": 1.98678565299468e-05, + "loss": 0.7614, + "step": 3936 + }, + { + "epoch": 0.68, + "grad_norm": 7.947404861450195, + "learning_rate": 1.9865282306504203e-05, + "loss": 0.6481, + "step": 3937 + }, + { + "epoch": 0.68, + "grad_norm": 9.279672622680664, + "learning_rate": 1.986270808306161e-05, + "loss": 0.9361, + "step": 3938 + }, + { + "epoch": 0.68, + "grad_norm": 9.397329330444336, + "learning_rate": 1.9860133859619013e-05, + "loss": 0.6903, + "step": 3939 + }, + { + "epoch": 0.68, + "grad_norm": 10.429089546203613, + "learning_rate": 1.9857559636176423e-05, + "loss": 1.1352, + "step": 3940 + }, + { + "epoch": 0.68, + "grad_norm": 10.662603378295898, + "learning_rate": 1.9854985412733827e-05, + "loss": 0.9475, + "step": 3941 + }, + { + "epoch": 0.68, + "grad_norm": 11.281474113464355, + "learning_rate": 1.9852411189291233e-05, + "loss": 1.1532, + "step": 3942 + }, + { + "epoch": 0.68, + "grad_norm": 9.530838966369629, + "learning_rate": 1.9849836965848637e-05, + "loss": 0.9931, + "step": 3943 + }, + { + "epoch": 0.68, + "grad_norm": 10.199763298034668, + "learning_rate": 1.984726274240604e-05, + "loss": 0.7225, + "step": 3944 + }, + { + "epoch": 0.68, + "grad_norm": 9.985010147094727, + "learning_rate": 1.9844688518963447e-05, + "loss": 0.6305, + "step": 3945 + }, + { + "epoch": 0.68, + "grad_norm": 8.202077865600586, + "learning_rate": 1.984211429552085e-05, + "loss": 1.0916, + "step": 3946 + }, + { + "epoch": 0.68, + "grad_norm": 9.160751342773438, + "learning_rate": 1.9839540072078257e-05, + "loss": 0.9651, + "step": 3947 + }, + { + "epoch": 0.68, + "grad_norm": 8.579391479492188, + "learning_rate": 1.983696584863566e-05, + "loss": 0.8875, + "step": 3948 + }, + { + "epoch": 0.68, + "grad_norm": 8.264220237731934, + "learning_rate": 1.9834391625193067e-05, + "loss": 0.7546, + "step": 3949 + }, + { + "epoch": 0.68, + "grad_norm": 12.203835487365723, + "learning_rate": 1.9831817401750473e-05, + "loss": 0.8868, + "step": 3950 + }, + { + "epoch": 0.68, + "grad_norm": 10.139389991760254, + "learning_rate": 1.982924317830788e-05, + "loss": 0.7785, + "step": 3951 + }, + { + "epoch": 0.68, + "grad_norm": 9.022866249084473, + "learning_rate": 1.9826668954865283e-05, + "loss": 0.7928, + "step": 3952 + }, + { + "epoch": 0.68, + "grad_norm": 9.815255165100098, + "learning_rate": 1.982409473142269e-05, + "loss": 0.9878, + "step": 3953 + }, + { + "epoch": 0.68, + "grad_norm": 13.767159461975098, + "learning_rate": 1.9821520507980093e-05, + "loss": 1.0266, + "step": 3954 + }, + { + "epoch": 0.68, + "grad_norm": 9.081512451171875, + "learning_rate": 1.9818946284537496e-05, + "loss": 0.6789, + "step": 3955 + }, + { + "epoch": 0.68, + "grad_norm": 9.766496658325195, + "learning_rate": 1.9816372061094903e-05, + "loss": 0.7038, + "step": 3956 + }, + { + "epoch": 0.68, + "grad_norm": 9.34941577911377, + "learning_rate": 1.9813797837652306e-05, + "loss": 0.6806, + "step": 3957 + }, + { + "epoch": 0.68, + "grad_norm": 10.309305191040039, + "learning_rate": 1.9811223614209713e-05, + "loss": 0.8272, + "step": 3958 + }, + { + "epoch": 0.68, + "grad_norm": 12.701016426086426, + "learning_rate": 1.980864939076712e-05, + "loss": 0.9789, + "step": 3959 + }, + { + "epoch": 0.68, + "grad_norm": 9.719133377075195, + "learning_rate": 1.9806075167324526e-05, + "loss": 0.8249, + "step": 3960 + }, + { + "epoch": 0.68, + "grad_norm": 8.09665298461914, + "learning_rate": 1.980350094388193e-05, + "loss": 0.6647, + "step": 3961 + }, + { + "epoch": 0.68, + "grad_norm": 12.239025115966797, + "learning_rate": 1.9800926720439336e-05, + "loss": 1.0294, + "step": 3962 + }, + { + "epoch": 0.68, + "grad_norm": 9.544559478759766, + "learning_rate": 1.979835249699674e-05, + "loss": 0.7316, + "step": 3963 + }, + { + "epoch": 0.68, + "grad_norm": 10.569293022155762, + "learning_rate": 1.9795778273554146e-05, + "loss": 0.98, + "step": 3964 + }, + { + "epoch": 0.68, + "grad_norm": 8.094076156616211, + "learning_rate": 1.979320405011155e-05, + "loss": 0.75, + "step": 3965 + }, + { + "epoch": 0.68, + "grad_norm": 10.303956985473633, + "learning_rate": 1.9790629826668953e-05, + "loss": 0.7886, + "step": 3966 + }, + { + "epoch": 0.68, + "grad_norm": 10.011849403381348, + "learning_rate": 1.978805560322636e-05, + "loss": 0.8513, + "step": 3967 + }, + { + "epoch": 0.68, + "grad_norm": 9.344328880310059, + "learning_rate": 1.9785481379783763e-05, + "loss": 0.8999, + "step": 3968 + }, + { + "epoch": 0.68, + "grad_norm": 10.330906867980957, + "learning_rate": 1.9782907156341173e-05, + "loss": 1.0393, + "step": 3969 + }, + { + "epoch": 0.68, + "grad_norm": 10.964336395263672, + "learning_rate": 1.9780332932898576e-05, + "loss": 0.7451, + "step": 3970 + }, + { + "epoch": 0.68, + "grad_norm": 9.336642265319824, + "learning_rate": 1.9777758709455983e-05, + "loss": 0.9648, + "step": 3971 + }, + { + "epoch": 0.68, + "grad_norm": 6.92069673538208, + "learning_rate": 1.9775184486013386e-05, + "loss": 0.6064, + "step": 3972 + }, + { + "epoch": 0.68, + "grad_norm": 9.91940689086914, + "learning_rate": 1.9772610262570793e-05, + "loss": 0.7281, + "step": 3973 + }, + { + "epoch": 0.68, + "grad_norm": 9.428845405578613, + "learning_rate": 1.9770036039128196e-05, + "loss": 0.8193, + "step": 3974 + }, + { + "epoch": 0.68, + "grad_norm": 8.538307189941406, + "learning_rate": 1.97674618156856e-05, + "loss": 0.8031, + "step": 3975 + }, + { + "epoch": 0.68, + "grad_norm": 11.502593994140625, + "learning_rate": 1.9764887592243006e-05, + "loss": 0.8204, + "step": 3976 + }, + { + "epoch": 0.68, + "grad_norm": 11.508402824401855, + "learning_rate": 1.976231336880041e-05, + "loss": 0.8356, + "step": 3977 + }, + { + "epoch": 0.68, + "grad_norm": 10.833074569702148, + "learning_rate": 1.975973914535782e-05, + "loss": 0.675, + "step": 3978 + }, + { + "epoch": 0.68, + "grad_norm": 13.171976089477539, + "learning_rate": 1.9757164921915223e-05, + "loss": 1.0401, + "step": 3979 + }, + { + "epoch": 0.68, + "grad_norm": 10.629979133605957, + "learning_rate": 1.975459069847263e-05, + "loss": 1.0111, + "step": 3980 + }, + { + "epoch": 0.68, + "grad_norm": 10.351917266845703, + "learning_rate": 1.9752016475030033e-05, + "loss": 0.8158, + "step": 3981 + }, + { + "epoch": 0.68, + "grad_norm": 11.653636932373047, + "learning_rate": 1.974944225158744e-05, + "loss": 0.9946, + "step": 3982 + }, + { + "epoch": 0.68, + "grad_norm": 11.064470291137695, + "learning_rate": 1.9746868028144843e-05, + "loss": 0.8905, + "step": 3983 + }, + { + "epoch": 0.68, + "grad_norm": 8.972502708435059, + "learning_rate": 1.974429380470225e-05, + "loss": 0.7342, + "step": 3984 + }, + { + "epoch": 0.68, + "grad_norm": 11.094321250915527, + "learning_rate": 1.9741719581259653e-05, + "loss": 0.8376, + "step": 3985 + }, + { + "epoch": 0.68, + "grad_norm": 10.028883934020996, + "learning_rate": 1.9739145357817056e-05, + "loss": 0.9002, + "step": 3986 + }, + { + "epoch": 0.68, + "grad_norm": 13.039002418518066, + "learning_rate": 1.9736571134374463e-05, + "loss": 0.9004, + "step": 3987 + }, + { + "epoch": 0.68, + "grad_norm": 7.977614879608154, + "learning_rate": 1.973399691093187e-05, + "loss": 0.6631, + "step": 3988 + }, + { + "epoch": 0.68, + "grad_norm": 9.446571350097656, + "learning_rate": 1.9731422687489276e-05, + "loss": 0.7548, + "step": 3989 + }, + { + "epoch": 0.68, + "grad_norm": 13.09100341796875, + "learning_rate": 1.972884846404668e-05, + "loss": 1.1521, + "step": 3990 + }, + { + "epoch": 0.68, + "grad_norm": 10.782296180725098, + "learning_rate": 1.9726274240604086e-05, + "loss": 0.8137, + "step": 3991 + }, + { + "epoch": 0.69, + "grad_norm": 11.478397369384766, + "learning_rate": 1.972370001716149e-05, + "loss": 0.8823, + "step": 3992 + }, + { + "epoch": 0.69, + "grad_norm": 11.257417678833008, + "learning_rate": 1.9721125793718896e-05, + "loss": 1.0606, + "step": 3993 + }, + { + "epoch": 0.69, + "grad_norm": 10.692514419555664, + "learning_rate": 1.97185515702763e-05, + "loss": 1.0351, + "step": 3994 + }, + { + "epoch": 0.69, + "grad_norm": 13.016999244689941, + "learning_rate": 1.9715977346833706e-05, + "loss": 0.9902, + "step": 3995 + }, + { + "epoch": 0.69, + "grad_norm": 8.181031227111816, + "learning_rate": 1.971340312339111e-05, + "loss": 0.7479, + "step": 3996 + }, + { + "epoch": 0.69, + "grad_norm": 8.7083740234375, + "learning_rate": 1.9710828899948516e-05, + "loss": 0.8022, + "step": 3997 + }, + { + "epoch": 0.69, + "grad_norm": 7.884144306182861, + "learning_rate": 1.9708254676505923e-05, + "loss": 0.7198, + "step": 3998 + }, + { + "epoch": 0.69, + "grad_norm": 10.083291053771973, + "learning_rate": 1.9705680453063326e-05, + "loss": 1.1176, + "step": 3999 + }, + { + "epoch": 0.69, + "grad_norm": 8.507850646972656, + "learning_rate": 1.9703106229620733e-05, + "loss": 0.7748, + "step": 4000 + }, + { + "epoch": 0.69, + "grad_norm": 9.2831449508667, + "learning_rate": 1.9700532006178136e-05, + "loss": 0.9409, + "step": 4001 + }, + { + "epoch": 0.69, + "grad_norm": 10.327726364135742, + "learning_rate": 1.9697957782735543e-05, + "loss": 1.0776, + "step": 4002 + }, + { + "epoch": 0.69, + "grad_norm": 8.2455472946167, + "learning_rate": 1.9695383559292946e-05, + "loss": 0.714, + "step": 4003 + }, + { + "epoch": 0.69, + "grad_norm": 8.188271522521973, + "learning_rate": 1.9692809335850353e-05, + "loss": 0.8372, + "step": 4004 + }, + { + "epoch": 0.69, + "grad_norm": 9.892428398132324, + "learning_rate": 1.9690235112407756e-05, + "loss": 0.9201, + "step": 4005 + }, + { + "epoch": 0.69, + "grad_norm": 7.546725273132324, + "learning_rate": 1.9687660888965163e-05, + "loss": 0.8367, + "step": 4006 + }, + { + "epoch": 0.69, + "grad_norm": 8.803289413452148, + "learning_rate": 1.968508666552257e-05, + "loss": 0.7534, + "step": 4007 + }, + { + "epoch": 0.69, + "grad_norm": 9.834587097167969, + "learning_rate": 1.9682512442079973e-05, + "loss": 0.8098, + "step": 4008 + }, + { + "epoch": 0.69, + "grad_norm": 9.217619895935059, + "learning_rate": 1.967993821863738e-05, + "loss": 0.6934, + "step": 4009 + }, + { + "epoch": 0.69, + "grad_norm": 10.22787857055664, + "learning_rate": 1.9677363995194783e-05, + "loss": 1.0037, + "step": 4010 + }, + { + "epoch": 0.69, + "grad_norm": 9.565980911254883, + "learning_rate": 1.967478977175219e-05, + "loss": 0.8713, + "step": 4011 + }, + { + "epoch": 0.69, + "grad_norm": 10.52838134765625, + "learning_rate": 1.9672215548309593e-05, + "loss": 0.8921, + "step": 4012 + }, + { + "epoch": 0.69, + "grad_norm": 10.013540267944336, + "learning_rate": 1.9669641324867e-05, + "loss": 0.8828, + "step": 4013 + }, + { + "epoch": 0.69, + "grad_norm": 10.014223098754883, + "learning_rate": 1.9667067101424403e-05, + "loss": 0.8123, + "step": 4014 + }, + { + "epoch": 0.69, + "grad_norm": 9.63780689239502, + "learning_rate": 1.966449287798181e-05, + "loss": 0.7775, + "step": 4015 + }, + { + "epoch": 0.69, + "grad_norm": 10.96135425567627, + "learning_rate": 1.9661918654539216e-05, + "loss": 0.8927, + "step": 4016 + }, + { + "epoch": 0.69, + "grad_norm": 9.869874000549316, + "learning_rate": 1.965934443109662e-05, + "loss": 0.9113, + "step": 4017 + }, + { + "epoch": 0.69, + "grad_norm": 10.326499938964844, + "learning_rate": 1.9656770207654026e-05, + "loss": 0.8379, + "step": 4018 + }, + { + "epoch": 0.69, + "grad_norm": 10.697678565979004, + "learning_rate": 1.965419598421143e-05, + "loss": 1.0268, + "step": 4019 + }, + { + "epoch": 0.69, + "grad_norm": 10.77447509765625, + "learning_rate": 1.9651621760768836e-05, + "loss": 0.958, + "step": 4020 + }, + { + "epoch": 0.69, + "grad_norm": 10.400771141052246, + "learning_rate": 1.964904753732624e-05, + "loss": 0.7048, + "step": 4021 + }, + { + "epoch": 0.69, + "grad_norm": 10.796031951904297, + "learning_rate": 1.9646473313883646e-05, + "loss": 0.8138, + "step": 4022 + }, + { + "epoch": 0.69, + "grad_norm": 12.505208969116211, + "learning_rate": 1.964389909044105e-05, + "loss": 0.9146, + "step": 4023 + }, + { + "epoch": 0.69, + "grad_norm": 10.580912590026855, + "learning_rate": 1.9641324866998456e-05, + "loss": 0.8553, + "step": 4024 + }, + { + "epoch": 0.69, + "grad_norm": 11.221284866333008, + "learning_rate": 1.9638750643555863e-05, + "loss": 1.0036, + "step": 4025 + }, + { + "epoch": 0.69, + "grad_norm": 8.90367317199707, + "learning_rate": 1.963617642011327e-05, + "loss": 0.8058, + "step": 4026 + }, + { + "epoch": 0.69, + "grad_norm": 10.296404838562012, + "learning_rate": 1.9633602196670673e-05, + "loss": 0.9541, + "step": 4027 + }, + { + "epoch": 0.69, + "grad_norm": 10.00625991821289, + "learning_rate": 1.9631027973228076e-05, + "loss": 0.7058, + "step": 4028 + }, + { + "epoch": 0.69, + "grad_norm": 9.92353343963623, + "learning_rate": 1.9628453749785482e-05, + "loss": 0.845, + "step": 4029 + }, + { + "epoch": 0.69, + "grad_norm": 13.148040771484375, + "learning_rate": 1.9625879526342886e-05, + "loss": 0.9769, + "step": 4030 + }, + { + "epoch": 0.69, + "grad_norm": 11.550756454467773, + "learning_rate": 1.9623305302900292e-05, + "loss": 1.1777, + "step": 4031 + }, + { + "epoch": 0.69, + "grad_norm": 8.699030876159668, + "learning_rate": 1.9620731079457696e-05, + "loss": 0.7918, + "step": 4032 + }, + { + "epoch": 0.69, + "grad_norm": 8.76816463470459, + "learning_rate": 1.9618156856015102e-05, + "loss": 0.5004, + "step": 4033 + }, + { + "epoch": 0.69, + "grad_norm": 6.999948024749756, + "learning_rate": 1.9615582632572506e-05, + "loss": 0.615, + "step": 4034 + }, + { + "epoch": 0.69, + "grad_norm": 9.726495742797852, + "learning_rate": 1.9613008409129916e-05, + "loss": 0.9607, + "step": 4035 + }, + { + "epoch": 0.69, + "grad_norm": 9.991649627685547, + "learning_rate": 1.961043418568732e-05, + "loss": 0.9198, + "step": 4036 + }, + { + "epoch": 0.69, + "grad_norm": 10.714914321899414, + "learning_rate": 1.9607859962244726e-05, + "loss": 1.0131, + "step": 4037 + }, + { + "epoch": 0.69, + "grad_norm": 11.5370512008667, + "learning_rate": 1.960528573880213e-05, + "loss": 0.749, + "step": 4038 + }, + { + "epoch": 0.69, + "grad_norm": 11.226852416992188, + "learning_rate": 1.9602711515359532e-05, + "loss": 0.7837, + "step": 4039 + }, + { + "epoch": 0.69, + "grad_norm": 8.896933555603027, + "learning_rate": 1.960013729191694e-05, + "loss": 0.6167, + "step": 4040 + }, + { + "epoch": 0.69, + "grad_norm": 10.405925750732422, + "learning_rate": 1.9597563068474342e-05, + "loss": 0.972, + "step": 4041 + }, + { + "epoch": 0.69, + "grad_norm": 8.742990493774414, + "learning_rate": 1.959498884503175e-05, + "loss": 0.9503, + "step": 4042 + }, + { + "epoch": 0.69, + "grad_norm": 9.902505874633789, + "learning_rate": 1.9592414621589152e-05, + "loss": 0.8328, + "step": 4043 + }, + { + "epoch": 0.69, + "grad_norm": 8.653990745544434, + "learning_rate": 1.9589840398146562e-05, + "loss": 0.6924, + "step": 4044 + }, + { + "epoch": 0.69, + "grad_norm": 8.983641624450684, + "learning_rate": 1.9587266174703966e-05, + "loss": 0.6876, + "step": 4045 + }, + { + "epoch": 0.69, + "grad_norm": 11.483264923095703, + "learning_rate": 1.9584691951261372e-05, + "loss": 0.8161, + "step": 4046 + }, + { + "epoch": 0.69, + "grad_norm": 10.088647842407227, + "learning_rate": 1.9582117727818776e-05, + "loss": 0.9787, + "step": 4047 + }, + { + "epoch": 0.69, + "grad_norm": 9.744452476501465, + "learning_rate": 1.957954350437618e-05, + "loss": 0.6605, + "step": 4048 + }, + { + "epoch": 0.69, + "grad_norm": 10.046748161315918, + "learning_rate": 1.9576969280933586e-05, + "loss": 0.7685, + "step": 4049 + }, + { + "epoch": 0.7, + "grad_norm": 10.177292823791504, + "learning_rate": 1.957439505749099e-05, + "loss": 0.9668, + "step": 4050 + }, + { + "epoch": 0.7, + "grad_norm": 10.781136512756348, + "learning_rate": 1.9571820834048396e-05, + "loss": 0.9335, + "step": 4051 + }, + { + "epoch": 0.7, + "grad_norm": 10.072296142578125, + "learning_rate": 1.95692466106058e-05, + "loss": 0.9117, + "step": 4052 + }, + { + "epoch": 0.7, + "grad_norm": 11.207942008972168, + "learning_rate": 1.9566672387163206e-05, + "loss": 0.9793, + "step": 4053 + }, + { + "epoch": 0.7, + "grad_norm": 8.456552505493164, + "learning_rate": 1.9564098163720612e-05, + "loss": 0.5707, + "step": 4054 + }, + { + "epoch": 0.7, + "grad_norm": 11.58328914642334, + "learning_rate": 1.956152394027802e-05, + "loss": 0.8544, + "step": 4055 + }, + { + "epoch": 0.7, + "grad_norm": 10.354524612426758, + "learning_rate": 1.9558949716835422e-05, + "loss": 0.875, + "step": 4056 + }, + { + "epoch": 0.7, + "grad_norm": 10.670668601989746, + "learning_rate": 1.955637549339283e-05, + "loss": 0.7347, + "step": 4057 + }, + { + "epoch": 0.7, + "grad_norm": 9.295302391052246, + "learning_rate": 1.9553801269950232e-05, + "loss": 0.7823, + "step": 4058 + }, + { + "epoch": 0.7, + "grad_norm": 11.055832862854004, + "learning_rate": 1.9551227046507635e-05, + "loss": 0.8927, + "step": 4059 + }, + { + "epoch": 0.7, + "grad_norm": 11.445508003234863, + "learning_rate": 1.9548652823065042e-05, + "loss": 0.6966, + "step": 4060 + }, + { + "epoch": 0.7, + "grad_norm": 11.543179512023926, + "learning_rate": 1.9546078599622445e-05, + "loss": 0.9005, + "step": 4061 + }, + { + "epoch": 0.7, + "grad_norm": 10.495827674865723, + "learning_rate": 1.9543504376179852e-05, + "loss": 0.7935, + "step": 4062 + }, + { + "epoch": 0.7, + "grad_norm": 11.159778594970703, + "learning_rate": 1.954093015273726e-05, + "loss": 0.8622, + "step": 4063 + }, + { + "epoch": 0.7, + "grad_norm": 9.753851890563965, + "learning_rate": 1.9538355929294666e-05, + "loss": 0.7778, + "step": 4064 + }, + { + "epoch": 0.7, + "grad_norm": 11.581764221191406, + "learning_rate": 1.953578170585207e-05, + "loss": 0.8594, + "step": 4065 + }, + { + "epoch": 0.7, + "grad_norm": 10.900815963745117, + "learning_rate": 1.9533207482409475e-05, + "loss": 1.0903, + "step": 4066 + }, + { + "epoch": 0.7, + "grad_norm": 8.259923934936523, + "learning_rate": 1.953063325896688e-05, + "loss": 0.9654, + "step": 4067 + }, + { + "epoch": 0.7, + "grad_norm": 8.771180152893066, + "learning_rate": 1.9528059035524285e-05, + "loss": 0.6501, + "step": 4068 + }, + { + "epoch": 0.7, + "grad_norm": 9.301082611083984, + "learning_rate": 1.952548481208169e-05, + "loss": 0.798, + "step": 4069 + }, + { + "epoch": 0.7, + "grad_norm": 9.170222282409668, + "learning_rate": 1.9522910588639092e-05, + "loss": 0.8041, + "step": 4070 + }, + { + "epoch": 0.7, + "grad_norm": 10.72446346282959, + "learning_rate": 1.95203363651965e-05, + "loss": 1.018, + "step": 4071 + }, + { + "epoch": 0.7, + "grad_norm": 8.974662780761719, + "learning_rate": 1.9517762141753902e-05, + "loss": 0.7048, + "step": 4072 + }, + { + "epoch": 0.7, + "grad_norm": 9.767720222473145, + "learning_rate": 1.9515187918311312e-05, + "loss": 0.76, + "step": 4073 + }, + { + "epoch": 0.7, + "grad_norm": 9.805106163024902, + "learning_rate": 1.9512613694868715e-05, + "loss": 0.7146, + "step": 4074 + }, + { + "epoch": 0.7, + "grad_norm": 8.177083015441895, + "learning_rate": 1.9510039471426122e-05, + "loss": 0.6145, + "step": 4075 + }, + { + "epoch": 0.7, + "grad_norm": 10.28121566772461, + "learning_rate": 1.9507465247983525e-05, + "loss": 0.8947, + "step": 4076 + }, + { + "epoch": 0.7, + "grad_norm": 9.950400352478027, + "learning_rate": 1.9504891024540932e-05, + "loss": 0.8343, + "step": 4077 + }, + { + "epoch": 0.7, + "grad_norm": 10.08894157409668, + "learning_rate": 1.9502316801098335e-05, + "loss": 0.7929, + "step": 4078 + }, + { + "epoch": 0.7, + "grad_norm": 8.948592185974121, + "learning_rate": 1.949974257765574e-05, + "loss": 0.7078, + "step": 4079 + }, + { + "epoch": 0.7, + "grad_norm": 9.790074348449707, + "learning_rate": 1.9497168354213145e-05, + "loss": 0.9945, + "step": 4080 + }, + { + "epoch": 0.7, + "grad_norm": 8.925834655761719, + "learning_rate": 1.949459413077055e-05, + "loss": 0.6215, + "step": 4081 + }, + { + "epoch": 0.7, + "grad_norm": 10.762432098388672, + "learning_rate": 1.949201990732796e-05, + "loss": 1.0637, + "step": 4082 + }, + { + "epoch": 0.7, + "grad_norm": 8.114999771118164, + "learning_rate": 1.9489445683885362e-05, + "loss": 0.7603, + "step": 4083 + }, + { + "epoch": 0.7, + "grad_norm": 9.416316986083984, + "learning_rate": 1.948687146044277e-05, + "loss": 0.8125, + "step": 4084 + }, + { + "epoch": 0.7, + "grad_norm": 9.129293441772461, + "learning_rate": 1.9484297237000172e-05, + "loss": 0.8315, + "step": 4085 + }, + { + "epoch": 0.7, + "grad_norm": 9.308990478515625, + "learning_rate": 1.948172301355758e-05, + "loss": 0.9291, + "step": 4086 + }, + { + "epoch": 0.7, + "grad_norm": 9.069663047790527, + "learning_rate": 1.9479148790114982e-05, + "loss": 0.9763, + "step": 4087 + }, + { + "epoch": 0.7, + "grad_norm": 11.242548942565918, + "learning_rate": 1.947657456667239e-05, + "loss": 0.8067, + "step": 4088 + }, + { + "epoch": 0.7, + "grad_norm": 10.22472095489502, + "learning_rate": 1.9474000343229792e-05, + "loss": 0.9655, + "step": 4089 + }, + { + "epoch": 0.7, + "grad_norm": 9.0079984664917, + "learning_rate": 1.9471426119787195e-05, + "loss": 0.8751, + "step": 4090 + }, + { + "epoch": 0.7, + "grad_norm": 11.007758140563965, + "learning_rate": 1.9468851896344602e-05, + "loss": 1.2319, + "step": 4091 + }, + { + "epoch": 0.7, + "grad_norm": 9.411227226257324, + "learning_rate": 1.946627767290201e-05, + "loss": 0.8935, + "step": 4092 + }, + { + "epoch": 0.7, + "grad_norm": 10.085384368896484, + "learning_rate": 1.9463703449459415e-05, + "loss": 0.8864, + "step": 4093 + }, + { + "epoch": 0.7, + "grad_norm": 8.263283729553223, + "learning_rate": 1.946112922601682e-05, + "loss": 0.7343, + "step": 4094 + }, + { + "epoch": 0.7, + "grad_norm": 9.197152137756348, + "learning_rate": 1.9458555002574225e-05, + "loss": 0.7345, + "step": 4095 + }, + { + "epoch": 0.7, + "grad_norm": 7.262878894805908, + "learning_rate": 1.945598077913163e-05, + "loss": 0.7368, + "step": 4096 + }, + { + "epoch": 0.7, + "grad_norm": 9.347179412841797, + "learning_rate": 1.9453406555689035e-05, + "loss": 0.7142, + "step": 4097 + }, + { + "epoch": 0.7, + "grad_norm": 7.919077396392822, + "learning_rate": 1.945083233224644e-05, + "loss": 0.5645, + "step": 4098 + }, + { + "epoch": 0.7, + "grad_norm": 9.976900100708008, + "learning_rate": 1.9448258108803845e-05, + "loss": 1.0602, + "step": 4099 + }, + { + "epoch": 0.7, + "grad_norm": 9.500653266906738, + "learning_rate": 1.944568388536125e-05, + "loss": 0.6233, + "step": 4100 + }, + { + "epoch": 0.7, + "grad_norm": 8.730000495910645, + "learning_rate": 1.9443109661918655e-05, + "loss": 0.8537, + "step": 4101 + }, + { + "epoch": 0.7, + "grad_norm": 9.802042007446289, + "learning_rate": 1.9440535438476062e-05, + "loss": 0.8679, + "step": 4102 + }, + { + "epoch": 0.7, + "grad_norm": 9.940086364746094, + "learning_rate": 1.9437961215033465e-05, + "loss": 0.9542, + "step": 4103 + }, + { + "epoch": 0.7, + "grad_norm": 11.198892593383789, + "learning_rate": 1.9435386991590872e-05, + "loss": 0.9823, + "step": 4104 + }, + { + "epoch": 0.7, + "grad_norm": 8.480631828308105, + "learning_rate": 1.9432812768148275e-05, + "loss": 0.9504, + "step": 4105 + }, + { + "epoch": 0.7, + "grad_norm": 12.300588607788086, + "learning_rate": 1.9430238544705682e-05, + "loss": 0.9653, + "step": 4106 + }, + { + "epoch": 0.7, + "grad_norm": 9.719565391540527, + "learning_rate": 1.9427664321263085e-05, + "loss": 0.7701, + "step": 4107 + }, + { + "epoch": 0.7, + "grad_norm": 9.093180656433105, + "learning_rate": 1.9425090097820492e-05, + "loss": 0.7741, + "step": 4108 + }, + { + "epoch": 0.71, + "grad_norm": 13.70360279083252, + "learning_rate": 1.9422515874377895e-05, + "loss": 0.789, + "step": 4109 + }, + { + "epoch": 0.71, + "grad_norm": 9.274117469787598, + "learning_rate": 1.9419941650935302e-05, + "loss": 0.7288, + "step": 4110 + }, + { + "epoch": 0.71, + "grad_norm": 11.471673965454102, + "learning_rate": 1.941736742749271e-05, + "loss": 0.9696, + "step": 4111 + }, + { + "epoch": 0.71, + "grad_norm": 10.78393840789795, + "learning_rate": 1.941479320405011e-05, + "loss": 0.8418, + "step": 4112 + }, + { + "epoch": 0.71, + "grad_norm": 10.887661933898926, + "learning_rate": 1.941221898060752e-05, + "loss": 0.7292, + "step": 4113 + }, + { + "epoch": 0.71, + "grad_norm": 11.562491416931152, + "learning_rate": 1.940964475716492e-05, + "loss": 0.9941, + "step": 4114 + }, + { + "epoch": 0.71, + "grad_norm": 9.537960052490234, + "learning_rate": 1.940707053372233e-05, + "loss": 0.72, + "step": 4115 + }, + { + "epoch": 0.71, + "grad_norm": 8.944845199584961, + "learning_rate": 1.940449631027973e-05, + "loss": 0.9124, + "step": 4116 + }, + { + "epoch": 0.71, + "grad_norm": 12.464266777038574, + "learning_rate": 1.9401922086837138e-05, + "loss": 1.0593, + "step": 4117 + }, + { + "epoch": 0.71, + "grad_norm": 9.064270973205566, + "learning_rate": 1.939934786339454e-05, + "loss": 0.8479, + "step": 4118 + }, + { + "epoch": 0.71, + "grad_norm": 9.163776397705078, + "learning_rate": 1.9396773639951948e-05, + "loss": 0.9357, + "step": 4119 + }, + { + "epoch": 0.71, + "grad_norm": 9.305214881896973, + "learning_rate": 1.9394199416509355e-05, + "loss": 0.8636, + "step": 4120 + }, + { + "epoch": 0.71, + "grad_norm": 10.623992919921875, + "learning_rate": 1.9391625193066758e-05, + "loss": 0.9591, + "step": 4121 + }, + { + "epoch": 0.71, + "grad_norm": 10.971527099609375, + "learning_rate": 1.9389050969624165e-05, + "loss": 1.1617, + "step": 4122 + }, + { + "epoch": 0.71, + "grad_norm": 8.063480377197266, + "learning_rate": 1.9386476746181568e-05, + "loss": 0.7387, + "step": 4123 + }, + { + "epoch": 0.71, + "grad_norm": 7.922619342803955, + "learning_rate": 1.9383902522738975e-05, + "loss": 0.647, + "step": 4124 + }, + { + "epoch": 0.71, + "grad_norm": 9.8080472946167, + "learning_rate": 1.9381328299296378e-05, + "loss": 0.729, + "step": 4125 + }, + { + "epoch": 0.71, + "grad_norm": 9.750015258789062, + "learning_rate": 1.9378754075853785e-05, + "loss": 0.9147, + "step": 4126 + }, + { + "epoch": 0.71, + "grad_norm": 10.464468002319336, + "learning_rate": 1.9376179852411188e-05, + "loss": 0.7463, + "step": 4127 + }, + { + "epoch": 0.71, + "grad_norm": 10.865391731262207, + "learning_rate": 1.9373605628968595e-05, + "loss": 1.2584, + "step": 4128 + }, + { + "epoch": 0.71, + "grad_norm": 8.900446891784668, + "learning_rate": 1.9371031405525998e-05, + "loss": 0.8376, + "step": 4129 + }, + { + "epoch": 0.71, + "grad_norm": 10.419876098632812, + "learning_rate": 1.9368457182083408e-05, + "loss": 0.9709, + "step": 4130 + }, + { + "epoch": 0.71, + "grad_norm": 9.008069038391113, + "learning_rate": 1.936588295864081e-05, + "loss": 0.7046, + "step": 4131 + }, + { + "epoch": 0.71, + "grad_norm": 8.859042167663574, + "learning_rate": 1.9363308735198215e-05, + "loss": 0.838, + "step": 4132 + }, + { + "epoch": 0.71, + "grad_norm": 7.988000392913818, + "learning_rate": 1.936073451175562e-05, + "loss": 0.7154, + "step": 4133 + }, + { + "epoch": 0.71, + "grad_norm": 7.62507963180542, + "learning_rate": 1.9358160288313025e-05, + "loss": 0.7137, + "step": 4134 + }, + { + "epoch": 0.71, + "grad_norm": 7.302682399749756, + "learning_rate": 1.935558606487043e-05, + "loss": 0.7076, + "step": 4135 + }, + { + "epoch": 0.71, + "grad_norm": 9.77242374420166, + "learning_rate": 1.9353011841427835e-05, + "loss": 0.7872, + "step": 4136 + }, + { + "epoch": 0.71, + "grad_norm": 8.081562995910645, + "learning_rate": 1.935043761798524e-05, + "loss": 0.5662, + "step": 4137 + }, + { + "epoch": 0.71, + "grad_norm": 12.44813060760498, + "learning_rate": 1.9347863394542645e-05, + "loss": 1.0015, + "step": 4138 + }, + { + "epoch": 0.71, + "grad_norm": 11.001461029052734, + "learning_rate": 1.9345289171100055e-05, + "loss": 0.9098, + "step": 4139 + }, + { + "epoch": 0.71, + "grad_norm": 13.881959915161133, + "learning_rate": 1.9342714947657458e-05, + "loss": 1.1068, + "step": 4140 + }, + { + "epoch": 0.71, + "grad_norm": 11.648541450500488, + "learning_rate": 1.9340140724214865e-05, + "loss": 0.9119, + "step": 4141 + }, + { + "epoch": 0.71, + "grad_norm": 11.838765144348145, + "learning_rate": 1.9337566500772268e-05, + "loss": 0.9047, + "step": 4142 + }, + { + "epoch": 0.71, + "grad_norm": 10.513163566589355, + "learning_rate": 1.933499227732967e-05, + "loss": 0.7928, + "step": 4143 + }, + { + "epoch": 0.71, + "grad_norm": 9.546553611755371, + "learning_rate": 1.9332418053887078e-05, + "loss": 0.7728, + "step": 4144 + }, + { + "epoch": 0.71, + "grad_norm": 11.782511711120605, + "learning_rate": 1.932984383044448e-05, + "loss": 0.8833, + "step": 4145 + }, + { + "epoch": 0.71, + "grad_norm": 11.285393714904785, + "learning_rate": 1.9327269607001888e-05, + "loss": 0.9278, + "step": 4146 + }, + { + "epoch": 0.71, + "grad_norm": 12.105173110961914, + "learning_rate": 1.932469538355929e-05, + "loss": 0.9307, + "step": 4147 + }, + { + "epoch": 0.71, + "grad_norm": 8.945642471313477, + "learning_rate": 1.93221211601167e-05, + "loss": 0.7086, + "step": 4148 + }, + { + "epoch": 0.71, + "grad_norm": 8.571893692016602, + "learning_rate": 1.9319546936674105e-05, + "loss": 1.0421, + "step": 4149 + }, + { + "epoch": 0.71, + "grad_norm": 12.30229663848877, + "learning_rate": 1.931697271323151e-05, + "loss": 0.9622, + "step": 4150 + }, + { + "epoch": 0.71, + "grad_norm": 10.79883861541748, + "learning_rate": 1.9314398489788915e-05, + "loss": 0.8826, + "step": 4151 + }, + { + "epoch": 0.71, + "grad_norm": 11.391386985778809, + "learning_rate": 1.9311824266346318e-05, + "loss": 1.1381, + "step": 4152 + }, + { + "epoch": 0.71, + "grad_norm": 10.61949348449707, + "learning_rate": 1.9309250042903725e-05, + "loss": 0.7949, + "step": 4153 + }, + { + "epoch": 0.71, + "grad_norm": 10.430593490600586, + "learning_rate": 1.9306675819461128e-05, + "loss": 0.7028, + "step": 4154 + }, + { + "epoch": 0.71, + "grad_norm": 8.597196578979492, + "learning_rate": 1.9304101596018535e-05, + "loss": 0.8895, + "step": 4155 + }, + { + "epoch": 0.71, + "grad_norm": 12.876840591430664, + "learning_rate": 1.9301527372575938e-05, + "loss": 0.9208, + "step": 4156 + }, + { + "epoch": 0.71, + "grad_norm": 8.680956840515137, + "learning_rate": 1.9298953149133345e-05, + "loss": 0.6098, + "step": 4157 + }, + { + "epoch": 0.71, + "grad_norm": 8.585216522216797, + "learning_rate": 1.929637892569075e-05, + "loss": 0.7811, + "step": 4158 + }, + { + "epoch": 0.71, + "grad_norm": 9.264379501342773, + "learning_rate": 1.9293804702248158e-05, + "loss": 0.7338, + "step": 4159 + }, + { + "epoch": 0.71, + "grad_norm": 10.872125625610352, + "learning_rate": 1.929123047880556e-05, + "loss": 0.7388, + "step": 4160 + }, + { + "epoch": 0.71, + "grad_norm": 11.399907112121582, + "learning_rate": 1.9288656255362968e-05, + "loss": 0.9994, + "step": 4161 + }, + { + "epoch": 0.71, + "grad_norm": 9.19143295288086, + "learning_rate": 1.928608203192037e-05, + "loss": 0.7663, + "step": 4162 + }, + { + "epoch": 0.71, + "grad_norm": 11.973332405090332, + "learning_rate": 1.9283507808477774e-05, + "loss": 0.7019, + "step": 4163 + }, + { + "epoch": 0.71, + "grad_norm": 11.952611923217773, + "learning_rate": 1.928093358503518e-05, + "loss": 0.8369, + "step": 4164 + }, + { + "epoch": 0.71, + "grad_norm": 9.02942943572998, + "learning_rate": 1.9278359361592584e-05, + "loss": 0.6435, + "step": 4165 + }, + { + "epoch": 0.71, + "grad_norm": 10.73558521270752, + "learning_rate": 1.927578513814999e-05, + "loss": 0.7812, + "step": 4166 + }, + { + "epoch": 0.72, + "grad_norm": 10.565032958984375, + "learning_rate": 1.9273210914707398e-05, + "loss": 0.8437, + "step": 4167 + }, + { + "epoch": 0.72, + "grad_norm": 11.904428482055664, + "learning_rate": 1.9270636691264805e-05, + "loss": 1.0478, + "step": 4168 + }, + { + "epoch": 0.72, + "grad_norm": 12.629289627075195, + "learning_rate": 1.9268062467822208e-05, + "loss": 1.1011, + "step": 4169 + }, + { + "epoch": 0.72, + "grad_norm": 14.416136741638184, + "learning_rate": 1.9265488244379614e-05, + "loss": 1.1382, + "step": 4170 + }, + { + "epoch": 0.72, + "grad_norm": 11.459004402160645, + "learning_rate": 1.9262914020937018e-05, + "loss": 0.7723, + "step": 4171 + }, + { + "epoch": 0.72, + "grad_norm": 10.841681480407715, + "learning_rate": 1.9260339797494424e-05, + "loss": 0.857, + "step": 4172 + }, + { + "epoch": 0.72, + "grad_norm": 7.328619003295898, + "learning_rate": 1.9257765574051828e-05, + "loss": 0.5077, + "step": 4173 + }, + { + "epoch": 0.72, + "grad_norm": 9.433027267456055, + "learning_rate": 1.925519135060923e-05, + "loss": 0.5644, + "step": 4174 + }, + { + "epoch": 0.72, + "grad_norm": 9.906169891357422, + "learning_rate": 1.9252617127166638e-05, + "loss": 0.8372, + "step": 4175 + }, + { + "epoch": 0.72, + "grad_norm": 8.936905860900879, + "learning_rate": 1.925004290372404e-05, + "loss": 0.7456, + "step": 4176 + }, + { + "epoch": 0.72, + "grad_norm": 8.933682441711426, + "learning_rate": 1.924746868028145e-05, + "loss": 0.6806, + "step": 4177 + }, + { + "epoch": 0.72, + "grad_norm": 9.381811141967773, + "learning_rate": 1.9244894456838854e-05, + "loss": 0.7936, + "step": 4178 + }, + { + "epoch": 0.72, + "grad_norm": 10.274429321289062, + "learning_rate": 1.924232023339626e-05, + "loss": 0.9193, + "step": 4179 + }, + { + "epoch": 0.72, + "grad_norm": 10.535015106201172, + "learning_rate": 1.9239746009953664e-05, + "loss": 1.034, + "step": 4180 + }, + { + "epoch": 0.72, + "grad_norm": 12.031618118286133, + "learning_rate": 1.923717178651107e-05, + "loss": 1.0178, + "step": 4181 + }, + { + "epoch": 0.72, + "grad_norm": 11.387957572937012, + "learning_rate": 1.9234597563068474e-05, + "loss": 0.9654, + "step": 4182 + }, + { + "epoch": 0.72, + "grad_norm": 10.000457763671875, + "learning_rate": 1.923202333962588e-05, + "loss": 0.9824, + "step": 4183 + }, + { + "epoch": 0.72, + "grad_norm": 8.215188980102539, + "learning_rate": 1.9229449116183284e-05, + "loss": 0.5607, + "step": 4184 + }, + { + "epoch": 0.72, + "grad_norm": 10.841573715209961, + "learning_rate": 1.9226874892740688e-05, + "loss": 0.9767, + "step": 4185 + }, + { + "epoch": 0.72, + "grad_norm": 8.959955215454102, + "learning_rate": 1.9224300669298098e-05, + "loss": 0.8393, + "step": 4186 + }, + { + "epoch": 0.72, + "grad_norm": 12.130094528198242, + "learning_rate": 1.92217264458555e-05, + "loss": 0.9968, + "step": 4187 + }, + { + "epoch": 0.72, + "grad_norm": 9.533580780029297, + "learning_rate": 1.9219152222412908e-05, + "loss": 0.7348, + "step": 4188 + }, + { + "epoch": 0.72, + "grad_norm": 12.317439079284668, + "learning_rate": 1.921657799897031e-05, + "loss": 0.8343, + "step": 4189 + }, + { + "epoch": 0.72, + "grad_norm": 10.607730865478516, + "learning_rate": 1.9214003775527718e-05, + "loss": 0.8719, + "step": 4190 + }, + { + "epoch": 0.72, + "grad_norm": 10.656519889831543, + "learning_rate": 1.921142955208512e-05, + "loss": 0.7524, + "step": 4191 + }, + { + "epoch": 0.72, + "grad_norm": 11.75776195526123, + "learning_rate": 1.9208855328642528e-05, + "loss": 0.7508, + "step": 4192 + }, + { + "epoch": 0.72, + "grad_norm": 10.010034561157227, + "learning_rate": 1.920628110519993e-05, + "loss": 0.7258, + "step": 4193 + }, + { + "epoch": 0.72, + "grad_norm": 10.178448677062988, + "learning_rate": 1.9203706881757334e-05, + "loss": 0.8304, + "step": 4194 + }, + { + "epoch": 0.72, + "grad_norm": 11.157059669494629, + "learning_rate": 1.920113265831474e-05, + "loss": 0.9199, + "step": 4195 + }, + { + "epoch": 0.72, + "grad_norm": 10.935074806213379, + "learning_rate": 1.9198558434872148e-05, + "loss": 0.9531, + "step": 4196 + }, + { + "epoch": 0.72, + "grad_norm": 6.679344177246094, + "learning_rate": 1.9195984211429554e-05, + "loss": 0.6097, + "step": 4197 + }, + { + "epoch": 0.72, + "grad_norm": 11.078520774841309, + "learning_rate": 1.9193409987986958e-05, + "loss": 0.7061, + "step": 4198 + }, + { + "epoch": 0.72, + "grad_norm": 9.979924201965332, + "learning_rate": 1.9190835764544364e-05, + "loss": 0.8073, + "step": 4199 + }, + { + "epoch": 0.72, + "grad_norm": 11.267154693603516, + "learning_rate": 1.9188261541101767e-05, + "loss": 0.975, + "step": 4200 + }, + { + "epoch": 0.72, + "grad_norm": 8.881691932678223, + "learning_rate": 1.9185687317659174e-05, + "loss": 0.7223, + "step": 4201 + }, + { + "epoch": 0.72, + "grad_norm": 9.524392127990723, + "learning_rate": 1.9183113094216577e-05, + "loss": 0.7263, + "step": 4202 + }, + { + "epoch": 0.72, + "grad_norm": 8.83057975769043, + "learning_rate": 1.9180538870773984e-05, + "loss": 0.7493, + "step": 4203 + }, + { + "epoch": 0.72, + "grad_norm": 11.009383201599121, + "learning_rate": 1.9177964647331387e-05, + "loss": 0.5886, + "step": 4204 + }, + { + "epoch": 0.72, + "grad_norm": 13.207352638244629, + "learning_rate": 1.9175390423888794e-05, + "loss": 1.0468, + "step": 4205 + }, + { + "epoch": 0.72, + "grad_norm": 11.457111358642578, + "learning_rate": 1.91728162004462e-05, + "loss": 0.9551, + "step": 4206 + }, + { + "epoch": 0.72, + "grad_norm": 8.947735786437988, + "learning_rate": 1.9170241977003604e-05, + "loss": 0.8575, + "step": 4207 + }, + { + "epoch": 0.72, + "grad_norm": 10.02142333984375, + "learning_rate": 1.916766775356101e-05, + "loss": 0.8909, + "step": 4208 + }, + { + "epoch": 0.72, + "grad_norm": 9.354259490966797, + "learning_rate": 1.9165093530118414e-05, + "loss": 0.8369, + "step": 4209 + }, + { + "epoch": 0.72, + "grad_norm": 9.476853370666504, + "learning_rate": 1.916251930667582e-05, + "loss": 0.7134, + "step": 4210 + }, + { + "epoch": 0.72, + "grad_norm": 9.338850021362305, + "learning_rate": 1.9159945083233224e-05, + "loss": 0.8601, + "step": 4211 + }, + { + "epoch": 0.72, + "grad_norm": 10.50589370727539, + "learning_rate": 1.915737085979063e-05, + "loss": 0.834, + "step": 4212 + }, + { + "epoch": 0.72, + "grad_norm": 10.211137771606445, + "learning_rate": 1.9154796636348034e-05, + "loss": 0.9652, + "step": 4213 + }, + { + "epoch": 0.72, + "grad_norm": 7.985767841339111, + "learning_rate": 1.915222241290544e-05, + "loss": 0.8474, + "step": 4214 + }, + { + "epoch": 0.72, + "grad_norm": 9.327977180480957, + "learning_rate": 1.9149648189462847e-05, + "loss": 0.8398, + "step": 4215 + }, + { + "epoch": 0.72, + "grad_norm": 12.896589279174805, + "learning_rate": 1.914707396602025e-05, + "loss": 0.8843, + "step": 4216 + }, + { + "epoch": 0.72, + "grad_norm": 11.053586959838867, + "learning_rate": 1.9144499742577657e-05, + "loss": 0.8559, + "step": 4217 + }, + { + "epoch": 0.72, + "grad_norm": 10.480340957641602, + "learning_rate": 1.914192551913506e-05, + "loss": 0.9843, + "step": 4218 + }, + { + "epoch": 0.72, + "grad_norm": 12.425511360168457, + "learning_rate": 1.9139351295692467e-05, + "loss": 0.8868, + "step": 4219 + }, + { + "epoch": 0.72, + "grad_norm": 9.773504257202148, + "learning_rate": 1.913677707224987e-05, + "loss": 0.7274, + "step": 4220 + }, + { + "epoch": 0.72, + "grad_norm": 9.101371765136719, + "learning_rate": 1.9134202848807277e-05, + "loss": 0.829, + "step": 4221 + }, + { + "epoch": 0.72, + "grad_norm": 10.200393676757812, + "learning_rate": 1.913162862536468e-05, + "loss": 0.8919, + "step": 4222 + }, + { + "epoch": 0.72, + "grad_norm": 8.389777183532715, + "learning_rate": 1.9129054401922087e-05, + "loss": 0.7035, + "step": 4223 + }, + { + "epoch": 0.72, + "grad_norm": 10.115687370300293, + "learning_rate": 1.9126480178479494e-05, + "loss": 0.817, + "step": 4224 + }, + { + "epoch": 0.73, + "grad_norm": 6.318549156188965, + "learning_rate": 1.9123905955036897e-05, + "loss": 0.5087, + "step": 4225 + }, + { + "epoch": 0.73, + "grad_norm": 9.522407531738281, + "learning_rate": 1.9121331731594304e-05, + "loss": 0.8161, + "step": 4226 + }, + { + "epoch": 0.73, + "grad_norm": 11.159699440002441, + "learning_rate": 1.9118757508151707e-05, + "loss": 0.9951, + "step": 4227 + }, + { + "epoch": 0.73, + "grad_norm": 7.890766143798828, + "learning_rate": 1.9116183284709114e-05, + "loss": 0.6243, + "step": 4228 + }, + { + "epoch": 0.73, + "grad_norm": 9.272420883178711, + "learning_rate": 1.9113609061266517e-05, + "loss": 0.9083, + "step": 4229 + }, + { + "epoch": 0.73, + "grad_norm": 9.610692024230957, + "learning_rate": 1.9111034837823924e-05, + "loss": 1.009, + "step": 4230 + }, + { + "epoch": 0.73, + "grad_norm": 10.850475311279297, + "learning_rate": 1.9108460614381327e-05, + "loss": 0.7239, + "step": 4231 + }, + { + "epoch": 0.73, + "grad_norm": 10.294280052185059, + "learning_rate": 1.9105886390938734e-05, + "loss": 0.8397, + "step": 4232 + }, + { + "epoch": 0.73, + "grad_norm": 9.980622291564941, + "learning_rate": 1.9103312167496137e-05, + "loss": 0.9628, + "step": 4233 + }, + { + "epoch": 0.73, + "grad_norm": 10.041665077209473, + "learning_rate": 1.9100737944053547e-05, + "loss": 0.7433, + "step": 4234 + }, + { + "epoch": 0.73, + "grad_norm": 7.734867095947266, + "learning_rate": 1.909816372061095e-05, + "loss": 0.7507, + "step": 4235 + }, + { + "epoch": 0.73, + "grad_norm": 8.993006706237793, + "learning_rate": 1.9095589497168354e-05, + "loss": 0.9328, + "step": 4236 + }, + { + "epoch": 0.73, + "grad_norm": 11.371488571166992, + "learning_rate": 1.909301527372576e-05, + "loss": 1.0895, + "step": 4237 + }, + { + "epoch": 0.73, + "grad_norm": 9.333434104919434, + "learning_rate": 1.9090441050283164e-05, + "loss": 0.8659, + "step": 4238 + }, + { + "epoch": 0.73, + "grad_norm": 9.16541862487793, + "learning_rate": 1.908786682684057e-05, + "loss": 0.8717, + "step": 4239 + }, + { + "epoch": 0.73, + "grad_norm": 7.871678352355957, + "learning_rate": 1.9085292603397974e-05, + "loss": 0.646, + "step": 4240 + }, + { + "epoch": 0.73, + "grad_norm": 9.546360969543457, + "learning_rate": 1.908271837995538e-05, + "loss": 0.7511, + "step": 4241 + }, + { + "epoch": 0.73, + "grad_norm": 9.895899772644043, + "learning_rate": 1.9080144156512784e-05, + "loss": 0.7912, + "step": 4242 + }, + { + "epoch": 0.73, + "grad_norm": 9.175511360168457, + "learning_rate": 1.9077569933070194e-05, + "loss": 0.9611, + "step": 4243 + }, + { + "epoch": 0.73, + "grad_norm": 10.024049758911133, + "learning_rate": 1.9074995709627597e-05, + "loss": 0.9508, + "step": 4244 + }, + { + "epoch": 0.73, + "grad_norm": 9.078110694885254, + "learning_rate": 1.9072421486185004e-05, + "loss": 0.9578, + "step": 4245 + }, + { + "epoch": 0.73, + "grad_norm": 11.51116943359375, + "learning_rate": 1.9069847262742407e-05, + "loss": 1.1497, + "step": 4246 + }, + { + "epoch": 0.73, + "grad_norm": 8.76651668548584, + "learning_rate": 1.906727303929981e-05, + "loss": 0.6663, + "step": 4247 + }, + { + "epoch": 0.73, + "grad_norm": 9.659747123718262, + "learning_rate": 1.9064698815857217e-05, + "loss": 0.8842, + "step": 4248 + }, + { + "epoch": 0.73, + "grad_norm": 12.561182022094727, + "learning_rate": 1.906212459241462e-05, + "loss": 0.8444, + "step": 4249 + }, + { + "epoch": 0.73, + "grad_norm": 12.045940399169922, + "learning_rate": 1.9059550368972027e-05, + "loss": 0.9408, + "step": 4250 + }, + { + "epoch": 0.73, + "grad_norm": 12.490055084228516, + "learning_rate": 1.905697614552943e-05, + "loss": 0.9343, + "step": 4251 + }, + { + "epoch": 0.73, + "grad_norm": 8.027691841125488, + "learning_rate": 1.9054401922086837e-05, + "loss": 0.6872, + "step": 4252 + }, + { + "epoch": 0.73, + "grad_norm": 7.355417251586914, + "learning_rate": 1.9051827698644244e-05, + "loss": 0.5568, + "step": 4253 + }, + { + "epoch": 0.73, + "grad_norm": 10.123159408569336, + "learning_rate": 1.904925347520165e-05, + "loss": 0.7632, + "step": 4254 + }, + { + "epoch": 0.73, + "grad_norm": 11.372822761535645, + "learning_rate": 1.9046679251759054e-05, + "loss": 0.8914, + "step": 4255 + }, + { + "epoch": 0.73, + "grad_norm": 10.501077651977539, + "learning_rate": 1.9044105028316457e-05, + "loss": 0.824, + "step": 4256 + }, + { + "epoch": 0.73, + "grad_norm": 11.116118431091309, + "learning_rate": 1.9041530804873864e-05, + "loss": 1.2319, + "step": 4257 + }, + { + "epoch": 0.73, + "grad_norm": 10.635515213012695, + "learning_rate": 1.9038956581431267e-05, + "loss": 0.7783, + "step": 4258 + }, + { + "epoch": 0.73, + "grad_norm": 7.615957260131836, + "learning_rate": 1.9036382357988674e-05, + "loss": 0.5975, + "step": 4259 + }, + { + "epoch": 0.73, + "grad_norm": 8.458331108093262, + "learning_rate": 1.9033808134546077e-05, + "loss": 0.7164, + "step": 4260 + }, + { + "epoch": 0.73, + "grad_norm": 10.402206420898438, + "learning_rate": 1.9031233911103484e-05, + "loss": 0.7424, + "step": 4261 + }, + { + "epoch": 0.73, + "grad_norm": 9.506240844726562, + "learning_rate": 1.902865968766089e-05, + "loss": 1.1194, + "step": 4262 + }, + { + "epoch": 0.73, + "grad_norm": 9.461847305297852, + "learning_rate": 1.9026085464218297e-05, + "loss": 0.7483, + "step": 4263 + }, + { + "epoch": 0.73, + "grad_norm": 10.207564353942871, + "learning_rate": 1.90235112407757e-05, + "loss": 0.6526, + "step": 4264 + }, + { + "epoch": 0.73, + "grad_norm": 10.552288055419922, + "learning_rate": 1.9020937017333107e-05, + "loss": 1.1085, + "step": 4265 + }, + { + "epoch": 0.73, + "grad_norm": 8.87618637084961, + "learning_rate": 1.901836279389051e-05, + "loss": 0.7648, + "step": 4266 + }, + { + "epoch": 0.73, + "grad_norm": 11.346138954162598, + "learning_rate": 1.9015788570447914e-05, + "loss": 0.7646, + "step": 4267 + }, + { + "epoch": 0.73, + "grad_norm": 8.180513381958008, + "learning_rate": 1.901321434700532e-05, + "loss": 0.7532, + "step": 4268 + }, + { + "epoch": 0.73, + "grad_norm": 9.566898345947266, + "learning_rate": 1.9010640123562723e-05, + "loss": 0.8056, + "step": 4269 + }, + { + "epoch": 0.73, + "grad_norm": 10.16606616973877, + "learning_rate": 1.900806590012013e-05, + "loss": 0.7936, + "step": 4270 + }, + { + "epoch": 0.73, + "grad_norm": 8.111207962036133, + "learning_rate": 1.9005491676677537e-05, + "loss": 0.614, + "step": 4271 + }, + { + "epoch": 0.73, + "grad_norm": 11.534924507141113, + "learning_rate": 1.9002917453234944e-05, + "loss": 1.2625, + "step": 4272 + }, + { + "epoch": 0.73, + "grad_norm": 8.890825271606445, + "learning_rate": 1.9000343229792347e-05, + "loss": 0.8264, + "step": 4273 + }, + { + "epoch": 0.73, + "grad_norm": 7.985905647277832, + "learning_rate": 1.8997769006349754e-05, + "loss": 0.7684, + "step": 4274 + }, + { + "epoch": 0.73, + "grad_norm": 11.069395065307617, + "learning_rate": 1.8995194782907157e-05, + "loss": 1.0358, + "step": 4275 + }, + { + "epoch": 0.73, + "grad_norm": 10.809677124023438, + "learning_rate": 1.8992620559464563e-05, + "loss": 0.8179, + "step": 4276 + }, + { + "epoch": 0.73, + "grad_norm": 9.157917022705078, + "learning_rate": 1.8990046336021967e-05, + "loss": 0.6746, + "step": 4277 + }, + { + "epoch": 0.73, + "grad_norm": 10.686842918395996, + "learning_rate": 1.898747211257937e-05, + "loss": 0.8305, + "step": 4278 + }, + { + "epoch": 0.73, + "grad_norm": 11.763129234313965, + "learning_rate": 1.8984897889136777e-05, + "loss": 1.1049, + "step": 4279 + }, + { + "epoch": 0.73, + "grad_norm": 10.007186889648438, + "learning_rate": 1.898232366569418e-05, + "loss": 0.85, + "step": 4280 + }, + { + "epoch": 0.73, + "grad_norm": 11.028447151184082, + "learning_rate": 1.897974944225159e-05, + "loss": 0.6868, + "step": 4281 + }, + { + "epoch": 0.73, + "grad_norm": 13.202744483947754, + "learning_rate": 1.8977175218808993e-05, + "loss": 0.9465, + "step": 4282 + }, + { + "epoch": 0.74, + "grad_norm": 11.139639854431152, + "learning_rate": 1.89746009953664e-05, + "loss": 1.0126, + "step": 4283 + }, + { + "epoch": 0.74, + "grad_norm": 12.211018562316895, + "learning_rate": 1.8972026771923803e-05, + "loss": 1.0667, + "step": 4284 + }, + { + "epoch": 0.74, + "grad_norm": 10.292472839355469, + "learning_rate": 1.896945254848121e-05, + "loss": 0.6016, + "step": 4285 + }, + { + "epoch": 0.74, + "grad_norm": 12.162647247314453, + "learning_rate": 1.8966878325038613e-05, + "loss": 0.934, + "step": 4286 + }, + { + "epoch": 0.74, + "grad_norm": 10.490167617797852, + "learning_rate": 1.896430410159602e-05, + "loss": 0.7284, + "step": 4287 + }, + { + "epoch": 0.74, + "grad_norm": 12.79624080657959, + "learning_rate": 1.8961729878153423e-05, + "loss": 0.8251, + "step": 4288 + }, + { + "epoch": 0.74, + "grad_norm": 9.193233489990234, + "learning_rate": 1.8959155654710827e-05, + "loss": 0.7528, + "step": 4289 + }, + { + "epoch": 0.74, + "grad_norm": 9.678086280822754, + "learning_rate": 1.8956581431268237e-05, + "loss": 0.8643, + "step": 4290 + }, + { + "epoch": 0.74, + "grad_norm": 10.563193321228027, + "learning_rate": 1.895400720782564e-05, + "loss": 0.9623, + "step": 4291 + }, + { + "epoch": 0.74, + "grad_norm": 10.784810066223145, + "learning_rate": 1.8951432984383047e-05, + "loss": 0.775, + "step": 4292 + }, + { + "epoch": 0.74, + "grad_norm": 11.589600563049316, + "learning_rate": 1.894885876094045e-05, + "loss": 0.7281, + "step": 4293 + }, + { + "epoch": 0.74, + "grad_norm": 8.031818389892578, + "learning_rate": 1.8946284537497857e-05, + "loss": 0.5981, + "step": 4294 + }, + { + "epoch": 0.74, + "grad_norm": 10.341252326965332, + "learning_rate": 1.894371031405526e-05, + "loss": 0.9253, + "step": 4295 + }, + { + "epoch": 0.74, + "grad_norm": 9.869950294494629, + "learning_rate": 1.8941136090612667e-05, + "loss": 0.8761, + "step": 4296 + }, + { + "epoch": 0.74, + "grad_norm": 8.53954029083252, + "learning_rate": 1.893856186717007e-05, + "loss": 0.5906, + "step": 4297 + }, + { + "epoch": 0.74, + "grad_norm": 9.133573532104492, + "learning_rate": 1.8935987643727473e-05, + "loss": 0.6317, + "step": 4298 + }, + { + "epoch": 0.74, + "grad_norm": 11.367504119873047, + "learning_rate": 1.893341342028488e-05, + "loss": 0.9038, + "step": 4299 + }, + { + "epoch": 0.74, + "grad_norm": 12.127020835876465, + "learning_rate": 1.8930839196842287e-05, + "loss": 0.9277, + "step": 4300 + }, + { + "epoch": 0.74, + "grad_norm": 11.476219177246094, + "learning_rate": 1.8928264973399693e-05, + "loss": 0.8453, + "step": 4301 + }, + { + "epoch": 0.74, + "grad_norm": 9.979869842529297, + "learning_rate": 1.8925690749957097e-05, + "loss": 0.8248, + "step": 4302 + }, + { + "epoch": 0.74, + "grad_norm": 9.847350120544434, + "learning_rate": 1.8923116526514503e-05, + "loss": 0.7173, + "step": 4303 + }, + { + "epoch": 0.74, + "grad_norm": 9.573976516723633, + "learning_rate": 1.8920542303071907e-05, + "loss": 1.047, + "step": 4304 + }, + { + "epoch": 0.74, + "grad_norm": 11.462332725524902, + "learning_rate": 1.8917968079629313e-05, + "loss": 0.6791, + "step": 4305 + }, + { + "epoch": 0.74, + "grad_norm": 9.317166328430176, + "learning_rate": 1.8915393856186716e-05, + "loss": 0.7915, + "step": 4306 + }, + { + "epoch": 0.74, + "grad_norm": 8.177624702453613, + "learning_rate": 1.8912819632744123e-05, + "loss": 0.528, + "step": 4307 + }, + { + "epoch": 0.74, + "grad_norm": 9.47716236114502, + "learning_rate": 1.8910245409301526e-05, + "loss": 0.6932, + "step": 4308 + }, + { + "epoch": 0.74, + "grad_norm": 7.621345520019531, + "learning_rate": 1.8907671185858933e-05, + "loss": 0.5768, + "step": 4309 + }, + { + "epoch": 0.74, + "grad_norm": 11.76716423034668, + "learning_rate": 1.890509696241634e-05, + "loss": 0.7745, + "step": 4310 + }, + { + "epoch": 0.74, + "grad_norm": 13.033775329589844, + "learning_rate": 1.8902522738973743e-05, + "loss": 0.9711, + "step": 4311 + }, + { + "epoch": 0.74, + "grad_norm": 10.942069053649902, + "learning_rate": 1.889994851553115e-05, + "loss": 0.6979, + "step": 4312 + }, + { + "epoch": 0.74, + "grad_norm": 10.966338157653809, + "learning_rate": 1.8897374292088553e-05, + "loss": 0.8136, + "step": 4313 + }, + { + "epoch": 0.74, + "grad_norm": 9.818129539489746, + "learning_rate": 1.889480006864596e-05, + "loss": 0.8066, + "step": 4314 + }, + { + "epoch": 0.74, + "grad_norm": 10.387496948242188, + "learning_rate": 1.8892225845203363e-05, + "loss": 0.6352, + "step": 4315 + }, + { + "epoch": 0.74, + "grad_norm": 9.305728912353516, + "learning_rate": 1.888965162176077e-05, + "loss": 0.6251, + "step": 4316 + }, + { + "epoch": 0.74, + "grad_norm": 10.720781326293945, + "learning_rate": 1.8887077398318173e-05, + "loss": 0.837, + "step": 4317 + }, + { + "epoch": 0.74, + "grad_norm": 11.153433799743652, + "learning_rate": 1.888450317487558e-05, + "loss": 1.0185, + "step": 4318 + }, + { + "epoch": 0.74, + "grad_norm": 12.407583236694336, + "learning_rate": 1.8881928951432986e-05, + "loss": 0.6867, + "step": 4319 + }, + { + "epoch": 0.74, + "grad_norm": 9.725436210632324, + "learning_rate": 1.887935472799039e-05, + "loss": 0.6625, + "step": 4320 + }, + { + "epoch": 0.74, + "grad_norm": 9.523588180541992, + "learning_rate": 1.8876780504547796e-05, + "loss": 0.7459, + "step": 4321 + }, + { + "epoch": 0.74, + "grad_norm": 12.071678161621094, + "learning_rate": 1.88742062811052e-05, + "loss": 0.6611, + "step": 4322 + }, + { + "epoch": 0.74, + "grad_norm": 9.482538223266602, + "learning_rate": 1.8871632057662606e-05, + "loss": 0.8529, + "step": 4323 + }, + { + "epoch": 0.74, + "grad_norm": 11.277457237243652, + "learning_rate": 1.886905783422001e-05, + "loss": 0.636, + "step": 4324 + }, + { + "epoch": 0.74, + "grad_norm": 8.441004753112793, + "learning_rate": 1.8866483610777416e-05, + "loss": 0.7508, + "step": 4325 + }, + { + "epoch": 0.74, + "grad_norm": 9.06314468383789, + "learning_rate": 1.886390938733482e-05, + "loss": 0.6786, + "step": 4326 + }, + { + "epoch": 0.74, + "grad_norm": 11.114742279052734, + "learning_rate": 1.8861335163892226e-05, + "loss": 0.9106, + "step": 4327 + }, + { + "epoch": 0.74, + "grad_norm": 10.636996269226074, + "learning_rate": 1.8858760940449633e-05, + "loss": 1.137, + "step": 4328 + }, + { + "epoch": 0.74, + "grad_norm": 12.75655460357666, + "learning_rate": 1.8856186717007036e-05, + "loss": 1.2952, + "step": 4329 + }, + { + "epoch": 0.74, + "grad_norm": 9.261377334594727, + "learning_rate": 1.8853612493564443e-05, + "loss": 0.6218, + "step": 4330 + }, + { + "epoch": 0.74, + "grad_norm": 10.316879272460938, + "learning_rate": 1.8851038270121846e-05, + "loss": 0.7588, + "step": 4331 + }, + { + "epoch": 0.74, + "grad_norm": 9.593915939331055, + "learning_rate": 1.8848464046679253e-05, + "loss": 0.7051, + "step": 4332 + }, + { + "epoch": 0.74, + "grad_norm": 10.073686599731445, + "learning_rate": 1.8845889823236656e-05, + "loss": 0.8409, + "step": 4333 + }, + { + "epoch": 0.74, + "grad_norm": 10.310689926147461, + "learning_rate": 1.8843315599794063e-05, + "loss": 0.927, + "step": 4334 + }, + { + "epoch": 0.74, + "grad_norm": 12.633893013000488, + "learning_rate": 1.8840741376351466e-05, + "loss": 0.8132, + "step": 4335 + }, + { + "epoch": 0.74, + "grad_norm": 8.84637451171875, + "learning_rate": 1.8838167152908873e-05, + "loss": 0.7943, + "step": 4336 + }, + { + "epoch": 0.74, + "grad_norm": 8.861120223999023, + "learning_rate": 1.8835592929466276e-05, + "loss": 0.7672, + "step": 4337 + }, + { + "epoch": 0.74, + "grad_norm": 8.433868408203125, + "learning_rate": 1.8833018706023686e-05, + "loss": 0.6974, + "step": 4338 + }, + { + "epoch": 0.74, + "grad_norm": 7.834355354309082, + "learning_rate": 1.883044448258109e-05, + "loss": 0.72, + "step": 4339 + }, + { + "epoch": 0.74, + "grad_norm": 9.720258712768555, + "learning_rate": 1.8827870259138493e-05, + "loss": 0.9226, + "step": 4340 + }, + { + "epoch": 0.74, + "grad_norm": 10.472895622253418, + "learning_rate": 1.88252960356959e-05, + "loss": 0.8724, + "step": 4341 + }, + { + "epoch": 0.75, + "grad_norm": 12.831478118896484, + "learning_rate": 1.8822721812253303e-05, + "loss": 0.8691, + "step": 4342 + }, + { + "epoch": 0.75, + "grad_norm": 9.863024711608887, + "learning_rate": 1.882014758881071e-05, + "loss": 0.8022, + "step": 4343 + }, + { + "epoch": 0.75, + "grad_norm": 9.083821296691895, + "learning_rate": 1.8817573365368113e-05, + "loss": 0.758, + "step": 4344 + }, + { + "epoch": 0.75, + "grad_norm": 10.028779983520508, + "learning_rate": 1.881499914192552e-05, + "loss": 0.7373, + "step": 4345 + }, + { + "epoch": 0.75, + "grad_norm": 9.553868293762207, + "learning_rate": 1.8812424918482923e-05, + "loss": 0.654, + "step": 4346 + }, + { + "epoch": 0.75, + "grad_norm": 11.712279319763184, + "learning_rate": 1.8809850695040333e-05, + "loss": 0.7303, + "step": 4347 + }, + { + "epoch": 0.75, + "grad_norm": 11.865372657775879, + "learning_rate": 1.8807276471597736e-05, + "loss": 1.0221, + "step": 4348 + }, + { + "epoch": 0.75, + "grad_norm": 10.506478309631348, + "learning_rate": 1.8804702248155143e-05, + "loss": 0.7201, + "step": 4349 + }, + { + "epoch": 0.75, + "grad_norm": 10.890604019165039, + "learning_rate": 1.8802128024712546e-05, + "loss": 0.7847, + "step": 4350 + }, + { + "epoch": 0.75, + "grad_norm": 7.948525428771973, + "learning_rate": 1.879955380126995e-05, + "loss": 0.5751, + "step": 4351 + }, + { + "epoch": 0.75, + "grad_norm": 10.099681854248047, + "learning_rate": 1.8796979577827356e-05, + "loss": 0.8288, + "step": 4352 + }, + { + "epoch": 0.75, + "grad_norm": 10.870171546936035, + "learning_rate": 1.879440535438476e-05, + "loss": 1.0622, + "step": 4353 + }, + { + "epoch": 0.75, + "grad_norm": 12.078024864196777, + "learning_rate": 1.8791831130942166e-05, + "loss": 0.9769, + "step": 4354 + }, + { + "epoch": 0.75, + "grad_norm": 10.907835960388184, + "learning_rate": 1.878925690749957e-05, + "loss": 0.9277, + "step": 4355 + }, + { + "epoch": 0.75, + "grad_norm": 10.870816230773926, + "learning_rate": 1.8786682684056976e-05, + "loss": 0.7616, + "step": 4356 + }, + { + "epoch": 0.75, + "grad_norm": 8.830236434936523, + "learning_rate": 1.8784108460614383e-05, + "loss": 0.7201, + "step": 4357 + }, + { + "epoch": 0.75, + "grad_norm": 10.235135078430176, + "learning_rate": 1.878153423717179e-05, + "loss": 0.8442, + "step": 4358 + }, + { + "epoch": 0.75, + "grad_norm": 7.954062461853027, + "learning_rate": 1.8778960013729193e-05, + "loss": 0.6129, + "step": 4359 + }, + { + "epoch": 0.75, + "grad_norm": 8.778276443481445, + "learning_rate": 1.8776385790286596e-05, + "loss": 1.0534, + "step": 4360 + }, + { + "epoch": 0.75, + "grad_norm": 10.113384246826172, + "learning_rate": 1.8773811566844003e-05, + "loss": 0.7507, + "step": 4361 + }, + { + "epoch": 0.75, + "grad_norm": 7.1980390548706055, + "learning_rate": 1.8771237343401406e-05, + "loss": 0.4489, + "step": 4362 + }, + { + "epoch": 0.75, + "grad_norm": 12.564810752868652, + "learning_rate": 1.8768663119958813e-05, + "loss": 0.8186, + "step": 4363 + }, + { + "epoch": 0.75, + "grad_norm": 8.596735954284668, + "learning_rate": 1.8766088896516216e-05, + "loss": 0.5342, + "step": 4364 + }, + { + "epoch": 0.75, + "grad_norm": 10.781926155090332, + "learning_rate": 1.8763514673073623e-05, + "loss": 1.0391, + "step": 4365 + }, + { + "epoch": 0.75, + "grad_norm": 9.596892356872559, + "learning_rate": 1.876094044963103e-05, + "loss": 0.8483, + "step": 4366 + }, + { + "epoch": 0.75, + "grad_norm": 12.084346771240234, + "learning_rate": 1.8758366226188436e-05, + "loss": 0.7646, + "step": 4367 + }, + { + "epoch": 0.75, + "grad_norm": 10.50231647491455, + "learning_rate": 1.875579200274584e-05, + "loss": 0.8787, + "step": 4368 + }, + { + "epoch": 0.75, + "grad_norm": 9.484762191772461, + "learning_rate": 1.8753217779303246e-05, + "loss": 0.7781, + "step": 4369 + }, + { + "epoch": 0.75, + "grad_norm": 11.743453979492188, + "learning_rate": 1.875064355586065e-05, + "loss": 0.9012, + "step": 4370 + }, + { + "epoch": 0.75, + "grad_norm": 12.534058570861816, + "learning_rate": 1.8748069332418053e-05, + "loss": 0.9213, + "step": 4371 + }, + { + "epoch": 0.75, + "grad_norm": 11.27253246307373, + "learning_rate": 1.874549510897546e-05, + "loss": 0.7629, + "step": 4372 + }, + { + "epoch": 0.75, + "grad_norm": 10.572549819946289, + "learning_rate": 1.8742920885532862e-05, + "loss": 0.7689, + "step": 4373 + }, + { + "epoch": 0.75, + "grad_norm": 10.57839298248291, + "learning_rate": 1.874034666209027e-05, + "loss": 0.9119, + "step": 4374 + }, + { + "epoch": 0.75, + "grad_norm": 10.892982482910156, + "learning_rate": 1.8737772438647676e-05, + "loss": 0.9118, + "step": 4375 + }, + { + "epoch": 0.75, + "grad_norm": 10.607345581054688, + "learning_rate": 1.8735198215205083e-05, + "loss": 0.6165, + "step": 4376 + }, + { + "epoch": 0.75, + "grad_norm": 10.54564094543457, + "learning_rate": 1.8732623991762486e-05, + "loss": 0.9474, + "step": 4377 + }, + { + "epoch": 0.75, + "grad_norm": 9.732884407043457, + "learning_rate": 1.8730049768319893e-05, + "loss": 0.8056, + "step": 4378 + }, + { + "epoch": 0.75, + "grad_norm": 10.062182426452637, + "learning_rate": 1.8727475544877296e-05, + "loss": 0.9305, + "step": 4379 + }, + { + "epoch": 0.75, + "grad_norm": 10.44251537322998, + "learning_rate": 1.8724901321434702e-05, + "loss": 0.9487, + "step": 4380 + }, + { + "epoch": 0.75, + "grad_norm": 9.254076957702637, + "learning_rate": 1.8722327097992106e-05, + "loss": 0.8822, + "step": 4381 + }, + { + "epoch": 0.75, + "grad_norm": 9.709226608276367, + "learning_rate": 1.871975287454951e-05, + "loss": 0.7476, + "step": 4382 + }, + { + "epoch": 0.75, + "grad_norm": 9.419149398803711, + "learning_rate": 1.8717178651106916e-05, + "loss": 0.7381, + "step": 4383 + }, + { + "epoch": 0.75, + "grad_norm": 9.5823392868042, + "learning_rate": 1.871460442766432e-05, + "loss": 0.6735, + "step": 4384 + }, + { + "epoch": 0.75, + "grad_norm": 10.25571060180664, + "learning_rate": 1.871203020422173e-05, + "loss": 0.78, + "step": 4385 + }, + { + "epoch": 0.75, + "grad_norm": 10.075605392456055, + "learning_rate": 1.8709455980779132e-05, + "loss": 0.9736, + "step": 4386 + }, + { + "epoch": 0.75, + "grad_norm": 7.6416754722595215, + "learning_rate": 1.870688175733654e-05, + "loss": 0.6801, + "step": 4387 + }, + { + "epoch": 0.75, + "grad_norm": 9.600601196289062, + "learning_rate": 1.8704307533893942e-05, + "loss": 0.8792, + "step": 4388 + }, + { + "epoch": 0.75, + "grad_norm": 9.597159385681152, + "learning_rate": 1.870173331045135e-05, + "loss": 0.723, + "step": 4389 + }, + { + "epoch": 0.75, + "grad_norm": 11.653776168823242, + "learning_rate": 1.8699159087008752e-05, + "loss": 1.0082, + "step": 4390 + }, + { + "epoch": 0.75, + "grad_norm": 12.68677806854248, + "learning_rate": 1.869658486356616e-05, + "loss": 1.106, + "step": 4391 + }, + { + "epoch": 0.75, + "grad_norm": 8.043035507202148, + "learning_rate": 1.8694010640123562e-05, + "loss": 0.7793, + "step": 4392 + }, + { + "epoch": 0.75, + "grad_norm": 9.986228942871094, + "learning_rate": 1.8691436416680966e-05, + "loss": 0.8221, + "step": 4393 + }, + { + "epoch": 0.75, + "grad_norm": 10.062551498413086, + "learning_rate": 1.8688862193238376e-05, + "loss": 0.8659, + "step": 4394 + }, + { + "epoch": 0.75, + "grad_norm": 8.912751197814941, + "learning_rate": 1.868628796979578e-05, + "loss": 0.756, + "step": 4395 + }, + { + "epoch": 0.75, + "grad_norm": 11.395503044128418, + "learning_rate": 1.8683713746353186e-05, + "loss": 0.9355, + "step": 4396 + }, + { + "epoch": 0.75, + "grad_norm": 11.25494384765625, + "learning_rate": 1.868113952291059e-05, + "loss": 0.7836, + "step": 4397 + }, + { + "epoch": 0.75, + "grad_norm": 11.707947731018066, + "learning_rate": 1.8678565299467996e-05, + "loss": 0.9053, + "step": 4398 + }, + { + "epoch": 0.75, + "grad_norm": 11.001806259155273, + "learning_rate": 1.86759910760254e-05, + "loss": 0.7759, + "step": 4399 + }, + { + "epoch": 0.76, + "grad_norm": 9.141298294067383, + "learning_rate": 1.8673416852582806e-05, + "loss": 0.5919, + "step": 4400 + }, + { + "epoch": 0.76, + "grad_norm": 10.65458869934082, + "learning_rate": 1.867084262914021e-05, + "loss": 1.0048, + "step": 4401 + }, + { + "epoch": 0.76, + "grad_norm": 10.322049140930176, + "learning_rate": 1.8668268405697612e-05, + "loss": 0.8787, + "step": 4402 + }, + { + "epoch": 0.76, + "grad_norm": 10.761757850646973, + "learning_rate": 1.866569418225502e-05, + "loss": 1.0129, + "step": 4403 + }, + { + "epoch": 0.76, + "grad_norm": 9.715259552001953, + "learning_rate": 1.8663119958812426e-05, + "loss": 0.7009, + "step": 4404 + }, + { + "epoch": 0.76, + "grad_norm": 10.733769416809082, + "learning_rate": 1.8660545735369832e-05, + "loss": 0.9702, + "step": 4405 + }, + { + "epoch": 0.76, + "grad_norm": 9.306440353393555, + "learning_rate": 1.8657971511927236e-05, + "loss": 0.9039, + "step": 4406 + }, + { + "epoch": 0.76, + "grad_norm": 7.9626145362854, + "learning_rate": 1.8655397288484642e-05, + "loss": 0.7413, + "step": 4407 + }, + { + "epoch": 0.76, + "grad_norm": 7.926734924316406, + "learning_rate": 1.8652823065042046e-05, + "loss": 0.8488, + "step": 4408 + }, + { + "epoch": 0.76, + "grad_norm": 8.632416725158691, + "learning_rate": 1.8650248841599452e-05, + "loss": 0.6175, + "step": 4409 + }, + { + "epoch": 0.76, + "grad_norm": 9.221813201904297, + "learning_rate": 1.8647674618156855e-05, + "loss": 0.8773, + "step": 4410 + }, + { + "epoch": 0.76, + "grad_norm": 7.929887771606445, + "learning_rate": 1.8645100394714262e-05, + "loss": 0.638, + "step": 4411 + }, + { + "epoch": 0.76, + "grad_norm": 9.351088523864746, + "learning_rate": 1.8642526171271665e-05, + "loss": 0.756, + "step": 4412 + }, + { + "epoch": 0.76, + "grad_norm": 12.052444458007812, + "learning_rate": 1.8639951947829072e-05, + "loss": 1.0514, + "step": 4413 + }, + { + "epoch": 0.76, + "grad_norm": 8.6525297164917, + "learning_rate": 1.863737772438648e-05, + "loss": 0.7541, + "step": 4414 + }, + { + "epoch": 0.76, + "grad_norm": 11.138541221618652, + "learning_rate": 1.8634803500943882e-05, + "loss": 0.7851, + "step": 4415 + }, + { + "epoch": 0.76, + "grad_norm": 11.111345291137695, + "learning_rate": 1.863222927750129e-05, + "loss": 0.884, + "step": 4416 + }, + { + "epoch": 0.76, + "grad_norm": 11.87191104888916, + "learning_rate": 1.8629655054058692e-05, + "loss": 1.0476, + "step": 4417 + }, + { + "epoch": 0.76, + "grad_norm": 11.426712036132812, + "learning_rate": 1.86270808306161e-05, + "loss": 0.8644, + "step": 4418 + }, + { + "epoch": 0.76, + "grad_norm": 11.35315990447998, + "learning_rate": 1.8624506607173502e-05, + "loss": 0.9767, + "step": 4419 + }, + { + "epoch": 0.76, + "grad_norm": 10.673909187316895, + "learning_rate": 1.862193238373091e-05, + "loss": 0.9786, + "step": 4420 + }, + { + "epoch": 0.76, + "grad_norm": 10.761059761047363, + "learning_rate": 1.8619358160288312e-05, + "loss": 0.7912, + "step": 4421 + }, + { + "epoch": 0.76, + "grad_norm": 11.249832153320312, + "learning_rate": 1.861678393684572e-05, + "loss": 0.8706, + "step": 4422 + }, + { + "epoch": 0.76, + "grad_norm": 8.88196849822998, + "learning_rate": 1.8614209713403125e-05, + "loss": 0.7316, + "step": 4423 + }, + { + "epoch": 0.76, + "grad_norm": 9.350408554077148, + "learning_rate": 1.861163548996053e-05, + "loss": 0.7068, + "step": 4424 + }, + { + "epoch": 0.76, + "grad_norm": 9.889779090881348, + "learning_rate": 1.8609061266517935e-05, + "loss": 0.9263, + "step": 4425 + }, + { + "epoch": 0.76, + "grad_norm": 11.010964393615723, + "learning_rate": 1.860648704307534e-05, + "loss": 1.3115, + "step": 4426 + }, + { + "epoch": 0.76, + "grad_norm": 10.4732666015625, + "learning_rate": 1.8603912819632745e-05, + "loss": 0.7906, + "step": 4427 + }, + { + "epoch": 0.76, + "grad_norm": 9.549907684326172, + "learning_rate": 1.860133859619015e-05, + "loss": 0.781, + "step": 4428 + }, + { + "epoch": 0.76, + "grad_norm": 9.509839057922363, + "learning_rate": 1.8598764372747555e-05, + "loss": 0.647, + "step": 4429 + }, + { + "epoch": 0.76, + "grad_norm": 8.75, + "learning_rate": 1.859619014930496e-05, + "loss": 0.7365, + "step": 4430 + }, + { + "epoch": 0.76, + "grad_norm": 8.616283416748047, + "learning_rate": 1.8593615925862365e-05, + "loss": 0.9994, + "step": 4431 + }, + { + "epoch": 0.76, + "grad_norm": 10.0025634765625, + "learning_rate": 1.8591041702419772e-05, + "loss": 0.7246, + "step": 4432 + }, + { + "epoch": 0.76, + "grad_norm": 8.842690467834473, + "learning_rate": 1.8588467478977175e-05, + "loss": 0.8649, + "step": 4433 + }, + { + "epoch": 0.76, + "grad_norm": 8.467083930969238, + "learning_rate": 1.8585893255534582e-05, + "loss": 0.6283, + "step": 4434 + }, + { + "epoch": 0.76, + "grad_norm": 11.707691192626953, + "learning_rate": 1.8583319032091985e-05, + "loss": 0.8441, + "step": 4435 + }, + { + "epoch": 0.76, + "grad_norm": 11.234848976135254, + "learning_rate": 1.8580744808649392e-05, + "loss": 1.0229, + "step": 4436 + }, + { + "epoch": 0.76, + "grad_norm": 10.026167869567871, + "learning_rate": 1.8578170585206795e-05, + "loss": 0.95, + "step": 4437 + }, + { + "epoch": 0.76, + "grad_norm": 11.936605453491211, + "learning_rate": 1.8575596361764202e-05, + "loss": 0.9674, + "step": 4438 + }, + { + "epoch": 0.76, + "grad_norm": 8.654797554016113, + "learning_rate": 1.8573022138321605e-05, + "loss": 0.5985, + "step": 4439 + }, + { + "epoch": 0.76, + "grad_norm": 9.77736759185791, + "learning_rate": 1.8570447914879012e-05, + "loss": 0.8257, + "step": 4440 + }, + { + "epoch": 0.76, + "grad_norm": 10.64514446258545, + "learning_rate": 1.8567873691436415e-05, + "loss": 0.9767, + "step": 4441 + }, + { + "epoch": 0.76, + "grad_norm": 10.730393409729004, + "learning_rate": 1.8565299467993825e-05, + "loss": 0.8074, + "step": 4442 + }, + { + "epoch": 0.76, + "grad_norm": 8.7951078414917, + "learning_rate": 1.856272524455123e-05, + "loss": 0.5972, + "step": 4443 + }, + { + "epoch": 0.76, + "grad_norm": 9.156877517700195, + "learning_rate": 1.8560151021108632e-05, + "loss": 0.6072, + "step": 4444 + }, + { + "epoch": 0.76, + "grad_norm": 10.159260749816895, + "learning_rate": 1.855757679766604e-05, + "loss": 0.8721, + "step": 4445 + }, + { + "epoch": 0.76, + "grad_norm": 9.914385795593262, + "learning_rate": 1.8555002574223442e-05, + "loss": 0.8044, + "step": 4446 + }, + { + "epoch": 0.76, + "grad_norm": 9.912651062011719, + "learning_rate": 1.855242835078085e-05, + "loss": 0.7537, + "step": 4447 + }, + { + "epoch": 0.76, + "grad_norm": 10.371186256408691, + "learning_rate": 1.8549854127338252e-05, + "loss": 0.6967, + "step": 4448 + }, + { + "epoch": 0.76, + "grad_norm": 10.682121276855469, + "learning_rate": 1.854727990389566e-05, + "loss": 1.0528, + "step": 4449 + }, + { + "epoch": 0.76, + "grad_norm": 9.716703414916992, + "learning_rate": 1.8544705680453062e-05, + "loss": 0.8986, + "step": 4450 + }, + { + "epoch": 0.76, + "grad_norm": 9.13463020324707, + "learning_rate": 1.8542131457010472e-05, + "loss": 0.7043, + "step": 4451 + }, + { + "epoch": 0.76, + "grad_norm": 9.702315330505371, + "learning_rate": 1.8539557233567875e-05, + "loss": 0.8191, + "step": 4452 + }, + { + "epoch": 0.76, + "grad_norm": 8.548389434814453, + "learning_rate": 1.8536983010125282e-05, + "loss": 0.5203, + "step": 4453 + }, + { + "epoch": 0.76, + "grad_norm": 11.226734161376953, + "learning_rate": 1.8534408786682685e-05, + "loss": 0.6779, + "step": 4454 + }, + { + "epoch": 0.76, + "grad_norm": 11.5157470703125, + "learning_rate": 1.853183456324009e-05, + "loss": 0.695, + "step": 4455 + }, + { + "epoch": 0.76, + "grad_norm": 10.56718635559082, + "learning_rate": 1.8529260339797495e-05, + "loss": 0.7932, + "step": 4456 + }, + { + "epoch": 0.76, + "grad_norm": 11.645631790161133, + "learning_rate": 1.85266861163549e-05, + "loss": 0.8119, + "step": 4457 + }, + { + "epoch": 0.77, + "grad_norm": 11.036974906921387, + "learning_rate": 1.8524111892912305e-05, + "loss": 0.7608, + "step": 4458 + }, + { + "epoch": 0.77, + "grad_norm": 9.421669960021973, + "learning_rate": 1.852153766946971e-05, + "loss": 0.7267, + "step": 4459 + }, + { + "epoch": 0.77, + "grad_norm": 7.169737815856934, + "learning_rate": 1.8518963446027115e-05, + "loss": 0.5726, + "step": 4460 + }, + { + "epoch": 0.77, + "grad_norm": 9.590944290161133, + "learning_rate": 1.8516389222584522e-05, + "loss": 0.8535, + "step": 4461 + }, + { + "epoch": 0.77, + "grad_norm": 9.078580856323242, + "learning_rate": 1.851381499914193e-05, + "loss": 0.7788, + "step": 4462 + }, + { + "epoch": 0.77, + "grad_norm": 10.609288215637207, + "learning_rate": 1.851124077569933e-05, + "loss": 0.9815, + "step": 4463 + }, + { + "epoch": 0.77, + "grad_norm": 9.61540412902832, + "learning_rate": 1.8508666552256735e-05, + "loss": 0.817, + "step": 4464 + }, + { + "epoch": 0.77, + "grad_norm": 11.372817039489746, + "learning_rate": 1.850609232881414e-05, + "loss": 0.9686, + "step": 4465 + }, + { + "epoch": 0.77, + "grad_norm": 12.059974670410156, + "learning_rate": 1.8503518105371545e-05, + "loss": 0.6221, + "step": 4466 + }, + { + "epoch": 0.77, + "grad_norm": 9.431178092956543, + "learning_rate": 1.850094388192895e-05, + "loss": 0.624, + "step": 4467 + }, + { + "epoch": 0.77, + "grad_norm": 12.192831039428711, + "learning_rate": 1.8498369658486355e-05, + "loss": 0.7813, + "step": 4468 + }, + { + "epoch": 0.77, + "grad_norm": 11.22218132019043, + "learning_rate": 1.849579543504376e-05, + "loss": 0.9372, + "step": 4469 + }, + { + "epoch": 0.77, + "grad_norm": 11.294771194458008, + "learning_rate": 1.8493221211601168e-05, + "loss": 0.6635, + "step": 4470 + }, + { + "epoch": 0.77, + "grad_norm": 12.357868194580078, + "learning_rate": 1.8490646988158575e-05, + "loss": 0.8066, + "step": 4471 + }, + { + "epoch": 0.77, + "grad_norm": 11.630175590515137, + "learning_rate": 1.8488072764715978e-05, + "loss": 0.8054, + "step": 4472 + }, + { + "epoch": 0.77, + "grad_norm": 11.369306564331055, + "learning_rate": 1.8485498541273385e-05, + "loss": 0.9648, + "step": 4473 + }, + { + "epoch": 0.77, + "grad_norm": 12.858946800231934, + "learning_rate": 1.8482924317830788e-05, + "loss": 0.9286, + "step": 4474 + }, + { + "epoch": 0.77, + "grad_norm": 11.09499740600586, + "learning_rate": 1.848035009438819e-05, + "loss": 0.7685, + "step": 4475 + }, + { + "epoch": 0.77, + "grad_norm": 9.864041328430176, + "learning_rate": 1.8477775870945598e-05, + "loss": 0.834, + "step": 4476 + }, + { + "epoch": 0.77, + "grad_norm": 10.916614532470703, + "learning_rate": 1.8475201647503e-05, + "loss": 0.7069, + "step": 4477 + }, + { + "epoch": 0.77, + "grad_norm": 10.010899543762207, + "learning_rate": 1.8472627424060408e-05, + "loss": 0.8302, + "step": 4478 + }, + { + "epoch": 0.77, + "grad_norm": 11.282487869262695, + "learning_rate": 1.847005320061781e-05, + "loss": 0.9146, + "step": 4479 + }, + { + "epoch": 0.77, + "grad_norm": 10.03085994720459, + "learning_rate": 1.846747897717522e-05, + "loss": 0.8117, + "step": 4480 + }, + { + "epoch": 0.77, + "grad_norm": 8.702635765075684, + "learning_rate": 1.8464904753732625e-05, + "loss": 0.5839, + "step": 4481 + }, + { + "epoch": 0.77, + "grad_norm": 10.291382789611816, + "learning_rate": 1.846233053029003e-05, + "loss": 0.7454, + "step": 4482 + }, + { + "epoch": 0.77, + "grad_norm": 9.000566482543945, + "learning_rate": 1.8459756306847435e-05, + "loss": 0.879, + "step": 4483 + }, + { + "epoch": 0.77, + "grad_norm": 10.346214294433594, + "learning_rate": 1.845718208340484e-05, + "loss": 1.0057, + "step": 4484 + }, + { + "epoch": 0.77, + "grad_norm": 9.30116081237793, + "learning_rate": 1.8454607859962245e-05, + "loss": 0.5507, + "step": 4485 + }, + { + "epoch": 0.77, + "grad_norm": 10.069503784179688, + "learning_rate": 1.8452033636519648e-05, + "loss": 0.6537, + "step": 4486 + }, + { + "epoch": 0.77, + "grad_norm": 8.520319938659668, + "learning_rate": 1.8449459413077055e-05, + "loss": 0.6122, + "step": 4487 + }, + { + "epoch": 0.77, + "grad_norm": 9.078132629394531, + "learning_rate": 1.8446885189634458e-05, + "loss": 0.6766, + "step": 4488 + }, + { + "epoch": 0.77, + "grad_norm": 9.916189193725586, + "learning_rate": 1.8444310966191868e-05, + "loss": 0.8757, + "step": 4489 + }, + { + "epoch": 0.77, + "grad_norm": 9.934427261352539, + "learning_rate": 1.844173674274927e-05, + "loss": 0.7369, + "step": 4490 + }, + { + "epoch": 0.77, + "grad_norm": 11.735384941101074, + "learning_rate": 1.8439162519306678e-05, + "loss": 0.9513, + "step": 4491 + }, + { + "epoch": 0.77, + "grad_norm": 7.546665191650391, + "learning_rate": 1.843658829586408e-05, + "loss": 0.514, + "step": 4492 + }, + { + "epoch": 0.77, + "grad_norm": 9.621626853942871, + "learning_rate": 1.8434014072421488e-05, + "loss": 0.8021, + "step": 4493 + }, + { + "epoch": 0.77, + "grad_norm": 10.1311616897583, + "learning_rate": 1.843143984897889e-05, + "loss": 0.8665, + "step": 4494 + }, + { + "epoch": 0.77, + "grad_norm": 9.149614334106445, + "learning_rate": 1.8428865625536298e-05, + "loss": 0.8601, + "step": 4495 + }, + { + "epoch": 0.77, + "grad_norm": 9.288810729980469, + "learning_rate": 1.84262914020937e-05, + "loss": 0.5983, + "step": 4496 + }, + { + "epoch": 0.77, + "grad_norm": 11.577433586120605, + "learning_rate": 1.8423717178651105e-05, + "loss": 0.9949, + "step": 4497 + }, + { + "epoch": 0.77, + "grad_norm": 11.912052154541016, + "learning_rate": 1.8421142955208515e-05, + "loss": 0.8605, + "step": 4498 + }, + { + "epoch": 0.77, + "grad_norm": 10.540184020996094, + "learning_rate": 1.8418568731765918e-05, + "loss": 0.6795, + "step": 4499 + }, + { + "epoch": 0.77, + "grad_norm": 9.33013916015625, + "learning_rate": 1.8415994508323325e-05, + "loss": 0.741, + "step": 4500 + }, + { + "epoch": 0.77, + "grad_norm": 11.999069213867188, + "learning_rate": 1.8413420284880728e-05, + "loss": 0.7492, + "step": 4501 + }, + { + "epoch": 0.77, + "grad_norm": 10.603501319885254, + "learning_rate": 1.8410846061438135e-05, + "loss": 0.6832, + "step": 4502 + }, + { + "epoch": 0.77, + "grad_norm": 12.333259582519531, + "learning_rate": 1.8408271837995538e-05, + "loss": 0.7259, + "step": 4503 + }, + { + "epoch": 0.77, + "grad_norm": 10.627447128295898, + "learning_rate": 1.8405697614552945e-05, + "loss": 0.8405, + "step": 4504 + }, + { + "epoch": 0.77, + "grad_norm": 11.332725524902344, + "learning_rate": 1.8403123391110348e-05, + "loss": 1.0176, + "step": 4505 + }, + { + "epoch": 0.77, + "grad_norm": 8.903879165649414, + "learning_rate": 1.840054916766775e-05, + "loss": 0.7921, + "step": 4506 + }, + { + "epoch": 0.77, + "grad_norm": 12.524657249450684, + "learning_rate": 1.8397974944225158e-05, + "loss": 0.833, + "step": 4507 + }, + { + "epoch": 0.77, + "grad_norm": 12.497197151184082, + "learning_rate": 1.8395400720782565e-05, + "loss": 0.9854, + "step": 4508 + }, + { + "epoch": 0.77, + "grad_norm": 11.108022689819336, + "learning_rate": 1.839282649733997e-05, + "loss": 0.8133, + "step": 4509 + }, + { + "epoch": 0.77, + "grad_norm": 11.229659080505371, + "learning_rate": 1.8390252273897375e-05, + "loss": 0.8802, + "step": 4510 + }, + { + "epoch": 0.77, + "grad_norm": 10.211379051208496, + "learning_rate": 1.838767805045478e-05, + "loss": 0.8888, + "step": 4511 + }, + { + "epoch": 0.77, + "grad_norm": 11.35076904296875, + "learning_rate": 1.8385103827012185e-05, + "loss": 0.9631, + "step": 4512 + }, + { + "epoch": 0.77, + "grad_norm": 12.228515625, + "learning_rate": 1.838252960356959e-05, + "loss": 0.8236, + "step": 4513 + }, + { + "epoch": 0.77, + "grad_norm": 10.220845222473145, + "learning_rate": 1.8379955380126995e-05, + "loss": 0.9212, + "step": 4514 + }, + { + "epoch": 0.77, + "grad_norm": 11.140191078186035, + "learning_rate": 1.83773811566844e-05, + "loss": 0.8759, + "step": 4515 + }, + { + "epoch": 0.78, + "grad_norm": 11.232515335083008, + "learning_rate": 1.8374806933241804e-05, + "loss": 0.9269, + "step": 4516 + }, + { + "epoch": 0.78, + "grad_norm": 8.290019989013672, + "learning_rate": 1.837223270979921e-05, + "loss": 0.575, + "step": 4517 + }, + { + "epoch": 0.78, + "grad_norm": 11.017510414123535, + "learning_rate": 1.8369658486356618e-05, + "loss": 0.9091, + "step": 4518 + }, + { + "epoch": 0.78, + "grad_norm": 12.72248649597168, + "learning_rate": 1.836708426291402e-05, + "loss": 1.0444, + "step": 4519 + }, + { + "epoch": 0.78, + "grad_norm": 8.022782325744629, + "learning_rate": 1.8364510039471428e-05, + "loss": 0.7715, + "step": 4520 + }, + { + "epoch": 0.78, + "grad_norm": 11.991741180419922, + "learning_rate": 1.836193581602883e-05, + "loss": 0.8239, + "step": 4521 + }, + { + "epoch": 0.78, + "grad_norm": 10.724997520446777, + "learning_rate": 1.8359361592586238e-05, + "loss": 0.7705, + "step": 4522 + }, + { + "epoch": 0.78, + "grad_norm": 8.342066764831543, + "learning_rate": 1.835678736914364e-05, + "loss": 0.7131, + "step": 4523 + }, + { + "epoch": 0.78, + "grad_norm": 7.394556522369385, + "learning_rate": 1.8354213145701048e-05, + "loss": 0.4355, + "step": 4524 + }, + { + "epoch": 0.78, + "grad_norm": 11.0487699508667, + "learning_rate": 1.835163892225845e-05, + "loss": 0.6849, + "step": 4525 + }, + { + "epoch": 0.78, + "grad_norm": 10.3297758102417, + "learning_rate": 1.8349064698815858e-05, + "loss": 0.8275, + "step": 4526 + }, + { + "epoch": 0.78, + "grad_norm": 9.003467559814453, + "learning_rate": 1.8346490475373264e-05, + "loss": 0.7469, + "step": 4527 + }, + { + "epoch": 0.78, + "grad_norm": 9.577159881591797, + "learning_rate": 1.8343916251930668e-05, + "loss": 0.7427, + "step": 4528 + }, + { + "epoch": 0.78, + "grad_norm": 11.81598949432373, + "learning_rate": 1.8341342028488074e-05, + "loss": 0.9844, + "step": 4529 + }, + { + "epoch": 0.78, + "grad_norm": 10.05923843383789, + "learning_rate": 1.8338767805045478e-05, + "loss": 0.964, + "step": 4530 + }, + { + "epoch": 0.78, + "grad_norm": 8.914913177490234, + "learning_rate": 1.8336193581602884e-05, + "loss": 0.7655, + "step": 4531 + }, + { + "epoch": 0.78, + "grad_norm": 9.596419334411621, + "learning_rate": 1.8333619358160288e-05, + "loss": 0.8081, + "step": 4532 + }, + { + "epoch": 0.78, + "grad_norm": 11.512883186340332, + "learning_rate": 1.8331045134717694e-05, + "loss": 0.7835, + "step": 4533 + }, + { + "epoch": 0.78, + "grad_norm": 11.740601539611816, + "learning_rate": 1.8328470911275098e-05, + "loss": 1.0698, + "step": 4534 + }, + { + "epoch": 0.78, + "grad_norm": 7.815478801727295, + "learning_rate": 1.8325896687832504e-05, + "loss": 0.6298, + "step": 4535 + }, + { + "epoch": 0.78, + "grad_norm": 11.781991004943848, + "learning_rate": 1.832332246438991e-05, + "loss": 0.8833, + "step": 4536 + }, + { + "epoch": 0.78, + "grad_norm": 7.7704691886901855, + "learning_rate": 1.8320748240947314e-05, + "loss": 0.6173, + "step": 4537 + }, + { + "epoch": 0.78, + "grad_norm": 9.388161659240723, + "learning_rate": 1.831817401750472e-05, + "loss": 0.6387, + "step": 4538 + }, + { + "epoch": 0.78, + "grad_norm": 10.602012634277344, + "learning_rate": 1.8315599794062124e-05, + "loss": 0.8828, + "step": 4539 + }, + { + "epoch": 0.78, + "grad_norm": 10.607589721679688, + "learning_rate": 1.831302557061953e-05, + "loss": 0.7411, + "step": 4540 + }, + { + "epoch": 0.78, + "grad_norm": 9.596658706665039, + "learning_rate": 1.8310451347176934e-05, + "loss": 0.665, + "step": 4541 + }, + { + "epoch": 0.78, + "grad_norm": 9.29475212097168, + "learning_rate": 1.830787712373434e-05, + "loss": 0.9465, + "step": 4542 + }, + { + "epoch": 0.78, + "grad_norm": 8.66930103302002, + "learning_rate": 1.8305302900291744e-05, + "loss": 0.6908, + "step": 4543 + }, + { + "epoch": 0.78, + "grad_norm": 8.661069869995117, + "learning_rate": 1.830272867684915e-05, + "loss": 0.5897, + "step": 4544 + }, + { + "epoch": 0.78, + "grad_norm": 8.405319213867188, + "learning_rate": 1.8300154453406554e-05, + "loss": 0.5914, + "step": 4545 + }, + { + "epoch": 0.78, + "grad_norm": 11.724068641662598, + "learning_rate": 1.8297580229963964e-05, + "loss": 0.7239, + "step": 4546 + }, + { + "epoch": 0.78, + "grad_norm": 11.002670288085938, + "learning_rate": 1.8295006006521368e-05, + "loss": 0.7828, + "step": 4547 + }, + { + "epoch": 0.78, + "grad_norm": 10.000532150268555, + "learning_rate": 1.829243178307877e-05, + "loss": 0.7399, + "step": 4548 + }, + { + "epoch": 0.78, + "grad_norm": 8.797389030456543, + "learning_rate": 1.8289857559636178e-05, + "loss": 0.8916, + "step": 4549 + }, + { + "epoch": 0.78, + "grad_norm": 10.00051212310791, + "learning_rate": 1.828728333619358e-05, + "loss": 0.6231, + "step": 4550 + }, + { + "epoch": 0.78, + "grad_norm": 10.578989028930664, + "learning_rate": 1.8284709112750988e-05, + "loss": 0.7963, + "step": 4551 + }, + { + "epoch": 0.78, + "grad_norm": 11.214009284973145, + "learning_rate": 1.828213488930839e-05, + "loss": 1.0731, + "step": 4552 + }, + { + "epoch": 0.78, + "grad_norm": 10.950447082519531, + "learning_rate": 1.8279560665865797e-05, + "loss": 0.8213, + "step": 4553 + }, + { + "epoch": 0.78, + "grad_norm": 13.635833740234375, + "learning_rate": 1.82769864424232e-05, + "loss": 0.9224, + "step": 4554 + }, + { + "epoch": 0.78, + "grad_norm": 11.554736137390137, + "learning_rate": 1.827441221898061e-05, + "loss": 0.7679, + "step": 4555 + }, + { + "epoch": 0.78, + "grad_norm": 9.232850074768066, + "learning_rate": 1.8271837995538014e-05, + "loss": 0.6687, + "step": 4556 + }, + { + "epoch": 0.78, + "grad_norm": 12.844307899475098, + "learning_rate": 1.826926377209542e-05, + "loss": 0.9273, + "step": 4557 + }, + { + "epoch": 0.78, + "grad_norm": 8.278690338134766, + "learning_rate": 1.8266689548652824e-05, + "loss": 0.7023, + "step": 4558 + }, + { + "epoch": 0.78, + "grad_norm": 9.582830429077148, + "learning_rate": 1.8264115325210227e-05, + "loss": 0.8023, + "step": 4559 + }, + { + "epoch": 0.78, + "grad_norm": 9.576903343200684, + "learning_rate": 1.8261541101767634e-05, + "loss": 0.7627, + "step": 4560 + }, + { + "epoch": 0.78, + "grad_norm": 14.200854301452637, + "learning_rate": 1.8258966878325037e-05, + "loss": 0.9509, + "step": 4561 + }, + { + "epoch": 0.78, + "grad_norm": 11.573077201843262, + "learning_rate": 1.8256392654882444e-05, + "loss": 0.911, + "step": 4562 + }, + { + "epoch": 0.78, + "grad_norm": 9.515393257141113, + "learning_rate": 1.8253818431439847e-05, + "loss": 0.8064, + "step": 4563 + }, + { + "epoch": 0.78, + "grad_norm": 12.103652000427246, + "learning_rate": 1.8251244207997254e-05, + "loss": 1.0753, + "step": 4564 + }, + { + "epoch": 0.78, + "grad_norm": 9.32536506652832, + "learning_rate": 1.824866998455466e-05, + "loss": 0.6403, + "step": 4565 + }, + { + "epoch": 0.78, + "grad_norm": 9.977213859558105, + "learning_rate": 1.8246095761112067e-05, + "loss": 0.7228, + "step": 4566 + }, + { + "epoch": 0.78, + "grad_norm": 12.798955917358398, + "learning_rate": 1.824352153766947e-05, + "loss": 0.8165, + "step": 4567 + }, + { + "epoch": 0.78, + "grad_norm": 8.52734375, + "learning_rate": 1.8240947314226874e-05, + "loss": 0.6456, + "step": 4568 + }, + { + "epoch": 0.78, + "grad_norm": 8.67576789855957, + "learning_rate": 1.823837309078428e-05, + "loss": 0.6377, + "step": 4569 + }, + { + "epoch": 0.78, + "grad_norm": 9.465245246887207, + "learning_rate": 1.8235798867341684e-05, + "loss": 0.7506, + "step": 4570 + }, + { + "epoch": 0.78, + "grad_norm": 13.725875854492188, + "learning_rate": 1.823322464389909e-05, + "loss": 0.9004, + "step": 4571 + }, + { + "epoch": 0.78, + "grad_norm": 7.636159896850586, + "learning_rate": 1.8230650420456494e-05, + "loss": 0.6241, + "step": 4572 + }, + { + "epoch": 0.78, + "grad_norm": 9.581345558166504, + "learning_rate": 1.82280761970139e-05, + "loss": 0.8353, + "step": 4573 + }, + { + "epoch": 0.78, + "grad_norm": 11.311539649963379, + "learning_rate": 1.8225501973571307e-05, + "loss": 0.9389, + "step": 4574 + }, + { + "epoch": 0.79, + "grad_norm": 9.504876136779785, + "learning_rate": 1.8222927750128714e-05, + "loss": 0.7312, + "step": 4575 + }, + { + "epoch": 0.79, + "grad_norm": 9.0369234085083, + "learning_rate": 1.8220353526686117e-05, + "loss": 0.8633, + "step": 4576 + }, + { + "epoch": 0.79, + "grad_norm": 10.956113815307617, + "learning_rate": 1.8217779303243524e-05, + "loss": 0.731, + "step": 4577 + }, + { + "epoch": 0.79, + "grad_norm": 9.403385162353516, + "learning_rate": 1.8215205079800927e-05, + "loss": 0.7374, + "step": 4578 + }, + { + "epoch": 0.79, + "grad_norm": 7.958566188812256, + "learning_rate": 1.821263085635833e-05, + "loss": 0.3727, + "step": 4579 + }, + { + "epoch": 0.79, + "grad_norm": 9.500529289245605, + "learning_rate": 1.8210056632915737e-05, + "loss": 1.0132, + "step": 4580 + }, + { + "epoch": 0.79, + "grad_norm": 9.410884857177734, + "learning_rate": 1.820748240947314e-05, + "loss": 0.7981, + "step": 4581 + }, + { + "epoch": 0.79, + "grad_norm": 8.440316200256348, + "learning_rate": 1.8204908186030547e-05, + "loss": 0.7446, + "step": 4582 + }, + { + "epoch": 0.79, + "grad_norm": 12.056214332580566, + "learning_rate": 1.820233396258795e-05, + "loss": 0.7824, + "step": 4583 + }, + { + "epoch": 0.79, + "grad_norm": 10.531798362731934, + "learning_rate": 1.819975973914536e-05, + "loss": 0.7659, + "step": 4584 + }, + { + "epoch": 0.79, + "grad_norm": 11.205221176147461, + "learning_rate": 1.8197185515702764e-05, + "loss": 0.8347, + "step": 4585 + }, + { + "epoch": 0.79, + "grad_norm": 9.71817684173584, + "learning_rate": 1.819461129226017e-05, + "loss": 0.7105, + "step": 4586 + }, + { + "epoch": 0.79, + "grad_norm": 10.199410438537598, + "learning_rate": 1.8192037068817574e-05, + "loss": 0.8129, + "step": 4587 + }, + { + "epoch": 0.79, + "grad_norm": 10.65517520904541, + "learning_rate": 1.818946284537498e-05, + "loss": 0.7054, + "step": 4588 + }, + { + "epoch": 0.79, + "grad_norm": 10.244235038757324, + "learning_rate": 1.8186888621932384e-05, + "loss": 0.8342, + "step": 4589 + }, + { + "epoch": 0.79, + "grad_norm": 8.863982200622559, + "learning_rate": 1.8184314398489787e-05, + "loss": 0.6068, + "step": 4590 + }, + { + "epoch": 0.79, + "grad_norm": 12.03433895111084, + "learning_rate": 1.8181740175047194e-05, + "loss": 0.8217, + "step": 4591 + }, + { + "epoch": 0.79, + "grad_norm": 10.143538475036621, + "learning_rate": 1.8179165951604597e-05, + "loss": 0.7609, + "step": 4592 + }, + { + "epoch": 0.79, + "grad_norm": 9.755047798156738, + "learning_rate": 1.8176591728162007e-05, + "loss": 0.6745, + "step": 4593 + }, + { + "epoch": 0.79, + "grad_norm": 12.403046607971191, + "learning_rate": 1.817401750471941e-05, + "loss": 0.8431, + "step": 4594 + }, + { + "epoch": 0.79, + "grad_norm": 10.723485946655273, + "learning_rate": 1.8171443281276817e-05, + "loss": 0.7758, + "step": 4595 + }, + { + "epoch": 0.79, + "grad_norm": 12.263933181762695, + "learning_rate": 1.816886905783422e-05, + "loss": 0.8909, + "step": 4596 + }, + { + "epoch": 0.79, + "grad_norm": 12.518206596374512, + "learning_rate": 1.8166294834391627e-05, + "loss": 0.8204, + "step": 4597 + }, + { + "epoch": 0.79, + "grad_norm": 11.208568572998047, + "learning_rate": 1.816372061094903e-05, + "loss": 0.9405, + "step": 4598 + }, + { + "epoch": 0.79, + "grad_norm": 12.45848274230957, + "learning_rate": 1.8161146387506437e-05, + "loss": 1.0782, + "step": 4599 + }, + { + "epoch": 0.79, + "grad_norm": 12.41175365447998, + "learning_rate": 1.815857216406384e-05, + "loss": 0.9453, + "step": 4600 + }, + { + "epoch": 0.79, + "grad_norm": 7.178896903991699, + "learning_rate": 1.8155997940621244e-05, + "loss": 0.4872, + "step": 4601 + }, + { + "epoch": 0.79, + "grad_norm": 10.46392822265625, + "learning_rate": 1.815342371717865e-05, + "loss": 0.8833, + "step": 4602 + }, + { + "epoch": 0.79, + "grad_norm": 9.135536193847656, + "learning_rate": 1.8150849493736057e-05, + "loss": 0.6072, + "step": 4603 + }, + { + "epoch": 0.79, + "grad_norm": 9.613280296325684, + "learning_rate": 1.8148275270293464e-05, + "loss": 0.6517, + "step": 4604 + }, + { + "epoch": 0.79, + "grad_norm": 13.646885871887207, + "learning_rate": 1.8145701046850867e-05, + "loss": 0.8452, + "step": 4605 + }, + { + "epoch": 0.79, + "grad_norm": 10.22172737121582, + "learning_rate": 1.8143126823408274e-05, + "loss": 0.8655, + "step": 4606 + }, + { + "epoch": 0.79, + "grad_norm": 9.80384349822998, + "learning_rate": 1.8140552599965677e-05, + "loss": 0.7573, + "step": 4607 + }, + { + "epoch": 0.79, + "grad_norm": 9.838275909423828, + "learning_rate": 1.8137978376523084e-05, + "loss": 0.5721, + "step": 4608 + }, + { + "epoch": 0.79, + "grad_norm": 8.503539085388184, + "learning_rate": 1.8135404153080487e-05, + "loss": 0.6233, + "step": 4609 + }, + { + "epoch": 0.79, + "grad_norm": 8.924944877624512, + "learning_rate": 1.813282992963789e-05, + "loss": 0.6523, + "step": 4610 + }, + { + "epoch": 0.79, + "grad_norm": 11.088214874267578, + "learning_rate": 1.8130255706195297e-05, + "loss": 0.8185, + "step": 4611 + }, + { + "epoch": 0.79, + "grad_norm": 10.18722152709961, + "learning_rate": 1.8127681482752704e-05, + "loss": 0.7974, + "step": 4612 + }, + { + "epoch": 0.79, + "grad_norm": 10.173393249511719, + "learning_rate": 1.812510725931011e-05, + "loss": 0.7328, + "step": 4613 + }, + { + "epoch": 0.79, + "grad_norm": 9.351269721984863, + "learning_rate": 1.8122533035867514e-05, + "loss": 0.6004, + "step": 4614 + }, + { + "epoch": 0.79, + "grad_norm": 10.80124568939209, + "learning_rate": 1.811995881242492e-05, + "loss": 0.812, + "step": 4615 + }, + { + "epoch": 0.79, + "grad_norm": 10.095589637756348, + "learning_rate": 1.8117384588982324e-05, + "loss": 0.8113, + "step": 4616 + }, + { + "epoch": 0.79, + "grad_norm": 11.35456371307373, + "learning_rate": 1.811481036553973e-05, + "loss": 0.9161, + "step": 4617 + }, + { + "epoch": 0.79, + "grad_norm": 10.764464378356934, + "learning_rate": 1.8112236142097134e-05, + "loss": 0.8103, + "step": 4618 + }, + { + "epoch": 0.79, + "grad_norm": 9.464253425598145, + "learning_rate": 1.810966191865454e-05, + "loss": 0.903, + "step": 4619 + }, + { + "epoch": 0.79, + "grad_norm": 9.548282623291016, + "learning_rate": 1.8107087695211943e-05, + "loss": 0.773, + "step": 4620 + }, + { + "epoch": 0.79, + "grad_norm": 7.999635696411133, + "learning_rate": 1.810451347176935e-05, + "loss": 0.4953, + "step": 4621 + }, + { + "epoch": 0.79, + "grad_norm": 11.703523635864258, + "learning_rate": 1.8101939248326757e-05, + "loss": 0.9012, + "step": 4622 + }, + { + "epoch": 0.79, + "grad_norm": 9.518583297729492, + "learning_rate": 1.809936502488416e-05, + "loss": 0.9153, + "step": 4623 + }, + { + "epoch": 0.79, + "grad_norm": 12.757223129272461, + "learning_rate": 1.8096790801441567e-05, + "loss": 0.8886, + "step": 4624 + }, + { + "epoch": 0.79, + "grad_norm": 11.543490409851074, + "learning_rate": 1.809421657799897e-05, + "loss": 0.8808, + "step": 4625 + }, + { + "epoch": 0.79, + "grad_norm": 7.924624919891357, + "learning_rate": 1.8091642354556377e-05, + "loss": 0.6956, + "step": 4626 + }, + { + "epoch": 0.79, + "grad_norm": 9.829492568969727, + "learning_rate": 1.808906813111378e-05, + "loss": 0.7511, + "step": 4627 + }, + { + "epoch": 0.79, + "grad_norm": 9.045244216918945, + "learning_rate": 1.8086493907671187e-05, + "loss": 0.8252, + "step": 4628 + }, + { + "epoch": 0.79, + "grad_norm": 11.998333930969238, + "learning_rate": 1.808391968422859e-05, + "loss": 0.8851, + "step": 4629 + }, + { + "epoch": 0.79, + "grad_norm": 12.977302551269531, + "learning_rate": 1.8081345460785997e-05, + "loss": 1.103, + "step": 4630 + }, + { + "epoch": 0.79, + "grad_norm": 9.581474304199219, + "learning_rate": 1.8078771237343403e-05, + "loss": 0.6281, + "step": 4631 + }, + { + "epoch": 0.79, + "grad_norm": 10.6852445602417, + "learning_rate": 1.8076197013900807e-05, + "loss": 0.8449, + "step": 4632 + }, + { + "epoch": 0.8, + "grad_norm": 10.24959659576416, + "learning_rate": 1.8073622790458213e-05, + "loss": 0.7105, + "step": 4633 + }, + { + "epoch": 0.8, + "grad_norm": 10.15105152130127, + "learning_rate": 1.8071048567015617e-05, + "loss": 0.7902, + "step": 4634 + }, + { + "epoch": 0.8, + "grad_norm": 9.813551902770996, + "learning_rate": 1.8068474343573023e-05, + "loss": 0.5855, + "step": 4635 + }, + { + "epoch": 0.8, + "grad_norm": 10.484671592712402, + "learning_rate": 1.8065900120130427e-05, + "loss": 0.7479, + "step": 4636 + }, + { + "epoch": 0.8, + "grad_norm": 10.329991340637207, + "learning_rate": 1.8063325896687833e-05, + "loss": 0.8843, + "step": 4637 + }, + { + "epoch": 0.8, + "grad_norm": 8.500802993774414, + "learning_rate": 1.8060751673245237e-05, + "loss": 0.721, + "step": 4638 + }, + { + "epoch": 0.8, + "grad_norm": 10.564055442810059, + "learning_rate": 1.8058177449802643e-05, + "loss": 0.6692, + "step": 4639 + }, + { + "epoch": 0.8, + "grad_norm": 9.584227561950684, + "learning_rate": 1.805560322636005e-05, + "loss": 0.6948, + "step": 4640 + }, + { + "epoch": 0.8, + "grad_norm": 9.984607696533203, + "learning_rate": 1.8053029002917453e-05, + "loss": 0.9441, + "step": 4641 + }, + { + "epoch": 0.8, + "grad_norm": 10.908760070800781, + "learning_rate": 1.805045477947486e-05, + "loss": 0.8061, + "step": 4642 + }, + { + "epoch": 0.8, + "grad_norm": 11.603954315185547, + "learning_rate": 1.8047880556032263e-05, + "loss": 0.8863, + "step": 4643 + }, + { + "epoch": 0.8, + "grad_norm": 10.7311429977417, + "learning_rate": 1.804530633258967e-05, + "loss": 0.8869, + "step": 4644 + }, + { + "epoch": 0.8, + "grad_norm": 9.57198715209961, + "learning_rate": 1.8042732109147073e-05, + "loss": 0.5754, + "step": 4645 + }, + { + "epoch": 0.8, + "grad_norm": 10.147098541259766, + "learning_rate": 1.804015788570448e-05, + "loss": 0.6755, + "step": 4646 + }, + { + "epoch": 0.8, + "grad_norm": 10.718560218811035, + "learning_rate": 1.8037583662261883e-05, + "loss": 0.575, + "step": 4647 + }, + { + "epoch": 0.8, + "grad_norm": 9.119189262390137, + "learning_rate": 1.803500943881929e-05, + "loss": 0.5091, + "step": 4648 + }, + { + "epoch": 0.8, + "grad_norm": 13.507854461669922, + "learning_rate": 1.8032435215376693e-05, + "loss": 0.9684, + "step": 4649 + }, + { + "epoch": 0.8, + "grad_norm": 10.607927322387695, + "learning_rate": 1.8029860991934103e-05, + "loss": 0.7739, + "step": 4650 + }, + { + "epoch": 0.8, + "grad_norm": 12.133891105651855, + "learning_rate": 1.8027286768491507e-05, + "loss": 0.8511, + "step": 4651 + }, + { + "epoch": 0.8, + "grad_norm": 9.290109634399414, + "learning_rate": 1.802471254504891e-05, + "loss": 0.7475, + "step": 4652 + }, + { + "epoch": 0.8, + "grad_norm": 9.49523639678955, + "learning_rate": 1.8022138321606317e-05, + "loss": 0.7153, + "step": 4653 + }, + { + "epoch": 0.8, + "grad_norm": 7.8330254554748535, + "learning_rate": 1.801956409816372e-05, + "loss": 0.6563, + "step": 4654 + }, + { + "epoch": 0.8, + "grad_norm": 8.321264266967773, + "learning_rate": 1.8016989874721127e-05, + "loss": 0.6686, + "step": 4655 + }, + { + "epoch": 0.8, + "grad_norm": 9.530916213989258, + "learning_rate": 1.801441565127853e-05, + "loss": 0.6511, + "step": 4656 + }, + { + "epoch": 0.8, + "grad_norm": 10.36806869506836, + "learning_rate": 1.8011841427835936e-05, + "loss": 0.8904, + "step": 4657 + }, + { + "epoch": 0.8, + "grad_norm": 11.371545791625977, + "learning_rate": 1.800926720439334e-05, + "loss": 0.7104, + "step": 4658 + }, + { + "epoch": 0.8, + "grad_norm": 10.19075870513916, + "learning_rate": 1.800669298095075e-05, + "loss": 0.6715, + "step": 4659 + }, + { + "epoch": 0.8, + "grad_norm": 12.404905319213867, + "learning_rate": 1.8004118757508153e-05, + "loss": 0.8377, + "step": 4660 + }, + { + "epoch": 0.8, + "grad_norm": 7.673076629638672, + "learning_rate": 1.800154453406556e-05, + "loss": 0.5113, + "step": 4661 + }, + { + "epoch": 0.8, + "grad_norm": 11.194426536560059, + "learning_rate": 1.7998970310622963e-05, + "loss": 0.8991, + "step": 4662 + }, + { + "epoch": 0.8, + "grad_norm": 10.455160140991211, + "learning_rate": 1.7996396087180366e-05, + "loss": 1.0607, + "step": 4663 + }, + { + "epoch": 0.8, + "grad_norm": 8.950098037719727, + "learning_rate": 1.7993821863737773e-05, + "loss": 0.7847, + "step": 4664 + }, + { + "epoch": 0.8, + "grad_norm": 8.758997917175293, + "learning_rate": 1.7991247640295176e-05, + "loss": 0.7718, + "step": 4665 + }, + { + "epoch": 0.8, + "grad_norm": 8.240198135375977, + "learning_rate": 1.7988673416852583e-05, + "loss": 0.7921, + "step": 4666 + }, + { + "epoch": 0.8, + "grad_norm": 11.519044876098633, + "learning_rate": 1.7986099193409986e-05, + "loss": 0.8884, + "step": 4667 + }, + { + "epoch": 0.8, + "grad_norm": 11.16071891784668, + "learning_rate": 1.7983524969967393e-05, + "loss": 0.6255, + "step": 4668 + }, + { + "epoch": 0.8, + "grad_norm": 9.046331405639648, + "learning_rate": 1.79809507465248e-05, + "loss": 0.7597, + "step": 4669 + }, + { + "epoch": 0.8, + "grad_norm": 9.17223072052002, + "learning_rate": 1.7978376523082206e-05, + "loss": 0.6832, + "step": 4670 + }, + { + "epoch": 0.8, + "grad_norm": 7.298058032989502, + "learning_rate": 1.797580229963961e-05, + "loss": 0.5662, + "step": 4671 + }, + { + "epoch": 0.8, + "grad_norm": 10.127120018005371, + "learning_rate": 1.7973228076197013e-05, + "loss": 0.8818, + "step": 4672 + }, + { + "epoch": 0.8, + "grad_norm": 9.152034759521484, + "learning_rate": 1.797065385275442e-05, + "loss": 0.7842, + "step": 4673 + }, + { + "epoch": 0.8, + "grad_norm": 11.81672477722168, + "learning_rate": 1.7968079629311823e-05, + "loss": 1.113, + "step": 4674 + }, + { + "epoch": 0.8, + "grad_norm": 10.437200546264648, + "learning_rate": 1.796550540586923e-05, + "loss": 0.7425, + "step": 4675 + }, + { + "epoch": 0.8, + "grad_norm": 10.65206241607666, + "learning_rate": 1.7962931182426633e-05, + "loss": 0.7048, + "step": 4676 + }, + { + "epoch": 0.8, + "grad_norm": 10.43521785736084, + "learning_rate": 1.796035695898404e-05, + "loss": 0.8755, + "step": 4677 + }, + { + "epoch": 0.8, + "grad_norm": 13.043573379516602, + "learning_rate": 1.7957782735541446e-05, + "loss": 0.8106, + "step": 4678 + }, + { + "epoch": 0.8, + "grad_norm": 11.109203338623047, + "learning_rate": 1.7955208512098853e-05, + "loss": 0.9277, + "step": 4679 + }, + { + "epoch": 0.8, + "grad_norm": 8.440686225891113, + "learning_rate": 1.7952634288656256e-05, + "loss": 0.8093, + "step": 4680 + }, + { + "epoch": 0.8, + "grad_norm": 9.53641128540039, + "learning_rate": 1.7950060065213663e-05, + "loss": 0.5236, + "step": 4681 + }, + { + "epoch": 0.8, + "grad_norm": 8.874175071716309, + "learning_rate": 1.7947485841771066e-05, + "loss": 0.878, + "step": 4682 + }, + { + "epoch": 0.8, + "grad_norm": 12.545369148254395, + "learning_rate": 1.794491161832847e-05, + "loss": 1.1202, + "step": 4683 + }, + { + "epoch": 0.8, + "grad_norm": 9.253897666931152, + "learning_rate": 1.7942337394885876e-05, + "loss": 0.7012, + "step": 4684 + }, + { + "epoch": 0.8, + "grad_norm": 10.787257194519043, + "learning_rate": 1.793976317144328e-05, + "loss": 0.6988, + "step": 4685 + }, + { + "epoch": 0.8, + "grad_norm": 9.269976615905762, + "learning_rate": 1.7937188948000686e-05, + "loss": 0.7213, + "step": 4686 + }, + { + "epoch": 0.8, + "grad_norm": 10.339044570922852, + "learning_rate": 1.793461472455809e-05, + "loss": 0.6812, + "step": 4687 + }, + { + "epoch": 0.8, + "grad_norm": 12.309968948364258, + "learning_rate": 1.79320405011155e-05, + "loss": 0.7178, + "step": 4688 + }, + { + "epoch": 0.8, + "grad_norm": 11.741803169250488, + "learning_rate": 1.7929466277672903e-05, + "loss": 0.9432, + "step": 4689 + }, + { + "epoch": 0.8, + "grad_norm": 11.693902969360352, + "learning_rate": 1.792689205423031e-05, + "loss": 0.7564, + "step": 4690 + }, + { + "epoch": 0.81, + "grad_norm": 9.328643798828125, + "learning_rate": 1.7924317830787713e-05, + "loss": 0.6594, + "step": 4691 + }, + { + "epoch": 0.81, + "grad_norm": 11.708168029785156, + "learning_rate": 1.792174360734512e-05, + "loss": 1.0262, + "step": 4692 + }, + { + "epoch": 0.81, + "grad_norm": 9.268716812133789, + "learning_rate": 1.7919169383902523e-05, + "loss": 0.6609, + "step": 4693 + }, + { + "epoch": 0.81, + "grad_norm": 11.470682144165039, + "learning_rate": 1.7916595160459926e-05, + "loss": 0.9439, + "step": 4694 + }, + { + "epoch": 0.81, + "grad_norm": 11.904067039489746, + "learning_rate": 1.7914020937017333e-05, + "loss": 0.8827, + "step": 4695 + }, + { + "epoch": 0.81, + "grad_norm": 11.216064453125, + "learning_rate": 1.7911446713574736e-05, + "loss": 0.8426, + "step": 4696 + }, + { + "epoch": 0.81, + "grad_norm": 12.793227195739746, + "learning_rate": 1.7908872490132146e-05, + "loss": 0.7214, + "step": 4697 + }, + { + "epoch": 0.81, + "grad_norm": 12.365936279296875, + "learning_rate": 1.790629826668955e-05, + "loss": 0.9337, + "step": 4698 + }, + { + "epoch": 0.81, + "grad_norm": 8.970952987670898, + "learning_rate": 1.7903724043246956e-05, + "loss": 0.544, + "step": 4699 + }, + { + "epoch": 0.81, + "grad_norm": 10.585055351257324, + "learning_rate": 1.790114981980436e-05, + "loss": 0.7607, + "step": 4700 + }, + { + "epoch": 0.81, + "grad_norm": 10.284903526306152, + "learning_rate": 1.7898575596361766e-05, + "loss": 0.7597, + "step": 4701 + }, + { + "epoch": 0.81, + "grad_norm": 9.15507698059082, + "learning_rate": 1.789600137291917e-05, + "loss": 0.5263, + "step": 4702 + }, + { + "epoch": 0.81, + "grad_norm": 12.795269966125488, + "learning_rate": 1.7893427149476576e-05, + "loss": 1.1584, + "step": 4703 + }, + { + "epoch": 0.81, + "grad_norm": 10.412137985229492, + "learning_rate": 1.789085292603398e-05, + "loss": 0.7047, + "step": 4704 + }, + { + "epoch": 0.81, + "grad_norm": 11.058917999267578, + "learning_rate": 1.7888278702591383e-05, + "loss": 0.8618, + "step": 4705 + }, + { + "epoch": 0.81, + "grad_norm": 11.027851104736328, + "learning_rate": 1.788570447914879e-05, + "loss": 0.6486, + "step": 4706 + }, + { + "epoch": 0.81, + "grad_norm": 11.5999116897583, + "learning_rate": 1.7883130255706196e-05, + "loss": 0.679, + "step": 4707 + }, + { + "epoch": 0.81, + "grad_norm": 10.76766300201416, + "learning_rate": 1.7880556032263603e-05, + "loss": 0.8544, + "step": 4708 + }, + { + "epoch": 0.81, + "grad_norm": 8.434048652648926, + "learning_rate": 1.7877981808821006e-05, + "loss": 0.5485, + "step": 4709 + }, + { + "epoch": 0.81, + "grad_norm": 12.41886043548584, + "learning_rate": 1.7875407585378413e-05, + "loss": 1.045, + "step": 4710 + }, + { + "epoch": 0.81, + "grad_norm": 12.896388053894043, + "learning_rate": 1.7872833361935816e-05, + "loss": 1.0275, + "step": 4711 + }, + { + "epoch": 0.81, + "grad_norm": 11.855283737182617, + "learning_rate": 1.7870259138493223e-05, + "loss": 0.8951, + "step": 4712 + }, + { + "epoch": 0.81, + "grad_norm": 7.453601837158203, + "learning_rate": 1.7867684915050626e-05, + "loss": 0.5739, + "step": 4713 + }, + { + "epoch": 0.81, + "grad_norm": 9.944012641906738, + "learning_rate": 1.786511069160803e-05, + "loss": 0.7602, + "step": 4714 + }, + { + "epoch": 0.81, + "grad_norm": 8.393771171569824, + "learning_rate": 1.7862536468165436e-05, + "loss": 0.6744, + "step": 4715 + }, + { + "epoch": 0.81, + "grad_norm": 10.566618919372559, + "learning_rate": 1.7859962244722843e-05, + "loss": 0.7501, + "step": 4716 + }, + { + "epoch": 0.81, + "grad_norm": 9.667789459228516, + "learning_rate": 1.785738802128025e-05, + "loss": 0.6859, + "step": 4717 + }, + { + "epoch": 0.81, + "grad_norm": 11.746609687805176, + "learning_rate": 1.7854813797837653e-05, + "loss": 1.0023, + "step": 4718 + }, + { + "epoch": 0.81, + "grad_norm": 9.340370178222656, + "learning_rate": 1.785223957439506e-05, + "loss": 0.7859, + "step": 4719 + }, + { + "epoch": 0.81, + "grad_norm": 10.385442733764648, + "learning_rate": 1.7849665350952463e-05, + "loss": 0.9263, + "step": 4720 + }, + { + "epoch": 0.81, + "grad_norm": 10.542134284973145, + "learning_rate": 1.784709112750987e-05, + "loss": 0.8354, + "step": 4721 + }, + { + "epoch": 0.81, + "grad_norm": 10.154189109802246, + "learning_rate": 1.7844516904067273e-05, + "loss": 0.6012, + "step": 4722 + }, + { + "epoch": 0.81, + "grad_norm": 6.087700366973877, + "learning_rate": 1.784194268062468e-05, + "loss": 0.4578, + "step": 4723 + }, + { + "epoch": 0.81, + "grad_norm": 8.976358413696289, + "learning_rate": 1.7839368457182083e-05, + "loss": 0.6712, + "step": 4724 + }, + { + "epoch": 0.81, + "grad_norm": 10.208373069763184, + "learning_rate": 1.783679423373949e-05, + "loss": 0.9655, + "step": 4725 + }, + { + "epoch": 0.81, + "grad_norm": 8.266886711120605, + "learning_rate": 1.7834220010296896e-05, + "loss": 0.8008, + "step": 4726 + }, + { + "epoch": 0.81, + "grad_norm": 8.366615295410156, + "learning_rate": 1.78316457868543e-05, + "loss": 0.7056, + "step": 4727 + }, + { + "epoch": 0.81, + "grad_norm": 12.03531265258789, + "learning_rate": 1.7829071563411706e-05, + "loss": 0.9338, + "step": 4728 + }, + { + "epoch": 0.81, + "grad_norm": 9.879033088684082, + "learning_rate": 1.782649733996911e-05, + "loss": 0.8495, + "step": 4729 + }, + { + "epoch": 0.81, + "grad_norm": 13.972820281982422, + "learning_rate": 1.7823923116526516e-05, + "loss": 0.9394, + "step": 4730 + }, + { + "epoch": 0.81, + "grad_norm": 10.600964546203613, + "learning_rate": 1.782134889308392e-05, + "loss": 0.8586, + "step": 4731 + }, + { + "epoch": 0.81, + "grad_norm": 9.581879615783691, + "learning_rate": 1.7818774669641326e-05, + "loss": 0.7712, + "step": 4732 + }, + { + "epoch": 0.81, + "grad_norm": 10.14083480834961, + "learning_rate": 1.781620044619873e-05, + "loss": 0.7987, + "step": 4733 + }, + { + "epoch": 0.81, + "grad_norm": 11.475602149963379, + "learning_rate": 1.7813626222756136e-05, + "loss": 0.7109, + "step": 4734 + }, + { + "epoch": 0.81, + "grad_norm": 9.210009574890137, + "learning_rate": 1.7811051999313542e-05, + "loss": 0.7231, + "step": 4735 + }, + { + "epoch": 0.81, + "grad_norm": 9.758543014526367, + "learning_rate": 1.7808477775870946e-05, + "loss": 0.5857, + "step": 4736 + }, + { + "epoch": 0.81, + "grad_norm": 11.707775115966797, + "learning_rate": 1.7805903552428352e-05, + "loss": 0.9456, + "step": 4737 + }, + { + "epoch": 0.81, + "grad_norm": 10.69056510925293, + "learning_rate": 1.7803329328985756e-05, + "loss": 0.7248, + "step": 4738 + }, + { + "epoch": 0.81, + "grad_norm": 9.393020629882812, + "learning_rate": 1.7800755105543162e-05, + "loss": 0.6604, + "step": 4739 + }, + { + "epoch": 0.81, + "grad_norm": 9.653887748718262, + "learning_rate": 1.7798180882100566e-05, + "loss": 0.6858, + "step": 4740 + }, + { + "epoch": 0.81, + "grad_norm": 9.445959091186523, + "learning_rate": 1.7795606658657972e-05, + "loss": 0.8981, + "step": 4741 + }, + { + "epoch": 0.81, + "grad_norm": 13.136672973632812, + "learning_rate": 1.7793032435215376e-05, + "loss": 0.6346, + "step": 4742 + }, + { + "epoch": 0.81, + "grad_norm": 10.509740829467773, + "learning_rate": 1.7790458211772782e-05, + "loss": 0.6559, + "step": 4743 + }, + { + "epoch": 0.81, + "grad_norm": 10.922419548034668, + "learning_rate": 1.778788398833019e-05, + "loss": 0.6987, + "step": 4744 + }, + { + "epoch": 0.81, + "grad_norm": 9.939035415649414, + "learning_rate": 1.7785309764887592e-05, + "loss": 0.7952, + "step": 4745 + }, + { + "epoch": 0.81, + "grad_norm": 9.739534378051758, + "learning_rate": 1.7782735541445e-05, + "loss": 0.6478, + "step": 4746 + }, + { + "epoch": 0.81, + "grad_norm": 11.027047157287598, + "learning_rate": 1.7780161318002402e-05, + "loss": 0.9522, + "step": 4747 + }, + { + "epoch": 0.81, + "grad_norm": 9.606758117675781, + "learning_rate": 1.777758709455981e-05, + "loss": 0.5663, + "step": 4748 + }, + { + "epoch": 0.81, + "grad_norm": 10.612149238586426, + "learning_rate": 1.7775012871117212e-05, + "loss": 0.7487, + "step": 4749 + }, + { + "epoch": 0.82, + "grad_norm": 8.411164283752441, + "learning_rate": 1.777243864767462e-05, + "loss": 0.8331, + "step": 4750 + }, + { + "epoch": 0.82, + "grad_norm": 10.100812911987305, + "learning_rate": 1.7769864424232022e-05, + "loss": 0.6652, + "step": 4751 + }, + { + "epoch": 0.82, + "grad_norm": 8.680567741394043, + "learning_rate": 1.776729020078943e-05, + "loss": 0.8855, + "step": 4752 + }, + { + "epoch": 0.82, + "grad_norm": 9.902464866638184, + "learning_rate": 1.7764715977346832e-05, + "loss": 0.6956, + "step": 4753 + }, + { + "epoch": 0.82, + "grad_norm": 8.097615242004395, + "learning_rate": 1.7762141753904242e-05, + "loss": 0.5542, + "step": 4754 + }, + { + "epoch": 0.82, + "grad_norm": 10.215203285217285, + "learning_rate": 1.7759567530461646e-05, + "loss": 0.7032, + "step": 4755 + }, + { + "epoch": 0.82, + "grad_norm": 9.553426742553711, + "learning_rate": 1.775699330701905e-05, + "loss": 0.872, + "step": 4756 + }, + { + "epoch": 0.82, + "grad_norm": 12.280723571777344, + "learning_rate": 1.7754419083576456e-05, + "loss": 0.7258, + "step": 4757 + }, + { + "epoch": 0.82, + "grad_norm": 8.920151710510254, + "learning_rate": 1.775184486013386e-05, + "loss": 0.7815, + "step": 4758 + }, + { + "epoch": 0.82, + "grad_norm": 10.299314498901367, + "learning_rate": 1.7749270636691266e-05, + "loss": 0.8043, + "step": 4759 + }, + { + "epoch": 0.82, + "grad_norm": 11.703652381896973, + "learning_rate": 1.774669641324867e-05, + "loss": 0.9688, + "step": 4760 + }, + { + "epoch": 0.82, + "grad_norm": 11.8981351852417, + "learning_rate": 1.7744122189806076e-05, + "loss": 0.8533, + "step": 4761 + }, + { + "epoch": 0.82, + "grad_norm": 10.716887474060059, + "learning_rate": 1.774154796636348e-05, + "loss": 0.7105, + "step": 4762 + }, + { + "epoch": 0.82, + "grad_norm": 12.862297058105469, + "learning_rate": 1.773897374292089e-05, + "loss": 0.9016, + "step": 4763 + }, + { + "epoch": 0.82, + "grad_norm": 8.814416885375977, + "learning_rate": 1.7736399519478292e-05, + "loss": 0.7525, + "step": 4764 + }, + { + "epoch": 0.82, + "grad_norm": 12.40992259979248, + "learning_rate": 1.77338252960357e-05, + "loss": 0.7877, + "step": 4765 + }, + { + "epoch": 0.82, + "grad_norm": 8.749227523803711, + "learning_rate": 1.7731251072593102e-05, + "loss": 0.6553, + "step": 4766 + }, + { + "epoch": 0.82, + "grad_norm": 9.865870475769043, + "learning_rate": 1.7728676849150505e-05, + "loss": 0.7681, + "step": 4767 + }, + { + "epoch": 0.82, + "grad_norm": 9.250228881835938, + "learning_rate": 1.7726102625707912e-05, + "loss": 0.6776, + "step": 4768 + }, + { + "epoch": 0.82, + "grad_norm": 10.269866943359375, + "learning_rate": 1.7723528402265315e-05, + "loss": 0.6895, + "step": 4769 + }, + { + "epoch": 0.82, + "grad_norm": 9.050362586975098, + "learning_rate": 1.7720954178822722e-05, + "loss": 0.7331, + "step": 4770 + }, + { + "epoch": 0.82, + "grad_norm": 10.744763374328613, + "learning_rate": 1.7718379955380125e-05, + "loss": 0.867, + "step": 4771 + }, + { + "epoch": 0.82, + "grad_norm": 11.055499076843262, + "learning_rate": 1.7715805731937532e-05, + "loss": 0.739, + "step": 4772 + }, + { + "epoch": 0.82, + "grad_norm": 7.691717624664307, + "learning_rate": 1.771323150849494e-05, + "loss": 0.5485, + "step": 4773 + }, + { + "epoch": 0.82, + "grad_norm": 10.498370170593262, + "learning_rate": 1.7710657285052345e-05, + "loss": 0.91, + "step": 4774 + }, + { + "epoch": 0.82, + "grad_norm": 9.988570213317871, + "learning_rate": 1.770808306160975e-05, + "loss": 0.6445, + "step": 4775 + }, + { + "epoch": 0.82, + "grad_norm": 8.056323051452637, + "learning_rate": 1.7705508838167155e-05, + "loss": 0.4321, + "step": 4776 + }, + { + "epoch": 0.82, + "grad_norm": 10.658989906311035, + "learning_rate": 1.770293461472456e-05, + "loss": 0.7111, + "step": 4777 + }, + { + "epoch": 0.82, + "grad_norm": 12.714153289794922, + "learning_rate": 1.7700360391281962e-05, + "loss": 0.7901, + "step": 4778 + }, + { + "epoch": 0.82, + "grad_norm": 9.393281936645508, + "learning_rate": 1.769778616783937e-05, + "loss": 0.6817, + "step": 4779 + }, + { + "epoch": 0.82, + "grad_norm": 11.453137397766113, + "learning_rate": 1.7695211944396772e-05, + "loss": 0.9336, + "step": 4780 + }, + { + "epoch": 0.82, + "grad_norm": 11.852784156799316, + "learning_rate": 1.769263772095418e-05, + "loss": 0.7724, + "step": 4781 + }, + { + "epoch": 0.82, + "grad_norm": 10.578731536865234, + "learning_rate": 1.7690063497511585e-05, + "loss": 0.7911, + "step": 4782 + }, + { + "epoch": 0.82, + "grad_norm": 9.003982543945312, + "learning_rate": 1.7687489274068992e-05, + "loss": 0.8182, + "step": 4783 + }, + { + "epoch": 0.82, + "grad_norm": 10.337196350097656, + "learning_rate": 1.7684915050626395e-05, + "loss": 0.7574, + "step": 4784 + }, + { + "epoch": 0.82, + "grad_norm": 9.3565673828125, + "learning_rate": 1.7682340827183802e-05, + "loss": 0.6727, + "step": 4785 + }, + { + "epoch": 0.82, + "grad_norm": 11.473671913146973, + "learning_rate": 1.7679766603741205e-05, + "loss": 0.6664, + "step": 4786 + }, + { + "epoch": 0.82, + "grad_norm": 10.382794380187988, + "learning_rate": 1.767719238029861e-05, + "loss": 0.6981, + "step": 4787 + }, + { + "epoch": 0.82, + "grad_norm": 10.083308219909668, + "learning_rate": 1.7674618156856015e-05, + "loss": 0.7013, + "step": 4788 + }, + { + "epoch": 0.82, + "grad_norm": 12.678533554077148, + "learning_rate": 1.767204393341342e-05, + "loss": 0.8264, + "step": 4789 + }, + { + "epoch": 0.82, + "grad_norm": 11.436578750610352, + "learning_rate": 1.7669469709970825e-05, + "loss": 0.8023, + "step": 4790 + }, + { + "epoch": 0.82, + "grad_norm": 10.489977836608887, + "learning_rate": 1.766689548652823e-05, + "loss": 0.7519, + "step": 4791 + }, + { + "epoch": 0.82, + "grad_norm": 11.302467346191406, + "learning_rate": 1.766432126308564e-05, + "loss": 0.5428, + "step": 4792 + }, + { + "epoch": 0.82, + "grad_norm": 12.264116287231445, + "learning_rate": 1.7661747039643042e-05, + "loss": 0.7665, + "step": 4793 + }, + { + "epoch": 0.82, + "grad_norm": 9.950275421142578, + "learning_rate": 1.765917281620045e-05, + "loss": 0.8895, + "step": 4794 + }, + { + "epoch": 0.82, + "grad_norm": 9.896724700927734, + "learning_rate": 1.7656598592757852e-05, + "loss": 0.6034, + "step": 4795 + }, + { + "epoch": 0.82, + "grad_norm": 9.13314151763916, + "learning_rate": 1.765402436931526e-05, + "loss": 0.8166, + "step": 4796 + }, + { + "epoch": 0.82, + "grad_norm": 11.60966968536377, + "learning_rate": 1.7651450145872662e-05, + "loss": 0.8992, + "step": 4797 + }, + { + "epoch": 0.82, + "grad_norm": 9.669915199279785, + "learning_rate": 1.7648875922430065e-05, + "loss": 0.6287, + "step": 4798 + }, + { + "epoch": 0.82, + "grad_norm": 11.607988357543945, + "learning_rate": 1.7646301698987472e-05, + "loss": 0.9002, + "step": 4799 + }, + { + "epoch": 0.82, + "grad_norm": 9.235288619995117, + "learning_rate": 1.7643727475544875e-05, + "loss": 0.7838, + "step": 4800 + }, + { + "epoch": 0.82, + "grad_norm": 9.377300262451172, + "learning_rate": 1.7641153252102285e-05, + "loss": 0.7719, + "step": 4801 + }, + { + "epoch": 0.82, + "grad_norm": 9.074933052062988, + "learning_rate": 1.763857902865969e-05, + "loss": 0.6962, + "step": 4802 + }, + { + "epoch": 0.82, + "grad_norm": 8.036238670349121, + "learning_rate": 1.7636004805217095e-05, + "loss": 0.6683, + "step": 4803 + }, + { + "epoch": 0.82, + "grad_norm": 10.239750862121582, + "learning_rate": 1.76334305817745e-05, + "loss": 0.7341, + "step": 4804 + }, + { + "epoch": 0.82, + "grad_norm": 9.698104858398438, + "learning_rate": 1.7630856358331905e-05, + "loss": 0.7769, + "step": 4805 + }, + { + "epoch": 0.82, + "grad_norm": 9.367387771606445, + "learning_rate": 1.762828213488931e-05, + "loss": 0.6348, + "step": 4806 + }, + { + "epoch": 0.82, + "grad_norm": 9.038697242736816, + "learning_rate": 1.7625707911446715e-05, + "loss": 0.6603, + "step": 4807 + }, + { + "epoch": 0.83, + "grad_norm": 11.054304122924805, + "learning_rate": 1.762313368800412e-05, + "loss": 0.8838, + "step": 4808 + }, + { + "epoch": 0.83, + "grad_norm": 10.05933666229248, + "learning_rate": 1.762055946456152e-05, + "loss": 0.4696, + "step": 4809 + }, + { + "epoch": 0.83, + "grad_norm": 9.898219108581543, + "learning_rate": 1.761798524111893e-05, + "loss": 0.7013, + "step": 4810 + }, + { + "epoch": 0.83, + "grad_norm": 10.645747184753418, + "learning_rate": 1.7615411017676335e-05, + "loss": 0.9902, + "step": 4811 + }, + { + "epoch": 0.83, + "grad_norm": 10.932388305664062, + "learning_rate": 1.7612836794233742e-05, + "loss": 0.8642, + "step": 4812 + }, + { + "epoch": 0.83, + "grad_norm": 11.371105194091797, + "learning_rate": 1.7610262570791145e-05, + "loss": 1.0138, + "step": 4813 + }, + { + "epoch": 0.83, + "grad_norm": 11.310482025146484, + "learning_rate": 1.7607688347348552e-05, + "loss": 0.7208, + "step": 4814 + }, + { + "epoch": 0.83, + "grad_norm": 10.735977172851562, + "learning_rate": 1.7605114123905955e-05, + "loss": 0.7119, + "step": 4815 + }, + { + "epoch": 0.83, + "grad_norm": 10.394113540649414, + "learning_rate": 1.760253990046336e-05, + "loss": 0.8329, + "step": 4816 + }, + { + "epoch": 0.83, + "grad_norm": 8.97292423248291, + "learning_rate": 1.7599965677020765e-05, + "loss": 0.8301, + "step": 4817 + }, + { + "epoch": 0.83, + "grad_norm": 13.534422874450684, + "learning_rate": 1.7597391453578168e-05, + "loss": 1.0778, + "step": 4818 + }, + { + "epoch": 0.83, + "grad_norm": 11.125338554382324, + "learning_rate": 1.7594817230135575e-05, + "loss": 0.9699, + "step": 4819 + }, + { + "epoch": 0.83, + "grad_norm": 10.227917671203613, + "learning_rate": 1.759224300669298e-05, + "loss": 0.7768, + "step": 4820 + }, + { + "epoch": 0.83, + "grad_norm": 8.644247055053711, + "learning_rate": 1.7589668783250388e-05, + "loss": 0.7802, + "step": 4821 + }, + { + "epoch": 0.83, + "grad_norm": 13.752542495727539, + "learning_rate": 1.758709455980779e-05, + "loss": 0.9837, + "step": 4822 + }, + { + "epoch": 0.83, + "grad_norm": 11.625603675842285, + "learning_rate": 1.7584520336365198e-05, + "loss": 0.8077, + "step": 4823 + }, + { + "epoch": 0.83, + "grad_norm": 9.706161499023438, + "learning_rate": 1.75819461129226e-05, + "loss": 0.7319, + "step": 4824 + }, + { + "epoch": 0.83, + "grad_norm": 12.121040344238281, + "learning_rate": 1.7579371889480008e-05, + "loss": 0.8439, + "step": 4825 + }, + { + "epoch": 0.83, + "grad_norm": 9.253231048583984, + "learning_rate": 1.757679766603741e-05, + "loss": 0.7152, + "step": 4826 + }, + { + "epoch": 0.83, + "grad_norm": 11.22000503540039, + "learning_rate": 1.7574223442594818e-05, + "loss": 0.461, + "step": 4827 + }, + { + "epoch": 0.83, + "grad_norm": 10.102100372314453, + "learning_rate": 1.757164921915222e-05, + "loss": 0.8478, + "step": 4828 + }, + { + "epoch": 0.83, + "grad_norm": 9.431357383728027, + "learning_rate": 1.7569074995709625e-05, + "loss": 0.8168, + "step": 4829 + }, + { + "epoch": 0.83, + "grad_norm": 9.592718124389648, + "learning_rate": 1.7566500772267035e-05, + "loss": 0.8607, + "step": 4830 + }, + { + "epoch": 0.83, + "grad_norm": 10.677876472473145, + "learning_rate": 1.7563926548824438e-05, + "loss": 0.9186, + "step": 4831 + }, + { + "epoch": 0.83, + "grad_norm": 9.456489562988281, + "learning_rate": 1.7561352325381845e-05, + "loss": 0.6835, + "step": 4832 + }, + { + "epoch": 0.83, + "grad_norm": 11.43439769744873, + "learning_rate": 1.7558778101939248e-05, + "loss": 0.9202, + "step": 4833 + }, + { + "epoch": 0.83, + "grad_norm": 10.976131439208984, + "learning_rate": 1.7556203878496655e-05, + "loss": 0.6594, + "step": 4834 + }, + { + "epoch": 0.83, + "grad_norm": 9.379373550415039, + "learning_rate": 1.7553629655054058e-05, + "loss": 0.7108, + "step": 4835 + }, + { + "epoch": 0.83, + "grad_norm": 10.393901824951172, + "learning_rate": 1.7551055431611465e-05, + "loss": 0.6935, + "step": 4836 + }, + { + "epoch": 0.83, + "grad_norm": 8.976284980773926, + "learning_rate": 1.7548481208168868e-05, + "loss": 0.409, + "step": 4837 + }, + { + "epoch": 0.83, + "grad_norm": 9.669224739074707, + "learning_rate": 1.7545906984726275e-05, + "loss": 0.7548, + "step": 4838 + }, + { + "epoch": 0.83, + "grad_norm": 11.577635765075684, + "learning_rate": 1.754333276128368e-05, + "loss": 0.7728, + "step": 4839 + }, + { + "epoch": 0.83, + "grad_norm": 14.180777549743652, + "learning_rate": 1.7540758537841085e-05, + "loss": 0.9255, + "step": 4840 + }, + { + "epoch": 0.83, + "grad_norm": 11.16286849975586, + "learning_rate": 1.753818431439849e-05, + "loss": 0.8874, + "step": 4841 + }, + { + "epoch": 0.83, + "grad_norm": 10.272332191467285, + "learning_rate": 1.7535610090955895e-05, + "loss": 0.8475, + "step": 4842 + }, + { + "epoch": 0.83, + "grad_norm": 11.789423942565918, + "learning_rate": 1.75330358675133e-05, + "loss": 0.5312, + "step": 4843 + }, + { + "epoch": 0.83, + "grad_norm": 10.371023178100586, + "learning_rate": 1.7530461644070705e-05, + "loss": 0.5665, + "step": 4844 + }, + { + "epoch": 0.83, + "grad_norm": 10.805109977722168, + "learning_rate": 1.752788742062811e-05, + "loss": 0.6869, + "step": 4845 + }, + { + "epoch": 0.83, + "grad_norm": 13.336012840270996, + "learning_rate": 1.7525313197185515e-05, + "loss": 0.8833, + "step": 4846 + }, + { + "epoch": 0.83, + "grad_norm": 12.783849716186523, + "learning_rate": 1.752273897374292e-05, + "loss": 0.9346, + "step": 4847 + }, + { + "epoch": 0.83, + "grad_norm": 10.484649658203125, + "learning_rate": 1.7520164750300328e-05, + "loss": 0.6735, + "step": 4848 + }, + { + "epoch": 0.83, + "grad_norm": 11.093966484069824, + "learning_rate": 1.751759052685773e-05, + "loss": 0.6745, + "step": 4849 + }, + { + "epoch": 0.83, + "grad_norm": 11.905133247375488, + "learning_rate": 1.7515016303415138e-05, + "loss": 0.9312, + "step": 4850 + }, + { + "epoch": 0.83, + "grad_norm": 12.303994178771973, + "learning_rate": 1.751244207997254e-05, + "loss": 0.714, + "step": 4851 + }, + { + "epoch": 0.83, + "grad_norm": 12.163971900939941, + "learning_rate": 1.7509867856529948e-05, + "loss": 0.8427, + "step": 4852 + }, + { + "epoch": 0.83, + "grad_norm": 12.35700798034668, + "learning_rate": 1.750729363308735e-05, + "loss": 0.6725, + "step": 4853 + }, + { + "epoch": 0.83, + "grad_norm": 9.923184394836426, + "learning_rate": 1.7504719409644758e-05, + "loss": 0.8309, + "step": 4854 + }, + { + "epoch": 0.83, + "grad_norm": 9.923500061035156, + "learning_rate": 1.750214518620216e-05, + "loss": 0.7293, + "step": 4855 + }, + { + "epoch": 0.83, + "grad_norm": 10.36686897277832, + "learning_rate": 1.7499570962759568e-05, + "loss": 0.7542, + "step": 4856 + }, + { + "epoch": 0.83, + "grad_norm": 11.174764633178711, + "learning_rate": 1.749699673931697e-05, + "loss": 0.7159, + "step": 4857 + }, + { + "epoch": 0.83, + "grad_norm": 10.03627872467041, + "learning_rate": 1.749442251587438e-05, + "loss": 0.8438, + "step": 4858 + }, + { + "epoch": 0.83, + "grad_norm": 9.963062286376953, + "learning_rate": 1.7491848292431785e-05, + "loss": 0.844, + "step": 4859 + }, + { + "epoch": 0.83, + "grad_norm": 13.55281925201416, + "learning_rate": 1.7489274068989188e-05, + "loss": 1.0125, + "step": 4860 + }, + { + "epoch": 0.83, + "grad_norm": 9.869220733642578, + "learning_rate": 1.7486699845546595e-05, + "loss": 0.6718, + "step": 4861 + }, + { + "epoch": 0.83, + "grad_norm": 11.24552059173584, + "learning_rate": 1.7484125622103998e-05, + "loss": 0.5701, + "step": 4862 + }, + { + "epoch": 0.83, + "grad_norm": 9.85763168334961, + "learning_rate": 1.7481551398661405e-05, + "loss": 0.6519, + "step": 4863 + }, + { + "epoch": 0.83, + "grad_norm": 10.538481712341309, + "learning_rate": 1.7478977175218808e-05, + "loss": 0.6596, + "step": 4864 + }, + { + "epoch": 0.83, + "grad_norm": 9.993450164794922, + "learning_rate": 1.7476402951776215e-05, + "loss": 0.8185, + "step": 4865 + }, + { + "epoch": 0.84, + "grad_norm": 10.547932624816895, + "learning_rate": 1.7473828728333618e-05, + "loss": 0.8077, + "step": 4866 + }, + { + "epoch": 0.84, + "grad_norm": 9.072369575500488, + "learning_rate": 1.7471254504891028e-05, + "loss": 0.5572, + "step": 4867 + }, + { + "epoch": 0.84, + "grad_norm": 9.876172065734863, + "learning_rate": 1.746868028144843e-05, + "loss": 0.7, + "step": 4868 + }, + { + "epoch": 0.84, + "grad_norm": 9.314702987670898, + "learning_rate": 1.7466106058005838e-05, + "loss": 0.6645, + "step": 4869 + }, + { + "epoch": 0.84, + "grad_norm": 10.259981155395508, + "learning_rate": 1.746353183456324e-05, + "loss": 0.7082, + "step": 4870 + }, + { + "epoch": 0.84, + "grad_norm": 11.255072593688965, + "learning_rate": 1.7460957611120644e-05, + "loss": 0.8642, + "step": 4871 + }, + { + "epoch": 0.84, + "grad_norm": 11.429109573364258, + "learning_rate": 1.745838338767805e-05, + "loss": 0.7689, + "step": 4872 + }, + { + "epoch": 0.84, + "grad_norm": 9.946885108947754, + "learning_rate": 1.7455809164235454e-05, + "loss": 1.0431, + "step": 4873 + }, + { + "epoch": 0.84, + "grad_norm": 8.412161827087402, + "learning_rate": 1.745323494079286e-05, + "loss": 0.6067, + "step": 4874 + }, + { + "epoch": 0.84, + "grad_norm": 10.824974060058594, + "learning_rate": 1.7450660717350264e-05, + "loss": 0.712, + "step": 4875 + }, + { + "epoch": 0.84, + "grad_norm": 13.531627655029297, + "learning_rate": 1.744808649390767e-05, + "loss": 1.3283, + "step": 4876 + }, + { + "epoch": 0.84, + "grad_norm": 11.385008811950684, + "learning_rate": 1.7445512270465078e-05, + "loss": 0.9784, + "step": 4877 + }, + { + "epoch": 0.84, + "grad_norm": 12.504364013671875, + "learning_rate": 1.7442938047022484e-05, + "loss": 0.7913, + "step": 4878 + }, + { + "epoch": 0.84, + "grad_norm": 10.046695709228516, + "learning_rate": 1.7440363823579888e-05, + "loss": 0.9239, + "step": 4879 + }, + { + "epoch": 0.84, + "grad_norm": 7.872415542602539, + "learning_rate": 1.7437789600137294e-05, + "loss": 0.4716, + "step": 4880 + }, + { + "epoch": 0.84, + "grad_norm": 8.566971778869629, + "learning_rate": 1.7435215376694698e-05, + "loss": 0.4739, + "step": 4881 + }, + { + "epoch": 0.84, + "grad_norm": 8.889461517333984, + "learning_rate": 1.74326411532521e-05, + "loss": 0.6172, + "step": 4882 + }, + { + "epoch": 0.84, + "grad_norm": 8.186433792114258, + "learning_rate": 1.7430066929809508e-05, + "loss": 0.5798, + "step": 4883 + }, + { + "epoch": 0.84, + "grad_norm": 9.673079490661621, + "learning_rate": 1.742749270636691e-05, + "loss": 0.7373, + "step": 4884 + }, + { + "epoch": 0.84, + "grad_norm": 10.75408935546875, + "learning_rate": 1.7424918482924318e-05, + "loss": 0.8041, + "step": 4885 + }, + { + "epoch": 0.84, + "grad_norm": 9.072247505187988, + "learning_rate": 1.7422344259481724e-05, + "loss": 0.6379, + "step": 4886 + }, + { + "epoch": 0.84, + "grad_norm": 11.713211059570312, + "learning_rate": 1.741977003603913e-05, + "loss": 0.7199, + "step": 4887 + }, + { + "epoch": 0.84, + "grad_norm": 9.176946640014648, + "learning_rate": 1.7417195812596534e-05, + "loss": 0.674, + "step": 4888 + }, + { + "epoch": 0.84, + "grad_norm": 12.093537330627441, + "learning_rate": 1.741462158915394e-05, + "loss": 0.7224, + "step": 4889 + }, + { + "epoch": 0.84, + "grad_norm": 10.163803100585938, + "learning_rate": 1.7412047365711344e-05, + "loss": 0.722, + "step": 4890 + }, + { + "epoch": 0.84, + "grad_norm": 10.443512916564941, + "learning_rate": 1.7409473142268748e-05, + "loss": 0.5932, + "step": 4891 + }, + { + "epoch": 0.84, + "grad_norm": 9.385797500610352, + "learning_rate": 1.7406898918826154e-05, + "loss": 0.6597, + "step": 4892 + }, + { + "epoch": 0.84, + "grad_norm": 11.46291732788086, + "learning_rate": 1.7404324695383558e-05, + "loss": 0.7738, + "step": 4893 + }, + { + "epoch": 0.84, + "grad_norm": 13.094919204711914, + "learning_rate": 1.7401750471940964e-05, + "loss": 0.694, + "step": 4894 + }, + { + "epoch": 0.84, + "grad_norm": 9.094781875610352, + "learning_rate": 1.7399176248498368e-05, + "loss": 0.6353, + "step": 4895 + }, + { + "epoch": 0.84, + "grad_norm": 10.404618263244629, + "learning_rate": 1.7396602025055778e-05, + "loss": 0.6247, + "step": 4896 + }, + { + "epoch": 0.84, + "grad_norm": 12.333617210388184, + "learning_rate": 1.739402780161318e-05, + "loss": 0.9858, + "step": 4897 + }, + { + "epoch": 0.84, + "grad_norm": 9.884780883789062, + "learning_rate": 1.7391453578170588e-05, + "loss": 0.7308, + "step": 4898 + }, + { + "epoch": 0.84, + "grad_norm": 10.797029495239258, + "learning_rate": 1.738887935472799e-05, + "loss": 0.7795, + "step": 4899 + }, + { + "epoch": 0.84, + "grad_norm": 9.812067031860352, + "learning_rate": 1.7386305131285398e-05, + "loss": 0.9258, + "step": 4900 + }, + { + "epoch": 0.84, + "grad_norm": 9.568648338317871, + "learning_rate": 1.73837309078428e-05, + "loss": 0.6762, + "step": 4901 + }, + { + "epoch": 0.84, + "grad_norm": 11.883368492126465, + "learning_rate": 1.7381156684400204e-05, + "loss": 0.7522, + "step": 4902 + }, + { + "epoch": 0.84, + "grad_norm": 11.427814483642578, + "learning_rate": 1.737858246095761e-05, + "loss": 0.7882, + "step": 4903 + }, + { + "epoch": 0.84, + "grad_norm": 9.573358535766602, + "learning_rate": 1.7376008237515014e-05, + "loss": 0.7208, + "step": 4904 + }, + { + "epoch": 0.84, + "grad_norm": 12.028850555419922, + "learning_rate": 1.7373434014072424e-05, + "loss": 0.9841, + "step": 4905 + }, + { + "epoch": 0.84, + "grad_norm": 11.733213424682617, + "learning_rate": 1.7370859790629827e-05, + "loss": 0.7934, + "step": 4906 + }, + { + "epoch": 0.84, + "grad_norm": 10.079960823059082, + "learning_rate": 1.7368285567187234e-05, + "loss": 0.7694, + "step": 4907 + }, + { + "epoch": 0.84, + "grad_norm": 11.276900291442871, + "learning_rate": 1.7365711343744637e-05, + "loss": 0.7879, + "step": 4908 + }, + { + "epoch": 0.84, + "grad_norm": 9.477254867553711, + "learning_rate": 1.7363137120302044e-05, + "loss": 0.7624, + "step": 4909 + }, + { + "epoch": 0.84, + "grad_norm": 12.625418663024902, + "learning_rate": 1.7360562896859447e-05, + "loss": 0.9777, + "step": 4910 + }, + { + "epoch": 0.84, + "grad_norm": 8.324193954467773, + "learning_rate": 1.7357988673416854e-05, + "loss": 0.7598, + "step": 4911 + }, + { + "epoch": 0.84, + "grad_norm": 8.814605712890625, + "learning_rate": 1.7355414449974257e-05, + "loss": 0.7478, + "step": 4912 + }, + { + "epoch": 0.84, + "grad_norm": 9.550097465515137, + "learning_rate": 1.735284022653166e-05, + "loss": 0.6007, + "step": 4913 + }, + { + "epoch": 0.84, + "grad_norm": 10.572664260864258, + "learning_rate": 1.7350266003089067e-05, + "loss": 0.8189, + "step": 4914 + }, + { + "epoch": 0.84, + "grad_norm": 11.535879135131836, + "learning_rate": 1.7347691779646474e-05, + "loss": 0.7733, + "step": 4915 + }, + { + "epoch": 0.84, + "grad_norm": 8.643726348876953, + "learning_rate": 1.734511755620388e-05, + "loss": 0.8174, + "step": 4916 + }, + { + "epoch": 0.84, + "grad_norm": 11.396097183227539, + "learning_rate": 1.7342543332761284e-05, + "loss": 0.914, + "step": 4917 + }, + { + "epoch": 0.84, + "grad_norm": 10.25436019897461, + "learning_rate": 1.733996910931869e-05, + "loss": 0.8079, + "step": 4918 + }, + { + "epoch": 0.84, + "grad_norm": 10.029252052307129, + "learning_rate": 1.7337394885876094e-05, + "loss": 0.7383, + "step": 4919 + }, + { + "epoch": 0.84, + "grad_norm": 8.062467575073242, + "learning_rate": 1.73348206624335e-05, + "loss": 0.5442, + "step": 4920 + }, + { + "epoch": 0.84, + "grad_norm": 9.284063339233398, + "learning_rate": 1.7332246438990904e-05, + "loss": 0.6925, + "step": 4921 + }, + { + "epoch": 0.84, + "grad_norm": 11.368736267089844, + "learning_rate": 1.7329672215548307e-05, + "loss": 0.786, + "step": 4922 + }, + { + "epoch": 0.84, + "grad_norm": 10.50294017791748, + "learning_rate": 1.7327097992105714e-05, + "loss": 1.0086, + "step": 4923 + }, + { + "epoch": 0.85, + "grad_norm": 8.022612571716309, + "learning_rate": 1.732452376866312e-05, + "loss": 0.6656, + "step": 4924 + }, + { + "epoch": 0.85, + "grad_norm": 9.516199111938477, + "learning_rate": 1.7321949545220527e-05, + "loss": 0.6494, + "step": 4925 + }, + { + "epoch": 0.85, + "grad_norm": 9.147618293762207, + "learning_rate": 1.731937532177793e-05, + "loss": 0.8569, + "step": 4926 + }, + { + "epoch": 0.85, + "grad_norm": 11.019527435302734, + "learning_rate": 1.7316801098335337e-05, + "loss": 1.1321, + "step": 4927 + }, + { + "epoch": 0.85, + "grad_norm": 12.74247932434082, + "learning_rate": 1.731422687489274e-05, + "loss": 0.9128, + "step": 4928 + }, + { + "epoch": 0.85, + "grad_norm": 13.714862823486328, + "learning_rate": 1.7311652651450147e-05, + "loss": 1.023, + "step": 4929 + }, + { + "epoch": 0.85, + "grad_norm": 10.445389747619629, + "learning_rate": 1.730907842800755e-05, + "loss": 0.7577, + "step": 4930 + }, + { + "epoch": 0.85, + "grad_norm": 9.68558406829834, + "learning_rate": 1.7306504204564957e-05, + "loss": 0.7211, + "step": 4931 + }, + { + "epoch": 0.85, + "grad_norm": 9.790580749511719, + "learning_rate": 1.730392998112236e-05, + "loss": 0.8245, + "step": 4932 + }, + { + "epoch": 0.85, + "grad_norm": 8.841532707214355, + "learning_rate": 1.7301355757679764e-05, + "loss": 0.5434, + "step": 4933 + }, + { + "epoch": 0.85, + "grad_norm": 6.869279384613037, + "learning_rate": 1.7298781534237174e-05, + "loss": 0.5786, + "step": 4934 + }, + { + "epoch": 0.85, + "grad_norm": 10.581611633300781, + "learning_rate": 1.7296207310794577e-05, + "loss": 0.9899, + "step": 4935 + }, + { + "epoch": 0.85, + "grad_norm": 12.266263961791992, + "learning_rate": 1.7293633087351984e-05, + "loss": 0.8071, + "step": 4936 + }, + { + "epoch": 0.85, + "grad_norm": 11.568883895874023, + "learning_rate": 1.7291058863909387e-05, + "loss": 0.9715, + "step": 4937 + }, + { + "epoch": 0.85, + "grad_norm": 8.261860847473145, + "learning_rate": 1.7288484640466794e-05, + "loss": 0.765, + "step": 4938 + }, + { + "epoch": 0.85, + "grad_norm": 7.793584823608398, + "learning_rate": 1.7285910417024197e-05, + "loss": 0.6247, + "step": 4939 + }, + { + "epoch": 0.85, + "grad_norm": 11.055368423461914, + "learning_rate": 1.7283336193581604e-05, + "loss": 0.8427, + "step": 4940 + }, + { + "epoch": 0.85, + "grad_norm": 10.205330848693848, + "learning_rate": 1.7280761970139007e-05, + "loss": 0.6852, + "step": 4941 + }, + { + "epoch": 0.85, + "grad_norm": 9.361076354980469, + "learning_rate": 1.7278187746696414e-05, + "loss": 0.7573, + "step": 4942 + }, + { + "epoch": 0.85, + "grad_norm": 9.780275344848633, + "learning_rate": 1.727561352325382e-05, + "loss": 0.6961, + "step": 4943 + }, + { + "epoch": 0.85, + "grad_norm": 8.784602165222168, + "learning_rate": 1.7273039299811224e-05, + "loss": 0.8277, + "step": 4944 + }, + { + "epoch": 0.85, + "grad_norm": 9.712226867675781, + "learning_rate": 1.727046507636863e-05, + "loss": 0.5497, + "step": 4945 + }, + { + "epoch": 0.85, + "grad_norm": 10.916309356689453, + "learning_rate": 1.7267890852926034e-05, + "loss": 0.6639, + "step": 4946 + }, + { + "epoch": 0.85, + "grad_norm": 9.422940254211426, + "learning_rate": 1.726531662948344e-05, + "loss": 0.682, + "step": 4947 + }, + { + "epoch": 0.85, + "grad_norm": 9.396326065063477, + "learning_rate": 1.7262742406040844e-05, + "loss": 0.682, + "step": 4948 + }, + { + "epoch": 0.85, + "grad_norm": 11.037121772766113, + "learning_rate": 1.726016818259825e-05, + "loss": 0.8204, + "step": 4949 + }, + { + "epoch": 0.85, + "grad_norm": 11.892542839050293, + "learning_rate": 1.7257593959155654e-05, + "loss": 0.7595, + "step": 4950 + }, + { + "epoch": 0.85, + "grad_norm": 9.303763389587402, + "learning_rate": 1.725501973571306e-05, + "loss": 0.7321, + "step": 4951 + }, + { + "epoch": 0.85, + "grad_norm": 13.016682624816895, + "learning_rate": 1.7252445512270464e-05, + "loss": 0.8576, + "step": 4952 + }, + { + "epoch": 0.85, + "grad_norm": 8.216480255126953, + "learning_rate": 1.724987128882787e-05, + "loss": 0.5196, + "step": 4953 + }, + { + "epoch": 0.85, + "grad_norm": 8.688359260559082, + "learning_rate": 1.7247297065385277e-05, + "loss": 0.6283, + "step": 4954 + }, + { + "epoch": 0.85, + "grad_norm": 11.238533973693848, + "learning_rate": 1.724472284194268e-05, + "loss": 0.6993, + "step": 4955 + }, + { + "epoch": 0.85, + "grad_norm": 11.846015930175781, + "learning_rate": 1.7242148618500087e-05, + "loss": 0.6817, + "step": 4956 + }, + { + "epoch": 0.85, + "grad_norm": 12.082934379577637, + "learning_rate": 1.723957439505749e-05, + "loss": 0.774, + "step": 4957 + }, + { + "epoch": 0.85, + "grad_norm": 12.129729270935059, + "learning_rate": 1.7237000171614897e-05, + "loss": 0.7532, + "step": 4958 + }, + { + "epoch": 0.85, + "grad_norm": 10.17959976196289, + "learning_rate": 1.72344259481723e-05, + "loss": 0.7147, + "step": 4959 + }, + { + "epoch": 0.85, + "grad_norm": 9.005559921264648, + "learning_rate": 1.7231851724729707e-05, + "loss": 0.6481, + "step": 4960 + }, + { + "epoch": 0.85, + "grad_norm": 12.59207534790039, + "learning_rate": 1.722927750128711e-05, + "loss": 1.1346, + "step": 4961 + }, + { + "epoch": 0.85, + "grad_norm": 10.42135238647461, + "learning_rate": 1.722670327784452e-05, + "loss": 0.9172, + "step": 4962 + }, + { + "epoch": 0.85, + "grad_norm": 8.474270820617676, + "learning_rate": 1.7224129054401924e-05, + "loss": 0.6278, + "step": 4963 + }, + { + "epoch": 0.85, + "grad_norm": 8.794137954711914, + "learning_rate": 1.7221554830959327e-05, + "loss": 0.6219, + "step": 4964 + }, + { + "epoch": 0.85, + "grad_norm": 9.153670310974121, + "learning_rate": 1.7218980607516734e-05, + "loss": 0.6652, + "step": 4965 + }, + { + "epoch": 0.85, + "grad_norm": 12.129912376403809, + "learning_rate": 1.7216406384074137e-05, + "loss": 0.7677, + "step": 4966 + }, + { + "epoch": 0.85, + "grad_norm": 12.544198036193848, + "learning_rate": 1.7213832160631544e-05, + "loss": 1.0513, + "step": 4967 + }, + { + "epoch": 0.85, + "grad_norm": 10.749582290649414, + "learning_rate": 1.7211257937188947e-05, + "loss": 0.827, + "step": 4968 + }, + { + "epoch": 0.85, + "grad_norm": 11.035421371459961, + "learning_rate": 1.7208683713746354e-05, + "loss": 0.7915, + "step": 4969 + }, + { + "epoch": 0.85, + "grad_norm": 9.850958824157715, + "learning_rate": 1.7206109490303757e-05, + "loss": 0.6275, + "step": 4970 + }, + { + "epoch": 0.85, + "grad_norm": 14.161952018737793, + "learning_rate": 1.7203535266861167e-05, + "loss": 0.8849, + "step": 4971 + }, + { + "epoch": 0.85, + "grad_norm": 11.796048164367676, + "learning_rate": 1.720096104341857e-05, + "loss": 0.7508, + "step": 4972 + }, + { + "epoch": 0.85, + "grad_norm": 9.304971694946289, + "learning_rate": 1.7198386819975977e-05, + "loss": 0.6869, + "step": 4973 + }, + { + "epoch": 0.85, + "grad_norm": 11.204222679138184, + "learning_rate": 1.719581259653338e-05, + "loss": 0.87, + "step": 4974 + }, + { + "epoch": 0.85, + "grad_norm": 8.814250946044922, + "learning_rate": 1.7193238373090783e-05, + "loss": 0.4595, + "step": 4975 + }, + { + "epoch": 0.85, + "grad_norm": 9.42802906036377, + "learning_rate": 1.719066414964819e-05, + "loss": 0.6612, + "step": 4976 + }, + { + "epoch": 0.85, + "grad_norm": 10.03331184387207, + "learning_rate": 1.7188089926205593e-05, + "loss": 0.5956, + "step": 4977 + }, + { + "epoch": 0.85, + "grad_norm": 9.797418594360352, + "learning_rate": 1.7185515702763e-05, + "loss": 0.7991, + "step": 4978 + }, + { + "epoch": 0.85, + "grad_norm": 10.321005821228027, + "learning_rate": 1.7182941479320403e-05, + "loss": 1.0181, + "step": 4979 + }, + { + "epoch": 0.85, + "grad_norm": 8.11483383178711, + "learning_rate": 1.718036725587781e-05, + "loss": 0.6289, + "step": 4980 + }, + { + "epoch": 0.85, + "grad_norm": 9.9276123046875, + "learning_rate": 1.7177793032435217e-05, + "loss": 0.7497, + "step": 4981 + }, + { + "epoch": 0.85, + "grad_norm": 9.66787052154541, + "learning_rate": 1.7175218808992623e-05, + "loss": 0.7535, + "step": 4982 + }, + { + "epoch": 0.86, + "grad_norm": 11.72194766998291, + "learning_rate": 1.7172644585550027e-05, + "loss": 0.753, + "step": 4983 + }, + { + "epoch": 0.86, + "grad_norm": 8.778253555297852, + "learning_rate": 1.7170070362107433e-05, + "loss": 0.6282, + "step": 4984 + }, + { + "epoch": 0.86, + "grad_norm": 10.868002891540527, + "learning_rate": 1.7167496138664837e-05, + "loss": 0.7105, + "step": 4985 + }, + { + "epoch": 0.86, + "grad_norm": 11.45827865600586, + "learning_rate": 1.716492191522224e-05, + "loss": 0.636, + "step": 4986 + }, + { + "epoch": 0.86, + "grad_norm": 12.1046781539917, + "learning_rate": 1.7162347691779647e-05, + "loss": 0.8897, + "step": 4987 + }, + { + "epoch": 0.86, + "grad_norm": 9.193692207336426, + "learning_rate": 1.715977346833705e-05, + "loss": 0.7267, + "step": 4988 + }, + { + "epoch": 0.86, + "grad_norm": 10.283628463745117, + "learning_rate": 1.7157199244894457e-05, + "loss": 0.75, + "step": 4989 + }, + { + "epoch": 0.86, + "grad_norm": 10.282515525817871, + "learning_rate": 1.7154625021451863e-05, + "loss": 0.7044, + "step": 4990 + }, + { + "epoch": 0.86, + "grad_norm": 8.588621139526367, + "learning_rate": 1.715205079800927e-05, + "loss": 0.6408, + "step": 4991 + }, + { + "epoch": 0.86, + "grad_norm": 8.896646499633789, + "learning_rate": 1.7149476574566673e-05, + "loss": 0.7279, + "step": 4992 + }, + { + "epoch": 0.86, + "grad_norm": 9.332626342773438, + "learning_rate": 1.714690235112408e-05, + "loss": 0.6586, + "step": 4993 + }, + { + "epoch": 0.86, + "grad_norm": 10.131858825683594, + "learning_rate": 1.7144328127681483e-05, + "loss": 0.883, + "step": 4994 + }, + { + "epoch": 0.86, + "grad_norm": 11.531006813049316, + "learning_rate": 1.7141753904238887e-05, + "loss": 0.8161, + "step": 4995 + }, + { + "epoch": 0.86, + "grad_norm": 9.387847900390625, + "learning_rate": 1.7139179680796293e-05, + "loss": 0.605, + "step": 4996 + }, + { + "epoch": 0.86, + "grad_norm": 11.156927108764648, + "learning_rate": 1.7136605457353697e-05, + "loss": 0.9032, + "step": 4997 + }, + { + "epoch": 0.86, + "grad_norm": 10.074671745300293, + "learning_rate": 1.7134031233911103e-05, + "loss": 0.737, + "step": 4998 + }, + { + "epoch": 0.86, + "grad_norm": 10.010988235473633, + "learning_rate": 1.7131457010468507e-05, + "loss": 0.4996, + "step": 4999 + }, + { + "epoch": 0.86, + "grad_norm": 8.418106079101562, + "learning_rate": 1.7128882787025917e-05, + "loss": 0.8099, + "step": 5000 + }, + { + "epoch": 0.86, + "grad_norm": 11.001021385192871, + "learning_rate": 1.712630856358332e-05, + "loss": 0.8199, + "step": 5001 + }, + { + "epoch": 0.86, + "grad_norm": 10.040778160095215, + "learning_rate": 1.7123734340140727e-05, + "loss": 0.5868, + "step": 5002 + }, + { + "epoch": 0.86, + "grad_norm": 9.183379173278809, + "learning_rate": 1.712116011669813e-05, + "loss": 0.9051, + "step": 5003 + }, + { + "epoch": 0.86, + "grad_norm": 11.451287269592285, + "learning_rate": 1.7118585893255537e-05, + "loss": 0.961, + "step": 5004 + }, + { + "epoch": 0.86, + "grad_norm": 9.420075416564941, + "learning_rate": 1.711601166981294e-05, + "loss": 0.6247, + "step": 5005 + }, + { + "epoch": 0.86, + "grad_norm": 12.370603561401367, + "learning_rate": 1.7113437446370343e-05, + "loss": 0.8457, + "step": 5006 + }, + { + "epoch": 0.86, + "grad_norm": 9.350581169128418, + "learning_rate": 1.711086322292775e-05, + "loss": 0.5281, + "step": 5007 + }, + { + "epoch": 0.86, + "grad_norm": 10.304046630859375, + "learning_rate": 1.7108288999485153e-05, + "loss": 0.8896, + "step": 5008 + }, + { + "epoch": 0.86, + "grad_norm": 10.822914123535156, + "learning_rate": 1.7105714776042563e-05, + "loss": 0.6603, + "step": 5009 + }, + { + "epoch": 0.86, + "grad_norm": 9.38222885131836, + "learning_rate": 1.7103140552599966e-05, + "loss": 0.7601, + "step": 5010 + }, + { + "epoch": 0.86, + "grad_norm": 10.930339813232422, + "learning_rate": 1.7100566329157373e-05, + "loss": 0.7744, + "step": 5011 + }, + { + "epoch": 0.86, + "grad_norm": 7.399715423583984, + "learning_rate": 1.7097992105714776e-05, + "loss": 0.4404, + "step": 5012 + }, + { + "epoch": 0.86, + "grad_norm": 6.872371673583984, + "learning_rate": 1.7095417882272183e-05, + "loss": 0.5168, + "step": 5013 + }, + { + "epoch": 0.86, + "grad_norm": 13.332186698913574, + "learning_rate": 1.7092843658829586e-05, + "loss": 0.8214, + "step": 5014 + }, + { + "epoch": 0.86, + "grad_norm": 10.342545509338379, + "learning_rate": 1.7090269435386993e-05, + "loss": 0.8542, + "step": 5015 + }, + { + "epoch": 0.86, + "grad_norm": 9.146199226379395, + "learning_rate": 1.7087695211944396e-05, + "loss": 0.5202, + "step": 5016 + }, + { + "epoch": 0.86, + "grad_norm": 12.357397079467773, + "learning_rate": 1.70851209885018e-05, + "loss": 0.6796, + "step": 5017 + }, + { + "epoch": 0.86, + "grad_norm": 14.00622844696045, + "learning_rate": 1.7082546765059206e-05, + "loss": 0.8675, + "step": 5018 + }, + { + "epoch": 0.86, + "grad_norm": 11.33556079864502, + "learning_rate": 1.7079972541616613e-05, + "loss": 0.6171, + "step": 5019 + }, + { + "epoch": 0.86, + "grad_norm": 10.016263008117676, + "learning_rate": 1.707739831817402e-05, + "loss": 0.6974, + "step": 5020 + }, + { + "epoch": 0.86, + "grad_norm": 13.643027305603027, + "learning_rate": 1.7074824094731423e-05, + "loss": 1.0009, + "step": 5021 + }, + { + "epoch": 0.86, + "grad_norm": 11.957036018371582, + "learning_rate": 1.707224987128883e-05, + "loss": 1.0078, + "step": 5022 + }, + { + "epoch": 0.86, + "grad_norm": 9.904595375061035, + "learning_rate": 1.7069675647846233e-05, + "loss": 0.6725, + "step": 5023 + }, + { + "epoch": 0.86, + "grad_norm": 9.033756256103516, + "learning_rate": 1.706710142440364e-05, + "loss": 0.5452, + "step": 5024 + }, + { + "epoch": 0.86, + "grad_norm": 11.128388404846191, + "learning_rate": 1.7064527200961043e-05, + "loss": 0.796, + "step": 5025 + }, + { + "epoch": 0.86, + "grad_norm": 10.647265434265137, + "learning_rate": 1.7061952977518446e-05, + "loss": 0.7683, + "step": 5026 + }, + { + "epoch": 0.86, + "grad_norm": 9.176582336425781, + "learning_rate": 1.7059378754075853e-05, + "loss": 0.6503, + "step": 5027 + }, + { + "epoch": 0.86, + "grad_norm": 9.657790184020996, + "learning_rate": 1.705680453063326e-05, + "loss": 0.9437, + "step": 5028 + }, + { + "epoch": 0.86, + "grad_norm": 9.837785720825195, + "learning_rate": 1.7054230307190666e-05, + "loss": 0.6766, + "step": 5029 + }, + { + "epoch": 0.86, + "grad_norm": 11.116557121276855, + "learning_rate": 1.705165608374807e-05, + "loss": 0.8057, + "step": 5030 + }, + { + "epoch": 0.86, + "grad_norm": 10.438505172729492, + "learning_rate": 1.7049081860305476e-05, + "loss": 0.6892, + "step": 5031 + }, + { + "epoch": 0.86, + "grad_norm": 11.117741584777832, + "learning_rate": 1.704650763686288e-05, + "loss": 0.7923, + "step": 5032 + }, + { + "epoch": 0.86, + "grad_norm": 9.207502365112305, + "learning_rate": 1.7043933413420286e-05, + "loss": 0.7875, + "step": 5033 + }, + { + "epoch": 0.86, + "grad_norm": 10.58642578125, + "learning_rate": 1.704135918997769e-05, + "loss": 0.6887, + "step": 5034 + }, + { + "epoch": 0.86, + "grad_norm": 10.861223220825195, + "learning_rate": 1.7038784966535096e-05, + "loss": 0.7291, + "step": 5035 + }, + { + "epoch": 0.86, + "grad_norm": 9.419525146484375, + "learning_rate": 1.70362107430925e-05, + "loss": 0.7209, + "step": 5036 + }, + { + "epoch": 0.86, + "grad_norm": 9.348834037780762, + "learning_rate": 1.7033636519649903e-05, + "loss": 0.8206, + "step": 5037 + }, + { + "epoch": 0.86, + "grad_norm": 11.353058815002441, + "learning_rate": 1.7031062296207313e-05, + "loss": 0.91, + "step": 5038 + }, + { + "epoch": 0.86, + "grad_norm": 9.92310905456543, + "learning_rate": 1.7028488072764716e-05, + "loss": 0.9756, + "step": 5039 + }, + { + "epoch": 0.86, + "grad_norm": 11.245994567871094, + "learning_rate": 1.7025913849322123e-05, + "loss": 0.7373, + "step": 5040 + }, + { + "epoch": 0.87, + "grad_norm": 12.201173782348633, + "learning_rate": 1.7023339625879526e-05, + "loss": 1.0927, + "step": 5041 + }, + { + "epoch": 0.87, + "grad_norm": 11.186721801757812, + "learning_rate": 1.7020765402436933e-05, + "loss": 0.9485, + "step": 5042 + }, + { + "epoch": 0.87, + "grad_norm": 8.173857688903809, + "learning_rate": 1.7018191178994336e-05, + "loss": 0.7514, + "step": 5043 + }, + { + "epoch": 0.87, + "grad_norm": 10.36408519744873, + "learning_rate": 1.7015616955551743e-05, + "loss": 0.7456, + "step": 5044 + }, + { + "epoch": 0.87, + "grad_norm": 8.549567222595215, + "learning_rate": 1.7013042732109146e-05, + "loss": 0.656, + "step": 5045 + }, + { + "epoch": 0.87, + "grad_norm": 10.760339736938477, + "learning_rate": 1.7010468508666553e-05, + "loss": 0.7951, + "step": 5046 + }, + { + "epoch": 0.87, + "grad_norm": 11.459860801696777, + "learning_rate": 1.700789428522396e-05, + "loss": 0.678, + "step": 5047 + }, + { + "epoch": 0.87, + "grad_norm": 8.143653869628906, + "learning_rate": 1.7005320061781363e-05, + "loss": 0.5365, + "step": 5048 + }, + { + "epoch": 0.87, + "grad_norm": 9.60336971282959, + "learning_rate": 1.700274583833877e-05, + "loss": 0.719, + "step": 5049 + }, + { + "epoch": 0.87, + "grad_norm": 10.393378257751465, + "learning_rate": 1.7000171614896173e-05, + "loss": 0.7732, + "step": 5050 + }, + { + "epoch": 0.87, + "grad_norm": 10.6954345703125, + "learning_rate": 1.699759739145358e-05, + "loss": 0.6829, + "step": 5051 + }, + { + "epoch": 0.87, + "grad_norm": 11.231731414794922, + "learning_rate": 1.6995023168010983e-05, + "loss": 0.7362, + "step": 5052 + }, + { + "epoch": 0.87, + "grad_norm": 7.381374359130859, + "learning_rate": 1.699244894456839e-05, + "loss": 0.4342, + "step": 5053 + }, + { + "epoch": 0.87, + "grad_norm": 9.06167221069336, + "learning_rate": 1.6989874721125793e-05, + "loss": 0.7154, + "step": 5054 + }, + { + "epoch": 0.87, + "grad_norm": 9.840649604797363, + "learning_rate": 1.69873004976832e-05, + "loss": 0.6862, + "step": 5055 + }, + { + "epoch": 0.87, + "grad_norm": 12.552257537841797, + "learning_rate": 1.6984726274240603e-05, + "loss": 0.9419, + "step": 5056 + }, + { + "epoch": 0.87, + "grad_norm": 11.155996322631836, + "learning_rate": 1.698215205079801e-05, + "loss": 0.9167, + "step": 5057 + }, + { + "epoch": 0.87, + "grad_norm": 12.930998802185059, + "learning_rate": 1.6979577827355416e-05, + "loss": 0.8857, + "step": 5058 + }, + { + "epoch": 0.87, + "grad_norm": 9.22996997833252, + "learning_rate": 1.697700360391282e-05, + "loss": 0.5786, + "step": 5059 + }, + { + "epoch": 0.87, + "grad_norm": 9.228675842285156, + "learning_rate": 1.6974429380470226e-05, + "loss": 0.7936, + "step": 5060 + }, + { + "epoch": 0.87, + "grad_norm": 11.768718719482422, + "learning_rate": 1.697185515702763e-05, + "loss": 0.8322, + "step": 5061 + }, + { + "epoch": 0.87, + "grad_norm": 9.743788719177246, + "learning_rate": 1.6969280933585036e-05, + "loss": 0.7182, + "step": 5062 + }, + { + "epoch": 0.87, + "grad_norm": 10.028605461120605, + "learning_rate": 1.696670671014244e-05, + "loss": 0.9524, + "step": 5063 + }, + { + "epoch": 0.87, + "grad_norm": 13.297452926635742, + "learning_rate": 1.6964132486699846e-05, + "loss": 0.9374, + "step": 5064 + }, + { + "epoch": 0.87, + "grad_norm": 11.017545700073242, + "learning_rate": 1.696155826325725e-05, + "loss": 0.751, + "step": 5065 + }, + { + "epoch": 0.87, + "grad_norm": 10.493666648864746, + "learning_rate": 1.695898403981466e-05, + "loss": 0.6565, + "step": 5066 + }, + { + "epoch": 0.87, + "grad_norm": 11.744612693786621, + "learning_rate": 1.6956409816372063e-05, + "loss": 0.7618, + "step": 5067 + }, + { + "epoch": 0.87, + "grad_norm": 11.950468063354492, + "learning_rate": 1.6953835592929466e-05, + "loss": 0.8004, + "step": 5068 + }, + { + "epoch": 0.87, + "grad_norm": 9.73985767364502, + "learning_rate": 1.6951261369486873e-05, + "loss": 0.7173, + "step": 5069 + }, + { + "epoch": 0.87, + "grad_norm": 10.84887981414795, + "learning_rate": 1.6948687146044276e-05, + "loss": 0.7528, + "step": 5070 + }, + { + "epoch": 0.87, + "grad_norm": 10.326107025146484, + "learning_rate": 1.6946112922601683e-05, + "loss": 0.6849, + "step": 5071 + }, + { + "epoch": 0.87, + "grad_norm": 8.734704971313477, + "learning_rate": 1.6943538699159086e-05, + "loss": 0.5548, + "step": 5072 + }, + { + "epoch": 0.87, + "grad_norm": 11.152992248535156, + "learning_rate": 1.6940964475716493e-05, + "loss": 0.9213, + "step": 5073 + }, + { + "epoch": 0.87, + "grad_norm": 10.008866310119629, + "learning_rate": 1.6938390252273896e-05, + "loss": 0.7313, + "step": 5074 + }, + { + "epoch": 0.87, + "grad_norm": 10.599018096923828, + "learning_rate": 1.6935816028831306e-05, + "loss": 0.7501, + "step": 5075 + }, + { + "epoch": 0.87, + "grad_norm": 12.486008644104004, + "learning_rate": 1.693324180538871e-05, + "loss": 0.7916, + "step": 5076 + }, + { + "epoch": 0.87, + "grad_norm": 8.657783508300781, + "learning_rate": 1.6930667581946116e-05, + "loss": 0.6116, + "step": 5077 + }, + { + "epoch": 0.87, + "grad_norm": 8.07896614074707, + "learning_rate": 1.692809335850352e-05, + "loss": 0.6275, + "step": 5078 + }, + { + "epoch": 0.87, + "grad_norm": 9.535330772399902, + "learning_rate": 1.6925519135060922e-05, + "loss": 0.8092, + "step": 5079 + }, + { + "epoch": 0.87, + "grad_norm": 11.815303802490234, + "learning_rate": 1.692294491161833e-05, + "loss": 0.9915, + "step": 5080 + }, + { + "epoch": 0.87, + "grad_norm": 9.145001411437988, + "learning_rate": 1.6920370688175732e-05, + "loss": 0.6142, + "step": 5081 + }, + { + "epoch": 0.87, + "grad_norm": 11.141441345214844, + "learning_rate": 1.691779646473314e-05, + "loss": 0.9283, + "step": 5082 + }, + { + "epoch": 0.87, + "grad_norm": 11.250448226928711, + "learning_rate": 1.6915222241290542e-05, + "loss": 0.9554, + "step": 5083 + }, + { + "epoch": 0.87, + "grad_norm": 8.658493995666504, + "learning_rate": 1.691264801784795e-05, + "loss": 0.5825, + "step": 5084 + }, + { + "epoch": 0.87, + "grad_norm": 10.949767112731934, + "learning_rate": 1.6910073794405356e-05, + "loss": 0.6711, + "step": 5085 + }, + { + "epoch": 0.87, + "grad_norm": 8.661419868469238, + "learning_rate": 1.6907499570962762e-05, + "loss": 0.6141, + "step": 5086 + }, + { + "epoch": 0.87, + "grad_norm": 10.766616821289062, + "learning_rate": 1.6904925347520166e-05, + "loss": 0.8972, + "step": 5087 + }, + { + "epoch": 0.87, + "grad_norm": 9.832223892211914, + "learning_rate": 1.6902351124077572e-05, + "loss": 0.5916, + "step": 5088 + }, + { + "epoch": 0.87, + "grad_norm": 8.83588695526123, + "learning_rate": 1.6899776900634976e-05, + "loss": 0.4926, + "step": 5089 + }, + { + "epoch": 0.87, + "grad_norm": 10.589808464050293, + "learning_rate": 1.689720267719238e-05, + "loss": 0.9012, + "step": 5090 + }, + { + "epoch": 0.87, + "grad_norm": 9.086411476135254, + "learning_rate": 1.6894628453749786e-05, + "loss": 0.8289, + "step": 5091 + }, + { + "epoch": 0.87, + "grad_norm": 10.400313377380371, + "learning_rate": 1.689205423030719e-05, + "loss": 0.7144, + "step": 5092 + }, + { + "epoch": 0.87, + "grad_norm": 11.67238712310791, + "learning_rate": 1.6889480006864596e-05, + "loss": 0.6807, + "step": 5093 + }, + { + "epoch": 0.87, + "grad_norm": 9.829276084899902, + "learning_rate": 1.6886905783422002e-05, + "loss": 0.5496, + "step": 5094 + }, + { + "epoch": 0.87, + "grad_norm": 9.994205474853516, + "learning_rate": 1.688433155997941e-05, + "loss": 0.5911, + "step": 5095 + }, + { + "epoch": 0.87, + "grad_norm": 13.82376480102539, + "learning_rate": 1.6881757336536812e-05, + "loss": 0.9673, + "step": 5096 + }, + { + "epoch": 0.87, + "grad_norm": 10.390604019165039, + "learning_rate": 1.687918311309422e-05, + "loss": 0.7123, + "step": 5097 + }, + { + "epoch": 0.87, + "grad_norm": 9.539375305175781, + "learning_rate": 1.6876608889651622e-05, + "loss": 0.6047, + "step": 5098 + }, + { + "epoch": 0.88, + "grad_norm": 9.492993354797363, + "learning_rate": 1.6874034666209026e-05, + "loss": 0.8519, + "step": 5099 + }, + { + "epoch": 0.88, + "grad_norm": 10.150948524475098, + "learning_rate": 1.6871460442766432e-05, + "loss": 0.8469, + "step": 5100 + }, + { + "epoch": 0.88, + "grad_norm": 15.869233131408691, + "learning_rate": 1.6868886219323836e-05, + "loss": 0.5796, + "step": 5101 + }, + { + "epoch": 0.88, + "grad_norm": 10.846354484558105, + "learning_rate": 1.6866311995881242e-05, + "loss": 0.8305, + "step": 5102 + }, + { + "epoch": 0.88, + "grad_norm": 10.064055442810059, + "learning_rate": 1.6863737772438646e-05, + "loss": 0.8497, + "step": 5103 + }, + { + "epoch": 0.88, + "grad_norm": 9.882604598999023, + "learning_rate": 1.6861163548996056e-05, + "loss": 0.4962, + "step": 5104 + }, + { + "epoch": 0.88, + "grad_norm": 10.37267017364502, + "learning_rate": 1.685858932555346e-05, + "loss": 0.8019, + "step": 5105 + }, + { + "epoch": 0.88, + "grad_norm": 10.571423530578613, + "learning_rate": 1.6856015102110866e-05, + "loss": 0.6557, + "step": 5106 + }, + { + "epoch": 0.88, + "grad_norm": 10.922707557678223, + "learning_rate": 1.685344087866827e-05, + "loss": 0.6606, + "step": 5107 + }, + { + "epoch": 0.88, + "grad_norm": 10.683191299438477, + "learning_rate": 1.6850866655225676e-05, + "loss": 0.7353, + "step": 5108 + }, + { + "epoch": 0.88, + "grad_norm": 11.57392406463623, + "learning_rate": 1.684829243178308e-05, + "loss": 0.8121, + "step": 5109 + }, + { + "epoch": 0.88, + "grad_norm": 11.889533996582031, + "learning_rate": 1.6845718208340482e-05, + "loss": 0.6311, + "step": 5110 + }, + { + "epoch": 0.88, + "grad_norm": 8.42176342010498, + "learning_rate": 1.684314398489789e-05, + "loss": 0.6659, + "step": 5111 + }, + { + "epoch": 0.88, + "grad_norm": 10.499669075012207, + "learning_rate": 1.6840569761455292e-05, + "loss": 0.59, + "step": 5112 + }, + { + "epoch": 0.88, + "grad_norm": 10.916189193725586, + "learning_rate": 1.6837995538012702e-05, + "loss": 0.7427, + "step": 5113 + }, + { + "epoch": 0.88, + "grad_norm": 10.4607572555542, + "learning_rate": 1.6835421314570105e-05, + "loss": 0.7307, + "step": 5114 + }, + { + "epoch": 0.88, + "grad_norm": 12.490965843200684, + "learning_rate": 1.6832847091127512e-05, + "loss": 0.9674, + "step": 5115 + }, + { + "epoch": 0.88, + "grad_norm": 11.16115951538086, + "learning_rate": 1.6830272867684915e-05, + "loss": 0.678, + "step": 5116 + }, + { + "epoch": 0.88, + "grad_norm": 8.330124855041504, + "learning_rate": 1.6827698644242322e-05, + "loss": 0.4714, + "step": 5117 + }, + { + "epoch": 0.88, + "grad_norm": 13.619162559509277, + "learning_rate": 1.6825124420799725e-05, + "loss": 0.7703, + "step": 5118 + }, + { + "epoch": 0.88, + "grad_norm": 9.129340171813965, + "learning_rate": 1.6822550197357132e-05, + "loss": 0.7886, + "step": 5119 + }, + { + "epoch": 0.88, + "grad_norm": 12.748601913452148, + "learning_rate": 1.6819975973914535e-05, + "loss": 0.8321, + "step": 5120 + }, + { + "epoch": 0.88, + "grad_norm": 10.726327896118164, + "learning_rate": 1.681740175047194e-05, + "loss": 0.744, + "step": 5121 + }, + { + "epoch": 0.88, + "grad_norm": 11.06992244720459, + "learning_rate": 1.6814827527029345e-05, + "loss": 0.8202, + "step": 5122 + }, + { + "epoch": 0.88, + "grad_norm": 10.089544296264648, + "learning_rate": 1.6812253303586752e-05, + "loss": 0.6877, + "step": 5123 + }, + { + "epoch": 0.88, + "grad_norm": 9.7350435256958, + "learning_rate": 1.680967908014416e-05, + "loss": 0.5948, + "step": 5124 + }, + { + "epoch": 0.88, + "grad_norm": 9.103888511657715, + "learning_rate": 1.6807104856701562e-05, + "loss": 0.5181, + "step": 5125 + }, + { + "epoch": 0.88, + "grad_norm": 10.345788955688477, + "learning_rate": 1.680453063325897e-05, + "loss": 0.5736, + "step": 5126 + }, + { + "epoch": 0.88, + "grad_norm": 9.09771728515625, + "learning_rate": 1.6801956409816372e-05, + "loss": 0.4244, + "step": 5127 + }, + { + "epoch": 0.88, + "grad_norm": 10.414392471313477, + "learning_rate": 1.679938218637378e-05, + "loss": 0.6431, + "step": 5128 + }, + { + "epoch": 0.88, + "grad_norm": 9.769451141357422, + "learning_rate": 1.6796807962931182e-05, + "loss": 0.7522, + "step": 5129 + }, + { + "epoch": 0.88, + "grad_norm": 7.722218990325928, + "learning_rate": 1.6794233739488585e-05, + "loss": 0.4948, + "step": 5130 + }, + { + "epoch": 0.88, + "grad_norm": 11.459582328796387, + "learning_rate": 1.6791659516045992e-05, + "loss": 0.788, + "step": 5131 + }, + { + "epoch": 0.88, + "grad_norm": 11.663519859313965, + "learning_rate": 1.67890852926034e-05, + "loss": 0.7317, + "step": 5132 + }, + { + "epoch": 0.88, + "grad_norm": 11.588348388671875, + "learning_rate": 1.6786511069160805e-05, + "loss": 0.7427, + "step": 5133 + }, + { + "epoch": 0.88, + "grad_norm": 10.5723876953125, + "learning_rate": 1.678393684571821e-05, + "loss": 0.8075, + "step": 5134 + }, + { + "epoch": 0.88, + "grad_norm": 12.216257095336914, + "learning_rate": 1.6781362622275615e-05, + "loss": 0.6408, + "step": 5135 + }, + { + "epoch": 0.88, + "grad_norm": 9.186063766479492, + "learning_rate": 1.677878839883302e-05, + "loss": 0.499, + "step": 5136 + }, + { + "epoch": 0.88, + "grad_norm": 9.45901107788086, + "learning_rate": 1.6776214175390425e-05, + "loss": 0.6864, + "step": 5137 + }, + { + "epoch": 0.88, + "grad_norm": 10.202424049377441, + "learning_rate": 1.677363995194783e-05, + "loss": 0.6882, + "step": 5138 + }, + { + "epoch": 0.88, + "grad_norm": 12.158854484558105, + "learning_rate": 1.6771065728505235e-05, + "loss": 0.8581, + "step": 5139 + }, + { + "epoch": 0.88, + "grad_norm": 8.483059883117676, + "learning_rate": 1.676849150506264e-05, + "loss": 0.6036, + "step": 5140 + }, + { + "epoch": 0.88, + "grad_norm": 12.016020774841309, + "learning_rate": 1.6765917281620042e-05, + "loss": 0.8377, + "step": 5141 + }, + { + "epoch": 0.88, + "grad_norm": 11.223566055297852, + "learning_rate": 1.6763343058177452e-05, + "loss": 0.7415, + "step": 5142 + }, + { + "epoch": 0.88, + "grad_norm": 10.837448120117188, + "learning_rate": 1.6760768834734855e-05, + "loss": 0.459, + "step": 5143 + }, + { + "epoch": 0.88, + "grad_norm": 12.283288955688477, + "learning_rate": 1.6758194611292262e-05, + "loss": 1.038, + "step": 5144 + }, + { + "epoch": 0.88, + "grad_norm": 12.15881633758545, + "learning_rate": 1.6755620387849665e-05, + "loss": 0.6349, + "step": 5145 + }, + { + "epoch": 0.88, + "grad_norm": 10.535287857055664, + "learning_rate": 1.6753046164407072e-05, + "loss": 0.688, + "step": 5146 + }, + { + "epoch": 0.88, + "grad_norm": 14.299918174743652, + "learning_rate": 1.6750471940964475e-05, + "loss": 1.0001, + "step": 5147 + }, + { + "epoch": 0.88, + "grad_norm": 10.1094970703125, + "learning_rate": 1.6747897717521882e-05, + "loss": 0.7172, + "step": 5148 + }, + { + "epoch": 0.88, + "grad_norm": 8.257475852966309, + "learning_rate": 1.6745323494079285e-05, + "loss": 0.544, + "step": 5149 + }, + { + "epoch": 0.88, + "grad_norm": 10.434012413024902, + "learning_rate": 1.6742749270636692e-05, + "loss": 0.5604, + "step": 5150 + }, + { + "epoch": 0.88, + "grad_norm": 8.975931167602539, + "learning_rate": 1.67401750471941e-05, + "loss": 0.8215, + "step": 5151 + }, + { + "epoch": 0.88, + "grad_norm": 10.961003303527832, + "learning_rate": 1.6737600823751502e-05, + "loss": 0.9019, + "step": 5152 + }, + { + "epoch": 0.88, + "grad_norm": 9.639623641967773, + "learning_rate": 1.673502660030891e-05, + "loss": 0.7083, + "step": 5153 + }, + { + "epoch": 0.88, + "grad_norm": 9.64686107635498, + "learning_rate": 1.6732452376866312e-05, + "loss": 0.6626, + "step": 5154 + }, + { + "epoch": 0.88, + "grad_norm": 10.169412612915039, + "learning_rate": 1.672987815342372e-05, + "loss": 0.7328, + "step": 5155 + }, + { + "epoch": 0.88, + "grad_norm": 8.250401496887207, + "learning_rate": 1.6727303929981122e-05, + "loss": 0.5575, + "step": 5156 + }, + { + "epoch": 0.89, + "grad_norm": 11.876055717468262, + "learning_rate": 1.672472970653853e-05, + "loss": 0.7575, + "step": 5157 + }, + { + "epoch": 0.89, + "grad_norm": 9.425901412963867, + "learning_rate": 1.6722155483095932e-05, + "loss": 0.6735, + "step": 5158 + }, + { + "epoch": 0.89, + "grad_norm": 8.946558952331543, + "learning_rate": 1.671958125965334e-05, + "loss": 0.4543, + "step": 5159 + }, + { + "epoch": 0.89, + "grad_norm": 12.222345352172852, + "learning_rate": 1.671700703621074e-05, + "loss": 0.8022, + "step": 5160 + }, + { + "epoch": 0.89, + "grad_norm": 10.4634428024292, + "learning_rate": 1.671443281276815e-05, + "loss": 0.7809, + "step": 5161 + }, + { + "epoch": 0.89, + "grad_norm": 12.227447509765625, + "learning_rate": 1.6711858589325555e-05, + "loss": 0.8119, + "step": 5162 + }, + { + "epoch": 0.89, + "grad_norm": 11.55435848236084, + "learning_rate": 1.670928436588296e-05, + "loss": 0.8565, + "step": 5163 + }, + { + "epoch": 0.89, + "grad_norm": 15.997054100036621, + "learning_rate": 1.6706710142440365e-05, + "loss": 1.0876, + "step": 5164 + }, + { + "epoch": 0.89, + "grad_norm": 10.246514320373535, + "learning_rate": 1.670413591899777e-05, + "loss": 0.7111, + "step": 5165 + }, + { + "epoch": 0.89, + "grad_norm": 10.147298812866211, + "learning_rate": 1.6701561695555175e-05, + "loss": 0.667, + "step": 5166 + }, + { + "epoch": 0.89, + "grad_norm": 12.040410041809082, + "learning_rate": 1.6698987472112578e-05, + "loss": 0.8043, + "step": 5167 + }, + { + "epoch": 0.89, + "grad_norm": 11.274188041687012, + "learning_rate": 1.6696413248669985e-05, + "loss": 0.5311, + "step": 5168 + }, + { + "epoch": 0.89, + "grad_norm": 12.591151237487793, + "learning_rate": 1.6693839025227388e-05, + "loss": 0.8394, + "step": 5169 + }, + { + "epoch": 0.89, + "grad_norm": 14.305787086486816, + "learning_rate": 1.66912648017848e-05, + "loss": 0.8099, + "step": 5170 + }, + { + "epoch": 0.89, + "grad_norm": 12.254986763000488, + "learning_rate": 1.66886905783422e-05, + "loss": 0.7154, + "step": 5171 + }, + { + "epoch": 0.89, + "grad_norm": 11.92106819152832, + "learning_rate": 1.6686116354899605e-05, + "loss": 0.8145, + "step": 5172 + }, + { + "epoch": 0.89, + "grad_norm": 11.297516822814941, + "learning_rate": 1.668354213145701e-05, + "loss": 0.7931, + "step": 5173 + }, + { + "epoch": 0.89, + "grad_norm": 10.657331466674805, + "learning_rate": 1.6680967908014415e-05, + "loss": 0.7878, + "step": 5174 + }, + { + "epoch": 0.89, + "grad_norm": 10.934943199157715, + "learning_rate": 1.667839368457182e-05, + "loss": 0.7537, + "step": 5175 + }, + { + "epoch": 0.89, + "grad_norm": 13.230857849121094, + "learning_rate": 1.6675819461129225e-05, + "loss": 0.7391, + "step": 5176 + }, + { + "epoch": 0.89, + "grad_norm": 11.595080375671387, + "learning_rate": 1.667324523768663e-05, + "loss": 0.62, + "step": 5177 + }, + { + "epoch": 0.89, + "grad_norm": 12.526241302490234, + "learning_rate": 1.6670671014244035e-05, + "loss": 0.9329, + "step": 5178 + }, + { + "epoch": 0.89, + "grad_norm": 8.58484935760498, + "learning_rate": 1.666809679080144e-05, + "loss": 0.5699, + "step": 5179 + }, + { + "epoch": 0.89, + "grad_norm": 10.413646697998047, + "learning_rate": 1.6665522567358848e-05, + "loss": 0.7794, + "step": 5180 + }, + { + "epoch": 0.89, + "grad_norm": 11.08458423614502, + "learning_rate": 1.6662948343916255e-05, + "loss": 0.7778, + "step": 5181 + }, + { + "epoch": 0.89, + "grad_norm": 10.542162895202637, + "learning_rate": 1.6660374120473658e-05, + "loss": 0.6902, + "step": 5182 + }, + { + "epoch": 0.89, + "grad_norm": 10.577092170715332, + "learning_rate": 1.665779989703106e-05, + "loss": 0.6979, + "step": 5183 + }, + { + "epoch": 0.89, + "grad_norm": 11.598038673400879, + "learning_rate": 1.6655225673588468e-05, + "loss": 0.8171, + "step": 5184 + }, + { + "epoch": 0.89, + "grad_norm": 14.036426544189453, + "learning_rate": 1.665265145014587e-05, + "loss": 1.0667, + "step": 5185 + }, + { + "epoch": 0.89, + "grad_norm": 11.628640174865723, + "learning_rate": 1.6650077226703278e-05, + "loss": 0.9543, + "step": 5186 + }, + { + "epoch": 0.89, + "grad_norm": 8.614828109741211, + "learning_rate": 1.664750300326068e-05, + "loss": 0.6174, + "step": 5187 + }, + { + "epoch": 0.89, + "grad_norm": 10.248889923095703, + "learning_rate": 1.6644928779818088e-05, + "loss": 0.8864, + "step": 5188 + }, + { + "epoch": 0.89, + "grad_norm": 9.222512245178223, + "learning_rate": 1.6642354556375495e-05, + "loss": 0.7374, + "step": 5189 + }, + { + "epoch": 0.89, + "grad_norm": 8.618529319763184, + "learning_rate": 1.66397803329329e-05, + "loss": 0.7824, + "step": 5190 + }, + { + "epoch": 0.89, + "grad_norm": 10.544220924377441, + "learning_rate": 1.6637206109490305e-05, + "loss": 0.7273, + "step": 5191 + }, + { + "epoch": 0.89, + "grad_norm": 10.569472312927246, + "learning_rate": 1.663463188604771e-05, + "loss": 0.7878, + "step": 5192 + }, + { + "epoch": 0.89, + "grad_norm": 11.192878723144531, + "learning_rate": 1.6632057662605115e-05, + "loss": 0.7754, + "step": 5193 + }, + { + "epoch": 0.89, + "grad_norm": 10.241219520568848, + "learning_rate": 1.6629483439162518e-05, + "loss": 0.7687, + "step": 5194 + }, + { + "epoch": 0.89, + "grad_norm": 11.386351585388184, + "learning_rate": 1.6626909215719925e-05, + "loss": 0.9269, + "step": 5195 + }, + { + "epoch": 0.89, + "grad_norm": 11.048517227172852, + "learning_rate": 1.6624334992277328e-05, + "loss": 0.9212, + "step": 5196 + }, + { + "epoch": 0.89, + "grad_norm": 7.806606769561768, + "learning_rate": 1.6621760768834735e-05, + "loss": 0.3998, + "step": 5197 + }, + { + "epoch": 0.89, + "grad_norm": 10.608366966247559, + "learning_rate": 1.661918654539214e-05, + "loss": 0.7697, + "step": 5198 + }, + { + "epoch": 0.89, + "grad_norm": 9.722428321838379, + "learning_rate": 1.6616612321949548e-05, + "loss": 0.7826, + "step": 5199 + }, + { + "epoch": 0.89, + "grad_norm": 11.099624633789062, + "learning_rate": 1.661403809850695e-05, + "loss": 0.7108, + "step": 5200 + }, + { + "epoch": 0.89, + "grad_norm": 7.431052207946777, + "learning_rate": 1.6611463875064358e-05, + "loss": 0.6995, + "step": 5201 + }, + { + "epoch": 0.89, + "grad_norm": 10.23111343383789, + "learning_rate": 1.660888965162176e-05, + "loss": 0.8535, + "step": 5202 + }, + { + "epoch": 0.89, + "grad_norm": 11.609075546264648, + "learning_rate": 1.6606315428179165e-05, + "loss": 0.6788, + "step": 5203 + }, + { + "epoch": 0.89, + "grad_norm": 14.281352996826172, + "learning_rate": 1.660374120473657e-05, + "loss": 0.8985, + "step": 5204 + }, + { + "epoch": 0.89, + "grad_norm": 11.516215324401855, + "learning_rate": 1.6601166981293975e-05, + "loss": 0.7634, + "step": 5205 + }, + { + "epoch": 0.89, + "grad_norm": 9.74077033996582, + "learning_rate": 1.659859275785138e-05, + "loss": 0.8806, + "step": 5206 + }, + { + "epoch": 0.89, + "grad_norm": 8.645482063293457, + "learning_rate": 1.6596018534408785e-05, + "loss": 0.5675, + "step": 5207 + }, + { + "epoch": 0.89, + "grad_norm": 9.549633979797363, + "learning_rate": 1.6593444310966195e-05, + "loss": 0.7807, + "step": 5208 + }, + { + "epoch": 0.89, + "grad_norm": 11.692028999328613, + "learning_rate": 1.6590870087523598e-05, + "loss": 0.8201, + "step": 5209 + }, + { + "epoch": 0.89, + "grad_norm": 11.625863075256348, + "learning_rate": 1.6588295864081005e-05, + "loss": 0.7049, + "step": 5210 + }, + { + "epoch": 0.89, + "grad_norm": 10.592692375183105, + "learning_rate": 1.6585721640638408e-05, + "loss": 0.7609, + "step": 5211 + }, + { + "epoch": 0.89, + "grad_norm": 11.667631149291992, + "learning_rate": 1.6583147417195815e-05, + "loss": 0.9495, + "step": 5212 + }, + { + "epoch": 0.89, + "grad_norm": 12.62667179107666, + "learning_rate": 1.6580573193753218e-05, + "loss": 0.5782, + "step": 5213 + }, + { + "epoch": 0.89, + "grad_norm": 10.771811485290527, + "learning_rate": 1.657799897031062e-05, + "loss": 0.6552, + "step": 5214 + }, + { + "epoch": 0.89, + "grad_norm": 10.704256057739258, + "learning_rate": 1.6575424746868028e-05, + "loss": 0.7867, + "step": 5215 + }, + { + "epoch": 0.9, + "grad_norm": 8.04047966003418, + "learning_rate": 1.657285052342543e-05, + "loss": 0.6639, + "step": 5216 + }, + { + "epoch": 0.9, + "grad_norm": 9.500970840454102, + "learning_rate": 1.657027629998284e-05, + "loss": 0.6785, + "step": 5217 + }, + { + "epoch": 0.9, + "grad_norm": 9.975128173828125, + "learning_rate": 1.6567702076540245e-05, + "loss": 0.6367, + "step": 5218 + }, + { + "epoch": 0.9, + "grad_norm": 10.489596366882324, + "learning_rate": 1.656512785309765e-05, + "loss": 0.7099, + "step": 5219 + }, + { + "epoch": 0.9, + "grad_norm": 11.5138521194458, + "learning_rate": 1.6562553629655054e-05, + "loss": 0.6627, + "step": 5220 + }, + { + "epoch": 0.9, + "grad_norm": 8.358445167541504, + "learning_rate": 1.655997940621246e-05, + "loss": 0.5574, + "step": 5221 + }, + { + "epoch": 0.9, + "grad_norm": 10.792037963867188, + "learning_rate": 1.6557405182769864e-05, + "loss": 0.7517, + "step": 5222 + }, + { + "epoch": 0.9, + "grad_norm": 10.092707633972168, + "learning_rate": 1.655483095932727e-05, + "loss": 0.6959, + "step": 5223 + }, + { + "epoch": 0.9, + "grad_norm": 8.377098083496094, + "learning_rate": 1.6552256735884674e-05, + "loss": 0.5483, + "step": 5224 + }, + { + "epoch": 0.9, + "grad_norm": 11.945281982421875, + "learning_rate": 1.6549682512442078e-05, + "loss": 0.8506, + "step": 5225 + }, + { + "epoch": 0.9, + "grad_norm": 10.849797248840332, + "learning_rate": 1.6547108288999484e-05, + "loss": 0.7787, + "step": 5226 + }, + { + "epoch": 0.9, + "grad_norm": 10.040619850158691, + "learning_rate": 1.654453406555689e-05, + "loss": 0.7812, + "step": 5227 + }, + { + "epoch": 0.9, + "grad_norm": 10.434410095214844, + "learning_rate": 1.6541959842114298e-05, + "loss": 0.742, + "step": 5228 + }, + { + "epoch": 0.9, + "grad_norm": 12.027389526367188, + "learning_rate": 1.65393856186717e-05, + "loss": 0.6401, + "step": 5229 + }, + { + "epoch": 0.9, + "grad_norm": 11.295379638671875, + "learning_rate": 1.6536811395229108e-05, + "loss": 0.6541, + "step": 5230 + }, + { + "epoch": 0.9, + "grad_norm": 8.087778091430664, + "learning_rate": 1.653423717178651e-05, + "loss": 0.4963, + "step": 5231 + }, + { + "epoch": 0.9, + "grad_norm": 10.502301216125488, + "learning_rate": 1.6531662948343918e-05, + "loss": 0.8584, + "step": 5232 + }, + { + "epoch": 0.9, + "grad_norm": 9.659680366516113, + "learning_rate": 1.652908872490132e-05, + "loss": 0.7465, + "step": 5233 + }, + { + "epoch": 0.9, + "grad_norm": 11.165035247802734, + "learning_rate": 1.6526514501458728e-05, + "loss": 0.6075, + "step": 5234 + }, + { + "epoch": 0.9, + "grad_norm": 9.327047348022461, + "learning_rate": 1.652394027801613e-05, + "loss": 0.8386, + "step": 5235 + }, + { + "epoch": 0.9, + "grad_norm": 9.365690231323242, + "learning_rate": 1.6521366054573538e-05, + "loss": 0.6087, + "step": 5236 + }, + { + "epoch": 0.9, + "grad_norm": 8.67734146118164, + "learning_rate": 1.6518791831130944e-05, + "loss": 0.6336, + "step": 5237 + }, + { + "epoch": 0.9, + "grad_norm": 11.418550491333008, + "learning_rate": 1.6516217607688348e-05, + "loss": 0.76, + "step": 5238 + }, + { + "epoch": 0.9, + "grad_norm": 9.90666389465332, + "learning_rate": 1.6513643384245754e-05, + "loss": 0.8328, + "step": 5239 + }, + { + "epoch": 0.9, + "grad_norm": 9.265645027160645, + "learning_rate": 1.6511069160803158e-05, + "loss": 0.5108, + "step": 5240 + }, + { + "epoch": 0.9, + "grad_norm": 9.731019020080566, + "learning_rate": 1.6508494937360564e-05, + "loss": 0.7937, + "step": 5241 + }, + { + "epoch": 0.9, + "grad_norm": 11.847784042358398, + "learning_rate": 1.6505920713917968e-05, + "loss": 0.606, + "step": 5242 + }, + { + "epoch": 0.9, + "grad_norm": 11.213117599487305, + "learning_rate": 1.6503346490475374e-05, + "loss": 0.7917, + "step": 5243 + }, + { + "epoch": 0.9, + "grad_norm": 9.141951560974121, + "learning_rate": 1.6500772267032778e-05, + "loss": 0.6665, + "step": 5244 + }, + { + "epoch": 0.9, + "grad_norm": 11.041945457458496, + "learning_rate": 1.649819804359018e-05, + "loss": 0.8224, + "step": 5245 + }, + { + "epoch": 0.9, + "grad_norm": 8.495555877685547, + "learning_rate": 1.649562382014759e-05, + "loss": 0.4823, + "step": 5246 + }, + { + "epoch": 0.9, + "grad_norm": 10.829453468322754, + "learning_rate": 1.6493049596704994e-05, + "loss": 0.6583, + "step": 5247 + }, + { + "epoch": 0.9, + "grad_norm": 8.408878326416016, + "learning_rate": 1.64904753732624e-05, + "loss": 0.492, + "step": 5248 + }, + { + "epoch": 0.9, + "grad_norm": 10.759248733520508, + "learning_rate": 1.6487901149819804e-05, + "loss": 0.722, + "step": 5249 + }, + { + "epoch": 0.9, + "grad_norm": 9.546459197998047, + "learning_rate": 1.648532692637721e-05, + "loss": 0.8778, + "step": 5250 + }, + { + "epoch": 0.9, + "grad_norm": 10.193469047546387, + "learning_rate": 1.6482752702934614e-05, + "loss": 0.8, + "step": 5251 + }, + { + "epoch": 0.9, + "grad_norm": 8.022163391113281, + "learning_rate": 1.648017847949202e-05, + "loss": 0.4692, + "step": 5252 + }, + { + "epoch": 0.9, + "grad_norm": 10.208653450012207, + "learning_rate": 1.6477604256049424e-05, + "loss": 0.5389, + "step": 5253 + }, + { + "epoch": 0.9, + "grad_norm": 9.21313762664795, + "learning_rate": 1.647503003260683e-05, + "loss": 0.7251, + "step": 5254 + }, + { + "epoch": 0.9, + "grad_norm": 9.25330638885498, + "learning_rate": 1.6472455809164238e-05, + "loss": 0.6814, + "step": 5255 + }, + { + "epoch": 0.9, + "grad_norm": 12.095317840576172, + "learning_rate": 1.646988158572164e-05, + "loss": 0.7626, + "step": 5256 + }, + { + "epoch": 0.9, + "grad_norm": 11.305283546447754, + "learning_rate": 1.6467307362279047e-05, + "loss": 0.7507, + "step": 5257 + }, + { + "epoch": 0.9, + "grad_norm": 8.431222915649414, + "learning_rate": 1.646473313883645e-05, + "loss": 0.5146, + "step": 5258 + }, + { + "epoch": 0.9, + "grad_norm": 10.402660369873047, + "learning_rate": 1.6462158915393857e-05, + "loss": 0.7854, + "step": 5259 + }, + { + "epoch": 0.9, + "grad_norm": 12.578250885009766, + "learning_rate": 1.645958469195126e-05, + "loss": 0.8119, + "step": 5260 + }, + { + "epoch": 0.9, + "grad_norm": 12.49059772491455, + "learning_rate": 1.6457010468508667e-05, + "loss": 0.8049, + "step": 5261 + }, + { + "epoch": 0.9, + "grad_norm": 11.794490814208984, + "learning_rate": 1.645443624506607e-05, + "loss": 0.7455, + "step": 5262 + }, + { + "epoch": 0.9, + "grad_norm": 11.36746597290039, + "learning_rate": 1.6451862021623477e-05, + "loss": 0.6569, + "step": 5263 + }, + { + "epoch": 0.9, + "grad_norm": 7.957120418548584, + "learning_rate": 1.644928779818088e-05, + "loss": 0.5979, + "step": 5264 + }, + { + "epoch": 0.9, + "grad_norm": 10.1671781539917, + "learning_rate": 1.6446713574738287e-05, + "loss": 0.7981, + "step": 5265 + }, + { + "epoch": 0.9, + "grad_norm": 10.471668243408203, + "learning_rate": 1.6444139351295694e-05, + "loss": 0.6581, + "step": 5266 + }, + { + "epoch": 0.9, + "grad_norm": 11.272623062133789, + "learning_rate": 1.6441565127853097e-05, + "loss": 0.9694, + "step": 5267 + }, + { + "epoch": 0.9, + "grad_norm": 11.479127883911133, + "learning_rate": 1.6438990904410504e-05, + "loss": 0.6734, + "step": 5268 + }, + { + "epoch": 0.9, + "grad_norm": 10.329911231994629, + "learning_rate": 1.6436416680967907e-05, + "loss": 0.916, + "step": 5269 + }, + { + "epoch": 0.9, + "grad_norm": 9.38661003112793, + "learning_rate": 1.6433842457525314e-05, + "loss": 0.8645, + "step": 5270 + }, + { + "epoch": 0.9, + "grad_norm": 10.047325134277344, + "learning_rate": 1.6431268234082717e-05, + "loss": 0.6926, + "step": 5271 + }, + { + "epoch": 0.9, + "grad_norm": 9.207578659057617, + "learning_rate": 1.6428694010640124e-05, + "loss": 0.6558, + "step": 5272 + }, + { + "epoch": 0.9, + "grad_norm": 9.521034240722656, + "learning_rate": 1.6426119787197527e-05, + "loss": 0.6857, + "step": 5273 + }, + { + "epoch": 0.91, + "grad_norm": 8.073044776916504, + "learning_rate": 1.6423545563754937e-05, + "loss": 0.5497, + "step": 5274 + }, + { + "epoch": 0.91, + "grad_norm": 8.431992530822754, + "learning_rate": 1.642097134031234e-05, + "loss": 0.629, + "step": 5275 + }, + { + "epoch": 0.91, + "grad_norm": 9.577980041503906, + "learning_rate": 1.6418397116869744e-05, + "loss": 0.5072, + "step": 5276 + }, + { + "epoch": 0.91, + "grad_norm": 12.020289421081543, + "learning_rate": 1.641582289342715e-05, + "loss": 0.8599, + "step": 5277 + }, + { + "epoch": 0.91, + "grad_norm": 9.187503814697266, + "learning_rate": 1.6413248669984554e-05, + "loss": 0.7593, + "step": 5278 + }, + { + "epoch": 0.91, + "grad_norm": 10.601643562316895, + "learning_rate": 1.641067444654196e-05, + "loss": 0.7217, + "step": 5279 + }, + { + "epoch": 0.91, + "grad_norm": 12.692886352539062, + "learning_rate": 1.6408100223099364e-05, + "loss": 0.7866, + "step": 5280 + }, + { + "epoch": 0.91, + "grad_norm": 13.177303314208984, + "learning_rate": 1.640552599965677e-05, + "loss": 0.5475, + "step": 5281 + }, + { + "epoch": 0.91, + "grad_norm": 10.756418228149414, + "learning_rate": 1.6402951776214174e-05, + "loss": 0.5752, + "step": 5282 + }, + { + "epoch": 0.91, + "grad_norm": 12.627341270446777, + "learning_rate": 1.640037755277158e-05, + "loss": 0.9314, + "step": 5283 + }, + { + "epoch": 0.91, + "grad_norm": 10.185128211975098, + "learning_rate": 1.6397803329328987e-05, + "loss": 0.7916, + "step": 5284 + }, + { + "epoch": 0.91, + "grad_norm": 10.435806274414062, + "learning_rate": 1.6395229105886394e-05, + "loss": 0.8947, + "step": 5285 + }, + { + "epoch": 0.91, + "grad_norm": 8.506839752197266, + "learning_rate": 1.6392654882443797e-05, + "loss": 0.6167, + "step": 5286 + }, + { + "epoch": 0.91, + "grad_norm": 13.203943252563477, + "learning_rate": 1.63900806590012e-05, + "loss": 0.7711, + "step": 5287 + }, + { + "epoch": 0.91, + "grad_norm": 10.451401710510254, + "learning_rate": 1.6387506435558607e-05, + "loss": 0.6407, + "step": 5288 + }, + { + "epoch": 0.91, + "grad_norm": 13.373774528503418, + "learning_rate": 1.638493221211601e-05, + "loss": 0.8961, + "step": 5289 + }, + { + "epoch": 0.91, + "grad_norm": 9.490797996520996, + "learning_rate": 1.6382357988673417e-05, + "loss": 0.7242, + "step": 5290 + }, + { + "epoch": 0.91, + "grad_norm": 11.302254676818848, + "learning_rate": 1.637978376523082e-05, + "loss": 0.6484, + "step": 5291 + }, + { + "epoch": 0.91, + "grad_norm": 10.7662935256958, + "learning_rate": 1.6377209541788227e-05, + "loss": 0.8167, + "step": 5292 + }, + { + "epoch": 0.91, + "grad_norm": 12.014126777648926, + "learning_rate": 1.6374635318345634e-05, + "loss": 0.6313, + "step": 5293 + }, + { + "epoch": 0.91, + "grad_norm": 7.957836151123047, + "learning_rate": 1.637206109490304e-05, + "loss": 0.4871, + "step": 5294 + }, + { + "epoch": 0.91, + "grad_norm": 11.586565971374512, + "learning_rate": 1.6369486871460444e-05, + "loss": 0.645, + "step": 5295 + }, + { + "epoch": 0.91, + "grad_norm": 9.761870384216309, + "learning_rate": 1.636691264801785e-05, + "loss": 1.0254, + "step": 5296 + }, + { + "epoch": 0.91, + "grad_norm": 9.778366088867188, + "learning_rate": 1.6364338424575254e-05, + "loss": 0.5141, + "step": 5297 + }, + { + "epoch": 0.91, + "grad_norm": 8.954097747802734, + "learning_rate": 1.6361764201132657e-05, + "loss": 0.6822, + "step": 5298 + }, + { + "epoch": 0.91, + "grad_norm": 11.593156814575195, + "learning_rate": 1.6359189977690064e-05, + "loss": 0.7444, + "step": 5299 + }, + { + "epoch": 0.91, + "grad_norm": 10.708962440490723, + "learning_rate": 1.6356615754247467e-05, + "loss": 0.7988, + "step": 5300 + }, + { + "epoch": 0.91, + "grad_norm": 9.688947677612305, + "learning_rate": 1.6354041530804874e-05, + "loss": 0.7307, + "step": 5301 + }, + { + "epoch": 0.91, + "grad_norm": 13.107463836669922, + "learning_rate": 1.6351467307362277e-05, + "loss": 0.7361, + "step": 5302 + }, + { + "epoch": 0.91, + "grad_norm": 11.472657203674316, + "learning_rate": 1.6348893083919687e-05, + "loss": 0.7446, + "step": 5303 + }, + { + "epoch": 0.91, + "grad_norm": 12.349526405334473, + "learning_rate": 1.634631886047709e-05, + "loss": 0.9223, + "step": 5304 + }, + { + "epoch": 0.91, + "grad_norm": 11.807222366333008, + "learning_rate": 1.6343744637034497e-05, + "loss": 0.9837, + "step": 5305 + }, + { + "epoch": 0.91, + "grad_norm": 11.888278007507324, + "learning_rate": 1.63411704135919e-05, + "loss": 0.6046, + "step": 5306 + }, + { + "epoch": 0.91, + "grad_norm": 17.625167846679688, + "learning_rate": 1.6338596190149304e-05, + "loss": 0.7998, + "step": 5307 + }, + { + "epoch": 0.91, + "grad_norm": 9.900516510009766, + "learning_rate": 1.633602196670671e-05, + "loss": 0.6392, + "step": 5308 + }, + { + "epoch": 0.91, + "grad_norm": 9.312675476074219, + "learning_rate": 1.6333447743264114e-05, + "loss": 0.4499, + "step": 5309 + }, + { + "epoch": 0.91, + "grad_norm": 9.923124313354492, + "learning_rate": 1.633087351982152e-05, + "loss": 0.5753, + "step": 5310 + }, + { + "epoch": 0.91, + "grad_norm": 10.0786771774292, + "learning_rate": 1.6328299296378924e-05, + "loss": 0.7728, + "step": 5311 + }, + { + "epoch": 0.91, + "grad_norm": 10.278932571411133, + "learning_rate": 1.6325725072936334e-05, + "loss": 0.8013, + "step": 5312 + }, + { + "epoch": 0.91, + "grad_norm": 10.094017028808594, + "learning_rate": 1.6323150849493737e-05, + "loss": 0.6387, + "step": 5313 + }, + { + "epoch": 0.91, + "grad_norm": 11.221474647521973, + "learning_rate": 1.6320576626051144e-05, + "loss": 0.582, + "step": 5314 + }, + { + "epoch": 0.91, + "grad_norm": 9.438040733337402, + "learning_rate": 1.6318002402608547e-05, + "loss": 0.5847, + "step": 5315 + }, + { + "epoch": 0.91, + "grad_norm": 10.305757522583008, + "learning_rate": 1.6315428179165954e-05, + "loss": 0.5777, + "step": 5316 + }, + { + "epoch": 0.91, + "grad_norm": 9.588105201721191, + "learning_rate": 1.6312853955723357e-05, + "loss": 0.8716, + "step": 5317 + }, + { + "epoch": 0.91, + "grad_norm": 10.558932304382324, + "learning_rate": 1.631027973228076e-05, + "loss": 0.9259, + "step": 5318 + }, + { + "epoch": 0.91, + "grad_norm": 10.885242462158203, + "learning_rate": 1.6307705508838167e-05, + "loss": 0.7004, + "step": 5319 + }, + { + "epoch": 0.91, + "grad_norm": 9.342612266540527, + "learning_rate": 1.630513128539557e-05, + "loss": 0.6183, + "step": 5320 + }, + { + "epoch": 0.91, + "grad_norm": 8.590620994567871, + "learning_rate": 1.630255706195298e-05, + "loss": 0.5081, + "step": 5321 + }, + { + "epoch": 0.91, + "grad_norm": 9.505029678344727, + "learning_rate": 1.6299982838510384e-05, + "loss": 0.609, + "step": 5322 + }, + { + "epoch": 0.91, + "grad_norm": 12.353503227233887, + "learning_rate": 1.629740861506779e-05, + "loss": 0.7164, + "step": 5323 + }, + { + "epoch": 0.91, + "grad_norm": 10.86381721496582, + "learning_rate": 1.6294834391625193e-05, + "loss": 0.7149, + "step": 5324 + }, + { + "epoch": 0.91, + "grad_norm": 10.773503303527832, + "learning_rate": 1.62922601681826e-05, + "loss": 0.8802, + "step": 5325 + }, + { + "epoch": 0.91, + "grad_norm": 13.397554397583008, + "learning_rate": 1.6289685944740003e-05, + "loss": 1.0327, + "step": 5326 + }, + { + "epoch": 0.91, + "grad_norm": 14.209512710571289, + "learning_rate": 1.628711172129741e-05, + "loss": 1.0638, + "step": 5327 + }, + { + "epoch": 0.91, + "grad_norm": 9.346537590026855, + "learning_rate": 1.6284537497854813e-05, + "loss": 0.7663, + "step": 5328 + }, + { + "epoch": 0.91, + "grad_norm": 9.009007453918457, + "learning_rate": 1.6281963274412217e-05, + "loss": 0.6647, + "step": 5329 + }, + { + "epoch": 0.91, + "grad_norm": 12.355412483215332, + "learning_rate": 1.6279389050969623e-05, + "loss": 0.8087, + "step": 5330 + }, + { + "epoch": 0.91, + "grad_norm": 10.055541038513184, + "learning_rate": 1.627681482752703e-05, + "loss": 0.5819, + "step": 5331 + }, + { + "epoch": 0.92, + "grad_norm": 9.069061279296875, + "learning_rate": 1.6274240604084437e-05, + "loss": 0.6965, + "step": 5332 + }, + { + "epoch": 0.92, + "grad_norm": 11.634626388549805, + "learning_rate": 1.627166638064184e-05, + "loss": 0.7589, + "step": 5333 + }, + { + "epoch": 0.92, + "grad_norm": 10.606729507446289, + "learning_rate": 1.6269092157199247e-05, + "loss": 0.7905, + "step": 5334 + }, + { + "epoch": 0.92, + "grad_norm": 10.825538635253906, + "learning_rate": 1.626651793375665e-05, + "loss": 0.6053, + "step": 5335 + }, + { + "epoch": 0.92, + "grad_norm": 10.846492767333984, + "learning_rate": 1.6263943710314057e-05, + "loss": 0.6376, + "step": 5336 + }, + { + "epoch": 0.92, + "grad_norm": 14.11724853515625, + "learning_rate": 1.626136948687146e-05, + "loss": 0.9662, + "step": 5337 + }, + { + "epoch": 0.92, + "grad_norm": 12.473100662231445, + "learning_rate": 1.6258795263428867e-05, + "loss": 0.611, + "step": 5338 + }, + { + "epoch": 0.92, + "grad_norm": 8.624481201171875, + "learning_rate": 1.625622103998627e-05, + "loss": 0.6607, + "step": 5339 + }, + { + "epoch": 0.92, + "grad_norm": 9.945741653442383, + "learning_rate": 1.6253646816543677e-05, + "loss": 0.6932, + "step": 5340 + }, + { + "epoch": 0.92, + "grad_norm": 9.372787475585938, + "learning_rate": 1.6251072593101083e-05, + "loss": 0.6036, + "step": 5341 + }, + { + "epoch": 0.92, + "grad_norm": 12.175030708312988, + "learning_rate": 1.6248498369658487e-05, + "loss": 0.7683, + "step": 5342 + }, + { + "epoch": 0.92, + "grad_norm": 11.389568328857422, + "learning_rate": 1.6245924146215893e-05, + "loss": 0.7174, + "step": 5343 + }, + { + "epoch": 0.92, + "grad_norm": 10.045255661010742, + "learning_rate": 1.6243349922773297e-05, + "loss": 0.7243, + "step": 5344 + }, + { + "epoch": 0.92, + "grad_norm": 8.945423126220703, + "learning_rate": 1.6240775699330703e-05, + "loss": 0.5235, + "step": 5345 + }, + { + "epoch": 0.92, + "grad_norm": 9.613673210144043, + "learning_rate": 1.6238201475888107e-05, + "loss": 0.5549, + "step": 5346 + }, + { + "epoch": 0.92, + "grad_norm": 9.48994255065918, + "learning_rate": 1.6235627252445513e-05, + "loss": 0.7129, + "step": 5347 + }, + { + "epoch": 0.92, + "grad_norm": 7.308940410614014, + "learning_rate": 1.6233053029002917e-05, + "loss": 0.4572, + "step": 5348 + }, + { + "epoch": 0.92, + "grad_norm": 8.74059772491455, + "learning_rate": 1.623047880556032e-05, + "loss": 0.5891, + "step": 5349 + }, + { + "epoch": 0.92, + "grad_norm": 13.355896949768066, + "learning_rate": 1.622790458211773e-05, + "loss": 0.6838, + "step": 5350 + }, + { + "epoch": 0.92, + "grad_norm": 10.994861602783203, + "learning_rate": 1.6225330358675133e-05, + "loss": 0.7444, + "step": 5351 + }, + { + "epoch": 0.92, + "grad_norm": 8.859659194946289, + "learning_rate": 1.622275613523254e-05, + "loss": 0.3659, + "step": 5352 + }, + { + "epoch": 0.92, + "grad_norm": 10.659871101379395, + "learning_rate": 1.6220181911789943e-05, + "loss": 0.5816, + "step": 5353 + }, + { + "epoch": 0.92, + "grad_norm": 9.827496528625488, + "learning_rate": 1.621760768834735e-05, + "loss": 0.5036, + "step": 5354 + }, + { + "epoch": 0.92, + "grad_norm": 11.024316787719727, + "learning_rate": 1.6215033464904753e-05, + "loss": 0.7096, + "step": 5355 + }, + { + "epoch": 0.92, + "grad_norm": 10.871668815612793, + "learning_rate": 1.621245924146216e-05, + "loss": 0.6767, + "step": 5356 + }, + { + "epoch": 0.92, + "grad_norm": 12.786467552185059, + "learning_rate": 1.6209885018019563e-05, + "loss": 0.7224, + "step": 5357 + }, + { + "epoch": 0.92, + "grad_norm": 16.874835968017578, + "learning_rate": 1.620731079457697e-05, + "loss": 0.8037, + "step": 5358 + }, + { + "epoch": 0.92, + "grad_norm": 8.968464851379395, + "learning_rate": 1.6204736571134377e-05, + "loss": 0.6626, + "step": 5359 + }, + { + "epoch": 0.92, + "grad_norm": 10.908061981201172, + "learning_rate": 1.620216234769178e-05, + "loss": 0.8516, + "step": 5360 + }, + { + "epoch": 0.92, + "grad_norm": 13.781600952148438, + "learning_rate": 1.6199588124249186e-05, + "loss": 0.9234, + "step": 5361 + }, + { + "epoch": 0.92, + "grad_norm": 11.504817962646484, + "learning_rate": 1.619701390080659e-05, + "loss": 0.7915, + "step": 5362 + }, + { + "epoch": 0.92, + "grad_norm": 11.987812042236328, + "learning_rate": 1.6194439677363996e-05, + "loss": 0.6629, + "step": 5363 + }, + { + "epoch": 0.92, + "grad_norm": 8.693304061889648, + "learning_rate": 1.61918654539214e-05, + "loss": 0.7122, + "step": 5364 + }, + { + "epoch": 0.92, + "grad_norm": 9.195155143737793, + "learning_rate": 1.6189291230478806e-05, + "loss": 0.7479, + "step": 5365 + }, + { + "epoch": 0.92, + "grad_norm": 10.161410331726074, + "learning_rate": 1.618671700703621e-05, + "loss": 0.5539, + "step": 5366 + }, + { + "epoch": 0.92, + "grad_norm": 9.984532356262207, + "learning_rate": 1.6184142783593616e-05, + "loss": 0.7711, + "step": 5367 + }, + { + "epoch": 0.92, + "grad_norm": 11.00101375579834, + "learning_rate": 1.618156856015102e-05, + "loss": 0.819, + "step": 5368 + }, + { + "epoch": 0.92, + "grad_norm": 10.316864967346191, + "learning_rate": 1.617899433670843e-05, + "loss": 0.7121, + "step": 5369 + }, + { + "epoch": 0.92, + "grad_norm": 9.20595932006836, + "learning_rate": 1.6176420113265833e-05, + "loss": 0.5368, + "step": 5370 + }, + { + "epoch": 0.92, + "grad_norm": 12.708369255065918, + "learning_rate": 1.6173845889823236e-05, + "loss": 1.2529, + "step": 5371 + }, + { + "epoch": 0.92, + "grad_norm": 10.034847259521484, + "learning_rate": 1.6171271666380643e-05, + "loss": 0.7555, + "step": 5372 + }, + { + "epoch": 0.92, + "grad_norm": 11.021966934204102, + "learning_rate": 1.6168697442938046e-05, + "loss": 0.604, + "step": 5373 + }, + { + "epoch": 0.92, + "grad_norm": 11.402461051940918, + "learning_rate": 1.6166123219495453e-05, + "loss": 0.6444, + "step": 5374 + }, + { + "epoch": 0.92, + "grad_norm": 11.433053016662598, + "learning_rate": 1.6163548996052856e-05, + "loss": 0.8579, + "step": 5375 + }, + { + "epoch": 0.92, + "grad_norm": 10.826925277709961, + "learning_rate": 1.6160974772610263e-05, + "loss": 0.6832, + "step": 5376 + }, + { + "epoch": 0.92, + "grad_norm": 10.661369323730469, + "learning_rate": 1.6158400549167666e-05, + "loss": 0.7607, + "step": 5377 + }, + { + "epoch": 0.92, + "grad_norm": 8.432463645935059, + "learning_rate": 1.6155826325725076e-05, + "loss": 0.6476, + "step": 5378 + }, + { + "epoch": 0.92, + "grad_norm": 11.132686614990234, + "learning_rate": 1.615325210228248e-05, + "loss": 0.624, + "step": 5379 + }, + { + "epoch": 0.92, + "grad_norm": 12.702945709228516, + "learning_rate": 1.6150677878839883e-05, + "loss": 0.811, + "step": 5380 + }, + { + "epoch": 0.92, + "grad_norm": 12.254922866821289, + "learning_rate": 1.614810365539729e-05, + "loss": 0.9329, + "step": 5381 + }, + { + "epoch": 0.92, + "grad_norm": 11.407812118530273, + "learning_rate": 1.6145529431954693e-05, + "loss": 0.592, + "step": 5382 + }, + { + "epoch": 0.92, + "grad_norm": 13.516083717346191, + "learning_rate": 1.61429552085121e-05, + "loss": 0.9406, + "step": 5383 + }, + { + "epoch": 0.92, + "grad_norm": 9.30846881866455, + "learning_rate": 1.6140380985069503e-05, + "loss": 0.5874, + "step": 5384 + }, + { + "epoch": 0.92, + "grad_norm": 10.725110054016113, + "learning_rate": 1.613780676162691e-05, + "loss": 0.5758, + "step": 5385 + }, + { + "epoch": 0.92, + "grad_norm": 11.639054298400879, + "learning_rate": 1.6135232538184313e-05, + "loss": 0.9237, + "step": 5386 + }, + { + "epoch": 0.92, + "grad_norm": 10.927342414855957, + "learning_rate": 1.613265831474172e-05, + "loss": 0.8197, + "step": 5387 + }, + { + "epoch": 0.92, + "grad_norm": 8.463661193847656, + "learning_rate": 1.6130084091299126e-05, + "loss": 0.5633, + "step": 5388 + }, + { + "epoch": 0.92, + "grad_norm": 9.141190528869629, + "learning_rate": 1.6127509867856533e-05, + "loss": 0.7412, + "step": 5389 + }, + { + "epoch": 0.93, + "grad_norm": 10.962833404541016, + "learning_rate": 1.6124935644413936e-05, + "loss": 0.7952, + "step": 5390 + }, + { + "epoch": 0.93, + "grad_norm": 8.48841381072998, + "learning_rate": 1.612236142097134e-05, + "loss": 0.787, + "step": 5391 + }, + { + "epoch": 0.93, + "grad_norm": 8.667954444885254, + "learning_rate": 1.6119787197528746e-05, + "loss": 0.6521, + "step": 5392 + }, + { + "epoch": 0.93, + "grad_norm": 8.683670997619629, + "learning_rate": 1.611721297408615e-05, + "loss": 0.8079, + "step": 5393 + }, + { + "epoch": 0.93, + "grad_norm": 10.248801231384277, + "learning_rate": 1.6114638750643556e-05, + "loss": 0.6947, + "step": 5394 + }, + { + "epoch": 0.93, + "grad_norm": 9.234082221984863, + "learning_rate": 1.611206452720096e-05, + "loss": 0.5561, + "step": 5395 + }, + { + "epoch": 0.93, + "grad_norm": 9.410922050476074, + "learning_rate": 1.6109490303758366e-05, + "loss": 0.8626, + "step": 5396 + }, + { + "epoch": 0.93, + "grad_norm": 7.7655229568481445, + "learning_rate": 1.6106916080315773e-05, + "loss": 0.7134, + "step": 5397 + }, + { + "epoch": 0.93, + "grad_norm": 11.13786792755127, + "learning_rate": 1.610434185687318e-05, + "loss": 0.6613, + "step": 5398 + }, + { + "epoch": 0.93, + "grad_norm": 9.836605072021484, + "learning_rate": 1.6101767633430583e-05, + "loss": 0.7106, + "step": 5399 + }, + { + "epoch": 0.93, + "grad_norm": 9.276037216186523, + "learning_rate": 1.609919340998799e-05, + "loss": 0.7859, + "step": 5400 + }, + { + "epoch": 0.93, + "grad_norm": 12.147769927978516, + "learning_rate": 1.6096619186545393e-05, + "loss": 0.6519, + "step": 5401 + }, + { + "epoch": 0.93, + "grad_norm": 11.965616226196289, + "learning_rate": 1.6094044963102796e-05, + "loss": 0.9663, + "step": 5402 + }, + { + "epoch": 0.93, + "grad_norm": 12.371088027954102, + "learning_rate": 1.6091470739660203e-05, + "loss": 0.7387, + "step": 5403 + }, + { + "epoch": 0.93, + "grad_norm": 11.353103637695312, + "learning_rate": 1.6088896516217606e-05, + "loss": 0.9717, + "step": 5404 + }, + { + "epoch": 0.93, + "grad_norm": 8.7879056930542, + "learning_rate": 1.6086322292775013e-05, + "loss": 0.4814, + "step": 5405 + }, + { + "epoch": 0.93, + "grad_norm": 9.066062927246094, + "learning_rate": 1.6083748069332416e-05, + "loss": 0.6042, + "step": 5406 + }, + { + "epoch": 0.93, + "grad_norm": 10.037806510925293, + "learning_rate": 1.6081173845889826e-05, + "loss": 0.5713, + "step": 5407 + }, + { + "epoch": 0.93, + "grad_norm": 11.090384483337402, + "learning_rate": 1.607859962244723e-05, + "loss": 0.6953, + "step": 5408 + }, + { + "epoch": 0.93, + "grad_norm": 11.951784133911133, + "learning_rate": 1.6076025399004636e-05, + "loss": 0.6865, + "step": 5409 + }, + { + "epoch": 0.93, + "grad_norm": 9.99825382232666, + "learning_rate": 1.607345117556204e-05, + "loss": 0.8309, + "step": 5410 + }, + { + "epoch": 0.93, + "grad_norm": 12.917034149169922, + "learning_rate": 1.6070876952119443e-05, + "loss": 0.7286, + "step": 5411 + }, + { + "epoch": 0.93, + "grad_norm": 10.623841285705566, + "learning_rate": 1.606830272867685e-05, + "loss": 0.8353, + "step": 5412 + }, + { + "epoch": 0.93, + "grad_norm": 13.034842491149902, + "learning_rate": 1.6065728505234253e-05, + "loss": 0.7367, + "step": 5413 + }, + { + "epoch": 0.93, + "grad_norm": 14.12311840057373, + "learning_rate": 1.606315428179166e-05, + "loss": 0.7642, + "step": 5414 + }, + { + "epoch": 0.93, + "grad_norm": 12.247233390808105, + "learning_rate": 1.6060580058349063e-05, + "loss": 0.858, + "step": 5415 + }, + { + "epoch": 0.93, + "grad_norm": 13.87278938293457, + "learning_rate": 1.6058005834906473e-05, + "loss": 0.7625, + "step": 5416 + }, + { + "epoch": 0.93, + "grad_norm": 9.773738861083984, + "learning_rate": 1.6055431611463876e-05, + "loss": 0.6346, + "step": 5417 + }, + { + "epoch": 0.93, + "grad_norm": 11.190245628356934, + "learning_rate": 1.6052857388021283e-05, + "loss": 0.7055, + "step": 5418 + }, + { + "epoch": 0.93, + "grad_norm": 12.070527076721191, + "learning_rate": 1.6050283164578686e-05, + "loss": 0.6107, + "step": 5419 + }, + { + "epoch": 0.93, + "grad_norm": 11.259936332702637, + "learning_rate": 1.6047708941136093e-05, + "loss": 0.6917, + "step": 5420 + }, + { + "epoch": 0.93, + "grad_norm": 9.312941551208496, + "learning_rate": 1.6045134717693496e-05, + "loss": 0.7604, + "step": 5421 + }, + { + "epoch": 0.93, + "grad_norm": 11.834226608276367, + "learning_rate": 1.60425604942509e-05, + "loss": 0.7127, + "step": 5422 + }, + { + "epoch": 0.93, + "grad_norm": 10.275663375854492, + "learning_rate": 1.6039986270808306e-05, + "loss": 0.7497, + "step": 5423 + }, + { + "epoch": 0.93, + "grad_norm": 10.038375854492188, + "learning_rate": 1.603741204736571e-05, + "loss": 0.7263, + "step": 5424 + }, + { + "epoch": 0.93, + "grad_norm": 12.30618953704834, + "learning_rate": 1.603483782392312e-05, + "loss": 0.6782, + "step": 5425 + }, + { + "epoch": 0.93, + "grad_norm": 11.826001167297363, + "learning_rate": 1.6032263600480523e-05, + "loss": 0.6668, + "step": 5426 + }, + { + "epoch": 0.93, + "grad_norm": 11.249032974243164, + "learning_rate": 1.602968937703793e-05, + "loss": 0.6276, + "step": 5427 + }, + { + "epoch": 0.93, + "grad_norm": 9.06242847442627, + "learning_rate": 1.6027115153595333e-05, + "loss": 0.6218, + "step": 5428 + }, + { + "epoch": 0.93, + "grad_norm": 8.804352760314941, + "learning_rate": 1.602454093015274e-05, + "loss": 0.7112, + "step": 5429 + }, + { + "epoch": 0.93, + "grad_norm": 11.694586753845215, + "learning_rate": 1.6021966706710142e-05, + "loss": 0.9347, + "step": 5430 + }, + { + "epoch": 0.93, + "grad_norm": 10.349326133728027, + "learning_rate": 1.601939248326755e-05, + "loss": 0.689, + "step": 5431 + }, + { + "epoch": 0.93, + "grad_norm": 8.02199649810791, + "learning_rate": 1.6016818259824952e-05, + "loss": 0.4271, + "step": 5432 + }, + { + "epoch": 0.93, + "grad_norm": 11.841898918151855, + "learning_rate": 1.6014244036382356e-05, + "loss": 0.9099, + "step": 5433 + }, + { + "epoch": 0.93, + "grad_norm": 12.227035522460938, + "learning_rate": 1.6011669812939762e-05, + "loss": 0.6111, + "step": 5434 + }, + { + "epoch": 0.93, + "grad_norm": 11.281566619873047, + "learning_rate": 1.600909558949717e-05, + "loss": 0.8266, + "step": 5435 + }, + { + "epoch": 0.93, + "grad_norm": 11.28485107421875, + "learning_rate": 1.6006521366054576e-05, + "loss": 0.701, + "step": 5436 + }, + { + "epoch": 0.93, + "grad_norm": 8.885891914367676, + "learning_rate": 1.600394714261198e-05, + "loss": 0.6103, + "step": 5437 + }, + { + "epoch": 0.93, + "grad_norm": 16.44804573059082, + "learning_rate": 1.6001372919169386e-05, + "loss": 0.9201, + "step": 5438 + }, + { + "epoch": 0.93, + "grad_norm": 11.522759437561035, + "learning_rate": 1.599879869572679e-05, + "loss": 0.8267, + "step": 5439 + }, + { + "epoch": 0.93, + "grad_norm": 9.83130931854248, + "learning_rate": 1.5996224472284196e-05, + "loss": 0.5862, + "step": 5440 + }, + { + "epoch": 0.93, + "grad_norm": 13.161563873291016, + "learning_rate": 1.59936502488416e-05, + "loss": 0.8455, + "step": 5441 + }, + { + "epoch": 0.93, + "grad_norm": 10.917141914367676, + "learning_rate": 1.5991076025399006e-05, + "loss": 0.8669, + "step": 5442 + }, + { + "epoch": 0.93, + "grad_norm": 11.429743766784668, + "learning_rate": 1.598850180195641e-05, + "loss": 0.7464, + "step": 5443 + }, + { + "epoch": 0.93, + "grad_norm": 11.11600399017334, + "learning_rate": 1.5985927578513816e-05, + "loss": 0.6064, + "step": 5444 + }, + { + "epoch": 0.93, + "grad_norm": 8.265796661376953, + "learning_rate": 1.5983353355071222e-05, + "loss": 0.4947, + "step": 5445 + }, + { + "epoch": 0.93, + "grad_norm": 9.254405975341797, + "learning_rate": 1.5980779131628626e-05, + "loss": 0.6366, + "step": 5446 + }, + { + "epoch": 0.93, + "grad_norm": 9.471699714660645, + "learning_rate": 1.5978204908186032e-05, + "loss": 0.7729, + "step": 5447 + }, + { + "epoch": 0.93, + "grad_norm": 10.652485847473145, + "learning_rate": 1.5975630684743436e-05, + "loss": 0.6428, + "step": 5448 + }, + { + "epoch": 0.94, + "grad_norm": 11.507447242736816, + "learning_rate": 1.5973056461300842e-05, + "loss": 0.7779, + "step": 5449 + }, + { + "epoch": 0.94, + "grad_norm": 13.162154197692871, + "learning_rate": 1.5970482237858246e-05, + "loss": 0.7809, + "step": 5450 + }, + { + "epoch": 0.94, + "grad_norm": 8.86877727508545, + "learning_rate": 1.5967908014415652e-05, + "loss": 0.5767, + "step": 5451 + }, + { + "epoch": 0.94, + "grad_norm": 8.88869571685791, + "learning_rate": 1.5965333790973056e-05, + "loss": 0.601, + "step": 5452 + }, + { + "epoch": 0.94, + "grad_norm": 9.477707862854004, + "learning_rate": 1.596275956753046e-05, + "loss": 0.4985, + "step": 5453 + }, + { + "epoch": 0.94, + "grad_norm": 9.238130569458008, + "learning_rate": 1.596018534408787e-05, + "loss": 0.566, + "step": 5454 + }, + { + "epoch": 0.94, + "grad_norm": 7.624138355255127, + "learning_rate": 1.5957611120645272e-05, + "loss": 0.4094, + "step": 5455 + }, + { + "epoch": 0.94, + "grad_norm": 13.222182273864746, + "learning_rate": 1.595503689720268e-05, + "loss": 0.6509, + "step": 5456 + }, + { + "epoch": 0.94, + "grad_norm": 14.988697052001953, + "learning_rate": 1.5952462673760082e-05, + "loss": 1.0603, + "step": 5457 + }, + { + "epoch": 0.94, + "grad_norm": 12.02749252319336, + "learning_rate": 1.594988845031749e-05, + "loss": 0.6549, + "step": 5458 + }, + { + "epoch": 0.94, + "grad_norm": 12.617202758789062, + "learning_rate": 1.5947314226874892e-05, + "loss": 0.7358, + "step": 5459 + }, + { + "epoch": 0.94, + "grad_norm": 12.516440391540527, + "learning_rate": 1.59447400034323e-05, + "loss": 0.9274, + "step": 5460 + }, + { + "epoch": 0.94, + "grad_norm": 8.804156303405762, + "learning_rate": 1.5942165779989702e-05, + "loss": 0.7639, + "step": 5461 + }, + { + "epoch": 0.94, + "grad_norm": 17.205520629882812, + "learning_rate": 1.593959155654711e-05, + "loss": 0.7911, + "step": 5462 + }, + { + "epoch": 0.94, + "grad_norm": 14.2595796585083, + "learning_rate": 1.5937017333104516e-05, + "loss": 0.608, + "step": 5463 + }, + { + "epoch": 0.94, + "grad_norm": 10.216423034667969, + "learning_rate": 1.593444310966192e-05, + "loss": 0.922, + "step": 5464 + }, + { + "epoch": 0.94, + "grad_norm": 10.078286170959473, + "learning_rate": 1.5931868886219326e-05, + "loss": 0.618, + "step": 5465 + }, + { + "epoch": 0.94, + "grad_norm": 11.788003921508789, + "learning_rate": 1.592929466277673e-05, + "loss": 1.0134, + "step": 5466 + }, + { + "epoch": 0.94, + "grad_norm": 10.516185760498047, + "learning_rate": 1.5926720439334135e-05, + "loss": 0.6751, + "step": 5467 + }, + { + "epoch": 0.94, + "grad_norm": 13.466375350952148, + "learning_rate": 1.592414621589154e-05, + "loss": 0.5006, + "step": 5468 + }, + { + "epoch": 0.94, + "grad_norm": 9.340472221374512, + "learning_rate": 1.5921571992448945e-05, + "loss": 0.8121, + "step": 5469 + }, + { + "epoch": 0.94, + "grad_norm": 12.013021469116211, + "learning_rate": 1.591899776900635e-05, + "loss": 0.8057, + "step": 5470 + }, + { + "epoch": 0.94, + "grad_norm": 9.822932243347168, + "learning_rate": 1.5916423545563755e-05, + "loss": 0.521, + "step": 5471 + }, + { + "epoch": 0.94, + "grad_norm": 10.534416198730469, + "learning_rate": 1.591384932212116e-05, + "loss": 0.7171, + "step": 5472 + }, + { + "epoch": 0.94, + "grad_norm": 9.124041557312012, + "learning_rate": 1.591127509867857e-05, + "loss": 0.5739, + "step": 5473 + }, + { + "epoch": 0.94, + "grad_norm": 12.582082748413086, + "learning_rate": 1.5908700875235972e-05, + "loss": 0.9217, + "step": 5474 + }, + { + "epoch": 0.94, + "grad_norm": 9.603957176208496, + "learning_rate": 1.5906126651793375e-05, + "loss": 0.6886, + "step": 5475 + }, + { + "epoch": 0.94, + "grad_norm": 12.811408042907715, + "learning_rate": 1.5903552428350782e-05, + "loss": 0.7656, + "step": 5476 + }, + { + "epoch": 0.94, + "grad_norm": 10.944864273071289, + "learning_rate": 1.5900978204908185e-05, + "loss": 0.7287, + "step": 5477 + }, + { + "epoch": 0.94, + "grad_norm": 10.971102714538574, + "learning_rate": 1.5898403981465592e-05, + "loss": 0.7978, + "step": 5478 + }, + { + "epoch": 0.94, + "grad_norm": 14.381170272827148, + "learning_rate": 1.5895829758022995e-05, + "loss": 0.8835, + "step": 5479 + }, + { + "epoch": 0.94, + "grad_norm": 8.820755004882812, + "learning_rate": 1.5893255534580402e-05, + "loss": 0.6181, + "step": 5480 + }, + { + "epoch": 0.94, + "grad_norm": 10.809287071228027, + "learning_rate": 1.5890681311137805e-05, + "loss": 0.6795, + "step": 5481 + }, + { + "epoch": 0.94, + "grad_norm": 9.557952880859375, + "learning_rate": 1.5888107087695215e-05, + "loss": 0.6398, + "step": 5482 + }, + { + "epoch": 0.94, + "grad_norm": 10.693344116210938, + "learning_rate": 1.588553286425262e-05, + "loss": 0.8036, + "step": 5483 + }, + { + "epoch": 0.94, + "grad_norm": 9.018784523010254, + "learning_rate": 1.5882958640810022e-05, + "loss": 0.6238, + "step": 5484 + }, + { + "epoch": 0.94, + "grad_norm": 10.494216918945312, + "learning_rate": 1.588038441736743e-05, + "loss": 0.7607, + "step": 5485 + }, + { + "epoch": 0.94, + "grad_norm": 10.84225082397461, + "learning_rate": 1.5877810193924832e-05, + "loss": 0.7017, + "step": 5486 + }, + { + "epoch": 0.94, + "grad_norm": 10.3541259765625, + "learning_rate": 1.587523597048224e-05, + "loss": 0.8281, + "step": 5487 + }, + { + "epoch": 0.94, + "grad_norm": 7.750877857208252, + "learning_rate": 1.5872661747039642e-05, + "loss": 0.5429, + "step": 5488 + }, + { + "epoch": 0.94, + "grad_norm": 9.787055015563965, + "learning_rate": 1.587008752359705e-05, + "loss": 0.7507, + "step": 5489 + }, + { + "epoch": 0.94, + "grad_norm": 9.039846420288086, + "learning_rate": 1.5867513300154452e-05, + "loss": 0.5399, + "step": 5490 + }, + { + "epoch": 0.94, + "grad_norm": 9.538870811462402, + "learning_rate": 1.586493907671186e-05, + "loss": 0.6185, + "step": 5491 + }, + { + "epoch": 0.94, + "grad_norm": 11.526559829711914, + "learning_rate": 1.5862364853269265e-05, + "loss": 0.7405, + "step": 5492 + }, + { + "epoch": 0.94, + "grad_norm": 12.64937686920166, + "learning_rate": 1.5859790629826672e-05, + "loss": 1.0004, + "step": 5493 + }, + { + "epoch": 0.94, + "grad_norm": 10.196798324584961, + "learning_rate": 1.5857216406384075e-05, + "loss": 0.7292, + "step": 5494 + }, + { + "epoch": 0.94, + "grad_norm": 10.425310134887695, + "learning_rate": 1.585464218294148e-05, + "loss": 0.6555, + "step": 5495 + }, + { + "epoch": 0.94, + "grad_norm": 11.53381633758545, + "learning_rate": 1.5852067959498885e-05, + "loss": 0.8289, + "step": 5496 + }, + { + "epoch": 0.94, + "grad_norm": 9.954731941223145, + "learning_rate": 1.584949373605629e-05, + "loss": 0.709, + "step": 5497 + }, + { + "epoch": 0.94, + "grad_norm": 9.777397155761719, + "learning_rate": 1.5846919512613695e-05, + "loss": 0.8233, + "step": 5498 + }, + { + "epoch": 0.94, + "grad_norm": 11.766757011413574, + "learning_rate": 1.58443452891711e-05, + "loss": 0.8391, + "step": 5499 + }, + { + "epoch": 0.94, + "grad_norm": 11.392250061035156, + "learning_rate": 1.5841771065728505e-05, + "loss": 0.8215, + "step": 5500 + }, + { + "epoch": 0.94, + "grad_norm": 7.856593608856201, + "learning_rate": 1.5839196842285912e-05, + "loss": 0.6673, + "step": 5501 + }, + { + "epoch": 0.94, + "grad_norm": 11.711043357849121, + "learning_rate": 1.583662261884332e-05, + "loss": 0.8859, + "step": 5502 + }, + { + "epoch": 0.94, + "grad_norm": 11.095938682556152, + "learning_rate": 1.5834048395400722e-05, + "loss": 0.7732, + "step": 5503 + }, + { + "epoch": 0.94, + "grad_norm": 10.732726097106934, + "learning_rate": 1.583147417195813e-05, + "loss": 0.841, + "step": 5504 + }, + { + "epoch": 0.94, + "grad_norm": 9.311925888061523, + "learning_rate": 1.5828899948515532e-05, + "loss": 0.893, + "step": 5505 + }, + { + "epoch": 0.94, + "grad_norm": 10.491415023803711, + "learning_rate": 1.5826325725072935e-05, + "loss": 0.7547, + "step": 5506 + }, + { + "epoch": 0.95, + "grad_norm": 10.706657409667969, + "learning_rate": 1.5823751501630342e-05, + "loss": 0.8287, + "step": 5507 + }, + { + "epoch": 0.95, + "grad_norm": 10.36621379852295, + "learning_rate": 1.5821177278187745e-05, + "loss": 0.6054, + "step": 5508 + }, + { + "epoch": 0.95, + "grad_norm": 11.965426445007324, + "learning_rate": 1.5818603054745152e-05, + "loss": 0.737, + "step": 5509 + }, + { + "epoch": 0.95, + "grad_norm": 11.062065124511719, + "learning_rate": 1.5816028831302555e-05, + "loss": 0.9597, + "step": 5510 + }, + { + "epoch": 0.95, + "grad_norm": 11.279169082641602, + "learning_rate": 1.5813454607859965e-05, + "loss": 0.8107, + "step": 5511 + }, + { + "epoch": 0.95, + "grad_norm": 7.843371391296387, + "learning_rate": 1.581088038441737e-05, + "loss": 0.4532, + "step": 5512 + }, + { + "epoch": 0.95, + "grad_norm": 10.130149841308594, + "learning_rate": 1.5808306160974775e-05, + "loss": 0.6675, + "step": 5513 + }, + { + "epoch": 0.95, + "grad_norm": 8.880720138549805, + "learning_rate": 1.580573193753218e-05, + "loss": 0.5462, + "step": 5514 + }, + { + "epoch": 0.95, + "grad_norm": 10.71376895904541, + "learning_rate": 1.580315771408958e-05, + "loss": 0.6443, + "step": 5515 + }, + { + "epoch": 0.95, + "grad_norm": 10.150823593139648, + "learning_rate": 1.580058349064699e-05, + "loss": 0.8697, + "step": 5516 + }, + { + "epoch": 0.95, + "grad_norm": 11.813486099243164, + "learning_rate": 1.579800926720439e-05, + "loss": 0.7977, + "step": 5517 + }, + { + "epoch": 0.95, + "grad_norm": 12.239295959472656, + "learning_rate": 1.57954350437618e-05, + "loss": 0.6793, + "step": 5518 + }, + { + "epoch": 0.95, + "grad_norm": 11.66236686706543, + "learning_rate": 1.57928608203192e-05, + "loss": 0.7512, + "step": 5519 + }, + { + "epoch": 0.95, + "grad_norm": 9.604142189025879, + "learning_rate": 1.579028659687661e-05, + "loss": 0.6044, + "step": 5520 + }, + { + "epoch": 0.95, + "grad_norm": 10.841404914855957, + "learning_rate": 1.5787712373434015e-05, + "loss": 0.579, + "step": 5521 + }, + { + "epoch": 0.95, + "grad_norm": 10.501837730407715, + "learning_rate": 1.578513814999142e-05, + "loss": 0.7205, + "step": 5522 + }, + { + "epoch": 0.95, + "grad_norm": 9.540206909179688, + "learning_rate": 1.5782563926548825e-05, + "loss": 0.7782, + "step": 5523 + }, + { + "epoch": 0.95, + "grad_norm": 12.388627052307129, + "learning_rate": 1.577998970310623e-05, + "loss": 0.7765, + "step": 5524 + }, + { + "epoch": 0.95, + "grad_norm": 8.31390380859375, + "learning_rate": 1.5777415479663635e-05, + "loss": 0.5311, + "step": 5525 + }, + { + "epoch": 0.95, + "grad_norm": 11.276957511901855, + "learning_rate": 1.5774841256221038e-05, + "loss": 0.9633, + "step": 5526 + }, + { + "epoch": 0.95, + "grad_norm": 9.880576133728027, + "learning_rate": 1.5772267032778445e-05, + "loss": 0.5084, + "step": 5527 + }, + { + "epoch": 0.95, + "grad_norm": 7.902566909790039, + "learning_rate": 1.5769692809335848e-05, + "loss": 0.4669, + "step": 5528 + }, + { + "epoch": 0.95, + "grad_norm": 9.811795234680176, + "learning_rate": 1.5767118585893255e-05, + "loss": 0.6889, + "step": 5529 + }, + { + "epoch": 0.95, + "grad_norm": 11.534520149230957, + "learning_rate": 1.576454436245066e-05, + "loss": 1.0246, + "step": 5530 + }, + { + "epoch": 0.95, + "grad_norm": 11.52299690246582, + "learning_rate": 1.5761970139008068e-05, + "loss": 0.8349, + "step": 5531 + }, + { + "epoch": 0.95, + "grad_norm": 12.835138320922852, + "learning_rate": 1.575939591556547e-05, + "loss": 0.8441, + "step": 5532 + }, + { + "epoch": 0.95, + "grad_norm": 9.813920021057129, + "learning_rate": 1.5756821692122878e-05, + "loss": 0.7039, + "step": 5533 + }, + { + "epoch": 0.95, + "grad_norm": 10.295403480529785, + "learning_rate": 1.575424746868028e-05, + "loss": 0.8619, + "step": 5534 + }, + { + "epoch": 0.95, + "grad_norm": 9.677909851074219, + "learning_rate": 1.5751673245237688e-05, + "loss": 0.6487, + "step": 5535 + }, + { + "epoch": 0.95, + "grad_norm": 10.992595672607422, + "learning_rate": 1.574909902179509e-05, + "loss": 0.7828, + "step": 5536 + }, + { + "epoch": 0.95, + "grad_norm": 9.744140625, + "learning_rate": 1.5746524798352495e-05, + "loss": 0.6547, + "step": 5537 + }, + { + "epoch": 0.95, + "grad_norm": 10.94292163848877, + "learning_rate": 1.57439505749099e-05, + "loss": 0.8616, + "step": 5538 + }, + { + "epoch": 0.95, + "grad_norm": 9.255457878112793, + "learning_rate": 1.5741376351467308e-05, + "loss": 0.7145, + "step": 5539 + }, + { + "epoch": 0.95, + "grad_norm": 12.098308563232422, + "learning_rate": 1.5738802128024715e-05, + "loss": 0.8017, + "step": 5540 + }, + { + "epoch": 0.95, + "grad_norm": 11.505480766296387, + "learning_rate": 1.5736227904582118e-05, + "loss": 0.9495, + "step": 5541 + }, + { + "epoch": 0.95, + "grad_norm": 10.07521915435791, + "learning_rate": 1.5733653681139525e-05, + "loss": 0.8291, + "step": 5542 + }, + { + "epoch": 0.95, + "grad_norm": 10.103720664978027, + "learning_rate": 1.5731079457696928e-05, + "loss": 0.9458, + "step": 5543 + }, + { + "epoch": 0.95, + "grad_norm": 9.460010528564453, + "learning_rate": 1.5728505234254335e-05, + "loss": 0.5839, + "step": 5544 + }, + { + "epoch": 0.95, + "grad_norm": 10.207123756408691, + "learning_rate": 1.5725931010811738e-05, + "loss": 0.7652, + "step": 5545 + }, + { + "epoch": 0.95, + "grad_norm": 8.013753890991211, + "learning_rate": 1.5723356787369145e-05, + "loss": 0.5342, + "step": 5546 + }, + { + "epoch": 0.95, + "grad_norm": 10.64785385131836, + "learning_rate": 1.5720782563926548e-05, + "loss": 0.8422, + "step": 5547 + }, + { + "epoch": 0.95, + "grad_norm": 10.991358757019043, + "learning_rate": 1.5718208340483955e-05, + "loss": 0.8929, + "step": 5548 + }, + { + "epoch": 0.95, + "grad_norm": 8.931777954101562, + "learning_rate": 1.571563411704136e-05, + "loss": 0.6247, + "step": 5549 + }, + { + "epoch": 0.95, + "grad_norm": 10.044845581054688, + "learning_rate": 1.5713059893598765e-05, + "loss": 0.685, + "step": 5550 + }, + { + "epoch": 0.95, + "grad_norm": 6.44978666305542, + "learning_rate": 1.571048567015617e-05, + "loss": 0.4404, + "step": 5551 + }, + { + "epoch": 0.95, + "grad_norm": 9.396489143371582, + "learning_rate": 1.5707911446713575e-05, + "loss": 0.6322, + "step": 5552 + }, + { + "epoch": 0.95, + "grad_norm": 9.507052421569824, + "learning_rate": 1.570533722327098e-05, + "loss": 0.6494, + "step": 5553 + }, + { + "epoch": 0.95, + "grad_norm": 8.790435791015625, + "learning_rate": 1.5702762999828385e-05, + "loss": 0.5111, + "step": 5554 + }, + { + "epoch": 0.95, + "grad_norm": 8.560773849487305, + "learning_rate": 1.570018877638579e-05, + "loss": 0.5038, + "step": 5555 + }, + { + "epoch": 0.95, + "grad_norm": 11.470216751098633, + "learning_rate": 1.5697614552943195e-05, + "loss": 0.7754, + "step": 5556 + }, + { + "epoch": 0.95, + "grad_norm": 12.905423164367676, + "learning_rate": 1.5695040329500598e-05, + "loss": 0.7474, + "step": 5557 + }, + { + "epoch": 0.95, + "grad_norm": 13.42026138305664, + "learning_rate": 1.5692466106058008e-05, + "loss": 0.936, + "step": 5558 + }, + { + "epoch": 0.95, + "grad_norm": 10.158493041992188, + "learning_rate": 1.568989188261541e-05, + "loss": 0.556, + "step": 5559 + }, + { + "epoch": 0.95, + "grad_norm": 10.384407997131348, + "learning_rate": 1.5687317659172818e-05, + "loss": 0.6649, + "step": 5560 + }, + { + "epoch": 0.95, + "grad_norm": 11.067281723022461, + "learning_rate": 1.568474343573022e-05, + "loss": 0.7918, + "step": 5561 + }, + { + "epoch": 0.95, + "grad_norm": 15.902259826660156, + "learning_rate": 1.5682169212287628e-05, + "loss": 0.8508, + "step": 5562 + }, + { + "epoch": 0.95, + "grad_norm": 8.968539237976074, + "learning_rate": 1.567959498884503e-05, + "loss": 0.6452, + "step": 5563 + }, + { + "epoch": 0.95, + "grad_norm": 10.663578033447266, + "learning_rate": 1.5677020765402438e-05, + "loss": 0.6886, + "step": 5564 + }, + { + "epoch": 0.96, + "grad_norm": 15.051219940185547, + "learning_rate": 1.567444654195984e-05, + "loss": 0.9093, + "step": 5565 + }, + { + "epoch": 0.96, + "grad_norm": 10.78207015991211, + "learning_rate": 1.5671872318517248e-05, + "loss": 0.6888, + "step": 5566 + }, + { + "epoch": 0.96, + "grad_norm": 10.442506790161133, + "learning_rate": 1.5669298095074655e-05, + "loss": 0.7745, + "step": 5567 + }, + { + "epoch": 0.96, + "grad_norm": 8.913450241088867, + "learning_rate": 1.5666723871632058e-05, + "loss": 0.6244, + "step": 5568 + }, + { + "epoch": 0.96, + "grad_norm": 15.408548355102539, + "learning_rate": 1.5664149648189465e-05, + "loss": 0.875, + "step": 5569 + }, + { + "epoch": 0.96, + "grad_norm": 10.3099365234375, + "learning_rate": 1.5661575424746868e-05, + "loss": 0.7039, + "step": 5570 + }, + { + "epoch": 0.96, + "grad_norm": 8.898088455200195, + "learning_rate": 1.5659001201304274e-05, + "loss": 0.5219, + "step": 5571 + }, + { + "epoch": 0.96, + "grad_norm": 9.329687118530273, + "learning_rate": 1.5656426977861678e-05, + "loss": 0.5974, + "step": 5572 + }, + { + "epoch": 0.96, + "grad_norm": 8.845309257507324, + "learning_rate": 1.5653852754419084e-05, + "loss": 0.6365, + "step": 5573 + }, + { + "epoch": 0.96, + "grad_norm": 10.738419532775879, + "learning_rate": 1.5651278530976488e-05, + "loss": 0.6588, + "step": 5574 + }, + { + "epoch": 0.96, + "grad_norm": 10.223743438720703, + "learning_rate": 1.5648704307533894e-05, + "loss": 0.608, + "step": 5575 + }, + { + "epoch": 0.96, + "grad_norm": 9.70917797088623, + "learning_rate": 1.5646130084091298e-05, + "loss": 0.6659, + "step": 5576 + }, + { + "epoch": 0.96, + "grad_norm": 10.924893379211426, + "learning_rate": 1.5643555860648708e-05, + "loss": 0.879, + "step": 5577 + }, + { + "epoch": 0.96, + "grad_norm": 8.618356704711914, + "learning_rate": 1.564098163720611e-05, + "loss": 0.6395, + "step": 5578 + }, + { + "epoch": 0.96, + "grad_norm": 10.092124938964844, + "learning_rate": 1.5638407413763514e-05, + "loss": 0.4761, + "step": 5579 + }, + { + "epoch": 0.96, + "grad_norm": 12.847647666931152, + "learning_rate": 1.563583319032092e-05, + "loss": 0.6705, + "step": 5580 + }, + { + "epoch": 0.96, + "grad_norm": 13.82312297821045, + "learning_rate": 1.5633258966878324e-05, + "loss": 0.998, + "step": 5581 + }, + { + "epoch": 0.96, + "grad_norm": 11.251140594482422, + "learning_rate": 1.563068474343573e-05, + "loss": 0.6906, + "step": 5582 + }, + { + "epoch": 0.96, + "grad_norm": 12.709558486938477, + "learning_rate": 1.5628110519993134e-05, + "loss": 0.822, + "step": 5583 + }, + { + "epoch": 0.96, + "grad_norm": 10.888821601867676, + "learning_rate": 1.562553629655054e-05, + "loss": 0.7191, + "step": 5584 + }, + { + "epoch": 0.96, + "grad_norm": 10.68293571472168, + "learning_rate": 1.5622962073107944e-05, + "loss": 0.5517, + "step": 5585 + }, + { + "epoch": 0.96, + "grad_norm": 11.949898719787598, + "learning_rate": 1.5620387849665354e-05, + "loss": 0.9547, + "step": 5586 + }, + { + "epoch": 0.96, + "grad_norm": 14.29300308227539, + "learning_rate": 1.5617813626222758e-05, + "loss": 0.739, + "step": 5587 + }, + { + "epoch": 0.96, + "grad_norm": 7.012667179107666, + "learning_rate": 1.561523940278016e-05, + "loss": 0.5322, + "step": 5588 + }, + { + "epoch": 0.96, + "grad_norm": 11.041412353515625, + "learning_rate": 1.5612665179337568e-05, + "loss": 0.7658, + "step": 5589 + }, + { + "epoch": 0.96, + "grad_norm": 11.061575889587402, + "learning_rate": 1.561009095589497e-05, + "loss": 0.7991, + "step": 5590 + }, + { + "epoch": 0.96, + "grad_norm": 9.600547790527344, + "learning_rate": 1.5607516732452378e-05, + "loss": 0.6509, + "step": 5591 + }, + { + "epoch": 0.96, + "grad_norm": 10.921608924865723, + "learning_rate": 1.560494250900978e-05, + "loss": 0.7163, + "step": 5592 + }, + { + "epoch": 0.96, + "grad_norm": 8.13800048828125, + "learning_rate": 1.5602368285567188e-05, + "loss": 0.6234, + "step": 5593 + }, + { + "epoch": 0.96, + "grad_norm": 9.188948631286621, + "learning_rate": 1.559979406212459e-05, + "loss": 0.7124, + "step": 5594 + }, + { + "epoch": 0.96, + "grad_norm": 11.671902656555176, + "learning_rate": 1.5597219838681998e-05, + "loss": 1.0355, + "step": 5595 + }, + { + "epoch": 0.96, + "grad_norm": 14.45449447631836, + "learning_rate": 1.5594645615239404e-05, + "loss": 0.8463, + "step": 5596 + }, + { + "epoch": 0.96, + "grad_norm": 10.132134437561035, + "learning_rate": 1.559207139179681e-05, + "loss": 0.6501, + "step": 5597 + }, + { + "epoch": 0.96, + "grad_norm": 8.015098571777344, + "learning_rate": 1.5589497168354214e-05, + "loss": 0.5043, + "step": 5598 + }, + { + "epoch": 0.96, + "grad_norm": 7.141470909118652, + "learning_rate": 1.5586922944911618e-05, + "loss": 0.3979, + "step": 5599 + }, + { + "epoch": 0.96, + "grad_norm": 8.37172794342041, + "learning_rate": 1.5584348721469024e-05, + "loss": 0.4986, + "step": 5600 + }, + { + "epoch": 0.96, + "grad_norm": 8.976631164550781, + "learning_rate": 1.5581774498026428e-05, + "loss": 0.6687, + "step": 5601 + }, + { + "epoch": 0.96, + "grad_norm": 8.840478897094727, + "learning_rate": 1.5579200274583834e-05, + "loss": 0.7312, + "step": 5602 + }, + { + "epoch": 0.96, + "grad_norm": 13.035346984863281, + "learning_rate": 1.5576626051141237e-05, + "loss": 0.9072, + "step": 5603 + }, + { + "epoch": 0.96, + "grad_norm": 9.700462341308594, + "learning_rate": 1.5574051827698644e-05, + "loss": 0.7465, + "step": 5604 + }, + { + "epoch": 0.96, + "grad_norm": 12.324307441711426, + "learning_rate": 1.557147760425605e-05, + "loss": 0.7753, + "step": 5605 + }, + { + "epoch": 0.96, + "grad_norm": 9.882134437561035, + "learning_rate": 1.5568903380813458e-05, + "loss": 0.5816, + "step": 5606 + }, + { + "epoch": 0.96, + "grad_norm": 8.551189422607422, + "learning_rate": 1.556632915737086e-05, + "loss": 0.6115, + "step": 5607 + }, + { + "epoch": 0.96, + "grad_norm": 8.379423141479492, + "learning_rate": 1.5563754933928267e-05, + "loss": 0.5545, + "step": 5608 + }, + { + "epoch": 0.96, + "grad_norm": 7.434854507446289, + "learning_rate": 1.556118071048567e-05, + "loss": 0.5058, + "step": 5609 + }, + { + "epoch": 0.96, + "grad_norm": 11.850112915039062, + "learning_rate": 1.5558606487043074e-05, + "loss": 0.6237, + "step": 5610 + }, + { + "epoch": 0.96, + "grad_norm": 15.739999771118164, + "learning_rate": 1.555603226360048e-05, + "loss": 0.9191, + "step": 5611 + }, + { + "epoch": 0.96, + "grad_norm": 9.242504119873047, + "learning_rate": 1.5553458040157884e-05, + "loss": 0.5725, + "step": 5612 + }, + { + "epoch": 0.96, + "grad_norm": 13.402344703674316, + "learning_rate": 1.555088381671529e-05, + "loss": 0.8992, + "step": 5613 + }, + { + "epoch": 0.96, + "grad_norm": 9.253782272338867, + "learning_rate": 1.5548309593272694e-05, + "loss": 0.4061, + "step": 5614 + }, + { + "epoch": 0.96, + "grad_norm": 12.940531730651855, + "learning_rate": 1.5545735369830104e-05, + "loss": 0.6459, + "step": 5615 + }, + { + "epoch": 0.96, + "grad_norm": 10.35522174835205, + "learning_rate": 1.5543161146387507e-05, + "loss": 0.7855, + "step": 5616 + }, + { + "epoch": 0.96, + "grad_norm": 8.31906509399414, + "learning_rate": 1.5540586922944914e-05, + "loss": 0.6374, + "step": 5617 + }, + { + "epoch": 0.96, + "grad_norm": 11.768364906311035, + "learning_rate": 1.5538012699502317e-05, + "loss": 0.8213, + "step": 5618 + }, + { + "epoch": 0.96, + "grad_norm": 13.215407371520996, + "learning_rate": 1.553543847605972e-05, + "loss": 0.7401, + "step": 5619 + }, + { + "epoch": 0.96, + "grad_norm": 9.41373348236084, + "learning_rate": 1.5532864252617127e-05, + "loss": 0.7124, + "step": 5620 + }, + { + "epoch": 0.96, + "grad_norm": 10.58165168762207, + "learning_rate": 1.553029002917453e-05, + "loss": 0.5673, + "step": 5621 + }, + { + "epoch": 0.96, + "grad_norm": 11.350859642028809, + "learning_rate": 1.5527715805731937e-05, + "loss": 0.6906, + "step": 5622 + }, + { + "epoch": 0.96, + "grad_norm": 13.630638122558594, + "learning_rate": 1.552514158228934e-05, + "loss": 1.0032, + "step": 5623 + }, + { + "epoch": 0.97, + "grad_norm": 11.555750846862793, + "learning_rate": 1.552256735884675e-05, + "loss": 0.9854, + "step": 5624 + }, + { + "epoch": 0.97, + "grad_norm": 9.529454231262207, + "learning_rate": 1.5519993135404154e-05, + "loss": 0.5593, + "step": 5625 + }, + { + "epoch": 0.97, + "grad_norm": 9.921797752380371, + "learning_rate": 1.551741891196156e-05, + "loss": 0.7022, + "step": 5626 + }, + { + "epoch": 0.97, + "grad_norm": 10.80700969696045, + "learning_rate": 1.5514844688518964e-05, + "loss": 0.7121, + "step": 5627 + }, + { + "epoch": 0.97, + "grad_norm": 8.550198554992676, + "learning_rate": 1.551227046507637e-05, + "loss": 0.451, + "step": 5628 + }, + { + "epoch": 0.97, + "grad_norm": 11.028861999511719, + "learning_rate": 1.5509696241633774e-05, + "loss": 0.7682, + "step": 5629 + }, + { + "epoch": 0.97, + "grad_norm": 10.928542137145996, + "learning_rate": 1.5507122018191177e-05, + "loss": 0.7565, + "step": 5630 + }, + { + "epoch": 0.97, + "grad_norm": 10.143327713012695, + "learning_rate": 1.5504547794748584e-05, + "loss": 0.5115, + "step": 5631 + }, + { + "epoch": 0.97, + "grad_norm": 9.377349853515625, + "learning_rate": 1.5501973571305987e-05, + "loss": 0.6014, + "step": 5632 + }, + { + "epoch": 0.97, + "grad_norm": 14.393840789794922, + "learning_rate": 1.5499399347863394e-05, + "loss": 0.9157, + "step": 5633 + }, + { + "epoch": 0.97, + "grad_norm": 9.588883399963379, + "learning_rate": 1.54968251244208e-05, + "loss": 0.7787, + "step": 5634 + }, + { + "epoch": 0.97, + "grad_norm": 7.148269176483154, + "learning_rate": 1.5494250900978207e-05, + "loss": 0.5022, + "step": 5635 + }, + { + "epoch": 0.97, + "grad_norm": 9.02895450592041, + "learning_rate": 1.549167667753561e-05, + "loss": 0.7828, + "step": 5636 + }, + { + "epoch": 0.97, + "grad_norm": 7.645138740539551, + "learning_rate": 1.5489102454093017e-05, + "loss": 0.688, + "step": 5637 + }, + { + "epoch": 0.97, + "grad_norm": 10.871220588684082, + "learning_rate": 1.548652823065042e-05, + "loss": 0.6839, + "step": 5638 + }, + { + "epoch": 0.97, + "grad_norm": 10.227852821350098, + "learning_rate": 1.5483954007207827e-05, + "loss": 0.6695, + "step": 5639 + }, + { + "epoch": 0.97, + "grad_norm": 10.953507423400879, + "learning_rate": 1.548137978376523e-05, + "loss": 0.6187, + "step": 5640 + }, + { + "epoch": 0.97, + "grad_norm": 10.557047843933105, + "learning_rate": 1.5478805560322634e-05, + "loss": 0.6582, + "step": 5641 + }, + { + "epoch": 0.97, + "grad_norm": 11.249770164489746, + "learning_rate": 1.547623133688004e-05, + "loss": 0.6913, + "step": 5642 + }, + { + "epoch": 0.97, + "grad_norm": 10.216269493103027, + "learning_rate": 1.5473657113437447e-05, + "loss": 0.7298, + "step": 5643 + }, + { + "epoch": 0.97, + "grad_norm": 11.961535453796387, + "learning_rate": 1.5471082889994854e-05, + "loss": 0.7452, + "step": 5644 + }, + { + "epoch": 0.97, + "grad_norm": 12.155769348144531, + "learning_rate": 1.5468508666552257e-05, + "loss": 0.8345, + "step": 5645 + }, + { + "epoch": 0.97, + "grad_norm": 10.022253036499023, + "learning_rate": 1.5465934443109664e-05, + "loss": 0.4066, + "step": 5646 + }, + { + "epoch": 0.97, + "grad_norm": 14.56843090057373, + "learning_rate": 1.5463360219667067e-05, + "loss": 0.9937, + "step": 5647 + }, + { + "epoch": 0.97, + "grad_norm": 11.05449104309082, + "learning_rate": 1.5460785996224474e-05, + "loss": 0.7719, + "step": 5648 + }, + { + "epoch": 0.97, + "grad_norm": 10.686256408691406, + "learning_rate": 1.5458211772781877e-05, + "loss": 0.6502, + "step": 5649 + }, + { + "epoch": 0.97, + "grad_norm": 12.032071113586426, + "learning_rate": 1.5455637549339284e-05, + "loss": 0.7303, + "step": 5650 + }, + { + "epoch": 0.97, + "grad_norm": 10.978096008300781, + "learning_rate": 1.5453063325896687e-05, + "loss": 0.7298, + "step": 5651 + }, + { + "epoch": 0.97, + "grad_norm": 9.85512638092041, + "learning_rate": 1.545048910245409e-05, + "loss": 0.806, + "step": 5652 + }, + { + "epoch": 0.97, + "grad_norm": 12.632328033447266, + "learning_rate": 1.54479148790115e-05, + "loss": 0.688, + "step": 5653 + }, + { + "epoch": 0.97, + "grad_norm": 9.15112018585205, + "learning_rate": 1.5445340655568904e-05, + "loss": 0.6182, + "step": 5654 + }, + { + "epoch": 0.97, + "grad_norm": 9.540310859680176, + "learning_rate": 1.544276643212631e-05, + "loss": 0.5802, + "step": 5655 + }, + { + "epoch": 0.97, + "grad_norm": 10.291088104248047, + "learning_rate": 1.5440192208683714e-05, + "loss": 0.9598, + "step": 5656 + }, + { + "epoch": 0.97, + "grad_norm": 7.799928188323975, + "learning_rate": 1.543761798524112e-05, + "loss": 0.5829, + "step": 5657 + }, + { + "epoch": 0.97, + "grad_norm": 9.369114875793457, + "learning_rate": 1.5435043761798524e-05, + "loss": 0.81, + "step": 5658 + }, + { + "epoch": 0.97, + "grad_norm": 10.463690757751465, + "learning_rate": 1.543246953835593e-05, + "loss": 0.7768, + "step": 5659 + }, + { + "epoch": 0.97, + "grad_norm": 12.777408599853516, + "learning_rate": 1.5429895314913334e-05, + "loss": 0.8269, + "step": 5660 + }, + { + "epoch": 0.97, + "grad_norm": 13.610390663146973, + "learning_rate": 1.5427321091470737e-05, + "loss": 0.8049, + "step": 5661 + }, + { + "epoch": 0.97, + "grad_norm": 10.596389770507812, + "learning_rate": 1.5424746868028147e-05, + "loss": 0.6828, + "step": 5662 + }, + { + "epoch": 0.97, + "grad_norm": 8.544905662536621, + "learning_rate": 1.542217264458555e-05, + "loss": 0.6366, + "step": 5663 + }, + { + "epoch": 0.97, + "grad_norm": 10.65538215637207, + "learning_rate": 1.5419598421142957e-05, + "loss": 0.5219, + "step": 5664 + }, + { + "epoch": 0.97, + "grad_norm": 9.16901969909668, + "learning_rate": 1.541702419770036e-05, + "loss": 0.6199, + "step": 5665 + }, + { + "epoch": 0.97, + "grad_norm": 11.136821746826172, + "learning_rate": 1.5414449974257767e-05, + "loss": 0.7471, + "step": 5666 + }, + { + "epoch": 0.97, + "grad_norm": 12.340278625488281, + "learning_rate": 1.541187575081517e-05, + "loss": 0.82, + "step": 5667 + }, + { + "epoch": 0.97, + "grad_norm": 11.651012420654297, + "learning_rate": 1.5409301527372577e-05, + "loss": 0.9482, + "step": 5668 + }, + { + "epoch": 0.97, + "grad_norm": 10.645423889160156, + "learning_rate": 1.540672730392998e-05, + "loss": 0.6914, + "step": 5669 + }, + { + "epoch": 0.97, + "grad_norm": 9.610738754272461, + "learning_rate": 1.5404153080487387e-05, + "loss": 0.6715, + "step": 5670 + }, + { + "epoch": 0.97, + "grad_norm": 13.92861270904541, + "learning_rate": 1.5401578857044794e-05, + "loss": 0.8211, + "step": 5671 + }, + { + "epoch": 0.97, + "grad_norm": 13.625335693359375, + "learning_rate": 1.5399004633602197e-05, + "loss": 0.7101, + "step": 5672 + }, + { + "epoch": 0.97, + "grad_norm": 11.360447883605957, + "learning_rate": 1.5396430410159604e-05, + "loss": 0.7596, + "step": 5673 + }, + { + "epoch": 0.97, + "grad_norm": 10.988791465759277, + "learning_rate": 1.5393856186717007e-05, + "loss": 0.6096, + "step": 5674 + }, + { + "epoch": 0.97, + "grad_norm": 11.392794609069824, + "learning_rate": 1.5391281963274414e-05, + "loss": 0.9062, + "step": 5675 + }, + { + "epoch": 0.97, + "grad_norm": 13.133210182189941, + "learning_rate": 1.5388707739831817e-05, + "loss": 0.956, + "step": 5676 + }, + { + "epoch": 0.97, + "grad_norm": 11.403499603271484, + "learning_rate": 1.5386133516389223e-05, + "loss": 0.5713, + "step": 5677 + }, + { + "epoch": 0.97, + "grad_norm": 10.655791282653809, + "learning_rate": 1.5383559292946627e-05, + "loss": 0.7032, + "step": 5678 + }, + { + "epoch": 0.97, + "grad_norm": 8.98996639251709, + "learning_rate": 1.5380985069504033e-05, + "loss": 0.4332, + "step": 5679 + }, + { + "epoch": 0.97, + "grad_norm": 9.498340606689453, + "learning_rate": 1.5378410846061437e-05, + "loss": 0.6589, + "step": 5680 + }, + { + "epoch": 0.97, + "grad_norm": 9.076794624328613, + "learning_rate": 1.5375836622618847e-05, + "loss": 0.5753, + "step": 5681 + }, + { + "epoch": 0.98, + "grad_norm": 10.586895942687988, + "learning_rate": 1.537326239917625e-05, + "loss": 0.6369, + "step": 5682 + }, + { + "epoch": 0.98, + "grad_norm": 13.331412315368652, + "learning_rate": 1.5370688175733653e-05, + "loss": 0.8093, + "step": 5683 + }, + { + "epoch": 0.98, + "grad_norm": 9.331785202026367, + "learning_rate": 1.536811395229106e-05, + "loss": 0.5642, + "step": 5684 + }, + { + "epoch": 0.98, + "grad_norm": 14.352324485778809, + "learning_rate": 1.5365539728848463e-05, + "loss": 1.1086, + "step": 5685 + }, + { + "epoch": 0.98, + "grad_norm": 10.84002685546875, + "learning_rate": 1.536296550540587e-05, + "loss": 0.7655, + "step": 5686 + }, + { + "epoch": 0.98, + "grad_norm": 10.961621284484863, + "learning_rate": 1.5360391281963273e-05, + "loss": 0.5642, + "step": 5687 + }, + { + "epoch": 0.98, + "grad_norm": 11.137855529785156, + "learning_rate": 1.535781705852068e-05, + "loss": 0.7061, + "step": 5688 + }, + { + "epoch": 0.98, + "grad_norm": 10.495466232299805, + "learning_rate": 1.5355242835078083e-05, + "loss": 0.8364, + "step": 5689 + }, + { + "epoch": 0.98, + "grad_norm": 10.012463569641113, + "learning_rate": 1.5352668611635493e-05, + "loss": 0.8006, + "step": 5690 + }, + { + "epoch": 0.98, + "grad_norm": 10.657920837402344, + "learning_rate": 1.5350094388192897e-05, + "loss": 0.6529, + "step": 5691 + }, + { + "epoch": 0.98, + "grad_norm": 9.993449211120605, + "learning_rate": 1.53475201647503e-05, + "loss": 0.6604, + "step": 5692 + }, + { + "epoch": 0.98, + "grad_norm": 9.653358459472656, + "learning_rate": 1.5344945941307707e-05, + "loss": 0.788, + "step": 5693 + }, + { + "epoch": 0.98, + "grad_norm": 11.433140754699707, + "learning_rate": 1.534237171786511e-05, + "loss": 0.6557, + "step": 5694 + }, + { + "epoch": 0.98, + "grad_norm": 10.148329734802246, + "learning_rate": 1.5339797494422517e-05, + "loss": 0.7572, + "step": 5695 + }, + { + "epoch": 0.98, + "grad_norm": 12.149693489074707, + "learning_rate": 1.533722327097992e-05, + "loss": 0.5877, + "step": 5696 + }, + { + "epoch": 0.98, + "grad_norm": 11.582533836364746, + "learning_rate": 1.5334649047537327e-05, + "loss": 0.8213, + "step": 5697 + }, + { + "epoch": 0.98, + "grad_norm": 9.772625923156738, + "learning_rate": 1.533207482409473e-05, + "loss": 0.6146, + "step": 5698 + }, + { + "epoch": 0.98, + "grad_norm": 10.03817367553711, + "learning_rate": 1.5329500600652137e-05, + "loss": 0.7681, + "step": 5699 + }, + { + "epoch": 0.98, + "grad_norm": 9.559398651123047, + "learning_rate": 1.5326926377209543e-05, + "loss": 0.6298, + "step": 5700 + }, + { + "epoch": 0.98, + "grad_norm": 9.982650756835938, + "learning_rate": 1.532435215376695e-05, + "loss": 0.6202, + "step": 5701 + }, + { + "epoch": 0.98, + "grad_norm": 8.992537498474121, + "learning_rate": 1.5321777930324353e-05, + "loss": 0.8079, + "step": 5702 + }, + { + "epoch": 0.98, + "grad_norm": 10.939094543457031, + "learning_rate": 1.5319203706881757e-05, + "loss": 0.5668, + "step": 5703 + }, + { + "epoch": 0.98, + "grad_norm": 9.615065574645996, + "learning_rate": 1.5316629483439163e-05, + "loss": 0.6235, + "step": 5704 + }, + { + "epoch": 0.98, + "grad_norm": 11.674843788146973, + "learning_rate": 1.5314055259996567e-05, + "loss": 0.7264, + "step": 5705 + }, + { + "epoch": 0.98, + "grad_norm": 9.875312805175781, + "learning_rate": 1.5311481036553973e-05, + "loss": 0.7015, + "step": 5706 + }, + { + "epoch": 0.98, + "grad_norm": 10.64791202545166, + "learning_rate": 1.5308906813111376e-05, + "loss": 0.8735, + "step": 5707 + }, + { + "epoch": 0.98, + "grad_norm": 13.883646965026855, + "learning_rate": 1.5306332589668783e-05, + "loss": 0.7349, + "step": 5708 + }, + { + "epoch": 0.98, + "grad_norm": 7.677424430847168, + "learning_rate": 1.530375836622619e-05, + "loss": 0.5359, + "step": 5709 + }, + { + "epoch": 0.98, + "grad_norm": 9.181770324707031, + "learning_rate": 1.5301184142783597e-05, + "loss": 0.5543, + "step": 5710 + }, + { + "epoch": 0.98, + "grad_norm": 17.04793930053711, + "learning_rate": 1.5298609919341e-05, + "loss": 0.9618, + "step": 5711 + }, + { + "epoch": 0.98, + "grad_norm": 9.564706802368164, + "learning_rate": 1.5296035695898407e-05, + "loss": 0.4316, + "step": 5712 + }, + { + "epoch": 0.98, + "grad_norm": 10.771110534667969, + "learning_rate": 1.529346147245581e-05, + "loss": 0.7018, + "step": 5713 + }, + { + "epoch": 0.98, + "grad_norm": 10.063695907592773, + "learning_rate": 1.5290887249013213e-05, + "loss": 0.7138, + "step": 5714 + }, + { + "epoch": 0.98, + "grad_norm": 10.364028930664062, + "learning_rate": 1.528831302557062e-05, + "loss": 0.6103, + "step": 5715 + }, + { + "epoch": 0.98, + "grad_norm": 9.407870292663574, + "learning_rate": 1.5285738802128023e-05, + "loss": 0.6776, + "step": 5716 + }, + { + "epoch": 0.98, + "grad_norm": 11.684124946594238, + "learning_rate": 1.528316457868543e-05, + "loss": 0.8618, + "step": 5717 + }, + { + "epoch": 0.98, + "grad_norm": 11.859552383422852, + "learning_rate": 1.5280590355242833e-05, + "loss": 0.7771, + "step": 5718 + }, + { + "epoch": 0.98, + "grad_norm": 12.708332061767578, + "learning_rate": 1.5278016131800243e-05, + "loss": 0.9494, + "step": 5719 + }, + { + "epoch": 0.98, + "grad_norm": 9.849374771118164, + "learning_rate": 1.5275441908357646e-05, + "loss": 0.6346, + "step": 5720 + }, + { + "epoch": 0.98, + "grad_norm": 9.793769836425781, + "learning_rate": 1.5272867684915053e-05, + "loss": 0.6131, + "step": 5721 + }, + { + "epoch": 0.98, + "grad_norm": 10.740447998046875, + "learning_rate": 1.5270293461472456e-05, + "loss": 0.9266, + "step": 5722 + }, + { + "epoch": 0.98, + "grad_norm": 10.526744842529297, + "learning_rate": 1.526771923802986e-05, + "loss": 0.9554, + "step": 5723 + }, + { + "epoch": 0.98, + "grad_norm": 10.141620635986328, + "learning_rate": 1.5265145014587266e-05, + "loss": 0.7873, + "step": 5724 + }, + { + "epoch": 0.98, + "grad_norm": 7.561409950256348, + "learning_rate": 1.526257079114467e-05, + "loss": 0.5033, + "step": 5725 + }, + { + "epoch": 0.98, + "grad_norm": 11.583409309387207, + "learning_rate": 1.5259996567702076e-05, + "loss": 0.7237, + "step": 5726 + }, + { + "epoch": 0.98, + "grad_norm": 8.0056791305542, + "learning_rate": 1.5257422344259481e-05, + "loss": 0.5162, + "step": 5727 + }, + { + "epoch": 0.98, + "grad_norm": 8.778443336486816, + "learning_rate": 1.5254848120816888e-05, + "loss": 0.6362, + "step": 5728 + }, + { + "epoch": 0.98, + "grad_norm": 11.222827911376953, + "learning_rate": 1.5252273897374293e-05, + "loss": 0.9317, + "step": 5729 + }, + { + "epoch": 0.98, + "grad_norm": 7.880953311920166, + "learning_rate": 1.5249699673931698e-05, + "loss": 0.4786, + "step": 5730 + }, + { + "epoch": 0.98, + "grad_norm": 11.761146545410156, + "learning_rate": 1.5247125450489103e-05, + "loss": 0.4865, + "step": 5731 + }, + { + "epoch": 0.98, + "grad_norm": 10.01679515838623, + "learning_rate": 1.5244551227046508e-05, + "loss": 0.7607, + "step": 5732 + }, + { + "epoch": 0.98, + "grad_norm": 10.966072082519531, + "learning_rate": 1.5241977003603913e-05, + "loss": 0.6358, + "step": 5733 + }, + { + "epoch": 0.98, + "grad_norm": 9.958866119384766, + "learning_rate": 1.5239402780161318e-05, + "loss": 0.6514, + "step": 5734 + }, + { + "epoch": 0.98, + "grad_norm": 10.584814071655273, + "learning_rate": 1.5236828556718723e-05, + "loss": 0.6512, + "step": 5735 + }, + { + "epoch": 0.98, + "grad_norm": 9.573479652404785, + "learning_rate": 1.5234254333276128e-05, + "loss": 0.5867, + "step": 5736 + }, + { + "epoch": 0.98, + "grad_norm": 9.151217460632324, + "learning_rate": 1.5231680109833533e-05, + "loss": 0.5512, + "step": 5737 + }, + { + "epoch": 0.98, + "grad_norm": 9.794495582580566, + "learning_rate": 1.5229105886390941e-05, + "loss": 0.6392, + "step": 5738 + }, + { + "epoch": 0.98, + "grad_norm": 14.396523475646973, + "learning_rate": 1.5226531662948345e-05, + "loss": 0.6959, + "step": 5739 + }, + { + "epoch": 0.99, + "grad_norm": 11.361831665039062, + "learning_rate": 1.522395743950575e-05, + "loss": 0.6456, + "step": 5740 + }, + { + "epoch": 0.99, + "grad_norm": 11.718506813049316, + "learning_rate": 1.5221383216063155e-05, + "loss": 0.837, + "step": 5741 + }, + { + "epoch": 0.99, + "grad_norm": 12.24177074432373, + "learning_rate": 1.521880899262056e-05, + "loss": 0.7099, + "step": 5742 + }, + { + "epoch": 0.99, + "grad_norm": 10.529495239257812, + "learning_rate": 1.5216234769177965e-05, + "loss": 0.7896, + "step": 5743 + }, + { + "epoch": 0.99, + "grad_norm": 9.81344223022461, + "learning_rate": 1.521366054573537e-05, + "loss": 0.6909, + "step": 5744 + }, + { + "epoch": 0.99, + "grad_norm": 12.75560474395752, + "learning_rate": 1.5211086322292774e-05, + "loss": 0.52, + "step": 5745 + }, + { + "epoch": 0.99, + "grad_norm": 9.679831504821777, + "learning_rate": 1.520851209885018e-05, + "loss": 0.5626, + "step": 5746 + }, + { + "epoch": 0.99, + "grad_norm": 11.186136245727539, + "learning_rate": 1.5205937875407588e-05, + "loss": 0.6164, + "step": 5747 + }, + { + "epoch": 0.99, + "grad_norm": 11.998634338378906, + "learning_rate": 1.5203363651964993e-05, + "loss": 0.776, + "step": 5748 + }, + { + "epoch": 0.99, + "grad_norm": 8.41545295715332, + "learning_rate": 1.5200789428522396e-05, + "loss": 0.4787, + "step": 5749 + }, + { + "epoch": 0.99, + "grad_norm": 13.25280475616455, + "learning_rate": 1.5198215205079801e-05, + "loss": 0.5787, + "step": 5750 + }, + { + "epoch": 0.99, + "grad_norm": 11.7398681640625, + "learning_rate": 1.5195640981637206e-05, + "loss": 0.7759, + "step": 5751 + }, + { + "epoch": 0.99, + "grad_norm": 12.124595642089844, + "learning_rate": 1.5193066758194611e-05, + "loss": 0.6163, + "step": 5752 + }, + { + "epoch": 0.99, + "grad_norm": 11.229990005493164, + "learning_rate": 1.5190492534752016e-05, + "loss": 0.6962, + "step": 5753 + }, + { + "epoch": 0.99, + "grad_norm": 9.292335510253906, + "learning_rate": 1.5187918311309421e-05, + "loss": 0.7061, + "step": 5754 + }, + { + "epoch": 0.99, + "grad_norm": 10.922220230102539, + "learning_rate": 1.5185344087866826e-05, + "loss": 0.6132, + "step": 5755 + }, + { + "epoch": 0.99, + "grad_norm": 10.665719032287598, + "learning_rate": 1.5182769864424231e-05, + "loss": 0.6725, + "step": 5756 + }, + { + "epoch": 0.99, + "grad_norm": 9.034870147705078, + "learning_rate": 1.518019564098164e-05, + "loss": 0.6868, + "step": 5757 + }, + { + "epoch": 0.99, + "grad_norm": 11.273383140563965, + "learning_rate": 1.5177621417539044e-05, + "loss": 0.54, + "step": 5758 + }, + { + "epoch": 0.99, + "grad_norm": 11.658744812011719, + "learning_rate": 1.517504719409645e-05, + "loss": 0.5812, + "step": 5759 + }, + { + "epoch": 0.99, + "grad_norm": 9.452827453613281, + "learning_rate": 1.5172472970653853e-05, + "loss": 0.5922, + "step": 5760 + }, + { + "epoch": 0.99, + "grad_norm": 15.487955093383789, + "learning_rate": 1.5169898747211258e-05, + "loss": 0.7667, + "step": 5761 + }, + { + "epoch": 0.99, + "grad_norm": 11.797428131103516, + "learning_rate": 1.5167324523768663e-05, + "loss": 0.6671, + "step": 5762 + }, + { + "epoch": 0.99, + "grad_norm": 9.021202087402344, + "learning_rate": 1.5164750300326068e-05, + "loss": 0.4791, + "step": 5763 + }, + { + "epoch": 0.99, + "grad_norm": 13.655767440795898, + "learning_rate": 1.5162176076883473e-05, + "loss": 1.0293, + "step": 5764 + }, + { + "epoch": 0.99, + "grad_norm": 12.35323429107666, + "learning_rate": 1.5159601853440878e-05, + "loss": 0.6349, + "step": 5765 + }, + { + "epoch": 0.99, + "grad_norm": 12.621145248413086, + "learning_rate": 1.5157027629998286e-05, + "loss": 0.4933, + "step": 5766 + }, + { + "epoch": 0.99, + "grad_norm": 12.433503150939941, + "learning_rate": 1.5154453406555691e-05, + "loss": 0.7231, + "step": 5767 + }, + { + "epoch": 0.99, + "grad_norm": 11.64327621459961, + "learning_rate": 1.5151879183113096e-05, + "loss": 0.8214, + "step": 5768 + }, + { + "epoch": 0.99, + "grad_norm": 8.654711723327637, + "learning_rate": 1.5149304959670501e-05, + "loss": 0.5217, + "step": 5769 + }, + { + "epoch": 0.99, + "grad_norm": 8.769917488098145, + "learning_rate": 1.5146730736227904e-05, + "loss": 0.7467, + "step": 5770 + }, + { + "epoch": 0.99, + "grad_norm": 8.103089332580566, + "learning_rate": 1.514415651278531e-05, + "loss": 0.4932, + "step": 5771 + }, + { + "epoch": 0.99, + "grad_norm": 8.62012004852295, + "learning_rate": 1.5141582289342714e-05, + "loss": 0.5993, + "step": 5772 + }, + { + "epoch": 0.99, + "grad_norm": 10.610448837280273, + "learning_rate": 1.513900806590012e-05, + "loss": 0.6998, + "step": 5773 + }, + { + "epoch": 0.99, + "grad_norm": 11.021026611328125, + "learning_rate": 1.5136433842457524e-05, + "loss": 0.5324, + "step": 5774 + }, + { + "epoch": 0.99, + "grad_norm": 7.560358047485352, + "learning_rate": 1.5133859619014933e-05, + "loss": 0.4598, + "step": 5775 + }, + { + "epoch": 0.99, + "grad_norm": 14.713452339172363, + "learning_rate": 1.5131285395572338e-05, + "loss": 0.8595, + "step": 5776 + }, + { + "epoch": 0.99, + "grad_norm": 8.286099433898926, + "learning_rate": 1.5128711172129743e-05, + "loss": 0.7166, + "step": 5777 + }, + { + "epoch": 0.99, + "grad_norm": 10.132092475891113, + "learning_rate": 1.5126136948687148e-05, + "loss": 0.5737, + "step": 5778 + }, + { + "epoch": 0.99, + "grad_norm": 9.688316345214844, + "learning_rate": 1.5123562725244553e-05, + "loss": 0.7009, + "step": 5779 + }, + { + "epoch": 0.99, + "grad_norm": 12.423620223999023, + "learning_rate": 1.5120988501801958e-05, + "loss": 0.6277, + "step": 5780 + }, + { + "epoch": 0.99, + "grad_norm": 11.948112487792969, + "learning_rate": 1.511841427835936e-05, + "loss": 0.8181, + "step": 5781 + }, + { + "epoch": 0.99, + "grad_norm": 9.351773262023926, + "learning_rate": 1.5115840054916766e-05, + "loss": 0.5594, + "step": 5782 + }, + { + "epoch": 0.99, + "grad_norm": 12.852547645568848, + "learning_rate": 1.511326583147417e-05, + "loss": 0.9714, + "step": 5783 + }, + { + "epoch": 0.99, + "grad_norm": 10.424971580505371, + "learning_rate": 1.5110691608031576e-05, + "loss": 0.6949, + "step": 5784 + }, + { + "epoch": 0.99, + "grad_norm": 11.850028038024902, + "learning_rate": 1.5108117384588984e-05, + "loss": 0.8682, + "step": 5785 + }, + { + "epoch": 0.99, + "grad_norm": 9.023741722106934, + "learning_rate": 1.5105543161146389e-05, + "loss": 0.6199, + "step": 5786 + }, + { + "epoch": 0.99, + "grad_norm": 10.838698387145996, + "learning_rate": 1.5102968937703794e-05, + "loss": 0.5625, + "step": 5787 + }, + { + "epoch": 0.99, + "grad_norm": 10.173796653747559, + "learning_rate": 1.5100394714261199e-05, + "loss": 0.6288, + "step": 5788 + }, + { + "epoch": 0.99, + "grad_norm": 8.8538818359375, + "learning_rate": 1.5097820490818604e-05, + "loss": 0.4745, + "step": 5789 + }, + { + "epoch": 0.99, + "grad_norm": 7.6320319175720215, + "learning_rate": 1.5095246267376009e-05, + "loss": 0.527, + "step": 5790 + }, + { + "epoch": 0.99, + "grad_norm": 10.076807975769043, + "learning_rate": 1.5092672043933412e-05, + "loss": 0.5604, + "step": 5791 + }, + { + "epoch": 0.99, + "grad_norm": 9.503802299499512, + "learning_rate": 1.5090097820490817e-05, + "loss": 0.5523, + "step": 5792 + }, + { + "epoch": 0.99, + "grad_norm": 12.966728210449219, + "learning_rate": 1.5087523597048222e-05, + "loss": 0.8937, + "step": 5793 + }, + { + "epoch": 0.99, + "grad_norm": 12.7310209274292, + "learning_rate": 1.508494937360563e-05, + "loss": 0.8369, + "step": 5794 + }, + { + "epoch": 0.99, + "grad_norm": 9.883963584899902, + "learning_rate": 1.5082375150163036e-05, + "loss": 0.7915, + "step": 5795 + }, + { + "epoch": 0.99, + "grad_norm": 9.055442810058594, + "learning_rate": 1.507980092672044e-05, + "loss": 0.5225, + "step": 5796 + }, + { + "epoch": 0.99, + "grad_norm": 11.309677124023438, + "learning_rate": 1.5077226703277846e-05, + "loss": 0.5614, + "step": 5797 + }, + { + "epoch": 1.0, + "grad_norm": 11.3325834274292, + "learning_rate": 1.507465247983525e-05, + "loss": 0.8401, + "step": 5798 + }, + { + "epoch": 1.0, + "grad_norm": 13.112789154052734, + "learning_rate": 1.5072078256392656e-05, + "loss": 0.7446, + "step": 5799 + }, + { + "epoch": 1.0, + "grad_norm": 9.968636512756348, + "learning_rate": 1.506950403295006e-05, + "loss": 0.5288, + "step": 5800 + }, + { + "epoch": 1.0, + "grad_norm": 12.94079875946045, + "learning_rate": 1.5066929809507466e-05, + "loss": 0.9001, + "step": 5801 + }, + { + "epoch": 1.0, + "grad_norm": 10.881861686706543, + "learning_rate": 1.5064355586064869e-05, + "loss": 0.591, + "step": 5802 + }, + { + "epoch": 1.0, + "grad_norm": 8.56390380859375, + "learning_rate": 1.5061781362622274e-05, + "loss": 0.4244, + "step": 5803 + }, + { + "epoch": 1.0, + "grad_norm": 10.640572547912598, + "learning_rate": 1.5059207139179682e-05, + "loss": 0.6966, + "step": 5804 + }, + { + "epoch": 1.0, + "grad_norm": 13.12524700164795, + "learning_rate": 1.5056632915737087e-05, + "loss": 0.7201, + "step": 5805 + }, + { + "epoch": 1.0, + "grad_norm": 12.12606430053711, + "learning_rate": 1.5054058692294492e-05, + "loss": 0.9234, + "step": 5806 + }, + { + "epoch": 1.0, + "grad_norm": 7.305171489715576, + "learning_rate": 1.5051484468851897e-05, + "loss": 0.4994, + "step": 5807 + }, + { + "epoch": 1.0, + "grad_norm": 10.895528793334961, + "learning_rate": 1.5048910245409302e-05, + "loss": 0.5146, + "step": 5808 + }, + { + "epoch": 1.0, + "grad_norm": 9.512129783630371, + "learning_rate": 1.5046336021966707e-05, + "loss": 0.674, + "step": 5809 + }, + { + "epoch": 1.0, + "grad_norm": 9.729283332824707, + "learning_rate": 1.5043761798524112e-05, + "loss": 0.5777, + "step": 5810 + }, + { + "epoch": 1.0, + "grad_norm": 13.76346206665039, + "learning_rate": 1.5041187575081517e-05, + "loss": 0.7952, + "step": 5811 + }, + { + "epoch": 1.0, + "grad_norm": 10.924771308898926, + "learning_rate": 1.503861335163892e-05, + "loss": 0.6634, + "step": 5812 + }, + { + "epoch": 1.0, + "grad_norm": 12.528681755065918, + "learning_rate": 1.5036039128196329e-05, + "loss": 0.7472, + "step": 5813 + }, + { + "epoch": 1.0, + "grad_norm": 9.787907600402832, + "learning_rate": 1.5033464904753734e-05, + "loss": 0.501, + "step": 5814 + }, + { + "epoch": 1.0, + "grad_norm": 7.359318733215332, + "learning_rate": 1.5030890681311139e-05, + "loss": 0.567, + "step": 5815 + }, + { + "epoch": 1.0, + "grad_norm": 10.569374084472656, + "learning_rate": 1.5028316457868544e-05, + "loss": 0.796, + "step": 5816 + }, + { + "epoch": 1.0, + "grad_norm": 13.525404930114746, + "learning_rate": 1.5025742234425949e-05, + "loss": 0.8468, + "step": 5817 + }, + { + "epoch": 1.0, + "grad_norm": 11.533405303955078, + "learning_rate": 1.5023168010983354e-05, + "loss": 0.7123, + "step": 5818 + }, + { + "epoch": 1.0, + "grad_norm": 10.92939567565918, + "learning_rate": 1.5020593787540759e-05, + "loss": 0.6604, + "step": 5819 + }, + { + "epoch": 1.0, + "grad_norm": 12.282258033752441, + "learning_rate": 1.5018019564098164e-05, + "loss": 0.6263, + "step": 5820 + }, + { + "epoch": 1.0, + "grad_norm": 12.508992195129395, + "learning_rate": 1.5015445340655569e-05, + "loss": 0.7314, + "step": 5821 + }, + { + "epoch": 1.0, + "grad_norm": 9.85505485534668, + "learning_rate": 1.5012871117212972e-05, + "loss": 0.5178, + "step": 5822 + }, + { + "epoch": 1.0, + "grad_norm": 12.519545555114746, + "learning_rate": 1.501029689377038e-05, + "loss": 1.1049, + "step": 5823 + }, + { + "epoch": 1.0, + "grad_norm": 11.095693588256836, + "learning_rate": 1.5007722670327785e-05, + "loss": 0.4994, + "step": 5824 + }, + { + "epoch": 1.0, + "grad_norm": 11.617183685302734, + "learning_rate": 1.500514844688519e-05, + "loss": 0.733, + "step": 5825 + }, + { + "epoch": 1.0, + "grad_norm": 9.229683876037598, + "learning_rate": 1.5002574223442595e-05, + "loss": 0.6827, + "step": 5826 + }, + { + "epoch": 1.0, + "grad_norm": 13.437691688537598, + "learning_rate": 1.5e-05, + "loss": 0.5757, + "step": 5827 + }, + { + "epoch": 1.0, + "grad_norm": 9.996328353881836, + "learning_rate": 1.4997425776557405e-05, + "loss": 0.5054, + "step": 5828 + }, + { + "epoch": 1.0, + "grad_norm": 9.449949264526367, + "learning_rate": 1.499485155311481e-05, + "loss": 0.7316, + "step": 5829 + }, + { + "epoch": 1.0, + "grad_norm": 10.471872329711914, + "learning_rate": 1.4992277329672217e-05, + "loss": 0.6715, + "step": 5830 + }, + { + "epoch": 1.0, + "grad_norm": 9.9858980178833, + "learning_rate": 1.4989703106229622e-05, + "loss": 0.4177, + "step": 5831 + }, + { + "epoch": 1.0, + "grad_norm": 9.0646390914917, + "learning_rate": 1.4987128882787027e-05, + "loss": 0.5055, + "step": 5832 + }, + { + "epoch": 1.0, + "grad_norm": 8.068058013916016, + "learning_rate": 1.498455465934443e-05, + "loss": 0.5957, + "step": 5833 + }, + { + "epoch": 1.0, + "grad_norm": 8.851325035095215, + "learning_rate": 1.4981980435901835e-05, + "loss": 0.5739, + "step": 5834 + }, + { + "epoch": 1.0, + "grad_norm": 8.4882230758667, + "learning_rate": 1.4979406212459242e-05, + "loss": 0.4547, + "step": 5835 + }, + { + "epoch": 1.0, + "grad_norm": 9.539527893066406, + "learning_rate": 1.4976831989016647e-05, + "loss": 0.5826, + "step": 5836 + }, + { + "epoch": 1.0, + "grad_norm": 9.06041145324707, + "learning_rate": 1.4974257765574052e-05, + "loss": 0.6335, + "step": 5837 + }, + { + "epoch": 1.0, + "grad_norm": 10.827814102172852, + "learning_rate": 1.4971683542131457e-05, + "loss": 0.6449, + "step": 5838 + }, + { + "epoch": 1.0, + "grad_norm": 11.952459335327148, + "learning_rate": 1.4969109318688864e-05, + "loss": 0.6177, + "step": 5839 + }, + { + "epoch": 1.0, + "grad_norm": 9.339852333068848, + "learning_rate": 1.4966535095246269e-05, + "loss": 0.7179, + "step": 5840 + }, + { + "epoch": 1.0, + "grad_norm": 10.543264389038086, + "learning_rate": 1.4963960871803674e-05, + "loss": 0.6508, + "step": 5841 + }, + { + "epoch": 1.0, + "grad_norm": 8.495478630065918, + "learning_rate": 1.4961386648361079e-05, + "loss": 0.6763, + "step": 5842 + }, + { + "epoch": 1.0, + "grad_norm": 10.267716407775879, + "learning_rate": 1.4958812424918482e-05, + "loss": 0.7016, + "step": 5843 + }, + { + "epoch": 1.0, + "grad_norm": 10.899517059326172, + "learning_rate": 1.4956238201475889e-05, + "loss": 0.7314, + "step": 5844 + }, + { + "epoch": 1.0, + "grad_norm": 7.8445658683776855, + "learning_rate": 1.4953663978033294e-05, + "loss": 0.5011, + "step": 5845 + }, + { + "epoch": 1.0, + "grad_norm": 11.901155471801758, + "learning_rate": 1.4951089754590699e-05, + "loss": 0.813, + "step": 5846 + }, + { + "epoch": 1.0, + "grad_norm": 9.040955543518066, + "learning_rate": 1.4948515531148104e-05, + "loss": 0.6457, + "step": 5847 + }, + { + "epoch": 1.0, + "grad_norm": 8.827336311340332, + "learning_rate": 1.4945941307705509e-05, + "loss": 0.5509, + "step": 5848 + }, + { + "epoch": 1.0, + "grad_norm": 9.569506645202637, + "learning_rate": 1.4943367084262915e-05, + "loss": 0.5852, + "step": 5849 + }, + { + "epoch": 1.0, + "grad_norm": 11.296262741088867, + "learning_rate": 1.494079286082032e-05, + "loss": 0.4235, + "step": 5850 + }, + { + "epoch": 1.0, + "grad_norm": 7.892028331756592, + "learning_rate": 1.4938218637377725e-05, + "loss": 0.4394, + "step": 5851 + }, + { + "epoch": 1.0, + "grad_norm": 9.314630508422852, + "learning_rate": 1.493564441393513e-05, + "loss": 0.6473, + "step": 5852 + }, + { + "epoch": 1.0, + "grad_norm": 8.771956443786621, + "learning_rate": 1.4933070190492535e-05, + "loss": 0.5424, + "step": 5853 + }, + { + "epoch": 1.0, + "grad_norm": 10.939587593078613, + "learning_rate": 1.493049596704994e-05, + "loss": 0.8219, + "step": 5854 + }, + { + "epoch": 1.0, + "grad_norm": 6.545926570892334, + "learning_rate": 1.4927921743607345e-05, + "loss": 0.4095, + "step": 5855 + }, + { + "epoch": 1.0, + "grad_norm": 9.807062149047852, + "learning_rate": 1.492534752016475e-05, + "loss": 0.5949, + "step": 5856 + }, + { + "epoch": 1.01, + "grad_norm": 9.42801570892334, + "learning_rate": 1.4922773296722155e-05, + "loss": 0.4882, + "step": 5857 + }, + { + "epoch": 1.01, + "grad_norm": 10.682374954223633, + "learning_rate": 1.4920199073279562e-05, + "loss": 0.6485, + "step": 5858 + }, + { + "epoch": 1.01, + "grad_norm": 10.165955543518066, + "learning_rate": 1.4917624849836967e-05, + "loss": 0.6441, + "step": 5859 + }, + { + "epoch": 1.01, + "grad_norm": 10.182374954223633, + "learning_rate": 1.4915050626394372e-05, + "loss": 0.5382, + "step": 5860 + }, + { + "epoch": 1.01, + "grad_norm": 12.19049072265625, + "learning_rate": 1.4912476402951777e-05, + "loss": 0.5167, + "step": 5861 + }, + { + "epoch": 1.01, + "grad_norm": 9.343765258789062, + "learning_rate": 1.4909902179509182e-05, + "loss": 0.5953, + "step": 5862 + }, + { + "epoch": 1.01, + "grad_norm": 14.150381088256836, + "learning_rate": 1.4907327956066588e-05, + "loss": 0.8045, + "step": 5863 + }, + { + "epoch": 1.01, + "grad_norm": 13.685317039489746, + "learning_rate": 1.4904753732623992e-05, + "loss": 0.5688, + "step": 5864 + }, + { + "epoch": 1.01, + "grad_norm": 14.326414108276367, + "learning_rate": 1.4902179509181397e-05, + "loss": 0.8766, + "step": 5865 + }, + { + "epoch": 1.01, + "grad_norm": 6.847957134246826, + "learning_rate": 1.4899605285738802e-05, + "loss": 0.4393, + "step": 5866 + }, + { + "epoch": 1.01, + "grad_norm": 11.0089693069458, + "learning_rate": 1.4897031062296207e-05, + "loss": 0.5271, + "step": 5867 + }, + { + "epoch": 1.01, + "grad_norm": 7.443350315093994, + "learning_rate": 1.4894456838853613e-05, + "loss": 0.4036, + "step": 5868 + }, + { + "epoch": 1.01, + "grad_norm": 12.425400733947754, + "learning_rate": 1.4891882615411018e-05, + "loss": 0.8541, + "step": 5869 + }, + { + "epoch": 1.01, + "grad_norm": 8.673444747924805, + "learning_rate": 1.4889308391968423e-05, + "loss": 0.6116, + "step": 5870 + }, + { + "epoch": 1.01, + "grad_norm": 8.117676734924316, + "learning_rate": 1.4886734168525828e-05, + "loss": 0.4866, + "step": 5871 + }, + { + "epoch": 1.01, + "grad_norm": 10.535839080810547, + "learning_rate": 1.4884159945083235e-05, + "loss": 0.496, + "step": 5872 + }, + { + "epoch": 1.01, + "grad_norm": 10.214252471923828, + "learning_rate": 1.488158572164064e-05, + "loss": 0.6112, + "step": 5873 + }, + { + "epoch": 1.01, + "grad_norm": 11.156107902526855, + "learning_rate": 1.4879011498198043e-05, + "loss": 0.7178, + "step": 5874 + }, + { + "epoch": 1.01, + "grad_norm": 12.114778518676758, + "learning_rate": 1.4876437274755448e-05, + "loss": 0.716, + "step": 5875 + }, + { + "epoch": 1.01, + "grad_norm": 13.132325172424316, + "learning_rate": 1.4873863051312853e-05, + "loss": 0.8676, + "step": 5876 + }, + { + "epoch": 1.01, + "grad_norm": 11.38235092163086, + "learning_rate": 1.487128882787026e-05, + "loss": 0.5939, + "step": 5877 + }, + { + "epoch": 1.01, + "grad_norm": 10.981067657470703, + "learning_rate": 1.4868714604427665e-05, + "loss": 0.595, + "step": 5878 + }, + { + "epoch": 1.01, + "grad_norm": 9.459383010864258, + "learning_rate": 1.486614038098507e-05, + "loss": 0.7795, + "step": 5879 + }, + { + "epoch": 1.01, + "grad_norm": 9.010276794433594, + "learning_rate": 1.4863566157542475e-05, + "loss": 0.7097, + "step": 5880 + }, + { + "epoch": 1.01, + "grad_norm": 6.208502769470215, + "learning_rate": 1.486099193409988e-05, + "loss": 0.3053, + "step": 5881 + }, + { + "epoch": 1.01, + "grad_norm": 8.99191665649414, + "learning_rate": 1.4858417710657287e-05, + "loss": 0.4777, + "step": 5882 + }, + { + "epoch": 1.01, + "grad_norm": 9.625293731689453, + "learning_rate": 1.4855843487214692e-05, + "loss": 0.6028, + "step": 5883 + }, + { + "epoch": 1.01, + "grad_norm": 8.562214851379395, + "learning_rate": 1.4853269263772097e-05, + "loss": 0.5358, + "step": 5884 + }, + { + "epoch": 1.01, + "grad_norm": 9.344232559204102, + "learning_rate": 1.48506950403295e-05, + "loss": 0.5031, + "step": 5885 + }, + { + "epoch": 1.01, + "grad_norm": 12.200315475463867, + "learning_rate": 1.4848120816886905e-05, + "loss": 0.6646, + "step": 5886 + }, + { + "epoch": 1.01, + "grad_norm": 9.999994277954102, + "learning_rate": 1.4845546593444311e-05, + "loss": 0.5921, + "step": 5887 + }, + { + "epoch": 1.01, + "grad_norm": 8.149828910827637, + "learning_rate": 1.4842972370001716e-05, + "loss": 0.4568, + "step": 5888 + }, + { + "epoch": 1.01, + "grad_norm": 8.828720092773438, + "learning_rate": 1.4840398146559121e-05, + "loss": 0.5932, + "step": 5889 + }, + { + "epoch": 1.01, + "grad_norm": 7.309082984924316, + "learning_rate": 1.4837823923116526e-05, + "loss": 0.4213, + "step": 5890 + }, + { + "epoch": 1.01, + "grad_norm": 9.649279594421387, + "learning_rate": 1.4835249699673933e-05, + "loss": 0.4919, + "step": 5891 + }, + { + "epoch": 1.01, + "grad_norm": 10.159218788146973, + "learning_rate": 1.4832675476231338e-05, + "loss": 0.6065, + "step": 5892 + }, + { + "epoch": 1.01, + "grad_norm": 13.690546035766602, + "learning_rate": 1.4830101252788743e-05, + "loss": 0.5822, + "step": 5893 + }, + { + "epoch": 1.01, + "grad_norm": 8.738336563110352, + "learning_rate": 1.4827527029346148e-05, + "loss": 0.5516, + "step": 5894 + }, + { + "epoch": 1.01, + "grad_norm": 9.259581565856934, + "learning_rate": 1.4824952805903551e-05, + "loss": 0.5583, + "step": 5895 + }, + { + "epoch": 1.01, + "grad_norm": 11.709681510925293, + "learning_rate": 1.4822378582460958e-05, + "loss": 0.4218, + "step": 5896 + }, + { + "epoch": 1.01, + "grad_norm": 11.941258430480957, + "learning_rate": 1.4819804359018363e-05, + "loss": 0.3773, + "step": 5897 + }, + { + "epoch": 1.01, + "grad_norm": 12.74229621887207, + "learning_rate": 1.4817230135575768e-05, + "loss": 0.6493, + "step": 5898 + }, + { + "epoch": 1.01, + "grad_norm": 10.19922161102295, + "learning_rate": 1.4814655912133173e-05, + "loss": 0.5747, + "step": 5899 + }, + { + "epoch": 1.01, + "grad_norm": 9.202072143554688, + "learning_rate": 1.4812081688690578e-05, + "loss": 0.4071, + "step": 5900 + }, + { + "epoch": 1.01, + "grad_norm": 11.328014373779297, + "learning_rate": 1.4809507465247985e-05, + "loss": 0.7073, + "step": 5901 + }, + { + "epoch": 1.01, + "grad_norm": 13.359185218811035, + "learning_rate": 1.480693324180539e-05, + "loss": 0.6825, + "step": 5902 + }, + { + "epoch": 1.01, + "grad_norm": 10.029674530029297, + "learning_rate": 1.4804359018362795e-05, + "loss": 0.4647, + "step": 5903 + }, + { + "epoch": 1.01, + "grad_norm": 12.793015480041504, + "learning_rate": 1.48017847949202e-05, + "loss": 0.7146, + "step": 5904 + }, + { + "epoch": 1.01, + "grad_norm": 8.733183860778809, + "learning_rate": 1.4799210571477605e-05, + "loss": 0.3918, + "step": 5905 + }, + { + "epoch": 1.01, + "grad_norm": 8.152375221252441, + "learning_rate": 1.479663634803501e-05, + "loss": 0.4684, + "step": 5906 + }, + { + "epoch": 1.01, + "grad_norm": 8.329442024230957, + "learning_rate": 1.4794062124592415e-05, + "loss": 0.5085, + "step": 5907 + }, + { + "epoch": 1.01, + "grad_norm": 11.1718111038208, + "learning_rate": 1.479148790114982e-05, + "loss": 0.5066, + "step": 5908 + }, + { + "epoch": 1.01, + "grad_norm": 13.822720527648926, + "learning_rate": 1.4788913677707225e-05, + "loss": 0.6753, + "step": 5909 + }, + { + "epoch": 1.01, + "grad_norm": 11.792845726013184, + "learning_rate": 1.4786339454264631e-05, + "loss": 0.3951, + "step": 5910 + }, + { + "epoch": 1.01, + "grad_norm": 9.74872875213623, + "learning_rate": 1.4783765230822036e-05, + "loss": 0.3757, + "step": 5911 + }, + { + "epoch": 1.01, + "grad_norm": 9.287698745727539, + "learning_rate": 1.4781191007379441e-05, + "loss": 0.6864, + "step": 5912 + }, + { + "epoch": 1.01, + "grad_norm": 9.916751861572266, + "learning_rate": 1.4778616783936846e-05, + "loss": 0.4277, + "step": 5913 + }, + { + "epoch": 1.01, + "grad_norm": 10.870539665222168, + "learning_rate": 1.4776042560494251e-05, + "loss": 0.592, + "step": 5914 + }, + { + "epoch": 1.02, + "grad_norm": 17.008909225463867, + "learning_rate": 1.4773468337051658e-05, + "loss": 0.7179, + "step": 5915 + }, + { + "epoch": 1.02, + "grad_norm": 10.516189575195312, + "learning_rate": 1.4770894113609061e-05, + "loss": 0.5969, + "step": 5916 + }, + { + "epoch": 1.02, + "grad_norm": 9.788212776184082, + "learning_rate": 1.4768319890166466e-05, + "loss": 0.518, + "step": 5917 + }, + { + "epoch": 1.02, + "grad_norm": 10.915926933288574, + "learning_rate": 1.4765745666723871e-05, + "loss": 0.623, + "step": 5918 + }, + { + "epoch": 1.02, + "grad_norm": 12.850130081176758, + "learning_rate": 1.4763171443281276e-05, + "loss": 0.5302, + "step": 5919 + }, + { + "epoch": 1.02, + "grad_norm": 10.732810020446777, + "learning_rate": 1.4760597219838683e-05, + "loss": 0.7025, + "step": 5920 + }, + { + "epoch": 1.02, + "grad_norm": 12.842598915100098, + "learning_rate": 1.4758022996396088e-05, + "loss": 0.7036, + "step": 5921 + }, + { + "epoch": 1.02, + "grad_norm": 10.236542701721191, + "learning_rate": 1.4755448772953493e-05, + "loss": 0.4438, + "step": 5922 + }, + { + "epoch": 1.02, + "grad_norm": 10.911584854125977, + "learning_rate": 1.4752874549510898e-05, + "loss": 0.4967, + "step": 5923 + }, + { + "epoch": 1.02, + "grad_norm": 14.001578330993652, + "learning_rate": 1.4750300326068304e-05, + "loss": 0.5498, + "step": 5924 + }, + { + "epoch": 1.02, + "grad_norm": 10.843045234680176, + "learning_rate": 1.474772610262571e-05, + "loss": 0.6339, + "step": 5925 + }, + { + "epoch": 1.02, + "grad_norm": 12.488044738769531, + "learning_rate": 1.4745151879183113e-05, + "loss": 0.7011, + "step": 5926 + }, + { + "epoch": 1.02, + "grad_norm": 12.051386833190918, + "learning_rate": 1.4742577655740518e-05, + "loss": 0.6415, + "step": 5927 + }, + { + "epoch": 1.02, + "grad_norm": 11.3502779006958, + "learning_rate": 1.4740003432297923e-05, + "loss": 0.5564, + "step": 5928 + }, + { + "epoch": 1.02, + "grad_norm": 9.254146575927734, + "learning_rate": 1.473742920885533e-05, + "loss": 0.4641, + "step": 5929 + }, + { + "epoch": 1.02, + "grad_norm": 8.9171781539917, + "learning_rate": 1.4734854985412734e-05, + "loss": 0.6273, + "step": 5930 + }, + { + "epoch": 1.02, + "grad_norm": 10.963153839111328, + "learning_rate": 1.473228076197014e-05, + "loss": 0.5815, + "step": 5931 + }, + { + "epoch": 1.02, + "grad_norm": 11.150697708129883, + "learning_rate": 1.4729706538527544e-05, + "loss": 0.6865, + "step": 5932 + }, + { + "epoch": 1.02, + "grad_norm": 13.16169548034668, + "learning_rate": 1.472713231508495e-05, + "loss": 0.6854, + "step": 5933 + }, + { + "epoch": 1.02, + "grad_norm": 11.712112426757812, + "learning_rate": 1.4724558091642356e-05, + "loss": 0.5281, + "step": 5934 + }, + { + "epoch": 1.02, + "grad_norm": 9.701800346374512, + "learning_rate": 1.4721983868199761e-05, + "loss": 0.6936, + "step": 5935 + }, + { + "epoch": 1.02, + "grad_norm": 8.796243667602539, + "learning_rate": 1.4719409644757166e-05, + "loss": 0.3823, + "step": 5936 + }, + { + "epoch": 1.02, + "grad_norm": 10.319583892822266, + "learning_rate": 1.471683542131457e-05, + "loss": 0.4861, + "step": 5937 + }, + { + "epoch": 1.02, + "grad_norm": 11.393757820129395, + "learning_rate": 1.4714261197871974e-05, + "loss": 0.7571, + "step": 5938 + }, + { + "epoch": 1.02, + "grad_norm": 9.345955848693848, + "learning_rate": 1.4711686974429381e-05, + "loss": 0.497, + "step": 5939 + }, + { + "epoch": 1.02, + "grad_norm": 10.249866485595703, + "learning_rate": 1.4709112750986786e-05, + "loss": 0.5631, + "step": 5940 + }, + { + "epoch": 1.02, + "grad_norm": 8.252386093139648, + "learning_rate": 1.4706538527544191e-05, + "loss": 0.5427, + "step": 5941 + }, + { + "epoch": 1.02, + "grad_norm": 12.183730125427246, + "learning_rate": 1.4703964304101596e-05, + "loss": 0.7649, + "step": 5942 + }, + { + "epoch": 1.02, + "grad_norm": 10.999211311340332, + "learning_rate": 1.4701390080659003e-05, + "loss": 0.7697, + "step": 5943 + }, + { + "epoch": 1.02, + "grad_norm": 10.877355575561523, + "learning_rate": 1.4698815857216408e-05, + "loss": 0.5126, + "step": 5944 + }, + { + "epoch": 1.02, + "grad_norm": 11.60456657409668, + "learning_rate": 1.4696241633773813e-05, + "loss": 0.5625, + "step": 5945 + }, + { + "epoch": 1.02, + "grad_norm": 11.823182106018066, + "learning_rate": 1.4693667410331218e-05, + "loss": 0.707, + "step": 5946 + }, + { + "epoch": 1.02, + "grad_norm": 12.35696029663086, + "learning_rate": 1.4691093186888621e-05, + "loss": 0.6407, + "step": 5947 + }, + { + "epoch": 1.02, + "grad_norm": 10.381437301635742, + "learning_rate": 1.4688518963446028e-05, + "loss": 0.6908, + "step": 5948 + }, + { + "epoch": 1.02, + "grad_norm": 10.455245971679688, + "learning_rate": 1.4685944740003433e-05, + "loss": 0.5932, + "step": 5949 + }, + { + "epoch": 1.02, + "grad_norm": 8.504873275756836, + "learning_rate": 1.4683370516560838e-05, + "loss": 0.6447, + "step": 5950 + }, + { + "epoch": 1.02, + "grad_norm": 10.666354179382324, + "learning_rate": 1.4680796293118243e-05, + "loss": 0.6577, + "step": 5951 + }, + { + "epoch": 1.02, + "grad_norm": 9.356197357177734, + "learning_rate": 1.4678222069675648e-05, + "loss": 0.4181, + "step": 5952 + }, + { + "epoch": 1.02, + "grad_norm": 11.843424797058105, + "learning_rate": 1.4675647846233054e-05, + "loss": 0.6225, + "step": 5953 + }, + { + "epoch": 1.02, + "grad_norm": 9.660552978515625, + "learning_rate": 1.467307362279046e-05, + "loss": 0.5203, + "step": 5954 + }, + { + "epoch": 1.02, + "grad_norm": 11.64244556427002, + "learning_rate": 1.4670499399347864e-05, + "loss": 0.5729, + "step": 5955 + }, + { + "epoch": 1.02, + "grad_norm": 7.673271179199219, + "learning_rate": 1.466792517590527e-05, + "loss": 0.4832, + "step": 5956 + }, + { + "epoch": 1.02, + "grad_norm": 10.190770149230957, + "learning_rate": 1.4665350952462674e-05, + "loss": 0.5042, + "step": 5957 + }, + { + "epoch": 1.02, + "grad_norm": 8.698134422302246, + "learning_rate": 1.4662776729020079e-05, + "loss": 0.4508, + "step": 5958 + }, + { + "epoch": 1.02, + "grad_norm": 10.254560470581055, + "learning_rate": 1.4660202505577484e-05, + "loss": 0.6634, + "step": 5959 + }, + { + "epoch": 1.02, + "grad_norm": 9.10484790802002, + "learning_rate": 1.4657628282134889e-05, + "loss": 0.3792, + "step": 5960 + }, + { + "epoch": 1.02, + "grad_norm": 9.497947692871094, + "learning_rate": 1.4655054058692294e-05, + "loss": 0.511, + "step": 5961 + }, + { + "epoch": 1.02, + "grad_norm": 10.054791450500488, + "learning_rate": 1.46524798352497e-05, + "loss": 0.5077, + "step": 5962 + }, + { + "epoch": 1.02, + "grad_norm": 11.14282512664795, + "learning_rate": 1.4649905611807106e-05, + "loss": 0.4494, + "step": 5963 + }, + { + "epoch": 1.02, + "grad_norm": 11.792989730834961, + "learning_rate": 1.464733138836451e-05, + "loss": 0.9415, + "step": 5964 + }, + { + "epoch": 1.02, + "grad_norm": 8.341668128967285, + "learning_rate": 1.4644757164921916e-05, + "loss": 0.4478, + "step": 5965 + }, + { + "epoch": 1.02, + "grad_norm": 12.086487770080566, + "learning_rate": 1.464218294147932e-05, + "loss": 0.6163, + "step": 5966 + }, + { + "epoch": 1.02, + "grad_norm": 11.91039752960205, + "learning_rate": 1.4639608718036727e-05, + "loss": 0.637, + "step": 5967 + }, + { + "epoch": 1.02, + "grad_norm": 8.866921424865723, + "learning_rate": 1.463703449459413e-05, + "loss": 0.409, + "step": 5968 + }, + { + "epoch": 1.02, + "grad_norm": 11.187800407409668, + "learning_rate": 1.4634460271151536e-05, + "loss": 0.5432, + "step": 5969 + }, + { + "epoch": 1.02, + "grad_norm": 9.714569091796875, + "learning_rate": 1.463188604770894e-05, + "loss": 0.5639, + "step": 5970 + }, + { + "epoch": 1.02, + "grad_norm": 7.751258373260498, + "learning_rate": 1.4629311824266346e-05, + "loss": 0.3676, + "step": 5971 + }, + { + "epoch": 1.02, + "grad_norm": 10.975625038146973, + "learning_rate": 1.4626737600823752e-05, + "loss": 0.5023, + "step": 5972 + }, + { + "epoch": 1.03, + "grad_norm": 10.23196792602539, + "learning_rate": 1.4624163377381157e-05, + "loss": 0.5034, + "step": 5973 + }, + { + "epoch": 1.03, + "grad_norm": 11.886031150817871, + "learning_rate": 1.4621589153938562e-05, + "loss": 0.4551, + "step": 5974 + }, + { + "epoch": 1.03, + "grad_norm": 9.783336639404297, + "learning_rate": 1.4619014930495967e-05, + "loss": 0.6406, + "step": 5975 + }, + { + "epoch": 1.03, + "grad_norm": 10.055586814880371, + "learning_rate": 1.4616440707053374e-05, + "loss": 0.548, + "step": 5976 + }, + { + "epoch": 1.03, + "grad_norm": 9.34635066986084, + "learning_rate": 1.4613866483610779e-05, + "loss": 0.5204, + "step": 5977 + }, + { + "epoch": 1.03, + "grad_norm": 8.790834426879883, + "learning_rate": 1.4611292260168182e-05, + "loss": 0.4637, + "step": 5978 + }, + { + "epoch": 1.03, + "grad_norm": 10.629433631896973, + "learning_rate": 1.4608718036725587e-05, + "loss": 0.3886, + "step": 5979 + }, + { + "epoch": 1.03, + "grad_norm": 12.253458976745605, + "learning_rate": 1.4606143813282992e-05, + "loss": 0.5302, + "step": 5980 + }, + { + "epoch": 1.03, + "grad_norm": 9.117012977600098, + "learning_rate": 1.4603569589840399e-05, + "loss": 0.5157, + "step": 5981 + }, + { + "epoch": 1.03, + "grad_norm": 11.922342300415039, + "learning_rate": 1.4600995366397804e-05, + "loss": 0.5678, + "step": 5982 + }, + { + "epoch": 1.03, + "grad_norm": 9.276809692382812, + "learning_rate": 1.4598421142955209e-05, + "loss": 0.4387, + "step": 5983 + }, + { + "epoch": 1.03, + "grad_norm": 10.134427070617676, + "learning_rate": 1.4595846919512614e-05, + "loss": 0.5558, + "step": 5984 + }, + { + "epoch": 1.03, + "grad_norm": 9.616376876831055, + "learning_rate": 1.4593272696070019e-05, + "loss": 0.6387, + "step": 5985 + }, + { + "epoch": 1.03, + "grad_norm": 9.63627815246582, + "learning_rate": 1.4590698472627426e-05, + "loss": 0.6726, + "step": 5986 + }, + { + "epoch": 1.03, + "grad_norm": 15.850713729858398, + "learning_rate": 1.458812424918483e-05, + "loss": 0.6863, + "step": 5987 + }, + { + "epoch": 1.03, + "grad_norm": 9.737812995910645, + "learning_rate": 1.4585550025742236e-05, + "loss": 0.3511, + "step": 5988 + }, + { + "epoch": 1.03, + "grad_norm": 9.919577598571777, + "learning_rate": 1.4582975802299639e-05, + "loss": 0.4189, + "step": 5989 + }, + { + "epoch": 1.03, + "grad_norm": 12.109214782714844, + "learning_rate": 1.4580401578857044e-05, + "loss": 0.5978, + "step": 5990 + }, + { + "epoch": 1.03, + "grad_norm": 11.94290542602539, + "learning_rate": 1.457782735541445e-05, + "loss": 0.5975, + "step": 5991 + }, + { + "epoch": 1.03, + "grad_norm": 9.736477851867676, + "learning_rate": 1.4575253131971855e-05, + "loss": 0.5522, + "step": 5992 + }, + { + "epoch": 1.03, + "grad_norm": 13.462236404418945, + "learning_rate": 1.457267890852926e-05, + "loss": 0.7905, + "step": 5993 + }, + { + "epoch": 1.03, + "grad_norm": 10.172720909118652, + "learning_rate": 1.4570104685086665e-05, + "loss": 0.6194, + "step": 5994 + }, + { + "epoch": 1.03, + "grad_norm": 13.092374801635742, + "learning_rate": 1.4567530461644072e-05, + "loss": 0.5071, + "step": 5995 + }, + { + "epoch": 1.03, + "grad_norm": 9.301154136657715, + "learning_rate": 1.4564956238201477e-05, + "loss": 0.5913, + "step": 5996 + }, + { + "epoch": 1.03, + "grad_norm": 9.991229057312012, + "learning_rate": 1.4562382014758882e-05, + "loss": 0.4657, + "step": 5997 + }, + { + "epoch": 1.03, + "grad_norm": 11.99020004272461, + "learning_rate": 1.4559807791316287e-05, + "loss": 0.4323, + "step": 5998 + }, + { + "epoch": 1.03, + "grad_norm": 10.797056198120117, + "learning_rate": 1.455723356787369e-05, + "loss": 0.6752, + "step": 5999 + }, + { + "epoch": 1.03, + "grad_norm": 8.071431159973145, + "learning_rate": 1.4554659344431097e-05, + "loss": 0.3541, + "step": 6000 + }, + { + "epoch": 1.03, + "grad_norm": 12.672924995422363, + "learning_rate": 1.4552085120988502e-05, + "loss": 0.631, + "step": 6001 + }, + { + "epoch": 1.03, + "grad_norm": 9.877246856689453, + "learning_rate": 1.4549510897545907e-05, + "loss": 0.485, + "step": 6002 + }, + { + "epoch": 1.03, + "grad_norm": 12.03546142578125, + "learning_rate": 1.4546936674103312e-05, + "loss": 0.7234, + "step": 6003 + }, + { + "epoch": 1.03, + "grad_norm": 10.23678970336914, + "learning_rate": 1.4544362450660717e-05, + "loss": 0.5195, + "step": 6004 + }, + { + "epoch": 1.03, + "grad_norm": 8.419454574584961, + "learning_rate": 1.4541788227218124e-05, + "loss": 0.5899, + "step": 6005 + }, + { + "epoch": 1.03, + "grad_norm": 9.912901878356934, + "learning_rate": 1.4539214003775529e-05, + "loss": 0.5016, + "step": 6006 + }, + { + "epoch": 1.03, + "grad_norm": 9.926216125488281, + "learning_rate": 1.4536639780332934e-05, + "loss": 0.5323, + "step": 6007 + }, + { + "epoch": 1.03, + "grad_norm": 9.15507984161377, + "learning_rate": 1.4534065556890339e-05, + "loss": 0.5297, + "step": 6008 + }, + { + "epoch": 1.03, + "grad_norm": 9.702319145202637, + "learning_rate": 1.4531491333447744e-05, + "loss": 0.5739, + "step": 6009 + }, + { + "epoch": 1.03, + "grad_norm": 12.169788360595703, + "learning_rate": 1.4528917110005149e-05, + "loss": 0.675, + "step": 6010 + }, + { + "epoch": 1.03, + "grad_norm": 9.871467590332031, + "learning_rate": 1.4526342886562554e-05, + "loss": 0.4943, + "step": 6011 + }, + { + "epoch": 1.03, + "grad_norm": 8.006232261657715, + "learning_rate": 1.4523768663119959e-05, + "loss": 0.4178, + "step": 6012 + }, + { + "epoch": 1.03, + "grad_norm": 13.051650047302246, + "learning_rate": 1.4521194439677364e-05, + "loss": 0.7047, + "step": 6013 + }, + { + "epoch": 1.03, + "grad_norm": 13.217045783996582, + "learning_rate": 1.451862021623477e-05, + "loss": 0.8165, + "step": 6014 + }, + { + "epoch": 1.03, + "grad_norm": 13.112601280212402, + "learning_rate": 1.4516045992792175e-05, + "loss": 0.6061, + "step": 6015 + }, + { + "epoch": 1.03, + "grad_norm": 13.458157539367676, + "learning_rate": 1.451347176934958e-05, + "loss": 0.6302, + "step": 6016 + }, + { + "epoch": 1.03, + "grad_norm": 8.235051155090332, + "learning_rate": 1.4510897545906985e-05, + "loss": 0.4292, + "step": 6017 + }, + { + "epoch": 1.03, + "grad_norm": 9.034002304077148, + "learning_rate": 1.450832332246439e-05, + "loss": 0.4661, + "step": 6018 + }, + { + "epoch": 1.03, + "grad_norm": 12.086384773254395, + "learning_rate": 1.4505749099021797e-05, + "loss": 0.6491, + "step": 6019 + }, + { + "epoch": 1.03, + "grad_norm": 12.544063568115234, + "learning_rate": 1.45031748755792e-05, + "loss": 0.6637, + "step": 6020 + }, + { + "epoch": 1.03, + "grad_norm": 8.971297264099121, + "learning_rate": 1.4500600652136605e-05, + "loss": 0.4539, + "step": 6021 + }, + { + "epoch": 1.03, + "grad_norm": 13.303975105285645, + "learning_rate": 1.449802642869401e-05, + "loss": 0.6362, + "step": 6022 + }, + { + "epoch": 1.03, + "grad_norm": 10.662263870239258, + "learning_rate": 1.4495452205251415e-05, + "loss": 0.8009, + "step": 6023 + }, + { + "epoch": 1.03, + "grad_norm": 13.112136840820312, + "learning_rate": 1.4492877981808822e-05, + "loss": 0.738, + "step": 6024 + }, + { + "epoch": 1.03, + "grad_norm": 14.446643829345703, + "learning_rate": 1.4490303758366227e-05, + "loss": 0.7352, + "step": 6025 + }, + { + "epoch": 1.03, + "grad_norm": 10.696676254272461, + "learning_rate": 1.4487729534923632e-05, + "loss": 0.5278, + "step": 6026 + }, + { + "epoch": 1.03, + "grad_norm": 16.39923095703125, + "learning_rate": 1.4485155311481037e-05, + "loss": 0.5024, + "step": 6027 + }, + { + "epoch": 1.03, + "grad_norm": 9.176897048950195, + "learning_rate": 1.4482581088038442e-05, + "loss": 0.573, + "step": 6028 + }, + { + "epoch": 1.03, + "grad_norm": 9.741162300109863, + "learning_rate": 1.4480006864595848e-05, + "loss": 0.6059, + "step": 6029 + }, + { + "epoch": 1.03, + "grad_norm": 10.605361938476562, + "learning_rate": 1.4477432641153252e-05, + "loss": 0.4713, + "step": 6030 + }, + { + "epoch": 1.04, + "grad_norm": 7.7548956871032715, + "learning_rate": 1.4474858417710657e-05, + "loss": 0.3838, + "step": 6031 + }, + { + "epoch": 1.04, + "grad_norm": 9.093345642089844, + "learning_rate": 1.4472284194268062e-05, + "loss": 0.3958, + "step": 6032 + }, + { + "epoch": 1.04, + "grad_norm": 10.763676643371582, + "learning_rate": 1.4469709970825468e-05, + "loss": 0.4929, + "step": 6033 + }, + { + "epoch": 1.04, + "grad_norm": 11.30533504486084, + "learning_rate": 1.4467135747382873e-05, + "loss": 0.5723, + "step": 6034 + }, + { + "epoch": 1.04, + "grad_norm": 8.449240684509277, + "learning_rate": 1.4464561523940278e-05, + "loss": 0.5791, + "step": 6035 + }, + { + "epoch": 1.04, + "grad_norm": 10.579524040222168, + "learning_rate": 1.4461987300497683e-05, + "loss": 0.4807, + "step": 6036 + }, + { + "epoch": 1.04, + "grad_norm": 9.388040542602539, + "learning_rate": 1.4459413077055088e-05, + "loss": 0.4591, + "step": 6037 + }, + { + "epoch": 1.04, + "grad_norm": 8.479193687438965, + "learning_rate": 1.4456838853612495e-05, + "loss": 0.5965, + "step": 6038 + }, + { + "epoch": 1.04, + "grad_norm": 9.420098304748535, + "learning_rate": 1.44542646301699e-05, + "loss": 0.6916, + "step": 6039 + }, + { + "epoch": 1.04, + "grad_norm": 11.806793212890625, + "learning_rate": 1.4451690406727305e-05, + "loss": 0.854, + "step": 6040 + }, + { + "epoch": 1.04, + "grad_norm": 9.206016540527344, + "learning_rate": 1.4449116183284708e-05, + "loss": 0.4748, + "step": 6041 + }, + { + "epoch": 1.04, + "grad_norm": 8.494976043701172, + "learning_rate": 1.4446541959842113e-05, + "loss": 0.4301, + "step": 6042 + }, + { + "epoch": 1.04, + "grad_norm": 11.136125564575195, + "learning_rate": 1.444396773639952e-05, + "loss": 0.9049, + "step": 6043 + }, + { + "epoch": 1.04, + "grad_norm": 9.781466484069824, + "learning_rate": 1.4441393512956925e-05, + "loss": 0.6523, + "step": 6044 + }, + { + "epoch": 1.04, + "grad_norm": 8.593643188476562, + "learning_rate": 1.443881928951433e-05, + "loss": 0.4749, + "step": 6045 + }, + { + "epoch": 1.04, + "grad_norm": 12.517271995544434, + "learning_rate": 1.4436245066071735e-05, + "loss": 0.7917, + "step": 6046 + }, + { + "epoch": 1.04, + "grad_norm": 9.094876289367676, + "learning_rate": 1.4433670842629142e-05, + "loss": 0.4401, + "step": 6047 + }, + { + "epoch": 1.04, + "grad_norm": 10.035802841186523, + "learning_rate": 1.4431096619186547e-05, + "loss": 0.6043, + "step": 6048 + }, + { + "epoch": 1.04, + "grad_norm": 9.315948486328125, + "learning_rate": 1.4428522395743952e-05, + "loss": 0.6069, + "step": 6049 + }, + { + "epoch": 1.04, + "grad_norm": 11.189122200012207, + "learning_rate": 1.4425948172301357e-05, + "loss": 0.6013, + "step": 6050 + }, + { + "epoch": 1.04, + "grad_norm": 13.863465309143066, + "learning_rate": 1.442337394885876e-05, + "loss": 0.7592, + "step": 6051 + }, + { + "epoch": 1.04, + "grad_norm": 8.783334732055664, + "learning_rate": 1.4420799725416167e-05, + "loss": 0.512, + "step": 6052 + }, + { + "epoch": 1.04, + "grad_norm": 10.720061302185059, + "learning_rate": 1.4418225501973572e-05, + "loss": 0.6549, + "step": 6053 + }, + { + "epoch": 1.04, + "grad_norm": 9.834259033203125, + "learning_rate": 1.4415651278530977e-05, + "loss": 0.5887, + "step": 6054 + }, + { + "epoch": 1.04, + "grad_norm": 12.825617790222168, + "learning_rate": 1.4413077055088382e-05, + "loss": 0.5317, + "step": 6055 + }, + { + "epoch": 1.04, + "grad_norm": 10.527396202087402, + "learning_rate": 1.4410502831645787e-05, + "loss": 0.5497, + "step": 6056 + }, + { + "epoch": 1.04, + "grad_norm": 8.533768653869629, + "learning_rate": 1.4407928608203193e-05, + "loss": 0.3781, + "step": 6057 + }, + { + "epoch": 1.04, + "grad_norm": 9.703971862792969, + "learning_rate": 1.4405354384760598e-05, + "loss": 0.5077, + "step": 6058 + }, + { + "epoch": 1.04, + "grad_norm": 13.134206771850586, + "learning_rate": 1.4402780161318003e-05, + "loss": 0.608, + "step": 6059 + }, + { + "epoch": 1.04, + "grad_norm": 11.357513427734375, + "learning_rate": 1.4400205937875408e-05, + "loss": 0.6031, + "step": 6060 + }, + { + "epoch": 1.04, + "grad_norm": 11.609781265258789, + "learning_rate": 1.4397631714432813e-05, + "loss": 0.3727, + "step": 6061 + }, + { + "epoch": 1.04, + "grad_norm": 10.689179420471191, + "learning_rate": 1.4395057490990218e-05, + "loss": 0.5953, + "step": 6062 + }, + { + "epoch": 1.04, + "grad_norm": 12.445324897766113, + "learning_rate": 1.4392483267547623e-05, + "loss": 0.5379, + "step": 6063 + }, + { + "epoch": 1.04, + "grad_norm": 9.921080589294434, + "learning_rate": 1.4389909044105028e-05, + "loss": 0.5976, + "step": 6064 + }, + { + "epoch": 1.04, + "grad_norm": 11.612894058227539, + "learning_rate": 1.4387334820662433e-05, + "loss": 0.5477, + "step": 6065 + }, + { + "epoch": 1.04, + "grad_norm": 12.109848976135254, + "learning_rate": 1.438476059721984e-05, + "loss": 0.5091, + "step": 6066 + }, + { + "epoch": 1.04, + "grad_norm": 10.760492324829102, + "learning_rate": 1.4382186373777245e-05, + "loss": 0.5526, + "step": 6067 + }, + { + "epoch": 1.04, + "grad_norm": 7.640450954437256, + "learning_rate": 1.437961215033465e-05, + "loss": 0.3258, + "step": 6068 + }, + { + "epoch": 1.04, + "grad_norm": 11.686336517333984, + "learning_rate": 1.4377037926892055e-05, + "loss": 0.7677, + "step": 6069 + }, + { + "epoch": 1.04, + "grad_norm": 9.44484806060791, + "learning_rate": 1.437446370344946e-05, + "loss": 0.4966, + "step": 6070 + }, + { + "epoch": 1.04, + "grad_norm": 12.284594535827637, + "learning_rate": 1.4371889480006866e-05, + "loss": 0.501, + "step": 6071 + }, + { + "epoch": 1.04, + "grad_norm": 12.47825813293457, + "learning_rate": 1.436931525656427e-05, + "loss": 0.6583, + "step": 6072 + }, + { + "epoch": 1.04, + "grad_norm": 10.232020378112793, + "learning_rate": 1.4366741033121675e-05, + "loss": 0.5751, + "step": 6073 + }, + { + "epoch": 1.04, + "grad_norm": 11.311654090881348, + "learning_rate": 1.436416680967908e-05, + "loss": 0.6769, + "step": 6074 + }, + { + "epoch": 1.04, + "grad_norm": 9.717187881469727, + "learning_rate": 1.4361592586236485e-05, + "loss": 0.5025, + "step": 6075 + }, + { + "epoch": 1.04, + "grad_norm": 10.880813598632812, + "learning_rate": 1.4359018362793891e-05, + "loss": 0.6143, + "step": 6076 + }, + { + "epoch": 1.04, + "grad_norm": 13.355393409729004, + "learning_rate": 1.4356444139351296e-05, + "loss": 0.6914, + "step": 6077 + }, + { + "epoch": 1.04, + "grad_norm": 10.87582778930664, + "learning_rate": 1.4353869915908701e-05, + "loss": 0.5311, + "step": 6078 + }, + { + "epoch": 1.04, + "grad_norm": 10.29086971282959, + "learning_rate": 1.4351295692466106e-05, + "loss": 0.5469, + "step": 6079 + }, + { + "epoch": 1.04, + "grad_norm": 9.28413200378418, + "learning_rate": 1.4348721469023511e-05, + "loss": 0.4143, + "step": 6080 + }, + { + "epoch": 1.04, + "grad_norm": 11.597415924072266, + "learning_rate": 1.4346147245580918e-05, + "loss": 0.6202, + "step": 6081 + }, + { + "epoch": 1.04, + "grad_norm": 9.337462425231934, + "learning_rate": 1.4343573022138321e-05, + "loss": 0.4907, + "step": 6082 + }, + { + "epoch": 1.04, + "grad_norm": 9.425405502319336, + "learning_rate": 1.4340998798695726e-05, + "loss": 0.343, + "step": 6083 + }, + { + "epoch": 1.04, + "grad_norm": 12.436531066894531, + "learning_rate": 1.4338424575253131e-05, + "loss": 0.6264, + "step": 6084 + }, + { + "epoch": 1.04, + "grad_norm": 13.24310302734375, + "learning_rate": 1.4335850351810538e-05, + "loss": 0.6385, + "step": 6085 + }, + { + "epoch": 1.04, + "grad_norm": 9.942517280578613, + "learning_rate": 1.4333276128367943e-05, + "loss": 0.473, + "step": 6086 + }, + { + "epoch": 1.04, + "grad_norm": 10.65208911895752, + "learning_rate": 1.4330701904925348e-05, + "loss": 0.5932, + "step": 6087 + }, + { + "epoch": 1.04, + "grad_norm": 12.827574729919434, + "learning_rate": 1.4328127681482753e-05, + "loss": 0.4508, + "step": 6088 + }, + { + "epoch": 1.04, + "grad_norm": 13.024276733398438, + "learning_rate": 1.4325553458040158e-05, + "loss": 0.7384, + "step": 6089 + }, + { + "epoch": 1.05, + "grad_norm": 14.000886917114258, + "learning_rate": 1.4322979234597565e-05, + "loss": 0.543, + "step": 6090 + }, + { + "epoch": 1.05, + "grad_norm": 9.903984069824219, + "learning_rate": 1.432040501115497e-05, + "loss": 0.703, + "step": 6091 + }, + { + "epoch": 1.05, + "grad_norm": 9.271925926208496, + "learning_rate": 1.4317830787712375e-05, + "loss": 0.5366, + "step": 6092 + }, + { + "epoch": 1.05, + "grad_norm": 11.144278526306152, + "learning_rate": 1.4315256564269778e-05, + "loss": 0.5025, + "step": 6093 + }, + { + "epoch": 1.05, + "grad_norm": 9.94912338256836, + "learning_rate": 1.4312682340827183e-05, + "loss": 0.5062, + "step": 6094 + }, + { + "epoch": 1.05, + "grad_norm": 8.116493225097656, + "learning_rate": 1.431010811738459e-05, + "loss": 0.4661, + "step": 6095 + }, + { + "epoch": 1.05, + "grad_norm": 10.658554077148438, + "learning_rate": 1.4307533893941994e-05, + "loss": 0.5392, + "step": 6096 + }, + { + "epoch": 1.05, + "grad_norm": 12.043598175048828, + "learning_rate": 1.43049596704994e-05, + "loss": 0.6206, + "step": 6097 + }, + { + "epoch": 1.05, + "grad_norm": 12.47064208984375, + "learning_rate": 1.4302385447056804e-05, + "loss": 0.7444, + "step": 6098 + }, + { + "epoch": 1.05, + "grad_norm": 10.273091316223145, + "learning_rate": 1.4299811223614211e-05, + "loss": 0.5602, + "step": 6099 + }, + { + "epoch": 1.05, + "grad_norm": 10.650313377380371, + "learning_rate": 1.4297237000171616e-05, + "loss": 0.6733, + "step": 6100 + }, + { + "epoch": 1.05, + "grad_norm": 7.835964679718018, + "learning_rate": 1.4294662776729021e-05, + "loss": 0.4004, + "step": 6101 + }, + { + "epoch": 1.05, + "grad_norm": 9.33560562133789, + "learning_rate": 1.4292088553286426e-05, + "loss": 0.6252, + "step": 6102 + }, + { + "epoch": 1.05, + "grad_norm": 9.087122917175293, + "learning_rate": 1.428951432984383e-05, + "loss": 0.4333, + "step": 6103 + }, + { + "epoch": 1.05, + "grad_norm": 11.257527351379395, + "learning_rate": 1.4286940106401236e-05, + "loss": 0.4685, + "step": 6104 + }, + { + "epoch": 1.05, + "grad_norm": 10.58927059173584, + "learning_rate": 1.4284365882958641e-05, + "loss": 0.4967, + "step": 6105 + }, + { + "epoch": 1.05, + "grad_norm": 13.252796173095703, + "learning_rate": 1.4281791659516046e-05, + "loss": 0.5997, + "step": 6106 + }, + { + "epoch": 1.05, + "grad_norm": 10.413355827331543, + "learning_rate": 1.4279217436073451e-05, + "loss": 0.3748, + "step": 6107 + }, + { + "epoch": 1.05, + "grad_norm": 10.290119171142578, + "learning_rate": 1.4276643212630856e-05, + "loss": 0.6509, + "step": 6108 + }, + { + "epoch": 1.05, + "grad_norm": 8.814628601074219, + "learning_rate": 1.4274068989188263e-05, + "loss": 0.4193, + "step": 6109 + }, + { + "epoch": 1.05, + "grad_norm": 13.923113822937012, + "learning_rate": 1.4271494765745668e-05, + "loss": 0.6374, + "step": 6110 + }, + { + "epoch": 1.05, + "grad_norm": 9.146993637084961, + "learning_rate": 1.4268920542303073e-05, + "loss": 0.3913, + "step": 6111 + }, + { + "epoch": 1.05, + "grad_norm": 11.459632873535156, + "learning_rate": 1.4266346318860478e-05, + "loss": 0.7129, + "step": 6112 + }, + { + "epoch": 1.05, + "grad_norm": 11.424100875854492, + "learning_rate": 1.4263772095417883e-05, + "loss": 0.4261, + "step": 6113 + }, + { + "epoch": 1.05, + "grad_norm": 11.101913452148438, + "learning_rate": 1.4261197871975288e-05, + "loss": 0.6028, + "step": 6114 + }, + { + "epoch": 1.05, + "grad_norm": 9.185338973999023, + "learning_rate": 1.4258623648532693e-05, + "loss": 0.5468, + "step": 6115 + }, + { + "epoch": 1.05, + "grad_norm": 10.860051155090332, + "learning_rate": 1.4256049425090098e-05, + "loss": 0.5122, + "step": 6116 + }, + { + "epoch": 1.05, + "grad_norm": 9.243034362792969, + "learning_rate": 1.4253475201647503e-05, + "loss": 0.5197, + "step": 6117 + }, + { + "epoch": 1.05, + "grad_norm": 9.012742042541504, + "learning_rate": 1.425090097820491e-05, + "loss": 0.5302, + "step": 6118 + }, + { + "epoch": 1.05, + "grad_norm": 8.1295804977417, + "learning_rate": 1.4248326754762314e-05, + "loss": 0.4654, + "step": 6119 + }, + { + "epoch": 1.05, + "grad_norm": 7.8541388511657715, + "learning_rate": 1.424575253131972e-05, + "loss": 0.3855, + "step": 6120 + }, + { + "epoch": 1.05, + "grad_norm": 10.729656219482422, + "learning_rate": 1.4243178307877124e-05, + "loss": 0.5408, + "step": 6121 + }, + { + "epoch": 1.05, + "grad_norm": 8.455890655517578, + "learning_rate": 1.424060408443453e-05, + "loss": 0.5054, + "step": 6122 + }, + { + "epoch": 1.05, + "grad_norm": 12.174029350280762, + "learning_rate": 1.4238029860991936e-05, + "loss": 0.5078, + "step": 6123 + }, + { + "epoch": 1.05, + "grad_norm": 12.40572738647461, + "learning_rate": 1.423545563754934e-05, + "loss": 0.7425, + "step": 6124 + }, + { + "epoch": 1.05, + "grad_norm": 10.948651313781738, + "learning_rate": 1.4232881414106744e-05, + "loss": 0.7963, + "step": 6125 + }, + { + "epoch": 1.05, + "grad_norm": 9.797237396240234, + "learning_rate": 1.423030719066415e-05, + "loss": 0.5415, + "step": 6126 + }, + { + "epoch": 1.05, + "grad_norm": 10.037702560424805, + "learning_rate": 1.4227732967221554e-05, + "loss": 0.4369, + "step": 6127 + }, + { + "epoch": 1.05, + "grad_norm": 8.486723899841309, + "learning_rate": 1.4225158743778961e-05, + "loss": 0.5017, + "step": 6128 + }, + { + "epoch": 1.05, + "grad_norm": 10.453106880187988, + "learning_rate": 1.4222584520336366e-05, + "loss": 0.5797, + "step": 6129 + }, + { + "epoch": 1.05, + "grad_norm": 9.221585273742676, + "learning_rate": 1.422001029689377e-05, + "loss": 0.4769, + "step": 6130 + }, + { + "epoch": 1.05, + "grad_norm": 12.47728443145752, + "learning_rate": 1.4217436073451176e-05, + "loss": 0.6353, + "step": 6131 + }, + { + "epoch": 1.05, + "grad_norm": 11.76475715637207, + "learning_rate": 1.421486185000858e-05, + "loss": 0.5109, + "step": 6132 + }, + { + "epoch": 1.05, + "grad_norm": 15.694698333740234, + "learning_rate": 1.4212287626565987e-05, + "loss": 0.6959, + "step": 6133 + }, + { + "epoch": 1.05, + "grad_norm": 10.754035949707031, + "learning_rate": 1.420971340312339e-05, + "loss": 0.855, + "step": 6134 + }, + { + "epoch": 1.05, + "grad_norm": 6.095348358154297, + "learning_rate": 1.4207139179680796e-05, + "loss": 0.1835, + "step": 6135 + }, + { + "epoch": 1.05, + "grad_norm": 11.529303550720215, + "learning_rate": 1.42045649562382e-05, + "loss": 0.4942, + "step": 6136 + }, + { + "epoch": 1.05, + "grad_norm": 10.631980895996094, + "learning_rate": 1.4201990732795607e-05, + "loss": 0.7518, + "step": 6137 + }, + { + "epoch": 1.05, + "grad_norm": 12.752338409423828, + "learning_rate": 1.4199416509353012e-05, + "loss": 0.7107, + "step": 6138 + }, + { + "epoch": 1.05, + "grad_norm": 11.647884368896484, + "learning_rate": 1.4196842285910417e-05, + "loss": 0.6206, + "step": 6139 + }, + { + "epoch": 1.05, + "grad_norm": 10.637974739074707, + "learning_rate": 1.4194268062467822e-05, + "loss": 0.5882, + "step": 6140 + }, + { + "epoch": 1.05, + "grad_norm": 12.008574485778809, + "learning_rate": 1.4191693839025227e-05, + "loss": 0.6725, + "step": 6141 + }, + { + "epoch": 1.05, + "grad_norm": 9.038464546203613, + "learning_rate": 1.4189119615582634e-05, + "loss": 0.5212, + "step": 6142 + }, + { + "epoch": 1.05, + "grad_norm": 10.971981048583984, + "learning_rate": 1.4186545392140039e-05, + "loss": 0.6147, + "step": 6143 + }, + { + "epoch": 1.05, + "grad_norm": 12.331040382385254, + "learning_rate": 1.4183971168697444e-05, + "loss": 0.5533, + "step": 6144 + }, + { + "epoch": 1.05, + "grad_norm": 8.561538696289062, + "learning_rate": 1.4181396945254847e-05, + "loss": 0.4664, + "step": 6145 + }, + { + "epoch": 1.05, + "grad_norm": 10.650005340576172, + "learning_rate": 1.4178822721812252e-05, + "loss": 0.6292, + "step": 6146 + }, + { + "epoch": 1.05, + "grad_norm": 9.890765190124512, + "learning_rate": 1.4176248498369659e-05, + "loss": 0.5113, + "step": 6147 + }, + { + "epoch": 1.06, + "grad_norm": 10.604499816894531, + "learning_rate": 1.4173674274927064e-05, + "loss": 0.5598, + "step": 6148 + }, + { + "epoch": 1.06, + "grad_norm": 10.501017570495605, + "learning_rate": 1.4171100051484469e-05, + "loss": 0.7128, + "step": 6149 + }, + { + "epoch": 1.06, + "grad_norm": 13.226709365844727, + "learning_rate": 1.4168525828041874e-05, + "loss": 0.6862, + "step": 6150 + }, + { + "epoch": 1.06, + "grad_norm": 10.615044593811035, + "learning_rate": 1.416595160459928e-05, + "loss": 0.4897, + "step": 6151 + }, + { + "epoch": 1.06, + "grad_norm": 10.787710189819336, + "learning_rate": 1.4163377381156686e-05, + "loss": 0.5188, + "step": 6152 + }, + { + "epoch": 1.06, + "grad_norm": 9.707430839538574, + "learning_rate": 1.416080315771409e-05, + "loss": 0.6976, + "step": 6153 + }, + { + "epoch": 1.06, + "grad_norm": 10.319376945495605, + "learning_rate": 1.4158228934271496e-05, + "loss": 0.7427, + "step": 6154 + }, + { + "epoch": 1.06, + "grad_norm": 10.069814682006836, + "learning_rate": 1.4155654710828899e-05, + "loss": 0.6196, + "step": 6155 + }, + { + "epoch": 1.06, + "grad_norm": 10.175179481506348, + "learning_rate": 1.4153080487386306e-05, + "loss": 0.5862, + "step": 6156 + }, + { + "epoch": 1.06, + "grad_norm": 13.734851837158203, + "learning_rate": 1.415050626394371e-05, + "loss": 0.8946, + "step": 6157 + }, + { + "epoch": 1.06, + "grad_norm": 6.9085516929626465, + "learning_rate": 1.4147932040501116e-05, + "loss": 0.2874, + "step": 6158 + }, + { + "epoch": 1.06, + "grad_norm": 12.46718692779541, + "learning_rate": 1.414535781705852e-05, + "loss": 0.6351, + "step": 6159 + }, + { + "epoch": 1.06, + "grad_norm": 10.667304992675781, + "learning_rate": 1.4142783593615926e-05, + "loss": 0.6053, + "step": 6160 + }, + { + "epoch": 1.06, + "grad_norm": 8.37648868560791, + "learning_rate": 1.4140209370173332e-05, + "loss": 0.6371, + "step": 6161 + }, + { + "epoch": 1.06, + "grad_norm": 14.953534126281738, + "learning_rate": 1.4137635146730737e-05, + "loss": 0.5669, + "step": 6162 + }, + { + "epoch": 1.06, + "grad_norm": 11.759281158447266, + "learning_rate": 1.4135060923288142e-05, + "loss": 0.5766, + "step": 6163 + }, + { + "epoch": 1.06, + "grad_norm": 11.923698425292969, + "learning_rate": 1.4132486699845547e-05, + "loss": 0.6697, + "step": 6164 + }, + { + "epoch": 1.06, + "grad_norm": 7.941294193267822, + "learning_rate": 1.4129912476402952e-05, + "loss": 0.6458, + "step": 6165 + }, + { + "epoch": 1.06, + "grad_norm": 9.728187561035156, + "learning_rate": 1.4127338252960357e-05, + "loss": 0.4643, + "step": 6166 + }, + { + "epoch": 1.06, + "grad_norm": 12.354240417480469, + "learning_rate": 1.4124764029517762e-05, + "loss": 0.5838, + "step": 6167 + }, + { + "epoch": 1.06, + "grad_norm": 11.264355659484863, + "learning_rate": 1.4122189806075167e-05, + "loss": 0.8504, + "step": 6168 + }, + { + "epoch": 1.06, + "grad_norm": 7.871974468231201, + "learning_rate": 1.4119615582632572e-05, + "loss": 0.4049, + "step": 6169 + }, + { + "epoch": 1.06, + "grad_norm": 12.0407075881958, + "learning_rate": 1.4117041359189979e-05, + "loss": 0.6596, + "step": 6170 + }, + { + "epoch": 1.06, + "grad_norm": 8.660916328430176, + "learning_rate": 1.4114467135747384e-05, + "loss": 0.458, + "step": 6171 + }, + { + "epoch": 1.06, + "grad_norm": 11.8273344039917, + "learning_rate": 1.4111892912304789e-05, + "loss": 0.3837, + "step": 6172 + }, + { + "epoch": 1.06, + "grad_norm": 8.746609687805176, + "learning_rate": 1.4109318688862194e-05, + "loss": 0.5208, + "step": 6173 + }, + { + "epoch": 1.06, + "grad_norm": 10.71379280090332, + "learning_rate": 1.4106744465419599e-05, + "loss": 0.6932, + "step": 6174 + }, + { + "epoch": 1.06, + "grad_norm": 11.956168174743652, + "learning_rate": 1.4104170241977005e-05, + "loss": 0.8068, + "step": 6175 + }, + { + "epoch": 1.06, + "grad_norm": 16.844505310058594, + "learning_rate": 1.4101596018534409e-05, + "loss": 0.7495, + "step": 6176 + }, + { + "epoch": 1.06, + "grad_norm": 7.569572925567627, + "learning_rate": 1.4099021795091814e-05, + "loss": 0.414, + "step": 6177 + }, + { + "epoch": 1.06, + "grad_norm": 12.275727272033691, + "learning_rate": 1.4096447571649219e-05, + "loss": 0.5681, + "step": 6178 + }, + { + "epoch": 1.06, + "grad_norm": 10.412195205688477, + "learning_rate": 1.4093873348206624e-05, + "loss": 0.5247, + "step": 6179 + }, + { + "epoch": 1.06, + "grad_norm": 9.642062187194824, + "learning_rate": 1.409129912476403e-05, + "loss": 0.5671, + "step": 6180 + }, + { + "epoch": 1.06, + "grad_norm": 11.485599517822266, + "learning_rate": 1.4088724901321435e-05, + "loss": 0.5321, + "step": 6181 + }, + { + "epoch": 1.06, + "grad_norm": 9.10834789276123, + "learning_rate": 1.408615067787884e-05, + "loss": 0.6384, + "step": 6182 + }, + { + "epoch": 1.06, + "grad_norm": 10.994007110595703, + "learning_rate": 1.4083576454436245e-05, + "loss": 0.7008, + "step": 6183 + }, + { + "epoch": 1.06, + "grad_norm": 15.241264343261719, + "learning_rate": 1.408100223099365e-05, + "loss": 0.552, + "step": 6184 + }, + { + "epoch": 1.06, + "grad_norm": 14.289605140686035, + "learning_rate": 1.4078428007551057e-05, + "loss": 0.7744, + "step": 6185 + }, + { + "epoch": 1.06, + "grad_norm": 11.377355575561523, + "learning_rate": 1.407585378410846e-05, + "loss": 0.5369, + "step": 6186 + }, + { + "epoch": 1.06, + "grad_norm": 7.560219764709473, + "learning_rate": 1.4073279560665865e-05, + "loss": 0.5053, + "step": 6187 + }, + { + "epoch": 1.06, + "grad_norm": 10.4827241897583, + "learning_rate": 1.407070533722327e-05, + "loss": 0.5675, + "step": 6188 + }, + { + "epoch": 1.06, + "grad_norm": 9.145089149475098, + "learning_rate": 1.4068131113780677e-05, + "loss": 0.4695, + "step": 6189 + }, + { + "epoch": 1.06, + "grad_norm": 7.306835174560547, + "learning_rate": 1.4065556890338082e-05, + "loss": 0.4757, + "step": 6190 + }, + { + "epoch": 1.06, + "grad_norm": 11.71056842803955, + "learning_rate": 1.4062982666895487e-05, + "loss": 0.6799, + "step": 6191 + }, + { + "epoch": 1.06, + "grad_norm": 10.993293762207031, + "learning_rate": 1.4060408443452892e-05, + "loss": 0.6636, + "step": 6192 + }, + { + "epoch": 1.06, + "grad_norm": 11.135930061340332, + "learning_rate": 1.4057834220010297e-05, + "loss": 0.4951, + "step": 6193 + }, + { + "epoch": 1.06, + "grad_norm": 9.61836051940918, + "learning_rate": 1.4055259996567704e-05, + "loss": 0.6253, + "step": 6194 + }, + { + "epoch": 1.06, + "grad_norm": 9.176191329956055, + "learning_rate": 1.4052685773125109e-05, + "loss": 0.6785, + "step": 6195 + }, + { + "epoch": 1.06, + "grad_norm": 10.15185546875, + "learning_rate": 1.4050111549682514e-05, + "loss": 0.6408, + "step": 6196 + }, + { + "epoch": 1.06, + "grad_norm": 11.607077598571777, + "learning_rate": 1.4047537326239917e-05, + "loss": 0.6836, + "step": 6197 + }, + { + "epoch": 1.06, + "grad_norm": 10.048965454101562, + "learning_rate": 1.4044963102797322e-05, + "loss": 0.5954, + "step": 6198 + }, + { + "epoch": 1.06, + "grad_norm": 9.19587230682373, + "learning_rate": 1.4042388879354729e-05, + "loss": 0.4546, + "step": 6199 + }, + { + "epoch": 1.06, + "grad_norm": 9.464704513549805, + "learning_rate": 1.4039814655912134e-05, + "loss": 0.441, + "step": 6200 + }, + { + "epoch": 1.06, + "grad_norm": 10.671178817749023, + "learning_rate": 1.4037240432469538e-05, + "loss": 0.4576, + "step": 6201 + }, + { + "epoch": 1.06, + "grad_norm": 8.676521301269531, + "learning_rate": 1.4034666209026943e-05, + "loss": 0.4821, + "step": 6202 + }, + { + "epoch": 1.06, + "grad_norm": 10.109301567077637, + "learning_rate": 1.4032091985584348e-05, + "loss": 0.5541, + "step": 6203 + }, + { + "epoch": 1.06, + "grad_norm": 9.648665428161621, + "learning_rate": 1.4029517762141755e-05, + "loss": 0.4771, + "step": 6204 + }, + { + "epoch": 1.06, + "grad_norm": 12.39564323425293, + "learning_rate": 1.402694353869916e-05, + "loss": 0.8375, + "step": 6205 + }, + { + "epoch": 1.07, + "grad_norm": 11.830465316772461, + "learning_rate": 1.4024369315256565e-05, + "loss": 0.444, + "step": 6206 + }, + { + "epoch": 1.07, + "grad_norm": 11.146523475646973, + "learning_rate": 1.4021795091813968e-05, + "loss": 0.709, + "step": 6207 + }, + { + "epoch": 1.07, + "grad_norm": 15.60805606842041, + "learning_rate": 1.4019220868371375e-05, + "loss": 0.6636, + "step": 6208 + }, + { + "epoch": 1.07, + "grad_norm": 9.50390911102295, + "learning_rate": 1.401664664492878e-05, + "loss": 0.3645, + "step": 6209 + }, + { + "epoch": 1.07, + "grad_norm": 10.364044189453125, + "learning_rate": 1.4014072421486185e-05, + "loss": 0.6259, + "step": 6210 + }, + { + "epoch": 1.07, + "grad_norm": 12.997735977172852, + "learning_rate": 1.401149819804359e-05, + "loss": 0.8093, + "step": 6211 + }, + { + "epoch": 1.07, + "grad_norm": 13.80439567565918, + "learning_rate": 1.4008923974600995e-05, + "loss": 0.6535, + "step": 6212 + }, + { + "epoch": 1.07, + "grad_norm": 13.367583274841309, + "learning_rate": 1.4006349751158402e-05, + "loss": 0.4333, + "step": 6213 + }, + { + "epoch": 1.07, + "grad_norm": 13.364331245422363, + "learning_rate": 1.4003775527715807e-05, + "loss": 0.6987, + "step": 6214 + }, + { + "epoch": 1.07, + "grad_norm": 14.478676795959473, + "learning_rate": 1.4001201304273212e-05, + "loss": 0.4178, + "step": 6215 + }, + { + "epoch": 1.07, + "grad_norm": 10.883241653442383, + "learning_rate": 1.3998627080830617e-05, + "loss": 0.8292, + "step": 6216 + }, + { + "epoch": 1.07, + "grad_norm": 7.789076328277588, + "learning_rate": 1.3996052857388022e-05, + "loss": 0.459, + "step": 6217 + }, + { + "epoch": 1.07, + "grad_norm": 11.890912055969238, + "learning_rate": 1.3993478633945427e-05, + "loss": 0.5262, + "step": 6218 + }, + { + "epoch": 1.07, + "grad_norm": 14.356431007385254, + "learning_rate": 1.3990904410502832e-05, + "loss": 0.6706, + "step": 6219 + }, + { + "epoch": 1.07, + "grad_norm": 12.556934356689453, + "learning_rate": 1.3988330187060237e-05, + "loss": 0.7136, + "step": 6220 + }, + { + "epoch": 1.07, + "grad_norm": 10.441965103149414, + "learning_rate": 1.3985755963617642e-05, + "loss": 0.5053, + "step": 6221 + }, + { + "epoch": 1.07, + "grad_norm": 9.535511016845703, + "learning_rate": 1.3983181740175048e-05, + "loss": 0.5035, + "step": 6222 + }, + { + "epoch": 1.07, + "grad_norm": 10.299717903137207, + "learning_rate": 1.3980607516732453e-05, + "loss": 0.6058, + "step": 6223 + }, + { + "epoch": 1.07, + "grad_norm": 11.213362693786621, + "learning_rate": 1.3978033293289858e-05, + "loss": 0.5974, + "step": 6224 + }, + { + "epoch": 1.07, + "grad_norm": 10.583738327026367, + "learning_rate": 1.3975459069847263e-05, + "loss": 0.5529, + "step": 6225 + }, + { + "epoch": 1.07, + "grad_norm": 9.918071746826172, + "learning_rate": 1.3972884846404668e-05, + "loss": 0.7726, + "step": 6226 + }, + { + "epoch": 1.07, + "grad_norm": 10.898406028747559, + "learning_rate": 1.3970310622962075e-05, + "loss": 0.4654, + "step": 6227 + }, + { + "epoch": 1.07, + "grad_norm": 8.304642677307129, + "learning_rate": 1.3967736399519478e-05, + "loss": 0.4431, + "step": 6228 + }, + { + "epoch": 1.07, + "grad_norm": 10.091839790344238, + "learning_rate": 1.3965162176076883e-05, + "loss": 0.5877, + "step": 6229 + }, + { + "epoch": 1.07, + "grad_norm": 9.9913911819458, + "learning_rate": 1.3962587952634288e-05, + "loss": 0.4119, + "step": 6230 + }, + { + "epoch": 1.07, + "grad_norm": 9.420130729675293, + "learning_rate": 1.3960013729191693e-05, + "loss": 0.6257, + "step": 6231 + }, + { + "epoch": 1.07, + "grad_norm": 9.855796813964844, + "learning_rate": 1.39574395057491e-05, + "loss": 0.515, + "step": 6232 + }, + { + "epoch": 1.07, + "grad_norm": 9.3886079788208, + "learning_rate": 1.3954865282306505e-05, + "loss": 0.4054, + "step": 6233 + }, + { + "epoch": 1.07, + "grad_norm": 10.868733406066895, + "learning_rate": 1.395229105886391e-05, + "loss": 0.5337, + "step": 6234 + }, + { + "epoch": 1.07, + "grad_norm": 10.671782493591309, + "learning_rate": 1.3949716835421315e-05, + "loss": 0.422, + "step": 6235 + }, + { + "epoch": 1.07, + "grad_norm": 9.012228965759277, + "learning_rate": 1.394714261197872e-05, + "loss": 0.4726, + "step": 6236 + }, + { + "epoch": 1.07, + "grad_norm": 9.76025676727295, + "learning_rate": 1.3944568388536127e-05, + "loss": 0.3327, + "step": 6237 + }, + { + "epoch": 1.07, + "grad_norm": 12.357230186462402, + "learning_rate": 1.394199416509353e-05, + "loss": 0.4904, + "step": 6238 + }, + { + "epoch": 1.07, + "grad_norm": 16.895170211791992, + "learning_rate": 1.3939419941650935e-05, + "loss": 0.6192, + "step": 6239 + }, + { + "epoch": 1.07, + "grad_norm": 13.209478378295898, + "learning_rate": 1.393684571820834e-05, + "loss": 0.5817, + "step": 6240 + }, + { + "epoch": 1.07, + "grad_norm": 13.764213562011719, + "learning_rate": 1.3934271494765746e-05, + "loss": 0.6675, + "step": 6241 + }, + { + "epoch": 1.07, + "grad_norm": 10.004423141479492, + "learning_rate": 1.3931697271323151e-05, + "loss": 0.517, + "step": 6242 + }, + { + "epoch": 1.07, + "grad_norm": 11.49935245513916, + "learning_rate": 1.3929123047880556e-05, + "loss": 0.6067, + "step": 6243 + }, + { + "epoch": 1.07, + "grad_norm": 8.47079086303711, + "learning_rate": 1.3926548824437961e-05, + "loss": 0.4759, + "step": 6244 + }, + { + "epoch": 1.07, + "grad_norm": 11.959939002990723, + "learning_rate": 1.3923974600995366e-05, + "loss": 0.5148, + "step": 6245 + }, + { + "epoch": 1.07, + "grad_norm": 11.511268615722656, + "learning_rate": 1.3921400377552773e-05, + "loss": 0.7078, + "step": 6246 + }, + { + "epoch": 1.07, + "grad_norm": 11.832539558410645, + "learning_rate": 1.3918826154110178e-05, + "loss": 0.421, + "step": 6247 + }, + { + "epoch": 1.07, + "grad_norm": 9.258830070495605, + "learning_rate": 1.3916251930667583e-05, + "loss": 0.6067, + "step": 6248 + }, + { + "epoch": 1.07, + "grad_norm": 9.234213829040527, + "learning_rate": 1.3913677707224986e-05, + "loss": 0.5588, + "step": 6249 + }, + { + "epoch": 1.07, + "grad_norm": 10.037192344665527, + "learning_rate": 1.3911103483782391e-05, + "loss": 0.5106, + "step": 6250 + }, + { + "epoch": 1.07, + "grad_norm": 9.15066909790039, + "learning_rate": 1.3908529260339798e-05, + "loss": 0.561, + "step": 6251 + }, + { + "epoch": 1.07, + "grad_norm": 10.844749450683594, + "learning_rate": 1.3905955036897203e-05, + "loss": 0.7623, + "step": 6252 + }, + { + "epoch": 1.07, + "grad_norm": 9.8617582321167, + "learning_rate": 1.3903380813454608e-05, + "loss": 0.7299, + "step": 6253 + }, + { + "epoch": 1.07, + "grad_norm": 11.919418334960938, + "learning_rate": 1.3900806590012013e-05, + "loss": 0.8348, + "step": 6254 + }, + { + "epoch": 1.07, + "grad_norm": 9.06890869140625, + "learning_rate": 1.3898232366569418e-05, + "loss": 0.5835, + "step": 6255 + }, + { + "epoch": 1.07, + "grad_norm": 10.210168838500977, + "learning_rate": 1.3895658143126825e-05, + "loss": 0.6658, + "step": 6256 + }, + { + "epoch": 1.07, + "grad_norm": 8.673344612121582, + "learning_rate": 1.389308391968423e-05, + "loss": 0.5187, + "step": 6257 + }, + { + "epoch": 1.07, + "grad_norm": 10.394471168518066, + "learning_rate": 1.3890509696241635e-05, + "loss": 0.6847, + "step": 6258 + }, + { + "epoch": 1.07, + "grad_norm": 9.45179557800293, + "learning_rate": 1.3887935472799038e-05, + "loss": 0.565, + "step": 6259 + }, + { + "epoch": 1.07, + "grad_norm": 9.54520034790039, + "learning_rate": 1.3885361249356445e-05, + "loss": 0.4987, + "step": 6260 + }, + { + "epoch": 1.07, + "grad_norm": 10.653621673583984, + "learning_rate": 1.388278702591385e-05, + "loss": 0.5881, + "step": 6261 + }, + { + "epoch": 1.07, + "grad_norm": 9.62857437133789, + "learning_rate": 1.3880212802471255e-05, + "loss": 0.479, + "step": 6262 + }, + { + "epoch": 1.07, + "grad_norm": 11.425786018371582, + "learning_rate": 1.387763857902866e-05, + "loss": 0.5928, + "step": 6263 + }, + { + "epoch": 1.07, + "grad_norm": 8.90549087524414, + "learning_rate": 1.3875064355586065e-05, + "loss": 0.474, + "step": 6264 + }, + { + "epoch": 1.08, + "grad_norm": 9.314797401428223, + "learning_rate": 1.3872490132143471e-05, + "loss": 0.5046, + "step": 6265 + }, + { + "epoch": 1.08, + "grad_norm": 7.694711208343506, + "learning_rate": 1.3869915908700876e-05, + "loss": 0.5418, + "step": 6266 + }, + { + "epoch": 1.08, + "grad_norm": 10.304850578308105, + "learning_rate": 1.3867341685258281e-05, + "loss": 0.5628, + "step": 6267 + }, + { + "epoch": 1.08, + "grad_norm": 13.094827651977539, + "learning_rate": 1.3864767461815686e-05, + "loss": 0.5786, + "step": 6268 + }, + { + "epoch": 1.08, + "grad_norm": 11.304694175720215, + "learning_rate": 1.3862193238373091e-05, + "loss": 0.6707, + "step": 6269 + }, + { + "epoch": 1.08, + "grad_norm": 7.792288780212402, + "learning_rate": 1.3859619014930496e-05, + "loss": 0.5119, + "step": 6270 + }, + { + "epoch": 1.08, + "grad_norm": 10.740971565246582, + "learning_rate": 1.3857044791487901e-05, + "loss": 0.5572, + "step": 6271 + }, + { + "epoch": 1.08, + "grad_norm": 13.444781303405762, + "learning_rate": 1.3854470568045306e-05, + "loss": 0.6394, + "step": 6272 + }, + { + "epoch": 1.08, + "grad_norm": 10.442985534667969, + "learning_rate": 1.3851896344602711e-05, + "loss": 0.4735, + "step": 6273 + }, + { + "epoch": 1.08, + "grad_norm": 8.584697723388672, + "learning_rate": 1.3849322121160118e-05, + "loss": 0.5799, + "step": 6274 + }, + { + "epoch": 1.08, + "grad_norm": 9.112217903137207, + "learning_rate": 1.3846747897717523e-05, + "loss": 0.4647, + "step": 6275 + }, + { + "epoch": 1.08, + "grad_norm": 12.636457443237305, + "learning_rate": 1.3844173674274928e-05, + "loss": 0.4893, + "step": 6276 + }, + { + "epoch": 1.08, + "grad_norm": 9.850175857543945, + "learning_rate": 1.3841599450832333e-05, + "loss": 0.3756, + "step": 6277 + }, + { + "epoch": 1.08, + "grad_norm": 13.449297904968262, + "learning_rate": 1.3839025227389738e-05, + "loss": 0.4627, + "step": 6278 + }, + { + "epoch": 1.08, + "grad_norm": 12.737735748291016, + "learning_rate": 1.3836451003947144e-05, + "loss": 0.7851, + "step": 6279 + }, + { + "epoch": 1.08, + "grad_norm": 11.187867164611816, + "learning_rate": 1.3833876780504548e-05, + "loss": 0.4972, + "step": 6280 + }, + { + "epoch": 1.08, + "grad_norm": 8.601746559143066, + "learning_rate": 1.3831302557061953e-05, + "loss": 0.5256, + "step": 6281 + }, + { + "epoch": 1.08, + "grad_norm": 10.227254867553711, + "learning_rate": 1.3828728333619358e-05, + "loss": 0.4618, + "step": 6282 + }, + { + "epoch": 1.08, + "grad_norm": 7.60275411605835, + "learning_rate": 1.3826154110176763e-05, + "loss": 0.4363, + "step": 6283 + }, + { + "epoch": 1.08, + "grad_norm": 8.972223281860352, + "learning_rate": 1.382357988673417e-05, + "loss": 0.4141, + "step": 6284 + }, + { + "epoch": 1.08, + "grad_norm": 7.946643829345703, + "learning_rate": 1.3821005663291574e-05, + "loss": 0.5362, + "step": 6285 + }, + { + "epoch": 1.08, + "grad_norm": 8.773284912109375, + "learning_rate": 1.381843143984898e-05, + "loss": 0.5777, + "step": 6286 + }, + { + "epoch": 1.08, + "grad_norm": 11.536645889282227, + "learning_rate": 1.3815857216406384e-05, + "loss": 0.541, + "step": 6287 + }, + { + "epoch": 1.08, + "grad_norm": 9.107416152954102, + "learning_rate": 1.381328299296379e-05, + "loss": 0.5172, + "step": 6288 + }, + { + "epoch": 1.08, + "grad_norm": 9.692770004272461, + "learning_rate": 1.3810708769521196e-05, + "loss": 0.6203, + "step": 6289 + }, + { + "epoch": 1.08, + "grad_norm": 12.433701515197754, + "learning_rate": 1.3808134546078601e-05, + "loss": 0.5653, + "step": 6290 + }, + { + "epoch": 1.08, + "grad_norm": 13.546643257141113, + "learning_rate": 1.3805560322636004e-05, + "loss": 0.5105, + "step": 6291 + }, + { + "epoch": 1.08, + "grad_norm": 10.81982135772705, + "learning_rate": 1.380298609919341e-05, + "loss": 0.6135, + "step": 6292 + }, + { + "epoch": 1.08, + "grad_norm": 10.829192161560059, + "learning_rate": 1.3800411875750816e-05, + "loss": 0.4661, + "step": 6293 + }, + { + "epoch": 1.08, + "grad_norm": 10.260122299194336, + "learning_rate": 1.3797837652308221e-05, + "loss": 0.6475, + "step": 6294 + }, + { + "epoch": 1.08, + "grad_norm": 8.663241386413574, + "learning_rate": 1.3795263428865626e-05, + "loss": 0.4079, + "step": 6295 + }, + { + "epoch": 1.08, + "grad_norm": 11.937297821044922, + "learning_rate": 1.3792689205423031e-05, + "loss": 0.6343, + "step": 6296 + }, + { + "epoch": 1.08, + "grad_norm": 9.142282485961914, + "learning_rate": 1.3790114981980436e-05, + "loss": 0.4985, + "step": 6297 + }, + { + "epoch": 1.08, + "grad_norm": 7.680667400360107, + "learning_rate": 1.3787540758537843e-05, + "loss": 0.2924, + "step": 6298 + }, + { + "epoch": 1.08, + "grad_norm": 10.194073677062988, + "learning_rate": 1.3784966535095248e-05, + "loss": 0.4541, + "step": 6299 + }, + { + "epoch": 1.08, + "grad_norm": 10.01657485961914, + "learning_rate": 1.3782392311652653e-05, + "loss": 0.6727, + "step": 6300 + }, + { + "epoch": 1.08, + "grad_norm": 12.428760528564453, + "learning_rate": 1.3779818088210056e-05, + "loss": 0.6225, + "step": 6301 + }, + { + "epoch": 1.08, + "grad_norm": 9.794927597045898, + "learning_rate": 1.377724386476746e-05, + "loss": 0.585, + "step": 6302 + }, + { + "epoch": 1.08, + "grad_norm": 14.727364540100098, + "learning_rate": 1.3774669641324868e-05, + "loss": 0.5741, + "step": 6303 + }, + { + "epoch": 1.08, + "grad_norm": 12.955038070678711, + "learning_rate": 1.3772095417882273e-05, + "loss": 0.5454, + "step": 6304 + }, + { + "epoch": 1.08, + "grad_norm": 9.521261215209961, + "learning_rate": 1.3769521194439678e-05, + "loss": 0.5624, + "step": 6305 + }, + { + "epoch": 1.08, + "grad_norm": 11.289969444274902, + "learning_rate": 1.3766946970997082e-05, + "loss": 0.7043, + "step": 6306 + }, + { + "epoch": 1.08, + "grad_norm": 10.969691276550293, + "learning_rate": 1.3764372747554487e-05, + "loss": 0.5271, + "step": 6307 + }, + { + "epoch": 1.08, + "grad_norm": 8.186813354492188, + "learning_rate": 1.3761798524111894e-05, + "loss": 0.4161, + "step": 6308 + }, + { + "epoch": 1.08, + "grad_norm": 10.47191047668457, + "learning_rate": 1.3759224300669299e-05, + "loss": 0.6287, + "step": 6309 + }, + { + "epoch": 1.08, + "grad_norm": 9.426264762878418, + "learning_rate": 1.3756650077226704e-05, + "loss": 0.5705, + "step": 6310 + }, + { + "epoch": 1.08, + "grad_norm": 13.465685844421387, + "learning_rate": 1.3754075853784107e-05, + "loss": 0.6473, + "step": 6311 + }, + { + "epoch": 1.08, + "grad_norm": 12.629545211791992, + "learning_rate": 1.3751501630341514e-05, + "loss": 0.5713, + "step": 6312 + }, + { + "epoch": 1.08, + "grad_norm": 8.682764053344727, + "learning_rate": 1.3748927406898919e-05, + "loss": 0.6149, + "step": 6313 + }, + { + "epoch": 1.08, + "grad_norm": 13.982463836669922, + "learning_rate": 1.3746353183456324e-05, + "loss": 0.7517, + "step": 6314 + }, + { + "epoch": 1.08, + "grad_norm": 12.972879409790039, + "learning_rate": 1.3743778960013729e-05, + "loss": 0.7649, + "step": 6315 + }, + { + "epoch": 1.08, + "grad_norm": 8.674260139465332, + "learning_rate": 1.3741204736571134e-05, + "loss": 0.412, + "step": 6316 + }, + { + "epoch": 1.08, + "grad_norm": 8.33468246459961, + "learning_rate": 1.373863051312854e-05, + "loss": 0.4099, + "step": 6317 + }, + { + "epoch": 1.08, + "grad_norm": 13.371768951416016, + "learning_rate": 1.3736056289685946e-05, + "loss": 0.9034, + "step": 6318 + }, + { + "epoch": 1.08, + "grad_norm": 9.01196575164795, + "learning_rate": 1.373348206624335e-05, + "loss": 0.5516, + "step": 6319 + }, + { + "epoch": 1.08, + "grad_norm": 6.978898525238037, + "learning_rate": 1.3730907842800756e-05, + "loss": 0.4562, + "step": 6320 + }, + { + "epoch": 1.08, + "grad_norm": 6.977396488189697, + "learning_rate": 1.372833361935816e-05, + "loss": 0.5025, + "step": 6321 + }, + { + "epoch": 1.08, + "grad_norm": 11.403948783874512, + "learning_rate": 1.3725759395915566e-05, + "loss": 0.5199, + "step": 6322 + }, + { + "epoch": 1.09, + "grad_norm": 11.480254173278809, + "learning_rate": 1.372318517247297e-05, + "loss": 0.6569, + "step": 6323 + }, + { + "epoch": 1.09, + "grad_norm": 7.804694652557373, + "learning_rate": 1.3720610949030376e-05, + "loss": 0.4094, + "step": 6324 + }, + { + "epoch": 1.09, + "grad_norm": 7.974738597869873, + "learning_rate": 1.371803672558778e-05, + "loss": 0.577, + "step": 6325 + }, + { + "epoch": 1.09, + "grad_norm": 12.148778915405273, + "learning_rate": 1.3715462502145187e-05, + "loss": 0.6078, + "step": 6326 + }, + { + "epoch": 1.09, + "grad_norm": 10.429791450500488, + "learning_rate": 1.3712888278702592e-05, + "loss": 0.423, + "step": 6327 + }, + { + "epoch": 1.09, + "grad_norm": 11.341646194458008, + "learning_rate": 1.3710314055259997e-05, + "loss": 0.7242, + "step": 6328 + }, + { + "epoch": 1.09, + "grad_norm": 9.757133483886719, + "learning_rate": 1.3707739831817402e-05, + "loss": 0.3231, + "step": 6329 + }, + { + "epoch": 1.09, + "grad_norm": 16.56751823425293, + "learning_rate": 1.3705165608374807e-05, + "loss": 0.716, + "step": 6330 + }, + { + "epoch": 1.09, + "grad_norm": 9.949222564697266, + "learning_rate": 1.3702591384932214e-05, + "loss": 0.5034, + "step": 6331 + }, + { + "epoch": 1.09, + "grad_norm": 11.675853729248047, + "learning_rate": 1.3700017161489617e-05, + "loss": 0.5055, + "step": 6332 + }, + { + "epoch": 1.09, + "grad_norm": 11.437301635742188, + "learning_rate": 1.3697442938047022e-05, + "loss": 0.6229, + "step": 6333 + }, + { + "epoch": 1.09, + "grad_norm": 8.74101448059082, + "learning_rate": 1.3694868714604427e-05, + "loss": 0.367, + "step": 6334 + }, + { + "epoch": 1.09, + "grad_norm": 11.52475643157959, + "learning_rate": 1.3692294491161832e-05, + "loss": 0.4028, + "step": 6335 + }, + { + "epoch": 1.09, + "grad_norm": 10.165909767150879, + "learning_rate": 1.3689720267719239e-05, + "loss": 0.4518, + "step": 6336 + }, + { + "epoch": 1.09, + "grad_norm": 11.376535415649414, + "learning_rate": 1.3687146044276644e-05, + "loss": 0.7996, + "step": 6337 + }, + { + "epoch": 1.09, + "grad_norm": 11.022297859191895, + "learning_rate": 1.3684571820834049e-05, + "loss": 0.5165, + "step": 6338 + }, + { + "epoch": 1.09, + "grad_norm": 10.504289627075195, + "learning_rate": 1.3681997597391454e-05, + "loss": 0.6616, + "step": 6339 + }, + { + "epoch": 1.09, + "grad_norm": 10.874673843383789, + "learning_rate": 1.3679423373948859e-05, + "loss": 0.5026, + "step": 6340 + }, + { + "epoch": 1.09, + "grad_norm": 9.333306312561035, + "learning_rate": 1.3676849150506266e-05, + "loss": 0.4877, + "step": 6341 + }, + { + "epoch": 1.09, + "grad_norm": 9.488816261291504, + "learning_rate": 1.367427492706367e-05, + "loss": 0.44, + "step": 6342 + }, + { + "epoch": 1.09, + "grad_norm": 10.746158599853516, + "learning_rate": 1.3671700703621074e-05, + "loss": 0.5407, + "step": 6343 + }, + { + "epoch": 1.09, + "grad_norm": 9.122489929199219, + "learning_rate": 1.3669126480178479e-05, + "loss": 0.5318, + "step": 6344 + }, + { + "epoch": 1.09, + "grad_norm": 8.632287979125977, + "learning_rate": 1.3666552256735885e-05, + "loss": 0.4774, + "step": 6345 + }, + { + "epoch": 1.09, + "grad_norm": 9.978248596191406, + "learning_rate": 1.366397803329329e-05, + "loss": 0.5511, + "step": 6346 + }, + { + "epoch": 1.09, + "grad_norm": 10.998766899108887, + "learning_rate": 1.3661403809850695e-05, + "loss": 0.4407, + "step": 6347 + }, + { + "epoch": 1.09, + "grad_norm": 11.10408878326416, + "learning_rate": 1.36588295864081e-05, + "loss": 0.4636, + "step": 6348 + }, + { + "epoch": 1.09, + "grad_norm": 8.009697914123535, + "learning_rate": 1.3656255362965505e-05, + "loss": 0.3376, + "step": 6349 + }, + { + "epoch": 1.09, + "grad_norm": 8.989333152770996, + "learning_rate": 1.3653681139522912e-05, + "loss": 0.4242, + "step": 6350 + }, + { + "epoch": 1.09, + "grad_norm": 9.495859146118164, + "learning_rate": 1.3651106916080317e-05, + "loss": 0.4429, + "step": 6351 + }, + { + "epoch": 1.09, + "grad_norm": 11.282435417175293, + "learning_rate": 1.3648532692637722e-05, + "loss": 0.4659, + "step": 6352 + }, + { + "epoch": 1.09, + "grad_norm": 11.34176254272461, + "learning_rate": 1.3645958469195125e-05, + "loss": 0.4175, + "step": 6353 + }, + { + "epoch": 1.09, + "grad_norm": 11.509920120239258, + "learning_rate": 1.364338424575253e-05, + "loss": 0.5349, + "step": 6354 + }, + { + "epoch": 1.09, + "grad_norm": 8.715743064880371, + "learning_rate": 1.3640810022309937e-05, + "loss": 0.499, + "step": 6355 + }, + { + "epoch": 1.09, + "grad_norm": 11.126643180847168, + "learning_rate": 1.3638235798867342e-05, + "loss": 0.6813, + "step": 6356 + }, + { + "epoch": 1.09, + "grad_norm": 8.739013671875, + "learning_rate": 1.3635661575424747e-05, + "loss": 0.5275, + "step": 6357 + }, + { + "epoch": 1.09, + "grad_norm": 8.351797103881836, + "learning_rate": 1.3633087351982152e-05, + "loss": 0.4292, + "step": 6358 + }, + { + "epoch": 1.09, + "grad_norm": 12.987312316894531, + "learning_rate": 1.3630513128539557e-05, + "loss": 0.6248, + "step": 6359 + }, + { + "epoch": 1.09, + "grad_norm": 8.425827026367188, + "learning_rate": 1.3627938905096964e-05, + "loss": 0.4033, + "step": 6360 + }, + { + "epoch": 1.09, + "grad_norm": 12.433403015136719, + "learning_rate": 1.3625364681654369e-05, + "loss": 0.547, + "step": 6361 + }, + { + "epoch": 1.09, + "grad_norm": 11.584619522094727, + "learning_rate": 1.3622790458211774e-05, + "loss": 0.6028, + "step": 6362 + }, + { + "epoch": 1.09, + "grad_norm": 10.98801040649414, + "learning_rate": 1.3620216234769177e-05, + "loss": 0.6658, + "step": 6363 + }, + { + "epoch": 1.09, + "grad_norm": 11.8985595703125, + "learning_rate": 1.3617642011326584e-05, + "loss": 0.4547, + "step": 6364 + }, + { + "epoch": 1.09, + "grad_norm": 14.857511520385742, + "learning_rate": 1.3615067787883989e-05, + "loss": 0.518, + "step": 6365 + }, + { + "epoch": 1.09, + "grad_norm": 8.525308609008789, + "learning_rate": 1.3612493564441394e-05, + "loss": 0.4712, + "step": 6366 + }, + { + "epoch": 1.09, + "grad_norm": 13.426017761230469, + "learning_rate": 1.3609919340998799e-05, + "loss": 0.6713, + "step": 6367 + }, + { + "epoch": 1.09, + "grad_norm": 11.638419151306152, + "learning_rate": 1.3607345117556204e-05, + "loss": 0.5222, + "step": 6368 + }, + { + "epoch": 1.09, + "grad_norm": 10.876932144165039, + "learning_rate": 1.360477089411361e-05, + "loss": 0.5994, + "step": 6369 + }, + { + "epoch": 1.09, + "grad_norm": 12.331613540649414, + "learning_rate": 1.3602196670671015e-05, + "loss": 0.83, + "step": 6370 + }, + { + "epoch": 1.09, + "grad_norm": 12.791149139404297, + "learning_rate": 1.359962244722842e-05, + "loss": 0.5903, + "step": 6371 + }, + { + "epoch": 1.09, + "grad_norm": 13.320500373840332, + "learning_rate": 1.3597048223785825e-05, + "loss": 0.7425, + "step": 6372 + }, + { + "epoch": 1.09, + "grad_norm": 12.020678520202637, + "learning_rate": 1.359447400034323e-05, + "loss": 0.6261, + "step": 6373 + }, + { + "epoch": 1.09, + "grad_norm": 7.463134288787842, + "learning_rate": 1.3591899776900635e-05, + "loss": 0.3939, + "step": 6374 + }, + { + "epoch": 1.09, + "grad_norm": 9.661871910095215, + "learning_rate": 1.358932555345804e-05, + "loss": 0.4429, + "step": 6375 + }, + { + "epoch": 1.09, + "grad_norm": 14.510074615478516, + "learning_rate": 1.3586751330015445e-05, + "loss": 0.5624, + "step": 6376 + }, + { + "epoch": 1.09, + "grad_norm": 12.635680198669434, + "learning_rate": 1.358417710657285e-05, + "loss": 0.5875, + "step": 6377 + }, + { + "epoch": 1.09, + "grad_norm": 10.059264183044434, + "learning_rate": 1.3581602883130255e-05, + "loss": 0.4513, + "step": 6378 + }, + { + "epoch": 1.09, + "grad_norm": 10.257065773010254, + "learning_rate": 1.3579028659687662e-05, + "loss": 0.4845, + "step": 6379 + }, + { + "epoch": 1.09, + "grad_norm": 8.997725486755371, + "learning_rate": 1.3576454436245067e-05, + "loss": 0.5778, + "step": 6380 + }, + { + "epoch": 1.1, + "grad_norm": 10.622173309326172, + "learning_rate": 1.3573880212802472e-05, + "loss": 0.641, + "step": 6381 + }, + { + "epoch": 1.1, + "grad_norm": 10.398406982421875, + "learning_rate": 1.3571305989359877e-05, + "loss": 0.4863, + "step": 6382 + }, + { + "epoch": 1.1, + "grad_norm": 11.99822998046875, + "learning_rate": 1.3568731765917283e-05, + "loss": 0.91, + "step": 6383 + }, + { + "epoch": 1.1, + "grad_norm": 8.757206916809082, + "learning_rate": 1.3566157542474687e-05, + "loss": 0.5074, + "step": 6384 + }, + { + "epoch": 1.1, + "grad_norm": 8.954026222229004, + "learning_rate": 1.3563583319032092e-05, + "loss": 0.4112, + "step": 6385 + }, + { + "epoch": 1.1, + "grad_norm": 9.116982460021973, + "learning_rate": 1.3561009095589497e-05, + "loss": 0.3211, + "step": 6386 + }, + { + "epoch": 1.1, + "grad_norm": 13.540571212768555, + "learning_rate": 1.3558434872146902e-05, + "loss": 0.5083, + "step": 6387 + }, + { + "epoch": 1.1, + "grad_norm": 12.422941207885742, + "learning_rate": 1.3555860648704308e-05, + "loss": 0.6311, + "step": 6388 + }, + { + "epoch": 1.1, + "grad_norm": 10.01567554473877, + "learning_rate": 1.3553286425261713e-05, + "loss": 0.6047, + "step": 6389 + }, + { + "epoch": 1.1, + "grad_norm": 12.086312294006348, + "learning_rate": 1.3550712201819118e-05, + "loss": 0.5753, + "step": 6390 + }, + { + "epoch": 1.1, + "grad_norm": 10.17856216430664, + "learning_rate": 1.3548137978376523e-05, + "loss": 0.696, + "step": 6391 + }, + { + "epoch": 1.1, + "grad_norm": 10.220876693725586, + "learning_rate": 1.3545563754933928e-05, + "loss": 0.4285, + "step": 6392 + }, + { + "epoch": 1.1, + "grad_norm": 9.963923454284668, + "learning_rate": 1.3542989531491335e-05, + "loss": 0.5472, + "step": 6393 + }, + { + "epoch": 1.1, + "grad_norm": 10.718011856079102, + "learning_rate": 1.354041530804874e-05, + "loss": 0.5005, + "step": 6394 + }, + { + "epoch": 1.1, + "grad_norm": 8.599753379821777, + "learning_rate": 1.3537841084606143e-05, + "loss": 0.551, + "step": 6395 + }, + { + "epoch": 1.1, + "grad_norm": 11.1625337600708, + "learning_rate": 1.3535266861163548e-05, + "loss": 0.5666, + "step": 6396 + }, + { + "epoch": 1.1, + "grad_norm": 9.343599319458008, + "learning_rate": 1.3532692637720955e-05, + "loss": 0.4813, + "step": 6397 + }, + { + "epoch": 1.1, + "grad_norm": 8.8057279586792, + "learning_rate": 1.353011841427836e-05, + "loss": 0.5189, + "step": 6398 + }, + { + "epoch": 1.1, + "grad_norm": 10.72898006439209, + "learning_rate": 1.3527544190835765e-05, + "loss": 0.7332, + "step": 6399 + }, + { + "epoch": 1.1, + "grad_norm": 8.966325759887695, + "learning_rate": 1.352496996739317e-05, + "loss": 0.3484, + "step": 6400 + }, + { + "epoch": 1.1, + "grad_norm": 11.623270988464355, + "learning_rate": 1.3522395743950575e-05, + "loss": 0.4777, + "step": 6401 + }, + { + "epoch": 1.1, + "grad_norm": 13.14462947845459, + "learning_rate": 1.3519821520507982e-05, + "loss": 0.5484, + "step": 6402 + }, + { + "epoch": 1.1, + "grad_norm": 10.376391410827637, + "learning_rate": 1.3517247297065387e-05, + "loss": 0.7525, + "step": 6403 + }, + { + "epoch": 1.1, + "grad_norm": 10.11588191986084, + "learning_rate": 1.3514673073622792e-05, + "loss": 0.6512, + "step": 6404 + }, + { + "epoch": 1.1, + "grad_norm": 9.11837387084961, + "learning_rate": 1.3512098850180195e-05, + "loss": 0.4214, + "step": 6405 + }, + { + "epoch": 1.1, + "grad_norm": 10.572527885437012, + "learning_rate": 1.35095246267376e-05, + "loss": 0.4418, + "step": 6406 + }, + { + "epoch": 1.1, + "grad_norm": 8.243064880371094, + "learning_rate": 1.3506950403295007e-05, + "loss": 0.3604, + "step": 6407 + }, + { + "epoch": 1.1, + "grad_norm": 7.808772563934326, + "learning_rate": 1.3504376179852412e-05, + "loss": 0.4495, + "step": 6408 + }, + { + "epoch": 1.1, + "grad_norm": 11.752381324768066, + "learning_rate": 1.3501801956409817e-05, + "loss": 0.6866, + "step": 6409 + }, + { + "epoch": 1.1, + "grad_norm": 10.337677955627441, + "learning_rate": 1.3499227732967221e-05, + "loss": 0.5827, + "step": 6410 + }, + { + "epoch": 1.1, + "grad_norm": 7.918295860290527, + "learning_rate": 1.3496653509524626e-05, + "loss": 0.427, + "step": 6411 + }, + { + "epoch": 1.1, + "grad_norm": 9.244314193725586, + "learning_rate": 1.3494079286082033e-05, + "loss": 0.5604, + "step": 6412 + }, + { + "epoch": 1.1, + "grad_norm": 10.718318939208984, + "learning_rate": 1.3491505062639438e-05, + "loss": 0.5631, + "step": 6413 + }, + { + "epoch": 1.1, + "grad_norm": 13.136080741882324, + "learning_rate": 1.3488930839196843e-05, + "loss": 0.7477, + "step": 6414 + }, + { + "epoch": 1.1, + "grad_norm": 10.44372844696045, + "learning_rate": 1.3486356615754246e-05, + "loss": 0.6166, + "step": 6415 + }, + { + "epoch": 1.1, + "grad_norm": 11.982657432556152, + "learning_rate": 1.3483782392311653e-05, + "loss": 0.6096, + "step": 6416 + }, + { + "epoch": 1.1, + "grad_norm": 10.544780731201172, + "learning_rate": 1.3481208168869058e-05, + "loss": 0.694, + "step": 6417 + }, + { + "epoch": 1.1, + "grad_norm": 13.923746109008789, + "learning_rate": 1.3478633945426463e-05, + "loss": 0.7195, + "step": 6418 + }, + { + "epoch": 1.1, + "grad_norm": 14.48173713684082, + "learning_rate": 1.3476059721983868e-05, + "loss": 0.5827, + "step": 6419 + }, + { + "epoch": 1.1, + "grad_norm": 9.413176536560059, + "learning_rate": 1.3473485498541273e-05, + "loss": 0.6107, + "step": 6420 + }, + { + "epoch": 1.1, + "grad_norm": 9.797290802001953, + "learning_rate": 1.347091127509868e-05, + "loss": 0.7206, + "step": 6421 + }, + { + "epoch": 1.1, + "grad_norm": 9.028078079223633, + "learning_rate": 1.3468337051656085e-05, + "loss": 0.3351, + "step": 6422 + }, + { + "epoch": 1.1, + "grad_norm": 10.243917465209961, + "learning_rate": 1.346576282821349e-05, + "loss": 0.5047, + "step": 6423 + }, + { + "epoch": 1.1, + "grad_norm": 9.005094528198242, + "learning_rate": 1.3463188604770895e-05, + "loss": 0.4533, + "step": 6424 + }, + { + "epoch": 1.1, + "grad_norm": 8.739798545837402, + "learning_rate": 1.34606143813283e-05, + "loss": 0.3979, + "step": 6425 + }, + { + "epoch": 1.1, + "grad_norm": 9.81917667388916, + "learning_rate": 1.3458040157885705e-05, + "loss": 0.8191, + "step": 6426 + }, + { + "epoch": 1.1, + "grad_norm": 9.305505752563477, + "learning_rate": 1.345546593444311e-05, + "loss": 0.5539, + "step": 6427 + }, + { + "epoch": 1.1, + "grad_norm": 12.53671932220459, + "learning_rate": 1.3452891711000515e-05, + "loss": 0.3946, + "step": 6428 + }, + { + "epoch": 1.1, + "grad_norm": 11.575438499450684, + "learning_rate": 1.345031748755792e-05, + "loss": 0.4827, + "step": 6429 + }, + { + "epoch": 1.1, + "grad_norm": 11.569454193115234, + "learning_rate": 1.3447743264115325e-05, + "loss": 0.5711, + "step": 6430 + }, + { + "epoch": 1.1, + "grad_norm": 13.72888469696045, + "learning_rate": 1.3445169040672731e-05, + "loss": 0.6241, + "step": 6431 + }, + { + "epoch": 1.1, + "grad_norm": 10.193585395812988, + "learning_rate": 1.3442594817230136e-05, + "loss": 0.6339, + "step": 6432 + }, + { + "epoch": 1.1, + "grad_norm": 12.300786972045898, + "learning_rate": 1.3440020593787541e-05, + "loss": 0.6643, + "step": 6433 + }, + { + "epoch": 1.1, + "grad_norm": 11.59980583190918, + "learning_rate": 1.3437446370344946e-05, + "loss": 0.6161, + "step": 6434 + }, + { + "epoch": 1.1, + "grad_norm": 11.819564819335938, + "learning_rate": 1.3434872146902353e-05, + "loss": 0.6186, + "step": 6435 + }, + { + "epoch": 1.1, + "grad_norm": 11.4440279006958, + "learning_rate": 1.3432297923459756e-05, + "loss": 0.484, + "step": 6436 + }, + { + "epoch": 1.1, + "grad_norm": 12.75587272644043, + "learning_rate": 1.3429723700017161e-05, + "loss": 0.5328, + "step": 6437 + }, + { + "epoch": 1.1, + "grad_norm": 11.196776390075684, + "learning_rate": 1.3427149476574566e-05, + "loss": 0.5457, + "step": 6438 + }, + { + "epoch": 1.11, + "grad_norm": 10.927118301391602, + "learning_rate": 1.3424575253131971e-05, + "loss": 0.5412, + "step": 6439 + }, + { + "epoch": 1.11, + "grad_norm": 12.193121910095215, + "learning_rate": 1.3422001029689378e-05, + "loss": 0.4445, + "step": 6440 + }, + { + "epoch": 1.11, + "grad_norm": 9.619534492492676, + "learning_rate": 1.3419426806246783e-05, + "loss": 0.4927, + "step": 6441 + }, + { + "epoch": 1.11, + "grad_norm": 11.151309967041016, + "learning_rate": 1.3416852582804188e-05, + "loss": 0.6449, + "step": 6442 + }, + { + "epoch": 1.11, + "grad_norm": 8.470060348510742, + "learning_rate": 1.3414278359361593e-05, + "loss": 0.3805, + "step": 6443 + }, + { + "epoch": 1.11, + "grad_norm": 13.84939956665039, + "learning_rate": 1.3411704135918998e-05, + "loss": 0.5345, + "step": 6444 + }, + { + "epoch": 1.11, + "grad_norm": 12.695511817932129, + "learning_rate": 1.3409129912476405e-05, + "loss": 0.4607, + "step": 6445 + }, + { + "epoch": 1.11, + "grad_norm": 14.321296691894531, + "learning_rate": 1.340655568903381e-05, + "loss": 0.5508, + "step": 6446 + }, + { + "epoch": 1.11, + "grad_norm": 10.543728828430176, + "learning_rate": 1.3403981465591213e-05, + "loss": 0.6933, + "step": 6447 + }, + { + "epoch": 1.11, + "grad_norm": 10.99250602722168, + "learning_rate": 1.3401407242148618e-05, + "loss": 0.5453, + "step": 6448 + }, + { + "epoch": 1.11, + "grad_norm": 12.549531936645508, + "learning_rate": 1.3398833018706024e-05, + "loss": 0.7422, + "step": 6449 + }, + { + "epoch": 1.11, + "grad_norm": 12.516470909118652, + "learning_rate": 1.339625879526343e-05, + "loss": 0.6039, + "step": 6450 + }, + { + "epoch": 1.11, + "grad_norm": 10.269268035888672, + "learning_rate": 1.3393684571820834e-05, + "loss": 0.5617, + "step": 6451 + }, + { + "epoch": 1.11, + "grad_norm": 13.108945846557617, + "learning_rate": 1.339111034837824e-05, + "loss": 0.4165, + "step": 6452 + }, + { + "epoch": 1.11, + "grad_norm": 12.033666610717773, + "learning_rate": 1.3388536124935644e-05, + "loss": 0.6895, + "step": 6453 + }, + { + "epoch": 1.11, + "grad_norm": 10.472251892089844, + "learning_rate": 1.3385961901493051e-05, + "loss": 0.6872, + "step": 6454 + }, + { + "epoch": 1.11, + "grad_norm": 9.815091133117676, + "learning_rate": 1.3383387678050456e-05, + "loss": 0.4326, + "step": 6455 + }, + { + "epoch": 1.11, + "grad_norm": 9.05883502960205, + "learning_rate": 1.3380813454607861e-05, + "loss": 0.5259, + "step": 6456 + }, + { + "epoch": 1.11, + "grad_norm": 10.620383262634277, + "learning_rate": 1.3378239231165264e-05, + "loss": 0.6472, + "step": 6457 + }, + { + "epoch": 1.11, + "grad_norm": 8.336908340454102, + "learning_rate": 1.337566500772267e-05, + "loss": 0.4718, + "step": 6458 + }, + { + "epoch": 1.11, + "grad_norm": 11.147042274475098, + "learning_rate": 1.3373090784280076e-05, + "loss": 0.4651, + "step": 6459 + }, + { + "epoch": 1.11, + "grad_norm": 12.0889253616333, + "learning_rate": 1.3370516560837481e-05, + "loss": 0.6705, + "step": 6460 + }, + { + "epoch": 1.11, + "grad_norm": 12.031773567199707, + "learning_rate": 1.3367942337394886e-05, + "loss": 0.6216, + "step": 6461 + }, + { + "epoch": 1.11, + "grad_norm": 10.55211067199707, + "learning_rate": 1.3365368113952291e-05, + "loss": 0.693, + "step": 6462 + }, + { + "epoch": 1.11, + "grad_norm": 9.207207679748535, + "learning_rate": 1.3362793890509696e-05, + "loss": 0.4096, + "step": 6463 + }, + { + "epoch": 1.11, + "grad_norm": 7.5676679611206055, + "learning_rate": 1.3360219667067103e-05, + "loss": 0.4043, + "step": 6464 + }, + { + "epoch": 1.11, + "grad_norm": 8.421072959899902, + "learning_rate": 1.3357645443624508e-05, + "loss": 0.6345, + "step": 6465 + }, + { + "epoch": 1.11, + "grad_norm": 14.281852722167969, + "learning_rate": 1.3355071220181913e-05, + "loss": 0.7356, + "step": 6466 + }, + { + "epoch": 1.11, + "grad_norm": 9.934675216674805, + "learning_rate": 1.3352496996739316e-05, + "loss": 0.6215, + "step": 6467 + }, + { + "epoch": 1.11, + "grad_norm": 13.282097816467285, + "learning_rate": 1.3349922773296723e-05, + "loss": 0.8024, + "step": 6468 + }, + { + "epoch": 1.11, + "grad_norm": 10.563453674316406, + "learning_rate": 1.3347348549854128e-05, + "loss": 0.553, + "step": 6469 + }, + { + "epoch": 1.11, + "grad_norm": 9.30909538269043, + "learning_rate": 1.3344774326411533e-05, + "loss": 0.7434, + "step": 6470 + }, + { + "epoch": 1.11, + "grad_norm": 10.07658576965332, + "learning_rate": 1.3342200102968938e-05, + "loss": 0.5206, + "step": 6471 + }, + { + "epoch": 1.11, + "grad_norm": 7.457747459411621, + "learning_rate": 1.3339625879526343e-05, + "loss": 0.4478, + "step": 6472 + }, + { + "epoch": 1.11, + "grad_norm": 13.680563926696777, + "learning_rate": 1.333705165608375e-05, + "loss": 0.6205, + "step": 6473 + }, + { + "epoch": 1.11, + "grad_norm": 14.597335815429688, + "learning_rate": 1.3334477432641154e-05, + "loss": 0.6658, + "step": 6474 + }, + { + "epoch": 1.11, + "grad_norm": 9.61847972869873, + "learning_rate": 1.333190320919856e-05, + "loss": 0.4784, + "step": 6475 + }, + { + "epoch": 1.11, + "grad_norm": 9.983537673950195, + "learning_rate": 1.3329328985755964e-05, + "loss": 0.5591, + "step": 6476 + }, + { + "epoch": 1.11, + "grad_norm": 9.116236686706543, + "learning_rate": 1.332675476231337e-05, + "loss": 0.4129, + "step": 6477 + }, + { + "epoch": 1.11, + "grad_norm": 12.38467025756836, + "learning_rate": 1.3324180538870774e-05, + "loss": 0.6361, + "step": 6478 + }, + { + "epoch": 1.11, + "grad_norm": 8.936291694641113, + "learning_rate": 1.332160631542818e-05, + "loss": 0.4836, + "step": 6479 + }, + { + "epoch": 1.11, + "grad_norm": 12.047820091247559, + "learning_rate": 1.3319032091985584e-05, + "loss": 0.6939, + "step": 6480 + }, + { + "epoch": 1.11, + "grad_norm": 13.784460067749023, + "learning_rate": 1.3316457868542989e-05, + "loss": 0.6411, + "step": 6481 + }, + { + "epoch": 1.11, + "grad_norm": 10.867308616638184, + "learning_rate": 1.3313883645100394e-05, + "loss": 0.5686, + "step": 6482 + }, + { + "epoch": 1.11, + "grad_norm": 13.41030502319336, + "learning_rate": 1.33113094216578e-05, + "loss": 0.4649, + "step": 6483 + }, + { + "epoch": 1.11, + "grad_norm": 10.422324180603027, + "learning_rate": 1.3308735198215206e-05, + "loss": 0.4035, + "step": 6484 + }, + { + "epoch": 1.11, + "grad_norm": 8.874316215515137, + "learning_rate": 1.330616097477261e-05, + "loss": 0.4512, + "step": 6485 + }, + { + "epoch": 1.11, + "grad_norm": 11.03233814239502, + "learning_rate": 1.3303586751330016e-05, + "loss": 0.4717, + "step": 6486 + }, + { + "epoch": 1.11, + "grad_norm": 13.250544548034668, + "learning_rate": 1.3301012527887422e-05, + "loss": 0.5532, + "step": 6487 + }, + { + "epoch": 1.11, + "grad_norm": 9.79434585571289, + "learning_rate": 1.3298438304444826e-05, + "loss": 0.4571, + "step": 6488 + }, + { + "epoch": 1.11, + "grad_norm": 10.573814392089844, + "learning_rate": 1.329586408100223e-05, + "loss": 0.4652, + "step": 6489 + }, + { + "epoch": 1.11, + "grad_norm": 14.933924674987793, + "learning_rate": 1.3293289857559636e-05, + "loss": 0.6203, + "step": 6490 + }, + { + "epoch": 1.11, + "grad_norm": 9.825202941894531, + "learning_rate": 1.329071563411704e-05, + "loss": 0.5241, + "step": 6491 + }, + { + "epoch": 1.11, + "grad_norm": 13.141525268554688, + "learning_rate": 1.3288141410674447e-05, + "loss": 0.5837, + "step": 6492 + }, + { + "epoch": 1.11, + "grad_norm": 12.80006217956543, + "learning_rate": 1.3285567187231852e-05, + "loss": 0.6059, + "step": 6493 + }, + { + "epoch": 1.11, + "grad_norm": 11.984138488769531, + "learning_rate": 1.3282992963789257e-05, + "loss": 0.5903, + "step": 6494 + }, + { + "epoch": 1.11, + "grad_norm": 9.175521850585938, + "learning_rate": 1.3280418740346662e-05, + "loss": 0.4014, + "step": 6495 + }, + { + "epoch": 1.11, + "grad_norm": 12.794022560119629, + "learning_rate": 1.3277844516904067e-05, + "loss": 0.4755, + "step": 6496 + }, + { + "epoch": 1.11, + "grad_norm": 8.497796058654785, + "learning_rate": 1.3275270293461474e-05, + "loss": 0.538, + "step": 6497 + }, + { + "epoch": 1.12, + "grad_norm": 9.756322860717773, + "learning_rate": 1.3272696070018879e-05, + "loss": 0.4544, + "step": 6498 + }, + { + "epoch": 1.12, + "grad_norm": 11.190783500671387, + "learning_rate": 1.3270121846576282e-05, + "loss": 0.5956, + "step": 6499 + }, + { + "epoch": 1.12, + "grad_norm": 10.597745895385742, + "learning_rate": 1.3267547623133687e-05, + "loss": 0.5907, + "step": 6500 + }, + { + "epoch": 1.12, + "grad_norm": 10.086770057678223, + "learning_rate": 1.3264973399691094e-05, + "loss": 0.6616, + "step": 6501 + }, + { + "epoch": 1.12, + "grad_norm": 10.597195625305176, + "learning_rate": 1.3262399176248499e-05, + "loss": 0.6504, + "step": 6502 + }, + { + "epoch": 1.12, + "grad_norm": 7.084816932678223, + "learning_rate": 1.3259824952805904e-05, + "loss": 0.3417, + "step": 6503 + }, + { + "epoch": 1.12, + "grad_norm": 11.245378494262695, + "learning_rate": 1.3257250729363309e-05, + "loss": 0.6522, + "step": 6504 + }, + { + "epoch": 1.12, + "grad_norm": 9.221365928649902, + "learning_rate": 1.3254676505920714e-05, + "loss": 0.5071, + "step": 6505 + }, + { + "epoch": 1.12, + "grad_norm": 12.660913467407227, + "learning_rate": 1.325210228247812e-05, + "loss": 0.7962, + "step": 6506 + }, + { + "epoch": 1.12, + "grad_norm": 9.997649192810059, + "learning_rate": 1.3249528059035526e-05, + "loss": 0.5702, + "step": 6507 + }, + { + "epoch": 1.12, + "grad_norm": 8.492313385009766, + "learning_rate": 1.324695383559293e-05, + "loss": 0.4827, + "step": 6508 + }, + { + "epoch": 1.12, + "grad_norm": 8.572259902954102, + "learning_rate": 1.3244379612150334e-05, + "loss": 0.5088, + "step": 6509 + }, + { + "epoch": 1.12, + "grad_norm": 9.79743766784668, + "learning_rate": 1.3241805388707739e-05, + "loss": 0.5494, + "step": 6510 + }, + { + "epoch": 1.12, + "grad_norm": 11.583270072937012, + "learning_rate": 1.3239231165265146e-05, + "loss": 0.714, + "step": 6511 + }, + { + "epoch": 1.12, + "grad_norm": 8.860138893127441, + "learning_rate": 1.323665694182255e-05, + "loss": 0.4833, + "step": 6512 + }, + { + "epoch": 1.12, + "grad_norm": 13.43123722076416, + "learning_rate": 1.3234082718379956e-05, + "loss": 0.739, + "step": 6513 + }, + { + "epoch": 1.12, + "grad_norm": 12.040709495544434, + "learning_rate": 1.323150849493736e-05, + "loss": 0.7545, + "step": 6514 + }, + { + "epoch": 1.12, + "grad_norm": 7.854092121124268, + "learning_rate": 1.3228934271494765e-05, + "loss": 0.3384, + "step": 6515 + }, + { + "epoch": 1.12, + "grad_norm": 13.156378746032715, + "learning_rate": 1.3226360048052172e-05, + "loss": 0.5428, + "step": 6516 + }, + { + "epoch": 1.12, + "grad_norm": 10.444669723510742, + "learning_rate": 1.3223785824609577e-05, + "loss": 0.5682, + "step": 6517 + }, + { + "epoch": 1.12, + "grad_norm": 9.886656761169434, + "learning_rate": 1.3221211601166982e-05, + "loss": 0.5117, + "step": 6518 + }, + { + "epoch": 1.12, + "grad_norm": 8.365802764892578, + "learning_rate": 1.3218637377724387e-05, + "loss": 0.5611, + "step": 6519 + }, + { + "epoch": 1.12, + "grad_norm": 10.611737251281738, + "learning_rate": 1.3216063154281792e-05, + "loss": 0.5034, + "step": 6520 + }, + { + "epoch": 1.12, + "grad_norm": 10.446642875671387, + "learning_rate": 1.3213488930839197e-05, + "loss": 0.7348, + "step": 6521 + }, + { + "epoch": 1.12, + "grad_norm": 8.14039134979248, + "learning_rate": 1.3210914707396602e-05, + "loss": 0.3526, + "step": 6522 + }, + { + "epoch": 1.12, + "grad_norm": 10.861857414245605, + "learning_rate": 1.3208340483954007e-05, + "loss": 0.5412, + "step": 6523 + }, + { + "epoch": 1.12, + "grad_norm": 10.091312408447266, + "learning_rate": 1.3205766260511412e-05, + "loss": 0.5061, + "step": 6524 + }, + { + "epoch": 1.12, + "grad_norm": 10.006696701049805, + "learning_rate": 1.3203192037068819e-05, + "loss": 0.5319, + "step": 6525 + }, + { + "epoch": 1.12, + "grad_norm": 8.957270622253418, + "learning_rate": 1.3200617813626224e-05, + "loss": 0.3701, + "step": 6526 + }, + { + "epoch": 1.12, + "grad_norm": 10.448317527770996, + "learning_rate": 1.3198043590183629e-05, + "loss": 0.4563, + "step": 6527 + }, + { + "epoch": 1.12, + "grad_norm": 10.945698738098145, + "learning_rate": 1.3195469366741034e-05, + "loss": 0.4663, + "step": 6528 + }, + { + "epoch": 1.12, + "grad_norm": 8.893011093139648, + "learning_rate": 1.3192895143298439e-05, + "loss": 0.5749, + "step": 6529 + }, + { + "epoch": 1.12, + "grad_norm": 10.641251564025879, + "learning_rate": 1.3190320919855844e-05, + "loss": 0.527, + "step": 6530 + }, + { + "epoch": 1.12, + "grad_norm": 9.86562728881836, + "learning_rate": 1.3187746696413249e-05, + "loss": 0.4909, + "step": 6531 + }, + { + "epoch": 1.12, + "grad_norm": 13.261011123657227, + "learning_rate": 1.3185172472970654e-05, + "loss": 0.6065, + "step": 6532 + }, + { + "epoch": 1.12, + "grad_norm": 13.138240814208984, + "learning_rate": 1.3182598249528059e-05, + "loss": 0.6694, + "step": 6533 + }, + { + "epoch": 1.12, + "grad_norm": 10.378952026367188, + "learning_rate": 1.3180024026085464e-05, + "loss": 0.4776, + "step": 6534 + }, + { + "epoch": 1.12, + "grad_norm": 9.386820793151855, + "learning_rate": 1.317744980264287e-05, + "loss": 0.5835, + "step": 6535 + }, + { + "epoch": 1.12, + "grad_norm": 13.621814727783203, + "learning_rate": 1.3174875579200275e-05, + "loss": 0.6197, + "step": 6536 + }, + { + "epoch": 1.12, + "grad_norm": 10.80805778503418, + "learning_rate": 1.317230135575768e-05, + "loss": 0.4563, + "step": 6537 + }, + { + "epoch": 1.12, + "grad_norm": 11.223711967468262, + "learning_rate": 1.3169727132315085e-05, + "loss": 0.7816, + "step": 6538 + }, + { + "epoch": 1.12, + "grad_norm": 10.302451133728027, + "learning_rate": 1.3167152908872492e-05, + "loss": 0.4981, + "step": 6539 + }, + { + "epoch": 1.12, + "grad_norm": 12.550187110900879, + "learning_rate": 1.3164578685429895e-05, + "loss": 0.612, + "step": 6540 + }, + { + "epoch": 1.12, + "grad_norm": 11.217123031616211, + "learning_rate": 1.31620044619873e-05, + "loss": 0.5187, + "step": 6541 + }, + { + "epoch": 1.12, + "grad_norm": 11.528560638427734, + "learning_rate": 1.3159430238544705e-05, + "loss": 0.6893, + "step": 6542 + }, + { + "epoch": 1.12, + "grad_norm": 10.158309936523438, + "learning_rate": 1.315685601510211e-05, + "loss": 0.5037, + "step": 6543 + }, + { + "epoch": 1.12, + "grad_norm": 10.893864631652832, + "learning_rate": 1.3154281791659517e-05, + "loss": 0.6542, + "step": 6544 + }, + { + "epoch": 1.12, + "grad_norm": 13.234877586364746, + "learning_rate": 1.3151707568216922e-05, + "loss": 0.5203, + "step": 6545 + }, + { + "epoch": 1.12, + "grad_norm": 9.685635566711426, + "learning_rate": 1.3149133344774327e-05, + "loss": 0.6837, + "step": 6546 + }, + { + "epoch": 1.12, + "grad_norm": 7.813433647155762, + "learning_rate": 1.3146559121331732e-05, + "loss": 0.4363, + "step": 6547 + }, + { + "epoch": 1.12, + "grad_norm": 7.92897367477417, + "learning_rate": 1.3143984897889137e-05, + "loss": 0.4507, + "step": 6548 + }, + { + "epoch": 1.12, + "grad_norm": 8.177204132080078, + "learning_rate": 1.3141410674446544e-05, + "loss": 0.4679, + "step": 6549 + }, + { + "epoch": 1.12, + "grad_norm": 11.392449378967285, + "learning_rate": 1.3138836451003949e-05, + "loss": 0.5465, + "step": 6550 + }, + { + "epoch": 1.12, + "grad_norm": 7.450992584228516, + "learning_rate": 1.3136262227561352e-05, + "loss": 0.5741, + "step": 6551 + }, + { + "epoch": 1.12, + "grad_norm": 10.513679504394531, + "learning_rate": 1.3133688004118757e-05, + "loss": 0.4398, + "step": 6552 + }, + { + "epoch": 1.12, + "grad_norm": 10.889396667480469, + "learning_rate": 1.3131113780676162e-05, + "loss": 0.8657, + "step": 6553 + }, + { + "epoch": 1.12, + "grad_norm": 8.439090728759766, + "learning_rate": 1.3128539557233568e-05, + "loss": 0.5031, + "step": 6554 + }, + { + "epoch": 1.12, + "grad_norm": 9.649551391601562, + "learning_rate": 1.3125965333790973e-05, + "loss": 0.4117, + "step": 6555 + }, + { + "epoch": 1.13, + "grad_norm": 8.325549125671387, + "learning_rate": 1.3123391110348378e-05, + "loss": 0.4438, + "step": 6556 + }, + { + "epoch": 1.13, + "grad_norm": 14.233033180236816, + "learning_rate": 1.3120816886905783e-05, + "loss": 0.7199, + "step": 6557 + }, + { + "epoch": 1.13, + "grad_norm": 10.1182222366333, + "learning_rate": 1.311824266346319e-05, + "loss": 0.569, + "step": 6558 + }, + { + "epoch": 1.13, + "grad_norm": 13.452754974365234, + "learning_rate": 1.3115668440020595e-05, + "loss": 0.5249, + "step": 6559 + }, + { + "epoch": 1.13, + "grad_norm": 11.976300239562988, + "learning_rate": 1.3113094216578e-05, + "loss": 0.7084, + "step": 6560 + }, + { + "epoch": 1.13, + "grad_norm": 11.58817195892334, + "learning_rate": 1.3110519993135403e-05, + "loss": 0.4723, + "step": 6561 + }, + { + "epoch": 1.13, + "grad_norm": 10.377490043640137, + "learning_rate": 1.3107945769692808e-05, + "loss": 0.4459, + "step": 6562 + }, + { + "epoch": 1.13, + "grad_norm": 9.727197647094727, + "learning_rate": 1.3105371546250215e-05, + "loss": 0.4807, + "step": 6563 + }, + { + "epoch": 1.13, + "grad_norm": 10.826410293579102, + "learning_rate": 1.310279732280762e-05, + "loss": 0.5837, + "step": 6564 + }, + { + "epoch": 1.13, + "grad_norm": 10.527913093566895, + "learning_rate": 1.3100223099365025e-05, + "loss": 0.3558, + "step": 6565 + }, + { + "epoch": 1.13, + "grad_norm": 10.869670867919922, + "learning_rate": 1.309764887592243e-05, + "loss": 0.6012, + "step": 6566 + }, + { + "epoch": 1.13, + "grad_norm": 13.175188064575195, + "learning_rate": 1.3095074652479835e-05, + "loss": 0.6005, + "step": 6567 + }, + { + "epoch": 1.13, + "grad_norm": 9.989154815673828, + "learning_rate": 1.3092500429037242e-05, + "loss": 0.6041, + "step": 6568 + }, + { + "epoch": 1.13, + "grad_norm": 13.257312774658203, + "learning_rate": 1.3089926205594647e-05, + "loss": 0.6224, + "step": 6569 + }, + { + "epoch": 1.13, + "grad_norm": 11.00068473815918, + "learning_rate": 1.3087351982152052e-05, + "loss": 0.5205, + "step": 6570 + }, + { + "epoch": 1.13, + "grad_norm": 9.617300987243652, + "learning_rate": 1.3084777758709457e-05, + "loss": 0.5443, + "step": 6571 + }, + { + "epoch": 1.13, + "grad_norm": 8.372429847717285, + "learning_rate": 1.3082203535266862e-05, + "loss": 0.4517, + "step": 6572 + }, + { + "epoch": 1.13, + "grad_norm": 8.8631591796875, + "learning_rate": 1.3079629311824267e-05, + "loss": 0.4206, + "step": 6573 + }, + { + "epoch": 1.13, + "grad_norm": 6.8099541664123535, + "learning_rate": 1.3077055088381672e-05, + "loss": 0.3376, + "step": 6574 + }, + { + "epoch": 1.13, + "grad_norm": 10.668180465698242, + "learning_rate": 1.3074480864939077e-05, + "loss": 0.6727, + "step": 6575 + }, + { + "epoch": 1.13, + "grad_norm": 7.970752716064453, + "learning_rate": 1.3071906641496482e-05, + "loss": 0.3809, + "step": 6576 + }, + { + "epoch": 1.13, + "grad_norm": 10.777921676635742, + "learning_rate": 1.3069332418053888e-05, + "loss": 0.4781, + "step": 6577 + }, + { + "epoch": 1.13, + "grad_norm": 11.43701171875, + "learning_rate": 1.3066758194611293e-05, + "loss": 0.5624, + "step": 6578 + }, + { + "epoch": 1.13, + "grad_norm": 9.084794998168945, + "learning_rate": 1.3064183971168698e-05, + "loss": 0.5276, + "step": 6579 + }, + { + "epoch": 1.13, + "grad_norm": 9.189397811889648, + "learning_rate": 1.3061609747726103e-05, + "loss": 0.4693, + "step": 6580 + }, + { + "epoch": 1.13, + "grad_norm": 9.391263961791992, + "learning_rate": 1.3059035524283508e-05, + "loss": 0.5776, + "step": 6581 + }, + { + "epoch": 1.13, + "grad_norm": 10.65726089477539, + "learning_rate": 1.3056461300840913e-05, + "loss": 0.4856, + "step": 6582 + }, + { + "epoch": 1.13, + "grad_norm": 13.06060791015625, + "learning_rate": 1.3053887077398318e-05, + "loss": 0.6063, + "step": 6583 + }, + { + "epoch": 1.13, + "grad_norm": 26.931459426879883, + "learning_rate": 1.3051312853955723e-05, + "loss": 0.5167, + "step": 6584 + }, + { + "epoch": 1.13, + "grad_norm": 13.135416030883789, + "learning_rate": 1.3048738630513128e-05, + "loss": 0.6144, + "step": 6585 + }, + { + "epoch": 1.13, + "grad_norm": 12.115387916564941, + "learning_rate": 1.3046164407070533e-05, + "loss": 0.5541, + "step": 6586 + }, + { + "epoch": 1.13, + "grad_norm": 10.395634651184082, + "learning_rate": 1.304359018362794e-05, + "loss": 0.3439, + "step": 6587 + }, + { + "epoch": 1.13, + "grad_norm": 9.804046630859375, + "learning_rate": 1.3041015960185345e-05, + "loss": 0.5401, + "step": 6588 + }, + { + "epoch": 1.13, + "grad_norm": 9.473113059997559, + "learning_rate": 1.303844173674275e-05, + "loss": 0.4357, + "step": 6589 + }, + { + "epoch": 1.13, + "grad_norm": 7.7532854080200195, + "learning_rate": 1.3035867513300155e-05, + "loss": 0.3525, + "step": 6590 + }, + { + "epoch": 1.13, + "grad_norm": 10.283815383911133, + "learning_rate": 1.3033293289857561e-05, + "loss": 0.4762, + "step": 6591 + }, + { + "epoch": 1.13, + "grad_norm": 8.335599899291992, + "learning_rate": 1.3030719066414965e-05, + "loss": 0.3369, + "step": 6592 + }, + { + "epoch": 1.13, + "grad_norm": 11.679243087768555, + "learning_rate": 1.302814484297237e-05, + "loss": 0.5246, + "step": 6593 + }, + { + "epoch": 1.13, + "grad_norm": 10.233473777770996, + "learning_rate": 1.3025570619529775e-05, + "loss": 0.5099, + "step": 6594 + }, + { + "epoch": 1.13, + "grad_norm": 9.13751220703125, + "learning_rate": 1.302299639608718e-05, + "loss": 0.3833, + "step": 6595 + }, + { + "epoch": 1.13, + "grad_norm": 9.868047714233398, + "learning_rate": 1.3020422172644586e-05, + "loss": 0.3488, + "step": 6596 + }, + { + "epoch": 1.13, + "grad_norm": 12.519608497619629, + "learning_rate": 1.3017847949201991e-05, + "loss": 0.4467, + "step": 6597 + }, + { + "epoch": 1.13, + "grad_norm": 10.756606101989746, + "learning_rate": 1.3015273725759396e-05, + "loss": 0.5031, + "step": 6598 + }, + { + "epoch": 1.13, + "grad_norm": 11.033324241638184, + "learning_rate": 1.3012699502316801e-05, + "loss": 0.5938, + "step": 6599 + }, + { + "epoch": 1.13, + "grad_norm": 9.303865432739258, + "learning_rate": 1.3010125278874206e-05, + "loss": 0.4744, + "step": 6600 + }, + { + "epoch": 1.13, + "grad_norm": 22.110292434692383, + "learning_rate": 1.3007551055431613e-05, + "loss": 0.4931, + "step": 6601 + }, + { + "epoch": 1.13, + "grad_norm": 11.843348503112793, + "learning_rate": 1.3004976831989018e-05, + "loss": 0.5608, + "step": 6602 + }, + { + "epoch": 1.13, + "grad_norm": 14.494935989379883, + "learning_rate": 1.3002402608546421e-05, + "loss": 0.5409, + "step": 6603 + }, + { + "epoch": 1.13, + "grad_norm": 11.201446533203125, + "learning_rate": 1.2999828385103826e-05, + "loss": 0.4608, + "step": 6604 + }, + { + "epoch": 1.13, + "grad_norm": 11.345254898071289, + "learning_rate": 1.2997254161661231e-05, + "loss": 0.6298, + "step": 6605 + }, + { + "epoch": 1.13, + "grad_norm": 8.122580528259277, + "learning_rate": 1.2994679938218638e-05, + "loss": 0.4921, + "step": 6606 + }, + { + "epoch": 1.13, + "grad_norm": 10.820239067077637, + "learning_rate": 1.2992105714776043e-05, + "loss": 0.5318, + "step": 6607 + }, + { + "epoch": 1.13, + "grad_norm": 10.083476066589355, + "learning_rate": 1.2989531491333448e-05, + "loss": 0.6027, + "step": 6608 + }, + { + "epoch": 1.13, + "grad_norm": 12.082131385803223, + "learning_rate": 1.2986957267890853e-05, + "loss": 0.4911, + "step": 6609 + }, + { + "epoch": 1.13, + "grad_norm": 11.85802173614502, + "learning_rate": 1.298438304444826e-05, + "loss": 0.706, + "step": 6610 + }, + { + "epoch": 1.13, + "grad_norm": 11.752325057983398, + "learning_rate": 1.2981808821005665e-05, + "loss": 0.7447, + "step": 6611 + }, + { + "epoch": 1.13, + "grad_norm": 12.42308521270752, + "learning_rate": 1.297923459756307e-05, + "loss": 0.6757, + "step": 6612 + }, + { + "epoch": 1.13, + "grad_norm": 12.254287719726562, + "learning_rate": 1.2976660374120473e-05, + "loss": 0.5193, + "step": 6613 + }, + { + "epoch": 1.14, + "grad_norm": 12.064400672912598, + "learning_rate": 1.2974086150677878e-05, + "loss": 0.5508, + "step": 6614 + }, + { + "epoch": 1.14, + "grad_norm": 11.173871994018555, + "learning_rate": 1.2971511927235285e-05, + "loss": 0.5603, + "step": 6615 + }, + { + "epoch": 1.14, + "grad_norm": 11.750433921813965, + "learning_rate": 1.296893770379269e-05, + "loss": 0.616, + "step": 6616 + }, + { + "epoch": 1.14, + "grad_norm": 12.16136646270752, + "learning_rate": 1.2966363480350095e-05, + "loss": 0.7753, + "step": 6617 + }, + { + "epoch": 1.14, + "grad_norm": 14.198493957519531, + "learning_rate": 1.29637892569075e-05, + "loss": 0.5215, + "step": 6618 + }, + { + "epoch": 1.14, + "grad_norm": 11.20034122467041, + "learning_rate": 1.2961215033464905e-05, + "loss": 0.5867, + "step": 6619 + }, + { + "epoch": 1.14, + "grad_norm": 12.259267807006836, + "learning_rate": 1.2958640810022311e-05, + "loss": 0.5525, + "step": 6620 + }, + { + "epoch": 1.14, + "grad_norm": 9.16513442993164, + "learning_rate": 1.2956066586579716e-05, + "loss": 0.4784, + "step": 6621 + }, + { + "epoch": 1.14, + "grad_norm": 12.509882926940918, + "learning_rate": 1.2953492363137121e-05, + "loss": 0.7367, + "step": 6622 + }, + { + "epoch": 1.14, + "grad_norm": 10.642499923706055, + "learning_rate": 1.2950918139694526e-05, + "loss": 0.835, + "step": 6623 + }, + { + "epoch": 1.14, + "grad_norm": 12.391217231750488, + "learning_rate": 1.2948343916251931e-05, + "loss": 0.7651, + "step": 6624 + }, + { + "epoch": 1.14, + "grad_norm": 7.915867328643799, + "learning_rate": 1.2945769692809336e-05, + "loss": 0.4071, + "step": 6625 + }, + { + "epoch": 1.14, + "grad_norm": 13.243711471557617, + "learning_rate": 1.2943195469366741e-05, + "loss": 0.6216, + "step": 6626 + }, + { + "epoch": 1.14, + "grad_norm": 10.474575996398926, + "learning_rate": 1.2940621245924146e-05, + "loss": 0.4674, + "step": 6627 + }, + { + "epoch": 1.14, + "grad_norm": 12.771490097045898, + "learning_rate": 1.2938047022481551e-05, + "loss": 0.3771, + "step": 6628 + }, + { + "epoch": 1.14, + "grad_norm": 11.810847282409668, + "learning_rate": 1.2935472799038958e-05, + "loss": 0.6311, + "step": 6629 + }, + { + "epoch": 1.14, + "grad_norm": 8.244412422180176, + "learning_rate": 1.2932898575596363e-05, + "loss": 0.4131, + "step": 6630 + }, + { + "epoch": 1.14, + "grad_norm": 9.575753211975098, + "learning_rate": 1.2930324352153768e-05, + "loss": 0.5737, + "step": 6631 + }, + { + "epoch": 1.14, + "grad_norm": 10.175783157348633, + "learning_rate": 1.2927750128711173e-05, + "loss": 0.5061, + "step": 6632 + }, + { + "epoch": 1.14, + "grad_norm": 10.517440795898438, + "learning_rate": 1.2925175905268578e-05, + "loss": 0.5074, + "step": 6633 + }, + { + "epoch": 1.14, + "grad_norm": 12.021347999572754, + "learning_rate": 1.2922601681825983e-05, + "loss": 0.5154, + "step": 6634 + }, + { + "epoch": 1.14, + "grad_norm": 11.485918045043945, + "learning_rate": 1.2920027458383388e-05, + "loss": 0.4024, + "step": 6635 + }, + { + "epoch": 1.14, + "grad_norm": 7.599707126617432, + "learning_rate": 1.2917453234940793e-05, + "loss": 0.5677, + "step": 6636 + }, + { + "epoch": 1.14, + "grad_norm": 7.97475528717041, + "learning_rate": 1.2914879011498198e-05, + "loss": 0.5792, + "step": 6637 + }, + { + "epoch": 1.14, + "grad_norm": 10.74142074584961, + "learning_rate": 1.2912304788055603e-05, + "loss": 0.564, + "step": 6638 + }, + { + "epoch": 1.14, + "grad_norm": 7.491780757904053, + "learning_rate": 1.290973056461301e-05, + "loss": 0.4438, + "step": 6639 + }, + { + "epoch": 1.14, + "grad_norm": 9.42711353302002, + "learning_rate": 1.2907156341170414e-05, + "loss": 0.4575, + "step": 6640 + }, + { + "epoch": 1.14, + "grad_norm": 8.107260704040527, + "learning_rate": 1.290458211772782e-05, + "loss": 0.4211, + "step": 6641 + }, + { + "epoch": 1.14, + "grad_norm": 9.416579246520996, + "learning_rate": 1.2902007894285224e-05, + "loss": 0.4333, + "step": 6642 + }, + { + "epoch": 1.14, + "grad_norm": 12.01895523071289, + "learning_rate": 1.2899433670842631e-05, + "loss": 0.5369, + "step": 6643 + }, + { + "epoch": 1.14, + "grad_norm": 12.207151412963867, + "learning_rate": 1.2896859447400034e-05, + "loss": 0.6145, + "step": 6644 + }, + { + "epoch": 1.14, + "grad_norm": 10.951322555541992, + "learning_rate": 1.289428522395744e-05, + "loss": 0.6184, + "step": 6645 + }, + { + "epoch": 1.14, + "grad_norm": 9.277907371520996, + "learning_rate": 1.2891711000514844e-05, + "loss": 0.3816, + "step": 6646 + }, + { + "epoch": 1.14, + "grad_norm": 10.779108047485352, + "learning_rate": 1.288913677707225e-05, + "loss": 0.5212, + "step": 6647 + }, + { + "epoch": 1.14, + "grad_norm": 9.678054809570312, + "learning_rate": 1.2886562553629656e-05, + "loss": 0.6432, + "step": 6648 + }, + { + "epoch": 1.14, + "grad_norm": 9.1563081741333, + "learning_rate": 1.2883988330187061e-05, + "loss": 0.3937, + "step": 6649 + }, + { + "epoch": 1.14, + "grad_norm": 9.920218467712402, + "learning_rate": 1.2881414106744466e-05, + "loss": 0.4412, + "step": 6650 + }, + { + "epoch": 1.14, + "grad_norm": 11.496282577514648, + "learning_rate": 1.2878839883301871e-05, + "loss": 0.6261, + "step": 6651 + }, + { + "epoch": 1.14, + "grad_norm": 10.1200590133667, + "learning_rate": 1.2876265659859276e-05, + "loss": 0.5965, + "step": 6652 + }, + { + "epoch": 1.14, + "grad_norm": 12.59368896484375, + "learning_rate": 1.2873691436416683e-05, + "loss": 0.4906, + "step": 6653 + }, + { + "epoch": 1.14, + "grad_norm": 8.63156509399414, + "learning_rate": 1.2871117212974088e-05, + "loss": 0.5116, + "step": 6654 + }, + { + "epoch": 1.14, + "grad_norm": 9.79605484008789, + "learning_rate": 1.286854298953149e-05, + "loss": 0.592, + "step": 6655 + }, + { + "epoch": 1.14, + "grad_norm": 8.004490852355957, + "learning_rate": 1.2865968766088896e-05, + "loss": 0.4054, + "step": 6656 + }, + { + "epoch": 1.14, + "grad_norm": 8.764505386352539, + "learning_rate": 1.28633945426463e-05, + "loss": 0.5286, + "step": 6657 + }, + { + "epoch": 1.14, + "grad_norm": 14.3403902053833, + "learning_rate": 1.2860820319203707e-05, + "loss": 0.7439, + "step": 6658 + }, + { + "epoch": 1.14, + "grad_norm": 8.892584800720215, + "learning_rate": 1.2858246095761112e-05, + "loss": 0.7384, + "step": 6659 + }, + { + "epoch": 1.14, + "grad_norm": 11.002338409423828, + "learning_rate": 1.2855671872318517e-05, + "loss": 0.6576, + "step": 6660 + }, + { + "epoch": 1.14, + "grad_norm": 10.832758903503418, + "learning_rate": 1.2853097648875922e-05, + "loss": 0.6233, + "step": 6661 + }, + { + "epoch": 1.14, + "grad_norm": 9.900686264038086, + "learning_rate": 1.2850523425433329e-05, + "loss": 0.5388, + "step": 6662 + }, + { + "epoch": 1.14, + "grad_norm": 11.024088859558105, + "learning_rate": 1.2847949201990734e-05, + "loss": 0.476, + "step": 6663 + }, + { + "epoch": 1.14, + "grad_norm": 9.43239688873291, + "learning_rate": 1.2845374978548139e-05, + "loss": 0.4752, + "step": 6664 + }, + { + "epoch": 1.14, + "grad_norm": 14.08057975769043, + "learning_rate": 1.2842800755105542e-05, + "loss": 0.5479, + "step": 6665 + }, + { + "epoch": 1.14, + "grad_norm": 10.494189262390137, + "learning_rate": 1.2840226531662947e-05, + "loss": 0.5553, + "step": 6666 + }, + { + "epoch": 1.14, + "grad_norm": 9.32431697845459, + "learning_rate": 1.2837652308220354e-05, + "loss": 0.4402, + "step": 6667 + }, + { + "epoch": 1.14, + "grad_norm": 12.670583724975586, + "learning_rate": 1.2835078084777759e-05, + "loss": 0.5737, + "step": 6668 + }, + { + "epoch": 1.14, + "grad_norm": 9.428166389465332, + "learning_rate": 1.2832503861335164e-05, + "loss": 0.3968, + "step": 6669 + }, + { + "epoch": 1.14, + "grad_norm": 10.724882125854492, + "learning_rate": 1.2829929637892569e-05, + "loss": 0.4201, + "step": 6670 + }, + { + "epoch": 1.14, + "grad_norm": 7.7952656745910645, + "learning_rate": 1.2827355414449974e-05, + "loss": 0.4525, + "step": 6671 + }, + { + "epoch": 1.15, + "grad_norm": 12.33945083618164, + "learning_rate": 1.282478119100738e-05, + "loss": 0.5353, + "step": 6672 + }, + { + "epoch": 1.15, + "grad_norm": 11.126082420349121, + "learning_rate": 1.2822206967564786e-05, + "loss": 0.5401, + "step": 6673 + }, + { + "epoch": 1.15, + "grad_norm": 11.593513488769531, + "learning_rate": 1.281963274412219e-05, + "loss": 0.7047, + "step": 6674 + }, + { + "epoch": 1.15, + "grad_norm": 11.098726272583008, + "learning_rate": 1.2817058520679596e-05, + "loss": 0.6034, + "step": 6675 + }, + { + "epoch": 1.15, + "grad_norm": 10.617947578430176, + "learning_rate": 1.2814484297237e-05, + "loss": 0.3692, + "step": 6676 + }, + { + "epoch": 1.15, + "grad_norm": 9.953615188598633, + "learning_rate": 1.2811910073794406e-05, + "loss": 0.5437, + "step": 6677 + }, + { + "epoch": 1.15, + "grad_norm": 11.29536247253418, + "learning_rate": 1.280933585035181e-05, + "loss": 0.53, + "step": 6678 + }, + { + "epoch": 1.15, + "grad_norm": 12.399231910705566, + "learning_rate": 1.2806761626909216e-05, + "loss": 0.6281, + "step": 6679 + }, + { + "epoch": 1.15, + "grad_norm": 10.361248970031738, + "learning_rate": 1.280418740346662e-05, + "loss": 0.3911, + "step": 6680 + }, + { + "epoch": 1.15, + "grad_norm": 12.777186393737793, + "learning_rate": 1.2801613180024027e-05, + "loss": 0.4876, + "step": 6681 + }, + { + "epoch": 1.15, + "grad_norm": 14.768627166748047, + "learning_rate": 1.2799038956581432e-05, + "loss": 0.7891, + "step": 6682 + }, + { + "epoch": 1.15, + "grad_norm": 11.145493507385254, + "learning_rate": 1.2796464733138837e-05, + "loss": 0.8949, + "step": 6683 + }, + { + "epoch": 1.15, + "grad_norm": 11.15176773071289, + "learning_rate": 1.2793890509696242e-05, + "loss": 0.5289, + "step": 6684 + }, + { + "epoch": 1.15, + "grad_norm": 12.483684539794922, + "learning_rate": 1.2791316286253647e-05, + "loss": 0.6661, + "step": 6685 + }, + { + "epoch": 1.15, + "grad_norm": 9.809720039367676, + "learning_rate": 1.2788742062811052e-05, + "loss": 0.4736, + "step": 6686 + }, + { + "epoch": 1.15, + "grad_norm": 18.834911346435547, + "learning_rate": 1.2786167839368457e-05, + "loss": 0.6128, + "step": 6687 + }, + { + "epoch": 1.15, + "grad_norm": 13.260591506958008, + "learning_rate": 1.2783593615925862e-05, + "loss": 0.7091, + "step": 6688 + }, + { + "epoch": 1.15, + "grad_norm": 10.857799530029297, + "learning_rate": 1.2781019392483267e-05, + "loss": 0.5103, + "step": 6689 + }, + { + "epoch": 1.15, + "grad_norm": 10.976391792297363, + "learning_rate": 1.2778445169040672e-05, + "loss": 0.693, + "step": 6690 + }, + { + "epoch": 1.15, + "grad_norm": 9.264769554138184, + "learning_rate": 1.2775870945598079e-05, + "loss": 0.4276, + "step": 6691 + }, + { + "epoch": 1.15, + "grad_norm": 7.126923084259033, + "learning_rate": 1.2773296722155484e-05, + "loss": 0.309, + "step": 6692 + }, + { + "epoch": 1.15, + "grad_norm": 9.771899223327637, + "learning_rate": 1.2770722498712889e-05, + "loss": 0.548, + "step": 6693 + }, + { + "epoch": 1.15, + "grad_norm": 10.559141159057617, + "learning_rate": 1.2768148275270294e-05, + "loss": 0.6396, + "step": 6694 + }, + { + "epoch": 1.15, + "grad_norm": 9.344493865966797, + "learning_rate": 1.27655740518277e-05, + "loss": 0.6079, + "step": 6695 + }, + { + "epoch": 1.15, + "grad_norm": 11.279424667358398, + "learning_rate": 1.2762999828385104e-05, + "loss": 0.9092, + "step": 6696 + }, + { + "epoch": 1.15, + "grad_norm": 12.471848487854004, + "learning_rate": 1.2760425604942509e-05, + "loss": 0.7587, + "step": 6697 + }, + { + "epoch": 1.15, + "grad_norm": 10.34504222869873, + "learning_rate": 1.2757851381499914e-05, + "loss": 0.5484, + "step": 6698 + }, + { + "epoch": 1.15, + "grad_norm": 10.52061653137207, + "learning_rate": 1.2755277158057319e-05, + "loss": 0.5139, + "step": 6699 + }, + { + "epoch": 1.15, + "grad_norm": 8.919501304626465, + "learning_rate": 1.2752702934614725e-05, + "loss": 0.4915, + "step": 6700 + }, + { + "epoch": 1.15, + "grad_norm": 10.039825439453125, + "learning_rate": 1.275012871117213e-05, + "loss": 0.3715, + "step": 6701 + }, + { + "epoch": 1.15, + "grad_norm": 10.143620491027832, + "learning_rate": 1.2747554487729535e-05, + "loss": 0.4217, + "step": 6702 + }, + { + "epoch": 1.15, + "grad_norm": 10.147356986999512, + "learning_rate": 1.274498026428694e-05, + "loss": 0.5138, + "step": 6703 + }, + { + "epoch": 1.15, + "grad_norm": 11.371033668518066, + "learning_rate": 1.2742406040844345e-05, + "loss": 0.5299, + "step": 6704 + }, + { + "epoch": 1.15, + "grad_norm": 10.041821479797363, + "learning_rate": 1.2739831817401752e-05, + "loss": 0.4868, + "step": 6705 + }, + { + "epoch": 1.15, + "grad_norm": 11.828451156616211, + "learning_rate": 1.2737257593959157e-05, + "loss": 0.5587, + "step": 6706 + }, + { + "epoch": 1.15, + "grad_norm": 9.729795455932617, + "learning_rate": 1.273468337051656e-05, + "loss": 0.3908, + "step": 6707 + }, + { + "epoch": 1.15, + "grad_norm": 10.917746543884277, + "learning_rate": 1.2732109147073965e-05, + "loss": 0.3145, + "step": 6708 + }, + { + "epoch": 1.15, + "grad_norm": 9.760326385498047, + "learning_rate": 1.272953492363137e-05, + "loss": 0.3523, + "step": 6709 + }, + { + "epoch": 1.15, + "grad_norm": 10.388890266418457, + "learning_rate": 1.2726960700188777e-05, + "loss": 0.5523, + "step": 6710 + }, + { + "epoch": 1.15, + "grad_norm": 12.174628257751465, + "learning_rate": 1.2724386476746182e-05, + "loss": 0.5867, + "step": 6711 + }, + { + "epoch": 1.15, + "grad_norm": 12.072940826416016, + "learning_rate": 1.2721812253303587e-05, + "loss": 0.5981, + "step": 6712 + }, + { + "epoch": 1.15, + "grad_norm": 11.517169952392578, + "learning_rate": 1.2719238029860992e-05, + "loss": 0.4229, + "step": 6713 + }, + { + "epoch": 1.15, + "grad_norm": 16.435815811157227, + "learning_rate": 1.2716663806418399e-05, + "loss": 0.8389, + "step": 6714 + }, + { + "epoch": 1.15, + "grad_norm": 10.92951774597168, + "learning_rate": 1.2714089582975804e-05, + "loss": 0.5887, + "step": 6715 + }, + { + "epoch": 1.15, + "grad_norm": 12.015429496765137, + "learning_rate": 1.2711515359533209e-05, + "loss": 0.5204, + "step": 6716 + }, + { + "epoch": 1.15, + "grad_norm": 12.950034141540527, + "learning_rate": 1.2708941136090612e-05, + "loss": 0.5696, + "step": 6717 + }, + { + "epoch": 1.15, + "grad_norm": 14.45562744140625, + "learning_rate": 1.2706366912648017e-05, + "loss": 0.7551, + "step": 6718 + }, + { + "epoch": 1.15, + "grad_norm": 15.538154602050781, + "learning_rate": 1.2703792689205424e-05, + "loss": 0.5767, + "step": 6719 + }, + { + "epoch": 1.15, + "grad_norm": 16.079790115356445, + "learning_rate": 1.2701218465762829e-05, + "loss": 0.5863, + "step": 6720 + }, + { + "epoch": 1.15, + "grad_norm": 13.61666488647461, + "learning_rate": 1.2698644242320234e-05, + "loss": 0.691, + "step": 6721 + }, + { + "epoch": 1.15, + "grad_norm": 8.60115909576416, + "learning_rate": 1.2696070018877639e-05, + "loss": 0.4377, + "step": 6722 + }, + { + "epoch": 1.15, + "grad_norm": 11.699920654296875, + "learning_rate": 1.2693495795435044e-05, + "loss": 0.6973, + "step": 6723 + }, + { + "epoch": 1.15, + "grad_norm": 7.728428840637207, + "learning_rate": 1.269092157199245e-05, + "loss": 0.4249, + "step": 6724 + }, + { + "epoch": 1.15, + "grad_norm": 10.760560989379883, + "learning_rate": 1.2688347348549855e-05, + "loss": 0.9823, + "step": 6725 + }, + { + "epoch": 1.15, + "grad_norm": 10.318336486816406, + "learning_rate": 1.268577312510726e-05, + "loss": 0.4204, + "step": 6726 + }, + { + "epoch": 1.15, + "grad_norm": 10.182144165039062, + "learning_rate": 1.2683198901664665e-05, + "loss": 0.4436, + "step": 6727 + }, + { + "epoch": 1.15, + "grad_norm": 12.014725685119629, + "learning_rate": 1.2680624678222068e-05, + "loss": 0.7163, + "step": 6728 + }, + { + "epoch": 1.15, + "grad_norm": 10.626301765441895, + "learning_rate": 1.2678050454779475e-05, + "loss": 0.6035, + "step": 6729 + }, + { + "epoch": 1.15, + "grad_norm": 11.62788200378418, + "learning_rate": 1.267547623133688e-05, + "loss": 0.4954, + "step": 6730 + }, + { + "epoch": 1.16, + "grad_norm": 9.553505897521973, + "learning_rate": 1.2672902007894285e-05, + "loss": 0.4281, + "step": 6731 + }, + { + "epoch": 1.16, + "grad_norm": 11.797175407409668, + "learning_rate": 1.267032778445169e-05, + "loss": 0.6781, + "step": 6732 + }, + { + "epoch": 1.16, + "grad_norm": 11.341609954833984, + "learning_rate": 1.2667753561009097e-05, + "loss": 0.6553, + "step": 6733 + }, + { + "epoch": 1.16, + "grad_norm": 10.617149353027344, + "learning_rate": 1.2665179337566502e-05, + "loss": 0.5201, + "step": 6734 + }, + { + "epoch": 1.16, + "grad_norm": 8.680614471435547, + "learning_rate": 1.2662605114123907e-05, + "loss": 0.6146, + "step": 6735 + }, + { + "epoch": 1.16, + "grad_norm": 11.916034698486328, + "learning_rate": 1.2660030890681312e-05, + "loss": 0.8438, + "step": 6736 + }, + { + "epoch": 1.16, + "grad_norm": 12.099270820617676, + "learning_rate": 1.2657456667238717e-05, + "loss": 0.5887, + "step": 6737 + }, + { + "epoch": 1.16, + "grad_norm": 9.548237800598145, + "learning_rate": 1.2654882443796122e-05, + "loss": 0.6987, + "step": 6738 + }, + { + "epoch": 1.16, + "grad_norm": 10.586852073669434, + "learning_rate": 1.2652308220353527e-05, + "loss": 0.4909, + "step": 6739 + }, + { + "epoch": 1.16, + "grad_norm": 8.159618377685547, + "learning_rate": 1.2649733996910932e-05, + "loss": 0.3954, + "step": 6740 + }, + { + "epoch": 1.16, + "grad_norm": 9.554677963256836, + "learning_rate": 1.2647159773468337e-05, + "loss": 0.5544, + "step": 6741 + }, + { + "epoch": 1.16, + "grad_norm": 14.785699844360352, + "learning_rate": 1.2644585550025742e-05, + "loss": 0.5357, + "step": 6742 + }, + { + "epoch": 1.16, + "grad_norm": 11.965754508972168, + "learning_rate": 1.2642011326583148e-05, + "loss": 0.5779, + "step": 6743 + }, + { + "epoch": 1.16, + "grad_norm": 8.249988555908203, + "learning_rate": 1.2639437103140553e-05, + "loss": 0.5882, + "step": 6744 + }, + { + "epoch": 1.16, + "grad_norm": 8.296032905578613, + "learning_rate": 1.2636862879697958e-05, + "loss": 0.3963, + "step": 6745 + }, + { + "epoch": 1.16, + "grad_norm": 8.62956428527832, + "learning_rate": 1.2634288656255363e-05, + "loss": 0.4947, + "step": 6746 + }, + { + "epoch": 1.16, + "grad_norm": 8.288333892822266, + "learning_rate": 1.263171443281277e-05, + "loss": 0.4609, + "step": 6747 + }, + { + "epoch": 1.16, + "grad_norm": 12.94849681854248, + "learning_rate": 1.2629140209370173e-05, + "loss": 0.6603, + "step": 6748 + }, + { + "epoch": 1.16, + "grad_norm": 10.329885482788086, + "learning_rate": 1.2626565985927578e-05, + "loss": 0.569, + "step": 6749 + }, + { + "epoch": 1.16, + "grad_norm": 9.659663200378418, + "learning_rate": 1.2623991762484983e-05, + "loss": 0.4487, + "step": 6750 + }, + { + "epoch": 1.16, + "grad_norm": 10.303452491760254, + "learning_rate": 1.2621417539042388e-05, + "loss": 0.5362, + "step": 6751 + }, + { + "epoch": 1.16, + "grad_norm": 12.65714168548584, + "learning_rate": 1.2618843315599795e-05, + "loss": 0.7225, + "step": 6752 + }, + { + "epoch": 1.16, + "grad_norm": 9.553597450256348, + "learning_rate": 1.26162690921572e-05, + "loss": 0.3538, + "step": 6753 + }, + { + "epoch": 1.16, + "grad_norm": 10.719615936279297, + "learning_rate": 1.2613694868714605e-05, + "loss": 0.5659, + "step": 6754 + }, + { + "epoch": 1.16, + "grad_norm": 9.221165657043457, + "learning_rate": 1.261112064527201e-05, + "loss": 0.4145, + "step": 6755 + }, + { + "epoch": 1.16, + "grad_norm": 13.493268013000488, + "learning_rate": 1.2608546421829415e-05, + "loss": 0.5086, + "step": 6756 + }, + { + "epoch": 1.16, + "grad_norm": 12.034260749816895, + "learning_rate": 1.2605972198386822e-05, + "loss": 0.6157, + "step": 6757 + }, + { + "epoch": 1.16, + "grad_norm": 13.103609085083008, + "learning_rate": 1.2603397974944227e-05, + "loss": 0.6273, + "step": 6758 + }, + { + "epoch": 1.16, + "grad_norm": 9.861727714538574, + "learning_rate": 1.260082375150163e-05, + "loss": 0.4413, + "step": 6759 + }, + { + "epoch": 1.16, + "grad_norm": 9.260859489440918, + "learning_rate": 1.2598249528059035e-05, + "loss": 0.5558, + "step": 6760 + }, + { + "epoch": 1.16, + "grad_norm": 9.298737525939941, + "learning_rate": 1.259567530461644e-05, + "loss": 0.3999, + "step": 6761 + }, + { + "epoch": 1.16, + "grad_norm": 9.542593002319336, + "learning_rate": 1.2593101081173846e-05, + "loss": 0.4122, + "step": 6762 + }, + { + "epoch": 1.16, + "grad_norm": 11.299402236938477, + "learning_rate": 1.2590526857731251e-05, + "loss": 0.4636, + "step": 6763 + }, + { + "epoch": 1.16, + "grad_norm": 13.075308799743652, + "learning_rate": 1.2587952634288656e-05, + "loss": 0.6894, + "step": 6764 + }, + { + "epoch": 1.16, + "grad_norm": 11.396122932434082, + "learning_rate": 1.2585378410846061e-05, + "loss": 0.5546, + "step": 6765 + }, + { + "epoch": 1.16, + "grad_norm": 8.977927207946777, + "learning_rate": 1.2582804187403468e-05, + "loss": 0.5561, + "step": 6766 + }, + { + "epoch": 1.16, + "grad_norm": 10.994551658630371, + "learning_rate": 1.2580229963960873e-05, + "loss": 0.3995, + "step": 6767 + }, + { + "epoch": 1.16, + "grad_norm": 12.106865882873535, + "learning_rate": 1.2577655740518278e-05, + "loss": 0.7406, + "step": 6768 + }, + { + "epoch": 1.16, + "grad_norm": 8.34880542755127, + "learning_rate": 1.2575081517075681e-05, + "loss": 0.427, + "step": 6769 + }, + { + "epoch": 1.16, + "grad_norm": 11.80229663848877, + "learning_rate": 1.2572507293633086e-05, + "loss": 0.6487, + "step": 6770 + }, + { + "epoch": 1.16, + "grad_norm": 12.099350929260254, + "learning_rate": 1.2569933070190493e-05, + "loss": 0.4811, + "step": 6771 + }, + { + "epoch": 1.16, + "grad_norm": 11.811274528503418, + "learning_rate": 1.2567358846747898e-05, + "loss": 0.6307, + "step": 6772 + }, + { + "epoch": 1.16, + "grad_norm": 10.495442390441895, + "learning_rate": 1.2564784623305303e-05, + "loss": 0.6225, + "step": 6773 + }, + { + "epoch": 1.16, + "grad_norm": 6.954984188079834, + "learning_rate": 1.2562210399862708e-05, + "loss": 0.3954, + "step": 6774 + }, + { + "epoch": 1.16, + "grad_norm": 9.646415710449219, + "learning_rate": 1.2559636176420113e-05, + "loss": 0.5928, + "step": 6775 + }, + { + "epoch": 1.16, + "grad_norm": 15.01123046875, + "learning_rate": 1.255706195297752e-05, + "loss": 0.4359, + "step": 6776 + }, + { + "epoch": 1.16, + "grad_norm": 9.666677474975586, + "learning_rate": 1.2554487729534925e-05, + "loss": 0.4723, + "step": 6777 + }, + { + "epoch": 1.16, + "grad_norm": 12.862643241882324, + "learning_rate": 1.255191350609233e-05, + "loss": 0.684, + "step": 6778 + }, + { + "epoch": 1.16, + "grad_norm": 11.741910934448242, + "learning_rate": 1.2549339282649735e-05, + "loss": 0.5769, + "step": 6779 + }, + { + "epoch": 1.16, + "grad_norm": 10.529659271240234, + "learning_rate": 1.2546765059207138e-05, + "loss": 0.5424, + "step": 6780 + }, + { + "epoch": 1.16, + "grad_norm": 6.7211012840271, + "learning_rate": 1.2544190835764545e-05, + "loss": 0.3708, + "step": 6781 + }, + { + "epoch": 1.16, + "grad_norm": 10.521566390991211, + "learning_rate": 1.254161661232195e-05, + "loss": 0.4304, + "step": 6782 + }, + { + "epoch": 1.16, + "grad_norm": 11.458416938781738, + "learning_rate": 1.2539042388879355e-05, + "loss": 0.5796, + "step": 6783 + }, + { + "epoch": 1.16, + "grad_norm": 12.682881355285645, + "learning_rate": 1.253646816543676e-05, + "loss": 0.7297, + "step": 6784 + }, + { + "epoch": 1.16, + "grad_norm": 8.332749366760254, + "learning_rate": 1.2533893941994166e-05, + "loss": 0.4912, + "step": 6785 + }, + { + "epoch": 1.16, + "grad_norm": 11.674663543701172, + "learning_rate": 1.2531319718551571e-05, + "loss": 0.4537, + "step": 6786 + }, + { + "epoch": 1.16, + "grad_norm": 9.876639366149902, + "learning_rate": 1.2528745495108976e-05, + "loss": 0.6138, + "step": 6787 + }, + { + "epoch": 1.16, + "grad_norm": 13.832884788513184, + "learning_rate": 1.2526171271666381e-05, + "loss": 0.6242, + "step": 6788 + }, + { + "epoch": 1.17, + "grad_norm": 8.507376670837402, + "learning_rate": 1.2523597048223786e-05, + "loss": 0.3926, + "step": 6789 + }, + { + "epoch": 1.17, + "grad_norm": 13.872946739196777, + "learning_rate": 1.2521022824781191e-05, + "loss": 0.4891, + "step": 6790 + }, + { + "epoch": 1.17, + "grad_norm": 7.619639873504639, + "learning_rate": 1.2518448601338596e-05, + "loss": 0.5368, + "step": 6791 + }, + { + "epoch": 1.17, + "grad_norm": 11.240330696105957, + "learning_rate": 1.2515874377896001e-05, + "loss": 0.4658, + "step": 6792 + }, + { + "epoch": 1.17, + "grad_norm": 10.234431266784668, + "learning_rate": 1.2513300154453406e-05, + "loss": 0.4802, + "step": 6793 + }, + { + "epoch": 1.17, + "grad_norm": 8.64299201965332, + "learning_rate": 1.2510725931010811e-05, + "loss": 0.5694, + "step": 6794 + }, + { + "epoch": 1.17, + "grad_norm": 9.616625785827637, + "learning_rate": 1.2508151707568218e-05, + "loss": 0.4409, + "step": 6795 + }, + { + "epoch": 1.17, + "grad_norm": 9.579665184020996, + "learning_rate": 1.2505577484125623e-05, + "loss": 0.4268, + "step": 6796 + }, + { + "epoch": 1.17, + "grad_norm": 10.091870307922363, + "learning_rate": 1.2503003260683028e-05, + "loss": 0.4956, + "step": 6797 + }, + { + "epoch": 1.17, + "grad_norm": 13.065946578979492, + "learning_rate": 1.2500429037240433e-05, + "loss": 0.7655, + "step": 6798 + }, + { + "epoch": 1.17, + "grad_norm": 9.62783145904541, + "learning_rate": 1.249785481379784e-05, + "loss": 0.5838, + "step": 6799 + }, + { + "epoch": 1.17, + "grad_norm": 9.778759002685547, + "learning_rate": 1.2495280590355243e-05, + "loss": 0.4989, + "step": 6800 + }, + { + "epoch": 1.17, + "grad_norm": 13.538947105407715, + "learning_rate": 1.2492706366912648e-05, + "loss": 0.6402, + "step": 6801 + }, + { + "epoch": 1.17, + "grad_norm": 8.49754810333252, + "learning_rate": 1.2490132143470053e-05, + "loss": 0.3548, + "step": 6802 + }, + { + "epoch": 1.17, + "grad_norm": 8.568778991699219, + "learning_rate": 1.2487557920027458e-05, + "loss": 0.3727, + "step": 6803 + }, + { + "epoch": 1.17, + "grad_norm": 10.141242027282715, + "learning_rate": 1.2484983696584864e-05, + "loss": 0.4706, + "step": 6804 + }, + { + "epoch": 1.17, + "grad_norm": 12.662147521972656, + "learning_rate": 1.248240947314227e-05, + "loss": 0.6372, + "step": 6805 + }, + { + "epoch": 1.17, + "grad_norm": 9.579142570495605, + "learning_rate": 1.2479835249699674e-05, + "loss": 0.513, + "step": 6806 + }, + { + "epoch": 1.17, + "grad_norm": 9.628792762756348, + "learning_rate": 1.247726102625708e-05, + "loss": 0.6056, + "step": 6807 + }, + { + "epoch": 1.17, + "grad_norm": 8.767404556274414, + "learning_rate": 1.2474686802814484e-05, + "loss": 0.6001, + "step": 6808 + }, + { + "epoch": 1.17, + "grad_norm": 11.569428443908691, + "learning_rate": 1.2472112579371891e-05, + "loss": 0.5376, + "step": 6809 + }, + { + "epoch": 1.17, + "grad_norm": 12.378584861755371, + "learning_rate": 1.2469538355929296e-05, + "loss": 0.3777, + "step": 6810 + }, + { + "epoch": 1.17, + "grad_norm": 11.086263656616211, + "learning_rate": 1.24669641324867e-05, + "loss": 0.6188, + "step": 6811 + }, + { + "epoch": 1.17, + "grad_norm": 10.007267951965332, + "learning_rate": 1.2464389909044104e-05, + "loss": 0.5115, + "step": 6812 + }, + { + "epoch": 1.17, + "grad_norm": 7.275233745574951, + "learning_rate": 1.246181568560151e-05, + "loss": 0.3426, + "step": 6813 + }, + { + "epoch": 1.17, + "grad_norm": 13.040640830993652, + "learning_rate": 1.2459241462158916e-05, + "loss": 0.6234, + "step": 6814 + }, + { + "epoch": 1.17, + "grad_norm": 8.325801849365234, + "learning_rate": 1.2456667238716321e-05, + "loss": 0.4771, + "step": 6815 + }, + { + "epoch": 1.17, + "grad_norm": 7.979382514953613, + "learning_rate": 1.2454093015273726e-05, + "loss": 0.3407, + "step": 6816 + }, + { + "epoch": 1.17, + "grad_norm": 13.458011627197266, + "learning_rate": 1.2451518791831131e-05, + "loss": 0.567, + "step": 6817 + }, + { + "epoch": 1.17, + "grad_norm": 15.544333457946777, + "learning_rate": 1.2448944568388538e-05, + "loss": 0.615, + "step": 6818 + }, + { + "epoch": 1.17, + "grad_norm": 8.83591079711914, + "learning_rate": 1.2446370344945943e-05, + "loss": 0.4769, + "step": 6819 + }, + { + "epoch": 1.17, + "grad_norm": 10.802139282226562, + "learning_rate": 1.2443796121503348e-05, + "loss": 0.5772, + "step": 6820 + }, + { + "epoch": 1.17, + "grad_norm": 11.135982513427734, + "learning_rate": 1.2441221898060751e-05, + "loss": 0.5482, + "step": 6821 + }, + { + "epoch": 1.17, + "grad_norm": 11.574701309204102, + "learning_rate": 1.2438647674618156e-05, + "loss": 0.3975, + "step": 6822 + }, + { + "epoch": 1.17, + "grad_norm": 10.294801712036133, + "learning_rate": 1.2436073451175563e-05, + "loss": 0.4745, + "step": 6823 + }, + { + "epoch": 1.17, + "grad_norm": 14.599040031433105, + "learning_rate": 1.2433499227732968e-05, + "loss": 0.5905, + "step": 6824 + }, + { + "epoch": 1.17, + "grad_norm": 6.782919406890869, + "learning_rate": 1.2430925004290373e-05, + "loss": 0.3649, + "step": 6825 + }, + { + "epoch": 1.17, + "grad_norm": 9.188220977783203, + "learning_rate": 1.2428350780847778e-05, + "loss": 0.4365, + "step": 6826 + }, + { + "epoch": 1.17, + "grad_norm": 12.269412994384766, + "learning_rate": 1.2425776557405183e-05, + "loss": 0.5385, + "step": 6827 + }, + { + "epoch": 1.17, + "grad_norm": 11.031576156616211, + "learning_rate": 1.242320233396259e-05, + "loss": 0.6859, + "step": 6828 + }, + { + "epoch": 1.17, + "grad_norm": 9.004095077514648, + "learning_rate": 1.2420628110519994e-05, + "loss": 0.5292, + "step": 6829 + }, + { + "epoch": 1.17, + "grad_norm": 9.24494457244873, + "learning_rate": 1.24180538870774e-05, + "loss": 0.424, + "step": 6830 + }, + { + "epoch": 1.17, + "grad_norm": 13.175368309020996, + "learning_rate": 1.2415479663634804e-05, + "loss": 0.6736, + "step": 6831 + }, + { + "epoch": 1.17, + "grad_norm": 11.513937950134277, + "learning_rate": 1.2412905440192207e-05, + "loss": 0.551, + "step": 6832 + }, + { + "epoch": 1.17, + "grad_norm": 14.278603553771973, + "learning_rate": 1.2410331216749614e-05, + "loss": 0.8424, + "step": 6833 + }, + { + "epoch": 1.17, + "grad_norm": 11.5023832321167, + "learning_rate": 1.2407756993307019e-05, + "loss": 0.5901, + "step": 6834 + }, + { + "epoch": 1.17, + "grad_norm": 12.795166969299316, + "learning_rate": 1.2405182769864424e-05, + "loss": 0.5726, + "step": 6835 + }, + { + "epoch": 1.17, + "grad_norm": 10.320119857788086, + "learning_rate": 1.2402608546421829e-05, + "loss": 0.5802, + "step": 6836 + }, + { + "epoch": 1.17, + "grad_norm": 11.436756134033203, + "learning_rate": 1.2400034322979236e-05, + "loss": 0.5802, + "step": 6837 + }, + { + "epoch": 1.17, + "grad_norm": 12.720342636108398, + "learning_rate": 1.239746009953664e-05, + "loss": 0.6414, + "step": 6838 + }, + { + "epoch": 1.17, + "grad_norm": 7.858325481414795, + "learning_rate": 1.2394885876094046e-05, + "loss": 0.4137, + "step": 6839 + }, + { + "epoch": 1.17, + "grad_norm": 9.454837799072266, + "learning_rate": 1.239231165265145e-05, + "loss": 0.3474, + "step": 6840 + }, + { + "epoch": 1.17, + "grad_norm": 13.06265926361084, + "learning_rate": 1.2389737429208856e-05, + "loss": 0.7606, + "step": 6841 + }, + { + "epoch": 1.17, + "grad_norm": 13.420174598693848, + "learning_rate": 1.238716320576626e-05, + "loss": 0.7873, + "step": 6842 + }, + { + "epoch": 1.17, + "grad_norm": 15.158156394958496, + "learning_rate": 1.2384588982323666e-05, + "loss": 0.643, + "step": 6843 + }, + { + "epoch": 1.17, + "grad_norm": 11.007019996643066, + "learning_rate": 1.238201475888107e-05, + "loss": 0.4884, + "step": 6844 + }, + { + "epoch": 1.17, + "grad_norm": 8.749552726745605, + "learning_rate": 1.2379440535438476e-05, + "loss": 0.3755, + "step": 6845 + }, + { + "epoch": 1.17, + "grad_norm": 11.634915351867676, + "learning_rate": 1.237686631199588e-05, + "loss": 0.6722, + "step": 6846 + }, + { + "epoch": 1.18, + "grad_norm": 10.04141902923584, + "learning_rate": 1.2374292088553287e-05, + "loss": 0.489, + "step": 6847 + }, + { + "epoch": 1.18, + "grad_norm": 11.348457336425781, + "learning_rate": 1.2371717865110692e-05, + "loss": 0.5139, + "step": 6848 + }, + { + "epoch": 1.18, + "grad_norm": 9.05846881866455, + "learning_rate": 1.2369143641668097e-05, + "loss": 0.4805, + "step": 6849 + }, + { + "epoch": 1.18, + "grad_norm": 12.247013092041016, + "learning_rate": 1.2366569418225502e-05, + "loss": 0.8583, + "step": 6850 + }, + { + "epoch": 1.18, + "grad_norm": 9.411354064941406, + "learning_rate": 1.2363995194782909e-05, + "loss": 0.5649, + "step": 6851 + }, + { + "epoch": 1.18, + "grad_norm": 12.20920181274414, + "learning_rate": 1.2361420971340312e-05, + "loss": 0.7022, + "step": 6852 + }, + { + "epoch": 1.18, + "grad_norm": 9.595610618591309, + "learning_rate": 1.2358846747897717e-05, + "loss": 0.4616, + "step": 6853 + }, + { + "epoch": 1.18, + "grad_norm": 10.999131202697754, + "learning_rate": 1.2356272524455122e-05, + "loss": 0.6998, + "step": 6854 + }, + { + "epoch": 1.18, + "grad_norm": 13.027018547058105, + "learning_rate": 1.2353698301012527e-05, + "loss": 0.7248, + "step": 6855 + }, + { + "epoch": 1.18, + "grad_norm": 10.674300193786621, + "learning_rate": 1.2351124077569934e-05, + "loss": 0.5586, + "step": 6856 + }, + { + "epoch": 1.18, + "grad_norm": 9.837050437927246, + "learning_rate": 1.2348549854127339e-05, + "loss": 0.4872, + "step": 6857 + }, + { + "epoch": 1.18, + "grad_norm": 10.066168785095215, + "learning_rate": 1.2345975630684744e-05, + "loss": 0.6862, + "step": 6858 + }, + { + "epoch": 1.18, + "grad_norm": 8.072893142700195, + "learning_rate": 1.2343401407242149e-05, + "loss": 0.4017, + "step": 6859 + }, + { + "epoch": 1.18, + "grad_norm": 9.817351341247559, + "learning_rate": 1.2340827183799554e-05, + "loss": 0.545, + "step": 6860 + }, + { + "epoch": 1.18, + "grad_norm": 9.184842109680176, + "learning_rate": 1.233825296035696e-05, + "loss": 0.3974, + "step": 6861 + }, + { + "epoch": 1.18, + "grad_norm": 7.7413859367370605, + "learning_rate": 1.2335678736914366e-05, + "loss": 0.4536, + "step": 6862 + }, + { + "epoch": 1.18, + "grad_norm": 9.22570514678955, + "learning_rate": 1.2333104513471769e-05, + "loss": 0.5915, + "step": 6863 + }, + { + "epoch": 1.18, + "grad_norm": 9.475497245788574, + "learning_rate": 1.2330530290029174e-05, + "loss": 0.4488, + "step": 6864 + }, + { + "epoch": 1.18, + "grad_norm": 10.001102447509766, + "learning_rate": 1.2327956066586579e-05, + "loss": 0.4683, + "step": 6865 + }, + { + "epoch": 1.18, + "grad_norm": 8.96658706665039, + "learning_rate": 1.2325381843143986e-05, + "loss": 0.494, + "step": 6866 + }, + { + "epoch": 1.18, + "grad_norm": 9.885152816772461, + "learning_rate": 1.232280761970139e-05, + "loss": 0.3838, + "step": 6867 + }, + { + "epoch": 1.18, + "grad_norm": 9.549115180969238, + "learning_rate": 1.2320233396258795e-05, + "loss": 0.4423, + "step": 6868 + }, + { + "epoch": 1.18, + "grad_norm": 10.89554214477539, + "learning_rate": 1.23176591728162e-05, + "loss": 0.5626, + "step": 6869 + }, + { + "epoch": 1.18, + "grad_norm": 7.456453323364258, + "learning_rate": 1.2315084949373607e-05, + "loss": 0.4202, + "step": 6870 + }, + { + "epoch": 1.18, + "grad_norm": 13.577530860900879, + "learning_rate": 1.2312510725931012e-05, + "loss": 0.6112, + "step": 6871 + }, + { + "epoch": 1.18, + "grad_norm": 12.862924575805664, + "learning_rate": 1.2309936502488417e-05, + "loss": 0.478, + "step": 6872 + }, + { + "epoch": 1.18, + "grad_norm": 10.98381519317627, + "learning_rate": 1.230736227904582e-05, + "loss": 0.4468, + "step": 6873 + }, + { + "epoch": 1.18, + "grad_norm": 13.726097106933594, + "learning_rate": 1.2304788055603225e-05, + "loss": 0.608, + "step": 6874 + }, + { + "epoch": 1.18, + "grad_norm": 7.779783248901367, + "learning_rate": 1.2302213832160632e-05, + "loss": 0.3484, + "step": 6875 + }, + { + "epoch": 1.18, + "grad_norm": 10.76326847076416, + "learning_rate": 1.2299639608718037e-05, + "loss": 0.4865, + "step": 6876 + }, + { + "epoch": 1.18, + "grad_norm": 11.6524658203125, + "learning_rate": 1.2297065385275442e-05, + "loss": 0.537, + "step": 6877 + }, + { + "epoch": 1.18, + "grad_norm": 8.634624481201172, + "learning_rate": 1.2294491161832847e-05, + "loss": 0.4657, + "step": 6878 + }, + { + "epoch": 1.18, + "grad_norm": 9.680264472961426, + "learning_rate": 1.2291916938390252e-05, + "loss": 0.4945, + "step": 6879 + }, + { + "epoch": 1.18, + "grad_norm": 10.019296646118164, + "learning_rate": 1.2289342714947659e-05, + "loss": 0.4204, + "step": 6880 + }, + { + "epoch": 1.18, + "grad_norm": 12.379463195800781, + "learning_rate": 1.2286768491505064e-05, + "loss": 0.5197, + "step": 6881 + }, + { + "epoch": 1.18, + "grad_norm": 12.925402641296387, + "learning_rate": 1.2284194268062469e-05, + "loss": 0.6828, + "step": 6882 + }, + { + "epoch": 1.18, + "grad_norm": 8.060747146606445, + "learning_rate": 1.2281620044619874e-05, + "loss": 0.3344, + "step": 6883 + }, + { + "epoch": 1.18, + "grad_norm": 13.3075590133667, + "learning_rate": 1.2279045821177277e-05, + "loss": 0.7703, + "step": 6884 + }, + { + "epoch": 1.18, + "grad_norm": 12.060023307800293, + "learning_rate": 1.2276471597734684e-05, + "loss": 0.65, + "step": 6885 + }, + { + "epoch": 1.18, + "grad_norm": 11.080811500549316, + "learning_rate": 1.2273897374292089e-05, + "loss": 0.5368, + "step": 6886 + }, + { + "epoch": 1.18, + "grad_norm": 8.84549617767334, + "learning_rate": 1.2271323150849494e-05, + "loss": 0.381, + "step": 6887 + }, + { + "epoch": 1.18, + "grad_norm": 10.63058853149414, + "learning_rate": 1.2268748927406899e-05, + "loss": 0.4809, + "step": 6888 + }, + { + "epoch": 1.18, + "grad_norm": 13.03759765625, + "learning_rate": 1.2266174703964305e-05, + "loss": 0.5285, + "step": 6889 + }, + { + "epoch": 1.18, + "grad_norm": 9.594520568847656, + "learning_rate": 1.226360048052171e-05, + "loss": 0.48, + "step": 6890 + }, + { + "epoch": 1.18, + "grad_norm": 8.152198791503906, + "learning_rate": 1.2261026257079115e-05, + "loss": 0.3318, + "step": 6891 + }, + { + "epoch": 1.18, + "grad_norm": 10.063015937805176, + "learning_rate": 1.225845203363652e-05, + "loss": 0.5612, + "step": 6892 + }, + { + "epoch": 1.18, + "grad_norm": 11.247658729553223, + "learning_rate": 1.2255877810193925e-05, + "loss": 0.6018, + "step": 6893 + }, + { + "epoch": 1.18, + "grad_norm": 8.392963409423828, + "learning_rate": 1.225330358675133e-05, + "loss": 0.394, + "step": 6894 + }, + { + "epoch": 1.18, + "grad_norm": 11.330645561218262, + "learning_rate": 1.2250729363308735e-05, + "loss": 0.7683, + "step": 6895 + }, + { + "epoch": 1.18, + "grad_norm": 7.127904415130615, + "learning_rate": 1.224815513986614e-05, + "loss": 0.4794, + "step": 6896 + }, + { + "epoch": 1.18, + "grad_norm": 9.829099655151367, + "learning_rate": 1.2245580916423545e-05, + "loss": 0.4362, + "step": 6897 + }, + { + "epoch": 1.18, + "grad_norm": 7.440721035003662, + "learning_rate": 1.224300669298095e-05, + "loss": 0.4115, + "step": 6898 + }, + { + "epoch": 1.18, + "grad_norm": 10.553647994995117, + "learning_rate": 1.2240432469538357e-05, + "loss": 0.431, + "step": 6899 + }, + { + "epoch": 1.18, + "grad_norm": 10.637636184692383, + "learning_rate": 1.2237858246095762e-05, + "loss": 0.5577, + "step": 6900 + }, + { + "epoch": 1.18, + "grad_norm": 10.499363899230957, + "learning_rate": 1.2235284022653167e-05, + "loss": 0.4076, + "step": 6901 + }, + { + "epoch": 1.18, + "grad_norm": 12.196478843688965, + "learning_rate": 1.2232709799210572e-05, + "loss": 0.677, + "step": 6902 + }, + { + "epoch": 1.18, + "grad_norm": 10.579410552978516, + "learning_rate": 1.2230135575767977e-05, + "loss": 0.6375, + "step": 6903 + }, + { + "epoch": 1.18, + "grad_norm": 11.992631912231445, + "learning_rate": 1.2227561352325382e-05, + "loss": 0.5329, + "step": 6904 + }, + { + "epoch": 1.19, + "grad_norm": 14.567315101623535, + "learning_rate": 1.2224987128882787e-05, + "loss": 0.7124, + "step": 6905 + }, + { + "epoch": 1.19, + "grad_norm": 10.849156379699707, + "learning_rate": 1.2222412905440192e-05, + "loss": 0.439, + "step": 6906 + }, + { + "epoch": 1.19, + "grad_norm": 8.673786163330078, + "learning_rate": 1.2219838681997597e-05, + "loss": 0.3328, + "step": 6907 + }, + { + "epoch": 1.19, + "grad_norm": 8.686080932617188, + "learning_rate": 1.2217264458555003e-05, + "loss": 0.5408, + "step": 6908 + }, + { + "epoch": 1.19, + "grad_norm": 8.886615753173828, + "learning_rate": 1.2214690235112408e-05, + "loss": 0.4232, + "step": 6909 + }, + { + "epoch": 1.19, + "grad_norm": 12.084321022033691, + "learning_rate": 1.2212116011669813e-05, + "loss": 0.5372, + "step": 6910 + }, + { + "epoch": 1.19, + "grad_norm": 8.213665008544922, + "learning_rate": 1.2209541788227218e-05, + "loss": 0.3082, + "step": 6911 + }, + { + "epoch": 1.19, + "grad_norm": 12.59633731842041, + "learning_rate": 1.2206967564784623e-05, + "loss": 0.5706, + "step": 6912 + }, + { + "epoch": 1.19, + "grad_norm": 12.798014640808105, + "learning_rate": 1.220439334134203e-05, + "loss": 0.5154, + "step": 6913 + }, + { + "epoch": 1.19, + "grad_norm": 13.033978462219238, + "learning_rate": 1.2201819117899435e-05, + "loss": 0.6533, + "step": 6914 + }, + { + "epoch": 1.19, + "grad_norm": 14.254484176635742, + "learning_rate": 1.2199244894456838e-05, + "loss": 0.4753, + "step": 6915 + }, + { + "epoch": 1.19, + "grad_norm": 10.755403518676758, + "learning_rate": 1.2196670671014243e-05, + "loss": 0.6712, + "step": 6916 + }, + { + "epoch": 1.19, + "grad_norm": 9.937376976013184, + "learning_rate": 1.2194096447571648e-05, + "loss": 0.5788, + "step": 6917 + }, + { + "epoch": 1.19, + "grad_norm": 8.414369583129883, + "learning_rate": 1.2191522224129055e-05, + "loss": 0.5647, + "step": 6918 + }, + { + "epoch": 1.19, + "grad_norm": 10.309571266174316, + "learning_rate": 1.218894800068646e-05, + "loss": 0.4972, + "step": 6919 + }, + { + "epoch": 1.19, + "grad_norm": 8.03283405303955, + "learning_rate": 1.2186373777243865e-05, + "loss": 0.3565, + "step": 6920 + }, + { + "epoch": 1.19, + "grad_norm": 8.124144554138184, + "learning_rate": 1.218379955380127e-05, + "loss": 0.6071, + "step": 6921 + }, + { + "epoch": 1.19, + "grad_norm": 9.392751693725586, + "learning_rate": 1.2181225330358677e-05, + "loss": 0.5957, + "step": 6922 + }, + { + "epoch": 1.19, + "grad_norm": 9.871675491333008, + "learning_rate": 1.2178651106916082e-05, + "loss": 0.4522, + "step": 6923 + }, + { + "epoch": 1.19, + "grad_norm": 10.38727855682373, + "learning_rate": 1.2176076883473487e-05, + "loss": 0.3741, + "step": 6924 + }, + { + "epoch": 1.19, + "grad_norm": 9.851292610168457, + "learning_rate": 1.217350266003089e-05, + "loss": 0.4121, + "step": 6925 + }, + { + "epoch": 1.19, + "grad_norm": 9.155853271484375, + "learning_rate": 1.2170928436588295e-05, + "loss": 0.5876, + "step": 6926 + }, + { + "epoch": 1.19, + "grad_norm": 8.711142539978027, + "learning_rate": 1.2168354213145702e-05, + "loss": 0.5739, + "step": 6927 + }, + { + "epoch": 1.19, + "grad_norm": 14.95273208618164, + "learning_rate": 1.2165779989703107e-05, + "loss": 0.5317, + "step": 6928 + }, + { + "epoch": 1.19, + "grad_norm": 9.308259010314941, + "learning_rate": 1.2163205766260512e-05, + "loss": 0.422, + "step": 6929 + }, + { + "epoch": 1.19, + "grad_norm": 9.667654037475586, + "learning_rate": 1.2160631542817917e-05, + "loss": 0.4162, + "step": 6930 + }, + { + "epoch": 1.19, + "grad_norm": 8.427236557006836, + "learning_rate": 1.2158057319375322e-05, + "loss": 0.4395, + "step": 6931 + }, + { + "epoch": 1.19, + "grad_norm": 10.244489669799805, + "learning_rate": 1.2155483095932728e-05, + "loss": 0.4093, + "step": 6932 + }, + { + "epoch": 1.19, + "grad_norm": 8.465315818786621, + "learning_rate": 1.2152908872490133e-05, + "loss": 0.4847, + "step": 6933 + }, + { + "epoch": 1.19, + "grad_norm": 11.299603462219238, + "learning_rate": 1.2150334649047538e-05, + "loss": 0.627, + "step": 6934 + }, + { + "epoch": 1.19, + "grad_norm": 10.604196548461914, + "learning_rate": 1.2147760425604943e-05, + "loss": 0.5017, + "step": 6935 + }, + { + "epoch": 1.19, + "grad_norm": 9.398388862609863, + "learning_rate": 1.2145186202162346e-05, + "loss": 0.4372, + "step": 6936 + }, + { + "epoch": 1.19, + "grad_norm": 12.70260238647461, + "learning_rate": 1.2142611978719753e-05, + "loss": 0.671, + "step": 6937 + }, + { + "epoch": 1.19, + "grad_norm": 7.968806266784668, + "learning_rate": 1.2140037755277158e-05, + "loss": 0.3225, + "step": 6938 + }, + { + "epoch": 1.19, + "grad_norm": 9.71995735168457, + "learning_rate": 1.2137463531834563e-05, + "loss": 0.5295, + "step": 6939 + }, + { + "epoch": 1.19, + "grad_norm": 11.185986518859863, + "learning_rate": 1.2134889308391968e-05, + "loss": 0.4386, + "step": 6940 + }, + { + "epoch": 1.19, + "grad_norm": 10.755692481994629, + "learning_rate": 1.2132315084949375e-05, + "loss": 0.8041, + "step": 6941 + }, + { + "epoch": 1.19, + "grad_norm": 10.157242774963379, + "learning_rate": 1.212974086150678e-05, + "loss": 0.4863, + "step": 6942 + }, + { + "epoch": 1.19, + "grad_norm": 8.33084487915039, + "learning_rate": 1.2127166638064185e-05, + "loss": 0.4258, + "step": 6943 + }, + { + "epoch": 1.19, + "grad_norm": 9.067972183227539, + "learning_rate": 1.212459241462159e-05, + "loss": 0.6102, + "step": 6944 + }, + { + "epoch": 1.19, + "grad_norm": 14.8798246383667, + "learning_rate": 1.2122018191178995e-05, + "loss": 0.5402, + "step": 6945 + }, + { + "epoch": 1.19, + "grad_norm": 10.439143180847168, + "learning_rate": 1.21194439677364e-05, + "loss": 0.458, + "step": 6946 + }, + { + "epoch": 1.19, + "grad_norm": 9.634234428405762, + "learning_rate": 1.2116869744293805e-05, + "loss": 0.415, + "step": 6947 + }, + { + "epoch": 1.19, + "grad_norm": 7.790219306945801, + "learning_rate": 1.211429552085121e-05, + "loss": 0.4349, + "step": 6948 + }, + { + "epoch": 1.19, + "grad_norm": 12.400364875793457, + "learning_rate": 1.2111721297408615e-05, + "loss": 0.5006, + "step": 6949 + }, + { + "epoch": 1.19, + "grad_norm": 10.733245849609375, + "learning_rate": 1.210914707396602e-05, + "loss": 0.5136, + "step": 6950 + }, + { + "epoch": 1.19, + "grad_norm": 9.066165924072266, + "learning_rate": 1.2106572850523426e-05, + "loss": 0.4303, + "step": 6951 + }, + { + "epoch": 1.19, + "grad_norm": 10.95359992980957, + "learning_rate": 1.2103998627080831e-05, + "loss": 0.5602, + "step": 6952 + }, + { + "epoch": 1.19, + "grad_norm": 8.627713203430176, + "learning_rate": 1.2101424403638236e-05, + "loss": 0.495, + "step": 6953 + }, + { + "epoch": 1.19, + "grad_norm": 9.105578422546387, + "learning_rate": 1.2098850180195641e-05, + "loss": 0.5282, + "step": 6954 + }, + { + "epoch": 1.19, + "grad_norm": 9.413445472717285, + "learning_rate": 1.2096275956753046e-05, + "loss": 0.3457, + "step": 6955 + }, + { + "epoch": 1.19, + "grad_norm": 9.681183815002441, + "learning_rate": 1.2093701733310451e-05, + "loss": 0.4098, + "step": 6956 + }, + { + "epoch": 1.19, + "grad_norm": 10.14177417755127, + "learning_rate": 1.2091127509867856e-05, + "loss": 0.6411, + "step": 6957 + }, + { + "epoch": 1.19, + "grad_norm": 11.76486587524414, + "learning_rate": 1.2088553286425261e-05, + "loss": 0.6086, + "step": 6958 + }, + { + "epoch": 1.19, + "grad_norm": 10.164863586425781, + "learning_rate": 1.2085979062982666e-05, + "loss": 0.5373, + "step": 6959 + }, + { + "epoch": 1.19, + "grad_norm": 11.067230224609375, + "learning_rate": 1.2083404839540073e-05, + "loss": 0.5701, + "step": 6960 + }, + { + "epoch": 1.19, + "grad_norm": 12.132375717163086, + "learning_rate": 1.2080830616097478e-05, + "loss": 0.5944, + "step": 6961 + }, + { + "epoch": 1.19, + "grad_norm": 7.549464225769043, + "learning_rate": 1.2078256392654883e-05, + "loss": 0.5234, + "step": 6962 + }, + { + "epoch": 1.19, + "grad_norm": 15.204045295715332, + "learning_rate": 1.2075682169212288e-05, + "loss": 0.673, + "step": 6963 + }, + { + "epoch": 1.2, + "grad_norm": 9.438495635986328, + "learning_rate": 1.2073107945769693e-05, + "loss": 0.5943, + "step": 6964 + }, + { + "epoch": 1.2, + "grad_norm": 8.335518836975098, + "learning_rate": 1.20705337223271e-05, + "loss": 0.4016, + "step": 6965 + }, + { + "epoch": 1.2, + "grad_norm": 9.443399429321289, + "learning_rate": 1.2067959498884505e-05, + "loss": 0.4555, + "step": 6966 + }, + { + "epoch": 1.2, + "grad_norm": 13.275962829589844, + "learning_rate": 1.2065385275441908e-05, + "loss": 0.6056, + "step": 6967 + }, + { + "epoch": 1.2, + "grad_norm": 9.003582954406738, + "learning_rate": 1.2062811051999313e-05, + "loss": 0.4402, + "step": 6968 + }, + { + "epoch": 1.2, + "grad_norm": 11.950246810913086, + "learning_rate": 1.2060236828556718e-05, + "loss": 0.5363, + "step": 6969 + }, + { + "epoch": 1.2, + "grad_norm": 11.460467338562012, + "learning_rate": 1.2057662605114125e-05, + "loss": 0.5114, + "step": 6970 + }, + { + "epoch": 1.2, + "grad_norm": 10.345368385314941, + "learning_rate": 1.205508838167153e-05, + "loss": 0.5689, + "step": 6971 + }, + { + "epoch": 1.2, + "grad_norm": 11.838040351867676, + "learning_rate": 1.2052514158228934e-05, + "loss": 0.4341, + "step": 6972 + }, + { + "epoch": 1.2, + "grad_norm": 12.360549926757812, + "learning_rate": 1.204993993478634e-05, + "loss": 0.4572, + "step": 6973 + }, + { + "epoch": 1.2, + "grad_norm": 14.673266410827637, + "learning_rate": 1.2047365711343746e-05, + "loss": 0.8732, + "step": 6974 + }, + { + "epoch": 1.2, + "grad_norm": 12.888762474060059, + "learning_rate": 1.2044791487901151e-05, + "loss": 0.6281, + "step": 6975 + }, + { + "epoch": 1.2, + "grad_norm": 11.244769096374512, + "learning_rate": 1.2042217264458556e-05, + "loss": 0.7336, + "step": 6976 + }, + { + "epoch": 1.2, + "grad_norm": 11.339118957519531, + "learning_rate": 1.203964304101596e-05, + "loss": 0.5565, + "step": 6977 + }, + { + "epoch": 1.2, + "grad_norm": 9.494194030761719, + "learning_rate": 1.2037068817573364e-05, + "loss": 0.5927, + "step": 6978 + }, + { + "epoch": 1.2, + "grad_norm": 8.338953018188477, + "learning_rate": 1.2034494594130771e-05, + "loss": 0.3203, + "step": 6979 + }, + { + "epoch": 1.2, + "grad_norm": 9.837724685668945, + "learning_rate": 1.2031920370688176e-05, + "loss": 0.5107, + "step": 6980 + }, + { + "epoch": 1.2, + "grad_norm": 10.4344482421875, + "learning_rate": 1.2029346147245581e-05, + "loss": 0.4333, + "step": 6981 + }, + { + "epoch": 1.2, + "grad_norm": 10.859993934631348, + "learning_rate": 1.2026771923802986e-05, + "loss": 0.6073, + "step": 6982 + }, + { + "epoch": 1.2, + "grad_norm": 15.880913734436035, + "learning_rate": 1.2024197700360391e-05, + "loss": 0.7665, + "step": 6983 + }, + { + "epoch": 1.2, + "grad_norm": 7.9858880043029785, + "learning_rate": 1.2021623476917798e-05, + "loss": 0.3886, + "step": 6984 + }, + { + "epoch": 1.2, + "grad_norm": 8.160512924194336, + "learning_rate": 1.2019049253475203e-05, + "loss": 0.3861, + "step": 6985 + }, + { + "epoch": 1.2, + "grad_norm": 9.845983505249023, + "learning_rate": 1.2016475030032608e-05, + "loss": 0.712, + "step": 6986 + }, + { + "epoch": 1.2, + "grad_norm": 9.091763496398926, + "learning_rate": 1.2013900806590013e-05, + "loss": 0.4011, + "step": 6987 + }, + { + "epoch": 1.2, + "grad_norm": 12.557726860046387, + "learning_rate": 1.2011326583147416e-05, + "loss": 0.471, + "step": 6988 + }, + { + "epoch": 1.2, + "grad_norm": 9.745680809020996, + "learning_rate": 1.2008752359704823e-05, + "loss": 0.4615, + "step": 6989 + }, + { + "epoch": 1.2, + "grad_norm": 15.776237487792969, + "learning_rate": 1.2006178136262228e-05, + "loss": 0.8105, + "step": 6990 + }, + { + "epoch": 1.2, + "grad_norm": 9.352907180786133, + "learning_rate": 1.2003603912819633e-05, + "loss": 0.5183, + "step": 6991 + }, + { + "epoch": 1.2, + "grad_norm": 13.638228416442871, + "learning_rate": 1.2001029689377038e-05, + "loss": 0.5895, + "step": 6992 + }, + { + "epoch": 1.2, + "grad_norm": 14.045228004455566, + "learning_rate": 1.1998455465934444e-05, + "loss": 0.6126, + "step": 6993 + }, + { + "epoch": 1.2, + "grad_norm": 12.426182746887207, + "learning_rate": 1.199588124249185e-05, + "loss": 0.574, + "step": 6994 + }, + { + "epoch": 1.2, + "grad_norm": 7.595043659210205, + "learning_rate": 1.1993307019049254e-05, + "loss": 0.4957, + "step": 6995 + }, + { + "epoch": 1.2, + "grad_norm": 8.870427131652832, + "learning_rate": 1.199073279560666e-05, + "loss": 0.4357, + "step": 6996 + }, + { + "epoch": 1.2, + "grad_norm": 7.806071758270264, + "learning_rate": 1.1988158572164064e-05, + "loss": 0.3712, + "step": 6997 + }, + { + "epoch": 1.2, + "grad_norm": 10.191230773925781, + "learning_rate": 1.198558434872147e-05, + "loss": 0.5265, + "step": 6998 + }, + { + "epoch": 1.2, + "grad_norm": 12.952571868896484, + "learning_rate": 1.1983010125278874e-05, + "loss": 0.6739, + "step": 6999 + }, + { + "epoch": 1.2, + "grad_norm": 14.765110969543457, + "learning_rate": 1.198043590183628e-05, + "loss": 0.5615, + "step": 7000 + }, + { + "epoch": 1.2, + "grad_norm": 9.294563293457031, + "learning_rate": 1.1977861678393684e-05, + "loss": 0.2494, + "step": 7001 + }, + { + "epoch": 1.2, + "grad_norm": 13.038796424865723, + "learning_rate": 1.197528745495109e-05, + "loss": 0.7038, + "step": 7002 + }, + { + "epoch": 1.2, + "grad_norm": 10.618143081665039, + "learning_rate": 1.1972713231508496e-05, + "loss": 0.4741, + "step": 7003 + }, + { + "epoch": 1.2, + "grad_norm": 11.299175262451172, + "learning_rate": 1.1970139008065901e-05, + "loss": 0.5161, + "step": 7004 + }, + { + "epoch": 1.2, + "grad_norm": 11.32515811920166, + "learning_rate": 1.1967564784623306e-05, + "loss": 0.6179, + "step": 7005 + }, + { + "epoch": 1.2, + "grad_norm": 10.920764923095703, + "learning_rate": 1.196499056118071e-05, + "loss": 0.542, + "step": 7006 + }, + { + "epoch": 1.2, + "grad_norm": 10.704802513122559, + "learning_rate": 1.1962416337738116e-05, + "loss": 0.5491, + "step": 7007 + }, + { + "epoch": 1.2, + "grad_norm": 10.108997344970703, + "learning_rate": 1.195984211429552e-05, + "loss": 0.4458, + "step": 7008 + }, + { + "epoch": 1.2, + "grad_norm": 9.183677673339844, + "learning_rate": 1.1957267890852926e-05, + "loss": 0.3906, + "step": 7009 + }, + { + "epoch": 1.2, + "grad_norm": 8.915328979492188, + "learning_rate": 1.195469366741033e-05, + "loss": 0.4644, + "step": 7010 + }, + { + "epoch": 1.2, + "grad_norm": 9.178936958312988, + "learning_rate": 1.1952119443967736e-05, + "loss": 0.3103, + "step": 7011 + }, + { + "epoch": 1.2, + "grad_norm": 9.293974876403809, + "learning_rate": 1.1949545220525142e-05, + "loss": 0.41, + "step": 7012 + }, + { + "epoch": 1.2, + "grad_norm": 13.273841857910156, + "learning_rate": 1.1946970997082547e-05, + "loss": 0.8585, + "step": 7013 + }, + { + "epoch": 1.2, + "grad_norm": 16.48833465576172, + "learning_rate": 1.1944396773639952e-05, + "loss": 0.3737, + "step": 7014 + }, + { + "epoch": 1.2, + "grad_norm": 9.675256729125977, + "learning_rate": 1.1941822550197357e-05, + "loss": 0.4264, + "step": 7015 + }, + { + "epoch": 1.2, + "grad_norm": 14.285032272338867, + "learning_rate": 1.1939248326754762e-05, + "loss": 0.6321, + "step": 7016 + }, + { + "epoch": 1.2, + "grad_norm": 11.52556037902832, + "learning_rate": 1.1936674103312169e-05, + "loss": 0.6268, + "step": 7017 + }, + { + "epoch": 1.2, + "grad_norm": 11.654465675354004, + "learning_rate": 1.1934099879869574e-05, + "loss": 0.5037, + "step": 7018 + }, + { + "epoch": 1.2, + "grad_norm": 9.316587448120117, + "learning_rate": 1.1931525656426977e-05, + "loss": 0.4142, + "step": 7019 + }, + { + "epoch": 1.2, + "grad_norm": 11.695708274841309, + "learning_rate": 1.1928951432984382e-05, + "loss": 0.7036, + "step": 7020 + }, + { + "epoch": 1.2, + "grad_norm": 19.179616928100586, + "learning_rate": 1.1926377209541787e-05, + "loss": 0.3866, + "step": 7021 + }, + { + "epoch": 1.21, + "grad_norm": 10.021726608276367, + "learning_rate": 1.1923802986099194e-05, + "loss": 0.4903, + "step": 7022 + }, + { + "epoch": 1.21, + "grad_norm": 8.754122734069824, + "learning_rate": 1.1921228762656599e-05, + "loss": 0.4308, + "step": 7023 + }, + { + "epoch": 1.21, + "grad_norm": 12.344573020935059, + "learning_rate": 1.1918654539214004e-05, + "loss": 0.5596, + "step": 7024 + }, + { + "epoch": 1.21, + "grad_norm": 7.760412693023682, + "learning_rate": 1.1916080315771409e-05, + "loss": 0.4381, + "step": 7025 + }, + { + "epoch": 1.21, + "grad_norm": 8.74301815032959, + "learning_rate": 1.1913506092328816e-05, + "loss": 0.3956, + "step": 7026 + }, + { + "epoch": 1.21, + "grad_norm": 15.595786094665527, + "learning_rate": 1.191093186888622e-05, + "loss": 0.8368, + "step": 7027 + }, + { + "epoch": 1.21, + "grad_norm": 11.126429557800293, + "learning_rate": 1.1908357645443626e-05, + "loss": 0.5201, + "step": 7028 + }, + { + "epoch": 1.21, + "grad_norm": 12.102508544921875, + "learning_rate": 1.1905783422001029e-05, + "loss": 0.5917, + "step": 7029 + }, + { + "epoch": 1.21, + "grad_norm": 10.898194313049316, + "learning_rate": 1.1903209198558434e-05, + "loss": 0.5386, + "step": 7030 + }, + { + "epoch": 1.21, + "grad_norm": 9.192670822143555, + "learning_rate": 1.190063497511584e-05, + "loss": 0.4813, + "step": 7031 + }, + { + "epoch": 1.21, + "grad_norm": 13.713973999023438, + "learning_rate": 1.1898060751673246e-05, + "loss": 0.5942, + "step": 7032 + }, + { + "epoch": 1.21, + "grad_norm": 9.153640747070312, + "learning_rate": 1.189548652823065e-05, + "loss": 0.5826, + "step": 7033 + }, + { + "epoch": 1.21, + "grad_norm": 13.870328903198242, + "learning_rate": 1.1892912304788056e-05, + "loss": 0.4872, + "step": 7034 + }, + { + "epoch": 1.21, + "grad_norm": 10.00422191619873, + "learning_rate": 1.189033808134546e-05, + "loss": 0.4963, + "step": 7035 + }, + { + "epoch": 1.21, + "grad_norm": 8.160073280334473, + "learning_rate": 1.1887763857902867e-05, + "loss": 0.3616, + "step": 7036 + }, + { + "epoch": 1.21, + "grad_norm": 14.520591735839844, + "learning_rate": 1.1885189634460272e-05, + "loss": 0.6143, + "step": 7037 + }, + { + "epoch": 1.21, + "grad_norm": 11.063573837280273, + "learning_rate": 1.1882615411017677e-05, + "loss": 0.4386, + "step": 7038 + }, + { + "epoch": 1.21, + "grad_norm": 8.25976848602295, + "learning_rate": 1.1880041187575082e-05, + "loss": 0.4612, + "step": 7039 + }, + { + "epoch": 1.21, + "grad_norm": 9.411417961120605, + "learning_rate": 1.1877466964132485e-05, + "loss": 0.5498, + "step": 7040 + }, + { + "epoch": 1.21, + "grad_norm": 9.323909759521484, + "learning_rate": 1.1874892740689892e-05, + "loss": 0.7059, + "step": 7041 + }, + { + "epoch": 1.21, + "grad_norm": 9.794978141784668, + "learning_rate": 1.1872318517247297e-05, + "loss": 0.5785, + "step": 7042 + }, + { + "epoch": 1.21, + "grad_norm": 9.585016250610352, + "learning_rate": 1.1869744293804702e-05, + "loss": 0.5028, + "step": 7043 + }, + { + "epoch": 1.21, + "grad_norm": 9.982545852661133, + "learning_rate": 1.1867170070362107e-05, + "loss": 0.6212, + "step": 7044 + }, + { + "epoch": 1.21, + "grad_norm": 14.801260948181152, + "learning_rate": 1.1864595846919514e-05, + "loss": 0.371, + "step": 7045 + }, + { + "epoch": 1.21, + "grad_norm": 10.951095581054688, + "learning_rate": 1.1862021623476919e-05, + "loss": 0.6263, + "step": 7046 + }, + { + "epoch": 1.21, + "grad_norm": 10.839865684509277, + "learning_rate": 1.1859447400034324e-05, + "loss": 0.5179, + "step": 7047 + }, + { + "epoch": 1.21, + "grad_norm": 7.950497627258301, + "learning_rate": 1.1856873176591729e-05, + "loss": 0.5286, + "step": 7048 + }, + { + "epoch": 1.21, + "grad_norm": 9.106574058532715, + "learning_rate": 1.1854298953149134e-05, + "loss": 0.4785, + "step": 7049 + }, + { + "epoch": 1.21, + "grad_norm": 13.89876937866211, + "learning_rate": 1.1851724729706539e-05, + "loss": 0.8721, + "step": 7050 + }, + { + "epoch": 1.21, + "grad_norm": 10.699302673339844, + "learning_rate": 1.1849150506263944e-05, + "loss": 0.578, + "step": 7051 + }, + { + "epoch": 1.21, + "grad_norm": 9.155644416809082, + "learning_rate": 1.1846576282821349e-05, + "loss": 0.5321, + "step": 7052 + }, + { + "epoch": 1.21, + "grad_norm": 13.794529914855957, + "learning_rate": 1.1844002059378754e-05, + "loss": 0.549, + "step": 7053 + }, + { + "epoch": 1.21, + "grad_norm": 9.91904354095459, + "learning_rate": 1.1841427835936159e-05, + "loss": 0.5655, + "step": 7054 + }, + { + "epoch": 1.21, + "grad_norm": 9.819161415100098, + "learning_rate": 1.1838853612493565e-05, + "loss": 0.4241, + "step": 7055 + }, + { + "epoch": 1.21, + "grad_norm": 9.905237197875977, + "learning_rate": 1.183627938905097e-05, + "loss": 0.4594, + "step": 7056 + }, + { + "epoch": 1.21, + "grad_norm": 13.802270889282227, + "learning_rate": 1.1833705165608375e-05, + "loss": 0.5523, + "step": 7057 + }, + { + "epoch": 1.21, + "grad_norm": 12.213642120361328, + "learning_rate": 1.183113094216578e-05, + "loss": 0.7524, + "step": 7058 + }, + { + "epoch": 1.21, + "grad_norm": 7.1716227531433105, + "learning_rate": 1.1828556718723185e-05, + "loss": 0.3411, + "step": 7059 + }, + { + "epoch": 1.21, + "grad_norm": 10.224350929260254, + "learning_rate": 1.182598249528059e-05, + "loss": 0.3936, + "step": 7060 + }, + { + "epoch": 1.21, + "grad_norm": 10.481537818908691, + "learning_rate": 1.1823408271837995e-05, + "loss": 0.5573, + "step": 7061 + }, + { + "epoch": 1.21, + "grad_norm": 9.607392311096191, + "learning_rate": 1.18208340483954e-05, + "loss": 0.4878, + "step": 7062 + }, + { + "epoch": 1.21, + "grad_norm": 9.242002487182617, + "learning_rate": 1.1818259824952805e-05, + "loss": 0.565, + "step": 7063 + }, + { + "epoch": 1.21, + "grad_norm": 12.567395210266113, + "learning_rate": 1.1815685601510212e-05, + "loss": 0.5141, + "step": 7064 + }, + { + "epoch": 1.21, + "grad_norm": 9.485620498657227, + "learning_rate": 1.1813111378067617e-05, + "loss": 0.4656, + "step": 7065 + }, + { + "epoch": 1.21, + "grad_norm": 11.69052505493164, + "learning_rate": 1.1810537154625022e-05, + "loss": 0.483, + "step": 7066 + }, + { + "epoch": 1.21, + "grad_norm": 7.210465431213379, + "learning_rate": 1.1807962931182427e-05, + "loss": 0.3763, + "step": 7067 + }, + { + "epoch": 1.21, + "grad_norm": 8.8494234085083, + "learning_rate": 1.1805388707739832e-05, + "loss": 0.5078, + "step": 7068 + }, + { + "epoch": 1.21, + "grad_norm": 12.431105613708496, + "learning_rate": 1.1802814484297239e-05, + "loss": 0.627, + "step": 7069 + }, + { + "epoch": 1.21, + "grad_norm": 12.741544723510742, + "learning_rate": 1.1800240260854644e-05, + "loss": 0.7926, + "step": 7070 + }, + { + "epoch": 1.21, + "grad_norm": 8.597745895385742, + "learning_rate": 1.1797666037412047e-05, + "loss": 0.4042, + "step": 7071 + }, + { + "epoch": 1.21, + "grad_norm": 12.433899879455566, + "learning_rate": 1.1795091813969452e-05, + "loss": 0.5409, + "step": 7072 + }, + { + "epoch": 1.21, + "grad_norm": 11.573753356933594, + "learning_rate": 1.1792517590526857e-05, + "loss": 0.5791, + "step": 7073 + }, + { + "epoch": 1.21, + "grad_norm": 14.764334678649902, + "learning_rate": 1.1789943367084264e-05, + "loss": 0.7008, + "step": 7074 + }, + { + "epoch": 1.21, + "grad_norm": 13.19047737121582, + "learning_rate": 1.1787369143641669e-05, + "loss": 0.7471, + "step": 7075 + }, + { + "epoch": 1.21, + "grad_norm": 10.98177719116211, + "learning_rate": 1.1784794920199074e-05, + "loss": 0.4211, + "step": 7076 + }, + { + "epoch": 1.21, + "grad_norm": 7.5682053565979, + "learning_rate": 1.1782220696756478e-05, + "loss": 0.3395, + "step": 7077 + }, + { + "epoch": 1.21, + "grad_norm": 9.34294319152832, + "learning_rate": 1.1779646473313883e-05, + "loss": 0.4304, + "step": 7078 + }, + { + "epoch": 1.21, + "grad_norm": 8.188375473022461, + "learning_rate": 1.177707224987129e-05, + "loss": 0.3686, + "step": 7079 + }, + { + "epoch": 1.22, + "grad_norm": 8.635316848754883, + "learning_rate": 1.1774498026428695e-05, + "loss": 0.4264, + "step": 7080 + }, + { + "epoch": 1.22, + "grad_norm": 12.043983459472656, + "learning_rate": 1.1771923802986098e-05, + "loss": 0.5094, + "step": 7081 + }, + { + "epoch": 1.22, + "grad_norm": 9.92259693145752, + "learning_rate": 1.1769349579543503e-05, + "loss": 0.4642, + "step": 7082 + }, + { + "epoch": 1.22, + "grad_norm": 9.077810287475586, + "learning_rate": 1.176677535610091e-05, + "loss": 0.6337, + "step": 7083 + }, + { + "epoch": 1.22, + "grad_norm": 14.101829528808594, + "learning_rate": 1.1764201132658315e-05, + "loss": 0.6638, + "step": 7084 + }, + { + "epoch": 1.22, + "grad_norm": 14.365436553955078, + "learning_rate": 1.176162690921572e-05, + "loss": 0.5677, + "step": 7085 + }, + { + "epoch": 1.22, + "grad_norm": 10.324374198913574, + "learning_rate": 1.1759052685773125e-05, + "loss": 0.5725, + "step": 7086 + }, + { + "epoch": 1.22, + "grad_norm": 10.412105560302734, + "learning_rate": 1.175647846233053e-05, + "loss": 0.5346, + "step": 7087 + }, + { + "epoch": 1.22, + "grad_norm": 11.5609769821167, + "learning_rate": 1.1753904238887937e-05, + "loss": 0.7969, + "step": 7088 + }, + { + "epoch": 1.22, + "grad_norm": 13.079243659973145, + "learning_rate": 1.1751330015445342e-05, + "loss": 0.524, + "step": 7089 + }, + { + "epoch": 1.22, + "grad_norm": 15.235267639160156, + "learning_rate": 1.1748755792002747e-05, + "loss": 0.6835, + "step": 7090 + }, + { + "epoch": 1.22, + "grad_norm": 8.892448425292969, + "learning_rate": 1.1746181568560152e-05, + "loss": 0.5749, + "step": 7091 + }, + { + "epoch": 1.22, + "grad_norm": 16.017757415771484, + "learning_rate": 1.1743607345117555e-05, + "loss": 0.5958, + "step": 7092 + }, + { + "epoch": 1.22, + "grad_norm": 13.532471656799316, + "learning_rate": 1.1741033121674962e-05, + "loss": 0.5163, + "step": 7093 + }, + { + "epoch": 1.22, + "grad_norm": 12.933429718017578, + "learning_rate": 1.1738458898232367e-05, + "loss": 0.4198, + "step": 7094 + }, + { + "epoch": 1.22, + "grad_norm": 10.935221672058105, + "learning_rate": 1.1735884674789772e-05, + "loss": 0.4781, + "step": 7095 + }, + { + "epoch": 1.22, + "grad_norm": 8.14262580871582, + "learning_rate": 1.1733310451347177e-05, + "loss": 0.4835, + "step": 7096 + }, + { + "epoch": 1.22, + "grad_norm": 11.401575088500977, + "learning_rate": 1.1730736227904583e-05, + "loss": 0.6201, + "step": 7097 + }, + { + "epoch": 1.22, + "grad_norm": 10.443571090698242, + "learning_rate": 1.1728162004461988e-05, + "loss": 0.4969, + "step": 7098 + }, + { + "epoch": 1.22, + "grad_norm": 10.579925537109375, + "learning_rate": 1.1725587781019393e-05, + "loss": 0.4178, + "step": 7099 + }, + { + "epoch": 1.22, + "grad_norm": 15.502985954284668, + "learning_rate": 1.1723013557576798e-05, + "loss": 0.5585, + "step": 7100 + }, + { + "epoch": 1.22, + "grad_norm": 7.74077844619751, + "learning_rate": 1.1720439334134203e-05, + "loss": 0.5048, + "step": 7101 + }, + { + "epoch": 1.22, + "grad_norm": 7.656698703765869, + "learning_rate": 1.1717865110691608e-05, + "loss": 0.3651, + "step": 7102 + }, + { + "epoch": 1.22, + "grad_norm": 9.099870681762695, + "learning_rate": 1.1715290887249013e-05, + "loss": 0.4139, + "step": 7103 + }, + { + "epoch": 1.22, + "grad_norm": 9.223795890808105, + "learning_rate": 1.1712716663806418e-05, + "loss": 0.4273, + "step": 7104 + }, + { + "epoch": 1.22, + "grad_norm": 13.487719535827637, + "learning_rate": 1.1710142440363823e-05, + "loss": 0.8251, + "step": 7105 + }, + { + "epoch": 1.22, + "grad_norm": 7.5261359214782715, + "learning_rate": 1.1707568216921228e-05, + "loss": 0.3658, + "step": 7106 + }, + { + "epoch": 1.22, + "grad_norm": 7.580143451690674, + "learning_rate": 1.1704993993478635e-05, + "loss": 0.3119, + "step": 7107 + }, + { + "epoch": 1.22, + "grad_norm": 11.47298526763916, + "learning_rate": 1.170241977003604e-05, + "loss": 0.6249, + "step": 7108 + }, + { + "epoch": 1.22, + "grad_norm": 8.980270385742188, + "learning_rate": 1.1699845546593445e-05, + "loss": 0.5136, + "step": 7109 + }, + { + "epoch": 1.22, + "grad_norm": 10.487540245056152, + "learning_rate": 1.169727132315085e-05, + "loss": 0.4227, + "step": 7110 + }, + { + "epoch": 1.22, + "grad_norm": 12.91495418548584, + "learning_rate": 1.1694697099708255e-05, + "loss": 0.528, + "step": 7111 + }, + { + "epoch": 1.22, + "grad_norm": 12.30009651184082, + "learning_rate": 1.1692122876265662e-05, + "loss": 0.5401, + "step": 7112 + }, + { + "epoch": 1.22, + "grad_norm": 7.5504231452941895, + "learning_rate": 1.1689548652823065e-05, + "loss": 0.5919, + "step": 7113 + }, + { + "epoch": 1.22, + "grad_norm": 8.687311172485352, + "learning_rate": 1.168697442938047e-05, + "loss": 0.4475, + "step": 7114 + }, + { + "epoch": 1.22, + "grad_norm": 8.62994384765625, + "learning_rate": 1.1684400205937875e-05, + "loss": 0.3877, + "step": 7115 + }, + { + "epoch": 1.22, + "grad_norm": 10.701693534851074, + "learning_rate": 1.1681825982495281e-05, + "loss": 0.4388, + "step": 7116 + }, + { + "epoch": 1.22, + "grad_norm": 12.731943130493164, + "learning_rate": 1.1679251759052686e-05, + "loss": 0.5003, + "step": 7117 + }, + { + "epoch": 1.22, + "grad_norm": 11.448636054992676, + "learning_rate": 1.1676677535610091e-05, + "loss": 0.6091, + "step": 7118 + }, + { + "epoch": 1.22, + "grad_norm": 13.859493255615234, + "learning_rate": 1.1674103312167496e-05, + "loss": 0.4222, + "step": 7119 + }, + { + "epoch": 1.22, + "grad_norm": 9.540654182434082, + "learning_rate": 1.1671529088724901e-05, + "loss": 0.4202, + "step": 7120 + }, + { + "epoch": 1.22, + "grad_norm": 13.63926887512207, + "learning_rate": 1.1668954865282308e-05, + "loss": 0.4924, + "step": 7121 + }, + { + "epoch": 1.22, + "grad_norm": 12.28764533996582, + "learning_rate": 1.1666380641839713e-05, + "loss": 0.3627, + "step": 7122 + }, + { + "epoch": 1.22, + "grad_norm": 11.108075141906738, + "learning_rate": 1.1663806418397116e-05, + "loss": 0.4352, + "step": 7123 + }, + { + "epoch": 1.22, + "grad_norm": 11.017898559570312, + "learning_rate": 1.1661232194954521e-05, + "loss": 0.404, + "step": 7124 + }, + { + "epoch": 1.22, + "grad_norm": 6.9288740158081055, + "learning_rate": 1.1658657971511926e-05, + "loss": 0.3302, + "step": 7125 + }, + { + "epoch": 1.22, + "grad_norm": 10.391546249389648, + "learning_rate": 1.1656083748069333e-05, + "loss": 0.6349, + "step": 7126 + }, + { + "epoch": 1.22, + "grad_norm": 7.188262462615967, + "learning_rate": 1.1653509524626738e-05, + "loss": 0.2733, + "step": 7127 + }, + { + "epoch": 1.22, + "grad_norm": 17.3939266204834, + "learning_rate": 1.1650935301184143e-05, + "loss": 0.6341, + "step": 7128 + }, + { + "epoch": 1.22, + "grad_norm": 11.814071655273438, + "learning_rate": 1.1648361077741548e-05, + "loss": 0.6084, + "step": 7129 + }, + { + "epoch": 1.22, + "grad_norm": 8.20941162109375, + "learning_rate": 1.1645786854298953e-05, + "loss": 0.3172, + "step": 7130 + }, + { + "epoch": 1.22, + "grad_norm": 12.053966522216797, + "learning_rate": 1.164321263085636e-05, + "loss": 0.562, + "step": 7131 + }, + { + "epoch": 1.22, + "grad_norm": 13.195056915283203, + "learning_rate": 1.1640638407413765e-05, + "loss": 0.4252, + "step": 7132 + }, + { + "epoch": 1.22, + "grad_norm": 12.162524223327637, + "learning_rate": 1.1638064183971168e-05, + "loss": 0.4871, + "step": 7133 + }, + { + "epoch": 1.22, + "grad_norm": 9.25905704498291, + "learning_rate": 1.1635489960528573e-05, + "loss": 0.5897, + "step": 7134 + }, + { + "epoch": 1.22, + "grad_norm": 8.6315279006958, + "learning_rate": 1.163291573708598e-05, + "loss": 0.3175, + "step": 7135 + }, + { + "epoch": 1.22, + "grad_norm": 9.307784080505371, + "learning_rate": 1.1630341513643385e-05, + "loss": 0.53, + "step": 7136 + }, + { + "epoch": 1.22, + "grad_norm": 15.675056457519531, + "learning_rate": 1.162776729020079e-05, + "loss": 0.7125, + "step": 7137 + }, + { + "epoch": 1.22, + "grad_norm": 13.297932624816895, + "learning_rate": 1.1625193066758195e-05, + "loss": 0.6564, + "step": 7138 + }, + { + "epoch": 1.23, + "grad_norm": 7.99775505065918, + "learning_rate": 1.16226188433156e-05, + "loss": 0.5808, + "step": 7139 + }, + { + "epoch": 1.23, + "grad_norm": 6.949821949005127, + "learning_rate": 1.1620044619873006e-05, + "loss": 0.3286, + "step": 7140 + }, + { + "epoch": 1.23, + "grad_norm": 10.827123641967773, + "learning_rate": 1.1617470396430411e-05, + "loss": 0.4466, + "step": 7141 + }, + { + "epoch": 1.23, + "grad_norm": 10.465693473815918, + "learning_rate": 1.1614896172987816e-05, + "loss": 0.4905, + "step": 7142 + }, + { + "epoch": 1.23, + "grad_norm": 10.595473289489746, + "learning_rate": 1.1612321949545221e-05, + "loss": 0.3843, + "step": 7143 + }, + { + "epoch": 1.23, + "grad_norm": 10.307246208190918, + "learning_rate": 1.1609747726102625e-05, + "loss": 0.6517, + "step": 7144 + }, + { + "epoch": 1.23, + "grad_norm": 10.332794189453125, + "learning_rate": 1.1607173502660031e-05, + "loss": 0.3841, + "step": 7145 + }, + { + "epoch": 1.23, + "grad_norm": 12.55783748626709, + "learning_rate": 1.1604599279217436e-05, + "loss": 0.6285, + "step": 7146 + }, + { + "epoch": 1.23, + "grad_norm": 11.59277057647705, + "learning_rate": 1.1602025055774841e-05, + "loss": 0.4413, + "step": 7147 + }, + { + "epoch": 1.23, + "grad_norm": 12.730266571044922, + "learning_rate": 1.1599450832332246e-05, + "loss": 0.5179, + "step": 7148 + }, + { + "epoch": 1.23, + "grad_norm": 12.36196231842041, + "learning_rate": 1.1596876608889653e-05, + "loss": 0.5347, + "step": 7149 + }, + { + "epoch": 1.23, + "grad_norm": 9.165251731872559, + "learning_rate": 1.1594302385447058e-05, + "loss": 0.4246, + "step": 7150 + }, + { + "epoch": 1.23, + "grad_norm": 9.816166877746582, + "learning_rate": 1.1591728162004463e-05, + "loss": 0.7186, + "step": 7151 + }, + { + "epoch": 1.23, + "grad_norm": 10.423781394958496, + "learning_rate": 1.1589153938561868e-05, + "loss": 0.4496, + "step": 7152 + }, + { + "epoch": 1.23, + "grad_norm": 14.16259479522705, + "learning_rate": 1.1586579715119273e-05, + "loss": 0.4833, + "step": 7153 + }, + { + "epoch": 1.23, + "grad_norm": 7.660272598266602, + "learning_rate": 1.1584005491676678e-05, + "loss": 0.422, + "step": 7154 + }, + { + "epoch": 1.23, + "grad_norm": 7.854595184326172, + "learning_rate": 1.1581431268234083e-05, + "loss": 0.4919, + "step": 7155 + }, + { + "epoch": 1.23, + "grad_norm": 9.222434997558594, + "learning_rate": 1.1578857044791488e-05, + "loss": 0.5104, + "step": 7156 + }, + { + "epoch": 1.23, + "grad_norm": 10.050089836120605, + "learning_rate": 1.1576282821348893e-05, + "loss": 0.4817, + "step": 7157 + }, + { + "epoch": 1.23, + "grad_norm": 11.66711711883545, + "learning_rate": 1.1573708597906298e-05, + "loss": 0.4955, + "step": 7158 + }, + { + "epoch": 1.23, + "grad_norm": 12.91462230682373, + "learning_rate": 1.1571134374463704e-05, + "loss": 0.481, + "step": 7159 + }, + { + "epoch": 1.23, + "grad_norm": 10.339996337890625, + "learning_rate": 1.156856015102111e-05, + "loss": 0.4842, + "step": 7160 + }, + { + "epoch": 1.23, + "grad_norm": 16.144424438476562, + "learning_rate": 1.1565985927578514e-05, + "loss": 0.5748, + "step": 7161 + }, + { + "epoch": 1.23, + "grad_norm": 9.7717866897583, + "learning_rate": 1.156341170413592e-05, + "loss": 0.4264, + "step": 7162 + }, + { + "epoch": 1.23, + "grad_norm": 11.315986633300781, + "learning_rate": 1.1560837480693324e-05, + "loss": 0.5858, + "step": 7163 + }, + { + "epoch": 1.23, + "grad_norm": 9.716745376586914, + "learning_rate": 1.1558263257250731e-05, + "loss": 0.3456, + "step": 7164 + }, + { + "epoch": 1.23, + "grad_norm": 11.27481746673584, + "learning_rate": 1.1555689033808134e-05, + "loss": 0.616, + "step": 7165 + }, + { + "epoch": 1.23, + "grad_norm": 8.431517601013184, + "learning_rate": 1.155311481036554e-05, + "loss": 0.5691, + "step": 7166 + }, + { + "epoch": 1.23, + "grad_norm": 10.07856559753418, + "learning_rate": 1.1550540586922944e-05, + "loss": 0.4524, + "step": 7167 + }, + { + "epoch": 1.23, + "grad_norm": 10.139068603515625, + "learning_rate": 1.1547966363480351e-05, + "loss": 0.6126, + "step": 7168 + }, + { + "epoch": 1.23, + "grad_norm": 12.351325035095215, + "learning_rate": 1.1545392140037756e-05, + "loss": 0.6218, + "step": 7169 + }, + { + "epoch": 1.23, + "grad_norm": 13.678258895874023, + "learning_rate": 1.1542817916595161e-05, + "loss": 0.5069, + "step": 7170 + }, + { + "epoch": 1.23, + "grad_norm": 8.375031471252441, + "learning_rate": 1.1540243693152566e-05, + "loss": 0.3749, + "step": 7171 + }, + { + "epoch": 1.23, + "grad_norm": 9.02367115020752, + "learning_rate": 1.1537669469709971e-05, + "loss": 0.5074, + "step": 7172 + }, + { + "epoch": 1.23, + "grad_norm": 9.382370948791504, + "learning_rate": 1.1535095246267378e-05, + "loss": 0.5201, + "step": 7173 + }, + { + "epoch": 1.23, + "grad_norm": 8.980612754821777, + "learning_rate": 1.1532521022824783e-05, + "loss": 0.4341, + "step": 7174 + }, + { + "epoch": 1.23, + "grad_norm": 8.573546409606934, + "learning_rate": 1.1529946799382186e-05, + "loss": 0.4113, + "step": 7175 + }, + { + "epoch": 1.23, + "grad_norm": 11.847718238830566, + "learning_rate": 1.1527372575939591e-05, + "loss": 0.4756, + "step": 7176 + }, + { + "epoch": 1.23, + "grad_norm": 10.996875762939453, + "learning_rate": 1.1524798352496996e-05, + "loss": 0.5359, + "step": 7177 + }, + { + "epoch": 1.23, + "grad_norm": 6.408705711364746, + "learning_rate": 1.1522224129054403e-05, + "loss": 0.455, + "step": 7178 + }, + { + "epoch": 1.23, + "grad_norm": 12.923229217529297, + "learning_rate": 1.1519649905611808e-05, + "loss": 0.5556, + "step": 7179 + }, + { + "epoch": 1.23, + "grad_norm": 11.602019309997559, + "learning_rate": 1.1517075682169213e-05, + "loss": 0.75, + "step": 7180 + }, + { + "epoch": 1.23, + "grad_norm": 13.265124320983887, + "learning_rate": 1.1514501458726618e-05, + "loss": 0.5565, + "step": 7181 + }, + { + "epoch": 1.23, + "grad_norm": 12.500020980834961, + "learning_rate": 1.1511927235284022e-05, + "loss": 0.5501, + "step": 7182 + }, + { + "epoch": 1.23, + "grad_norm": 11.173694610595703, + "learning_rate": 1.150935301184143e-05, + "loss": 0.4323, + "step": 7183 + }, + { + "epoch": 1.23, + "grad_norm": 12.560260772705078, + "learning_rate": 1.1506778788398834e-05, + "loss": 0.4102, + "step": 7184 + }, + { + "epoch": 1.23, + "grad_norm": 10.73781967163086, + "learning_rate": 1.1504204564956237e-05, + "loss": 0.5474, + "step": 7185 + }, + { + "epoch": 1.23, + "grad_norm": 12.796578407287598, + "learning_rate": 1.1501630341513642e-05, + "loss": 0.5905, + "step": 7186 + }, + { + "epoch": 1.23, + "grad_norm": 12.797844886779785, + "learning_rate": 1.1499056118071049e-05, + "loss": 0.521, + "step": 7187 + }, + { + "epoch": 1.23, + "grad_norm": 11.736172676086426, + "learning_rate": 1.1496481894628454e-05, + "loss": 0.5932, + "step": 7188 + }, + { + "epoch": 1.23, + "grad_norm": 14.909501075744629, + "learning_rate": 1.1493907671185859e-05, + "loss": 0.5309, + "step": 7189 + }, + { + "epoch": 1.23, + "grad_norm": 9.898690223693848, + "learning_rate": 1.1491333447743264e-05, + "loss": 0.4736, + "step": 7190 + }, + { + "epoch": 1.23, + "grad_norm": 12.287887573242188, + "learning_rate": 1.1488759224300669e-05, + "loss": 0.5086, + "step": 7191 + }, + { + "epoch": 1.23, + "grad_norm": 12.551624298095703, + "learning_rate": 1.1486185000858076e-05, + "loss": 0.4678, + "step": 7192 + }, + { + "epoch": 1.23, + "grad_norm": 11.073768615722656, + "learning_rate": 1.148361077741548e-05, + "loss": 0.5858, + "step": 7193 + }, + { + "epoch": 1.23, + "grad_norm": 9.796259880065918, + "learning_rate": 1.1481036553972886e-05, + "loss": 0.6086, + "step": 7194 + }, + { + "epoch": 1.23, + "grad_norm": 9.341981887817383, + "learning_rate": 1.147846233053029e-05, + "loss": 0.349, + "step": 7195 + }, + { + "epoch": 1.23, + "grad_norm": 7.76344633102417, + "learning_rate": 1.1475888107087694e-05, + "loss": 0.3582, + "step": 7196 + }, + { + "epoch": 1.24, + "grad_norm": 10.240875244140625, + "learning_rate": 1.14733138836451e-05, + "loss": 0.4038, + "step": 7197 + }, + { + "epoch": 1.24, + "grad_norm": 9.505542755126953, + "learning_rate": 1.1470739660202506e-05, + "loss": 0.4493, + "step": 7198 + }, + { + "epoch": 1.24, + "grad_norm": 9.862126350402832, + "learning_rate": 1.146816543675991e-05, + "loss": 0.3836, + "step": 7199 + }, + { + "epoch": 1.24, + "grad_norm": 10.115520477294922, + "learning_rate": 1.1465591213317316e-05, + "loss": 0.4824, + "step": 7200 + }, + { + "epoch": 1.24, + "grad_norm": 11.578757286071777, + "learning_rate": 1.1463016989874722e-05, + "loss": 0.5646, + "step": 7201 + }, + { + "epoch": 1.24, + "grad_norm": 9.81511116027832, + "learning_rate": 1.1460442766432127e-05, + "loss": 0.5767, + "step": 7202 + }, + { + "epoch": 1.24, + "grad_norm": 9.9080810546875, + "learning_rate": 1.1457868542989532e-05, + "loss": 0.589, + "step": 7203 + }, + { + "epoch": 1.24, + "grad_norm": 7.988289833068848, + "learning_rate": 1.1455294319546937e-05, + "loss": 0.3225, + "step": 7204 + }, + { + "epoch": 1.24, + "grad_norm": 9.365975379943848, + "learning_rate": 1.1452720096104342e-05, + "loss": 0.6056, + "step": 7205 + }, + { + "epoch": 1.24, + "grad_norm": 7.194970607757568, + "learning_rate": 1.1450145872661747e-05, + "loss": 0.3617, + "step": 7206 + }, + { + "epoch": 1.24, + "grad_norm": 15.30859661102295, + "learning_rate": 1.1447571649219152e-05, + "loss": 0.5695, + "step": 7207 + }, + { + "epoch": 1.24, + "grad_norm": 15.60885238647461, + "learning_rate": 1.1444997425776557e-05, + "loss": 0.5869, + "step": 7208 + }, + { + "epoch": 1.24, + "grad_norm": 9.791413307189941, + "learning_rate": 1.1442423202333962e-05, + "loss": 0.6055, + "step": 7209 + }, + { + "epoch": 1.24, + "grad_norm": 11.433341026306152, + "learning_rate": 1.1439848978891367e-05, + "loss": 0.6103, + "step": 7210 + }, + { + "epoch": 1.24, + "grad_norm": 8.403416633605957, + "learning_rate": 1.1437274755448774e-05, + "loss": 0.395, + "step": 7211 + }, + { + "epoch": 1.24, + "grad_norm": 11.142620086669922, + "learning_rate": 1.1434700532006179e-05, + "loss": 0.5486, + "step": 7212 + }, + { + "epoch": 1.24, + "grad_norm": 9.775880813598633, + "learning_rate": 1.1432126308563584e-05, + "loss": 0.5146, + "step": 7213 + }, + { + "epoch": 1.24, + "grad_norm": 11.822942733764648, + "learning_rate": 1.1429552085120989e-05, + "loss": 0.5121, + "step": 7214 + }, + { + "epoch": 1.24, + "grad_norm": 11.373087882995605, + "learning_rate": 1.1426977861678394e-05, + "loss": 0.3958, + "step": 7215 + }, + { + "epoch": 1.24, + "grad_norm": 9.169090270996094, + "learning_rate": 1.14244036382358e-05, + "loss": 0.3961, + "step": 7216 + }, + { + "epoch": 1.24, + "grad_norm": 8.584774017333984, + "learning_rate": 1.1421829414793204e-05, + "loss": 0.441, + "step": 7217 + }, + { + "epoch": 1.24, + "grad_norm": 8.406041145324707, + "learning_rate": 1.1419255191350609e-05, + "loss": 0.7111, + "step": 7218 + }, + { + "epoch": 1.24, + "grad_norm": 12.626038551330566, + "learning_rate": 1.1416680967908014e-05, + "loss": 0.6714, + "step": 7219 + }, + { + "epoch": 1.24, + "grad_norm": 12.445282936096191, + "learning_rate": 1.141410674446542e-05, + "loss": 0.4856, + "step": 7220 + }, + { + "epoch": 1.24, + "grad_norm": 9.321600914001465, + "learning_rate": 1.1411532521022825e-05, + "loss": 0.356, + "step": 7221 + }, + { + "epoch": 1.24, + "grad_norm": 7.86656379699707, + "learning_rate": 1.140895829758023e-05, + "loss": 0.3288, + "step": 7222 + }, + { + "epoch": 1.24, + "grad_norm": 9.213930130004883, + "learning_rate": 1.1406384074137635e-05, + "loss": 0.3596, + "step": 7223 + }, + { + "epoch": 1.24, + "grad_norm": 8.23892879486084, + "learning_rate": 1.140380985069504e-05, + "loss": 0.4785, + "step": 7224 + }, + { + "epoch": 1.24, + "grad_norm": 10.818760871887207, + "learning_rate": 1.1401235627252447e-05, + "loss": 0.5261, + "step": 7225 + }, + { + "epoch": 1.24, + "grad_norm": 14.000699043273926, + "learning_rate": 1.1398661403809852e-05, + "loss": 0.4484, + "step": 7226 + }, + { + "epoch": 1.24, + "grad_norm": 11.705554962158203, + "learning_rate": 1.1396087180367255e-05, + "loss": 0.4674, + "step": 7227 + }, + { + "epoch": 1.24, + "grad_norm": 13.233939170837402, + "learning_rate": 1.139351295692466e-05, + "loss": 0.4736, + "step": 7228 + }, + { + "epoch": 1.24, + "grad_norm": 10.031034469604492, + "learning_rate": 1.1390938733482065e-05, + "loss": 0.4833, + "step": 7229 + }, + { + "epoch": 1.24, + "grad_norm": 11.996298789978027, + "learning_rate": 1.1388364510039472e-05, + "loss": 0.5249, + "step": 7230 + }, + { + "epoch": 1.24, + "grad_norm": 12.602757453918457, + "learning_rate": 1.1385790286596877e-05, + "loss": 0.6922, + "step": 7231 + }, + { + "epoch": 1.24, + "grad_norm": 9.56598949432373, + "learning_rate": 1.1383216063154282e-05, + "loss": 0.4054, + "step": 7232 + }, + { + "epoch": 1.24, + "grad_norm": 9.37939453125, + "learning_rate": 1.1380641839711687e-05, + "loss": 0.5647, + "step": 7233 + }, + { + "epoch": 1.24, + "grad_norm": 8.546008110046387, + "learning_rate": 1.1378067616269092e-05, + "loss": 0.35, + "step": 7234 + }, + { + "epoch": 1.24, + "grad_norm": 10.051502227783203, + "learning_rate": 1.1375493392826499e-05, + "loss": 0.5013, + "step": 7235 + }, + { + "epoch": 1.24, + "grad_norm": 8.882331848144531, + "learning_rate": 1.1372919169383904e-05, + "loss": 0.4023, + "step": 7236 + }, + { + "epoch": 1.24, + "grad_norm": 9.841806411743164, + "learning_rate": 1.1370344945941307e-05, + "loss": 0.5238, + "step": 7237 + }, + { + "epoch": 1.24, + "grad_norm": 10.044391632080078, + "learning_rate": 1.1367770722498712e-05, + "loss": 0.4652, + "step": 7238 + }, + { + "epoch": 1.24, + "grad_norm": 12.846569061279297, + "learning_rate": 1.1365196499056119e-05, + "loss": 0.606, + "step": 7239 + }, + { + "epoch": 1.24, + "grad_norm": 10.960699081420898, + "learning_rate": 1.1362622275613524e-05, + "loss": 0.4853, + "step": 7240 + }, + { + "epoch": 1.24, + "grad_norm": 8.793228149414062, + "learning_rate": 1.1360048052170929e-05, + "loss": 0.4019, + "step": 7241 + }, + { + "epoch": 1.24, + "grad_norm": 11.172778129577637, + "learning_rate": 1.1357473828728334e-05, + "loss": 0.5707, + "step": 7242 + }, + { + "epoch": 1.24, + "grad_norm": 9.804939270019531, + "learning_rate": 1.1354899605285739e-05, + "loss": 0.3863, + "step": 7243 + }, + { + "epoch": 1.24, + "grad_norm": 10.950481414794922, + "learning_rate": 1.1352325381843145e-05, + "loss": 0.4967, + "step": 7244 + }, + { + "epoch": 1.24, + "grad_norm": 8.23788833618164, + "learning_rate": 1.134975115840055e-05, + "loss": 0.4025, + "step": 7245 + }, + { + "epoch": 1.24, + "grad_norm": 8.630895614624023, + "learning_rate": 1.1347176934957955e-05, + "loss": 0.3437, + "step": 7246 + }, + { + "epoch": 1.24, + "grad_norm": 9.612194061279297, + "learning_rate": 1.134460271151536e-05, + "loss": 0.553, + "step": 7247 + }, + { + "epoch": 1.24, + "grad_norm": 9.624996185302734, + "learning_rate": 1.1342028488072764e-05, + "loss": 0.5355, + "step": 7248 + }, + { + "epoch": 1.24, + "grad_norm": 13.33304500579834, + "learning_rate": 1.133945426463017e-05, + "loss": 0.5072, + "step": 7249 + }, + { + "epoch": 1.24, + "grad_norm": 10.467058181762695, + "learning_rate": 1.1336880041187575e-05, + "loss": 0.3252, + "step": 7250 + }, + { + "epoch": 1.24, + "grad_norm": 7.657370090484619, + "learning_rate": 1.133430581774498e-05, + "loss": 0.499, + "step": 7251 + }, + { + "epoch": 1.24, + "grad_norm": 9.748489379882812, + "learning_rate": 1.1331731594302385e-05, + "loss": 0.4994, + "step": 7252 + }, + { + "epoch": 1.24, + "grad_norm": 12.288755416870117, + "learning_rate": 1.132915737085979e-05, + "loss": 0.4601, + "step": 7253 + }, + { + "epoch": 1.24, + "grad_norm": 11.3094482421875, + "learning_rate": 1.1326583147417197e-05, + "loss": 0.5718, + "step": 7254 + }, + { + "epoch": 1.25, + "grad_norm": 8.617788314819336, + "learning_rate": 1.1324008923974602e-05, + "loss": 0.4283, + "step": 7255 + }, + { + "epoch": 1.25, + "grad_norm": 11.05827808380127, + "learning_rate": 1.1321434700532007e-05, + "loss": 0.5677, + "step": 7256 + }, + { + "epoch": 1.25, + "grad_norm": 13.439905166625977, + "learning_rate": 1.1318860477089412e-05, + "loss": 0.441, + "step": 7257 + }, + { + "epoch": 1.25, + "grad_norm": 10.854427337646484, + "learning_rate": 1.1316286253646817e-05, + "loss": 0.419, + "step": 7258 + }, + { + "epoch": 1.25, + "grad_norm": 8.072861671447754, + "learning_rate": 1.1313712030204222e-05, + "loss": 0.4099, + "step": 7259 + }, + { + "epoch": 1.25, + "grad_norm": 12.507295608520508, + "learning_rate": 1.1311137806761627e-05, + "loss": 0.5666, + "step": 7260 + }, + { + "epoch": 1.25, + "grad_norm": 10.564096450805664, + "learning_rate": 1.1308563583319032e-05, + "loss": 0.4455, + "step": 7261 + }, + { + "epoch": 1.25, + "grad_norm": 10.835600852966309, + "learning_rate": 1.1305989359876437e-05, + "loss": 0.5052, + "step": 7262 + }, + { + "epoch": 1.25, + "grad_norm": 7.7724809646606445, + "learning_rate": 1.1303415136433843e-05, + "loss": 0.514, + "step": 7263 + }, + { + "epoch": 1.25, + "grad_norm": 11.510541915893555, + "learning_rate": 1.1300840912991248e-05, + "loss": 0.5399, + "step": 7264 + }, + { + "epoch": 1.25, + "grad_norm": 12.288419723510742, + "learning_rate": 1.1298266689548653e-05, + "loss": 0.5107, + "step": 7265 + }, + { + "epoch": 1.25, + "grad_norm": 9.084488868713379, + "learning_rate": 1.1295692466106058e-05, + "loss": 0.4922, + "step": 7266 + }, + { + "epoch": 1.25, + "grad_norm": 11.851661682128906, + "learning_rate": 1.1293118242663463e-05, + "loss": 0.5858, + "step": 7267 + }, + { + "epoch": 1.25, + "grad_norm": 11.195343017578125, + "learning_rate": 1.129054401922087e-05, + "loss": 0.6452, + "step": 7268 + }, + { + "epoch": 1.25, + "grad_norm": 10.215805053710938, + "learning_rate": 1.1287969795778273e-05, + "loss": 0.4136, + "step": 7269 + }, + { + "epoch": 1.25, + "grad_norm": 11.631438255310059, + "learning_rate": 1.1285395572335678e-05, + "loss": 0.4646, + "step": 7270 + }, + { + "epoch": 1.25, + "grad_norm": 9.711803436279297, + "learning_rate": 1.1282821348893083e-05, + "loss": 0.3928, + "step": 7271 + }, + { + "epoch": 1.25, + "grad_norm": 10.507469177246094, + "learning_rate": 1.128024712545049e-05, + "loss": 0.5241, + "step": 7272 + }, + { + "epoch": 1.25, + "grad_norm": 9.972332000732422, + "learning_rate": 1.1277672902007895e-05, + "loss": 0.4817, + "step": 7273 + }, + { + "epoch": 1.25, + "grad_norm": 11.054027557373047, + "learning_rate": 1.12750986785653e-05, + "loss": 0.7787, + "step": 7274 + }, + { + "epoch": 1.25, + "grad_norm": 11.904644012451172, + "learning_rate": 1.1272524455122705e-05, + "loss": 0.5479, + "step": 7275 + }, + { + "epoch": 1.25, + "grad_norm": 13.885504722595215, + "learning_rate": 1.126995023168011e-05, + "loss": 0.4343, + "step": 7276 + }, + { + "epoch": 1.25, + "grad_norm": 8.406169891357422, + "learning_rate": 1.1267376008237517e-05, + "loss": 0.5582, + "step": 7277 + }, + { + "epoch": 1.25, + "grad_norm": 10.902565002441406, + "learning_rate": 1.1264801784794922e-05, + "loss": 0.5416, + "step": 7278 + }, + { + "epoch": 1.25, + "grad_norm": 10.308808326721191, + "learning_rate": 1.1262227561352325e-05, + "loss": 0.5342, + "step": 7279 + }, + { + "epoch": 1.25, + "grad_norm": 11.628055572509766, + "learning_rate": 1.125965333790973e-05, + "loss": 0.701, + "step": 7280 + }, + { + "epoch": 1.25, + "grad_norm": 12.303853034973145, + "learning_rate": 1.1257079114467135e-05, + "loss": 0.6888, + "step": 7281 + }, + { + "epoch": 1.25, + "grad_norm": 11.253119468688965, + "learning_rate": 1.1254504891024542e-05, + "loss": 0.4342, + "step": 7282 + }, + { + "epoch": 1.25, + "grad_norm": 8.1643648147583, + "learning_rate": 1.1251930667581947e-05, + "loss": 0.3325, + "step": 7283 + }, + { + "epoch": 1.25, + "grad_norm": 9.911738395690918, + "learning_rate": 1.1249356444139352e-05, + "loss": 0.5293, + "step": 7284 + }, + { + "epoch": 1.25, + "grad_norm": 9.523740768432617, + "learning_rate": 1.1246782220696757e-05, + "loss": 0.4431, + "step": 7285 + }, + { + "epoch": 1.25, + "grad_norm": 11.141708374023438, + "learning_rate": 1.1244207997254162e-05, + "loss": 0.604, + "step": 7286 + }, + { + "epoch": 1.25, + "grad_norm": 9.976603507995605, + "learning_rate": 1.1241633773811568e-05, + "loss": 0.4651, + "step": 7287 + }, + { + "epoch": 1.25, + "grad_norm": 12.72829818725586, + "learning_rate": 1.1239059550368973e-05, + "loss": 0.6487, + "step": 7288 + }, + { + "epoch": 1.25, + "grad_norm": 10.384855270385742, + "learning_rate": 1.1236485326926376e-05, + "loss": 0.5895, + "step": 7289 + }, + { + "epoch": 1.25, + "grad_norm": 9.792872428894043, + "learning_rate": 1.1233911103483781e-05, + "loss": 0.4809, + "step": 7290 + }, + { + "epoch": 1.25, + "grad_norm": 9.717025756835938, + "learning_rate": 1.1231336880041188e-05, + "loss": 0.4717, + "step": 7291 + }, + { + "epoch": 1.25, + "grad_norm": 10.081117630004883, + "learning_rate": 1.1228762656598593e-05, + "loss": 0.4903, + "step": 7292 + }, + { + "epoch": 1.25, + "grad_norm": 9.190896034240723, + "learning_rate": 1.1226188433155998e-05, + "loss": 0.4885, + "step": 7293 + }, + { + "epoch": 1.25, + "grad_norm": 11.908560752868652, + "learning_rate": 1.1223614209713403e-05, + "loss": 0.4172, + "step": 7294 + }, + { + "epoch": 1.25, + "grad_norm": 7.686976909637451, + "learning_rate": 1.1221039986270808e-05, + "loss": 0.3909, + "step": 7295 + }, + { + "epoch": 1.25, + "grad_norm": 10.062314987182617, + "learning_rate": 1.1218465762828215e-05, + "loss": 0.5018, + "step": 7296 + }, + { + "epoch": 1.25, + "grad_norm": 8.092988014221191, + "learning_rate": 1.121589153938562e-05, + "loss": 0.4151, + "step": 7297 + }, + { + "epoch": 1.25, + "grad_norm": 13.299968719482422, + "learning_rate": 1.1213317315943025e-05, + "loss": 0.6049, + "step": 7298 + }, + { + "epoch": 1.25, + "grad_norm": 11.324273109436035, + "learning_rate": 1.121074309250043e-05, + "loss": 0.5683, + "step": 7299 + }, + { + "epoch": 1.25, + "grad_norm": 9.269415855407715, + "learning_rate": 1.1208168869057833e-05, + "loss": 0.4494, + "step": 7300 + }, + { + "epoch": 1.25, + "grad_norm": 14.625930786132812, + "learning_rate": 1.120559464561524e-05, + "loss": 0.651, + "step": 7301 + }, + { + "epoch": 1.25, + "grad_norm": 10.52757740020752, + "learning_rate": 1.1203020422172645e-05, + "loss": 0.4696, + "step": 7302 + }, + { + "epoch": 1.25, + "grad_norm": 10.25619125366211, + "learning_rate": 1.120044619873005e-05, + "loss": 0.578, + "step": 7303 + }, + { + "epoch": 1.25, + "grad_norm": 11.537073135375977, + "learning_rate": 1.1197871975287455e-05, + "loss": 0.4884, + "step": 7304 + }, + { + "epoch": 1.25, + "grad_norm": 8.265997886657715, + "learning_rate": 1.119529775184486e-05, + "loss": 0.3923, + "step": 7305 + }, + { + "epoch": 1.25, + "grad_norm": 11.996153831481934, + "learning_rate": 1.1192723528402266e-05, + "loss": 0.4961, + "step": 7306 + }, + { + "epoch": 1.25, + "grad_norm": 8.89450740814209, + "learning_rate": 1.1190149304959671e-05, + "loss": 0.6458, + "step": 7307 + }, + { + "epoch": 1.25, + "grad_norm": 11.06220817565918, + "learning_rate": 1.1187575081517076e-05, + "loss": 0.6208, + "step": 7308 + }, + { + "epoch": 1.25, + "grad_norm": 6.924788475036621, + "learning_rate": 1.1185000858074481e-05, + "loss": 0.412, + "step": 7309 + }, + { + "epoch": 1.25, + "grad_norm": 12.411330223083496, + "learning_rate": 1.1182426634631886e-05, + "loss": 0.629, + "step": 7310 + }, + { + "epoch": 1.25, + "grad_norm": 8.121463775634766, + "learning_rate": 1.1179852411189291e-05, + "loss": 0.4451, + "step": 7311 + }, + { + "epoch": 1.25, + "grad_norm": 9.201952934265137, + "learning_rate": 1.1177278187746696e-05, + "loss": 0.4098, + "step": 7312 + }, + { + "epoch": 1.26, + "grad_norm": 14.90984058380127, + "learning_rate": 1.1174703964304101e-05, + "loss": 0.4821, + "step": 7313 + }, + { + "epoch": 1.26, + "grad_norm": 11.660990715026855, + "learning_rate": 1.1172129740861506e-05, + "loss": 0.3648, + "step": 7314 + }, + { + "epoch": 1.26, + "grad_norm": 10.568854331970215, + "learning_rate": 1.1169555517418913e-05, + "loss": 0.4303, + "step": 7315 + }, + { + "epoch": 1.26, + "grad_norm": 12.792206764221191, + "learning_rate": 1.1166981293976318e-05, + "loss": 0.5768, + "step": 7316 + }, + { + "epoch": 1.26, + "grad_norm": 7.611165523529053, + "learning_rate": 1.1164407070533723e-05, + "loss": 0.3432, + "step": 7317 + }, + { + "epoch": 1.26, + "grad_norm": 8.740774154663086, + "learning_rate": 1.1161832847091128e-05, + "loss": 0.2775, + "step": 7318 + }, + { + "epoch": 1.26, + "grad_norm": 9.653419494628906, + "learning_rate": 1.1159258623648533e-05, + "loss": 0.3901, + "step": 7319 + }, + { + "epoch": 1.26, + "grad_norm": 13.659016609191895, + "learning_rate": 1.115668440020594e-05, + "loss": 0.4967, + "step": 7320 + }, + { + "epoch": 1.26, + "grad_norm": 9.037394523620605, + "learning_rate": 1.1154110176763343e-05, + "loss": 0.59, + "step": 7321 + }, + { + "epoch": 1.26, + "grad_norm": 13.336116790771484, + "learning_rate": 1.1151535953320748e-05, + "loss": 0.539, + "step": 7322 + }, + { + "epoch": 1.26, + "grad_norm": 10.488786697387695, + "learning_rate": 1.1148961729878153e-05, + "loss": 0.3024, + "step": 7323 + }, + { + "epoch": 1.26, + "grad_norm": 11.387650489807129, + "learning_rate": 1.114638750643556e-05, + "loss": 0.4917, + "step": 7324 + }, + { + "epoch": 1.26, + "grad_norm": 8.015610694885254, + "learning_rate": 1.1143813282992964e-05, + "loss": 0.3866, + "step": 7325 + }, + { + "epoch": 1.26, + "grad_norm": 8.84521484375, + "learning_rate": 1.114123905955037e-05, + "loss": 0.3983, + "step": 7326 + }, + { + "epoch": 1.26, + "grad_norm": 12.11248779296875, + "learning_rate": 1.1138664836107774e-05, + "loss": 0.5833, + "step": 7327 + }, + { + "epoch": 1.26, + "grad_norm": 11.666871070861816, + "learning_rate": 1.113609061266518e-05, + "loss": 0.5804, + "step": 7328 + }, + { + "epoch": 1.26, + "grad_norm": 11.14678955078125, + "learning_rate": 1.1133516389222586e-05, + "loss": 0.3713, + "step": 7329 + }, + { + "epoch": 1.26, + "grad_norm": 9.727951049804688, + "learning_rate": 1.1130942165779991e-05, + "loss": 0.4718, + "step": 7330 + }, + { + "epoch": 1.26, + "grad_norm": 11.37954044342041, + "learning_rate": 1.1128367942337394e-05, + "loss": 0.4232, + "step": 7331 + }, + { + "epoch": 1.26, + "grad_norm": 10.583354949951172, + "learning_rate": 1.11257937188948e-05, + "loss": 0.486, + "step": 7332 + }, + { + "epoch": 1.26, + "grad_norm": 8.33794116973877, + "learning_rate": 1.1123219495452204e-05, + "loss": 0.3675, + "step": 7333 + }, + { + "epoch": 1.26, + "grad_norm": 15.987041473388672, + "learning_rate": 1.1120645272009611e-05, + "loss": 0.5623, + "step": 7334 + }, + { + "epoch": 1.26, + "grad_norm": 13.565496444702148, + "learning_rate": 1.1118071048567016e-05, + "loss": 0.6642, + "step": 7335 + }, + { + "epoch": 1.26, + "grad_norm": 8.611883163452148, + "learning_rate": 1.1115496825124421e-05, + "loss": 0.4221, + "step": 7336 + }, + { + "epoch": 1.26, + "grad_norm": 8.722855567932129, + "learning_rate": 1.1112922601681826e-05, + "loss": 0.5212, + "step": 7337 + }, + { + "epoch": 1.26, + "grad_norm": 10.42210578918457, + "learning_rate": 1.1110348378239231e-05, + "loss": 0.5113, + "step": 7338 + }, + { + "epoch": 1.26, + "grad_norm": 8.983506202697754, + "learning_rate": 1.1107774154796638e-05, + "loss": 0.3533, + "step": 7339 + }, + { + "epoch": 1.26, + "grad_norm": 9.018218994140625, + "learning_rate": 1.1105199931354043e-05, + "loss": 0.5285, + "step": 7340 + }, + { + "epoch": 1.26, + "grad_norm": 9.314803123474121, + "learning_rate": 1.1102625707911446e-05, + "loss": 0.382, + "step": 7341 + }, + { + "epoch": 1.26, + "grad_norm": 9.689424514770508, + "learning_rate": 1.1100051484468851e-05, + "loss": 0.4992, + "step": 7342 + }, + { + "epoch": 1.26, + "grad_norm": 8.650017738342285, + "learning_rate": 1.1097477261026258e-05, + "loss": 0.2609, + "step": 7343 + }, + { + "epoch": 1.26, + "grad_norm": 12.291447639465332, + "learning_rate": 1.1094903037583663e-05, + "loss": 0.4485, + "step": 7344 + }, + { + "epoch": 1.26, + "grad_norm": 8.257018089294434, + "learning_rate": 1.1092328814141068e-05, + "loss": 0.5109, + "step": 7345 + }, + { + "epoch": 1.26, + "grad_norm": 13.063050270080566, + "learning_rate": 1.1089754590698473e-05, + "loss": 0.6356, + "step": 7346 + }, + { + "epoch": 1.26, + "grad_norm": 8.568032264709473, + "learning_rate": 1.1087180367255878e-05, + "loss": 0.3379, + "step": 7347 + }, + { + "epoch": 1.26, + "grad_norm": 9.283342361450195, + "learning_rate": 1.1084606143813284e-05, + "loss": 0.585, + "step": 7348 + }, + { + "epoch": 1.26, + "grad_norm": 8.898557662963867, + "learning_rate": 1.108203192037069e-05, + "loss": 0.4101, + "step": 7349 + }, + { + "epoch": 1.26, + "grad_norm": 7.641866207122803, + "learning_rate": 1.1079457696928094e-05, + "loss": 0.41, + "step": 7350 + }, + { + "epoch": 1.26, + "grad_norm": 9.478236198425293, + "learning_rate": 1.10768834734855e-05, + "loss": 0.426, + "step": 7351 + }, + { + "epoch": 1.26, + "grad_norm": 8.841486930847168, + "learning_rate": 1.1074309250042903e-05, + "loss": 0.4654, + "step": 7352 + }, + { + "epoch": 1.26, + "grad_norm": 8.286558151245117, + "learning_rate": 1.107173502660031e-05, + "loss": 0.3845, + "step": 7353 + }, + { + "epoch": 1.26, + "grad_norm": 9.059571266174316, + "learning_rate": 1.1069160803157714e-05, + "loss": 0.4252, + "step": 7354 + }, + { + "epoch": 1.26, + "grad_norm": 9.960970878601074, + "learning_rate": 1.106658657971512e-05, + "loss": 0.5646, + "step": 7355 + }, + { + "epoch": 1.26, + "grad_norm": 11.519693374633789, + "learning_rate": 1.1064012356272524e-05, + "loss": 0.5789, + "step": 7356 + }, + { + "epoch": 1.26, + "grad_norm": 11.587873458862305, + "learning_rate": 1.106143813282993e-05, + "loss": 0.5044, + "step": 7357 + }, + { + "epoch": 1.26, + "grad_norm": 12.576017379760742, + "learning_rate": 1.1058863909387336e-05, + "loss": 0.4697, + "step": 7358 + }, + { + "epoch": 1.26, + "grad_norm": 9.494189262390137, + "learning_rate": 1.105628968594474e-05, + "loss": 0.3495, + "step": 7359 + }, + { + "epoch": 1.26, + "grad_norm": 9.299254417419434, + "learning_rate": 1.1053715462502146e-05, + "loss": 0.5613, + "step": 7360 + }, + { + "epoch": 1.26, + "grad_norm": 9.70302677154541, + "learning_rate": 1.105114123905955e-05, + "loss": 0.4824, + "step": 7361 + }, + { + "epoch": 1.26, + "grad_norm": 8.210065841674805, + "learning_rate": 1.1048567015616956e-05, + "loss": 0.353, + "step": 7362 + }, + { + "epoch": 1.26, + "grad_norm": 11.171737670898438, + "learning_rate": 1.104599279217436e-05, + "loss": 0.673, + "step": 7363 + }, + { + "epoch": 1.26, + "grad_norm": 11.690330505371094, + "learning_rate": 1.1043418568731766e-05, + "loss": 0.5517, + "step": 7364 + }, + { + "epoch": 1.26, + "grad_norm": 11.84161376953125, + "learning_rate": 1.104084434528917e-05, + "loss": 0.5083, + "step": 7365 + }, + { + "epoch": 1.26, + "grad_norm": 9.02966594696045, + "learning_rate": 1.1038270121846576e-05, + "loss": 0.5422, + "step": 7366 + }, + { + "epoch": 1.26, + "grad_norm": 11.594504356384277, + "learning_rate": 1.1035695898403982e-05, + "loss": 0.4964, + "step": 7367 + }, + { + "epoch": 1.26, + "grad_norm": 13.446212768554688, + "learning_rate": 1.1033121674961387e-05, + "loss": 0.7646, + "step": 7368 + }, + { + "epoch": 1.26, + "grad_norm": 10.387414932250977, + "learning_rate": 1.1030547451518792e-05, + "loss": 0.3946, + "step": 7369 + }, + { + "epoch": 1.26, + "grad_norm": 10.785445213317871, + "learning_rate": 1.1027973228076197e-05, + "loss": 0.3826, + "step": 7370 + }, + { + "epoch": 1.26, + "grad_norm": 13.900127410888672, + "learning_rate": 1.1025399004633602e-05, + "loss": 0.5517, + "step": 7371 + }, + { + "epoch": 1.27, + "grad_norm": 9.443618774414062, + "learning_rate": 1.1022824781191009e-05, + "loss": 0.3948, + "step": 7372 + }, + { + "epoch": 1.27, + "grad_norm": 10.678475379943848, + "learning_rate": 1.1020250557748412e-05, + "loss": 0.5166, + "step": 7373 + }, + { + "epoch": 1.27, + "grad_norm": 16.408212661743164, + "learning_rate": 1.1017676334305817e-05, + "loss": 0.7953, + "step": 7374 + }, + { + "epoch": 1.27, + "grad_norm": 8.494850158691406, + "learning_rate": 1.1015102110863222e-05, + "loss": 0.4099, + "step": 7375 + }, + { + "epoch": 1.27, + "grad_norm": 14.82183837890625, + "learning_rate": 1.1012527887420629e-05, + "loss": 0.5599, + "step": 7376 + }, + { + "epoch": 1.27, + "grad_norm": 11.489359855651855, + "learning_rate": 1.1009953663978034e-05, + "loss": 0.5565, + "step": 7377 + }, + { + "epoch": 1.27, + "grad_norm": 11.595519065856934, + "learning_rate": 1.1007379440535439e-05, + "loss": 0.5726, + "step": 7378 + }, + { + "epoch": 1.27, + "grad_norm": 15.113448143005371, + "learning_rate": 1.1004805217092844e-05, + "loss": 0.5839, + "step": 7379 + }, + { + "epoch": 1.27, + "grad_norm": 9.382354736328125, + "learning_rate": 1.1002230993650249e-05, + "loss": 0.4741, + "step": 7380 + }, + { + "epoch": 1.27, + "grad_norm": 10.775054931640625, + "learning_rate": 1.0999656770207656e-05, + "loss": 0.5934, + "step": 7381 + }, + { + "epoch": 1.27, + "grad_norm": 10.917232513427734, + "learning_rate": 1.099708254676506e-05, + "loss": 0.4876, + "step": 7382 + }, + { + "epoch": 1.27, + "grad_norm": 10.447864532470703, + "learning_rate": 1.0994508323322464e-05, + "loss": 0.4363, + "step": 7383 + }, + { + "epoch": 1.27, + "grad_norm": 13.548230171203613, + "learning_rate": 1.0991934099879869e-05, + "loss": 0.5875, + "step": 7384 + }, + { + "epoch": 1.27, + "grad_norm": 12.100251197814941, + "learning_rate": 1.0989359876437274e-05, + "loss": 0.4899, + "step": 7385 + }, + { + "epoch": 1.27, + "grad_norm": 13.407061576843262, + "learning_rate": 1.098678565299468e-05, + "loss": 0.6766, + "step": 7386 + }, + { + "epoch": 1.27, + "grad_norm": 9.539560317993164, + "learning_rate": 1.0984211429552086e-05, + "loss": 0.4533, + "step": 7387 + }, + { + "epoch": 1.27, + "grad_norm": 16.022130966186523, + "learning_rate": 1.098163720610949e-05, + "loss": 0.6325, + "step": 7388 + }, + { + "epoch": 1.27, + "grad_norm": 8.235760688781738, + "learning_rate": 1.0979062982666896e-05, + "loss": 0.4195, + "step": 7389 + }, + { + "epoch": 1.27, + "grad_norm": 9.352241516113281, + "learning_rate": 1.09764887592243e-05, + "loss": 0.4234, + "step": 7390 + }, + { + "epoch": 1.27, + "grad_norm": 12.590688705444336, + "learning_rate": 1.0973914535781707e-05, + "loss": 0.4648, + "step": 7391 + }, + { + "epoch": 1.27, + "grad_norm": 10.004841804504395, + "learning_rate": 1.0971340312339112e-05, + "loss": 0.558, + "step": 7392 + }, + { + "epoch": 1.27, + "grad_norm": 15.055572509765625, + "learning_rate": 1.0968766088896517e-05, + "loss": 0.5331, + "step": 7393 + }, + { + "epoch": 1.27, + "grad_norm": 13.71291732788086, + "learning_rate": 1.096619186545392e-05, + "loss": 0.6749, + "step": 7394 + }, + { + "epoch": 1.27, + "grad_norm": 10.808960914611816, + "learning_rate": 1.0963617642011327e-05, + "loss": 0.5916, + "step": 7395 + }, + { + "epoch": 1.27, + "grad_norm": 8.335718154907227, + "learning_rate": 1.0961043418568732e-05, + "loss": 0.5091, + "step": 7396 + }, + { + "epoch": 1.27, + "grad_norm": 9.797740936279297, + "learning_rate": 1.0958469195126137e-05, + "loss": 0.3448, + "step": 7397 + }, + { + "epoch": 1.27, + "grad_norm": 8.89153003692627, + "learning_rate": 1.0955894971683542e-05, + "loss": 0.4089, + "step": 7398 + }, + { + "epoch": 1.27, + "grad_norm": 14.435091018676758, + "learning_rate": 1.0953320748240947e-05, + "loss": 0.5835, + "step": 7399 + }, + { + "epoch": 1.27, + "grad_norm": 10.908112525939941, + "learning_rate": 1.0950746524798354e-05, + "loss": 0.4312, + "step": 7400 + }, + { + "epoch": 1.27, + "grad_norm": 10.195268630981445, + "learning_rate": 1.0948172301355759e-05, + "loss": 0.42, + "step": 7401 + }, + { + "epoch": 1.27, + "grad_norm": 9.425474166870117, + "learning_rate": 1.0945598077913164e-05, + "loss": 0.3167, + "step": 7402 + }, + { + "epoch": 1.27, + "grad_norm": 13.138252258300781, + "learning_rate": 1.0943023854470569e-05, + "loss": 0.4717, + "step": 7403 + }, + { + "epoch": 1.27, + "grad_norm": 10.880084037780762, + "learning_rate": 1.0940449631027972e-05, + "loss": 0.4504, + "step": 7404 + }, + { + "epoch": 1.27, + "grad_norm": 9.932443618774414, + "learning_rate": 1.0937875407585379e-05, + "loss": 0.4649, + "step": 7405 + }, + { + "epoch": 1.27, + "grad_norm": 10.748109817504883, + "learning_rate": 1.0935301184142784e-05, + "loss": 0.5998, + "step": 7406 + }, + { + "epoch": 1.27, + "grad_norm": 10.836468696594238, + "learning_rate": 1.0932726960700189e-05, + "loss": 0.4931, + "step": 7407 + }, + { + "epoch": 1.27, + "grad_norm": 12.112396240234375, + "learning_rate": 1.0930152737257594e-05, + "loss": 0.4949, + "step": 7408 + }, + { + "epoch": 1.27, + "grad_norm": 8.53786563873291, + "learning_rate": 1.0927578513814999e-05, + "loss": 0.4807, + "step": 7409 + }, + { + "epoch": 1.27, + "grad_norm": 11.181134223937988, + "learning_rate": 1.0925004290372405e-05, + "loss": 0.539, + "step": 7410 + }, + { + "epoch": 1.27, + "grad_norm": 12.775850296020508, + "learning_rate": 1.092243006692981e-05, + "loss": 0.4724, + "step": 7411 + }, + { + "epoch": 1.27, + "grad_norm": 13.097833633422852, + "learning_rate": 1.0919855843487215e-05, + "loss": 0.8134, + "step": 7412 + }, + { + "epoch": 1.27, + "grad_norm": 11.104572296142578, + "learning_rate": 1.091728162004462e-05, + "loss": 0.46, + "step": 7413 + }, + { + "epoch": 1.27, + "grad_norm": 14.703893661499023, + "learning_rate": 1.0914707396602025e-05, + "loss": 0.842, + "step": 7414 + }, + { + "epoch": 1.27, + "grad_norm": 10.625191688537598, + "learning_rate": 1.091213317315943e-05, + "loss": 0.4612, + "step": 7415 + }, + { + "epoch": 1.27, + "grad_norm": 6.64117956161499, + "learning_rate": 1.0909558949716835e-05, + "loss": 0.2308, + "step": 7416 + }, + { + "epoch": 1.27, + "grad_norm": 14.883071899414062, + "learning_rate": 1.090698472627424e-05, + "loss": 0.7752, + "step": 7417 + }, + { + "epoch": 1.27, + "grad_norm": 9.58494758605957, + "learning_rate": 1.0904410502831645e-05, + "loss": 0.3848, + "step": 7418 + }, + { + "epoch": 1.27, + "grad_norm": 11.34266471862793, + "learning_rate": 1.0901836279389052e-05, + "loss": 0.557, + "step": 7419 + }, + { + "epoch": 1.27, + "grad_norm": 9.875296592712402, + "learning_rate": 1.0899262055946457e-05, + "loss": 0.6447, + "step": 7420 + }, + { + "epoch": 1.27, + "grad_norm": 8.082805633544922, + "learning_rate": 1.0896687832503862e-05, + "loss": 0.3538, + "step": 7421 + }, + { + "epoch": 1.27, + "grad_norm": 11.51256275177002, + "learning_rate": 1.0894113609061267e-05, + "loss": 0.6898, + "step": 7422 + }, + { + "epoch": 1.27, + "grad_norm": 8.663081169128418, + "learning_rate": 1.0891539385618672e-05, + "loss": 0.5729, + "step": 7423 + }, + { + "epoch": 1.27, + "grad_norm": 9.39887809753418, + "learning_rate": 1.0888965162176079e-05, + "loss": 0.4438, + "step": 7424 + }, + { + "epoch": 1.27, + "grad_norm": 10.030675888061523, + "learning_rate": 1.0886390938733482e-05, + "loss": 0.3858, + "step": 7425 + }, + { + "epoch": 1.27, + "grad_norm": 13.428672790527344, + "learning_rate": 1.0883816715290887e-05, + "loss": 0.7735, + "step": 7426 + }, + { + "epoch": 1.27, + "grad_norm": 9.508746147155762, + "learning_rate": 1.0881242491848292e-05, + "loss": 0.5378, + "step": 7427 + }, + { + "epoch": 1.27, + "grad_norm": 10.198284149169922, + "learning_rate": 1.0878668268405697e-05, + "loss": 0.3171, + "step": 7428 + }, + { + "epoch": 1.27, + "grad_norm": 7.191280364990234, + "learning_rate": 1.0876094044963103e-05, + "loss": 0.3814, + "step": 7429 + }, + { + "epoch": 1.28, + "grad_norm": 10.388025283813477, + "learning_rate": 1.0873519821520508e-05, + "loss": 0.6304, + "step": 7430 + }, + { + "epoch": 1.28, + "grad_norm": 12.87787914276123, + "learning_rate": 1.0870945598077913e-05, + "loss": 0.5966, + "step": 7431 + }, + { + "epoch": 1.28, + "grad_norm": 6.583935260772705, + "learning_rate": 1.0868371374635318e-05, + "loss": 0.3429, + "step": 7432 + }, + { + "epoch": 1.28, + "grad_norm": 10.591317176818848, + "learning_rate": 1.0865797151192725e-05, + "loss": 0.3968, + "step": 7433 + }, + { + "epoch": 1.28, + "grad_norm": 11.970666885375977, + "learning_rate": 1.086322292775013e-05, + "loss": 0.4822, + "step": 7434 + }, + { + "epoch": 1.28, + "grad_norm": 12.406126976013184, + "learning_rate": 1.0860648704307533e-05, + "loss": 0.512, + "step": 7435 + }, + { + "epoch": 1.28, + "grad_norm": 8.578034400939941, + "learning_rate": 1.0858074480864938e-05, + "loss": 0.411, + "step": 7436 + }, + { + "epoch": 1.28, + "grad_norm": 6.091978073120117, + "learning_rate": 1.0855500257422343e-05, + "loss": 0.2314, + "step": 7437 + }, + { + "epoch": 1.28, + "grad_norm": 9.776302337646484, + "learning_rate": 1.085292603397975e-05, + "loss": 0.3227, + "step": 7438 + }, + { + "epoch": 1.28, + "grad_norm": 12.795964241027832, + "learning_rate": 1.0850351810537155e-05, + "loss": 0.5227, + "step": 7439 + }, + { + "epoch": 1.28, + "grad_norm": 9.46214485168457, + "learning_rate": 1.084777758709456e-05, + "loss": 0.4356, + "step": 7440 + }, + { + "epoch": 1.28, + "grad_norm": 12.215465545654297, + "learning_rate": 1.0845203363651965e-05, + "loss": 0.6005, + "step": 7441 + }, + { + "epoch": 1.28, + "grad_norm": 12.202252388000488, + "learning_rate": 1.084262914020937e-05, + "loss": 0.4234, + "step": 7442 + }, + { + "epoch": 1.28, + "grad_norm": 9.662891387939453, + "learning_rate": 1.0840054916766777e-05, + "loss": 0.4677, + "step": 7443 + }, + { + "epoch": 1.28, + "grad_norm": 12.376526832580566, + "learning_rate": 1.0837480693324182e-05, + "loss": 0.5627, + "step": 7444 + }, + { + "epoch": 1.28, + "grad_norm": 11.635258674621582, + "learning_rate": 1.0834906469881587e-05, + "loss": 0.4817, + "step": 7445 + }, + { + "epoch": 1.28, + "grad_norm": 12.524569511413574, + "learning_rate": 1.083233224643899e-05, + "loss": 0.8378, + "step": 7446 + }, + { + "epoch": 1.28, + "grad_norm": 9.190790176391602, + "learning_rate": 1.0829758022996397e-05, + "loss": 0.4123, + "step": 7447 + }, + { + "epoch": 1.28, + "grad_norm": 9.145949363708496, + "learning_rate": 1.0827183799553802e-05, + "loss": 0.4973, + "step": 7448 + }, + { + "epoch": 1.28, + "grad_norm": 8.877215385437012, + "learning_rate": 1.0824609576111207e-05, + "loss": 0.4719, + "step": 7449 + }, + { + "epoch": 1.28, + "grad_norm": 11.523372650146484, + "learning_rate": 1.0822035352668612e-05, + "loss": 0.6474, + "step": 7450 + }, + { + "epoch": 1.28, + "grad_norm": 11.625123023986816, + "learning_rate": 1.0819461129226017e-05, + "loss": 0.5119, + "step": 7451 + }, + { + "epoch": 1.28, + "grad_norm": 12.660490989685059, + "learning_rate": 1.0816886905783423e-05, + "loss": 0.6141, + "step": 7452 + }, + { + "epoch": 1.28, + "grad_norm": 8.527603149414062, + "learning_rate": 1.0814312682340828e-05, + "loss": 0.3769, + "step": 7453 + }, + { + "epoch": 1.28, + "grad_norm": 11.848456382751465, + "learning_rate": 1.0811738458898233e-05, + "loss": 0.4286, + "step": 7454 + }, + { + "epoch": 1.28, + "grad_norm": 11.680733680725098, + "learning_rate": 1.0809164235455638e-05, + "loss": 0.5844, + "step": 7455 + }, + { + "epoch": 1.28, + "grad_norm": 10.627582550048828, + "learning_rate": 1.0806590012013042e-05, + "loss": 0.4866, + "step": 7456 + }, + { + "epoch": 1.28, + "grad_norm": 10.343541145324707, + "learning_rate": 1.0804015788570448e-05, + "loss": 0.5801, + "step": 7457 + }, + { + "epoch": 1.28, + "grad_norm": 11.711421012878418, + "learning_rate": 1.0801441565127853e-05, + "loss": 0.6033, + "step": 7458 + }, + { + "epoch": 1.28, + "grad_norm": 8.719820976257324, + "learning_rate": 1.0798867341685258e-05, + "loss": 0.4498, + "step": 7459 + }, + { + "epoch": 1.28, + "grad_norm": 10.25599193572998, + "learning_rate": 1.0796293118242663e-05, + "loss": 0.5434, + "step": 7460 + }, + { + "epoch": 1.28, + "grad_norm": 7.425467491149902, + "learning_rate": 1.0793718894800068e-05, + "loss": 0.4365, + "step": 7461 + }, + { + "epoch": 1.28, + "grad_norm": 8.971858024597168, + "learning_rate": 1.0791144671357475e-05, + "loss": 0.4072, + "step": 7462 + }, + { + "epoch": 1.28, + "grad_norm": 15.246363639831543, + "learning_rate": 1.078857044791488e-05, + "loss": 0.5188, + "step": 7463 + }, + { + "epoch": 1.28, + "grad_norm": 13.017495155334473, + "learning_rate": 1.0785996224472285e-05, + "loss": 0.3781, + "step": 7464 + }, + { + "epoch": 1.28, + "grad_norm": 11.844362258911133, + "learning_rate": 1.078342200102969e-05, + "loss": 0.5011, + "step": 7465 + }, + { + "epoch": 1.28, + "grad_norm": 9.767553329467773, + "learning_rate": 1.0780847777587095e-05, + "loss": 0.5336, + "step": 7466 + }, + { + "epoch": 1.28, + "grad_norm": 8.728456497192383, + "learning_rate": 1.07782735541445e-05, + "loss": 0.5207, + "step": 7467 + }, + { + "epoch": 1.28, + "grad_norm": 10.234371185302734, + "learning_rate": 1.0775699330701905e-05, + "loss": 0.6167, + "step": 7468 + }, + { + "epoch": 1.28, + "grad_norm": 9.180463790893555, + "learning_rate": 1.077312510725931e-05, + "loss": 0.5041, + "step": 7469 + }, + { + "epoch": 1.28, + "grad_norm": 8.594744682312012, + "learning_rate": 1.0770550883816715e-05, + "loss": 0.3767, + "step": 7470 + }, + { + "epoch": 1.28, + "grad_norm": 10.505976676940918, + "learning_rate": 1.0767976660374121e-05, + "loss": 0.5005, + "step": 7471 + }, + { + "epoch": 1.28, + "grad_norm": 8.433834075927734, + "learning_rate": 1.0765402436931526e-05, + "loss": 0.3806, + "step": 7472 + }, + { + "epoch": 1.28, + "grad_norm": 11.749080657958984, + "learning_rate": 1.0762828213488931e-05, + "loss": 0.5238, + "step": 7473 + }, + { + "epoch": 1.28, + "grad_norm": 9.561195373535156, + "learning_rate": 1.0760253990046336e-05, + "loss": 0.5212, + "step": 7474 + }, + { + "epoch": 1.28, + "grad_norm": 11.059864044189453, + "learning_rate": 1.0757679766603741e-05, + "loss": 0.5987, + "step": 7475 + }, + { + "epoch": 1.28, + "grad_norm": 12.482023239135742, + "learning_rate": 1.0755105543161148e-05, + "loss": 0.5982, + "step": 7476 + }, + { + "epoch": 1.28, + "grad_norm": 12.836862564086914, + "learning_rate": 1.0752531319718551e-05, + "loss": 0.3857, + "step": 7477 + }, + { + "epoch": 1.28, + "grad_norm": 12.901195526123047, + "learning_rate": 1.0749957096275956e-05, + "loss": 0.6019, + "step": 7478 + }, + { + "epoch": 1.28, + "grad_norm": 11.829532623291016, + "learning_rate": 1.0747382872833361e-05, + "loss": 0.5685, + "step": 7479 + }, + { + "epoch": 1.28, + "grad_norm": 8.477641105651855, + "learning_rate": 1.0744808649390766e-05, + "loss": 0.6177, + "step": 7480 + }, + { + "epoch": 1.28, + "grad_norm": 13.40280532836914, + "learning_rate": 1.0742234425948173e-05, + "loss": 0.5157, + "step": 7481 + }, + { + "epoch": 1.28, + "grad_norm": 9.046311378479004, + "learning_rate": 1.0739660202505578e-05, + "loss": 0.6029, + "step": 7482 + }, + { + "epoch": 1.28, + "grad_norm": 9.58298110961914, + "learning_rate": 1.0737085979062983e-05, + "loss": 0.421, + "step": 7483 + }, + { + "epoch": 1.28, + "grad_norm": 10.21367359161377, + "learning_rate": 1.0734511755620388e-05, + "loss": 0.4925, + "step": 7484 + }, + { + "epoch": 1.28, + "grad_norm": 8.266897201538086, + "learning_rate": 1.0731937532177795e-05, + "loss": 0.3255, + "step": 7485 + }, + { + "epoch": 1.28, + "grad_norm": 9.406457901000977, + "learning_rate": 1.07293633087352e-05, + "loss": 0.3816, + "step": 7486 + }, + { + "epoch": 1.28, + "grad_norm": 9.383731842041016, + "learning_rate": 1.0726789085292603e-05, + "loss": 0.5075, + "step": 7487 + }, + { + "epoch": 1.29, + "grad_norm": 14.13294792175293, + "learning_rate": 1.0724214861850008e-05, + "loss": 0.5304, + "step": 7488 + }, + { + "epoch": 1.29, + "grad_norm": 10.950424194335938, + "learning_rate": 1.0721640638407413e-05, + "loss": 0.4476, + "step": 7489 + }, + { + "epoch": 1.29, + "grad_norm": 10.416092872619629, + "learning_rate": 1.071906641496482e-05, + "loss": 0.4161, + "step": 7490 + }, + { + "epoch": 1.29, + "grad_norm": 10.153877258300781, + "learning_rate": 1.0716492191522225e-05, + "loss": 0.5224, + "step": 7491 + }, + { + "epoch": 1.29, + "grad_norm": 8.353551864624023, + "learning_rate": 1.071391796807963e-05, + "loss": 0.4199, + "step": 7492 + }, + { + "epoch": 1.29, + "grad_norm": 8.87924861907959, + "learning_rate": 1.0711343744637035e-05, + "loss": 0.2971, + "step": 7493 + }, + { + "epoch": 1.29, + "grad_norm": 10.318202018737793, + "learning_rate": 1.070876952119444e-05, + "loss": 0.3374, + "step": 7494 + }, + { + "epoch": 1.29, + "grad_norm": 13.892108917236328, + "learning_rate": 1.0706195297751846e-05, + "loss": 0.6273, + "step": 7495 + }, + { + "epoch": 1.29, + "grad_norm": 10.575667381286621, + "learning_rate": 1.0703621074309251e-05, + "loss": 0.462, + "step": 7496 + }, + { + "epoch": 1.29, + "grad_norm": 13.327213287353516, + "learning_rate": 1.0701046850866656e-05, + "loss": 0.6198, + "step": 7497 + }, + { + "epoch": 1.29, + "grad_norm": 7.397997856140137, + "learning_rate": 1.069847262742406e-05, + "loss": 0.3286, + "step": 7498 + }, + { + "epoch": 1.29, + "grad_norm": 13.168190956115723, + "learning_rate": 1.0695898403981466e-05, + "loss": 0.5473, + "step": 7499 + }, + { + "epoch": 1.29, + "grad_norm": 10.656102180480957, + "learning_rate": 1.0693324180538871e-05, + "loss": 0.6185, + "step": 7500 + }, + { + "epoch": 1.29, + "grad_norm": 8.660324096679688, + "learning_rate": 1.0690749957096276e-05, + "loss": 0.4035, + "step": 7501 + }, + { + "epoch": 1.29, + "grad_norm": 14.037286758422852, + "learning_rate": 1.0688175733653681e-05, + "loss": 0.4422, + "step": 7502 + }, + { + "epoch": 1.29, + "grad_norm": 9.997214317321777, + "learning_rate": 1.0685601510211086e-05, + "loss": 0.4572, + "step": 7503 + }, + { + "epoch": 1.29, + "grad_norm": 10.286606788635254, + "learning_rate": 1.0683027286768493e-05, + "loss": 0.4222, + "step": 7504 + }, + { + "epoch": 1.29, + "grad_norm": 9.196907043457031, + "learning_rate": 1.0680453063325898e-05, + "loss": 0.5055, + "step": 7505 + }, + { + "epoch": 1.29, + "grad_norm": 9.247546195983887, + "learning_rate": 1.0677878839883303e-05, + "loss": 0.5824, + "step": 7506 + }, + { + "epoch": 1.29, + "grad_norm": 16.00897979736328, + "learning_rate": 1.0675304616440708e-05, + "loss": 0.5158, + "step": 7507 + }, + { + "epoch": 1.29, + "grad_norm": 15.000876426696777, + "learning_rate": 1.0672730392998111e-05, + "loss": 0.902, + "step": 7508 + }, + { + "epoch": 1.29, + "grad_norm": 10.657307624816895, + "learning_rate": 1.0670156169555518e-05, + "loss": 0.2476, + "step": 7509 + }, + { + "epoch": 1.29, + "grad_norm": 8.91662883758545, + "learning_rate": 1.0667581946112923e-05, + "loss": 0.5422, + "step": 7510 + }, + { + "epoch": 1.29, + "grad_norm": 9.896361351013184, + "learning_rate": 1.0665007722670328e-05, + "loss": 0.6262, + "step": 7511 + }, + { + "epoch": 1.29, + "grad_norm": 10.007389068603516, + "learning_rate": 1.0662433499227733e-05, + "loss": 0.5553, + "step": 7512 + }, + { + "epoch": 1.29, + "grad_norm": 10.739877700805664, + "learning_rate": 1.0659859275785138e-05, + "loss": 0.3934, + "step": 7513 + }, + { + "epoch": 1.29, + "grad_norm": 10.095998764038086, + "learning_rate": 1.0657285052342544e-05, + "loss": 0.3749, + "step": 7514 + }, + { + "epoch": 1.29, + "grad_norm": 8.805936813354492, + "learning_rate": 1.065471082889995e-05, + "loss": 0.4699, + "step": 7515 + }, + { + "epoch": 1.29, + "grad_norm": 9.338393211364746, + "learning_rate": 1.0652136605457354e-05, + "loss": 0.5155, + "step": 7516 + }, + { + "epoch": 1.29, + "grad_norm": 6.030548572540283, + "learning_rate": 1.064956238201476e-05, + "loss": 0.2224, + "step": 7517 + }, + { + "epoch": 1.29, + "grad_norm": 8.219165802001953, + "learning_rate": 1.0646988158572164e-05, + "loss": 0.447, + "step": 7518 + }, + { + "epoch": 1.29, + "grad_norm": 8.641654014587402, + "learning_rate": 1.064441393512957e-05, + "loss": 0.4007, + "step": 7519 + }, + { + "epoch": 1.29, + "grad_norm": 9.736027717590332, + "learning_rate": 1.0641839711686974e-05, + "loss": 0.4508, + "step": 7520 + }, + { + "epoch": 1.29, + "grad_norm": 8.524602890014648, + "learning_rate": 1.063926548824438e-05, + "loss": 0.3543, + "step": 7521 + }, + { + "epoch": 1.29, + "grad_norm": 12.188543319702148, + "learning_rate": 1.0636691264801784e-05, + "loss": 0.4827, + "step": 7522 + }, + { + "epoch": 1.29, + "grad_norm": 10.921015739440918, + "learning_rate": 1.0634117041359191e-05, + "loss": 0.4687, + "step": 7523 + }, + { + "epoch": 1.29, + "grad_norm": 12.920520782470703, + "learning_rate": 1.0631542817916596e-05, + "loss": 0.496, + "step": 7524 + }, + { + "epoch": 1.29, + "grad_norm": 11.996758460998535, + "learning_rate": 1.0628968594474001e-05, + "loss": 0.5602, + "step": 7525 + }, + { + "epoch": 1.29, + "grad_norm": 12.540448188781738, + "learning_rate": 1.0626394371031406e-05, + "loss": 0.5454, + "step": 7526 + }, + { + "epoch": 1.29, + "grad_norm": 8.803266525268555, + "learning_rate": 1.0623820147588811e-05, + "loss": 0.4147, + "step": 7527 + }, + { + "epoch": 1.29, + "grad_norm": 12.970677375793457, + "learning_rate": 1.0621245924146218e-05, + "loss": 0.3776, + "step": 7528 + }, + { + "epoch": 1.29, + "grad_norm": 9.074735641479492, + "learning_rate": 1.0618671700703621e-05, + "loss": 0.4929, + "step": 7529 + }, + { + "epoch": 1.29, + "grad_norm": 11.672462463378906, + "learning_rate": 1.0616097477261026e-05, + "loss": 0.4758, + "step": 7530 + }, + { + "epoch": 1.29, + "grad_norm": 12.037508964538574, + "learning_rate": 1.061352325381843e-05, + "loss": 0.6194, + "step": 7531 + }, + { + "epoch": 1.29, + "grad_norm": 11.949732780456543, + "learning_rate": 1.0610949030375836e-05, + "loss": 0.8195, + "step": 7532 + }, + { + "epoch": 1.29, + "grad_norm": 10.576162338256836, + "learning_rate": 1.0608374806933243e-05, + "loss": 0.6324, + "step": 7533 + }, + { + "epoch": 1.29, + "grad_norm": 7.948009967803955, + "learning_rate": 1.0605800583490647e-05, + "loss": 0.3754, + "step": 7534 + }, + { + "epoch": 1.29, + "grad_norm": 8.722002029418945, + "learning_rate": 1.0603226360048052e-05, + "loss": 0.4788, + "step": 7535 + }, + { + "epoch": 1.29, + "grad_norm": 11.692331314086914, + "learning_rate": 1.0600652136605457e-05, + "loss": 0.4046, + "step": 7536 + }, + { + "epoch": 1.29, + "grad_norm": 7.092635154724121, + "learning_rate": 1.0598077913162864e-05, + "loss": 0.3155, + "step": 7537 + }, + { + "epoch": 1.29, + "grad_norm": 12.974721908569336, + "learning_rate": 1.0595503689720269e-05, + "loss": 0.4394, + "step": 7538 + }, + { + "epoch": 1.29, + "grad_norm": 8.534664154052734, + "learning_rate": 1.0592929466277672e-05, + "loss": 0.3509, + "step": 7539 + }, + { + "epoch": 1.29, + "grad_norm": 10.622676849365234, + "learning_rate": 1.0590355242835077e-05, + "loss": 0.4098, + "step": 7540 + }, + { + "epoch": 1.29, + "grad_norm": 8.259799003601074, + "learning_rate": 1.0587781019392482e-05, + "loss": 0.3832, + "step": 7541 + }, + { + "epoch": 1.29, + "grad_norm": 7.262960433959961, + "learning_rate": 1.0585206795949889e-05, + "loss": 0.2735, + "step": 7542 + }, + { + "epoch": 1.29, + "grad_norm": 12.705814361572266, + "learning_rate": 1.0582632572507294e-05, + "loss": 0.4398, + "step": 7543 + }, + { + "epoch": 1.29, + "grad_norm": 10.44537353515625, + "learning_rate": 1.0580058349064699e-05, + "loss": 0.4076, + "step": 7544 + }, + { + "epoch": 1.29, + "grad_norm": 14.455634117126465, + "learning_rate": 1.0577484125622104e-05, + "loss": 0.5374, + "step": 7545 + }, + { + "epoch": 1.3, + "grad_norm": 11.263714790344238, + "learning_rate": 1.0574909902179509e-05, + "loss": 0.5075, + "step": 7546 + }, + { + "epoch": 1.3, + "grad_norm": 9.596541404724121, + "learning_rate": 1.0572335678736916e-05, + "loss": 0.5074, + "step": 7547 + }, + { + "epoch": 1.3, + "grad_norm": 13.088547706604004, + "learning_rate": 1.056976145529432e-05, + "loss": 0.6784, + "step": 7548 + }, + { + "epoch": 1.3, + "grad_norm": 10.327363967895508, + "learning_rate": 1.0567187231851726e-05, + "loss": 0.5328, + "step": 7549 + }, + { + "epoch": 1.3, + "grad_norm": 11.240397453308105, + "learning_rate": 1.0564613008409129e-05, + "loss": 0.493, + "step": 7550 + }, + { + "epoch": 1.3, + "grad_norm": 15.778910636901855, + "learning_rate": 1.0562038784966536e-05, + "loss": 0.5793, + "step": 7551 + }, + { + "epoch": 1.3, + "grad_norm": 11.707723617553711, + "learning_rate": 1.055946456152394e-05, + "loss": 0.4145, + "step": 7552 + }, + { + "epoch": 1.3, + "grad_norm": 7.224451541900635, + "learning_rate": 1.0556890338081346e-05, + "loss": 0.3317, + "step": 7553 + }, + { + "epoch": 1.3, + "grad_norm": 8.40455150604248, + "learning_rate": 1.055431611463875e-05, + "loss": 0.2934, + "step": 7554 + }, + { + "epoch": 1.3, + "grad_norm": 9.57699203491211, + "learning_rate": 1.0551741891196156e-05, + "loss": 0.4265, + "step": 7555 + }, + { + "epoch": 1.3, + "grad_norm": 10.641803741455078, + "learning_rate": 1.0549167667753562e-05, + "loss": 0.7676, + "step": 7556 + }, + { + "epoch": 1.3, + "grad_norm": 8.282593727111816, + "learning_rate": 1.0546593444310967e-05, + "loss": 0.4041, + "step": 7557 + }, + { + "epoch": 1.3, + "grad_norm": 11.14730453491211, + "learning_rate": 1.0544019220868372e-05, + "loss": 0.6872, + "step": 7558 + }, + { + "epoch": 1.3, + "grad_norm": 13.139691352844238, + "learning_rate": 1.0541444997425777e-05, + "loss": 0.7045, + "step": 7559 + }, + { + "epoch": 1.3, + "grad_norm": 9.67737102508545, + "learning_rate": 1.053887077398318e-05, + "loss": 0.5271, + "step": 7560 + }, + { + "epoch": 1.3, + "grad_norm": 11.156164169311523, + "learning_rate": 1.0536296550540587e-05, + "loss": 0.4915, + "step": 7561 + }, + { + "epoch": 1.3, + "grad_norm": 13.528146743774414, + "learning_rate": 1.0533722327097992e-05, + "loss": 0.5988, + "step": 7562 + }, + { + "epoch": 1.3, + "grad_norm": 10.946915626525879, + "learning_rate": 1.0531148103655397e-05, + "loss": 0.4204, + "step": 7563 + }, + { + "epoch": 1.3, + "grad_norm": 11.51050090789795, + "learning_rate": 1.0528573880212802e-05, + "loss": 0.5621, + "step": 7564 + }, + { + "epoch": 1.3, + "grad_norm": 14.574899673461914, + "learning_rate": 1.0525999656770207e-05, + "loss": 0.4502, + "step": 7565 + }, + { + "epoch": 1.3, + "grad_norm": 9.043357849121094, + "learning_rate": 1.0523425433327614e-05, + "loss": 0.4212, + "step": 7566 + }, + { + "epoch": 1.3, + "grad_norm": 8.919164657592773, + "learning_rate": 1.0520851209885019e-05, + "loss": 0.3513, + "step": 7567 + }, + { + "epoch": 1.3, + "grad_norm": 10.325124740600586, + "learning_rate": 1.0518276986442424e-05, + "loss": 0.4333, + "step": 7568 + }, + { + "epoch": 1.3, + "grad_norm": 8.761919021606445, + "learning_rate": 1.0515702762999829e-05, + "loss": 0.4765, + "step": 7569 + }, + { + "epoch": 1.3, + "grad_norm": 8.399340629577637, + "learning_rate": 1.0513128539557234e-05, + "loss": 0.4614, + "step": 7570 + }, + { + "epoch": 1.3, + "grad_norm": 12.49824047088623, + "learning_rate": 1.0510554316114639e-05, + "loss": 0.7209, + "step": 7571 + }, + { + "epoch": 1.3, + "grad_norm": 11.914290428161621, + "learning_rate": 1.0507980092672044e-05, + "loss": 0.4861, + "step": 7572 + }, + { + "epoch": 1.3, + "grad_norm": 8.58306884765625, + "learning_rate": 1.0505405869229449e-05, + "loss": 0.3629, + "step": 7573 + }, + { + "epoch": 1.3, + "grad_norm": 8.455018997192383, + "learning_rate": 1.0502831645786854e-05, + "loss": 0.3693, + "step": 7574 + }, + { + "epoch": 1.3, + "grad_norm": 7.3626508712768555, + "learning_rate": 1.050025742234426e-05, + "loss": 0.3849, + "step": 7575 + }, + { + "epoch": 1.3, + "grad_norm": 13.481241226196289, + "learning_rate": 1.0497683198901665e-05, + "loss": 0.6926, + "step": 7576 + }, + { + "epoch": 1.3, + "grad_norm": 9.063183784484863, + "learning_rate": 1.049510897545907e-05, + "loss": 0.4847, + "step": 7577 + }, + { + "epoch": 1.3, + "grad_norm": 10.723499298095703, + "learning_rate": 1.0492534752016475e-05, + "loss": 0.5744, + "step": 7578 + }, + { + "epoch": 1.3, + "grad_norm": 9.72335147857666, + "learning_rate": 1.048996052857388e-05, + "loss": 0.368, + "step": 7579 + }, + { + "epoch": 1.3, + "grad_norm": 12.503802299499512, + "learning_rate": 1.0487386305131287e-05, + "loss": 0.6391, + "step": 7580 + }, + { + "epoch": 1.3, + "grad_norm": 10.515002250671387, + "learning_rate": 1.048481208168869e-05, + "loss": 0.5714, + "step": 7581 + }, + { + "epoch": 1.3, + "grad_norm": 9.594352722167969, + "learning_rate": 1.0482237858246095e-05, + "loss": 0.5666, + "step": 7582 + }, + { + "epoch": 1.3, + "grad_norm": 8.503571510314941, + "learning_rate": 1.04796636348035e-05, + "loss": 0.3397, + "step": 7583 + }, + { + "epoch": 1.3, + "grad_norm": 12.70746898651123, + "learning_rate": 1.0477089411360905e-05, + "loss": 0.3438, + "step": 7584 + }, + { + "epoch": 1.3, + "grad_norm": 13.973519325256348, + "learning_rate": 1.0474515187918312e-05, + "loss": 0.5933, + "step": 7585 + }, + { + "epoch": 1.3, + "grad_norm": 12.70559310913086, + "learning_rate": 1.0471940964475717e-05, + "loss": 0.5616, + "step": 7586 + }, + { + "epoch": 1.3, + "grad_norm": 11.801112174987793, + "learning_rate": 1.0469366741033122e-05, + "loss": 0.6084, + "step": 7587 + }, + { + "epoch": 1.3, + "grad_norm": 9.673832893371582, + "learning_rate": 1.0466792517590527e-05, + "loss": 0.5221, + "step": 7588 + }, + { + "epoch": 1.3, + "grad_norm": 11.244690895080566, + "learning_rate": 1.0464218294147934e-05, + "loss": 0.5931, + "step": 7589 + }, + { + "epoch": 1.3, + "grad_norm": 8.470046043395996, + "learning_rate": 1.0461644070705339e-05, + "loss": 0.3065, + "step": 7590 + }, + { + "epoch": 1.3, + "grad_norm": 9.129098892211914, + "learning_rate": 1.0459069847262742e-05, + "loss": 0.3822, + "step": 7591 + }, + { + "epoch": 1.3, + "grad_norm": 10.066459655761719, + "learning_rate": 1.0456495623820147e-05, + "loss": 0.5738, + "step": 7592 + }, + { + "epoch": 1.3, + "grad_norm": 11.978337287902832, + "learning_rate": 1.0453921400377552e-05, + "loss": 0.5539, + "step": 7593 + }, + { + "epoch": 1.3, + "grad_norm": 12.200026512145996, + "learning_rate": 1.0451347176934959e-05, + "loss": 0.509, + "step": 7594 + }, + { + "epoch": 1.3, + "grad_norm": 12.632519721984863, + "learning_rate": 1.0448772953492364e-05, + "loss": 0.5611, + "step": 7595 + }, + { + "epoch": 1.3, + "grad_norm": 7.198501110076904, + "learning_rate": 1.0446198730049769e-05, + "loss": 0.338, + "step": 7596 + }, + { + "epoch": 1.3, + "grad_norm": 14.813353538513184, + "learning_rate": 1.0443624506607174e-05, + "loss": 0.812, + "step": 7597 + }, + { + "epoch": 1.3, + "grad_norm": 11.40137004852295, + "learning_rate": 1.0441050283164579e-05, + "loss": 0.6986, + "step": 7598 + }, + { + "epoch": 1.3, + "grad_norm": 10.052644729614258, + "learning_rate": 1.0438476059721985e-05, + "loss": 0.402, + "step": 7599 + }, + { + "epoch": 1.3, + "grad_norm": 11.621946334838867, + "learning_rate": 1.043590183627939e-05, + "loss": 0.5718, + "step": 7600 + }, + { + "epoch": 1.3, + "grad_norm": 8.031299591064453, + "learning_rate": 1.0433327612836795e-05, + "loss": 0.2937, + "step": 7601 + }, + { + "epoch": 1.3, + "grad_norm": 6.721993446350098, + "learning_rate": 1.0430753389394198e-05, + "loss": 0.3963, + "step": 7602 + }, + { + "epoch": 1.3, + "grad_norm": 9.292656898498535, + "learning_rate": 1.0428179165951603e-05, + "loss": 0.5122, + "step": 7603 + }, + { + "epoch": 1.3, + "grad_norm": 13.778241157531738, + "learning_rate": 1.042560494250901e-05, + "loss": 0.5224, + "step": 7604 + }, + { + "epoch": 1.31, + "grad_norm": 12.720760345458984, + "learning_rate": 1.0423030719066415e-05, + "loss": 0.5592, + "step": 7605 + }, + { + "epoch": 1.31, + "grad_norm": 12.58338737487793, + "learning_rate": 1.042045649562382e-05, + "loss": 0.5383, + "step": 7606 + }, + { + "epoch": 1.31, + "grad_norm": 12.038021087646484, + "learning_rate": 1.0417882272181225e-05, + "loss": 0.6423, + "step": 7607 + }, + { + "epoch": 1.31, + "grad_norm": 11.134819984436035, + "learning_rate": 1.0415308048738632e-05, + "loss": 0.6105, + "step": 7608 + }, + { + "epoch": 1.31, + "grad_norm": 10.333404541015625, + "learning_rate": 1.0412733825296037e-05, + "loss": 0.6074, + "step": 7609 + }, + { + "epoch": 1.31, + "grad_norm": 8.601170539855957, + "learning_rate": 1.0410159601853442e-05, + "loss": 0.4002, + "step": 7610 + }, + { + "epoch": 1.31, + "grad_norm": 10.287076950073242, + "learning_rate": 1.0407585378410847e-05, + "loss": 0.5329, + "step": 7611 + }, + { + "epoch": 1.31, + "grad_norm": 8.492240905761719, + "learning_rate": 1.040501115496825e-05, + "loss": 0.4515, + "step": 7612 + }, + { + "epoch": 1.31, + "grad_norm": 9.017926216125488, + "learning_rate": 1.0402436931525657e-05, + "loss": 0.4445, + "step": 7613 + }, + { + "epoch": 1.31, + "grad_norm": 9.896016120910645, + "learning_rate": 1.0399862708083062e-05, + "loss": 0.5659, + "step": 7614 + }, + { + "epoch": 1.31, + "grad_norm": 9.269497871398926, + "learning_rate": 1.0397288484640467e-05, + "loss": 0.5041, + "step": 7615 + }, + { + "epoch": 1.31, + "grad_norm": 10.545845985412598, + "learning_rate": 1.0394714261197872e-05, + "loss": 0.6307, + "step": 7616 + }, + { + "epoch": 1.31, + "grad_norm": 11.45733642578125, + "learning_rate": 1.0392140037755277e-05, + "loss": 0.5388, + "step": 7617 + }, + { + "epoch": 1.31, + "grad_norm": 9.823169708251953, + "learning_rate": 1.0389565814312683e-05, + "loss": 0.4665, + "step": 7618 + }, + { + "epoch": 1.31, + "grad_norm": 8.971879959106445, + "learning_rate": 1.0386991590870088e-05, + "loss": 0.461, + "step": 7619 + }, + { + "epoch": 1.31, + "grad_norm": 9.590045928955078, + "learning_rate": 1.0384417367427493e-05, + "loss": 0.4452, + "step": 7620 + }, + { + "epoch": 1.31, + "grad_norm": 12.817612648010254, + "learning_rate": 1.0381843143984898e-05, + "loss": 0.6212, + "step": 7621 + }, + { + "epoch": 1.31, + "grad_norm": 11.876532554626465, + "learning_rate": 1.0379268920542303e-05, + "loss": 0.5217, + "step": 7622 + }, + { + "epoch": 1.31, + "grad_norm": 11.179770469665527, + "learning_rate": 1.0376694697099708e-05, + "loss": 0.6029, + "step": 7623 + }, + { + "epoch": 1.31, + "grad_norm": 11.677385330200195, + "learning_rate": 1.0374120473657113e-05, + "loss": 0.6449, + "step": 7624 + }, + { + "epoch": 1.31, + "grad_norm": 10.10698413848877, + "learning_rate": 1.0371546250214518e-05, + "loss": 0.494, + "step": 7625 + }, + { + "epoch": 1.31, + "grad_norm": 9.941584587097168, + "learning_rate": 1.0368972026771923e-05, + "loss": 0.5538, + "step": 7626 + }, + { + "epoch": 1.31, + "grad_norm": 9.517629623413086, + "learning_rate": 1.036639780332933e-05, + "loss": 0.3846, + "step": 7627 + }, + { + "epoch": 1.31, + "grad_norm": 10.007301330566406, + "learning_rate": 1.0363823579886735e-05, + "loss": 0.5212, + "step": 7628 + }, + { + "epoch": 1.31, + "grad_norm": 12.88581657409668, + "learning_rate": 1.036124935644414e-05, + "loss": 0.6526, + "step": 7629 + }, + { + "epoch": 1.31, + "grad_norm": 16.389724731445312, + "learning_rate": 1.0358675133001545e-05, + "loss": 0.745, + "step": 7630 + }, + { + "epoch": 1.31, + "grad_norm": 11.649310111999512, + "learning_rate": 1.035610090955895e-05, + "loss": 0.5522, + "step": 7631 + }, + { + "epoch": 1.31, + "grad_norm": 7.6701340675354, + "learning_rate": 1.0353526686116357e-05, + "loss": 0.287, + "step": 7632 + }, + { + "epoch": 1.31, + "grad_norm": 9.678239822387695, + "learning_rate": 1.035095246267376e-05, + "loss": 0.5347, + "step": 7633 + }, + { + "epoch": 1.31, + "grad_norm": 9.223403930664062, + "learning_rate": 1.0348378239231165e-05, + "loss": 0.4687, + "step": 7634 + }, + { + "epoch": 1.31, + "grad_norm": 8.723023414611816, + "learning_rate": 1.034580401578857e-05, + "loss": 0.4222, + "step": 7635 + }, + { + "epoch": 1.31, + "grad_norm": 10.584243774414062, + "learning_rate": 1.0343229792345975e-05, + "loss": 0.6159, + "step": 7636 + }, + { + "epoch": 1.31, + "grad_norm": 10.441088676452637, + "learning_rate": 1.0340655568903382e-05, + "loss": 0.5008, + "step": 7637 + }, + { + "epoch": 1.31, + "grad_norm": 10.49659538269043, + "learning_rate": 1.0338081345460787e-05, + "loss": 0.668, + "step": 7638 + }, + { + "epoch": 1.31, + "grad_norm": 9.211221694946289, + "learning_rate": 1.0335507122018191e-05, + "loss": 0.3276, + "step": 7639 + }, + { + "epoch": 1.31, + "grad_norm": 9.020726203918457, + "learning_rate": 1.0332932898575596e-05, + "loss": 0.4316, + "step": 7640 + }, + { + "epoch": 1.31, + "grad_norm": 15.816468238830566, + "learning_rate": 1.0330358675133003e-05, + "loss": 0.4717, + "step": 7641 + }, + { + "epoch": 1.31, + "grad_norm": 12.137051582336426, + "learning_rate": 1.0327784451690408e-05, + "loss": 0.5587, + "step": 7642 + }, + { + "epoch": 1.31, + "grad_norm": 13.667003631591797, + "learning_rate": 1.0325210228247811e-05, + "loss": 0.6343, + "step": 7643 + }, + { + "epoch": 1.31, + "grad_norm": 10.340344429016113, + "learning_rate": 1.0322636004805216e-05, + "loss": 0.3575, + "step": 7644 + }, + { + "epoch": 1.31, + "grad_norm": 8.79090404510498, + "learning_rate": 1.0320061781362621e-05, + "loss": 0.4042, + "step": 7645 + }, + { + "epoch": 1.31, + "grad_norm": 12.056097984313965, + "learning_rate": 1.0317487557920028e-05, + "loss": 0.3421, + "step": 7646 + }, + { + "epoch": 1.31, + "grad_norm": 10.9146146774292, + "learning_rate": 1.0314913334477433e-05, + "loss": 0.5977, + "step": 7647 + }, + { + "epoch": 1.31, + "grad_norm": 13.455370903015137, + "learning_rate": 1.0312339111034838e-05, + "loss": 0.4442, + "step": 7648 + }, + { + "epoch": 1.31, + "grad_norm": 12.547144889831543, + "learning_rate": 1.0309764887592243e-05, + "loss": 0.4097, + "step": 7649 + }, + { + "epoch": 1.31, + "grad_norm": 12.367593765258789, + "learning_rate": 1.0307190664149648e-05, + "loss": 0.4782, + "step": 7650 + }, + { + "epoch": 1.31, + "grad_norm": 13.46049976348877, + "learning_rate": 1.0304616440707055e-05, + "loss": 0.6173, + "step": 7651 + }, + { + "epoch": 1.31, + "grad_norm": 10.880172729492188, + "learning_rate": 1.030204221726446e-05, + "loss": 0.4819, + "step": 7652 + }, + { + "epoch": 1.31, + "grad_norm": 12.555892944335938, + "learning_rate": 1.0299467993821865e-05, + "loss": 0.5615, + "step": 7653 + }, + { + "epoch": 1.31, + "grad_norm": 10.010433197021484, + "learning_rate": 1.0296893770379268e-05, + "loss": 0.3736, + "step": 7654 + }, + { + "epoch": 1.31, + "grad_norm": 11.676969528198242, + "learning_rate": 1.0294319546936673e-05, + "loss": 0.4007, + "step": 7655 + }, + { + "epoch": 1.31, + "grad_norm": 11.463154792785645, + "learning_rate": 1.029174532349408e-05, + "loss": 0.6578, + "step": 7656 + }, + { + "epoch": 1.31, + "grad_norm": 9.027267456054688, + "learning_rate": 1.0289171100051485e-05, + "loss": 0.31, + "step": 7657 + }, + { + "epoch": 1.31, + "grad_norm": 13.18458080291748, + "learning_rate": 1.028659687660889e-05, + "loss": 0.3947, + "step": 7658 + }, + { + "epoch": 1.31, + "grad_norm": 9.778727531433105, + "learning_rate": 1.0284022653166295e-05, + "loss": 0.4626, + "step": 7659 + }, + { + "epoch": 1.31, + "grad_norm": 13.715446472167969, + "learning_rate": 1.0281448429723701e-05, + "loss": 0.5173, + "step": 7660 + }, + { + "epoch": 1.31, + "grad_norm": 11.753809928894043, + "learning_rate": 1.0278874206281106e-05, + "loss": 0.3613, + "step": 7661 + }, + { + "epoch": 1.31, + "grad_norm": 13.290806770324707, + "learning_rate": 1.0276299982838511e-05, + "loss": 0.4105, + "step": 7662 + }, + { + "epoch": 1.32, + "grad_norm": 11.830404281616211, + "learning_rate": 1.0273725759395916e-05, + "loss": 0.4879, + "step": 7663 + }, + { + "epoch": 1.32, + "grad_norm": 11.855487823486328, + "learning_rate": 1.027115153595332e-05, + "loss": 0.4888, + "step": 7664 + }, + { + "epoch": 1.32, + "grad_norm": 11.879138946533203, + "learning_rate": 1.0268577312510726e-05, + "loss": 0.4953, + "step": 7665 + }, + { + "epoch": 1.32, + "grad_norm": 14.021116256713867, + "learning_rate": 1.0266003089068131e-05, + "loss": 0.5683, + "step": 7666 + }, + { + "epoch": 1.32, + "grad_norm": 8.994811058044434, + "learning_rate": 1.0263428865625536e-05, + "loss": 0.5198, + "step": 7667 + }, + { + "epoch": 1.32, + "grad_norm": 11.031010627746582, + "learning_rate": 1.0260854642182941e-05, + "loss": 0.6681, + "step": 7668 + }, + { + "epoch": 1.32, + "grad_norm": 13.21917724609375, + "learning_rate": 1.0258280418740346e-05, + "loss": 0.4113, + "step": 7669 + }, + { + "epoch": 1.32, + "grad_norm": 12.923477172851562, + "learning_rate": 1.0255706195297753e-05, + "loss": 0.5717, + "step": 7670 + }, + { + "epoch": 1.32, + "grad_norm": 16.71715545654297, + "learning_rate": 1.0253131971855158e-05, + "loss": 0.6811, + "step": 7671 + }, + { + "epoch": 1.32, + "grad_norm": 8.599434852600098, + "learning_rate": 1.0250557748412563e-05, + "loss": 0.4114, + "step": 7672 + }, + { + "epoch": 1.32, + "grad_norm": 9.938738822937012, + "learning_rate": 1.0247983524969968e-05, + "loss": 0.4104, + "step": 7673 + }, + { + "epoch": 1.32, + "grad_norm": 13.35966968536377, + "learning_rate": 1.0245409301527373e-05, + "loss": 0.5351, + "step": 7674 + }, + { + "epoch": 1.32, + "grad_norm": 11.487553596496582, + "learning_rate": 1.0242835078084778e-05, + "loss": 0.3632, + "step": 7675 + }, + { + "epoch": 1.32, + "grad_norm": 12.132365226745605, + "learning_rate": 1.0240260854642183e-05, + "loss": 0.5049, + "step": 7676 + }, + { + "epoch": 1.32, + "grad_norm": 12.248384475708008, + "learning_rate": 1.0237686631199588e-05, + "loss": 0.7382, + "step": 7677 + }, + { + "epoch": 1.32, + "grad_norm": 12.453958511352539, + "learning_rate": 1.0235112407756993e-05, + "loss": 0.6043, + "step": 7678 + }, + { + "epoch": 1.32, + "grad_norm": 12.014633178710938, + "learning_rate": 1.02325381843144e-05, + "loss": 0.5675, + "step": 7679 + }, + { + "epoch": 1.32, + "grad_norm": 13.388157844543457, + "learning_rate": 1.0229963960871804e-05, + "loss": 0.7631, + "step": 7680 + }, + { + "epoch": 1.32, + "grad_norm": 8.168139457702637, + "learning_rate": 1.022738973742921e-05, + "loss": 0.3885, + "step": 7681 + }, + { + "epoch": 1.32, + "grad_norm": 9.904991149902344, + "learning_rate": 1.0224815513986614e-05, + "loss": 0.6153, + "step": 7682 + }, + { + "epoch": 1.32, + "grad_norm": 8.918192863464355, + "learning_rate": 1.022224129054402e-05, + "loss": 0.4525, + "step": 7683 + }, + { + "epoch": 1.32, + "grad_norm": 8.1475191116333, + "learning_rate": 1.0219667067101426e-05, + "loss": 0.5083, + "step": 7684 + }, + { + "epoch": 1.32, + "grad_norm": 12.759369850158691, + "learning_rate": 1.021709284365883e-05, + "loss": 0.4407, + "step": 7685 + }, + { + "epoch": 1.32, + "grad_norm": 8.438612937927246, + "learning_rate": 1.0214518620216234e-05, + "loss": 0.3931, + "step": 7686 + }, + { + "epoch": 1.32, + "grad_norm": 9.576518058776855, + "learning_rate": 1.021194439677364e-05, + "loss": 0.5469, + "step": 7687 + }, + { + "epoch": 1.32, + "grad_norm": 11.180768966674805, + "learning_rate": 1.0209370173331044e-05, + "loss": 0.5146, + "step": 7688 + }, + { + "epoch": 1.32, + "grad_norm": 9.942716598510742, + "learning_rate": 1.0206795949888451e-05, + "loss": 0.5428, + "step": 7689 + }, + { + "epoch": 1.32, + "grad_norm": 9.733922004699707, + "learning_rate": 1.0204221726445856e-05, + "loss": 0.6077, + "step": 7690 + }, + { + "epoch": 1.32, + "grad_norm": 8.93974781036377, + "learning_rate": 1.0201647503003261e-05, + "loss": 0.4856, + "step": 7691 + }, + { + "epoch": 1.32, + "grad_norm": 11.876021385192871, + "learning_rate": 1.0199073279560666e-05, + "loss": 0.5097, + "step": 7692 + }, + { + "epoch": 1.32, + "grad_norm": 13.196867942810059, + "learning_rate": 1.0196499056118073e-05, + "loss": 0.5664, + "step": 7693 + }, + { + "epoch": 1.32, + "grad_norm": 12.308637619018555, + "learning_rate": 1.0193924832675478e-05, + "loss": 0.6109, + "step": 7694 + }, + { + "epoch": 1.32, + "grad_norm": 11.110604286193848, + "learning_rate": 1.0191350609232881e-05, + "loss": 0.478, + "step": 7695 + }, + { + "epoch": 1.32, + "grad_norm": 12.802572250366211, + "learning_rate": 1.0188776385790286e-05, + "loss": 0.4881, + "step": 7696 + }, + { + "epoch": 1.32, + "grad_norm": 10.356598854064941, + "learning_rate": 1.0186202162347691e-05, + "loss": 0.5587, + "step": 7697 + }, + { + "epoch": 1.32, + "grad_norm": 13.789701461791992, + "learning_rate": 1.0183627938905098e-05, + "loss": 0.4952, + "step": 7698 + }, + { + "epoch": 1.32, + "grad_norm": 15.346013069152832, + "learning_rate": 1.0181053715462503e-05, + "loss": 0.6237, + "step": 7699 + }, + { + "epoch": 1.32, + "grad_norm": 11.338967323303223, + "learning_rate": 1.0178479492019908e-05, + "loss": 0.5249, + "step": 7700 + }, + { + "epoch": 1.32, + "grad_norm": 10.160362243652344, + "learning_rate": 1.0175905268577313e-05, + "loss": 0.4278, + "step": 7701 + }, + { + "epoch": 1.32, + "grad_norm": 7.647008419036865, + "learning_rate": 1.0173331045134718e-05, + "loss": 0.3249, + "step": 7702 + }, + { + "epoch": 1.32, + "grad_norm": 13.2664213180542, + "learning_rate": 1.0170756821692124e-05, + "loss": 0.5051, + "step": 7703 + }, + { + "epoch": 1.32, + "grad_norm": 17.073074340820312, + "learning_rate": 1.016818259824953e-05, + "loss": 0.8712, + "step": 7704 + }, + { + "epoch": 1.32, + "grad_norm": 9.206207275390625, + "learning_rate": 1.0165608374806934e-05, + "loss": 0.4776, + "step": 7705 + }, + { + "epoch": 1.32, + "grad_norm": 9.693732261657715, + "learning_rate": 1.0163034151364338e-05, + "loss": 0.5567, + "step": 7706 + }, + { + "epoch": 1.32, + "grad_norm": 9.299224853515625, + "learning_rate": 1.0160459927921742e-05, + "loss": 0.3744, + "step": 7707 + }, + { + "epoch": 1.32, + "grad_norm": 11.106921195983887, + "learning_rate": 1.015788570447915e-05, + "loss": 0.6698, + "step": 7708 + }, + { + "epoch": 1.32, + "grad_norm": 9.241186141967773, + "learning_rate": 1.0155311481036554e-05, + "loss": 0.4848, + "step": 7709 + }, + { + "epoch": 1.32, + "grad_norm": 10.151101112365723, + "learning_rate": 1.0152737257593959e-05, + "loss": 0.4434, + "step": 7710 + }, + { + "epoch": 1.32, + "grad_norm": 12.867897033691406, + "learning_rate": 1.0150163034151364e-05, + "loss": 0.663, + "step": 7711 + }, + { + "epoch": 1.32, + "grad_norm": 8.96371841430664, + "learning_rate": 1.014758881070877e-05, + "loss": 0.5116, + "step": 7712 + }, + { + "epoch": 1.32, + "grad_norm": 9.17064094543457, + "learning_rate": 1.0145014587266176e-05, + "loss": 0.3887, + "step": 7713 + }, + { + "epoch": 1.32, + "grad_norm": 10.953055381774902, + "learning_rate": 1.014244036382358e-05, + "loss": 0.6361, + "step": 7714 + }, + { + "epoch": 1.32, + "grad_norm": 11.440086364746094, + "learning_rate": 1.0139866140380986e-05, + "loss": 0.4921, + "step": 7715 + }, + { + "epoch": 1.32, + "grad_norm": 9.911348342895508, + "learning_rate": 1.0137291916938389e-05, + "loss": 0.416, + "step": 7716 + }, + { + "epoch": 1.32, + "grad_norm": 8.459033012390137, + "learning_rate": 1.0134717693495796e-05, + "loss": 0.3508, + "step": 7717 + }, + { + "epoch": 1.32, + "grad_norm": 9.743725776672363, + "learning_rate": 1.01321434700532e-05, + "loss": 0.4247, + "step": 7718 + }, + { + "epoch": 1.32, + "grad_norm": 15.971477508544922, + "learning_rate": 1.0129569246610606e-05, + "loss": 0.6408, + "step": 7719 + }, + { + "epoch": 1.32, + "grad_norm": 10.370979309082031, + "learning_rate": 1.012699502316801e-05, + "loss": 0.5155, + "step": 7720 + }, + { + "epoch": 1.33, + "grad_norm": 11.87381649017334, + "learning_rate": 1.0124420799725416e-05, + "loss": 0.5846, + "step": 7721 + }, + { + "epoch": 1.33, + "grad_norm": 10.795620918273926, + "learning_rate": 1.0121846576282822e-05, + "loss": 0.6393, + "step": 7722 + }, + { + "epoch": 1.33, + "grad_norm": 8.147948265075684, + "learning_rate": 1.0119272352840227e-05, + "loss": 0.4861, + "step": 7723 + }, + { + "epoch": 1.33, + "grad_norm": 12.977652549743652, + "learning_rate": 1.0116698129397632e-05, + "loss": 0.557, + "step": 7724 + }, + { + "epoch": 1.33, + "grad_norm": 10.888734817504883, + "learning_rate": 1.0114123905955037e-05, + "loss": 0.5015, + "step": 7725 + }, + { + "epoch": 1.33, + "grad_norm": 6.407049655914307, + "learning_rate": 1.0111549682512442e-05, + "loss": 0.2436, + "step": 7726 + }, + { + "epoch": 1.33, + "grad_norm": 8.32789421081543, + "learning_rate": 1.0108975459069847e-05, + "loss": 0.3552, + "step": 7727 + }, + { + "epoch": 1.33, + "grad_norm": 10.634115219116211, + "learning_rate": 1.0106401235627252e-05, + "loss": 0.4183, + "step": 7728 + }, + { + "epoch": 1.33, + "grad_norm": 9.24951171875, + "learning_rate": 1.0103827012184657e-05, + "loss": 0.4233, + "step": 7729 + }, + { + "epoch": 1.33, + "grad_norm": 10.46374797821045, + "learning_rate": 1.0101252788742062e-05, + "loss": 0.4374, + "step": 7730 + }, + { + "epoch": 1.33, + "grad_norm": 7.346308708190918, + "learning_rate": 1.0098678565299469e-05, + "loss": 0.4001, + "step": 7731 + }, + { + "epoch": 1.33, + "grad_norm": 12.14107894897461, + "learning_rate": 1.0096104341856874e-05, + "loss": 0.3783, + "step": 7732 + }, + { + "epoch": 1.33, + "grad_norm": 8.576476097106934, + "learning_rate": 1.0093530118414279e-05, + "loss": 0.4913, + "step": 7733 + }, + { + "epoch": 1.33, + "grad_norm": 13.956659317016602, + "learning_rate": 1.0090955894971684e-05, + "loss": 0.5134, + "step": 7734 + }, + { + "epoch": 1.33, + "grad_norm": 11.471206665039062, + "learning_rate": 1.0088381671529089e-05, + "loss": 0.4451, + "step": 7735 + }, + { + "epoch": 1.33, + "grad_norm": 10.790603637695312, + "learning_rate": 1.0085807448086496e-05, + "loss": 0.5092, + "step": 7736 + }, + { + "epoch": 1.33, + "grad_norm": 7.110894203186035, + "learning_rate": 1.0083233224643899e-05, + "loss": 0.3445, + "step": 7737 + }, + { + "epoch": 1.33, + "grad_norm": 8.716071128845215, + "learning_rate": 1.0080659001201304e-05, + "loss": 0.5015, + "step": 7738 + }, + { + "epoch": 1.33, + "grad_norm": 11.373269081115723, + "learning_rate": 1.0078084777758709e-05, + "loss": 0.5107, + "step": 7739 + }, + { + "epoch": 1.33, + "grad_norm": 10.19774341583252, + "learning_rate": 1.0075510554316114e-05, + "loss": 0.3752, + "step": 7740 + }, + { + "epoch": 1.33, + "grad_norm": 8.300834655761719, + "learning_rate": 1.007293633087352e-05, + "loss": 0.444, + "step": 7741 + }, + { + "epoch": 1.33, + "grad_norm": 10.19477367401123, + "learning_rate": 1.0070362107430926e-05, + "loss": 0.4985, + "step": 7742 + }, + { + "epoch": 1.33, + "grad_norm": 10.454253196716309, + "learning_rate": 1.006778788398833e-05, + "loss": 0.399, + "step": 7743 + }, + { + "epoch": 1.33, + "grad_norm": 13.702960968017578, + "learning_rate": 1.0065213660545735e-05, + "loss": 0.4912, + "step": 7744 + }, + { + "epoch": 1.33, + "grad_norm": 11.439407348632812, + "learning_rate": 1.0062639437103142e-05, + "loss": 0.4091, + "step": 7745 + }, + { + "epoch": 1.33, + "grad_norm": 10.278610229492188, + "learning_rate": 1.0060065213660547e-05, + "loss": 0.4811, + "step": 7746 + }, + { + "epoch": 1.33, + "grad_norm": 12.671419143676758, + "learning_rate": 1.005749099021795e-05, + "loss": 0.5109, + "step": 7747 + }, + { + "epoch": 1.33, + "grad_norm": 12.953645706176758, + "learning_rate": 1.0054916766775355e-05, + "loss": 0.4394, + "step": 7748 + }, + { + "epoch": 1.33, + "grad_norm": 10.745628356933594, + "learning_rate": 1.005234254333276e-05, + "loss": 0.5516, + "step": 7749 + }, + { + "epoch": 1.33, + "grad_norm": 12.74183464050293, + "learning_rate": 1.0049768319890167e-05, + "loss": 0.6131, + "step": 7750 + }, + { + "epoch": 1.33, + "grad_norm": 9.532737731933594, + "learning_rate": 1.0047194096447572e-05, + "loss": 0.4789, + "step": 7751 + }, + { + "epoch": 1.33, + "grad_norm": 13.661445617675781, + "learning_rate": 1.0044619873004977e-05, + "loss": 0.4724, + "step": 7752 + }, + { + "epoch": 1.33, + "grad_norm": 9.972615242004395, + "learning_rate": 1.0042045649562382e-05, + "loss": 0.5184, + "step": 7753 + }, + { + "epoch": 1.33, + "grad_norm": 9.499295234680176, + "learning_rate": 1.0039471426119787e-05, + "loss": 0.3339, + "step": 7754 + }, + { + "epoch": 1.33, + "grad_norm": 6.487306594848633, + "learning_rate": 1.0036897202677194e-05, + "loss": 0.2722, + "step": 7755 + }, + { + "epoch": 1.33, + "grad_norm": 10.158554077148438, + "learning_rate": 1.0034322979234599e-05, + "loss": 0.5955, + "step": 7756 + }, + { + "epoch": 1.33, + "grad_norm": 9.339704513549805, + "learning_rate": 1.0031748755792004e-05, + "loss": 0.3881, + "step": 7757 + }, + { + "epoch": 1.33, + "grad_norm": 12.292365074157715, + "learning_rate": 1.0029174532349407e-05, + "loss": 0.6178, + "step": 7758 + }, + { + "epoch": 1.33, + "grad_norm": 10.204378128051758, + "learning_rate": 1.0026600308906812e-05, + "loss": 0.3679, + "step": 7759 + }, + { + "epoch": 1.33, + "grad_norm": 15.968137741088867, + "learning_rate": 1.0024026085464219e-05, + "loss": 0.7739, + "step": 7760 + }, + { + "epoch": 1.33, + "grad_norm": 12.328302383422852, + "learning_rate": 1.0021451862021624e-05, + "loss": 0.4007, + "step": 7761 + }, + { + "epoch": 1.33, + "grad_norm": 10.331490516662598, + "learning_rate": 1.0018877638579029e-05, + "loss": 0.5925, + "step": 7762 + }, + { + "epoch": 1.33, + "grad_norm": 9.456241607666016, + "learning_rate": 1.0016303415136434e-05, + "loss": 0.4873, + "step": 7763 + }, + { + "epoch": 1.33, + "grad_norm": 9.09859848022461, + "learning_rate": 1.001372919169384e-05, + "loss": 0.2885, + "step": 7764 + }, + { + "epoch": 1.33, + "grad_norm": 10.258848190307617, + "learning_rate": 1.0011154968251245e-05, + "loss": 0.4692, + "step": 7765 + }, + { + "epoch": 1.33, + "grad_norm": 14.38225269317627, + "learning_rate": 1.000858074480865e-05, + "loss": 0.7009, + "step": 7766 + }, + { + "epoch": 1.33, + "grad_norm": 9.7805757522583, + "learning_rate": 1.0006006521366055e-05, + "loss": 0.7279, + "step": 7767 + }, + { + "epoch": 1.33, + "grad_norm": 8.964977264404297, + "learning_rate": 1.0003432297923459e-05, + "loss": 0.4061, + "step": 7768 + }, + { + "epoch": 1.33, + "grad_norm": 12.108419418334961, + "learning_rate": 1.0000858074480865e-05, + "loss": 0.4724, + "step": 7769 + }, + { + "epoch": 1.33, + "grad_norm": 14.253042221069336, + "learning_rate": 9.99828385103827e-06, + "loss": 0.7647, + "step": 7770 + }, + { + "epoch": 1.33, + "grad_norm": 11.767439842224121, + "learning_rate": 9.995709627595675e-06, + "loss": 0.5012, + "step": 7771 + }, + { + "epoch": 1.33, + "grad_norm": 11.6414794921875, + "learning_rate": 9.99313540415308e-06, + "loss": 0.4409, + "step": 7772 + }, + { + "epoch": 1.33, + "grad_norm": 11.81783390045166, + "learning_rate": 9.990561180710485e-06, + "loss": 0.4807, + "step": 7773 + }, + { + "epoch": 1.33, + "grad_norm": 10.49046516418457, + "learning_rate": 9.987986957267892e-06, + "loss": 0.4881, + "step": 7774 + }, + { + "epoch": 1.33, + "grad_norm": 8.088298797607422, + "learning_rate": 9.985412733825297e-06, + "loss": 0.3567, + "step": 7775 + }, + { + "epoch": 1.33, + "grad_norm": 9.536469459533691, + "learning_rate": 9.982838510382702e-06, + "loss": 0.5164, + "step": 7776 + }, + { + "epoch": 1.33, + "grad_norm": 9.29196834564209, + "learning_rate": 9.980264286940107e-06, + "loss": 0.3726, + "step": 7777 + }, + { + "epoch": 1.33, + "grad_norm": 12.835789680480957, + "learning_rate": 9.977690063497512e-06, + "loss": 0.724, + "step": 7778 + }, + { + "epoch": 1.33, + "grad_norm": 10.835392951965332, + "learning_rate": 9.975115840054917e-06, + "loss": 0.5318, + "step": 7779 + }, + { + "epoch": 1.34, + "grad_norm": 9.065471649169922, + "learning_rate": 9.972541616612322e-06, + "loss": 0.2848, + "step": 7780 + }, + { + "epoch": 1.34, + "grad_norm": 9.572319030761719, + "learning_rate": 9.969967393169727e-06, + "loss": 0.4961, + "step": 7781 + }, + { + "epoch": 1.34, + "grad_norm": 9.702765464782715, + "learning_rate": 9.967393169727132e-06, + "loss": 0.576, + "step": 7782 + }, + { + "epoch": 1.34, + "grad_norm": 8.804869651794434, + "learning_rate": 9.964818946284538e-06, + "loss": 0.3448, + "step": 7783 + }, + { + "epoch": 1.34, + "grad_norm": 11.074567794799805, + "learning_rate": 9.962244722841943e-06, + "loss": 0.4921, + "step": 7784 + }, + { + "epoch": 1.34, + "grad_norm": 9.345091819763184, + "learning_rate": 9.959670499399348e-06, + "loss": 0.5686, + "step": 7785 + }, + { + "epoch": 1.34, + "grad_norm": 9.821876525878906, + "learning_rate": 9.957096275956753e-06, + "loss": 0.3736, + "step": 7786 + }, + { + "epoch": 1.34, + "grad_norm": 10.495122909545898, + "learning_rate": 9.954522052514158e-06, + "loss": 0.4628, + "step": 7787 + }, + { + "epoch": 1.34, + "grad_norm": 10.096467018127441, + "learning_rate": 9.951947829071565e-06, + "loss": 0.5182, + "step": 7788 + }, + { + "epoch": 1.34, + "grad_norm": 9.715571403503418, + "learning_rate": 9.949373605628968e-06, + "loss": 0.3708, + "step": 7789 + }, + { + "epoch": 1.34, + "grad_norm": 11.558807373046875, + "learning_rate": 9.946799382186373e-06, + "loss": 0.5762, + "step": 7790 + }, + { + "epoch": 1.34, + "grad_norm": 9.3325777053833, + "learning_rate": 9.944225158743778e-06, + "loss": 0.5207, + "step": 7791 + }, + { + "epoch": 1.34, + "grad_norm": 7.696695804595947, + "learning_rate": 9.941650935301183e-06, + "loss": 0.4032, + "step": 7792 + }, + { + "epoch": 1.34, + "grad_norm": 12.648896217346191, + "learning_rate": 9.93907671185859e-06, + "loss": 0.5357, + "step": 7793 + }, + { + "epoch": 1.34, + "grad_norm": 9.576841354370117, + "learning_rate": 9.936502488415995e-06, + "loss": 0.5056, + "step": 7794 + }, + { + "epoch": 1.34, + "grad_norm": 9.248518943786621, + "learning_rate": 9.9339282649734e-06, + "loss": 0.403, + "step": 7795 + }, + { + "epoch": 1.34, + "grad_norm": 12.282525062561035, + "learning_rate": 9.931354041530805e-06, + "loss": 0.4931, + "step": 7796 + }, + { + "epoch": 1.34, + "grad_norm": 9.213623046875, + "learning_rate": 9.928779818088212e-06, + "loss": 0.4309, + "step": 7797 + }, + { + "epoch": 1.34, + "grad_norm": 10.758950233459473, + "learning_rate": 9.926205594645617e-06, + "loss": 0.4596, + "step": 7798 + }, + { + "epoch": 1.34, + "grad_norm": 8.991473197937012, + "learning_rate": 9.92363137120302e-06, + "loss": 0.361, + "step": 7799 + }, + { + "epoch": 1.34, + "grad_norm": 9.212671279907227, + "learning_rate": 9.921057147760425e-06, + "loss": 0.3761, + "step": 7800 + }, + { + "epoch": 1.34, + "grad_norm": 12.036935806274414, + "learning_rate": 9.91848292431783e-06, + "loss": 0.5292, + "step": 7801 + }, + { + "epoch": 1.34, + "grad_norm": 13.830004692077637, + "learning_rate": 9.915908700875237e-06, + "loss": 0.4981, + "step": 7802 + }, + { + "epoch": 1.34, + "grad_norm": 13.893874168395996, + "learning_rate": 9.913334477432642e-06, + "loss": 0.6167, + "step": 7803 + }, + { + "epoch": 1.34, + "grad_norm": 7.476625442504883, + "learning_rate": 9.910760253990047e-06, + "loss": 0.2795, + "step": 7804 + }, + { + "epoch": 1.34, + "grad_norm": 10.794510841369629, + "learning_rate": 9.908186030547452e-06, + "loss": 0.3916, + "step": 7805 + }, + { + "epoch": 1.34, + "grad_norm": 9.44105339050293, + "learning_rate": 9.905611807104857e-06, + "loss": 0.3235, + "step": 7806 + }, + { + "epoch": 1.34, + "grad_norm": 11.826579093933105, + "learning_rate": 9.903037583662263e-06, + "loss": 0.5203, + "step": 7807 + }, + { + "epoch": 1.34, + "grad_norm": 16.6405029296875, + "learning_rate": 9.900463360219668e-06, + "loss": 0.4138, + "step": 7808 + }, + { + "epoch": 1.34, + "grad_norm": 14.390388488769531, + "learning_rate": 9.897889136777073e-06, + "loss": 0.6336, + "step": 7809 + }, + { + "epoch": 1.34, + "grad_norm": 8.235594749450684, + "learning_rate": 9.895314913334477e-06, + "loss": 0.4451, + "step": 7810 + }, + { + "epoch": 1.34, + "grad_norm": 11.92341423034668, + "learning_rate": 9.892740689891882e-06, + "loss": 0.6219, + "step": 7811 + }, + { + "epoch": 1.34, + "grad_norm": 20.03653907775879, + "learning_rate": 9.890166466449288e-06, + "loss": 0.5885, + "step": 7812 + }, + { + "epoch": 1.34, + "grad_norm": 7.7859978675842285, + "learning_rate": 9.887592243006693e-06, + "loss": 0.2507, + "step": 7813 + }, + { + "epoch": 1.34, + "grad_norm": 8.49488353729248, + "learning_rate": 9.885018019564098e-06, + "loss": 0.2834, + "step": 7814 + }, + { + "epoch": 1.34, + "grad_norm": 9.413704872131348, + "learning_rate": 9.882443796121503e-06, + "loss": 0.432, + "step": 7815 + }, + { + "epoch": 1.34, + "grad_norm": 12.287511825561523, + "learning_rate": 9.87986957267891e-06, + "loss": 0.5256, + "step": 7816 + }, + { + "epoch": 1.34, + "grad_norm": 6.119768142700195, + "learning_rate": 9.877295349236315e-06, + "loss": 0.3605, + "step": 7817 + }, + { + "epoch": 1.34, + "grad_norm": 10.733137130737305, + "learning_rate": 9.87472112579372e-06, + "loss": 0.4332, + "step": 7818 + }, + { + "epoch": 1.34, + "grad_norm": 8.899484634399414, + "learning_rate": 9.872146902351125e-06, + "loss": 0.338, + "step": 7819 + }, + { + "epoch": 1.34, + "grad_norm": 13.59583854675293, + "learning_rate": 9.869572678908528e-06, + "loss": 0.6533, + "step": 7820 + }, + { + "epoch": 1.34, + "grad_norm": 9.460033416748047, + "learning_rate": 9.866998455465935e-06, + "loss": 0.6124, + "step": 7821 + }, + { + "epoch": 1.34, + "grad_norm": 9.69294548034668, + "learning_rate": 9.86442423202334e-06, + "loss": 0.5059, + "step": 7822 + }, + { + "epoch": 1.34, + "grad_norm": 13.422442436218262, + "learning_rate": 9.861850008580745e-06, + "loss": 0.5904, + "step": 7823 + }, + { + "epoch": 1.34, + "grad_norm": 14.567588806152344, + "learning_rate": 9.85927578513815e-06, + "loss": 0.6012, + "step": 7824 + }, + { + "epoch": 1.34, + "grad_norm": 9.430335998535156, + "learning_rate": 9.856701561695555e-06, + "loss": 0.3941, + "step": 7825 + }, + { + "epoch": 1.34, + "grad_norm": 11.163117408752441, + "learning_rate": 9.854127338252961e-06, + "loss": 0.7988, + "step": 7826 + }, + { + "epoch": 1.34, + "grad_norm": 10.376513481140137, + "learning_rate": 9.851553114810366e-06, + "loss": 0.4682, + "step": 7827 + }, + { + "epoch": 1.34, + "grad_norm": 12.4733304977417, + "learning_rate": 9.848978891367771e-06, + "loss": 0.4803, + "step": 7828 + }, + { + "epoch": 1.34, + "grad_norm": 11.11894702911377, + "learning_rate": 9.846404667925176e-06, + "loss": 0.5094, + "step": 7829 + }, + { + "epoch": 1.34, + "grad_norm": 10.281590461730957, + "learning_rate": 9.843830444482581e-06, + "loss": 0.5888, + "step": 7830 + }, + { + "epoch": 1.34, + "grad_norm": 10.856283187866211, + "learning_rate": 9.841256221039986e-06, + "loss": 0.6041, + "step": 7831 + }, + { + "epoch": 1.34, + "grad_norm": 10.023073196411133, + "learning_rate": 9.838681997597391e-06, + "loss": 0.336, + "step": 7832 + }, + { + "epoch": 1.34, + "grad_norm": 11.655364990234375, + "learning_rate": 9.836107774154796e-06, + "loss": 0.4137, + "step": 7833 + }, + { + "epoch": 1.34, + "grad_norm": 9.6759033203125, + "learning_rate": 9.833533550712201e-06, + "loss": 0.5383, + "step": 7834 + }, + { + "epoch": 1.34, + "grad_norm": 8.055542945861816, + "learning_rate": 9.830959327269608e-06, + "loss": 0.4429, + "step": 7835 + }, + { + "epoch": 1.34, + "grad_norm": 12.497017860412598, + "learning_rate": 9.828385103827013e-06, + "loss": 0.6196, + "step": 7836 + }, + { + "epoch": 1.34, + "grad_norm": 12.390766143798828, + "learning_rate": 9.825810880384418e-06, + "loss": 0.3786, + "step": 7837 + }, + { + "epoch": 1.35, + "grad_norm": 14.510671615600586, + "learning_rate": 9.823236656941823e-06, + "loss": 0.5683, + "step": 7838 + }, + { + "epoch": 1.35, + "grad_norm": 12.21231460571289, + "learning_rate": 9.820662433499228e-06, + "loss": 0.3756, + "step": 7839 + }, + { + "epoch": 1.35, + "grad_norm": 6.280329704284668, + "learning_rate": 9.818088210056635e-06, + "loss": 0.3279, + "step": 7840 + }, + { + "epoch": 1.35, + "grad_norm": 12.856691360473633, + "learning_rate": 9.815513986614038e-06, + "loss": 0.5336, + "step": 7841 + }, + { + "epoch": 1.35, + "grad_norm": 9.319304466247559, + "learning_rate": 9.812939763171443e-06, + "loss": 0.3987, + "step": 7842 + }, + { + "epoch": 1.35, + "grad_norm": 14.5609712600708, + "learning_rate": 9.810365539728848e-06, + "loss": 0.5165, + "step": 7843 + }, + { + "epoch": 1.35, + "grad_norm": 7.934987545013428, + "learning_rate": 9.807791316286253e-06, + "loss": 0.3615, + "step": 7844 + }, + { + "epoch": 1.35, + "grad_norm": 8.290690422058105, + "learning_rate": 9.80521709284366e-06, + "loss": 0.3594, + "step": 7845 + }, + { + "epoch": 1.35, + "grad_norm": 9.969504356384277, + "learning_rate": 9.802642869401065e-06, + "loss": 0.6047, + "step": 7846 + }, + { + "epoch": 1.35, + "grad_norm": 11.453792572021484, + "learning_rate": 9.80006864595847e-06, + "loss": 0.5547, + "step": 7847 + }, + { + "epoch": 1.35, + "grad_norm": 10.964442253112793, + "learning_rate": 9.797494422515875e-06, + "loss": 0.512, + "step": 7848 + }, + { + "epoch": 1.35, + "grad_norm": 8.762725830078125, + "learning_rate": 9.794920199073281e-06, + "loss": 0.4341, + "step": 7849 + }, + { + "epoch": 1.35, + "grad_norm": 13.61126708984375, + "learning_rate": 9.792345975630686e-06, + "loss": 0.7721, + "step": 7850 + }, + { + "epoch": 1.35, + "grad_norm": 9.368814468383789, + "learning_rate": 9.78977175218809e-06, + "loss": 0.5659, + "step": 7851 + }, + { + "epoch": 1.35, + "grad_norm": 8.620594024658203, + "learning_rate": 9.787197528745494e-06, + "loss": 0.4628, + "step": 7852 + }, + { + "epoch": 1.35, + "grad_norm": 11.326383590698242, + "learning_rate": 9.7846233053029e-06, + "loss": 0.5452, + "step": 7853 + }, + { + "epoch": 1.35, + "grad_norm": 10.69447135925293, + "learning_rate": 9.782049081860306e-06, + "loss": 0.6278, + "step": 7854 + }, + { + "epoch": 1.35, + "grad_norm": 9.626816749572754, + "learning_rate": 9.779474858417711e-06, + "loss": 0.5241, + "step": 7855 + }, + { + "epoch": 1.35, + "grad_norm": 9.496217727661133, + "learning_rate": 9.776900634975116e-06, + "loss": 0.3537, + "step": 7856 + }, + { + "epoch": 1.35, + "grad_norm": 10.960626602172852, + "learning_rate": 9.774326411532521e-06, + "loss": 0.5719, + "step": 7857 + }, + { + "epoch": 1.35, + "grad_norm": 11.040411949157715, + "learning_rate": 9.771752188089926e-06, + "loss": 0.3659, + "step": 7858 + }, + { + "epoch": 1.35, + "grad_norm": 9.668703079223633, + "learning_rate": 9.769177964647333e-06, + "loss": 0.3318, + "step": 7859 + }, + { + "epoch": 1.35, + "grad_norm": 13.654769897460938, + "learning_rate": 9.766603741204738e-06, + "loss": 0.5931, + "step": 7860 + }, + { + "epoch": 1.35, + "grad_norm": 8.421550750732422, + "learning_rate": 9.764029517762143e-06, + "loss": 0.3518, + "step": 7861 + }, + { + "epoch": 1.35, + "grad_norm": 14.402252197265625, + "learning_rate": 9.761455294319546e-06, + "loss": 0.4765, + "step": 7862 + }, + { + "epoch": 1.35, + "grad_norm": 9.763633728027344, + "learning_rate": 9.758881070876951e-06, + "loss": 0.249, + "step": 7863 + }, + { + "epoch": 1.35, + "grad_norm": 11.121054649353027, + "learning_rate": 9.756306847434358e-06, + "loss": 0.4757, + "step": 7864 + }, + { + "epoch": 1.35, + "grad_norm": 10.769430160522461, + "learning_rate": 9.753732623991763e-06, + "loss": 0.5496, + "step": 7865 + }, + { + "epoch": 1.35, + "grad_norm": 11.768556594848633, + "learning_rate": 9.751158400549168e-06, + "loss": 0.6156, + "step": 7866 + }, + { + "epoch": 1.35, + "grad_norm": 11.153548240661621, + "learning_rate": 9.748584177106573e-06, + "loss": 0.6403, + "step": 7867 + }, + { + "epoch": 1.35, + "grad_norm": 11.739583969116211, + "learning_rate": 9.74600995366398e-06, + "loss": 0.5516, + "step": 7868 + }, + { + "epoch": 1.35, + "grad_norm": 11.17764949798584, + "learning_rate": 9.743435730221384e-06, + "loss": 0.4273, + "step": 7869 + }, + { + "epoch": 1.35, + "grad_norm": 9.148964881896973, + "learning_rate": 9.74086150677879e-06, + "loss": 0.4761, + "step": 7870 + }, + { + "epoch": 1.35, + "grad_norm": 12.738555908203125, + "learning_rate": 9.738287283336194e-06, + "loss": 0.561, + "step": 7871 + }, + { + "epoch": 1.35, + "grad_norm": 9.561820030212402, + "learning_rate": 9.735713059893598e-06, + "loss": 0.4002, + "step": 7872 + }, + { + "epoch": 1.35, + "grad_norm": 14.05392837524414, + "learning_rate": 9.733138836451004e-06, + "loss": 0.5664, + "step": 7873 + }, + { + "epoch": 1.35, + "grad_norm": 8.910451889038086, + "learning_rate": 9.73056461300841e-06, + "loss": 0.4485, + "step": 7874 + }, + { + "epoch": 1.35, + "grad_norm": 14.862271308898926, + "learning_rate": 9.727990389565814e-06, + "loss": 0.5186, + "step": 7875 + }, + { + "epoch": 1.35, + "grad_norm": 11.9047212600708, + "learning_rate": 9.72541616612322e-06, + "loss": 0.399, + "step": 7876 + }, + { + "epoch": 1.35, + "grad_norm": 10.42728042602539, + "learning_rate": 9.722841942680624e-06, + "loss": 0.4852, + "step": 7877 + }, + { + "epoch": 1.35, + "grad_norm": 10.16113567352295, + "learning_rate": 9.720267719238031e-06, + "loss": 0.4994, + "step": 7878 + }, + { + "epoch": 1.35, + "grad_norm": 9.269696235656738, + "learning_rate": 9.717693495795436e-06, + "loss": 0.4599, + "step": 7879 + }, + { + "epoch": 1.35, + "grad_norm": 11.118162155151367, + "learning_rate": 9.715119272352841e-06, + "loss": 0.494, + "step": 7880 + }, + { + "epoch": 1.35, + "grad_norm": 12.150639533996582, + "learning_rate": 9.712545048910246e-06, + "loss": 0.4689, + "step": 7881 + }, + { + "epoch": 1.35, + "grad_norm": 9.063127517700195, + "learning_rate": 9.709970825467651e-06, + "loss": 0.485, + "step": 7882 + }, + { + "epoch": 1.35, + "grad_norm": 8.748597145080566, + "learning_rate": 9.707396602025056e-06, + "loss": 0.4731, + "step": 7883 + }, + { + "epoch": 1.35, + "grad_norm": 11.388176918029785, + "learning_rate": 9.70482237858246e-06, + "loss": 0.5719, + "step": 7884 + }, + { + "epoch": 1.35, + "grad_norm": 11.090357780456543, + "learning_rate": 9.702248155139866e-06, + "loss": 0.4772, + "step": 7885 + }, + { + "epoch": 1.35, + "grad_norm": 11.651619911193848, + "learning_rate": 9.69967393169727e-06, + "loss": 0.3959, + "step": 7886 + }, + { + "epoch": 1.35, + "grad_norm": 9.236624717712402, + "learning_rate": 9.697099708254677e-06, + "loss": 0.3725, + "step": 7887 + }, + { + "epoch": 1.35, + "grad_norm": 12.64133071899414, + "learning_rate": 9.694525484812082e-06, + "loss": 0.4929, + "step": 7888 + }, + { + "epoch": 1.35, + "grad_norm": 14.256224632263184, + "learning_rate": 9.691951261369487e-06, + "loss": 0.4727, + "step": 7889 + }, + { + "epoch": 1.35, + "grad_norm": 9.375576972961426, + "learning_rate": 9.689377037926892e-06, + "loss": 0.4567, + "step": 7890 + }, + { + "epoch": 1.35, + "grad_norm": 13.470894813537598, + "learning_rate": 9.686802814484297e-06, + "loss": 0.5636, + "step": 7891 + }, + { + "epoch": 1.35, + "grad_norm": 12.24980354309082, + "learning_rate": 9.684228591041704e-06, + "loss": 0.6526, + "step": 7892 + }, + { + "epoch": 1.35, + "grad_norm": 9.546586990356445, + "learning_rate": 9.681654367599107e-06, + "loss": 0.3554, + "step": 7893 + }, + { + "epoch": 1.35, + "grad_norm": 8.096156120300293, + "learning_rate": 9.679080144156512e-06, + "loss": 0.3018, + "step": 7894 + }, + { + "epoch": 1.35, + "grad_norm": 9.843267440795898, + "learning_rate": 9.676505920713917e-06, + "loss": 0.4242, + "step": 7895 + }, + { + "epoch": 1.36, + "grad_norm": 11.440269470214844, + "learning_rate": 9.673931697271322e-06, + "loss": 0.5666, + "step": 7896 + }, + { + "epoch": 1.36, + "grad_norm": 9.503560066223145, + "learning_rate": 9.671357473828729e-06, + "loss": 0.3638, + "step": 7897 + }, + { + "epoch": 1.36, + "grad_norm": 7.833135604858398, + "learning_rate": 9.668783250386134e-06, + "loss": 0.342, + "step": 7898 + }, + { + "epoch": 1.36, + "grad_norm": 11.183473587036133, + "learning_rate": 9.666209026943539e-06, + "loss": 0.6704, + "step": 7899 + }, + { + "epoch": 1.36, + "grad_norm": 7.344062328338623, + "learning_rate": 9.663634803500944e-06, + "loss": 0.3328, + "step": 7900 + }, + { + "epoch": 1.36, + "grad_norm": 10.75399398803711, + "learning_rate": 9.66106058005835e-06, + "loss": 0.716, + "step": 7901 + }, + { + "epoch": 1.36, + "grad_norm": 10.551011085510254, + "learning_rate": 9.658486356615756e-06, + "loss": 0.4762, + "step": 7902 + }, + { + "epoch": 1.36, + "grad_norm": 13.75141716003418, + "learning_rate": 9.655912133173159e-06, + "loss": 0.5188, + "step": 7903 + }, + { + "epoch": 1.36, + "grad_norm": 11.248319625854492, + "learning_rate": 9.653337909730564e-06, + "loss": 0.4587, + "step": 7904 + }, + { + "epoch": 1.36, + "grad_norm": 7.654538154602051, + "learning_rate": 9.650763686287969e-06, + "loss": 0.3596, + "step": 7905 + }, + { + "epoch": 1.36, + "grad_norm": 14.485015869140625, + "learning_rate": 9.648189462845376e-06, + "loss": 0.7514, + "step": 7906 + }, + { + "epoch": 1.36, + "grad_norm": 9.851784706115723, + "learning_rate": 9.64561523940278e-06, + "loss": 0.3519, + "step": 7907 + }, + { + "epoch": 1.36, + "grad_norm": 11.043368339538574, + "learning_rate": 9.643041015960186e-06, + "loss": 0.4275, + "step": 7908 + }, + { + "epoch": 1.36, + "grad_norm": 7.457150459289551, + "learning_rate": 9.64046679251759e-06, + "loss": 0.2649, + "step": 7909 + }, + { + "epoch": 1.36, + "grad_norm": 9.928865432739258, + "learning_rate": 9.637892569074996e-06, + "loss": 0.7033, + "step": 7910 + }, + { + "epoch": 1.36, + "grad_norm": 10.327780723571777, + "learning_rate": 9.635318345632402e-06, + "loss": 0.6201, + "step": 7911 + }, + { + "epoch": 1.36, + "grad_norm": 7.8857502937316895, + "learning_rate": 9.632744122189807e-06, + "loss": 0.3789, + "step": 7912 + }, + { + "epoch": 1.36, + "grad_norm": 10.756880760192871, + "learning_rate": 9.630169898747212e-06, + "loss": 0.3884, + "step": 7913 + }, + { + "epoch": 1.36, + "grad_norm": 12.089845657348633, + "learning_rate": 9.627595675304616e-06, + "loss": 0.4721, + "step": 7914 + }, + { + "epoch": 1.36, + "grad_norm": 10.617827415466309, + "learning_rate": 9.62502145186202e-06, + "loss": 0.4425, + "step": 7915 + }, + { + "epoch": 1.36, + "grad_norm": 10.050688743591309, + "learning_rate": 9.622447228419427e-06, + "loss": 0.4432, + "step": 7916 + }, + { + "epoch": 1.36, + "grad_norm": 13.98144817352295, + "learning_rate": 9.619873004976832e-06, + "loss": 0.494, + "step": 7917 + }, + { + "epoch": 1.36, + "grad_norm": 9.91801643371582, + "learning_rate": 9.617298781534237e-06, + "loss": 0.3389, + "step": 7918 + }, + { + "epoch": 1.36, + "grad_norm": 11.546061515808105, + "learning_rate": 9.614724558091642e-06, + "loss": 0.5805, + "step": 7919 + }, + { + "epoch": 1.36, + "grad_norm": 8.63851547241211, + "learning_rate": 9.612150334649049e-06, + "loss": 0.38, + "step": 7920 + }, + { + "epoch": 1.36, + "grad_norm": 9.369551658630371, + "learning_rate": 9.609576111206454e-06, + "loss": 0.3167, + "step": 7921 + }, + { + "epoch": 1.36, + "grad_norm": 11.338345527648926, + "learning_rate": 9.607001887763859e-06, + "loss": 0.3936, + "step": 7922 + }, + { + "epoch": 1.36, + "grad_norm": 11.559334754943848, + "learning_rate": 9.604427664321264e-06, + "loss": 0.5655, + "step": 7923 + }, + { + "epoch": 1.36, + "grad_norm": 13.715849876403809, + "learning_rate": 9.601853440878667e-06, + "loss": 0.6027, + "step": 7924 + }, + { + "epoch": 1.36, + "grad_norm": 11.786272048950195, + "learning_rate": 9.599279217436074e-06, + "loss": 0.5646, + "step": 7925 + }, + { + "epoch": 1.36, + "grad_norm": 10.754862785339355, + "learning_rate": 9.596704993993479e-06, + "loss": 0.5029, + "step": 7926 + }, + { + "epoch": 1.36, + "grad_norm": 11.600534439086914, + "learning_rate": 9.594130770550884e-06, + "loss": 0.6923, + "step": 7927 + }, + { + "epoch": 1.36, + "grad_norm": 9.364373207092285, + "learning_rate": 9.591556547108289e-06, + "loss": 0.3036, + "step": 7928 + }, + { + "epoch": 1.36, + "grad_norm": 14.808643341064453, + "learning_rate": 9.588982323665694e-06, + "loss": 0.6615, + "step": 7929 + }, + { + "epoch": 1.36, + "grad_norm": 12.058235168457031, + "learning_rate": 9.5864081002231e-06, + "loss": 0.7172, + "step": 7930 + }, + { + "epoch": 1.36, + "grad_norm": 8.112159729003906, + "learning_rate": 9.583833876780505e-06, + "loss": 0.3329, + "step": 7931 + }, + { + "epoch": 1.36, + "grad_norm": 13.51227855682373, + "learning_rate": 9.58125965333791e-06, + "loss": 0.887, + "step": 7932 + }, + { + "epoch": 1.36, + "grad_norm": 8.956052780151367, + "learning_rate": 9.578685429895315e-06, + "loss": 0.4386, + "step": 7933 + }, + { + "epoch": 1.36, + "grad_norm": 10.272078514099121, + "learning_rate": 9.57611120645272e-06, + "loss": 0.451, + "step": 7934 + }, + { + "epoch": 1.36, + "grad_norm": 10.831804275512695, + "learning_rate": 9.573536983010125e-06, + "loss": 0.4947, + "step": 7935 + }, + { + "epoch": 1.36, + "grad_norm": 9.690449714660645, + "learning_rate": 9.57096275956753e-06, + "loss": 0.5057, + "step": 7936 + }, + { + "epoch": 1.36, + "grad_norm": 16.02230453491211, + "learning_rate": 9.568388536124935e-06, + "loss": 0.5295, + "step": 7937 + }, + { + "epoch": 1.36, + "grad_norm": 13.09279727935791, + "learning_rate": 9.56581431268234e-06, + "loss": 0.4545, + "step": 7938 + }, + { + "epoch": 1.36, + "grad_norm": 10.567049026489258, + "learning_rate": 9.563240089239747e-06, + "loss": 0.4609, + "step": 7939 + }, + { + "epoch": 1.36, + "grad_norm": 11.060547828674316, + "learning_rate": 9.560665865797152e-06, + "loss": 0.3896, + "step": 7940 + }, + { + "epoch": 1.36, + "grad_norm": 10.55778980255127, + "learning_rate": 9.558091642354557e-06, + "loss": 0.3546, + "step": 7941 + }, + { + "epoch": 1.36, + "grad_norm": 8.26447868347168, + "learning_rate": 9.555517418911962e-06, + "loss": 0.4502, + "step": 7942 + }, + { + "epoch": 1.36, + "grad_norm": 8.228554725646973, + "learning_rate": 9.552943195469367e-06, + "loss": 0.297, + "step": 7943 + }, + { + "epoch": 1.36, + "grad_norm": 13.135771751403809, + "learning_rate": 9.550368972026774e-06, + "loss": 0.5123, + "step": 7944 + }, + { + "epoch": 1.36, + "grad_norm": 11.186357498168945, + "learning_rate": 9.547794748584177e-06, + "loss": 0.5235, + "step": 7945 + }, + { + "epoch": 1.36, + "grad_norm": 11.003829956054688, + "learning_rate": 9.545220525141582e-06, + "loss": 0.5746, + "step": 7946 + }, + { + "epoch": 1.36, + "grad_norm": 11.436480522155762, + "learning_rate": 9.542646301698987e-06, + "loss": 0.3595, + "step": 7947 + }, + { + "epoch": 1.36, + "grad_norm": 11.297316551208496, + "learning_rate": 9.540072078256392e-06, + "loss": 0.5374, + "step": 7948 + }, + { + "epoch": 1.36, + "grad_norm": 12.019418716430664, + "learning_rate": 9.537497854813799e-06, + "loss": 0.4533, + "step": 7949 + }, + { + "epoch": 1.36, + "grad_norm": 10.778726577758789, + "learning_rate": 9.534923631371204e-06, + "loss": 0.2627, + "step": 7950 + }, + { + "epoch": 1.36, + "grad_norm": 8.137608528137207, + "learning_rate": 9.532349407928609e-06, + "loss": 0.4314, + "step": 7951 + }, + { + "epoch": 1.36, + "grad_norm": 9.141313552856445, + "learning_rate": 9.529775184486014e-06, + "loss": 0.5498, + "step": 7952 + }, + { + "epoch": 1.36, + "grad_norm": 12.387163162231445, + "learning_rate": 9.527200961043419e-06, + "loss": 0.3944, + "step": 7953 + }, + { + "epoch": 1.37, + "grad_norm": 10.75260066986084, + "learning_rate": 9.524626737600825e-06, + "loss": 0.4656, + "step": 7954 + }, + { + "epoch": 1.37, + "grad_norm": 10.301797866821289, + "learning_rate": 9.522052514158228e-06, + "loss": 0.4064, + "step": 7955 + }, + { + "epoch": 1.37, + "grad_norm": 10.34122085571289, + "learning_rate": 9.519478290715633e-06, + "loss": 0.5799, + "step": 7956 + }, + { + "epoch": 1.37, + "grad_norm": 13.564830780029297, + "learning_rate": 9.516904067273038e-06, + "loss": 0.403, + "step": 7957 + }, + { + "epoch": 1.37, + "grad_norm": 5.964445114135742, + "learning_rate": 9.514329843830445e-06, + "loss": 0.3114, + "step": 7958 + }, + { + "epoch": 1.37, + "grad_norm": 10.449462890625, + "learning_rate": 9.51175562038785e-06, + "loss": 0.5315, + "step": 7959 + }, + { + "epoch": 1.37, + "grad_norm": 17.97594451904297, + "learning_rate": 9.509181396945255e-06, + "loss": 0.595, + "step": 7960 + }, + { + "epoch": 1.37, + "grad_norm": 10.211369514465332, + "learning_rate": 9.50660717350266e-06, + "loss": 0.5134, + "step": 7961 + }, + { + "epoch": 1.37, + "grad_norm": 8.999001502990723, + "learning_rate": 9.504032950060065e-06, + "loss": 0.4178, + "step": 7962 + }, + { + "epoch": 1.37, + "grad_norm": 8.591569900512695, + "learning_rate": 9.501458726617472e-06, + "loss": 0.3171, + "step": 7963 + }, + { + "epoch": 1.37, + "grad_norm": 15.294441223144531, + "learning_rate": 9.498884503174877e-06, + "loss": 0.5068, + "step": 7964 + }, + { + "epoch": 1.37, + "grad_norm": 12.029129028320312, + "learning_rate": 9.496310279732282e-06, + "loss": 0.5429, + "step": 7965 + }, + { + "epoch": 1.37, + "grad_norm": 11.49426555633545, + "learning_rate": 9.493736056289685e-06, + "loss": 0.5535, + "step": 7966 + }, + { + "epoch": 1.37, + "grad_norm": 7.20008659362793, + "learning_rate": 9.49116183284709e-06, + "loss": 0.3495, + "step": 7967 + }, + { + "epoch": 1.37, + "grad_norm": 12.031464576721191, + "learning_rate": 9.488587609404497e-06, + "loss": 0.3439, + "step": 7968 + }, + { + "epoch": 1.37, + "grad_norm": 11.443595886230469, + "learning_rate": 9.486013385961902e-06, + "loss": 0.4986, + "step": 7969 + }, + { + "epoch": 1.37, + "grad_norm": 12.72297191619873, + "learning_rate": 9.483439162519307e-06, + "loss": 0.3509, + "step": 7970 + }, + { + "epoch": 1.37, + "grad_norm": 13.723140716552734, + "learning_rate": 9.480864939076712e-06, + "loss": 0.6552, + "step": 7971 + }, + { + "epoch": 1.37, + "grad_norm": 10.96426010131836, + "learning_rate": 9.478290715634118e-06, + "loss": 0.423, + "step": 7972 + }, + { + "epoch": 1.37, + "grad_norm": 9.09008502960205, + "learning_rate": 9.475716492191523e-06, + "loss": 0.4553, + "step": 7973 + }, + { + "epoch": 1.37, + "grad_norm": 7.585437297821045, + "learning_rate": 9.473142268748928e-06, + "loss": 0.3397, + "step": 7974 + }, + { + "epoch": 1.37, + "grad_norm": 8.272289276123047, + "learning_rate": 9.470568045306333e-06, + "loss": 0.3997, + "step": 7975 + }, + { + "epoch": 1.37, + "grad_norm": 9.922517776489258, + "learning_rate": 9.467993821863737e-06, + "loss": 0.3701, + "step": 7976 + }, + { + "epoch": 1.37, + "grad_norm": 10.318075180053711, + "learning_rate": 9.465419598421143e-06, + "loss": 0.3566, + "step": 7977 + }, + { + "epoch": 1.37, + "grad_norm": 9.127506256103516, + "learning_rate": 9.462845374978548e-06, + "loss": 0.4298, + "step": 7978 + }, + { + "epoch": 1.37, + "grad_norm": 12.943914413452148, + "learning_rate": 9.460271151535953e-06, + "loss": 0.5534, + "step": 7979 + }, + { + "epoch": 1.37, + "grad_norm": 9.463277816772461, + "learning_rate": 9.457696928093358e-06, + "loss": 0.3953, + "step": 7980 + }, + { + "epoch": 1.37, + "grad_norm": 14.071959495544434, + "learning_rate": 9.455122704650763e-06, + "loss": 0.5181, + "step": 7981 + }, + { + "epoch": 1.37, + "grad_norm": 9.230708122253418, + "learning_rate": 9.45254848120817e-06, + "loss": 0.4316, + "step": 7982 + }, + { + "epoch": 1.37, + "grad_norm": 11.167973518371582, + "learning_rate": 9.449974257765575e-06, + "loss": 0.5694, + "step": 7983 + }, + { + "epoch": 1.37, + "grad_norm": 11.809586524963379, + "learning_rate": 9.44740003432298e-06, + "loss": 0.4548, + "step": 7984 + }, + { + "epoch": 1.37, + "grad_norm": 9.861859321594238, + "learning_rate": 9.444825810880385e-06, + "loss": 0.4381, + "step": 7985 + }, + { + "epoch": 1.37, + "grad_norm": 9.695276260375977, + "learning_rate": 9.44225158743779e-06, + "loss": 0.4095, + "step": 7986 + }, + { + "epoch": 1.37, + "grad_norm": 10.268562316894531, + "learning_rate": 9.439677363995195e-06, + "loss": 0.4532, + "step": 7987 + }, + { + "epoch": 1.37, + "grad_norm": 7.070253372192383, + "learning_rate": 9.4371031405526e-06, + "loss": 0.4005, + "step": 7988 + }, + { + "epoch": 1.37, + "grad_norm": 7.023111820220947, + "learning_rate": 9.434528917110005e-06, + "loss": 0.3017, + "step": 7989 + }, + { + "epoch": 1.37, + "grad_norm": 15.786094665527344, + "learning_rate": 9.43195469366741e-06, + "loss": 0.4113, + "step": 7990 + }, + { + "epoch": 1.37, + "grad_norm": 8.137983322143555, + "learning_rate": 9.429380470224816e-06, + "loss": 0.3046, + "step": 7991 + }, + { + "epoch": 1.37, + "grad_norm": 13.733827590942383, + "learning_rate": 9.426806246782221e-06, + "loss": 0.6012, + "step": 7992 + }, + { + "epoch": 1.37, + "grad_norm": 12.344902992248535, + "learning_rate": 9.424232023339626e-06, + "loss": 0.4224, + "step": 7993 + }, + { + "epoch": 1.37, + "grad_norm": 13.666622161865234, + "learning_rate": 9.421657799897031e-06, + "loss": 0.5556, + "step": 7994 + }, + { + "epoch": 1.37, + "grad_norm": 11.04263973236084, + "learning_rate": 9.419083576454436e-06, + "loss": 0.4013, + "step": 7995 + }, + { + "epoch": 1.37, + "grad_norm": 9.478981971740723, + "learning_rate": 9.416509353011843e-06, + "loss": 0.4965, + "step": 7996 + }, + { + "epoch": 1.37, + "grad_norm": 12.716177940368652, + "learning_rate": 9.413935129569246e-06, + "loss": 0.5522, + "step": 7997 + }, + { + "epoch": 1.37, + "grad_norm": 11.81005859375, + "learning_rate": 9.411360906126651e-06, + "loss": 0.6467, + "step": 7998 + }, + { + "epoch": 1.37, + "grad_norm": 9.434504508972168, + "learning_rate": 9.408786682684056e-06, + "loss": 0.3804, + "step": 7999 + }, + { + "epoch": 1.37, + "grad_norm": 13.765411376953125, + "learning_rate": 9.406212459241461e-06, + "loss": 0.5495, + "step": 8000 + }, + { + "epoch": 1.37, + "grad_norm": 12.332803726196289, + "learning_rate": 9.403638235798868e-06, + "loss": 0.5431, + "step": 8001 + }, + { + "epoch": 1.37, + "grad_norm": 15.456473350524902, + "learning_rate": 9.401064012356273e-06, + "loss": 0.5641, + "step": 8002 + }, + { + "epoch": 1.37, + "grad_norm": 10.626286506652832, + "learning_rate": 9.398489788913678e-06, + "loss": 0.4688, + "step": 8003 + }, + { + "epoch": 1.37, + "grad_norm": 11.766304016113281, + "learning_rate": 9.395915565471083e-06, + "loss": 0.4206, + "step": 8004 + }, + { + "epoch": 1.37, + "grad_norm": 10.014063835144043, + "learning_rate": 9.393341342028488e-06, + "loss": 0.5907, + "step": 8005 + }, + { + "epoch": 1.37, + "grad_norm": 14.282660484313965, + "learning_rate": 9.390767118585895e-06, + "loss": 0.5397, + "step": 8006 + }, + { + "epoch": 1.37, + "grad_norm": 10.133003234863281, + "learning_rate": 9.388192895143298e-06, + "loss": 0.3617, + "step": 8007 + }, + { + "epoch": 1.37, + "grad_norm": 13.292089462280273, + "learning_rate": 9.385618671700703e-06, + "loss": 0.4716, + "step": 8008 + }, + { + "epoch": 1.37, + "grad_norm": 10.585433959960938, + "learning_rate": 9.383044448258108e-06, + "loss": 0.4841, + "step": 8009 + }, + { + "epoch": 1.37, + "grad_norm": 9.634963035583496, + "learning_rate": 9.380470224815515e-06, + "loss": 0.3696, + "step": 8010 + }, + { + "epoch": 1.37, + "grad_norm": 11.995577812194824, + "learning_rate": 9.37789600137292e-06, + "loss": 0.5785, + "step": 8011 + }, + { + "epoch": 1.37, + "grad_norm": 17.64609718322754, + "learning_rate": 9.375321777930325e-06, + "loss": 0.6242, + "step": 8012 + }, + { + "epoch": 1.38, + "grad_norm": 7.982150077819824, + "learning_rate": 9.37274755448773e-06, + "loss": 0.3501, + "step": 8013 + }, + { + "epoch": 1.38, + "grad_norm": 12.285284996032715, + "learning_rate": 9.370173331045135e-06, + "loss": 0.6264, + "step": 8014 + }, + { + "epoch": 1.38, + "grad_norm": 7.575113773345947, + "learning_rate": 9.367599107602541e-06, + "loss": 0.3797, + "step": 8015 + }, + { + "epoch": 1.38, + "grad_norm": 8.67357349395752, + "learning_rate": 9.365024884159946e-06, + "loss": 0.3867, + "step": 8016 + }, + { + "epoch": 1.38, + "grad_norm": 10.106900215148926, + "learning_rate": 9.362450660717351e-06, + "loss": 0.3732, + "step": 8017 + }, + { + "epoch": 1.38, + "grad_norm": 15.497732162475586, + "learning_rate": 9.359876437274755e-06, + "loss": 0.7343, + "step": 8018 + }, + { + "epoch": 1.38, + "grad_norm": 7.809659481048584, + "learning_rate": 9.35730221383216e-06, + "loss": 0.4074, + "step": 8019 + }, + { + "epoch": 1.38, + "grad_norm": 10.506635665893555, + "learning_rate": 9.354727990389566e-06, + "loss": 0.406, + "step": 8020 + }, + { + "epoch": 1.38, + "grad_norm": 9.403267860412598, + "learning_rate": 9.352153766946971e-06, + "loss": 0.5285, + "step": 8021 + }, + { + "epoch": 1.38, + "grad_norm": 13.269692420959473, + "learning_rate": 9.349579543504376e-06, + "loss": 0.5486, + "step": 8022 + }, + { + "epoch": 1.38, + "grad_norm": 13.832744598388672, + "learning_rate": 9.347005320061781e-06, + "loss": 0.3635, + "step": 8023 + }, + { + "epoch": 1.38, + "grad_norm": 8.928349494934082, + "learning_rate": 9.344431096619188e-06, + "loss": 0.3154, + "step": 8024 + }, + { + "epoch": 1.38, + "grad_norm": 9.598359107971191, + "learning_rate": 9.341856873176593e-06, + "loss": 0.4291, + "step": 8025 + }, + { + "epoch": 1.38, + "grad_norm": 11.557576179504395, + "learning_rate": 9.339282649733998e-06, + "loss": 0.4514, + "step": 8026 + }, + { + "epoch": 1.38, + "grad_norm": 8.876992225646973, + "learning_rate": 9.336708426291403e-06, + "loss": 0.4586, + "step": 8027 + }, + { + "epoch": 1.38, + "grad_norm": 9.892364501953125, + "learning_rate": 9.334134202848806e-06, + "loss": 0.5799, + "step": 8028 + }, + { + "epoch": 1.38, + "grad_norm": 10.67786693572998, + "learning_rate": 9.331559979406213e-06, + "loss": 0.3627, + "step": 8029 + }, + { + "epoch": 1.38, + "grad_norm": 9.558042526245117, + "learning_rate": 9.328985755963618e-06, + "loss": 0.5747, + "step": 8030 + }, + { + "epoch": 1.38, + "grad_norm": 11.952147483825684, + "learning_rate": 9.326411532521023e-06, + "loss": 0.7353, + "step": 8031 + }, + { + "epoch": 1.38, + "grad_norm": 10.418725967407227, + "learning_rate": 9.323837309078428e-06, + "loss": 0.4161, + "step": 8032 + }, + { + "epoch": 1.38, + "grad_norm": 9.857487678527832, + "learning_rate": 9.321263085635833e-06, + "loss": 0.3946, + "step": 8033 + }, + { + "epoch": 1.38, + "grad_norm": 10.552834510803223, + "learning_rate": 9.31868886219324e-06, + "loss": 0.6213, + "step": 8034 + }, + { + "epoch": 1.38, + "grad_norm": 11.699774742126465, + "learning_rate": 9.316114638750644e-06, + "loss": 0.4156, + "step": 8035 + }, + { + "epoch": 1.38, + "grad_norm": 10.114253997802734, + "learning_rate": 9.31354041530805e-06, + "loss": 0.4122, + "step": 8036 + }, + { + "epoch": 1.38, + "grad_norm": 11.4076509475708, + "learning_rate": 9.310966191865454e-06, + "loss": 0.6045, + "step": 8037 + }, + { + "epoch": 1.38, + "grad_norm": 10.927475929260254, + "learning_rate": 9.30839196842286e-06, + "loss": 0.5449, + "step": 8038 + }, + { + "epoch": 1.38, + "grad_norm": 7.94198751449585, + "learning_rate": 9.305817744980264e-06, + "loss": 0.3991, + "step": 8039 + }, + { + "epoch": 1.38, + "grad_norm": 12.521130561828613, + "learning_rate": 9.30324352153767e-06, + "loss": 0.7631, + "step": 8040 + }, + { + "epoch": 1.38, + "grad_norm": 7.8478193283081055, + "learning_rate": 9.300669298095074e-06, + "loss": 0.2825, + "step": 8041 + }, + { + "epoch": 1.38, + "grad_norm": 10.764883995056152, + "learning_rate": 9.29809507465248e-06, + "loss": 0.5785, + "step": 8042 + }, + { + "epoch": 1.38, + "grad_norm": 9.743704795837402, + "learning_rate": 9.295520851209886e-06, + "loss": 0.5258, + "step": 8043 + }, + { + "epoch": 1.38, + "grad_norm": 10.892135620117188, + "learning_rate": 9.292946627767291e-06, + "loss": 0.476, + "step": 8044 + }, + { + "epoch": 1.38, + "grad_norm": 13.146523475646973, + "learning_rate": 9.290372404324696e-06, + "loss": 0.4902, + "step": 8045 + }, + { + "epoch": 1.38, + "grad_norm": 6.193522930145264, + "learning_rate": 9.287798180882101e-06, + "loss": 0.2906, + "step": 8046 + }, + { + "epoch": 1.38, + "grad_norm": 10.721735954284668, + "learning_rate": 9.285223957439506e-06, + "loss": 0.4723, + "step": 8047 + }, + { + "epoch": 1.38, + "grad_norm": 9.398427963256836, + "learning_rate": 9.282649733996913e-06, + "loss": 0.4065, + "step": 8048 + }, + { + "epoch": 1.38, + "grad_norm": 10.175658226013184, + "learning_rate": 9.280075510554316e-06, + "loss": 0.4116, + "step": 8049 + }, + { + "epoch": 1.38, + "grad_norm": 7.530220031738281, + "learning_rate": 9.277501287111721e-06, + "loss": 0.2988, + "step": 8050 + }, + { + "epoch": 1.38, + "grad_norm": 10.05105972290039, + "learning_rate": 9.274927063669126e-06, + "loss": 0.472, + "step": 8051 + }, + { + "epoch": 1.38, + "grad_norm": 11.80966567993164, + "learning_rate": 9.272352840226531e-06, + "loss": 0.5229, + "step": 8052 + }, + { + "epoch": 1.38, + "grad_norm": 11.220248222351074, + "learning_rate": 9.269778616783938e-06, + "loss": 0.3409, + "step": 8053 + }, + { + "epoch": 1.38, + "grad_norm": 9.488533020019531, + "learning_rate": 9.267204393341343e-06, + "loss": 0.3125, + "step": 8054 + }, + { + "epoch": 1.38, + "grad_norm": 10.574464797973633, + "learning_rate": 9.264630169898748e-06, + "loss": 0.4688, + "step": 8055 + }, + { + "epoch": 1.38, + "grad_norm": 15.483405113220215, + "learning_rate": 9.262055946456153e-06, + "loss": 0.6974, + "step": 8056 + }, + { + "epoch": 1.38, + "grad_norm": 12.591333389282227, + "learning_rate": 9.259481723013558e-06, + "loss": 0.5321, + "step": 8057 + }, + { + "epoch": 1.38, + "grad_norm": 14.4368257522583, + "learning_rate": 9.256907499570964e-06, + "loss": 0.5061, + "step": 8058 + }, + { + "epoch": 1.38, + "grad_norm": 10.434736251831055, + "learning_rate": 9.254333276128367e-06, + "loss": 0.5674, + "step": 8059 + }, + { + "epoch": 1.38, + "grad_norm": 17.47308349609375, + "learning_rate": 9.251759052685772e-06, + "loss": 0.5751, + "step": 8060 + }, + { + "epoch": 1.38, + "grad_norm": 12.3410062789917, + "learning_rate": 9.249184829243177e-06, + "loss": 0.4704, + "step": 8061 + }, + { + "epoch": 1.38, + "grad_norm": 8.103771209716797, + "learning_rate": 9.246610605800584e-06, + "loss": 0.3633, + "step": 8062 + }, + { + "epoch": 1.38, + "grad_norm": 9.619467735290527, + "learning_rate": 9.244036382357989e-06, + "loss": 0.6022, + "step": 8063 + }, + { + "epoch": 1.38, + "grad_norm": 12.229324340820312, + "learning_rate": 9.241462158915394e-06, + "loss": 0.4712, + "step": 8064 + }, + { + "epoch": 1.38, + "grad_norm": 10.660847663879395, + "learning_rate": 9.238887935472799e-06, + "loss": 0.5006, + "step": 8065 + }, + { + "epoch": 1.38, + "grad_norm": 13.222514152526855, + "learning_rate": 9.236313712030204e-06, + "loss": 0.5117, + "step": 8066 + }, + { + "epoch": 1.38, + "grad_norm": 12.187640190124512, + "learning_rate": 9.23373948858761e-06, + "loss": 0.5412, + "step": 8067 + }, + { + "epoch": 1.38, + "grad_norm": 9.165215492248535, + "learning_rate": 9.231165265145016e-06, + "loss": 0.4262, + "step": 8068 + }, + { + "epoch": 1.38, + "grad_norm": 10.318355560302734, + "learning_rate": 9.22859104170242e-06, + "loss": 0.4678, + "step": 8069 + }, + { + "epoch": 1.38, + "grad_norm": 14.313220024108887, + "learning_rate": 9.226016818259824e-06, + "loss": 0.6658, + "step": 8070 + }, + { + "epoch": 1.39, + "grad_norm": 7.866724014282227, + "learning_rate": 9.223442594817229e-06, + "loss": 0.3462, + "step": 8071 + }, + { + "epoch": 1.39, + "grad_norm": 10.366093635559082, + "learning_rate": 9.220868371374636e-06, + "loss": 0.5936, + "step": 8072 + }, + { + "epoch": 1.39, + "grad_norm": 9.871940612792969, + "learning_rate": 9.21829414793204e-06, + "loss": 0.3365, + "step": 8073 + }, + { + "epoch": 1.39, + "grad_norm": 8.973756790161133, + "learning_rate": 9.215719924489446e-06, + "loss": 0.4507, + "step": 8074 + }, + { + "epoch": 1.39, + "grad_norm": 10.828827857971191, + "learning_rate": 9.21314570104685e-06, + "loss": 0.5217, + "step": 8075 + }, + { + "epoch": 1.39, + "grad_norm": 10.579808235168457, + "learning_rate": 9.210571477604257e-06, + "loss": 0.497, + "step": 8076 + }, + { + "epoch": 1.39, + "grad_norm": 10.060633659362793, + "learning_rate": 9.207997254161662e-06, + "loss": 0.424, + "step": 8077 + }, + { + "epoch": 1.39, + "grad_norm": 12.351252555847168, + "learning_rate": 9.205423030719067e-06, + "loss": 0.4474, + "step": 8078 + }, + { + "epoch": 1.39, + "grad_norm": 11.33030891418457, + "learning_rate": 9.202848807276472e-06, + "loss": 0.3736, + "step": 8079 + }, + { + "epoch": 1.39, + "grad_norm": 11.173501014709473, + "learning_rate": 9.200274583833876e-06, + "loss": 0.5181, + "step": 8080 + }, + { + "epoch": 1.39, + "grad_norm": 9.958808898925781, + "learning_rate": 9.197700360391282e-06, + "loss": 0.5828, + "step": 8081 + }, + { + "epoch": 1.39, + "grad_norm": 11.104315757751465, + "learning_rate": 9.195126136948687e-06, + "loss": 0.3645, + "step": 8082 + }, + { + "epoch": 1.39, + "grad_norm": 12.270234107971191, + "learning_rate": 9.192551913506092e-06, + "loss": 0.6803, + "step": 8083 + }, + { + "epoch": 1.39, + "grad_norm": 11.381769180297852, + "learning_rate": 9.189977690063497e-06, + "loss": 0.5663, + "step": 8084 + }, + { + "epoch": 1.39, + "grad_norm": 9.067337036132812, + "learning_rate": 9.187403466620902e-06, + "loss": 0.3539, + "step": 8085 + }, + { + "epoch": 1.39, + "grad_norm": 8.98100757598877, + "learning_rate": 9.184829243178309e-06, + "loss": 0.3303, + "step": 8086 + }, + { + "epoch": 1.39, + "grad_norm": 11.848414421081543, + "learning_rate": 9.182255019735714e-06, + "loss": 0.4793, + "step": 8087 + }, + { + "epoch": 1.39, + "grad_norm": 8.813301086425781, + "learning_rate": 9.179680796293119e-06, + "loss": 0.3734, + "step": 8088 + }, + { + "epoch": 1.39, + "grad_norm": 8.797536849975586, + "learning_rate": 9.177106572850524e-06, + "loss": 0.3712, + "step": 8089 + }, + { + "epoch": 1.39, + "grad_norm": 7.001542091369629, + "learning_rate": 9.174532349407929e-06, + "loss": 0.2863, + "step": 8090 + }, + { + "epoch": 1.39, + "grad_norm": 14.185687065124512, + "learning_rate": 9.171958125965334e-06, + "loss": 0.6677, + "step": 8091 + }, + { + "epoch": 1.39, + "grad_norm": 8.785146713256836, + "learning_rate": 9.169383902522739e-06, + "loss": 0.306, + "step": 8092 + }, + { + "epoch": 1.39, + "grad_norm": 10.149352073669434, + "learning_rate": 9.166809679080144e-06, + "loss": 0.4567, + "step": 8093 + }, + { + "epoch": 1.39, + "grad_norm": 12.00492000579834, + "learning_rate": 9.164235455637549e-06, + "loss": 0.5592, + "step": 8094 + }, + { + "epoch": 1.39, + "grad_norm": 10.96116828918457, + "learning_rate": 9.161661232194956e-06, + "loss": 0.5866, + "step": 8095 + }, + { + "epoch": 1.39, + "grad_norm": 12.884201049804688, + "learning_rate": 9.15908700875236e-06, + "loss": 0.6157, + "step": 8096 + }, + { + "epoch": 1.39, + "grad_norm": 15.086968421936035, + "learning_rate": 9.156512785309765e-06, + "loss": 0.6028, + "step": 8097 + }, + { + "epoch": 1.39, + "grad_norm": 12.086346626281738, + "learning_rate": 9.15393856186717e-06, + "loss": 0.3687, + "step": 8098 + }, + { + "epoch": 1.39, + "grad_norm": 12.605083465576172, + "learning_rate": 9.151364338424575e-06, + "loss": 0.5211, + "step": 8099 + }, + { + "epoch": 1.39, + "grad_norm": 7.9868316650390625, + "learning_rate": 9.148790114981982e-06, + "loss": 0.4179, + "step": 8100 + }, + { + "epoch": 1.39, + "grad_norm": 8.09657096862793, + "learning_rate": 9.146215891539385e-06, + "loss": 0.2719, + "step": 8101 + }, + { + "epoch": 1.39, + "grad_norm": 10.188369750976562, + "learning_rate": 9.14364166809679e-06, + "loss": 0.4122, + "step": 8102 + }, + { + "epoch": 1.39, + "grad_norm": 16.78367805480957, + "learning_rate": 9.141067444654195e-06, + "loss": 0.6931, + "step": 8103 + }, + { + "epoch": 1.39, + "grad_norm": 10.463834762573242, + "learning_rate": 9.1384932212116e-06, + "loss": 0.2721, + "step": 8104 + }, + { + "epoch": 1.39, + "grad_norm": 10.522351264953613, + "learning_rate": 9.135918997769007e-06, + "loss": 0.5886, + "step": 8105 + }, + { + "epoch": 1.39, + "grad_norm": 12.248406410217285, + "learning_rate": 9.133344774326412e-06, + "loss": 0.5777, + "step": 8106 + }, + { + "epoch": 1.39, + "grad_norm": 6.383096218109131, + "learning_rate": 9.130770550883817e-06, + "loss": 0.2832, + "step": 8107 + }, + { + "epoch": 1.39, + "grad_norm": 8.572043418884277, + "learning_rate": 9.128196327441222e-06, + "loss": 0.3077, + "step": 8108 + }, + { + "epoch": 1.39, + "grad_norm": 11.84365177154541, + "learning_rate": 9.125622103998627e-06, + "loss": 0.3161, + "step": 8109 + }, + { + "epoch": 1.39, + "grad_norm": 14.437800407409668, + "learning_rate": 9.123047880556034e-06, + "loss": 0.5003, + "step": 8110 + }, + { + "epoch": 1.39, + "grad_norm": 11.400375366210938, + "learning_rate": 9.120473657113437e-06, + "loss": 0.5481, + "step": 8111 + }, + { + "epoch": 1.39, + "grad_norm": 8.788758277893066, + "learning_rate": 9.117899433670842e-06, + "loss": 0.51, + "step": 8112 + }, + { + "epoch": 1.39, + "grad_norm": 10.329127311706543, + "learning_rate": 9.115325210228247e-06, + "loss": 0.458, + "step": 8113 + }, + { + "epoch": 1.39, + "grad_norm": 11.465681076049805, + "learning_rate": 9.112750986785654e-06, + "loss": 0.5648, + "step": 8114 + }, + { + "epoch": 1.39, + "grad_norm": 8.650321006774902, + "learning_rate": 9.110176763343059e-06, + "loss": 0.3974, + "step": 8115 + }, + { + "epoch": 1.39, + "grad_norm": 13.671797752380371, + "learning_rate": 9.107602539900464e-06, + "loss": 0.5949, + "step": 8116 + }, + { + "epoch": 1.39, + "grad_norm": 8.014888763427734, + "learning_rate": 9.105028316457869e-06, + "loss": 0.3711, + "step": 8117 + }, + { + "epoch": 1.39, + "grad_norm": 8.839879989624023, + "learning_rate": 9.102454093015274e-06, + "loss": 0.3674, + "step": 8118 + }, + { + "epoch": 1.39, + "grad_norm": 9.793872833251953, + "learning_rate": 9.09987986957268e-06, + "loss": 0.5252, + "step": 8119 + }, + { + "epoch": 1.39, + "grad_norm": 7.970890998840332, + "learning_rate": 9.097305646130085e-06, + "loss": 0.4629, + "step": 8120 + }, + { + "epoch": 1.39, + "grad_norm": 12.441503524780273, + "learning_rate": 9.09473142268749e-06, + "loss": 0.3309, + "step": 8121 + }, + { + "epoch": 1.39, + "grad_norm": 9.942561149597168, + "learning_rate": 9.092157199244894e-06, + "loss": 0.4877, + "step": 8122 + }, + { + "epoch": 1.39, + "grad_norm": 7.946661949157715, + "learning_rate": 9.089582975802299e-06, + "loss": 0.4963, + "step": 8123 + }, + { + "epoch": 1.39, + "grad_norm": 16.9072265625, + "learning_rate": 9.087008752359705e-06, + "loss": 0.6806, + "step": 8124 + }, + { + "epoch": 1.39, + "grad_norm": 7.876772403717041, + "learning_rate": 9.08443452891711e-06, + "loss": 0.3393, + "step": 8125 + }, + { + "epoch": 1.39, + "grad_norm": 10.20521354675293, + "learning_rate": 9.081860305474515e-06, + "loss": 0.4055, + "step": 8126 + }, + { + "epoch": 1.39, + "grad_norm": 9.229073524475098, + "learning_rate": 9.07928608203192e-06, + "loss": 0.4284, + "step": 8127 + }, + { + "epoch": 1.39, + "grad_norm": 11.250029563903809, + "learning_rate": 9.076711858589325e-06, + "loss": 0.3294, + "step": 8128 + }, + { + "epoch": 1.4, + "grad_norm": 7.435440540313721, + "learning_rate": 9.074137635146732e-06, + "loss": 0.326, + "step": 8129 + }, + { + "epoch": 1.4, + "grad_norm": 7.752583980560303, + "learning_rate": 9.071563411704137e-06, + "loss": 0.3474, + "step": 8130 + }, + { + "epoch": 1.4, + "grad_norm": 9.20149040222168, + "learning_rate": 9.068989188261542e-06, + "loss": 0.3749, + "step": 8131 + }, + { + "epoch": 1.4, + "grad_norm": 11.70844841003418, + "learning_rate": 9.066414964818945e-06, + "loss": 0.8339, + "step": 8132 + }, + { + "epoch": 1.4, + "grad_norm": 14.071343421936035, + "learning_rate": 9.063840741376352e-06, + "loss": 0.414, + "step": 8133 + }, + { + "epoch": 1.4, + "grad_norm": 13.165002822875977, + "learning_rate": 9.061266517933757e-06, + "loss": 0.5569, + "step": 8134 + }, + { + "epoch": 1.4, + "grad_norm": 13.210293769836426, + "learning_rate": 9.058692294491162e-06, + "loss": 0.6359, + "step": 8135 + }, + { + "epoch": 1.4, + "grad_norm": 7.353513717651367, + "learning_rate": 9.056118071048567e-06, + "loss": 0.5061, + "step": 8136 + }, + { + "epoch": 1.4, + "grad_norm": 11.391908645629883, + "learning_rate": 9.053543847605972e-06, + "loss": 0.5219, + "step": 8137 + }, + { + "epoch": 1.4, + "grad_norm": 12.386820793151855, + "learning_rate": 9.050969624163378e-06, + "loss": 0.3787, + "step": 8138 + }, + { + "epoch": 1.4, + "grad_norm": 15.763087272644043, + "learning_rate": 9.048395400720783e-06, + "loss": 0.6168, + "step": 8139 + }, + { + "epoch": 1.4, + "grad_norm": 10.695484161376953, + "learning_rate": 9.045821177278188e-06, + "loss": 0.3589, + "step": 8140 + }, + { + "epoch": 1.4, + "grad_norm": 9.074987411499023, + "learning_rate": 9.043246953835593e-06, + "loss": 0.4396, + "step": 8141 + }, + { + "epoch": 1.4, + "grad_norm": 11.367197036743164, + "learning_rate": 9.040672730392998e-06, + "loss": 0.5388, + "step": 8142 + }, + { + "epoch": 1.4, + "grad_norm": 12.407917976379395, + "learning_rate": 9.038098506950403e-06, + "loss": 0.591, + "step": 8143 + }, + { + "epoch": 1.4, + "grad_norm": 12.238104820251465, + "learning_rate": 9.035524283507808e-06, + "loss": 0.5808, + "step": 8144 + }, + { + "epoch": 1.4, + "grad_norm": 14.704313278198242, + "learning_rate": 9.032950060065213e-06, + "loss": 0.4613, + "step": 8145 + }, + { + "epoch": 1.4, + "grad_norm": 10.697770118713379, + "learning_rate": 9.030375836622618e-06, + "loss": 0.5465, + "step": 8146 + }, + { + "epoch": 1.4, + "grad_norm": 7.017826557159424, + "learning_rate": 9.027801613180025e-06, + "loss": 0.3571, + "step": 8147 + }, + { + "epoch": 1.4, + "grad_norm": 7.1393351554870605, + "learning_rate": 9.02522738973743e-06, + "loss": 0.2915, + "step": 8148 + }, + { + "epoch": 1.4, + "grad_norm": 11.771018028259277, + "learning_rate": 9.022653166294835e-06, + "loss": 0.5083, + "step": 8149 + }, + { + "epoch": 1.4, + "grad_norm": 11.499884605407715, + "learning_rate": 9.02007894285224e-06, + "loss": 0.5661, + "step": 8150 + }, + { + "epoch": 1.4, + "grad_norm": 9.548622131347656, + "learning_rate": 9.017504719409645e-06, + "loss": 0.5096, + "step": 8151 + }, + { + "epoch": 1.4, + "grad_norm": 10.084818840026855, + "learning_rate": 9.014930495967052e-06, + "loss": 0.6314, + "step": 8152 + }, + { + "epoch": 1.4, + "grad_norm": 11.663867950439453, + "learning_rate": 9.012356272524455e-06, + "loss": 0.4214, + "step": 8153 + }, + { + "epoch": 1.4, + "grad_norm": 10.084020614624023, + "learning_rate": 9.00978204908186e-06, + "loss": 0.6374, + "step": 8154 + }, + { + "epoch": 1.4, + "grad_norm": 12.588614463806152, + "learning_rate": 9.007207825639265e-06, + "loss": 0.4847, + "step": 8155 + }, + { + "epoch": 1.4, + "grad_norm": 10.392104148864746, + "learning_rate": 9.00463360219667e-06, + "loss": 0.3692, + "step": 8156 + }, + { + "epoch": 1.4, + "grad_norm": 9.744665145874023, + "learning_rate": 9.002059378754077e-06, + "loss": 0.4344, + "step": 8157 + }, + { + "epoch": 1.4, + "grad_norm": 7.3781585693359375, + "learning_rate": 8.999485155311482e-06, + "loss": 0.2844, + "step": 8158 + }, + { + "epoch": 1.4, + "grad_norm": 8.054342269897461, + "learning_rate": 8.996910931868887e-06, + "loss": 0.4426, + "step": 8159 + }, + { + "epoch": 1.4, + "grad_norm": 8.579928398132324, + "learning_rate": 8.994336708426292e-06, + "loss": 0.4809, + "step": 8160 + }, + { + "epoch": 1.4, + "grad_norm": 10.54077434539795, + "learning_rate": 8.991762484983697e-06, + "loss": 0.2959, + "step": 8161 + }, + { + "epoch": 1.4, + "grad_norm": 7.806890487670898, + "learning_rate": 8.989188261541103e-06, + "loss": 0.354, + "step": 8162 + }, + { + "epoch": 1.4, + "grad_norm": 8.789708137512207, + "learning_rate": 8.986614038098507e-06, + "loss": 0.4683, + "step": 8163 + }, + { + "epoch": 1.4, + "grad_norm": 5.888046741485596, + "learning_rate": 8.984039814655911e-06, + "loss": 0.2823, + "step": 8164 + }, + { + "epoch": 1.4, + "grad_norm": 13.385698318481445, + "learning_rate": 8.981465591213316e-06, + "loss": 0.4918, + "step": 8165 + }, + { + "epoch": 1.4, + "grad_norm": 10.205957412719727, + "learning_rate": 8.978891367770723e-06, + "loss": 0.4328, + "step": 8166 + }, + { + "epoch": 1.4, + "grad_norm": 12.774950981140137, + "learning_rate": 8.976317144328128e-06, + "loss": 0.6236, + "step": 8167 + }, + { + "epoch": 1.4, + "grad_norm": 12.305766105651855, + "learning_rate": 8.973742920885533e-06, + "loss": 0.5363, + "step": 8168 + }, + { + "epoch": 1.4, + "grad_norm": 14.135571479797363, + "learning_rate": 8.971168697442938e-06, + "loss": 0.4644, + "step": 8169 + }, + { + "epoch": 1.4, + "grad_norm": 10.392585754394531, + "learning_rate": 8.968594474000343e-06, + "loss": 0.3965, + "step": 8170 + }, + { + "epoch": 1.4, + "grad_norm": 12.347583770751953, + "learning_rate": 8.96602025055775e-06, + "loss": 0.6629, + "step": 8171 + }, + { + "epoch": 1.4, + "grad_norm": 12.156412124633789, + "learning_rate": 8.963446027115155e-06, + "loss": 0.42, + "step": 8172 + }, + { + "epoch": 1.4, + "grad_norm": 14.219449043273926, + "learning_rate": 8.96087180367256e-06, + "loss": 0.5156, + "step": 8173 + }, + { + "epoch": 1.4, + "grad_norm": 12.61083984375, + "learning_rate": 8.958297580229963e-06, + "loss": 0.6273, + "step": 8174 + }, + { + "epoch": 1.4, + "grad_norm": 9.187403678894043, + "learning_rate": 8.955723356787368e-06, + "loss": 0.2932, + "step": 8175 + }, + { + "epoch": 1.4, + "grad_norm": 12.252684593200684, + "learning_rate": 8.953149133344775e-06, + "loss": 0.5857, + "step": 8176 + }, + { + "epoch": 1.4, + "grad_norm": 15.347857475280762, + "learning_rate": 8.95057490990218e-06, + "loss": 0.7255, + "step": 8177 + }, + { + "epoch": 1.4, + "grad_norm": 10.671805381774902, + "learning_rate": 8.948000686459585e-06, + "loss": 0.4495, + "step": 8178 + }, + { + "epoch": 1.4, + "grad_norm": 7.4444427490234375, + "learning_rate": 8.94542646301699e-06, + "loss": 0.3414, + "step": 8179 + }, + { + "epoch": 1.4, + "grad_norm": 10.483716011047363, + "learning_rate": 8.942852239574395e-06, + "loss": 0.385, + "step": 8180 + }, + { + "epoch": 1.4, + "grad_norm": 11.974165916442871, + "learning_rate": 8.940278016131801e-06, + "loss": 0.4548, + "step": 8181 + }, + { + "epoch": 1.4, + "grad_norm": 10.356922149658203, + "learning_rate": 8.937703792689206e-06, + "loss": 0.573, + "step": 8182 + }, + { + "epoch": 1.4, + "grad_norm": 9.437995910644531, + "learning_rate": 8.935129569246611e-06, + "loss": 0.427, + "step": 8183 + }, + { + "epoch": 1.4, + "grad_norm": 16.105772018432617, + "learning_rate": 8.932555345804015e-06, + "loss": 0.5355, + "step": 8184 + }, + { + "epoch": 1.4, + "grad_norm": 7.922123908996582, + "learning_rate": 8.929981122361421e-06, + "loss": 0.3058, + "step": 8185 + }, + { + "epoch": 1.4, + "grad_norm": 9.451415061950684, + "learning_rate": 8.927406898918826e-06, + "loss": 0.3559, + "step": 8186 + }, + { + "epoch": 1.41, + "grad_norm": 10.637255668640137, + "learning_rate": 8.924832675476231e-06, + "loss": 0.4065, + "step": 8187 + }, + { + "epoch": 1.41, + "grad_norm": 10.531972885131836, + "learning_rate": 8.922258452033636e-06, + "loss": 0.2564, + "step": 8188 + }, + { + "epoch": 1.41, + "grad_norm": 8.943924903869629, + "learning_rate": 8.919684228591041e-06, + "loss": 0.4977, + "step": 8189 + }, + { + "epoch": 1.41, + "grad_norm": 9.87394905090332, + "learning_rate": 8.917110005148448e-06, + "loss": 0.5008, + "step": 8190 + }, + { + "epoch": 1.41, + "grad_norm": 8.284592628479004, + "learning_rate": 8.914535781705853e-06, + "loss": 0.404, + "step": 8191 + }, + { + "epoch": 1.41, + "grad_norm": 10.662132263183594, + "learning_rate": 8.911961558263258e-06, + "loss": 0.3691, + "step": 8192 + }, + { + "epoch": 1.41, + "grad_norm": 9.03271484375, + "learning_rate": 8.909387334820663e-06, + "loss": 0.562, + "step": 8193 + }, + { + "epoch": 1.41, + "grad_norm": 9.703021049499512, + "learning_rate": 8.906813111378068e-06, + "loss": 0.3693, + "step": 8194 + }, + { + "epoch": 1.41, + "grad_norm": 14.925243377685547, + "learning_rate": 8.904238887935473e-06, + "loss": 0.689, + "step": 8195 + }, + { + "epoch": 1.41, + "grad_norm": 11.75747013092041, + "learning_rate": 8.901664664492878e-06, + "loss": 0.3488, + "step": 8196 + }, + { + "epoch": 1.41, + "grad_norm": 8.172895431518555, + "learning_rate": 8.899090441050283e-06, + "loss": 0.3431, + "step": 8197 + }, + { + "epoch": 1.41, + "grad_norm": 9.2479248046875, + "learning_rate": 8.896516217607688e-06, + "loss": 0.3468, + "step": 8198 + }, + { + "epoch": 1.41, + "grad_norm": 8.617008209228516, + "learning_rate": 8.893941994165095e-06, + "loss": 0.3558, + "step": 8199 + }, + { + "epoch": 1.41, + "grad_norm": 14.736842155456543, + "learning_rate": 8.8913677707225e-06, + "loss": 0.5643, + "step": 8200 + }, + { + "epoch": 1.41, + "grad_norm": 15.11459732055664, + "learning_rate": 8.888793547279904e-06, + "loss": 0.434, + "step": 8201 + }, + { + "epoch": 1.41, + "grad_norm": 8.434243202209473, + "learning_rate": 8.88621932383731e-06, + "loss": 0.4352, + "step": 8202 + }, + { + "epoch": 1.41, + "grad_norm": 11.888657569885254, + "learning_rate": 8.883645100394714e-06, + "loss": 0.537, + "step": 8203 + }, + { + "epoch": 1.41, + "grad_norm": 10.245015144348145, + "learning_rate": 8.881070876952121e-06, + "loss": 0.5832, + "step": 8204 + }, + { + "epoch": 1.41, + "grad_norm": 9.50074577331543, + "learning_rate": 8.878496653509524e-06, + "loss": 0.4098, + "step": 8205 + }, + { + "epoch": 1.41, + "grad_norm": 10.001585960388184, + "learning_rate": 8.87592243006693e-06, + "loss": 0.5697, + "step": 8206 + }, + { + "epoch": 1.41, + "grad_norm": 11.823759078979492, + "learning_rate": 8.873348206624334e-06, + "loss": 0.6038, + "step": 8207 + }, + { + "epoch": 1.41, + "grad_norm": 21.641414642333984, + "learning_rate": 8.87077398318174e-06, + "loss": 0.5124, + "step": 8208 + }, + { + "epoch": 1.41, + "grad_norm": 11.899895668029785, + "learning_rate": 8.868199759739146e-06, + "loss": 0.4119, + "step": 8209 + }, + { + "epoch": 1.41, + "grad_norm": 11.082362174987793, + "learning_rate": 8.865625536296551e-06, + "loss": 0.5197, + "step": 8210 + }, + { + "epoch": 1.41, + "grad_norm": 13.878350257873535, + "learning_rate": 8.863051312853956e-06, + "loss": 0.5615, + "step": 8211 + }, + { + "epoch": 1.41, + "grad_norm": 9.261621475219727, + "learning_rate": 8.860477089411361e-06, + "loss": 0.4499, + "step": 8212 + }, + { + "epoch": 1.41, + "grad_norm": 9.753847122192383, + "learning_rate": 8.857902865968766e-06, + "loss": 0.4265, + "step": 8213 + }, + { + "epoch": 1.41, + "grad_norm": 11.380756378173828, + "learning_rate": 8.855328642526173e-06, + "loss": 0.5466, + "step": 8214 + }, + { + "epoch": 1.41, + "grad_norm": 7.7287492752075195, + "learning_rate": 8.852754419083578e-06, + "loss": 0.4916, + "step": 8215 + }, + { + "epoch": 1.41, + "grad_norm": 15.159360885620117, + "learning_rate": 8.850180195640981e-06, + "loss": 0.5015, + "step": 8216 + }, + { + "epoch": 1.41, + "grad_norm": 12.102422714233398, + "learning_rate": 8.847605972198386e-06, + "loss": 0.5305, + "step": 8217 + }, + { + "epoch": 1.41, + "grad_norm": 8.977442741394043, + "learning_rate": 8.845031748755793e-06, + "loss": 0.3016, + "step": 8218 + }, + { + "epoch": 1.41, + "grad_norm": 7.829934120178223, + "learning_rate": 8.842457525313198e-06, + "loss": 0.3551, + "step": 8219 + }, + { + "epoch": 1.41, + "grad_norm": 10.341826438903809, + "learning_rate": 8.839883301870603e-06, + "loss": 0.4647, + "step": 8220 + }, + { + "epoch": 1.41, + "grad_norm": 10.113983154296875, + "learning_rate": 8.837309078428008e-06, + "loss": 0.544, + "step": 8221 + }, + { + "epoch": 1.41, + "grad_norm": 8.224457740783691, + "learning_rate": 8.834734854985413e-06, + "loss": 0.4098, + "step": 8222 + }, + { + "epoch": 1.41, + "grad_norm": 11.566207885742188, + "learning_rate": 8.83216063154282e-06, + "loss": 0.5322, + "step": 8223 + }, + { + "epoch": 1.41, + "grad_norm": 12.017365455627441, + "learning_rate": 8.829586408100224e-06, + "loss": 0.759, + "step": 8224 + }, + { + "epoch": 1.41, + "grad_norm": 12.604937553405762, + "learning_rate": 8.82701218465763e-06, + "loss": 0.3421, + "step": 8225 + }, + { + "epoch": 1.41, + "grad_norm": 8.038934707641602, + "learning_rate": 8.824437961215033e-06, + "loss": 0.3996, + "step": 8226 + }, + { + "epoch": 1.41, + "grad_norm": 11.238123893737793, + "learning_rate": 8.821863737772438e-06, + "loss": 0.5725, + "step": 8227 + }, + { + "epoch": 1.41, + "grad_norm": 16.162654876708984, + "learning_rate": 8.819289514329844e-06, + "loss": 0.5796, + "step": 8228 + }, + { + "epoch": 1.41, + "grad_norm": 7.382336616516113, + "learning_rate": 8.81671529088725e-06, + "loss": 0.3975, + "step": 8229 + }, + { + "epoch": 1.41, + "grad_norm": 10.843670845031738, + "learning_rate": 8.814141067444654e-06, + "loss": 0.5249, + "step": 8230 + }, + { + "epoch": 1.41, + "grad_norm": 11.033681869506836, + "learning_rate": 8.81156684400206e-06, + "loss": 0.6187, + "step": 8231 + }, + { + "epoch": 1.41, + "grad_norm": 9.541534423828125, + "learning_rate": 8.808992620559464e-06, + "loss": 0.4985, + "step": 8232 + }, + { + "epoch": 1.41, + "grad_norm": 9.84037971496582, + "learning_rate": 8.806418397116871e-06, + "loss": 0.4992, + "step": 8233 + }, + { + "epoch": 1.41, + "grad_norm": 8.408158302307129, + "learning_rate": 8.803844173674276e-06, + "loss": 0.3927, + "step": 8234 + }, + { + "epoch": 1.41, + "grad_norm": 10.84311294555664, + "learning_rate": 8.80126995023168e-06, + "loss": 0.5302, + "step": 8235 + }, + { + "epoch": 1.41, + "grad_norm": 8.212716102600098, + "learning_rate": 8.798695726789084e-06, + "loss": 0.3294, + "step": 8236 + }, + { + "epoch": 1.41, + "grad_norm": 10.820342063903809, + "learning_rate": 8.79612150334649e-06, + "loss": 0.4808, + "step": 8237 + }, + { + "epoch": 1.41, + "grad_norm": 14.624064445495605, + "learning_rate": 8.793547279903896e-06, + "loss": 0.6788, + "step": 8238 + }, + { + "epoch": 1.41, + "grad_norm": 12.806413650512695, + "learning_rate": 8.7909730564613e-06, + "loss": 0.6901, + "step": 8239 + }, + { + "epoch": 1.41, + "grad_norm": 10.613114356994629, + "learning_rate": 8.788398833018706e-06, + "loss": 0.4274, + "step": 8240 + }, + { + "epoch": 1.41, + "grad_norm": 10.828776359558105, + "learning_rate": 8.78582460957611e-06, + "loss": 0.3861, + "step": 8241 + }, + { + "epoch": 1.41, + "grad_norm": 12.33912181854248, + "learning_rate": 8.783250386133517e-06, + "loss": 0.4479, + "step": 8242 + }, + { + "epoch": 1.41, + "grad_norm": 8.551612854003906, + "learning_rate": 8.780676162690922e-06, + "loss": 0.424, + "step": 8243 + }, + { + "epoch": 1.41, + "grad_norm": 15.652555465698242, + "learning_rate": 8.778101939248327e-06, + "loss": 0.6361, + "step": 8244 + }, + { + "epoch": 1.41, + "grad_norm": 10.954252243041992, + "learning_rate": 8.775527715805732e-06, + "loss": 0.4973, + "step": 8245 + }, + { + "epoch": 1.42, + "grad_norm": 9.67759895324707, + "learning_rate": 8.772953492363137e-06, + "loss": 0.467, + "step": 8246 + }, + { + "epoch": 1.42, + "grad_norm": 7.356545925140381, + "learning_rate": 8.770379268920542e-06, + "loss": 0.251, + "step": 8247 + }, + { + "epoch": 1.42, + "grad_norm": 7.583957195281982, + "learning_rate": 8.767805045477947e-06, + "loss": 0.334, + "step": 8248 + }, + { + "epoch": 1.42, + "grad_norm": 10.201501846313477, + "learning_rate": 8.765230822035352e-06, + "loss": 0.4561, + "step": 8249 + }, + { + "epoch": 1.42, + "grad_norm": 8.017091751098633, + "learning_rate": 8.762656598592757e-06, + "loss": 0.336, + "step": 8250 + }, + { + "epoch": 1.42, + "grad_norm": 12.172395706176758, + "learning_rate": 8.760082375150164e-06, + "loss": 0.3834, + "step": 8251 + }, + { + "epoch": 1.42, + "grad_norm": 12.870978355407715, + "learning_rate": 8.757508151707569e-06, + "loss": 0.5051, + "step": 8252 + }, + { + "epoch": 1.42, + "grad_norm": 11.49899959564209, + "learning_rate": 8.754933928264974e-06, + "loss": 0.5327, + "step": 8253 + }, + { + "epoch": 1.42, + "grad_norm": 8.872721672058105, + "learning_rate": 8.752359704822379e-06, + "loss": 0.5365, + "step": 8254 + }, + { + "epoch": 1.42, + "grad_norm": 11.230222702026367, + "learning_rate": 8.749785481379784e-06, + "loss": 0.4969, + "step": 8255 + }, + { + "epoch": 1.42, + "grad_norm": 12.480122566223145, + "learning_rate": 8.74721125793719e-06, + "loss": 0.4104, + "step": 8256 + }, + { + "epoch": 1.42, + "grad_norm": 17.623018264770508, + "learning_rate": 8.744637034494594e-06, + "loss": 0.5869, + "step": 8257 + }, + { + "epoch": 1.42, + "grad_norm": 12.163650512695312, + "learning_rate": 8.742062811051999e-06, + "loss": 0.6378, + "step": 8258 + }, + { + "epoch": 1.42, + "grad_norm": 10.162103652954102, + "learning_rate": 8.739488587609404e-06, + "loss": 0.4801, + "step": 8259 + }, + { + "epoch": 1.42, + "grad_norm": 11.066727638244629, + "learning_rate": 8.736914364166809e-06, + "loss": 0.4924, + "step": 8260 + }, + { + "epoch": 1.42, + "grad_norm": 12.020031929016113, + "learning_rate": 8.734340140724216e-06, + "loss": 0.5363, + "step": 8261 + }, + { + "epoch": 1.42, + "grad_norm": 8.88711929321289, + "learning_rate": 8.73176591728162e-06, + "loss": 0.3471, + "step": 8262 + }, + { + "epoch": 1.42, + "grad_norm": 8.993285179138184, + "learning_rate": 8.729191693839026e-06, + "loss": 0.3973, + "step": 8263 + }, + { + "epoch": 1.42, + "grad_norm": 11.434762954711914, + "learning_rate": 8.72661747039643e-06, + "loss": 0.3801, + "step": 8264 + }, + { + "epoch": 1.42, + "grad_norm": 8.919322967529297, + "learning_rate": 8.724043246953836e-06, + "loss": 0.5145, + "step": 8265 + }, + { + "epoch": 1.42, + "grad_norm": 8.72677230834961, + "learning_rate": 8.721469023511242e-06, + "loss": 0.3841, + "step": 8266 + }, + { + "epoch": 1.42, + "grad_norm": 16.11995506286621, + "learning_rate": 8.718894800068647e-06, + "loss": 0.5194, + "step": 8267 + }, + { + "epoch": 1.42, + "grad_norm": 9.176628112792969, + "learning_rate": 8.71632057662605e-06, + "loss": 0.5011, + "step": 8268 + }, + { + "epoch": 1.42, + "grad_norm": 10.962334632873535, + "learning_rate": 8.713746353183455e-06, + "loss": 0.4482, + "step": 8269 + }, + { + "epoch": 1.42, + "grad_norm": 9.004849433898926, + "learning_rate": 8.711172129740862e-06, + "loss": 0.579, + "step": 8270 + }, + { + "epoch": 1.42, + "grad_norm": 10.271482467651367, + "learning_rate": 8.708597906298267e-06, + "loss": 0.5262, + "step": 8271 + }, + { + "epoch": 1.42, + "grad_norm": 8.614509582519531, + "learning_rate": 8.706023682855672e-06, + "loss": 0.3924, + "step": 8272 + }, + { + "epoch": 1.42, + "grad_norm": 10.539762496948242, + "learning_rate": 8.703449459413077e-06, + "loss": 0.6156, + "step": 8273 + }, + { + "epoch": 1.42, + "grad_norm": 7.934184551239014, + "learning_rate": 8.700875235970482e-06, + "loss": 0.3409, + "step": 8274 + }, + { + "epoch": 1.42, + "grad_norm": 9.466604232788086, + "learning_rate": 8.698301012527889e-06, + "loss": 0.4886, + "step": 8275 + }, + { + "epoch": 1.42, + "grad_norm": 7.193823337554932, + "learning_rate": 8.695726789085294e-06, + "loss": 0.3566, + "step": 8276 + }, + { + "epoch": 1.42, + "grad_norm": 6.440775394439697, + "learning_rate": 8.693152565642699e-06, + "loss": 0.2732, + "step": 8277 + }, + { + "epoch": 1.42, + "grad_norm": 7.61000394821167, + "learning_rate": 8.690578342200102e-06, + "loss": 0.491, + "step": 8278 + }, + { + "epoch": 1.42, + "grad_norm": 11.64564037322998, + "learning_rate": 8.688004118757507e-06, + "loss": 0.7108, + "step": 8279 + }, + { + "epoch": 1.42, + "grad_norm": 8.754938125610352, + "learning_rate": 8.685429895314914e-06, + "loss": 0.341, + "step": 8280 + }, + { + "epoch": 1.42, + "grad_norm": 7.427057266235352, + "learning_rate": 8.682855671872319e-06, + "loss": 0.3672, + "step": 8281 + }, + { + "epoch": 1.42, + "grad_norm": 11.298309326171875, + "learning_rate": 8.680281448429724e-06, + "loss": 0.3598, + "step": 8282 + }, + { + "epoch": 1.42, + "grad_norm": 6.559104919433594, + "learning_rate": 8.677707224987129e-06, + "loss": 0.3229, + "step": 8283 + }, + { + "epoch": 1.42, + "grad_norm": 9.896451950073242, + "learning_rate": 8.675133001544534e-06, + "loss": 0.5372, + "step": 8284 + }, + { + "epoch": 1.42, + "grad_norm": 10.25345516204834, + "learning_rate": 8.67255877810194e-06, + "loss": 0.6119, + "step": 8285 + }, + { + "epoch": 1.42, + "grad_norm": 9.65485954284668, + "learning_rate": 8.669984554659345e-06, + "loss": 0.4507, + "step": 8286 + }, + { + "epoch": 1.42, + "grad_norm": 10.328947067260742, + "learning_rate": 8.66741033121675e-06, + "loss": 0.5, + "step": 8287 + }, + { + "epoch": 1.42, + "grad_norm": 9.528351783752441, + "learning_rate": 8.664836107774154e-06, + "loss": 0.3386, + "step": 8288 + }, + { + "epoch": 1.42, + "grad_norm": 13.252469062805176, + "learning_rate": 8.66226188433156e-06, + "loss": 0.4749, + "step": 8289 + }, + { + "epoch": 1.42, + "grad_norm": 8.864716529846191, + "learning_rate": 8.659687660888965e-06, + "loss": 0.3129, + "step": 8290 + }, + { + "epoch": 1.42, + "grad_norm": 10.331801414489746, + "learning_rate": 8.65711343744637e-06, + "loss": 0.4275, + "step": 8291 + }, + { + "epoch": 1.42, + "grad_norm": 9.158288955688477, + "learning_rate": 8.654539214003775e-06, + "loss": 0.4124, + "step": 8292 + }, + { + "epoch": 1.42, + "grad_norm": 12.22913932800293, + "learning_rate": 8.65196499056118e-06, + "loss": 0.6633, + "step": 8293 + }, + { + "epoch": 1.42, + "grad_norm": 14.249902725219727, + "learning_rate": 8.649390767118587e-06, + "loss": 0.6345, + "step": 8294 + }, + { + "epoch": 1.42, + "grad_norm": 11.750879287719727, + "learning_rate": 8.646816543675992e-06, + "loss": 0.4824, + "step": 8295 + }, + { + "epoch": 1.42, + "grad_norm": 9.58333683013916, + "learning_rate": 8.644242320233397e-06, + "loss": 0.4946, + "step": 8296 + }, + { + "epoch": 1.42, + "grad_norm": 14.40911865234375, + "learning_rate": 8.641668096790802e-06, + "loss": 0.4101, + "step": 8297 + }, + { + "epoch": 1.42, + "grad_norm": 9.808370590209961, + "learning_rate": 8.639093873348207e-06, + "loss": 0.3125, + "step": 8298 + }, + { + "epoch": 1.42, + "grad_norm": 11.678741455078125, + "learning_rate": 8.636519649905612e-06, + "loss": 0.4791, + "step": 8299 + }, + { + "epoch": 1.42, + "grad_norm": 7.651301860809326, + "learning_rate": 8.633945426463017e-06, + "loss": 0.3633, + "step": 8300 + }, + { + "epoch": 1.42, + "grad_norm": 9.924830436706543, + "learning_rate": 8.631371203020422e-06, + "loss": 0.2811, + "step": 8301 + }, + { + "epoch": 1.42, + "grad_norm": 9.635172843933105, + "learning_rate": 8.628796979577827e-06, + "loss": 0.484, + "step": 8302 + }, + { + "epoch": 1.42, + "grad_norm": 10.726848602294922, + "learning_rate": 8.626222756135232e-06, + "loss": 0.385, + "step": 8303 + }, + { + "epoch": 1.43, + "grad_norm": 10.830549240112305, + "learning_rate": 8.623648532692639e-06, + "loss": 0.424, + "step": 8304 + }, + { + "epoch": 1.43, + "grad_norm": 10.686710357666016, + "learning_rate": 8.621074309250044e-06, + "loss": 0.3862, + "step": 8305 + }, + { + "epoch": 1.43, + "grad_norm": 9.640059471130371, + "learning_rate": 8.618500085807448e-06, + "loss": 0.5257, + "step": 8306 + }, + { + "epoch": 1.43, + "grad_norm": 12.689738273620605, + "learning_rate": 8.615925862364853e-06, + "loss": 0.6524, + "step": 8307 + }, + { + "epoch": 1.43, + "grad_norm": 8.673746109008789, + "learning_rate": 8.61335163892226e-06, + "loss": 0.3279, + "step": 8308 + }, + { + "epoch": 1.43, + "grad_norm": 12.808671951293945, + "learning_rate": 8.610777415479663e-06, + "loss": 0.682, + "step": 8309 + }, + { + "epoch": 1.43, + "grad_norm": 7.548581123352051, + "learning_rate": 8.608203192037068e-06, + "loss": 0.2683, + "step": 8310 + }, + { + "epoch": 1.43, + "grad_norm": 9.380051612854004, + "learning_rate": 8.605628968594473e-06, + "loss": 0.366, + "step": 8311 + }, + { + "epoch": 1.43, + "grad_norm": 11.494132041931152, + "learning_rate": 8.603054745151878e-06, + "loss": 0.4889, + "step": 8312 + }, + { + "epoch": 1.43, + "grad_norm": 6.73214864730835, + "learning_rate": 8.600480521709285e-06, + "loss": 0.3131, + "step": 8313 + }, + { + "epoch": 1.43, + "grad_norm": 12.153104782104492, + "learning_rate": 8.59790629826669e-06, + "loss": 0.5418, + "step": 8314 + }, + { + "epoch": 1.43, + "grad_norm": 9.794357299804688, + "learning_rate": 8.595332074824095e-06, + "loss": 0.3256, + "step": 8315 + }, + { + "epoch": 1.43, + "grad_norm": 9.454607963562012, + "learning_rate": 8.5927578513815e-06, + "loss": 0.356, + "step": 8316 + }, + { + "epoch": 1.43, + "grad_norm": 8.618805885314941, + "learning_rate": 8.590183627938905e-06, + "loss": 0.3531, + "step": 8317 + }, + { + "epoch": 1.43, + "grad_norm": 11.349067687988281, + "learning_rate": 8.587609404496312e-06, + "loss": 0.633, + "step": 8318 + }, + { + "epoch": 1.43, + "grad_norm": 11.352596282958984, + "learning_rate": 8.585035181053717e-06, + "loss": 0.5056, + "step": 8319 + }, + { + "epoch": 1.43, + "grad_norm": 12.118658065795898, + "learning_rate": 8.58246095761112e-06, + "loss": 0.7799, + "step": 8320 + }, + { + "epoch": 1.43, + "grad_norm": 12.962812423706055, + "learning_rate": 8.579886734168525e-06, + "loss": 0.5314, + "step": 8321 + }, + { + "epoch": 1.43, + "grad_norm": 12.80367374420166, + "learning_rate": 8.577312510725932e-06, + "loss": 0.4823, + "step": 8322 + }, + { + "epoch": 1.43, + "grad_norm": 9.806482315063477, + "learning_rate": 8.574738287283337e-06, + "loss": 0.3314, + "step": 8323 + }, + { + "epoch": 1.43, + "grad_norm": 9.926474571228027, + "learning_rate": 8.572164063840742e-06, + "loss": 0.334, + "step": 8324 + }, + { + "epoch": 1.43, + "grad_norm": 10.295524597167969, + "learning_rate": 8.569589840398147e-06, + "loss": 0.3899, + "step": 8325 + }, + { + "epoch": 1.43, + "grad_norm": 10.297874450683594, + "learning_rate": 8.567015616955552e-06, + "loss": 0.3585, + "step": 8326 + }, + { + "epoch": 1.43, + "grad_norm": 10.662091255187988, + "learning_rate": 8.564441393512958e-06, + "loss": 0.3612, + "step": 8327 + }, + { + "epoch": 1.43, + "grad_norm": 11.200091361999512, + "learning_rate": 8.561867170070363e-06, + "loss": 0.4963, + "step": 8328 + }, + { + "epoch": 1.43, + "grad_norm": 12.467052459716797, + "learning_rate": 8.559292946627768e-06, + "loss": 0.4722, + "step": 8329 + }, + { + "epoch": 1.43, + "grad_norm": 10.857604026794434, + "learning_rate": 8.556718723185172e-06, + "loss": 0.4791, + "step": 8330 + }, + { + "epoch": 1.43, + "grad_norm": 10.185227394104004, + "learning_rate": 8.554144499742577e-06, + "loss": 0.4457, + "step": 8331 + }, + { + "epoch": 1.43, + "grad_norm": 9.965193748474121, + "learning_rate": 8.551570276299983e-06, + "loss": 0.4176, + "step": 8332 + }, + { + "epoch": 1.43, + "grad_norm": 11.929383277893066, + "learning_rate": 8.548996052857388e-06, + "loss": 0.5286, + "step": 8333 + }, + { + "epoch": 1.43, + "grad_norm": 10.188926696777344, + "learning_rate": 8.546421829414793e-06, + "loss": 0.5374, + "step": 8334 + }, + { + "epoch": 1.43, + "grad_norm": 10.042409896850586, + "learning_rate": 8.543847605972198e-06, + "loss": 0.471, + "step": 8335 + }, + { + "epoch": 1.43, + "grad_norm": 8.140226364135742, + "learning_rate": 8.541273382529603e-06, + "loss": 0.2439, + "step": 8336 + }, + { + "epoch": 1.43, + "grad_norm": 10.757363319396973, + "learning_rate": 8.53869915908701e-06, + "loss": 0.2812, + "step": 8337 + }, + { + "epoch": 1.43, + "grad_norm": 21.087406158447266, + "learning_rate": 8.536124935644415e-06, + "loss": 0.4378, + "step": 8338 + }, + { + "epoch": 1.43, + "grad_norm": 10.848742485046387, + "learning_rate": 8.53355071220182e-06, + "loss": 0.52, + "step": 8339 + }, + { + "epoch": 1.43, + "grad_norm": 14.599913597106934, + "learning_rate": 8.530976488759223e-06, + "loss": 0.6345, + "step": 8340 + }, + { + "epoch": 1.43, + "grad_norm": 9.633811950683594, + "learning_rate": 8.52840226531663e-06, + "loss": 0.4267, + "step": 8341 + }, + { + "epoch": 1.43, + "grad_norm": 9.455072402954102, + "learning_rate": 8.525828041874035e-06, + "loss": 0.4001, + "step": 8342 + }, + { + "epoch": 1.43, + "grad_norm": 8.543682098388672, + "learning_rate": 8.52325381843144e-06, + "loss": 0.2689, + "step": 8343 + }, + { + "epoch": 1.43, + "grad_norm": 13.692220687866211, + "learning_rate": 8.520679594988845e-06, + "loss": 0.488, + "step": 8344 + }, + { + "epoch": 1.43, + "grad_norm": 11.151768684387207, + "learning_rate": 8.51810537154625e-06, + "loss": 0.3154, + "step": 8345 + }, + { + "epoch": 1.43, + "grad_norm": 10.533817291259766, + "learning_rate": 8.515531148103656e-06, + "loss": 0.6282, + "step": 8346 + }, + { + "epoch": 1.43, + "grad_norm": 9.896312713623047, + "learning_rate": 8.512956924661061e-06, + "loss": 0.5397, + "step": 8347 + }, + { + "epoch": 1.43, + "grad_norm": 9.509431838989258, + "learning_rate": 8.510382701218466e-06, + "loss": 0.4958, + "step": 8348 + }, + { + "epoch": 1.43, + "grad_norm": 8.831765174865723, + "learning_rate": 8.507808477775871e-06, + "loss": 0.2785, + "step": 8349 + }, + { + "epoch": 1.43, + "grad_norm": 8.94285774230957, + "learning_rate": 8.505234254333276e-06, + "loss": 0.6302, + "step": 8350 + }, + { + "epoch": 1.43, + "grad_norm": 12.702482223510742, + "learning_rate": 8.502660030890681e-06, + "loss": 0.6218, + "step": 8351 + }, + { + "epoch": 1.43, + "grad_norm": 8.443445205688477, + "learning_rate": 8.500085807448086e-06, + "loss": 0.3523, + "step": 8352 + }, + { + "epoch": 1.43, + "grad_norm": 8.827876091003418, + "learning_rate": 8.497511584005491e-06, + "loss": 0.3185, + "step": 8353 + }, + { + "epoch": 1.43, + "grad_norm": 11.086119651794434, + "learning_rate": 8.494937360562896e-06, + "loss": 0.397, + "step": 8354 + }, + { + "epoch": 1.43, + "grad_norm": 10.287346839904785, + "learning_rate": 8.492363137120301e-06, + "loss": 0.3258, + "step": 8355 + }, + { + "epoch": 1.43, + "grad_norm": 14.768567085266113, + "learning_rate": 8.489788913677708e-06, + "loss": 0.7088, + "step": 8356 + }, + { + "epoch": 1.43, + "grad_norm": 11.253920555114746, + "learning_rate": 8.487214690235113e-06, + "loss": 0.4393, + "step": 8357 + }, + { + "epoch": 1.43, + "grad_norm": 9.461947441101074, + "learning_rate": 8.484640466792518e-06, + "loss": 0.4945, + "step": 8358 + }, + { + "epoch": 1.43, + "grad_norm": 11.910832405090332, + "learning_rate": 8.482066243349923e-06, + "loss": 0.5817, + "step": 8359 + }, + { + "epoch": 1.43, + "grad_norm": 10.067282676696777, + "learning_rate": 8.47949201990733e-06, + "loss": 0.5148, + "step": 8360 + }, + { + "epoch": 1.43, + "grad_norm": 13.792328834533691, + "learning_rate": 8.476917796464733e-06, + "loss": 0.5585, + "step": 8361 + }, + { + "epoch": 1.44, + "grad_norm": 11.34477424621582, + "learning_rate": 8.474343573022138e-06, + "loss": 0.4886, + "step": 8362 + }, + { + "epoch": 1.44, + "grad_norm": 9.764257431030273, + "learning_rate": 8.471769349579543e-06, + "loss": 0.4799, + "step": 8363 + }, + { + "epoch": 1.44, + "grad_norm": 14.07537841796875, + "learning_rate": 8.469195126136948e-06, + "loss": 0.5224, + "step": 8364 + }, + { + "epoch": 1.44, + "grad_norm": 10.835885047912598, + "learning_rate": 8.466620902694355e-06, + "loss": 0.5015, + "step": 8365 + }, + { + "epoch": 1.44, + "grad_norm": 8.417989730834961, + "learning_rate": 8.46404667925176e-06, + "loss": 0.4305, + "step": 8366 + }, + { + "epoch": 1.44, + "grad_norm": 11.092061042785645, + "learning_rate": 8.461472455809165e-06, + "loss": 0.481, + "step": 8367 + }, + { + "epoch": 1.44, + "grad_norm": 13.665326118469238, + "learning_rate": 8.45889823236657e-06, + "loss": 0.8116, + "step": 8368 + }, + { + "epoch": 1.44, + "grad_norm": 11.353106498718262, + "learning_rate": 8.456324008923975e-06, + "loss": 0.3589, + "step": 8369 + }, + { + "epoch": 1.44, + "grad_norm": 11.502452850341797, + "learning_rate": 8.453749785481381e-06, + "loss": 0.4884, + "step": 8370 + }, + { + "epoch": 1.44, + "grad_norm": 10.927734375, + "learning_rate": 8.451175562038786e-06, + "loss": 0.3571, + "step": 8371 + }, + { + "epoch": 1.44, + "grad_norm": 13.052430152893066, + "learning_rate": 8.44860133859619e-06, + "loss": 0.5337, + "step": 8372 + }, + { + "epoch": 1.44, + "grad_norm": 13.651488304138184, + "learning_rate": 8.446027115153594e-06, + "loss": 0.5971, + "step": 8373 + }, + { + "epoch": 1.44, + "grad_norm": 12.430017471313477, + "learning_rate": 8.443452891711001e-06, + "loss": 0.5956, + "step": 8374 + }, + { + "epoch": 1.44, + "grad_norm": 8.363251686096191, + "learning_rate": 8.440878668268406e-06, + "loss": 0.4146, + "step": 8375 + }, + { + "epoch": 1.44, + "grad_norm": 8.345171928405762, + "learning_rate": 8.438304444825811e-06, + "loss": 0.4958, + "step": 8376 + }, + { + "epoch": 1.44, + "grad_norm": 10.103143692016602, + "learning_rate": 8.435730221383216e-06, + "loss": 0.4342, + "step": 8377 + }, + { + "epoch": 1.44, + "grad_norm": 12.357614517211914, + "learning_rate": 8.433155997940621e-06, + "loss": 0.4971, + "step": 8378 + }, + { + "epoch": 1.44, + "grad_norm": 12.737123489379883, + "learning_rate": 8.430581774498028e-06, + "loss": 0.4342, + "step": 8379 + }, + { + "epoch": 1.44, + "grad_norm": 7.8394904136657715, + "learning_rate": 8.428007551055433e-06, + "loss": 0.3054, + "step": 8380 + }, + { + "epoch": 1.44, + "grad_norm": 10.861248016357422, + "learning_rate": 8.425433327612838e-06, + "loss": 0.4876, + "step": 8381 + }, + { + "epoch": 1.44, + "grad_norm": 12.34762191772461, + "learning_rate": 8.422859104170241e-06, + "loss": 0.4909, + "step": 8382 + }, + { + "epoch": 1.44, + "grad_norm": 13.439802169799805, + "learning_rate": 8.420284880727646e-06, + "loss": 0.6265, + "step": 8383 + }, + { + "epoch": 1.44, + "grad_norm": 9.666215896606445, + "learning_rate": 8.417710657285053e-06, + "loss": 0.2877, + "step": 8384 + }, + { + "epoch": 1.44, + "grad_norm": 11.26455307006836, + "learning_rate": 8.415136433842458e-06, + "loss": 0.59, + "step": 8385 + }, + { + "epoch": 1.44, + "grad_norm": 9.452399253845215, + "learning_rate": 8.412562210399863e-06, + "loss": 0.2802, + "step": 8386 + }, + { + "epoch": 1.44, + "grad_norm": 12.878459930419922, + "learning_rate": 8.409987986957268e-06, + "loss": 0.5966, + "step": 8387 + }, + { + "epoch": 1.44, + "grad_norm": 8.647378921508789, + "learning_rate": 8.407413763514673e-06, + "loss": 0.2942, + "step": 8388 + }, + { + "epoch": 1.44, + "grad_norm": 10.983903884887695, + "learning_rate": 8.40483954007208e-06, + "loss": 0.4485, + "step": 8389 + }, + { + "epoch": 1.44, + "grad_norm": 11.199638366699219, + "learning_rate": 8.402265316629484e-06, + "loss": 0.5004, + "step": 8390 + }, + { + "epoch": 1.44, + "grad_norm": 12.529352188110352, + "learning_rate": 8.39969109318689e-06, + "loss": 0.398, + "step": 8391 + }, + { + "epoch": 1.44, + "grad_norm": 11.492273330688477, + "learning_rate": 8.397116869744293e-06, + "loss": 0.4537, + "step": 8392 + }, + { + "epoch": 1.44, + "grad_norm": 13.803034782409668, + "learning_rate": 8.3945426463017e-06, + "loss": 0.6738, + "step": 8393 + }, + { + "epoch": 1.44, + "grad_norm": 11.096636772155762, + "learning_rate": 8.391968422859104e-06, + "loss": 0.5831, + "step": 8394 + }, + { + "epoch": 1.44, + "grad_norm": 8.942265510559082, + "learning_rate": 8.38939419941651e-06, + "loss": 0.3807, + "step": 8395 + }, + { + "epoch": 1.44, + "grad_norm": 10.381046295166016, + "learning_rate": 8.386819975973914e-06, + "loss": 0.3847, + "step": 8396 + }, + { + "epoch": 1.44, + "grad_norm": 11.610161781311035, + "learning_rate": 8.38424575253132e-06, + "loss": 0.4258, + "step": 8397 + }, + { + "epoch": 1.44, + "grad_norm": 11.381559371948242, + "learning_rate": 8.381671529088726e-06, + "loss": 0.6171, + "step": 8398 + }, + { + "epoch": 1.44, + "grad_norm": 11.574058532714844, + "learning_rate": 8.379097305646131e-06, + "loss": 0.342, + "step": 8399 + }, + { + "epoch": 1.44, + "grad_norm": 13.315040588378906, + "learning_rate": 8.376523082203536e-06, + "loss": 0.5536, + "step": 8400 + }, + { + "epoch": 1.44, + "grad_norm": 8.571113586425781, + "learning_rate": 8.373948858760941e-06, + "loss": 0.3625, + "step": 8401 + }, + { + "epoch": 1.44, + "grad_norm": 16.108417510986328, + "learning_rate": 8.371374635318346e-06, + "loss": 0.5363, + "step": 8402 + }, + { + "epoch": 1.44, + "grad_norm": 11.237990379333496, + "learning_rate": 8.368800411875751e-06, + "loss": 0.5317, + "step": 8403 + }, + { + "epoch": 1.44, + "grad_norm": 10.077326774597168, + "learning_rate": 8.366226188433156e-06, + "loss": 0.2537, + "step": 8404 + }, + { + "epoch": 1.44, + "grad_norm": 12.669939041137695, + "learning_rate": 8.363651964990561e-06, + "loss": 0.3767, + "step": 8405 + }, + { + "epoch": 1.44, + "grad_norm": 9.217723846435547, + "learning_rate": 8.361077741547966e-06, + "loss": 0.4565, + "step": 8406 + }, + { + "epoch": 1.44, + "grad_norm": 10.763214111328125, + "learning_rate": 8.35850351810537e-06, + "loss": 0.3536, + "step": 8407 + }, + { + "epoch": 1.44, + "grad_norm": 10.262199401855469, + "learning_rate": 8.355929294662778e-06, + "loss": 0.5818, + "step": 8408 + }, + { + "epoch": 1.44, + "grad_norm": 10.669377326965332, + "learning_rate": 8.353355071220183e-06, + "loss": 0.4387, + "step": 8409 + }, + { + "epoch": 1.44, + "grad_norm": 11.528432846069336, + "learning_rate": 8.350780847777588e-06, + "loss": 0.5647, + "step": 8410 + }, + { + "epoch": 1.44, + "grad_norm": 10.921568870544434, + "learning_rate": 8.348206624334992e-06, + "loss": 0.4815, + "step": 8411 + }, + { + "epoch": 1.44, + "grad_norm": 11.749783515930176, + "learning_rate": 8.3456324008924e-06, + "loss": 0.5651, + "step": 8412 + }, + { + "epoch": 1.44, + "grad_norm": 7.831440448760986, + "learning_rate": 8.343058177449802e-06, + "loss": 0.3119, + "step": 8413 + }, + { + "epoch": 1.44, + "grad_norm": 11.510527610778809, + "learning_rate": 8.340483954007207e-06, + "loss": 0.5791, + "step": 8414 + }, + { + "epoch": 1.44, + "grad_norm": 11.164763450622559, + "learning_rate": 8.337909730564612e-06, + "loss": 0.4462, + "step": 8415 + }, + { + "epoch": 1.44, + "grad_norm": 8.396998405456543, + "learning_rate": 8.335335507122017e-06, + "loss": 0.4084, + "step": 8416 + }, + { + "epoch": 1.44, + "grad_norm": 8.974717140197754, + "learning_rate": 8.332761283679424e-06, + "loss": 0.2652, + "step": 8417 + }, + { + "epoch": 1.44, + "grad_norm": 10.95771312713623, + "learning_rate": 8.330187060236829e-06, + "loss": 0.5407, + "step": 8418 + }, + { + "epoch": 1.44, + "grad_norm": 8.97205638885498, + "learning_rate": 8.327612836794234e-06, + "loss": 0.4735, + "step": 8419 + }, + { + "epoch": 1.44, + "grad_norm": 9.182731628417969, + "learning_rate": 8.325038613351639e-06, + "loss": 0.4982, + "step": 8420 + }, + { + "epoch": 1.45, + "grad_norm": 7.552624225616455, + "learning_rate": 8.322464389909044e-06, + "loss": 0.3196, + "step": 8421 + }, + { + "epoch": 1.45, + "grad_norm": 9.71505355834961, + "learning_rate": 8.31989016646645e-06, + "loss": 0.5178, + "step": 8422 + }, + { + "epoch": 1.45, + "grad_norm": 10.814096450805664, + "learning_rate": 8.317315943023856e-06, + "loss": 0.6398, + "step": 8423 + }, + { + "epoch": 1.45, + "grad_norm": 9.24152946472168, + "learning_rate": 8.314741719581259e-06, + "loss": 0.5214, + "step": 8424 + }, + { + "epoch": 1.45, + "grad_norm": 9.424005508422852, + "learning_rate": 8.312167496138664e-06, + "loss": 0.405, + "step": 8425 + }, + { + "epoch": 1.45, + "grad_norm": 12.650248527526855, + "learning_rate": 8.30959327269607e-06, + "loss": 0.4663, + "step": 8426 + }, + { + "epoch": 1.45, + "grad_norm": 7.279358386993408, + "learning_rate": 8.307019049253476e-06, + "loss": 0.3011, + "step": 8427 + }, + { + "epoch": 1.45, + "grad_norm": 9.594501495361328, + "learning_rate": 8.30444482581088e-06, + "loss": 0.3335, + "step": 8428 + }, + { + "epoch": 1.45, + "grad_norm": 11.36333179473877, + "learning_rate": 8.301870602368286e-06, + "loss": 0.4246, + "step": 8429 + }, + { + "epoch": 1.45, + "grad_norm": 9.192784309387207, + "learning_rate": 8.29929637892569e-06, + "loss": 0.4719, + "step": 8430 + }, + { + "epoch": 1.45, + "grad_norm": 10.950685501098633, + "learning_rate": 8.296722155483097e-06, + "loss": 0.3771, + "step": 8431 + }, + { + "epoch": 1.45, + "grad_norm": 15.547282218933105, + "learning_rate": 8.294147932040502e-06, + "loss": 0.6865, + "step": 8432 + }, + { + "epoch": 1.45, + "grad_norm": 11.424164772033691, + "learning_rate": 8.291573708597907e-06, + "loss": 0.5652, + "step": 8433 + }, + { + "epoch": 1.45, + "grad_norm": 10.865653038024902, + "learning_rate": 8.28899948515531e-06, + "loss": 0.4456, + "step": 8434 + }, + { + "epoch": 1.45, + "grad_norm": 14.219254493713379, + "learning_rate": 8.286425261712716e-06, + "loss": 0.552, + "step": 8435 + }, + { + "epoch": 1.45, + "grad_norm": 8.369195938110352, + "learning_rate": 8.283851038270122e-06, + "loss": 0.4501, + "step": 8436 + }, + { + "epoch": 1.45, + "grad_norm": 8.32481575012207, + "learning_rate": 8.281276814827527e-06, + "loss": 0.4291, + "step": 8437 + }, + { + "epoch": 1.45, + "grad_norm": 11.981685638427734, + "learning_rate": 8.278702591384932e-06, + "loss": 0.4604, + "step": 8438 + }, + { + "epoch": 1.45, + "grad_norm": 10.983774185180664, + "learning_rate": 8.276128367942337e-06, + "loss": 0.4458, + "step": 8439 + }, + { + "epoch": 1.45, + "grad_norm": 12.131538391113281, + "learning_rate": 8.273554144499742e-06, + "loss": 0.3347, + "step": 8440 + }, + { + "epoch": 1.45, + "grad_norm": 10.760303497314453, + "learning_rate": 8.270979921057149e-06, + "loss": 0.4634, + "step": 8441 + }, + { + "epoch": 1.45, + "grad_norm": 12.544533729553223, + "learning_rate": 8.268405697614554e-06, + "loss": 0.5307, + "step": 8442 + }, + { + "epoch": 1.45, + "grad_norm": 10.032904624938965, + "learning_rate": 8.265831474171959e-06, + "loss": 0.3305, + "step": 8443 + }, + { + "epoch": 1.45, + "grad_norm": 9.01931381225586, + "learning_rate": 8.263257250729364e-06, + "loss": 0.3629, + "step": 8444 + }, + { + "epoch": 1.45, + "grad_norm": 9.627394676208496, + "learning_rate": 8.260683027286769e-06, + "loss": 0.4172, + "step": 8445 + }, + { + "epoch": 1.45, + "grad_norm": 8.199492454528809, + "learning_rate": 8.258108803844174e-06, + "loss": 0.3404, + "step": 8446 + }, + { + "epoch": 1.45, + "grad_norm": 8.086860656738281, + "learning_rate": 8.255534580401579e-06, + "loss": 0.3776, + "step": 8447 + }, + { + "epoch": 1.45, + "grad_norm": 11.144417762756348, + "learning_rate": 8.252960356958984e-06, + "loss": 0.4841, + "step": 8448 + }, + { + "epoch": 1.45, + "grad_norm": 8.8674898147583, + "learning_rate": 8.250386133516389e-06, + "loss": 0.4192, + "step": 8449 + }, + { + "epoch": 1.45, + "grad_norm": 9.37148666381836, + "learning_rate": 8.247811910073795e-06, + "loss": 0.4222, + "step": 8450 + }, + { + "epoch": 1.45, + "grad_norm": 11.26028060913086, + "learning_rate": 8.2452376866312e-06, + "loss": 0.5195, + "step": 8451 + }, + { + "epoch": 1.45, + "grad_norm": 10.05476188659668, + "learning_rate": 8.242663463188605e-06, + "loss": 0.4429, + "step": 8452 + }, + { + "epoch": 1.45, + "grad_norm": 10.395622253417969, + "learning_rate": 8.24008923974601e-06, + "loss": 0.5244, + "step": 8453 + }, + { + "epoch": 1.45, + "grad_norm": 10.304303169250488, + "learning_rate": 8.237515016303415e-06, + "loss": 0.4451, + "step": 8454 + }, + { + "epoch": 1.45, + "grad_norm": 11.714488983154297, + "learning_rate": 8.23494079286082e-06, + "loss": 0.6974, + "step": 8455 + }, + { + "epoch": 1.45, + "grad_norm": 15.059340476989746, + "learning_rate": 8.232366569418225e-06, + "loss": 0.728, + "step": 8456 + }, + { + "epoch": 1.45, + "grad_norm": 8.829148292541504, + "learning_rate": 8.22979234597563e-06, + "loss": 0.4292, + "step": 8457 + }, + { + "epoch": 1.45, + "grad_norm": 13.803596496582031, + "learning_rate": 8.227218122533035e-06, + "loss": 0.45, + "step": 8458 + }, + { + "epoch": 1.45, + "grad_norm": 9.919529914855957, + "learning_rate": 8.22464389909044e-06, + "loss": 0.4653, + "step": 8459 + }, + { + "epoch": 1.45, + "grad_norm": 11.814225196838379, + "learning_rate": 8.222069675647847e-06, + "loss": 0.3956, + "step": 8460 + }, + { + "epoch": 1.45, + "grad_norm": 8.573873519897461, + "learning_rate": 8.219495452205252e-06, + "loss": 0.4426, + "step": 8461 + }, + { + "epoch": 1.45, + "grad_norm": 10.42048168182373, + "learning_rate": 8.216921228762657e-06, + "loss": 0.518, + "step": 8462 + }, + { + "epoch": 1.45, + "grad_norm": 10.783432960510254, + "learning_rate": 8.214347005320062e-06, + "loss": 0.5066, + "step": 8463 + }, + { + "epoch": 1.45, + "grad_norm": 11.45118236541748, + "learning_rate": 8.211772781877469e-06, + "loss": 0.5789, + "step": 8464 + }, + { + "epoch": 1.45, + "grad_norm": 8.957793235778809, + "learning_rate": 8.209198558434872e-06, + "loss": 0.4351, + "step": 8465 + }, + { + "epoch": 1.45, + "grad_norm": 11.776083946228027, + "learning_rate": 8.206624334992277e-06, + "loss": 0.4333, + "step": 8466 + }, + { + "epoch": 1.45, + "grad_norm": 9.99911880493164, + "learning_rate": 8.204050111549682e-06, + "loss": 0.4055, + "step": 8467 + }, + { + "epoch": 1.45, + "grad_norm": 10.94580364227295, + "learning_rate": 8.201475888107087e-06, + "loss": 0.5057, + "step": 8468 + }, + { + "epoch": 1.45, + "grad_norm": 12.577740669250488, + "learning_rate": 8.198901664664494e-06, + "loss": 0.4571, + "step": 8469 + }, + { + "epoch": 1.45, + "grad_norm": 7.692513942718506, + "learning_rate": 8.196327441221899e-06, + "loss": 0.2926, + "step": 8470 + }, + { + "epoch": 1.45, + "grad_norm": 11.46033763885498, + "learning_rate": 8.193753217779304e-06, + "loss": 0.6169, + "step": 8471 + }, + { + "epoch": 1.45, + "grad_norm": 11.683175086975098, + "learning_rate": 8.191178994336709e-06, + "loss": 0.3517, + "step": 8472 + }, + { + "epoch": 1.45, + "grad_norm": 8.816125869750977, + "learning_rate": 8.188604770894114e-06, + "loss": 0.466, + "step": 8473 + }, + { + "epoch": 1.45, + "grad_norm": 11.747037887573242, + "learning_rate": 8.18603054745152e-06, + "loss": 0.4297, + "step": 8474 + }, + { + "epoch": 1.45, + "grad_norm": 8.123262405395508, + "learning_rate": 8.183456324008925e-06, + "loss": 0.3166, + "step": 8475 + }, + { + "epoch": 1.45, + "grad_norm": 12.995744705200195, + "learning_rate": 8.180882100566329e-06, + "loss": 0.5092, + "step": 8476 + }, + { + "epoch": 1.45, + "grad_norm": 13.646529197692871, + "learning_rate": 8.178307877123734e-06, + "loss": 0.4361, + "step": 8477 + }, + { + "epoch": 1.45, + "grad_norm": 10.212658882141113, + "learning_rate": 8.175733653681138e-06, + "loss": 0.352, + "step": 8478 + }, + { + "epoch": 1.46, + "grad_norm": 13.237067222595215, + "learning_rate": 8.173159430238545e-06, + "loss": 0.4433, + "step": 8479 + }, + { + "epoch": 1.46, + "grad_norm": 10.00698184967041, + "learning_rate": 8.17058520679595e-06, + "loss": 0.4, + "step": 8480 + }, + { + "epoch": 1.46, + "grad_norm": 9.731039047241211, + "learning_rate": 8.168010983353355e-06, + "loss": 0.3632, + "step": 8481 + }, + { + "epoch": 1.46, + "grad_norm": 7.285453796386719, + "learning_rate": 8.16543675991076e-06, + "loss": 0.2463, + "step": 8482 + }, + { + "epoch": 1.46, + "grad_norm": 9.291986465454102, + "learning_rate": 8.162862536468167e-06, + "loss": 0.4433, + "step": 8483 + }, + { + "epoch": 1.46, + "grad_norm": 12.476229667663574, + "learning_rate": 8.160288313025572e-06, + "loss": 0.4293, + "step": 8484 + }, + { + "epoch": 1.46, + "grad_norm": 8.852197647094727, + "learning_rate": 8.157714089582977e-06, + "loss": 0.4534, + "step": 8485 + }, + { + "epoch": 1.46, + "grad_norm": 12.264104843139648, + "learning_rate": 8.15513986614038e-06, + "loss": 0.5055, + "step": 8486 + }, + { + "epoch": 1.46, + "grad_norm": 13.131600379943848, + "learning_rate": 8.152565642697785e-06, + "loss": 0.5458, + "step": 8487 + }, + { + "epoch": 1.46, + "grad_norm": 13.988832473754883, + "learning_rate": 8.149991419255192e-06, + "loss": 0.6621, + "step": 8488 + }, + { + "epoch": 1.46, + "grad_norm": 12.021470069885254, + "learning_rate": 8.147417195812597e-06, + "loss": 0.4055, + "step": 8489 + }, + { + "epoch": 1.46, + "grad_norm": 12.236593246459961, + "learning_rate": 8.144842972370002e-06, + "loss": 0.5235, + "step": 8490 + }, + { + "epoch": 1.46, + "grad_norm": 10.95366096496582, + "learning_rate": 8.142268748927407e-06, + "loss": 0.48, + "step": 8491 + }, + { + "epoch": 1.46, + "grad_norm": 11.198348999023438, + "learning_rate": 8.139694525484812e-06, + "loss": 0.467, + "step": 8492 + }, + { + "epoch": 1.46, + "grad_norm": 9.833405494689941, + "learning_rate": 8.137120302042218e-06, + "loss": 0.3358, + "step": 8493 + }, + { + "epoch": 1.46, + "grad_norm": 11.655987739562988, + "learning_rate": 8.134546078599623e-06, + "loss": 0.5777, + "step": 8494 + }, + { + "epoch": 1.46, + "grad_norm": 11.078511238098145, + "learning_rate": 8.131971855157028e-06, + "loss": 0.4674, + "step": 8495 + }, + { + "epoch": 1.46, + "grad_norm": 11.794244766235352, + "learning_rate": 8.129397631714433e-06, + "loss": 0.4774, + "step": 8496 + }, + { + "epoch": 1.46, + "grad_norm": 9.90333080291748, + "learning_rate": 8.126823408271838e-06, + "loss": 0.4316, + "step": 8497 + }, + { + "epoch": 1.46, + "grad_norm": 13.14542007446289, + "learning_rate": 8.124249184829243e-06, + "loss": 0.4482, + "step": 8498 + }, + { + "epoch": 1.46, + "grad_norm": 8.563660621643066, + "learning_rate": 8.121674961386648e-06, + "loss": 0.5368, + "step": 8499 + }, + { + "epoch": 1.46, + "grad_norm": 8.97414493560791, + "learning_rate": 8.119100737944053e-06, + "loss": 0.4153, + "step": 8500 + }, + { + "epoch": 1.46, + "grad_norm": 8.99500560760498, + "learning_rate": 8.116526514501458e-06, + "loss": 0.4568, + "step": 8501 + }, + { + "epoch": 1.46, + "grad_norm": 9.135347366333008, + "learning_rate": 8.113952291058865e-06, + "loss": 0.3883, + "step": 8502 + }, + { + "epoch": 1.46, + "grad_norm": 10.680998802185059, + "learning_rate": 8.11137806761627e-06, + "loss": 0.5393, + "step": 8503 + }, + { + "epoch": 1.46, + "grad_norm": 12.127410888671875, + "learning_rate": 8.108803844173675e-06, + "loss": 0.5414, + "step": 8504 + }, + { + "epoch": 1.46, + "grad_norm": 9.314909934997559, + "learning_rate": 8.10622962073108e-06, + "loss": 0.3377, + "step": 8505 + }, + { + "epoch": 1.46, + "grad_norm": 16.325355529785156, + "learning_rate": 8.103655397288485e-06, + "loss": 0.8095, + "step": 8506 + }, + { + "epoch": 1.46, + "grad_norm": 9.419428825378418, + "learning_rate": 8.10108117384589e-06, + "loss": 0.4026, + "step": 8507 + }, + { + "epoch": 1.46, + "grad_norm": 10.552887916564941, + "learning_rate": 8.098506950403295e-06, + "loss": 0.5147, + "step": 8508 + }, + { + "epoch": 1.46, + "grad_norm": 9.195188522338867, + "learning_rate": 8.0959327269607e-06, + "loss": 0.4182, + "step": 8509 + }, + { + "epoch": 1.46, + "grad_norm": 10.206841468811035, + "learning_rate": 8.093358503518105e-06, + "loss": 0.5025, + "step": 8510 + }, + { + "epoch": 1.46, + "grad_norm": 7.249957084655762, + "learning_rate": 8.09078428007551e-06, + "loss": 0.2775, + "step": 8511 + }, + { + "epoch": 1.46, + "grad_norm": 11.243851661682129, + "learning_rate": 8.088210056632917e-06, + "loss": 0.4352, + "step": 8512 + }, + { + "epoch": 1.46, + "grad_norm": 9.872894287109375, + "learning_rate": 8.085635833190322e-06, + "loss": 0.331, + "step": 8513 + }, + { + "epoch": 1.46, + "grad_norm": 11.04076099395752, + "learning_rate": 8.083061609747727e-06, + "loss": 0.363, + "step": 8514 + }, + { + "epoch": 1.46, + "grad_norm": 15.39079475402832, + "learning_rate": 8.080487386305131e-06, + "loss": 0.4156, + "step": 8515 + }, + { + "epoch": 1.46, + "grad_norm": 7.378559589385986, + "learning_rate": 8.077913162862538e-06, + "loss": 0.2945, + "step": 8516 + }, + { + "epoch": 1.46, + "grad_norm": 9.975764274597168, + "learning_rate": 8.075338939419941e-06, + "loss": 0.5242, + "step": 8517 + }, + { + "epoch": 1.46, + "grad_norm": 8.688372611999512, + "learning_rate": 8.072764715977346e-06, + "loss": 0.5725, + "step": 8518 + }, + { + "epoch": 1.46, + "grad_norm": 12.530060768127441, + "learning_rate": 8.070190492534751e-06, + "loss": 0.5253, + "step": 8519 + }, + { + "epoch": 1.46, + "grad_norm": 13.306748390197754, + "learning_rate": 8.067616269092156e-06, + "loss": 0.5221, + "step": 8520 + }, + { + "epoch": 1.46, + "grad_norm": 9.011393547058105, + "learning_rate": 8.065042045649563e-06, + "loss": 0.5562, + "step": 8521 + }, + { + "epoch": 1.46, + "grad_norm": 7.171546936035156, + "learning_rate": 8.062467822206968e-06, + "loss": 0.3144, + "step": 8522 + }, + { + "epoch": 1.46, + "grad_norm": 15.945513725280762, + "learning_rate": 8.059893598764373e-06, + "loss": 0.5019, + "step": 8523 + }, + { + "epoch": 1.46, + "grad_norm": 11.490241050720215, + "learning_rate": 8.057319375321778e-06, + "loss": 0.4945, + "step": 8524 + }, + { + "epoch": 1.46, + "grad_norm": 10.841147422790527, + "learning_rate": 8.054745151879183e-06, + "loss": 0.479, + "step": 8525 + }, + { + "epoch": 1.46, + "grad_norm": 9.656150817871094, + "learning_rate": 8.05217092843659e-06, + "loss": 0.4177, + "step": 8526 + }, + { + "epoch": 1.46, + "grad_norm": 13.543272018432617, + "learning_rate": 8.049596704993995e-06, + "loss": 0.4383, + "step": 8527 + }, + { + "epoch": 1.46, + "grad_norm": 11.524797439575195, + "learning_rate": 8.047022481551398e-06, + "loss": 0.5641, + "step": 8528 + }, + { + "epoch": 1.46, + "grad_norm": 10.433579444885254, + "learning_rate": 8.044448258108803e-06, + "loss": 0.4055, + "step": 8529 + }, + { + "epoch": 1.46, + "grad_norm": 9.726025581359863, + "learning_rate": 8.041874034666208e-06, + "loss": 0.4021, + "step": 8530 + }, + { + "epoch": 1.46, + "grad_norm": 9.450407028198242, + "learning_rate": 8.039299811223615e-06, + "loss": 0.3207, + "step": 8531 + }, + { + "epoch": 1.46, + "grad_norm": 11.087461471557617, + "learning_rate": 8.03672558778102e-06, + "loss": 0.4748, + "step": 8532 + }, + { + "epoch": 1.46, + "grad_norm": 12.776268005371094, + "learning_rate": 8.034151364338425e-06, + "loss": 0.3336, + "step": 8533 + }, + { + "epoch": 1.46, + "grad_norm": 8.61547565460205, + "learning_rate": 8.03157714089583e-06, + "loss": 0.3658, + "step": 8534 + }, + { + "epoch": 1.46, + "grad_norm": 17.384796142578125, + "learning_rate": 8.029002917453236e-06, + "loss": 0.6244, + "step": 8535 + }, + { + "epoch": 1.46, + "grad_norm": 9.831313133239746, + "learning_rate": 8.026428694010641e-06, + "loss": 0.3752, + "step": 8536 + }, + { + "epoch": 1.47, + "grad_norm": 7.768578052520752, + "learning_rate": 8.023854470568046e-06, + "loss": 0.3222, + "step": 8537 + }, + { + "epoch": 1.47, + "grad_norm": 7.5904693603515625, + "learning_rate": 8.02128024712545e-06, + "loss": 0.324, + "step": 8538 + }, + { + "epoch": 1.47, + "grad_norm": 10.682817459106445, + "learning_rate": 8.018706023682855e-06, + "loss": 0.5733, + "step": 8539 + }, + { + "epoch": 1.47, + "grad_norm": 10.738873481750488, + "learning_rate": 8.016131800240261e-06, + "loss": 0.7028, + "step": 8540 + }, + { + "epoch": 1.47, + "grad_norm": 9.896012306213379, + "learning_rate": 8.013557576797666e-06, + "loss": 0.2906, + "step": 8541 + }, + { + "epoch": 1.47, + "grad_norm": 12.165061950683594, + "learning_rate": 8.010983353355071e-06, + "loss": 0.5081, + "step": 8542 + }, + { + "epoch": 1.47, + "grad_norm": 9.817131042480469, + "learning_rate": 8.008409129912476e-06, + "loss": 0.4214, + "step": 8543 + }, + { + "epoch": 1.47, + "grad_norm": 11.760784149169922, + "learning_rate": 8.005834906469881e-06, + "loss": 0.5577, + "step": 8544 + }, + { + "epoch": 1.47, + "grad_norm": 9.888113021850586, + "learning_rate": 8.003260683027288e-06, + "loss": 0.4521, + "step": 8545 + }, + { + "epoch": 1.47, + "grad_norm": 9.918608665466309, + "learning_rate": 8.000686459584693e-06, + "loss": 0.3398, + "step": 8546 + }, + { + "epoch": 1.47, + "grad_norm": 11.417941093444824, + "learning_rate": 7.998112236142098e-06, + "loss": 0.3342, + "step": 8547 + }, + { + "epoch": 1.47, + "grad_norm": 16.36760711669922, + "learning_rate": 7.995538012699503e-06, + "loss": 0.4828, + "step": 8548 + }, + { + "epoch": 1.47, + "grad_norm": 12.83072280883789, + "learning_rate": 7.992963789256908e-06, + "loss": 0.5709, + "step": 8549 + }, + { + "epoch": 1.47, + "grad_norm": 10.945616722106934, + "learning_rate": 7.990389565814313e-06, + "loss": 0.4415, + "step": 8550 + }, + { + "epoch": 1.47, + "grad_norm": 10.12635612487793, + "learning_rate": 7.987815342371718e-06, + "loss": 0.4411, + "step": 8551 + }, + { + "epoch": 1.47, + "grad_norm": 12.528286933898926, + "learning_rate": 7.985241118929123e-06, + "loss": 0.2534, + "step": 8552 + }, + { + "epoch": 1.47, + "grad_norm": 10.1712646484375, + "learning_rate": 7.982666895486528e-06, + "loss": 0.3184, + "step": 8553 + }, + { + "epoch": 1.47, + "grad_norm": 10.892218589782715, + "learning_rate": 7.980092672043934e-06, + "loss": 0.3492, + "step": 8554 + }, + { + "epoch": 1.47, + "grad_norm": 14.465339660644531, + "learning_rate": 7.97751844860134e-06, + "loss": 0.5743, + "step": 8555 + }, + { + "epoch": 1.47, + "grad_norm": 10.969108581542969, + "learning_rate": 7.974944225158744e-06, + "loss": 0.5014, + "step": 8556 + }, + { + "epoch": 1.47, + "grad_norm": 9.110034942626953, + "learning_rate": 7.97237000171615e-06, + "loss": 0.5086, + "step": 8557 + }, + { + "epoch": 1.47, + "grad_norm": 11.674614906311035, + "learning_rate": 7.969795778273554e-06, + "loss": 0.3687, + "step": 8558 + }, + { + "epoch": 1.47, + "grad_norm": 9.170455932617188, + "learning_rate": 7.96722155483096e-06, + "loss": 0.3053, + "step": 8559 + }, + { + "epoch": 1.47, + "grad_norm": 13.258048057556152, + "learning_rate": 7.964647331388364e-06, + "loss": 0.4365, + "step": 8560 + }, + { + "epoch": 1.47, + "grad_norm": 10.805398941040039, + "learning_rate": 7.96207310794577e-06, + "loss": 0.4557, + "step": 8561 + }, + { + "epoch": 1.47, + "grad_norm": 14.54407787322998, + "learning_rate": 7.959498884503174e-06, + "loss": 0.4291, + "step": 8562 + }, + { + "epoch": 1.47, + "grad_norm": 9.470309257507324, + "learning_rate": 7.95692466106058e-06, + "loss": 0.4608, + "step": 8563 + }, + { + "epoch": 1.47, + "grad_norm": 7.896372318267822, + "learning_rate": 7.954350437617986e-06, + "loss": 0.2956, + "step": 8564 + }, + { + "epoch": 1.47, + "grad_norm": 12.786768913269043, + "learning_rate": 7.951776214175391e-06, + "loss": 0.4783, + "step": 8565 + }, + { + "epoch": 1.47, + "grad_norm": 9.320167541503906, + "learning_rate": 7.949201990732796e-06, + "loss": 0.4196, + "step": 8566 + }, + { + "epoch": 1.47, + "grad_norm": 11.008817672729492, + "learning_rate": 7.946627767290201e-06, + "loss": 0.329, + "step": 8567 + }, + { + "epoch": 1.47, + "grad_norm": 10.293103218078613, + "learning_rate": 7.944053543847608e-06, + "loss": 0.5097, + "step": 8568 + }, + { + "epoch": 1.47, + "grad_norm": 9.558850288391113, + "learning_rate": 7.941479320405011e-06, + "loss": 0.4128, + "step": 8569 + }, + { + "epoch": 1.47, + "grad_norm": 8.128281593322754, + "learning_rate": 7.938905096962416e-06, + "loss": 0.4256, + "step": 8570 + }, + { + "epoch": 1.47, + "grad_norm": 9.740893363952637, + "learning_rate": 7.936330873519821e-06, + "loss": 0.3899, + "step": 8571 + }, + { + "epoch": 1.47, + "grad_norm": 9.472328186035156, + "learning_rate": 7.933756650077226e-06, + "loss": 0.5626, + "step": 8572 + }, + { + "epoch": 1.47, + "grad_norm": 9.7655611038208, + "learning_rate": 7.931182426634633e-06, + "loss": 0.4221, + "step": 8573 + }, + { + "epoch": 1.47, + "grad_norm": 9.645916938781738, + "learning_rate": 7.928608203192038e-06, + "loss": 0.4746, + "step": 8574 + }, + { + "epoch": 1.47, + "grad_norm": 9.39921760559082, + "learning_rate": 7.926033979749443e-06, + "loss": 0.3937, + "step": 8575 + }, + { + "epoch": 1.47, + "grad_norm": 11.620342254638672, + "learning_rate": 7.923459756306848e-06, + "loss": 0.4633, + "step": 8576 + }, + { + "epoch": 1.47, + "grad_norm": 12.516475677490234, + "learning_rate": 7.920885532864253e-06, + "loss": 0.4572, + "step": 8577 + }, + { + "epoch": 1.47, + "grad_norm": 11.848872184753418, + "learning_rate": 7.91831130942166e-06, + "loss": 0.6008, + "step": 8578 + }, + { + "epoch": 1.47, + "grad_norm": 10.542719841003418, + "learning_rate": 7.915737085979064e-06, + "loss": 0.4618, + "step": 8579 + }, + { + "epoch": 1.47, + "grad_norm": 12.783872604370117, + "learning_rate": 7.913162862536468e-06, + "loss": 0.6557, + "step": 8580 + }, + { + "epoch": 1.47, + "grad_norm": 11.77277946472168, + "learning_rate": 7.910588639093873e-06, + "loss": 0.7083, + "step": 8581 + }, + { + "epoch": 1.47, + "grad_norm": 15.899014472961426, + "learning_rate": 7.908014415651278e-06, + "loss": 0.5602, + "step": 8582 + }, + { + "epoch": 1.47, + "grad_norm": 12.835158348083496, + "learning_rate": 7.905440192208684e-06, + "loss": 0.6009, + "step": 8583 + }, + { + "epoch": 1.47, + "grad_norm": 10.717292785644531, + "learning_rate": 7.90286596876609e-06, + "loss": 0.4338, + "step": 8584 + }, + { + "epoch": 1.47, + "grad_norm": 12.928191184997559, + "learning_rate": 7.900291745323494e-06, + "loss": 0.5056, + "step": 8585 + }, + { + "epoch": 1.47, + "grad_norm": 10.180452346801758, + "learning_rate": 7.8977175218809e-06, + "loss": 0.56, + "step": 8586 + }, + { + "epoch": 1.47, + "grad_norm": 13.932937622070312, + "learning_rate": 7.895143298438306e-06, + "loss": 0.5892, + "step": 8587 + }, + { + "epoch": 1.47, + "grad_norm": 13.647933959960938, + "learning_rate": 7.89256907499571e-06, + "loss": 0.6721, + "step": 8588 + }, + { + "epoch": 1.47, + "grad_norm": 10.494047164916992, + "learning_rate": 7.889994851553116e-06, + "loss": 0.3785, + "step": 8589 + }, + { + "epoch": 1.47, + "grad_norm": 11.854719161987305, + "learning_rate": 7.887420628110519e-06, + "loss": 0.5584, + "step": 8590 + }, + { + "epoch": 1.47, + "grad_norm": 12.866348266601562, + "learning_rate": 7.884846404667924e-06, + "loss": 0.5017, + "step": 8591 + }, + { + "epoch": 1.47, + "grad_norm": 12.433438301086426, + "learning_rate": 7.88227218122533e-06, + "loss": 0.5221, + "step": 8592 + }, + { + "epoch": 1.47, + "grad_norm": 10.125801086425781, + "learning_rate": 7.879697957782736e-06, + "loss": 0.3946, + "step": 8593 + }, + { + "epoch": 1.47, + "grad_norm": 13.81082534790039, + "learning_rate": 7.87712373434014e-06, + "loss": 0.5728, + "step": 8594 + }, + { + "epoch": 1.48, + "grad_norm": 12.547959327697754, + "learning_rate": 7.874549510897546e-06, + "loss": 0.369, + "step": 8595 + }, + { + "epoch": 1.48, + "grad_norm": 8.833579063415527, + "learning_rate": 7.87197528745495e-06, + "loss": 0.4068, + "step": 8596 + }, + { + "epoch": 1.48, + "grad_norm": 10.35151195526123, + "learning_rate": 7.869401064012357e-06, + "loss": 0.3471, + "step": 8597 + }, + { + "epoch": 1.48, + "grad_norm": 8.833970069885254, + "learning_rate": 7.866826840569762e-06, + "loss": 0.3091, + "step": 8598 + }, + { + "epoch": 1.48, + "grad_norm": 9.919172286987305, + "learning_rate": 7.864252617127167e-06, + "loss": 0.3985, + "step": 8599 + }, + { + "epoch": 1.48, + "grad_norm": 7.889188766479492, + "learning_rate": 7.861678393684572e-06, + "loss": 0.4031, + "step": 8600 + }, + { + "epoch": 1.48, + "grad_norm": 11.633176803588867, + "learning_rate": 7.859104170241977e-06, + "loss": 0.3812, + "step": 8601 + }, + { + "epoch": 1.48, + "grad_norm": 10.027216911315918, + "learning_rate": 7.856529946799382e-06, + "loss": 0.3904, + "step": 8602 + }, + { + "epoch": 1.48, + "grad_norm": 10.757156372070312, + "learning_rate": 7.853955723356787e-06, + "loss": 0.49, + "step": 8603 + }, + { + "epoch": 1.48, + "grad_norm": 7.519168853759766, + "learning_rate": 7.851381499914192e-06, + "loss": 0.3731, + "step": 8604 + }, + { + "epoch": 1.48, + "grad_norm": 14.353914260864258, + "learning_rate": 7.848807276471597e-06, + "loss": 0.4911, + "step": 8605 + }, + { + "epoch": 1.48, + "grad_norm": 11.230640411376953, + "learning_rate": 7.846233053029004e-06, + "loss": 0.4357, + "step": 8606 + }, + { + "epoch": 1.48, + "grad_norm": 11.016966819763184, + "learning_rate": 7.843658829586409e-06, + "loss": 0.3928, + "step": 8607 + }, + { + "epoch": 1.48, + "grad_norm": 11.400897979736328, + "learning_rate": 7.841084606143814e-06, + "loss": 0.4244, + "step": 8608 + }, + { + "epoch": 1.48, + "grad_norm": 9.500144004821777, + "learning_rate": 7.838510382701219e-06, + "loss": 0.495, + "step": 8609 + }, + { + "epoch": 1.48, + "grad_norm": 11.50043773651123, + "learning_rate": 7.835936159258624e-06, + "loss": 0.3381, + "step": 8610 + }, + { + "epoch": 1.48, + "grad_norm": 15.781012535095215, + "learning_rate": 7.833361935816029e-06, + "loss": 0.5251, + "step": 8611 + }, + { + "epoch": 1.48, + "grad_norm": 11.041146278381348, + "learning_rate": 7.830787712373434e-06, + "loss": 0.5681, + "step": 8612 + }, + { + "epoch": 1.48, + "grad_norm": 13.248954772949219, + "learning_rate": 7.828213488930839e-06, + "loss": 0.6538, + "step": 8613 + }, + { + "epoch": 1.48, + "grad_norm": 15.323101043701172, + "learning_rate": 7.825639265488244e-06, + "loss": 0.5628, + "step": 8614 + }, + { + "epoch": 1.48, + "grad_norm": 9.977293014526367, + "learning_rate": 7.823065042045649e-06, + "loss": 0.3591, + "step": 8615 + }, + { + "epoch": 1.48, + "grad_norm": 7.425943374633789, + "learning_rate": 7.820490818603056e-06, + "loss": 0.2616, + "step": 8616 + }, + { + "epoch": 1.48, + "grad_norm": 7.769509792327881, + "learning_rate": 7.81791659516046e-06, + "loss": 0.4714, + "step": 8617 + }, + { + "epoch": 1.48, + "grad_norm": 9.433910369873047, + "learning_rate": 7.815342371717866e-06, + "loss": 0.3575, + "step": 8618 + }, + { + "epoch": 1.48, + "grad_norm": 8.992460250854492, + "learning_rate": 7.81276814827527e-06, + "loss": 0.3664, + "step": 8619 + }, + { + "epoch": 1.48, + "grad_norm": 14.948832511901855, + "learning_rate": 7.810193924832677e-06, + "loss": 0.5124, + "step": 8620 + }, + { + "epoch": 1.48, + "grad_norm": 12.187300682067871, + "learning_rate": 7.80761970139008e-06, + "loss": 0.3882, + "step": 8621 + }, + { + "epoch": 1.48, + "grad_norm": 11.126838684082031, + "learning_rate": 7.805045477947485e-06, + "loss": 0.4334, + "step": 8622 + }, + { + "epoch": 1.48, + "grad_norm": 7.636470794677734, + "learning_rate": 7.80247125450489e-06, + "loss": 0.2723, + "step": 8623 + }, + { + "epoch": 1.48, + "grad_norm": 13.808019638061523, + "learning_rate": 7.799897031062295e-06, + "loss": 0.566, + "step": 8624 + }, + { + "epoch": 1.48, + "grad_norm": 8.529194831848145, + "learning_rate": 7.797322807619702e-06, + "loss": 0.3412, + "step": 8625 + }, + { + "epoch": 1.48, + "grad_norm": 9.64117431640625, + "learning_rate": 7.794748584177107e-06, + "loss": 0.28, + "step": 8626 + }, + { + "epoch": 1.48, + "grad_norm": 9.318421363830566, + "learning_rate": 7.792174360734512e-06, + "loss": 0.3969, + "step": 8627 + }, + { + "epoch": 1.48, + "grad_norm": 12.63068962097168, + "learning_rate": 7.789600137291917e-06, + "loss": 0.51, + "step": 8628 + }, + { + "epoch": 1.48, + "grad_norm": 10.368439674377441, + "learning_rate": 7.787025913849322e-06, + "loss": 0.5441, + "step": 8629 + }, + { + "epoch": 1.48, + "grad_norm": 15.952978134155273, + "learning_rate": 7.784451690406729e-06, + "loss": 0.6755, + "step": 8630 + }, + { + "epoch": 1.48, + "grad_norm": 12.3170747756958, + "learning_rate": 7.781877466964134e-06, + "loss": 0.4856, + "step": 8631 + }, + { + "epoch": 1.48, + "grad_norm": 10.046541213989258, + "learning_rate": 7.779303243521537e-06, + "loss": 0.4159, + "step": 8632 + }, + { + "epoch": 1.48, + "grad_norm": 10.291326522827148, + "learning_rate": 7.776729020078942e-06, + "loss": 0.3775, + "step": 8633 + }, + { + "epoch": 1.48, + "grad_norm": 13.550436019897461, + "learning_rate": 7.774154796636347e-06, + "loss": 0.4928, + "step": 8634 + }, + { + "epoch": 1.48, + "grad_norm": 10.879800796508789, + "learning_rate": 7.771580573193754e-06, + "loss": 0.4117, + "step": 8635 + }, + { + "epoch": 1.48, + "grad_norm": 11.166871070861816, + "learning_rate": 7.769006349751159e-06, + "loss": 0.4399, + "step": 8636 + }, + { + "epoch": 1.48, + "grad_norm": 12.039190292358398, + "learning_rate": 7.766432126308564e-06, + "loss": 0.3285, + "step": 8637 + }, + { + "epoch": 1.48, + "grad_norm": 12.225506782531738, + "learning_rate": 7.763857902865969e-06, + "loss": 0.3806, + "step": 8638 + }, + { + "epoch": 1.48, + "grad_norm": 10.985786437988281, + "learning_rate": 7.761283679423375e-06, + "loss": 0.4005, + "step": 8639 + }, + { + "epoch": 1.48, + "grad_norm": 18.728994369506836, + "learning_rate": 7.75870945598078e-06, + "loss": 0.7569, + "step": 8640 + }, + { + "epoch": 1.48, + "grad_norm": 12.25137710571289, + "learning_rate": 7.756135232538185e-06, + "loss": 0.4928, + "step": 8641 + }, + { + "epoch": 1.48, + "grad_norm": 9.436782836914062, + "learning_rate": 7.753561009095589e-06, + "loss": 0.3492, + "step": 8642 + }, + { + "epoch": 1.48, + "grad_norm": 10.46200180053711, + "learning_rate": 7.750986785652994e-06, + "loss": 0.3731, + "step": 8643 + }, + { + "epoch": 1.48, + "grad_norm": 12.370363235473633, + "learning_rate": 7.7484125622104e-06, + "loss": 0.5277, + "step": 8644 + }, + { + "epoch": 1.48, + "grad_norm": 9.704399108886719, + "learning_rate": 7.745838338767805e-06, + "loss": 0.5361, + "step": 8645 + }, + { + "epoch": 1.48, + "grad_norm": 12.174317359924316, + "learning_rate": 7.74326411532521e-06, + "loss": 0.5948, + "step": 8646 + }, + { + "epoch": 1.48, + "grad_norm": 9.262345314025879, + "learning_rate": 7.740689891882615e-06, + "loss": 0.3897, + "step": 8647 + }, + { + "epoch": 1.48, + "grad_norm": 11.378447532653809, + "learning_rate": 7.73811566844002e-06, + "loss": 0.4219, + "step": 8648 + }, + { + "epoch": 1.48, + "grad_norm": 11.001533508300781, + "learning_rate": 7.735541444997427e-06, + "loss": 0.3486, + "step": 8649 + }, + { + "epoch": 1.48, + "grad_norm": 10.994129180908203, + "learning_rate": 7.732967221554832e-06, + "loss": 0.5329, + "step": 8650 + }, + { + "epoch": 1.48, + "grad_norm": 11.308484077453613, + "learning_rate": 7.730392998112237e-06, + "loss": 0.4263, + "step": 8651 + }, + { + "epoch": 1.48, + "grad_norm": 10.41646957397461, + "learning_rate": 7.727818774669642e-06, + "loss": 0.4035, + "step": 8652 + }, + { + "epoch": 1.48, + "grad_norm": 10.040307998657227, + "learning_rate": 7.725244551227045e-06, + "loss": 0.4165, + "step": 8653 + }, + { + "epoch": 1.49, + "grad_norm": 9.31472110748291, + "learning_rate": 7.722670327784452e-06, + "loss": 0.2045, + "step": 8654 + }, + { + "epoch": 1.49, + "grad_norm": 11.812907218933105, + "learning_rate": 7.720096104341857e-06, + "loss": 0.449, + "step": 8655 + }, + { + "epoch": 1.49, + "grad_norm": 8.812792778015137, + "learning_rate": 7.717521880899262e-06, + "loss": 0.4032, + "step": 8656 + }, + { + "epoch": 1.49, + "grad_norm": 12.149288177490234, + "learning_rate": 7.714947657456667e-06, + "loss": 0.5115, + "step": 8657 + }, + { + "epoch": 1.49, + "grad_norm": 13.846478462219238, + "learning_rate": 7.712373434014073e-06, + "loss": 0.4938, + "step": 8658 + }, + { + "epoch": 1.49, + "grad_norm": 12.545721054077148, + "learning_rate": 7.709799210571478e-06, + "loss": 0.4208, + "step": 8659 + }, + { + "epoch": 1.49, + "grad_norm": 11.555359840393066, + "learning_rate": 7.707224987128883e-06, + "loss": 0.4141, + "step": 8660 + }, + { + "epoch": 1.49, + "grad_norm": 9.194311141967773, + "learning_rate": 7.704650763686288e-06, + "loss": 0.3589, + "step": 8661 + }, + { + "epoch": 1.49, + "grad_norm": 11.833417892456055, + "learning_rate": 7.702076540243693e-06, + "loss": 0.3768, + "step": 8662 + }, + { + "epoch": 1.49, + "grad_norm": 12.746025085449219, + "learning_rate": 7.699502316801098e-06, + "loss": 0.5061, + "step": 8663 + }, + { + "epoch": 1.49, + "grad_norm": 11.398209571838379, + "learning_rate": 7.696928093358503e-06, + "loss": 0.5899, + "step": 8664 + }, + { + "epoch": 1.49, + "grad_norm": 10.046601295471191, + "learning_rate": 7.694353869915908e-06, + "loss": 0.497, + "step": 8665 + }, + { + "epoch": 1.49, + "grad_norm": 10.254277229309082, + "learning_rate": 7.691779646473313e-06, + "loss": 0.4254, + "step": 8666 + }, + { + "epoch": 1.49, + "grad_norm": 9.432557106018066, + "learning_rate": 7.689205423030718e-06, + "loss": 0.3644, + "step": 8667 + }, + { + "epoch": 1.49, + "grad_norm": 13.011787414550781, + "learning_rate": 7.686631199588125e-06, + "loss": 0.5751, + "step": 8668 + }, + { + "epoch": 1.49, + "grad_norm": 12.807391166687012, + "learning_rate": 7.68405697614553e-06, + "loss": 0.6072, + "step": 8669 + }, + { + "epoch": 1.49, + "grad_norm": 9.687145233154297, + "learning_rate": 7.681482752702935e-06, + "loss": 0.3684, + "step": 8670 + }, + { + "epoch": 1.49, + "grad_norm": 12.121129035949707, + "learning_rate": 7.67890852926034e-06, + "loss": 0.3502, + "step": 8671 + }, + { + "epoch": 1.49, + "grad_norm": 10.828241348266602, + "learning_rate": 7.676334305817747e-06, + "loss": 0.3774, + "step": 8672 + }, + { + "epoch": 1.49, + "grad_norm": 9.942272186279297, + "learning_rate": 7.67376008237515e-06, + "loss": 0.4005, + "step": 8673 + }, + { + "epoch": 1.49, + "grad_norm": 8.567776679992676, + "learning_rate": 7.671185858932555e-06, + "loss": 0.4092, + "step": 8674 + }, + { + "epoch": 1.49, + "grad_norm": 14.88443660736084, + "learning_rate": 7.66861163548996e-06, + "loss": 0.3564, + "step": 8675 + }, + { + "epoch": 1.49, + "grad_norm": 17.76310157775879, + "learning_rate": 7.666037412047365e-06, + "loss": 0.5737, + "step": 8676 + }, + { + "epoch": 1.49, + "grad_norm": 8.955726623535156, + "learning_rate": 7.663463188604772e-06, + "loss": 0.4338, + "step": 8677 + }, + { + "epoch": 1.49, + "grad_norm": 9.855793952941895, + "learning_rate": 7.660888965162177e-06, + "loss": 0.4027, + "step": 8678 + }, + { + "epoch": 1.49, + "grad_norm": 10.609536170959473, + "learning_rate": 7.658314741719582e-06, + "loss": 0.4467, + "step": 8679 + }, + { + "epoch": 1.49, + "grad_norm": 11.255777359008789, + "learning_rate": 7.655740518276987e-06, + "loss": 0.3606, + "step": 8680 + }, + { + "epoch": 1.49, + "grad_norm": 8.6880521774292, + "learning_rate": 7.653166294834392e-06, + "loss": 0.5056, + "step": 8681 + }, + { + "epoch": 1.49, + "grad_norm": 9.712709426879883, + "learning_rate": 7.650592071391798e-06, + "loss": 0.4961, + "step": 8682 + }, + { + "epoch": 1.49, + "grad_norm": 12.465036392211914, + "learning_rate": 7.648017847949203e-06, + "loss": 0.5416, + "step": 8683 + }, + { + "epoch": 1.49, + "grad_norm": 7.517153263092041, + "learning_rate": 7.645443624506607e-06, + "loss": 0.3217, + "step": 8684 + }, + { + "epoch": 1.49, + "grad_norm": 9.14057445526123, + "learning_rate": 7.642869401064012e-06, + "loss": 0.4006, + "step": 8685 + }, + { + "epoch": 1.49, + "grad_norm": 9.679612159729004, + "learning_rate": 7.640295177621417e-06, + "loss": 0.3961, + "step": 8686 + }, + { + "epoch": 1.49, + "grad_norm": 12.151049613952637, + "learning_rate": 7.637720954178823e-06, + "loss": 0.6611, + "step": 8687 + }, + { + "epoch": 1.49, + "grad_norm": 7.980060577392578, + "learning_rate": 7.635146730736228e-06, + "loss": 0.348, + "step": 8688 + }, + { + "epoch": 1.49, + "grad_norm": 9.323481559753418, + "learning_rate": 7.632572507293633e-06, + "loss": 0.4482, + "step": 8689 + }, + { + "epoch": 1.49, + "grad_norm": 9.23275375366211, + "learning_rate": 7.629998283851038e-06, + "loss": 0.2792, + "step": 8690 + }, + { + "epoch": 1.49, + "grad_norm": 16.62765121459961, + "learning_rate": 7.627424060408444e-06, + "loss": 0.4991, + "step": 8691 + }, + { + "epoch": 1.49, + "grad_norm": 7.456879138946533, + "learning_rate": 7.624849836965849e-06, + "loss": 0.3401, + "step": 8692 + }, + { + "epoch": 1.49, + "grad_norm": 14.61656379699707, + "learning_rate": 7.622275613523254e-06, + "loss": 0.5759, + "step": 8693 + }, + { + "epoch": 1.49, + "grad_norm": 8.339529037475586, + "learning_rate": 7.619701390080659e-06, + "loss": 0.3965, + "step": 8694 + }, + { + "epoch": 1.49, + "grad_norm": 12.059610366821289, + "learning_rate": 7.617127166638064e-06, + "loss": 0.4692, + "step": 8695 + }, + { + "epoch": 1.49, + "grad_norm": 14.67989730834961, + "learning_rate": 7.614552943195471e-06, + "loss": 0.5743, + "step": 8696 + }, + { + "epoch": 1.49, + "grad_norm": 10.272076606750488, + "learning_rate": 7.611978719752875e-06, + "loss": 0.3803, + "step": 8697 + }, + { + "epoch": 1.49, + "grad_norm": 10.38070011138916, + "learning_rate": 7.60940449631028e-06, + "loss": 0.4559, + "step": 8698 + }, + { + "epoch": 1.49, + "grad_norm": 8.39708423614502, + "learning_rate": 7.606830272867685e-06, + "loss": 0.3623, + "step": 8699 + }, + { + "epoch": 1.49, + "grad_norm": 11.156476974487305, + "learning_rate": 7.60425604942509e-06, + "loss": 0.5037, + "step": 8700 + }, + { + "epoch": 1.49, + "grad_norm": 14.293466567993164, + "learning_rate": 7.601681825982496e-06, + "loss": 0.5061, + "step": 8701 + }, + { + "epoch": 1.49, + "grad_norm": 8.782387733459473, + "learning_rate": 7.5991076025399006e-06, + "loss": 0.2963, + "step": 8702 + }, + { + "epoch": 1.49, + "grad_norm": 13.22900676727295, + "learning_rate": 7.5965333790973055e-06, + "loss": 0.4414, + "step": 8703 + }, + { + "epoch": 1.49, + "grad_norm": 11.715869903564453, + "learning_rate": 7.5939591556547105e-06, + "loss": 0.5806, + "step": 8704 + }, + { + "epoch": 1.49, + "grad_norm": 7.745909690856934, + "learning_rate": 7.5913849322121155e-06, + "loss": 0.3812, + "step": 8705 + }, + { + "epoch": 1.49, + "grad_norm": 9.796847343444824, + "learning_rate": 7.588810708769522e-06, + "loss": 0.4646, + "step": 8706 + }, + { + "epoch": 1.49, + "grad_norm": 13.717659950256348, + "learning_rate": 7.586236485326926e-06, + "loss": 0.4574, + "step": 8707 + }, + { + "epoch": 1.49, + "grad_norm": 9.462380409240723, + "learning_rate": 7.583662261884331e-06, + "loss": 0.3608, + "step": 8708 + }, + { + "epoch": 1.49, + "grad_norm": 12.34654712677002, + "learning_rate": 7.581088038441736e-06, + "loss": 0.4558, + "step": 8709 + }, + { + "epoch": 1.49, + "grad_norm": 11.999104499816895, + "learning_rate": 7.578513814999143e-06, + "loss": 0.4434, + "step": 8710 + }, + { + "epoch": 1.49, + "grad_norm": 11.085166931152344, + "learning_rate": 7.575939591556548e-06, + "loss": 0.5302, + "step": 8711 + }, + { + "epoch": 1.5, + "grad_norm": 11.097311019897461, + "learning_rate": 7.573365368113952e-06, + "loss": 0.5046, + "step": 8712 + }, + { + "epoch": 1.5, + "grad_norm": 12.378292083740234, + "learning_rate": 7.570791144671357e-06, + "loss": 0.5099, + "step": 8713 + }, + { + "epoch": 1.5, + "grad_norm": 9.888908386230469, + "learning_rate": 7.568216921228762e-06, + "loss": 0.4807, + "step": 8714 + }, + { + "epoch": 1.5, + "grad_norm": 8.898755073547363, + "learning_rate": 7.565642697786169e-06, + "loss": 0.3219, + "step": 8715 + }, + { + "epoch": 1.5, + "grad_norm": 11.602651596069336, + "learning_rate": 7.563068474343574e-06, + "loss": 0.6849, + "step": 8716 + }, + { + "epoch": 1.5, + "grad_norm": 8.659979820251465, + "learning_rate": 7.560494250900979e-06, + "loss": 0.3166, + "step": 8717 + }, + { + "epoch": 1.5, + "grad_norm": 8.922460556030273, + "learning_rate": 7.557920027458383e-06, + "loss": 0.3806, + "step": 8718 + }, + { + "epoch": 1.5, + "grad_norm": 9.665297508239746, + "learning_rate": 7.555345804015788e-06, + "loss": 0.3061, + "step": 8719 + }, + { + "epoch": 1.5, + "grad_norm": 10.128588676452637, + "learning_rate": 7.5527715805731946e-06, + "loss": 0.3975, + "step": 8720 + }, + { + "epoch": 1.5, + "grad_norm": 10.131848335266113, + "learning_rate": 7.5501973571305995e-06, + "loss": 0.5227, + "step": 8721 + }, + { + "epoch": 1.5, + "grad_norm": 10.696487426757812, + "learning_rate": 7.5476231336880045e-06, + "loss": 0.4333, + "step": 8722 + }, + { + "epoch": 1.5, + "grad_norm": 11.940535545349121, + "learning_rate": 7.545048910245409e-06, + "loss": 0.4935, + "step": 8723 + }, + { + "epoch": 1.5, + "grad_norm": 9.68030071258545, + "learning_rate": 7.542474686802815e-06, + "loss": 0.4789, + "step": 8724 + }, + { + "epoch": 1.5, + "grad_norm": 12.738661766052246, + "learning_rate": 7.53990046336022e-06, + "loss": 0.3503, + "step": 8725 + }, + { + "epoch": 1.5, + "grad_norm": 15.187797546386719, + "learning_rate": 7.537326239917625e-06, + "loss": 0.4757, + "step": 8726 + }, + { + "epoch": 1.5, + "grad_norm": 9.924798011779785, + "learning_rate": 7.53475201647503e-06, + "loss": 0.4424, + "step": 8727 + }, + { + "epoch": 1.5, + "grad_norm": 8.826604843139648, + "learning_rate": 7.5321777930324345e-06, + "loss": 0.3712, + "step": 8728 + }, + { + "epoch": 1.5, + "grad_norm": 11.118193626403809, + "learning_rate": 7.529603569589841e-06, + "loss": 0.5455, + "step": 8729 + }, + { + "epoch": 1.5, + "grad_norm": 14.980628967285156, + "learning_rate": 7.527029346147246e-06, + "loss": 0.4883, + "step": 8730 + }, + { + "epoch": 1.5, + "grad_norm": 10.775498390197754, + "learning_rate": 7.524455122704651e-06, + "loss": 0.5881, + "step": 8731 + }, + { + "epoch": 1.5, + "grad_norm": 10.3221435546875, + "learning_rate": 7.521880899262056e-06, + "loss": 0.3668, + "step": 8732 + }, + { + "epoch": 1.5, + "grad_norm": 8.927716255187988, + "learning_rate": 7.51930667581946e-06, + "loss": 0.4116, + "step": 8733 + }, + { + "epoch": 1.5, + "grad_norm": 9.804611206054688, + "learning_rate": 7.516732452376867e-06, + "loss": 0.3445, + "step": 8734 + }, + { + "epoch": 1.5, + "grad_norm": 9.995001792907715, + "learning_rate": 7.514158228934272e-06, + "loss": 0.4496, + "step": 8735 + }, + { + "epoch": 1.5, + "grad_norm": 8.850483894348145, + "learning_rate": 7.511584005491677e-06, + "loss": 0.5389, + "step": 8736 + }, + { + "epoch": 1.5, + "grad_norm": 10.620684623718262, + "learning_rate": 7.509009782049082e-06, + "loss": 0.3659, + "step": 8737 + }, + { + "epoch": 1.5, + "grad_norm": 9.97374439239502, + "learning_rate": 7.506435558606486e-06, + "loss": 0.4827, + "step": 8738 + }, + { + "epoch": 1.5, + "grad_norm": 8.738988876342773, + "learning_rate": 7.503861335163893e-06, + "loss": 0.3773, + "step": 8739 + }, + { + "epoch": 1.5, + "grad_norm": 9.055754661560059, + "learning_rate": 7.501287111721298e-06, + "loss": 0.3924, + "step": 8740 + }, + { + "epoch": 1.5, + "grad_norm": 10.052687644958496, + "learning_rate": 7.498712888278703e-06, + "loss": 0.4134, + "step": 8741 + }, + { + "epoch": 1.5, + "grad_norm": 6.5737223625183105, + "learning_rate": 7.4961386648361085e-06, + "loss": 0.1778, + "step": 8742 + }, + { + "epoch": 1.5, + "grad_norm": 11.387993812561035, + "learning_rate": 7.4935644413935135e-06, + "loss": 0.4352, + "step": 8743 + }, + { + "epoch": 1.5, + "grad_norm": 8.305328369140625, + "learning_rate": 7.490990217950918e-06, + "loss": 0.4074, + "step": 8744 + }, + { + "epoch": 1.5, + "grad_norm": 6.951504230499268, + "learning_rate": 7.4884159945083235e-06, + "loss": 0.3848, + "step": 8745 + }, + { + "epoch": 1.5, + "grad_norm": 10.153668403625488, + "learning_rate": 7.4858417710657285e-06, + "loss": 0.4418, + "step": 8746 + }, + { + "epoch": 1.5, + "grad_norm": 10.094598770141602, + "learning_rate": 7.483267547623134e-06, + "loss": 0.4926, + "step": 8747 + }, + { + "epoch": 1.5, + "grad_norm": 11.65021800994873, + "learning_rate": 7.480693324180539e-06, + "loss": 0.367, + "step": 8748 + }, + { + "epoch": 1.5, + "grad_norm": 11.264941215515137, + "learning_rate": 7.478119100737944e-06, + "loss": 0.4585, + "step": 8749 + }, + { + "epoch": 1.5, + "grad_norm": 9.575928688049316, + "learning_rate": 7.475544877295349e-06, + "loss": 0.3404, + "step": 8750 + }, + { + "epoch": 1.5, + "grad_norm": 12.107653617858887, + "learning_rate": 7.472970653852754e-06, + "loss": 0.62, + "step": 8751 + }, + { + "epoch": 1.5, + "grad_norm": 12.695974349975586, + "learning_rate": 7.47039643041016e-06, + "loss": 0.533, + "step": 8752 + }, + { + "epoch": 1.5, + "grad_norm": 8.48951244354248, + "learning_rate": 7.467822206967565e-06, + "loss": 0.3823, + "step": 8753 + }, + { + "epoch": 1.5, + "grad_norm": 12.928346633911133, + "learning_rate": 7.46524798352497e-06, + "loss": 0.551, + "step": 8754 + }, + { + "epoch": 1.5, + "grad_norm": 10.034577369689941, + "learning_rate": 7.462673760082375e-06, + "loss": 0.2812, + "step": 8755 + }, + { + "epoch": 1.5, + "grad_norm": 12.299479484558105, + "learning_rate": 7.460099536639781e-06, + "loss": 0.6269, + "step": 8756 + }, + { + "epoch": 1.5, + "grad_norm": 11.282571792602539, + "learning_rate": 7.457525313197186e-06, + "loss": 0.4905, + "step": 8757 + }, + { + "epoch": 1.5, + "grad_norm": 9.033225059509277, + "learning_rate": 7.454951089754591e-06, + "loss": 0.3846, + "step": 8758 + }, + { + "epoch": 1.5, + "grad_norm": 12.580182075500488, + "learning_rate": 7.452376866311996e-06, + "loss": 0.6844, + "step": 8759 + }, + { + "epoch": 1.5, + "grad_norm": 11.297609329223633, + "learning_rate": 7.449802642869401e-06, + "loss": 0.5394, + "step": 8760 + }, + { + "epoch": 1.5, + "grad_norm": 10.419766426086426, + "learning_rate": 7.447228419426807e-06, + "loss": 0.3593, + "step": 8761 + }, + { + "epoch": 1.5, + "grad_norm": 9.03036880493164, + "learning_rate": 7.444654195984212e-06, + "loss": 0.4566, + "step": 8762 + }, + { + "epoch": 1.5, + "grad_norm": 11.862035751342773, + "learning_rate": 7.4420799725416175e-06, + "loss": 0.4694, + "step": 8763 + }, + { + "epoch": 1.5, + "grad_norm": 7.9309163093566895, + "learning_rate": 7.439505749099022e-06, + "loss": 0.4008, + "step": 8764 + }, + { + "epoch": 1.5, + "grad_norm": 9.637907028198242, + "learning_rate": 7.436931525656427e-06, + "loss": 0.5061, + "step": 8765 + }, + { + "epoch": 1.5, + "grad_norm": 11.582741737365723, + "learning_rate": 7.4343573022138324e-06, + "loss": 0.3703, + "step": 8766 + }, + { + "epoch": 1.5, + "grad_norm": 9.502876281738281, + "learning_rate": 7.4317830787712374e-06, + "loss": 0.4593, + "step": 8767 + }, + { + "epoch": 1.5, + "grad_norm": 12.501891136169434, + "learning_rate": 7.429208855328643e-06, + "loss": 0.4167, + "step": 8768 + }, + { + "epoch": 1.5, + "grad_norm": 11.096879005432129, + "learning_rate": 7.426634631886048e-06, + "loss": 0.5099, + "step": 8769 + }, + { + "epoch": 1.51, + "grad_norm": 11.736690521240234, + "learning_rate": 7.424060408443452e-06, + "loss": 0.3577, + "step": 8770 + }, + { + "epoch": 1.51, + "grad_norm": 12.838726997375488, + "learning_rate": 7.421486185000858e-06, + "loss": 0.609, + "step": 8771 + }, + { + "epoch": 1.51, + "grad_norm": 8.117023468017578, + "learning_rate": 7.418911961558263e-06, + "loss": 0.3835, + "step": 8772 + }, + { + "epoch": 1.51, + "grad_norm": 10.22922134399414, + "learning_rate": 7.416337738115669e-06, + "loss": 0.4301, + "step": 8773 + }, + { + "epoch": 1.51, + "grad_norm": 9.718122482299805, + "learning_rate": 7.413763514673074e-06, + "loss": 0.2766, + "step": 8774 + }, + { + "epoch": 1.51, + "grad_norm": 10.26065731048584, + "learning_rate": 7.411189291230479e-06, + "loss": 0.4337, + "step": 8775 + }, + { + "epoch": 1.51, + "grad_norm": 7.731683254241943, + "learning_rate": 7.408615067787884e-06, + "loss": 0.259, + "step": 8776 + }, + { + "epoch": 1.51, + "grad_norm": 12.79808235168457, + "learning_rate": 7.406040844345289e-06, + "loss": 0.3705, + "step": 8777 + }, + { + "epoch": 1.51, + "grad_norm": 9.64966106414795, + "learning_rate": 7.403466620902695e-06, + "loss": 0.425, + "step": 8778 + }, + { + "epoch": 1.51, + "grad_norm": 10.392223358154297, + "learning_rate": 7.4008923974601e-06, + "loss": 0.4592, + "step": 8779 + }, + { + "epoch": 1.51, + "grad_norm": 7.400946617126465, + "learning_rate": 7.398318174017505e-06, + "loss": 0.2814, + "step": 8780 + }, + { + "epoch": 1.51, + "grad_norm": 13.687527656555176, + "learning_rate": 7.39574395057491e-06, + "loss": 0.5491, + "step": 8781 + }, + { + "epoch": 1.51, + "grad_norm": 9.366559028625488, + "learning_rate": 7.393169727132316e-06, + "loss": 0.3551, + "step": 8782 + }, + { + "epoch": 1.51, + "grad_norm": 11.964877128601074, + "learning_rate": 7.390595503689721e-06, + "loss": 0.5435, + "step": 8783 + }, + { + "epoch": 1.51, + "grad_norm": 10.320745468139648, + "learning_rate": 7.388021280247126e-06, + "loss": 0.3814, + "step": 8784 + }, + { + "epoch": 1.51, + "grad_norm": 10.121947288513184, + "learning_rate": 7.385447056804531e-06, + "loss": 0.4195, + "step": 8785 + }, + { + "epoch": 1.51, + "grad_norm": 9.1561279296875, + "learning_rate": 7.382872833361936e-06, + "loss": 0.5622, + "step": 8786 + }, + { + "epoch": 1.51, + "grad_norm": 11.045967102050781, + "learning_rate": 7.380298609919341e-06, + "loss": 0.5008, + "step": 8787 + }, + { + "epoch": 1.51, + "grad_norm": 11.236106872558594, + "learning_rate": 7.377724386476746e-06, + "loss": 0.445, + "step": 8788 + }, + { + "epoch": 1.51, + "grad_norm": 9.588759422302246, + "learning_rate": 7.375150163034152e-06, + "loss": 0.278, + "step": 8789 + }, + { + "epoch": 1.51, + "grad_norm": 10.644856452941895, + "learning_rate": 7.372575939591556e-06, + "loss": 0.4051, + "step": 8790 + }, + { + "epoch": 1.51, + "grad_norm": 9.774545669555664, + "learning_rate": 7.370001716148961e-06, + "loss": 0.4701, + "step": 8791 + }, + { + "epoch": 1.51, + "grad_norm": 10.966662406921387, + "learning_rate": 7.367427492706367e-06, + "loss": 0.4374, + "step": 8792 + }, + { + "epoch": 1.51, + "grad_norm": 13.633502960205078, + "learning_rate": 7.364853269263772e-06, + "loss": 0.494, + "step": 8793 + }, + { + "epoch": 1.51, + "grad_norm": 12.987869262695312, + "learning_rate": 7.362279045821178e-06, + "loss": 0.3418, + "step": 8794 + }, + { + "epoch": 1.51, + "grad_norm": 9.426177024841309, + "learning_rate": 7.359704822378583e-06, + "loss": 0.3568, + "step": 8795 + }, + { + "epoch": 1.51, + "grad_norm": 10.096763610839844, + "learning_rate": 7.357130598935987e-06, + "loss": 0.4037, + "step": 8796 + }, + { + "epoch": 1.51, + "grad_norm": 11.230522155761719, + "learning_rate": 7.354556375493393e-06, + "loss": 0.423, + "step": 8797 + }, + { + "epoch": 1.51, + "grad_norm": 8.045573234558105, + "learning_rate": 7.351982152050798e-06, + "loss": 0.3674, + "step": 8798 + }, + { + "epoch": 1.51, + "grad_norm": 12.338088989257812, + "learning_rate": 7.349407928608204e-06, + "loss": 0.3108, + "step": 8799 + }, + { + "epoch": 1.51, + "grad_norm": 11.044685363769531, + "learning_rate": 7.346833705165609e-06, + "loss": 0.4526, + "step": 8800 + }, + { + "epoch": 1.51, + "grad_norm": 9.16036319732666, + "learning_rate": 7.344259481723014e-06, + "loss": 0.4964, + "step": 8801 + }, + { + "epoch": 1.51, + "grad_norm": 14.049835205078125, + "learning_rate": 7.341685258280419e-06, + "loss": 0.3937, + "step": 8802 + }, + { + "epoch": 1.51, + "grad_norm": 12.86475658416748, + "learning_rate": 7.339111034837824e-06, + "loss": 0.5843, + "step": 8803 + }, + { + "epoch": 1.51, + "grad_norm": 11.912973403930664, + "learning_rate": 7.33653681139523e-06, + "loss": 0.3669, + "step": 8804 + }, + { + "epoch": 1.51, + "grad_norm": 10.2679443359375, + "learning_rate": 7.333962587952635e-06, + "loss": 0.3577, + "step": 8805 + }, + { + "epoch": 1.51, + "grad_norm": 10.166956901550293, + "learning_rate": 7.3313883645100396e-06, + "loss": 0.3852, + "step": 8806 + }, + { + "epoch": 1.51, + "grad_norm": 9.623123168945312, + "learning_rate": 7.3288141410674446e-06, + "loss": 0.3887, + "step": 8807 + }, + { + "epoch": 1.51, + "grad_norm": 9.616179466247559, + "learning_rate": 7.32623991762485e-06, + "loss": 0.329, + "step": 8808 + }, + { + "epoch": 1.51, + "grad_norm": 12.576656341552734, + "learning_rate": 7.323665694182255e-06, + "loss": 0.4326, + "step": 8809 + }, + { + "epoch": 1.51, + "grad_norm": 10.386253356933594, + "learning_rate": 7.32109147073966e-06, + "loss": 0.3872, + "step": 8810 + }, + { + "epoch": 1.51, + "grad_norm": 14.359831809997559, + "learning_rate": 7.318517247297065e-06, + "loss": 0.5517, + "step": 8811 + }, + { + "epoch": 1.51, + "grad_norm": 13.12345027923584, + "learning_rate": 7.31594302385447e-06, + "loss": 0.4971, + "step": 8812 + }, + { + "epoch": 1.51, + "grad_norm": 15.607744216918945, + "learning_rate": 7.313368800411876e-06, + "loss": 0.6341, + "step": 8813 + }, + { + "epoch": 1.51, + "grad_norm": 11.886641502380371, + "learning_rate": 7.310794576969281e-06, + "loss": 0.5391, + "step": 8814 + }, + { + "epoch": 1.51, + "grad_norm": 11.09367847442627, + "learning_rate": 7.308220353526687e-06, + "loss": 0.4405, + "step": 8815 + }, + { + "epoch": 1.51, + "grad_norm": 14.571110725402832, + "learning_rate": 7.305646130084091e-06, + "loss": 0.4618, + "step": 8816 + }, + { + "epoch": 1.51, + "grad_norm": 7.874504566192627, + "learning_rate": 7.303071906641496e-06, + "loss": 0.3187, + "step": 8817 + }, + { + "epoch": 1.51, + "grad_norm": 10.266547203063965, + "learning_rate": 7.300497683198902e-06, + "loss": 0.5731, + "step": 8818 + }, + { + "epoch": 1.51, + "grad_norm": 15.223689079284668, + "learning_rate": 7.297923459756307e-06, + "loss": 0.5282, + "step": 8819 + }, + { + "epoch": 1.51, + "grad_norm": 14.01658821105957, + "learning_rate": 7.295349236313713e-06, + "loss": 0.4101, + "step": 8820 + }, + { + "epoch": 1.51, + "grad_norm": 10.149382591247559, + "learning_rate": 7.292775012871118e-06, + "loss": 0.4485, + "step": 8821 + }, + { + "epoch": 1.51, + "grad_norm": 8.120978355407715, + "learning_rate": 7.290200789428522e-06, + "loss": 0.3263, + "step": 8822 + }, + { + "epoch": 1.51, + "grad_norm": 8.204527854919434, + "learning_rate": 7.287626565985928e-06, + "loss": 0.4411, + "step": 8823 + }, + { + "epoch": 1.51, + "grad_norm": 6.927420616149902, + "learning_rate": 7.285052342543333e-06, + "loss": 0.2149, + "step": 8824 + }, + { + "epoch": 1.51, + "grad_norm": 7.783214092254639, + "learning_rate": 7.2824781191007386e-06, + "loss": 0.4074, + "step": 8825 + }, + { + "epoch": 1.51, + "grad_norm": 8.478473663330078, + "learning_rate": 7.2799038956581435e-06, + "loss": 0.3461, + "step": 8826 + }, + { + "epoch": 1.51, + "grad_norm": 14.341203689575195, + "learning_rate": 7.2773296722155485e-06, + "loss": 0.4026, + "step": 8827 + }, + { + "epoch": 1.52, + "grad_norm": 10.5017728805542, + "learning_rate": 7.2747554487729535e-06, + "loss": 0.5483, + "step": 8828 + }, + { + "epoch": 1.52, + "grad_norm": 12.103543281555176, + "learning_rate": 7.2721812253303585e-06, + "loss": 0.4359, + "step": 8829 + }, + { + "epoch": 1.52, + "grad_norm": 11.962469100952148, + "learning_rate": 7.269607001887764e-06, + "loss": 0.4857, + "step": 8830 + }, + { + "epoch": 1.52, + "grad_norm": 10.309229850769043, + "learning_rate": 7.267032778445169e-06, + "loss": 0.4297, + "step": 8831 + }, + { + "epoch": 1.52, + "grad_norm": 14.478974342346191, + "learning_rate": 7.264458555002574e-06, + "loss": 0.5057, + "step": 8832 + }, + { + "epoch": 1.52, + "grad_norm": 10.26053524017334, + "learning_rate": 7.261884331559979e-06, + "loss": 0.5801, + "step": 8833 + }, + { + "epoch": 1.52, + "grad_norm": 9.356572151184082, + "learning_rate": 7.259310108117385e-06, + "loss": 0.4289, + "step": 8834 + }, + { + "epoch": 1.52, + "grad_norm": 11.247749328613281, + "learning_rate": 7.25673588467479e-06, + "loss": 0.5264, + "step": 8835 + }, + { + "epoch": 1.52, + "grad_norm": 8.72059440612793, + "learning_rate": 7.254161661232195e-06, + "loss": 0.3276, + "step": 8836 + }, + { + "epoch": 1.52, + "grad_norm": 9.958900451660156, + "learning_rate": 7.2515874377896e-06, + "loss": 0.4227, + "step": 8837 + }, + { + "epoch": 1.52, + "grad_norm": 11.569451332092285, + "learning_rate": 7.249013214347005e-06, + "loss": 0.4005, + "step": 8838 + }, + { + "epoch": 1.52, + "grad_norm": 10.54477310180664, + "learning_rate": 7.246438990904411e-06, + "loss": 0.4857, + "step": 8839 + }, + { + "epoch": 1.52, + "grad_norm": 11.293161392211914, + "learning_rate": 7.243864767461816e-06, + "loss": 0.6239, + "step": 8840 + }, + { + "epoch": 1.52, + "grad_norm": 8.453324317932129, + "learning_rate": 7.241290544019221e-06, + "loss": 0.3432, + "step": 8841 + }, + { + "epoch": 1.52, + "grad_norm": 13.279464721679688, + "learning_rate": 7.238716320576626e-06, + "loss": 0.44, + "step": 8842 + }, + { + "epoch": 1.52, + "grad_norm": 10.750658988952637, + "learning_rate": 7.236142097134031e-06, + "loss": 0.4933, + "step": 8843 + }, + { + "epoch": 1.52, + "grad_norm": 13.113375663757324, + "learning_rate": 7.233567873691437e-06, + "loss": 0.6137, + "step": 8844 + }, + { + "epoch": 1.52, + "grad_norm": 15.700080871582031, + "learning_rate": 7.230993650248842e-06, + "loss": 0.5215, + "step": 8845 + }, + { + "epoch": 1.52, + "grad_norm": 11.13374137878418, + "learning_rate": 7.2284194268062475e-06, + "loss": 0.4844, + "step": 8846 + }, + { + "epoch": 1.52, + "grad_norm": 11.928226470947266, + "learning_rate": 7.2258452033636525e-06, + "loss": 0.4761, + "step": 8847 + }, + { + "epoch": 1.52, + "grad_norm": 10.967079162597656, + "learning_rate": 7.223270979921057e-06, + "loss": 0.4367, + "step": 8848 + }, + { + "epoch": 1.52, + "grad_norm": 18.112823486328125, + "learning_rate": 7.2206967564784625e-06, + "loss": 0.6469, + "step": 8849 + }, + { + "epoch": 1.52, + "grad_norm": 7.774311542510986, + "learning_rate": 7.2181225330358675e-06, + "loss": 0.2604, + "step": 8850 + }, + { + "epoch": 1.52, + "grad_norm": 13.273468971252441, + "learning_rate": 7.215548309593273e-06, + "loss": 0.5598, + "step": 8851 + }, + { + "epoch": 1.52, + "grad_norm": 9.786847114562988, + "learning_rate": 7.212974086150678e-06, + "loss": 0.2727, + "step": 8852 + }, + { + "epoch": 1.52, + "grad_norm": 10.64932632446289, + "learning_rate": 7.210399862708083e-06, + "loss": 0.4634, + "step": 8853 + }, + { + "epoch": 1.52, + "grad_norm": 11.51461124420166, + "learning_rate": 7.207825639265488e-06, + "loss": 0.714, + "step": 8854 + }, + { + "epoch": 1.52, + "grad_norm": 9.96403980255127, + "learning_rate": 7.205251415822893e-06, + "loss": 0.5432, + "step": 8855 + }, + { + "epoch": 1.52, + "grad_norm": 9.334166526794434, + "learning_rate": 7.202677192380299e-06, + "loss": 0.3426, + "step": 8856 + }, + { + "epoch": 1.52, + "grad_norm": 17.989980697631836, + "learning_rate": 7.200102968937704e-06, + "loss": 0.4662, + "step": 8857 + }, + { + "epoch": 1.52, + "grad_norm": 9.669950485229492, + "learning_rate": 7.197528745495109e-06, + "loss": 0.397, + "step": 8858 + }, + { + "epoch": 1.52, + "grad_norm": 11.047621726989746, + "learning_rate": 7.194954522052514e-06, + "loss": 0.4182, + "step": 8859 + }, + { + "epoch": 1.52, + "grad_norm": 10.333658218383789, + "learning_rate": 7.19238029860992e-06, + "loss": 0.3585, + "step": 8860 + }, + { + "epoch": 1.52, + "grad_norm": 9.259073257446289, + "learning_rate": 7.189806075167325e-06, + "loss": 0.4596, + "step": 8861 + }, + { + "epoch": 1.52, + "grad_norm": 10.235369682312012, + "learning_rate": 7.18723185172473e-06, + "loss": 0.3864, + "step": 8862 + }, + { + "epoch": 1.52, + "grad_norm": 12.717405319213867, + "learning_rate": 7.184657628282135e-06, + "loss": 0.4641, + "step": 8863 + }, + { + "epoch": 1.52, + "grad_norm": 11.15607738494873, + "learning_rate": 7.18208340483954e-06, + "loss": 0.5655, + "step": 8864 + }, + { + "epoch": 1.52, + "grad_norm": 10.473398208618164, + "learning_rate": 7.179509181396946e-06, + "loss": 0.497, + "step": 8865 + }, + { + "epoch": 1.52, + "grad_norm": 8.998419761657715, + "learning_rate": 7.176934957954351e-06, + "loss": 0.3287, + "step": 8866 + }, + { + "epoch": 1.52, + "grad_norm": 14.598323822021484, + "learning_rate": 7.174360734511756e-06, + "loss": 0.6619, + "step": 8867 + }, + { + "epoch": 1.52, + "grad_norm": 10.139378547668457, + "learning_rate": 7.171786511069161e-06, + "loss": 0.4889, + "step": 8868 + }, + { + "epoch": 1.52, + "grad_norm": 9.548771858215332, + "learning_rate": 7.169212287626566e-06, + "loss": 0.4528, + "step": 8869 + }, + { + "epoch": 1.52, + "grad_norm": 12.250882148742676, + "learning_rate": 7.1666380641839715e-06, + "loss": 0.6002, + "step": 8870 + }, + { + "epoch": 1.52, + "grad_norm": 11.281623840332031, + "learning_rate": 7.1640638407413764e-06, + "loss": 0.3651, + "step": 8871 + }, + { + "epoch": 1.52, + "grad_norm": 10.502840042114258, + "learning_rate": 7.161489617298782e-06, + "loss": 0.453, + "step": 8872 + }, + { + "epoch": 1.52, + "grad_norm": 8.626325607299805, + "learning_rate": 7.158915393856187e-06, + "loss": 0.3073, + "step": 8873 + }, + { + "epoch": 1.52, + "grad_norm": 14.452362060546875, + "learning_rate": 7.156341170413591e-06, + "loss": 0.5024, + "step": 8874 + }, + { + "epoch": 1.52, + "grad_norm": 13.148929595947266, + "learning_rate": 7.153766946970997e-06, + "loss": 0.4041, + "step": 8875 + }, + { + "epoch": 1.52, + "grad_norm": 9.789012908935547, + "learning_rate": 7.151192723528402e-06, + "loss": 0.2853, + "step": 8876 + }, + { + "epoch": 1.52, + "grad_norm": 14.083465576171875, + "learning_rate": 7.148618500085808e-06, + "loss": 0.4297, + "step": 8877 + }, + { + "epoch": 1.52, + "grad_norm": 8.260814666748047, + "learning_rate": 7.146044276643213e-06, + "loss": 0.3215, + "step": 8878 + }, + { + "epoch": 1.52, + "grad_norm": 11.21704387664795, + "learning_rate": 7.143470053200618e-06, + "loss": 0.4656, + "step": 8879 + }, + { + "epoch": 1.52, + "grad_norm": 10.400568008422852, + "learning_rate": 7.140895829758023e-06, + "loss": 0.4299, + "step": 8880 + }, + { + "epoch": 1.52, + "grad_norm": 8.600332260131836, + "learning_rate": 7.138321606315428e-06, + "loss": 0.4117, + "step": 8881 + }, + { + "epoch": 1.52, + "grad_norm": 8.670083045959473, + "learning_rate": 7.135747382872834e-06, + "loss": 0.4319, + "step": 8882 + }, + { + "epoch": 1.52, + "grad_norm": 11.604001998901367, + "learning_rate": 7.133173159430239e-06, + "loss": 0.5183, + "step": 8883 + }, + { + "epoch": 1.52, + "grad_norm": 12.450639724731445, + "learning_rate": 7.130598935987644e-06, + "loss": 0.4138, + "step": 8884 + }, + { + "epoch": 1.52, + "grad_norm": 15.210122108459473, + "learning_rate": 7.128024712545049e-06, + "loss": 0.5035, + "step": 8885 + }, + { + "epoch": 1.52, + "grad_norm": 13.121723175048828, + "learning_rate": 7.125450489102455e-06, + "loss": 0.3984, + "step": 8886 + }, + { + "epoch": 1.53, + "grad_norm": 12.203651428222656, + "learning_rate": 7.12287626565986e-06, + "loss": 0.432, + "step": 8887 + }, + { + "epoch": 1.53, + "grad_norm": 9.5178804397583, + "learning_rate": 7.120302042217265e-06, + "loss": 0.376, + "step": 8888 + }, + { + "epoch": 1.53, + "grad_norm": 8.63821029663086, + "learning_rate": 7.11772781877467e-06, + "loss": 0.3921, + "step": 8889 + }, + { + "epoch": 1.53, + "grad_norm": 8.6951322555542, + "learning_rate": 7.115153595332075e-06, + "loss": 0.4086, + "step": 8890 + }, + { + "epoch": 1.53, + "grad_norm": 7.29124641418457, + "learning_rate": 7.1125793718894804e-06, + "loss": 0.3772, + "step": 8891 + }, + { + "epoch": 1.53, + "grad_norm": 9.061479568481445, + "learning_rate": 7.110005148446885e-06, + "loss": 0.5001, + "step": 8892 + }, + { + "epoch": 1.53, + "grad_norm": 14.922256469726562, + "learning_rate": 7.10743092500429e-06, + "loss": 0.4717, + "step": 8893 + }, + { + "epoch": 1.53, + "grad_norm": 12.323261260986328, + "learning_rate": 7.104856701561695e-06, + "loss": 0.4749, + "step": 8894 + }, + { + "epoch": 1.53, + "grad_norm": 9.216887474060059, + "learning_rate": 7.1022824781191e-06, + "loss": 0.3984, + "step": 8895 + }, + { + "epoch": 1.53, + "grad_norm": 7.934128284454346, + "learning_rate": 7.099708254676506e-06, + "loss": 0.347, + "step": 8896 + }, + { + "epoch": 1.53, + "grad_norm": 8.633722305297852, + "learning_rate": 7.097134031233911e-06, + "loss": 0.2435, + "step": 8897 + }, + { + "epoch": 1.53, + "grad_norm": 11.857534408569336, + "learning_rate": 7.094559807791317e-06, + "loss": 0.4294, + "step": 8898 + }, + { + "epoch": 1.53, + "grad_norm": 11.378945350646973, + "learning_rate": 7.091985584348722e-06, + "loss": 0.4699, + "step": 8899 + }, + { + "epoch": 1.53, + "grad_norm": 12.56597900390625, + "learning_rate": 7.089411360906126e-06, + "loss": 0.4952, + "step": 8900 + }, + { + "epoch": 1.53, + "grad_norm": 6.527883052825928, + "learning_rate": 7.086837137463532e-06, + "loss": 0.2861, + "step": 8901 + }, + { + "epoch": 1.53, + "grad_norm": 11.969795227050781, + "learning_rate": 7.084262914020937e-06, + "loss": 0.3248, + "step": 8902 + }, + { + "epoch": 1.53, + "grad_norm": 13.945636749267578, + "learning_rate": 7.081688690578343e-06, + "loss": 0.5072, + "step": 8903 + }, + { + "epoch": 1.53, + "grad_norm": 11.161673545837402, + "learning_rate": 7.079114467135748e-06, + "loss": 0.5091, + "step": 8904 + }, + { + "epoch": 1.53, + "grad_norm": 11.02252197265625, + "learning_rate": 7.076540243693153e-06, + "loss": 0.5127, + "step": 8905 + }, + { + "epoch": 1.53, + "grad_norm": 9.72762680053711, + "learning_rate": 7.073966020250558e-06, + "loss": 0.4311, + "step": 8906 + }, + { + "epoch": 1.53, + "grad_norm": 10.358551025390625, + "learning_rate": 7.071391796807963e-06, + "loss": 0.471, + "step": 8907 + }, + { + "epoch": 1.53, + "grad_norm": 10.956890106201172, + "learning_rate": 7.068817573365369e-06, + "loss": 0.5217, + "step": 8908 + }, + { + "epoch": 1.53, + "grad_norm": 8.607876777648926, + "learning_rate": 7.066243349922774e-06, + "loss": 0.4731, + "step": 8909 + }, + { + "epoch": 1.53, + "grad_norm": 14.18957233428955, + "learning_rate": 7.063669126480179e-06, + "loss": 0.4956, + "step": 8910 + }, + { + "epoch": 1.53, + "grad_norm": 15.583208084106445, + "learning_rate": 7.0610949030375836e-06, + "loss": 0.4276, + "step": 8911 + }, + { + "epoch": 1.53, + "grad_norm": 7.812610149383545, + "learning_rate": 7.058520679594989e-06, + "loss": 0.3667, + "step": 8912 + }, + { + "epoch": 1.53, + "grad_norm": 7.129200458526611, + "learning_rate": 7.055946456152394e-06, + "loss": 0.3663, + "step": 8913 + }, + { + "epoch": 1.53, + "grad_norm": 10.108561515808105, + "learning_rate": 7.053372232709799e-06, + "loss": 0.4189, + "step": 8914 + }, + { + "epoch": 1.53, + "grad_norm": 10.96666145324707, + "learning_rate": 7.050798009267204e-06, + "loss": 0.4068, + "step": 8915 + }, + { + "epoch": 1.53, + "grad_norm": 8.690185546875, + "learning_rate": 7.048223785824609e-06, + "loss": 0.4316, + "step": 8916 + }, + { + "epoch": 1.53, + "grad_norm": 12.282842636108398, + "learning_rate": 7.045649562382015e-06, + "loss": 0.3765, + "step": 8917 + }, + { + "epoch": 1.53, + "grad_norm": 10.994156837463379, + "learning_rate": 7.04307533893942e-06, + "loss": 0.3158, + "step": 8918 + }, + { + "epoch": 1.53, + "grad_norm": 8.971477508544922, + "learning_rate": 7.040501115496825e-06, + "loss": 0.432, + "step": 8919 + }, + { + "epoch": 1.53, + "grad_norm": 13.713102340698242, + "learning_rate": 7.03792689205423e-06, + "loss": 0.4613, + "step": 8920 + }, + { + "epoch": 1.53, + "grad_norm": 7.9313554763793945, + "learning_rate": 7.035352668611635e-06, + "loss": 0.2449, + "step": 8921 + }, + { + "epoch": 1.53, + "grad_norm": 10.674796104431152, + "learning_rate": 7.032778445169041e-06, + "loss": 0.4648, + "step": 8922 + }, + { + "epoch": 1.53, + "grad_norm": 10.270509719848633, + "learning_rate": 7.030204221726446e-06, + "loss": 0.5068, + "step": 8923 + }, + { + "epoch": 1.53, + "grad_norm": 12.4141206741333, + "learning_rate": 7.027629998283852e-06, + "loss": 0.5071, + "step": 8924 + }, + { + "epoch": 1.53, + "grad_norm": 9.394927978515625, + "learning_rate": 7.025055774841257e-06, + "loss": 0.5717, + "step": 8925 + }, + { + "epoch": 1.53, + "grad_norm": 10.928528785705566, + "learning_rate": 7.022481551398661e-06, + "loss": 0.5141, + "step": 8926 + }, + { + "epoch": 1.53, + "grad_norm": 7.652988910675049, + "learning_rate": 7.019907327956067e-06, + "loss": 0.3254, + "step": 8927 + }, + { + "epoch": 1.53, + "grad_norm": 11.221549034118652, + "learning_rate": 7.017333104513472e-06, + "loss": 0.4598, + "step": 8928 + }, + { + "epoch": 1.53, + "grad_norm": 8.18213176727295, + "learning_rate": 7.0147588810708776e-06, + "loss": 0.2799, + "step": 8929 + }, + { + "epoch": 1.53, + "grad_norm": 9.492405891418457, + "learning_rate": 7.0121846576282826e-06, + "loss": 0.3331, + "step": 8930 + }, + { + "epoch": 1.53, + "grad_norm": 11.051721572875977, + "learning_rate": 7.0096104341856875e-06, + "loss": 0.4662, + "step": 8931 + }, + { + "epoch": 1.53, + "grad_norm": 13.982498168945312, + "learning_rate": 7.0070362107430925e-06, + "loss": 0.4243, + "step": 8932 + }, + { + "epoch": 1.53, + "grad_norm": 8.501224517822266, + "learning_rate": 7.0044619873004975e-06, + "loss": 0.3283, + "step": 8933 + }, + { + "epoch": 1.53, + "grad_norm": 10.438892364501953, + "learning_rate": 7.001887763857903e-06, + "loss": 0.5087, + "step": 8934 + }, + { + "epoch": 1.53, + "grad_norm": 7.742033004760742, + "learning_rate": 6.999313540415308e-06, + "loss": 0.326, + "step": 8935 + }, + { + "epoch": 1.53, + "grad_norm": 11.81969165802002, + "learning_rate": 6.996739316972713e-06, + "loss": 0.5256, + "step": 8936 + }, + { + "epoch": 1.53, + "grad_norm": 9.44371509552002, + "learning_rate": 6.994165093530118e-06, + "loss": 0.4008, + "step": 8937 + }, + { + "epoch": 1.53, + "grad_norm": 10.709961891174316, + "learning_rate": 6.991590870087524e-06, + "loss": 0.4038, + "step": 8938 + }, + { + "epoch": 1.53, + "grad_norm": 12.391035079956055, + "learning_rate": 6.989016646644929e-06, + "loss": 0.5656, + "step": 8939 + }, + { + "epoch": 1.53, + "grad_norm": 8.154583930969238, + "learning_rate": 6.986442423202334e-06, + "loss": 0.4149, + "step": 8940 + }, + { + "epoch": 1.53, + "grad_norm": 15.773641586303711, + "learning_rate": 6.983868199759739e-06, + "loss": 0.4047, + "step": 8941 + }, + { + "epoch": 1.53, + "grad_norm": 9.217438697814941, + "learning_rate": 6.981293976317144e-06, + "loss": 0.444, + "step": 8942 + }, + { + "epoch": 1.53, + "grad_norm": 13.037546157836914, + "learning_rate": 6.97871975287455e-06, + "loss": 0.4977, + "step": 8943 + }, + { + "epoch": 1.53, + "grad_norm": 11.7324857711792, + "learning_rate": 6.976145529431955e-06, + "loss": 0.3783, + "step": 8944 + }, + { + "epoch": 1.54, + "grad_norm": 9.169418334960938, + "learning_rate": 6.97357130598936e-06, + "loss": 0.3564, + "step": 8945 + }, + { + "epoch": 1.54, + "grad_norm": 11.172502517700195, + "learning_rate": 6.970997082546765e-06, + "loss": 0.5238, + "step": 8946 + }, + { + "epoch": 1.54, + "grad_norm": 10.07720947265625, + "learning_rate": 6.96842285910417e-06, + "loss": 0.3891, + "step": 8947 + }, + { + "epoch": 1.54, + "grad_norm": 9.466992378234863, + "learning_rate": 6.965848635661576e-06, + "loss": 0.3416, + "step": 8948 + }, + { + "epoch": 1.54, + "grad_norm": 11.41238021850586, + "learning_rate": 6.963274412218981e-06, + "loss": 0.3941, + "step": 8949 + }, + { + "epoch": 1.54, + "grad_norm": 13.027323722839355, + "learning_rate": 6.9607001887763865e-06, + "loss": 0.4548, + "step": 8950 + }, + { + "epoch": 1.54, + "grad_norm": 12.93846321105957, + "learning_rate": 6.9581259653337915e-06, + "loss": 0.5196, + "step": 8951 + }, + { + "epoch": 1.54, + "grad_norm": 10.976224899291992, + "learning_rate": 6.955551741891196e-06, + "loss": 0.4226, + "step": 8952 + }, + { + "epoch": 1.54, + "grad_norm": 9.822880744934082, + "learning_rate": 6.9529775184486015e-06, + "loss": 0.4668, + "step": 8953 + }, + { + "epoch": 1.54, + "grad_norm": 8.665153503417969, + "learning_rate": 6.9504032950060065e-06, + "loss": 0.3618, + "step": 8954 + }, + { + "epoch": 1.54, + "grad_norm": 8.610836029052734, + "learning_rate": 6.947829071563412e-06, + "loss": 0.5179, + "step": 8955 + }, + { + "epoch": 1.54, + "grad_norm": 14.167706489562988, + "learning_rate": 6.945254848120817e-06, + "loss": 0.4276, + "step": 8956 + }, + { + "epoch": 1.54, + "grad_norm": 12.83510684967041, + "learning_rate": 6.942680624678222e-06, + "loss": 0.6989, + "step": 8957 + }, + { + "epoch": 1.54, + "grad_norm": 10.535457611083984, + "learning_rate": 6.940106401235627e-06, + "loss": 0.2847, + "step": 8958 + }, + { + "epoch": 1.54, + "grad_norm": 13.525236129760742, + "learning_rate": 6.937532177793032e-06, + "loss": 0.5602, + "step": 8959 + }, + { + "epoch": 1.54, + "grad_norm": 12.786735534667969, + "learning_rate": 6.934957954350438e-06, + "loss": 0.4485, + "step": 8960 + }, + { + "epoch": 1.54, + "grad_norm": 11.261709213256836, + "learning_rate": 6.932383730907843e-06, + "loss": 0.4715, + "step": 8961 + }, + { + "epoch": 1.54, + "grad_norm": 12.32877254486084, + "learning_rate": 6.929809507465248e-06, + "loss": 0.4365, + "step": 8962 + }, + { + "epoch": 1.54, + "grad_norm": 7.638503551483154, + "learning_rate": 6.927235284022653e-06, + "loss": 0.358, + "step": 8963 + }, + { + "epoch": 1.54, + "grad_norm": 8.470928192138672, + "learning_rate": 6.924661060580059e-06, + "loss": 0.2755, + "step": 8964 + }, + { + "epoch": 1.54, + "grad_norm": 11.32476806640625, + "learning_rate": 6.922086837137464e-06, + "loss": 0.3925, + "step": 8965 + }, + { + "epoch": 1.54, + "grad_norm": 7.3812642097473145, + "learning_rate": 6.919512613694869e-06, + "loss": 0.3318, + "step": 8966 + }, + { + "epoch": 1.54, + "grad_norm": 11.82723331451416, + "learning_rate": 6.916938390252274e-06, + "loss": 0.4636, + "step": 8967 + }, + { + "epoch": 1.54, + "grad_norm": 12.385515213012695, + "learning_rate": 6.914364166809679e-06, + "loss": 0.4615, + "step": 8968 + }, + { + "epoch": 1.54, + "grad_norm": 9.83295726776123, + "learning_rate": 6.911789943367085e-06, + "loss": 0.3849, + "step": 8969 + }, + { + "epoch": 1.54, + "grad_norm": 11.831999778747559, + "learning_rate": 6.90921571992449e-06, + "loss": 0.408, + "step": 8970 + }, + { + "epoch": 1.54, + "grad_norm": 11.939336776733398, + "learning_rate": 6.906641496481895e-06, + "loss": 0.4841, + "step": 8971 + }, + { + "epoch": 1.54, + "grad_norm": 9.491720199584961, + "learning_rate": 6.9040672730393005e-06, + "loss": 0.3959, + "step": 8972 + }, + { + "epoch": 1.54, + "grad_norm": 12.336962699890137, + "learning_rate": 6.901493049596705e-06, + "loss": 0.5526, + "step": 8973 + }, + { + "epoch": 1.54, + "grad_norm": 11.131930351257324, + "learning_rate": 6.8989188261541105e-06, + "loss": 0.598, + "step": 8974 + }, + { + "epoch": 1.54, + "grad_norm": 14.194853782653809, + "learning_rate": 6.8963446027115155e-06, + "loss": 0.5016, + "step": 8975 + }, + { + "epoch": 1.54, + "grad_norm": 10.512124061584473, + "learning_rate": 6.893770379268921e-06, + "loss": 0.4137, + "step": 8976 + }, + { + "epoch": 1.54, + "grad_norm": 9.344644546508789, + "learning_rate": 6.891196155826326e-06, + "loss": 0.5206, + "step": 8977 + }, + { + "epoch": 1.54, + "grad_norm": 10.666017532348633, + "learning_rate": 6.88862193238373e-06, + "loss": 0.4154, + "step": 8978 + }, + { + "epoch": 1.54, + "grad_norm": 9.957460403442383, + "learning_rate": 6.886047708941136e-06, + "loss": 0.3911, + "step": 8979 + }, + { + "epoch": 1.54, + "grad_norm": 10.622474670410156, + "learning_rate": 6.883473485498541e-06, + "loss": 0.4925, + "step": 8980 + }, + { + "epoch": 1.54, + "grad_norm": 9.922347068786621, + "learning_rate": 6.880899262055947e-06, + "loss": 0.4897, + "step": 8981 + }, + { + "epoch": 1.54, + "grad_norm": 10.485464096069336, + "learning_rate": 6.878325038613352e-06, + "loss": 0.3414, + "step": 8982 + }, + { + "epoch": 1.54, + "grad_norm": 12.115612030029297, + "learning_rate": 6.875750815170757e-06, + "loss": 0.3409, + "step": 8983 + }, + { + "epoch": 1.54, + "grad_norm": 10.991114616394043, + "learning_rate": 6.873176591728162e-06, + "loss": 0.4447, + "step": 8984 + }, + { + "epoch": 1.54, + "grad_norm": 12.786396026611328, + "learning_rate": 6.870602368285567e-06, + "loss": 0.6268, + "step": 8985 + }, + { + "epoch": 1.54, + "grad_norm": 9.771339416503906, + "learning_rate": 6.868028144842973e-06, + "loss": 0.3625, + "step": 8986 + }, + { + "epoch": 1.54, + "grad_norm": 8.360448837280273, + "learning_rate": 6.865453921400378e-06, + "loss": 0.2887, + "step": 8987 + }, + { + "epoch": 1.54, + "grad_norm": 7.364771366119385, + "learning_rate": 6.862879697957783e-06, + "loss": 0.3066, + "step": 8988 + }, + { + "epoch": 1.54, + "grad_norm": 11.924720764160156, + "learning_rate": 6.860305474515188e-06, + "loss": 0.3879, + "step": 8989 + }, + { + "epoch": 1.54, + "grad_norm": 11.039410591125488, + "learning_rate": 6.857731251072594e-06, + "loss": 0.3501, + "step": 8990 + }, + { + "epoch": 1.54, + "grad_norm": 12.326106071472168, + "learning_rate": 6.855157027629999e-06, + "loss": 0.4389, + "step": 8991 + }, + { + "epoch": 1.54, + "grad_norm": 10.962635040283203, + "learning_rate": 6.852582804187404e-06, + "loss": 0.4185, + "step": 8992 + }, + { + "epoch": 1.54, + "grad_norm": 7.42863130569458, + "learning_rate": 6.850008580744809e-06, + "loss": 0.3356, + "step": 8993 + }, + { + "epoch": 1.54, + "grad_norm": 11.837570190429688, + "learning_rate": 6.847434357302214e-06, + "loss": 0.465, + "step": 8994 + }, + { + "epoch": 1.54, + "grad_norm": 12.257991790771484, + "learning_rate": 6.8448601338596194e-06, + "loss": 0.3991, + "step": 8995 + }, + { + "epoch": 1.54, + "grad_norm": 9.193087577819824, + "learning_rate": 6.8422859104170244e-06, + "loss": 0.3424, + "step": 8996 + }, + { + "epoch": 1.54, + "grad_norm": 13.22000789642334, + "learning_rate": 6.839711686974429e-06, + "loss": 0.5843, + "step": 8997 + }, + { + "epoch": 1.54, + "grad_norm": 11.147873878479004, + "learning_rate": 6.837137463531835e-06, + "loss": 0.5504, + "step": 8998 + }, + { + "epoch": 1.54, + "grad_norm": 14.118026733398438, + "learning_rate": 6.834563240089239e-06, + "loss": 0.4202, + "step": 8999 + }, + { + "epoch": 1.54, + "grad_norm": 10.525010108947754, + "learning_rate": 6.831989016646645e-06, + "loss": 0.3876, + "step": 9000 + }, + { + "epoch": 1.54, + "grad_norm": 8.929567337036133, + "learning_rate": 6.82941479320405e-06, + "loss": 0.2729, + "step": 9001 + }, + { + "epoch": 1.54, + "grad_norm": 13.239514350891113, + "learning_rate": 6.826840569761456e-06, + "loss": 0.3721, + "step": 9002 + }, + { + "epoch": 1.55, + "grad_norm": 7.794919013977051, + "learning_rate": 6.824266346318861e-06, + "loss": 0.3175, + "step": 9003 + }, + { + "epoch": 1.55, + "grad_norm": 9.50118350982666, + "learning_rate": 6.821692122876265e-06, + "loss": 0.293, + "step": 9004 + }, + { + "epoch": 1.55, + "grad_norm": 10.85807991027832, + "learning_rate": 6.819117899433671e-06, + "loss": 0.38, + "step": 9005 + }, + { + "epoch": 1.55, + "grad_norm": 13.937372207641602, + "learning_rate": 6.816543675991076e-06, + "loss": 0.4364, + "step": 9006 + }, + { + "epoch": 1.55, + "grad_norm": 10.955855369567871, + "learning_rate": 6.813969452548482e-06, + "loss": 0.5283, + "step": 9007 + }, + { + "epoch": 1.55, + "grad_norm": 14.119773864746094, + "learning_rate": 6.811395229105887e-06, + "loss": 0.4431, + "step": 9008 + }, + { + "epoch": 1.55, + "grad_norm": 13.668829917907715, + "learning_rate": 6.808821005663292e-06, + "loss": 0.5373, + "step": 9009 + }, + { + "epoch": 1.55, + "grad_norm": 11.992827415466309, + "learning_rate": 6.806246782220697e-06, + "loss": 0.2839, + "step": 9010 + }, + { + "epoch": 1.55, + "grad_norm": 10.80026626586914, + "learning_rate": 6.803672558778102e-06, + "loss": 0.344, + "step": 9011 + }, + { + "epoch": 1.55, + "grad_norm": 11.294907569885254, + "learning_rate": 6.801098335335508e-06, + "loss": 0.4355, + "step": 9012 + }, + { + "epoch": 1.55, + "grad_norm": 9.616134643554688, + "learning_rate": 6.798524111892913e-06, + "loss": 0.3739, + "step": 9013 + }, + { + "epoch": 1.55, + "grad_norm": 11.69835090637207, + "learning_rate": 6.795949888450318e-06, + "loss": 0.3783, + "step": 9014 + }, + { + "epoch": 1.55, + "grad_norm": 10.762347221374512, + "learning_rate": 6.793375665007723e-06, + "loss": 0.4616, + "step": 9015 + }, + { + "epoch": 1.55, + "grad_norm": 10.7774019241333, + "learning_rate": 6.7908014415651276e-06, + "loss": 0.3046, + "step": 9016 + }, + { + "epoch": 1.55, + "grad_norm": 11.603497505187988, + "learning_rate": 6.788227218122533e-06, + "loss": 0.4686, + "step": 9017 + }, + { + "epoch": 1.55, + "grad_norm": 10.491743087768555, + "learning_rate": 6.785652994679938e-06, + "loss": 0.5825, + "step": 9018 + }, + { + "epoch": 1.55, + "grad_norm": 10.463603019714355, + "learning_rate": 6.783078771237343e-06, + "loss": 0.5188, + "step": 9019 + }, + { + "epoch": 1.55, + "grad_norm": 10.45566177368164, + "learning_rate": 6.780504547794748e-06, + "loss": 0.4284, + "step": 9020 + }, + { + "epoch": 1.55, + "grad_norm": 9.350882530212402, + "learning_rate": 6.777930324352154e-06, + "loss": 0.3459, + "step": 9021 + }, + { + "epoch": 1.55, + "grad_norm": 14.684252738952637, + "learning_rate": 6.775356100909559e-06, + "loss": 0.5135, + "step": 9022 + }, + { + "epoch": 1.55, + "grad_norm": 11.462064743041992, + "learning_rate": 6.772781877466964e-06, + "loss": 0.4263, + "step": 9023 + }, + { + "epoch": 1.55, + "grad_norm": 12.511894226074219, + "learning_rate": 6.77020765402437e-06, + "loss": 0.3327, + "step": 9024 + }, + { + "epoch": 1.55, + "grad_norm": 10.778379440307617, + "learning_rate": 6.767633430581774e-06, + "loss": 0.4092, + "step": 9025 + }, + { + "epoch": 1.55, + "grad_norm": 9.334806442260742, + "learning_rate": 6.76505920713918e-06, + "loss": 0.298, + "step": 9026 + }, + { + "epoch": 1.55, + "grad_norm": 10.341312408447266, + "learning_rate": 6.762484983696585e-06, + "loss": 0.4947, + "step": 9027 + }, + { + "epoch": 1.55, + "grad_norm": 12.079545974731445, + "learning_rate": 6.759910760253991e-06, + "loss": 0.4858, + "step": 9028 + }, + { + "epoch": 1.55, + "grad_norm": 13.366859436035156, + "learning_rate": 6.757336536811396e-06, + "loss": 0.4849, + "step": 9029 + }, + { + "epoch": 1.55, + "grad_norm": 11.166098594665527, + "learning_rate": 6.7547623133688e-06, + "loss": 0.4984, + "step": 9030 + }, + { + "epoch": 1.55, + "grad_norm": 11.932158470153809, + "learning_rate": 6.752188089926206e-06, + "loss": 0.508, + "step": 9031 + }, + { + "epoch": 1.55, + "grad_norm": 12.931735038757324, + "learning_rate": 6.749613866483611e-06, + "loss": 0.3258, + "step": 9032 + }, + { + "epoch": 1.55, + "grad_norm": 13.567687034606934, + "learning_rate": 6.747039643041017e-06, + "loss": 0.4604, + "step": 9033 + }, + { + "epoch": 1.55, + "grad_norm": 13.96845817565918, + "learning_rate": 6.7444654195984216e-06, + "loss": 0.5776, + "step": 9034 + }, + { + "epoch": 1.55, + "grad_norm": 12.238383293151855, + "learning_rate": 6.7418911961558266e-06, + "loss": 0.3858, + "step": 9035 + }, + { + "epoch": 1.55, + "grad_norm": 9.755646705627441, + "learning_rate": 6.7393169727132315e-06, + "loss": 0.2998, + "step": 9036 + }, + { + "epoch": 1.55, + "grad_norm": 9.348224639892578, + "learning_rate": 6.7367427492706365e-06, + "loss": 0.3914, + "step": 9037 + }, + { + "epoch": 1.55, + "grad_norm": 8.106252670288086, + "learning_rate": 6.734168525828042e-06, + "loss": 0.3123, + "step": 9038 + }, + { + "epoch": 1.55, + "grad_norm": 9.297879219055176, + "learning_rate": 6.731594302385447e-06, + "loss": 0.3621, + "step": 9039 + }, + { + "epoch": 1.55, + "grad_norm": 9.97221565246582, + "learning_rate": 6.729020078942852e-06, + "loss": 0.4046, + "step": 9040 + }, + { + "epoch": 1.55, + "grad_norm": 10.938226699829102, + "learning_rate": 6.726445855500257e-06, + "loss": 0.3809, + "step": 9041 + }, + { + "epoch": 1.55, + "grad_norm": 8.279706001281738, + "learning_rate": 6.723871632057662e-06, + "loss": 0.3617, + "step": 9042 + }, + { + "epoch": 1.55, + "grad_norm": 10.85624885559082, + "learning_rate": 6.721297408615068e-06, + "loss": 0.4283, + "step": 9043 + }, + { + "epoch": 1.55, + "grad_norm": 10.55158519744873, + "learning_rate": 6.718723185172473e-06, + "loss": 0.3592, + "step": 9044 + }, + { + "epoch": 1.55, + "grad_norm": 11.287115097045898, + "learning_rate": 6.716148961729878e-06, + "loss": 0.5733, + "step": 9045 + }, + { + "epoch": 1.55, + "grad_norm": 10.168319702148438, + "learning_rate": 6.713574738287283e-06, + "loss": 0.4338, + "step": 9046 + }, + { + "epoch": 1.55, + "grad_norm": 7.264942169189453, + "learning_rate": 6.711000514844689e-06, + "loss": 0.3034, + "step": 9047 + }, + { + "epoch": 1.55, + "grad_norm": 10.841805458068848, + "learning_rate": 6.708426291402094e-06, + "loss": 0.5603, + "step": 9048 + }, + { + "epoch": 1.55, + "grad_norm": 13.720842361450195, + "learning_rate": 6.705852067959499e-06, + "loss": 0.6207, + "step": 9049 + }, + { + "epoch": 1.55, + "grad_norm": 8.516718864440918, + "learning_rate": 6.703277844516905e-06, + "loss": 0.3713, + "step": 9050 + }, + { + "epoch": 1.55, + "grad_norm": 11.178365707397461, + "learning_rate": 6.700703621074309e-06, + "loss": 0.5639, + "step": 9051 + }, + { + "epoch": 1.55, + "grad_norm": 10.041665077209473, + "learning_rate": 6.698129397631715e-06, + "loss": 0.4252, + "step": 9052 + }, + { + "epoch": 1.55, + "grad_norm": 10.102350234985352, + "learning_rate": 6.69555517418912e-06, + "loss": 0.5042, + "step": 9053 + }, + { + "epoch": 1.55, + "grad_norm": 8.4866361618042, + "learning_rate": 6.6929809507465256e-06, + "loss": 0.4346, + "step": 9054 + }, + { + "epoch": 1.55, + "grad_norm": 13.604265213012695, + "learning_rate": 6.6904067273039305e-06, + "loss": 0.4202, + "step": 9055 + }, + { + "epoch": 1.55, + "grad_norm": 8.317880630493164, + "learning_rate": 6.687832503861335e-06, + "loss": 0.2992, + "step": 9056 + }, + { + "epoch": 1.55, + "grad_norm": 8.931004524230957, + "learning_rate": 6.6852582804187405e-06, + "loss": 0.2154, + "step": 9057 + }, + { + "epoch": 1.55, + "grad_norm": 12.68259048461914, + "learning_rate": 6.6826840569761455e-06, + "loss": 0.356, + "step": 9058 + }, + { + "epoch": 1.55, + "grad_norm": 11.403766632080078, + "learning_rate": 6.680109833533551e-06, + "loss": 0.4383, + "step": 9059 + }, + { + "epoch": 1.55, + "grad_norm": 11.369772911071777, + "learning_rate": 6.677535610090956e-06, + "loss": 0.4297, + "step": 9060 + }, + { + "epoch": 1.56, + "grad_norm": 12.384984016418457, + "learning_rate": 6.674961386648361e-06, + "loss": 0.3296, + "step": 9061 + }, + { + "epoch": 1.56, + "grad_norm": 9.11343002319336, + "learning_rate": 6.672387163205766e-06, + "loss": 0.2974, + "step": 9062 + }, + { + "epoch": 1.56, + "grad_norm": 11.19269847869873, + "learning_rate": 6.669812939763171e-06, + "loss": 0.4339, + "step": 9063 + }, + { + "epoch": 1.56, + "grad_norm": 11.9432954788208, + "learning_rate": 6.667238716320577e-06, + "loss": 0.4885, + "step": 9064 + }, + { + "epoch": 1.56, + "grad_norm": 11.548988342285156, + "learning_rate": 6.664664492877982e-06, + "loss": 0.418, + "step": 9065 + }, + { + "epoch": 1.56, + "grad_norm": 11.793389320373535, + "learning_rate": 6.662090269435387e-06, + "loss": 0.4407, + "step": 9066 + }, + { + "epoch": 1.56, + "grad_norm": 15.234336853027344, + "learning_rate": 6.659516045992792e-06, + "loss": 0.4532, + "step": 9067 + }, + { + "epoch": 1.56, + "grad_norm": 10.90408992767334, + "learning_rate": 6.656941822550197e-06, + "loss": 0.5414, + "step": 9068 + }, + { + "epoch": 1.56, + "grad_norm": 11.602997779846191, + "learning_rate": 6.654367599107603e-06, + "loss": 0.4308, + "step": 9069 + }, + { + "epoch": 1.56, + "grad_norm": 10.916178703308105, + "learning_rate": 6.651793375665008e-06, + "loss": 0.5975, + "step": 9070 + }, + { + "epoch": 1.56, + "grad_norm": 10.920973777770996, + "learning_rate": 6.649219152222413e-06, + "loss": 0.2993, + "step": 9071 + }, + { + "epoch": 1.56, + "grad_norm": 9.38058853149414, + "learning_rate": 6.646644928779818e-06, + "loss": 0.3294, + "step": 9072 + }, + { + "epoch": 1.56, + "grad_norm": 13.297099113464355, + "learning_rate": 6.644070705337224e-06, + "loss": 0.5424, + "step": 9073 + }, + { + "epoch": 1.56, + "grad_norm": 12.37918472290039, + "learning_rate": 6.641496481894629e-06, + "loss": 0.6976, + "step": 9074 + }, + { + "epoch": 1.56, + "grad_norm": 10.919448852539062, + "learning_rate": 6.638922258452034e-06, + "loss": 0.3287, + "step": 9075 + }, + { + "epoch": 1.56, + "grad_norm": 11.900317192077637, + "learning_rate": 6.6363480350094395e-06, + "loss": 0.5989, + "step": 9076 + }, + { + "epoch": 1.56, + "grad_norm": 11.163420677185059, + "learning_rate": 6.633773811566844e-06, + "loss": 0.4301, + "step": 9077 + }, + { + "epoch": 1.56, + "grad_norm": 11.837841033935547, + "learning_rate": 6.6311995881242495e-06, + "loss": 0.3274, + "step": 9078 + }, + { + "epoch": 1.56, + "grad_norm": 13.50175952911377, + "learning_rate": 6.6286253646816545e-06, + "loss": 0.4362, + "step": 9079 + }, + { + "epoch": 1.56, + "grad_norm": 11.980319023132324, + "learning_rate": 6.62605114123906e-06, + "loss": 0.3522, + "step": 9080 + }, + { + "epoch": 1.56, + "grad_norm": 12.221705436706543, + "learning_rate": 6.623476917796465e-06, + "loss": 0.5547, + "step": 9081 + }, + { + "epoch": 1.56, + "grad_norm": 10.201438903808594, + "learning_rate": 6.6209026943538694e-06, + "loss": 0.3081, + "step": 9082 + }, + { + "epoch": 1.56, + "grad_norm": 17.045942306518555, + "learning_rate": 6.618328470911275e-06, + "loss": 0.3885, + "step": 9083 + }, + { + "epoch": 1.56, + "grad_norm": 8.674612998962402, + "learning_rate": 6.61575424746868e-06, + "loss": 0.3766, + "step": 9084 + }, + { + "epoch": 1.56, + "grad_norm": 10.371481895446777, + "learning_rate": 6.613180024026086e-06, + "loss": 0.2742, + "step": 9085 + }, + { + "epoch": 1.56, + "grad_norm": 8.235065460205078, + "learning_rate": 6.610605800583491e-06, + "loss": 0.2868, + "step": 9086 + }, + { + "epoch": 1.56, + "grad_norm": 12.495343208312988, + "learning_rate": 6.608031577140896e-06, + "loss": 0.5213, + "step": 9087 + }, + { + "epoch": 1.56, + "grad_norm": 9.121530532836914, + "learning_rate": 6.605457353698301e-06, + "loss": 0.4886, + "step": 9088 + }, + { + "epoch": 1.56, + "grad_norm": 9.821917533874512, + "learning_rate": 6.602883130255706e-06, + "loss": 0.4306, + "step": 9089 + }, + { + "epoch": 1.56, + "grad_norm": 8.319422721862793, + "learning_rate": 6.600308906813112e-06, + "loss": 0.34, + "step": 9090 + }, + { + "epoch": 1.56, + "grad_norm": 11.17870044708252, + "learning_rate": 6.597734683370517e-06, + "loss": 0.5238, + "step": 9091 + }, + { + "epoch": 1.56, + "grad_norm": 9.177310943603516, + "learning_rate": 6.595160459927922e-06, + "loss": 0.3137, + "step": 9092 + }, + { + "epoch": 1.56, + "grad_norm": 7.990798473358154, + "learning_rate": 6.592586236485327e-06, + "loss": 0.3652, + "step": 9093 + }, + { + "epoch": 1.56, + "grad_norm": 10.063852310180664, + "learning_rate": 6.590012013042732e-06, + "loss": 0.4725, + "step": 9094 + }, + { + "epoch": 1.56, + "grad_norm": 8.781620025634766, + "learning_rate": 6.587437789600138e-06, + "loss": 0.4493, + "step": 9095 + }, + { + "epoch": 1.56, + "grad_norm": 9.202735900878906, + "learning_rate": 6.584863566157543e-06, + "loss": 0.3454, + "step": 9096 + }, + { + "epoch": 1.56, + "grad_norm": 13.74743938446045, + "learning_rate": 6.582289342714948e-06, + "loss": 0.4237, + "step": 9097 + }, + { + "epoch": 1.56, + "grad_norm": 9.113112449645996, + "learning_rate": 6.579715119272353e-06, + "loss": 0.4049, + "step": 9098 + }, + { + "epoch": 1.56, + "grad_norm": 11.021286964416504, + "learning_rate": 6.5771408958297585e-06, + "loss": 0.6689, + "step": 9099 + }, + { + "epoch": 1.56, + "grad_norm": 8.52872085571289, + "learning_rate": 6.5745666723871634e-06, + "loss": 0.3713, + "step": 9100 + }, + { + "epoch": 1.56, + "grad_norm": 11.817242622375488, + "learning_rate": 6.5719924489445684e-06, + "loss": 0.4241, + "step": 9101 + }, + { + "epoch": 1.56, + "grad_norm": 12.09481143951416, + "learning_rate": 6.569418225501974e-06, + "loss": 0.5944, + "step": 9102 + }, + { + "epoch": 1.56, + "grad_norm": 8.745797157287598, + "learning_rate": 6.566844002059378e-06, + "loss": 0.2441, + "step": 9103 + }, + { + "epoch": 1.56, + "grad_norm": 10.217633247375488, + "learning_rate": 6.564269778616784e-06, + "loss": 0.4832, + "step": 9104 + }, + { + "epoch": 1.56, + "grad_norm": 11.284358978271484, + "learning_rate": 6.561695555174189e-06, + "loss": 0.3287, + "step": 9105 + }, + { + "epoch": 1.56, + "grad_norm": 7.830768585205078, + "learning_rate": 6.559121331731595e-06, + "loss": 0.3138, + "step": 9106 + }, + { + "epoch": 1.56, + "grad_norm": 9.012214660644531, + "learning_rate": 6.556547108289e-06, + "loss": 0.387, + "step": 9107 + }, + { + "epoch": 1.56, + "grad_norm": 11.80935001373291, + "learning_rate": 6.553972884846404e-06, + "loss": 0.5218, + "step": 9108 + }, + { + "epoch": 1.56, + "grad_norm": 7.77457857131958, + "learning_rate": 6.55139866140381e-06, + "loss": 0.3142, + "step": 9109 + }, + { + "epoch": 1.56, + "grad_norm": 10.121886253356934, + "learning_rate": 6.548824437961215e-06, + "loss": 0.3844, + "step": 9110 + }, + { + "epoch": 1.56, + "grad_norm": 10.070313453674316, + "learning_rate": 6.546250214518621e-06, + "loss": 0.531, + "step": 9111 + }, + { + "epoch": 1.56, + "grad_norm": 9.64294147491455, + "learning_rate": 6.543675991076026e-06, + "loss": 0.2965, + "step": 9112 + }, + { + "epoch": 1.56, + "grad_norm": 15.200862884521484, + "learning_rate": 6.541101767633431e-06, + "loss": 0.4901, + "step": 9113 + }, + { + "epoch": 1.56, + "grad_norm": 8.238682746887207, + "learning_rate": 6.538527544190836e-06, + "loss": 0.3706, + "step": 9114 + }, + { + "epoch": 1.56, + "grad_norm": 14.326116561889648, + "learning_rate": 6.535953320748241e-06, + "loss": 0.3243, + "step": 9115 + }, + { + "epoch": 1.56, + "grad_norm": 8.95153522491455, + "learning_rate": 6.533379097305647e-06, + "loss": 0.4127, + "step": 9116 + }, + { + "epoch": 1.56, + "grad_norm": 11.6895751953125, + "learning_rate": 6.530804873863052e-06, + "loss": 0.4495, + "step": 9117 + }, + { + "epoch": 1.56, + "grad_norm": 9.63408088684082, + "learning_rate": 6.528230650420457e-06, + "loss": 0.3325, + "step": 9118 + }, + { + "epoch": 1.56, + "grad_norm": 10.318384170532227, + "learning_rate": 6.525656426977862e-06, + "loss": 0.4507, + "step": 9119 + }, + { + "epoch": 1.57, + "grad_norm": 9.620682716369629, + "learning_rate": 6.5230822035352666e-06, + "loss": 0.387, + "step": 9120 + }, + { + "epoch": 1.57, + "grad_norm": 11.932801246643066, + "learning_rate": 6.520507980092672e-06, + "loss": 0.3956, + "step": 9121 + }, + { + "epoch": 1.57, + "grad_norm": 9.0022611618042, + "learning_rate": 6.517933756650077e-06, + "loss": 0.3439, + "step": 9122 + }, + { + "epoch": 1.57, + "grad_norm": 9.975605010986328, + "learning_rate": 6.515359533207482e-06, + "loss": 0.3452, + "step": 9123 + }, + { + "epoch": 1.57, + "grad_norm": 7.023692607879639, + "learning_rate": 6.512785309764887e-06, + "loss": 0.3325, + "step": 9124 + }, + { + "epoch": 1.57, + "grad_norm": 13.353510856628418, + "learning_rate": 6.510211086322293e-06, + "loss": 0.6038, + "step": 9125 + }, + { + "epoch": 1.57, + "grad_norm": 9.9276123046875, + "learning_rate": 6.507636862879698e-06, + "loss": 0.4792, + "step": 9126 + }, + { + "epoch": 1.57, + "grad_norm": 10.834636688232422, + "learning_rate": 6.505062639437103e-06, + "loss": 0.5706, + "step": 9127 + }, + { + "epoch": 1.57, + "grad_norm": 8.11631965637207, + "learning_rate": 6.502488415994509e-06, + "loss": 0.4871, + "step": 9128 + }, + { + "epoch": 1.57, + "grad_norm": 7.8363213539123535, + "learning_rate": 6.499914192551913e-06, + "loss": 0.3815, + "step": 9129 + }, + { + "epoch": 1.57, + "grad_norm": 7.0677266120910645, + "learning_rate": 6.497339969109319e-06, + "loss": 0.2796, + "step": 9130 + }, + { + "epoch": 1.57, + "grad_norm": 10.577725410461426, + "learning_rate": 6.494765745666724e-06, + "loss": 0.344, + "step": 9131 + }, + { + "epoch": 1.57, + "grad_norm": 7.923539161682129, + "learning_rate": 6.49219152222413e-06, + "loss": 0.3185, + "step": 9132 + }, + { + "epoch": 1.57, + "grad_norm": 8.376360893249512, + "learning_rate": 6.489617298781535e-06, + "loss": 0.3198, + "step": 9133 + }, + { + "epoch": 1.57, + "grad_norm": 11.27021598815918, + "learning_rate": 6.487043075338939e-06, + "loss": 0.5073, + "step": 9134 + }, + { + "epoch": 1.57, + "grad_norm": 12.934722900390625, + "learning_rate": 6.484468851896345e-06, + "loss": 0.4404, + "step": 9135 + }, + { + "epoch": 1.57, + "grad_norm": 12.822287559509277, + "learning_rate": 6.48189462845375e-06, + "loss": 0.5774, + "step": 9136 + }, + { + "epoch": 1.57, + "grad_norm": 10.939071655273438, + "learning_rate": 6.479320405011156e-06, + "loss": 0.2943, + "step": 9137 + }, + { + "epoch": 1.57, + "grad_norm": 9.28674030303955, + "learning_rate": 6.476746181568561e-06, + "loss": 0.3376, + "step": 9138 + }, + { + "epoch": 1.57, + "grad_norm": 7.918496131896973, + "learning_rate": 6.4741719581259656e-06, + "loss": 0.2892, + "step": 9139 + }, + { + "epoch": 1.57, + "grad_norm": 10.129401206970215, + "learning_rate": 6.4715977346833706e-06, + "loss": 0.4588, + "step": 9140 + }, + { + "epoch": 1.57, + "grad_norm": 11.066826820373535, + "learning_rate": 6.4690235112407755e-06, + "loss": 0.4128, + "step": 9141 + }, + { + "epoch": 1.57, + "grad_norm": 9.561490058898926, + "learning_rate": 6.466449287798181e-06, + "loss": 0.4404, + "step": 9142 + }, + { + "epoch": 1.57, + "grad_norm": 11.176942825317383, + "learning_rate": 6.463875064355586e-06, + "loss": 0.3879, + "step": 9143 + }, + { + "epoch": 1.57, + "grad_norm": 10.307493209838867, + "learning_rate": 6.461300840912991e-06, + "loss": 0.4464, + "step": 9144 + }, + { + "epoch": 1.57, + "grad_norm": 10.318291664123535, + "learning_rate": 6.458726617470396e-06, + "loss": 0.4403, + "step": 9145 + }, + { + "epoch": 1.57, + "grad_norm": 10.240020751953125, + "learning_rate": 6.456152394027801e-06, + "loss": 0.4397, + "step": 9146 + }, + { + "epoch": 1.57, + "grad_norm": 11.238934516906738, + "learning_rate": 6.453578170585207e-06, + "loss": 0.3925, + "step": 9147 + }, + { + "epoch": 1.57, + "grad_norm": 8.540674209594727, + "learning_rate": 6.451003947142612e-06, + "loss": 0.2702, + "step": 9148 + }, + { + "epoch": 1.57, + "grad_norm": 12.108983039855957, + "learning_rate": 6.448429723700017e-06, + "loss": 0.3493, + "step": 9149 + }, + { + "epoch": 1.57, + "grad_norm": 10.288019180297852, + "learning_rate": 6.445855500257422e-06, + "loss": 0.5413, + "step": 9150 + }, + { + "epoch": 1.57, + "grad_norm": 8.925826072692871, + "learning_rate": 6.443281276814828e-06, + "loss": 0.3367, + "step": 9151 + }, + { + "epoch": 1.57, + "grad_norm": 7.938533306121826, + "learning_rate": 6.440707053372233e-06, + "loss": 0.3785, + "step": 9152 + }, + { + "epoch": 1.57, + "grad_norm": 13.446700096130371, + "learning_rate": 6.438132829929638e-06, + "loss": 0.4267, + "step": 9153 + }, + { + "epoch": 1.57, + "grad_norm": 14.088892936706543, + "learning_rate": 6.435558606487044e-06, + "loss": 0.477, + "step": 9154 + }, + { + "epoch": 1.57, + "grad_norm": 8.959019660949707, + "learning_rate": 6.432984383044448e-06, + "loss": 0.3541, + "step": 9155 + }, + { + "epoch": 1.57, + "grad_norm": 10.720050811767578, + "learning_rate": 6.430410159601854e-06, + "loss": 0.4112, + "step": 9156 + }, + { + "epoch": 1.57, + "grad_norm": 7.589910507202148, + "learning_rate": 6.427835936159259e-06, + "loss": 0.2835, + "step": 9157 + }, + { + "epoch": 1.57, + "grad_norm": 11.143238067626953, + "learning_rate": 6.4252617127166646e-06, + "loss": 0.4491, + "step": 9158 + }, + { + "epoch": 1.57, + "grad_norm": 10.869386672973633, + "learning_rate": 6.4226874892740696e-06, + "loss": 0.6889, + "step": 9159 + }, + { + "epoch": 1.57, + "grad_norm": 12.199326515197754, + "learning_rate": 6.420113265831474e-06, + "loss": 0.34, + "step": 9160 + }, + { + "epoch": 1.57, + "grad_norm": 8.466827392578125, + "learning_rate": 6.4175390423888795e-06, + "loss": 0.4387, + "step": 9161 + }, + { + "epoch": 1.57, + "grad_norm": 8.478174209594727, + "learning_rate": 6.4149648189462845e-06, + "loss": 0.4016, + "step": 9162 + }, + { + "epoch": 1.57, + "grad_norm": 12.79170036315918, + "learning_rate": 6.41239059550369e-06, + "loss": 0.3401, + "step": 9163 + }, + { + "epoch": 1.57, + "grad_norm": 9.144485473632812, + "learning_rate": 6.409816372061095e-06, + "loss": 0.2904, + "step": 9164 + }, + { + "epoch": 1.57, + "grad_norm": 6.949034214019775, + "learning_rate": 6.4072421486185e-06, + "loss": 0.3288, + "step": 9165 + }, + { + "epoch": 1.57, + "grad_norm": 9.652872085571289, + "learning_rate": 6.404667925175905e-06, + "loss": 0.5183, + "step": 9166 + }, + { + "epoch": 1.57, + "grad_norm": 8.029740333557129, + "learning_rate": 6.40209370173331e-06, + "loss": 0.4428, + "step": 9167 + }, + { + "epoch": 1.57, + "grad_norm": 13.10626220703125, + "learning_rate": 6.399519478290716e-06, + "loss": 0.4905, + "step": 9168 + }, + { + "epoch": 1.57, + "grad_norm": 13.636055946350098, + "learning_rate": 6.396945254848121e-06, + "loss": 0.3283, + "step": 9169 + }, + { + "epoch": 1.57, + "grad_norm": 9.930211067199707, + "learning_rate": 6.394371031405526e-06, + "loss": 0.5027, + "step": 9170 + }, + { + "epoch": 1.57, + "grad_norm": 11.66115665435791, + "learning_rate": 6.391796807962931e-06, + "loss": 0.5509, + "step": 9171 + }, + { + "epoch": 1.57, + "grad_norm": 7.8271050453186035, + "learning_rate": 6.389222584520336e-06, + "loss": 0.3067, + "step": 9172 + }, + { + "epoch": 1.57, + "grad_norm": 9.62724781036377, + "learning_rate": 6.386648361077742e-06, + "loss": 0.4757, + "step": 9173 + }, + { + "epoch": 1.57, + "grad_norm": 10.797287940979004, + "learning_rate": 6.384074137635147e-06, + "loss": 0.3841, + "step": 9174 + }, + { + "epoch": 1.57, + "grad_norm": 11.340659141540527, + "learning_rate": 6.381499914192552e-06, + "loss": 0.3596, + "step": 9175 + }, + { + "epoch": 1.57, + "grad_norm": 11.375767707824707, + "learning_rate": 6.378925690749957e-06, + "loss": 0.4161, + "step": 9176 + }, + { + "epoch": 1.57, + "grad_norm": 11.873882293701172, + "learning_rate": 6.376351467307363e-06, + "loss": 0.5284, + "step": 9177 + }, + { + "epoch": 1.58, + "grad_norm": 8.78491497039795, + "learning_rate": 6.373777243864768e-06, + "loss": 0.2966, + "step": 9178 + }, + { + "epoch": 1.58, + "grad_norm": 11.241011619567871, + "learning_rate": 6.371203020422173e-06, + "loss": 0.3889, + "step": 9179 + }, + { + "epoch": 1.58, + "grad_norm": 10.602612495422363, + "learning_rate": 6.3686287969795785e-06, + "loss": 0.4154, + "step": 9180 + }, + { + "epoch": 1.58, + "grad_norm": 8.44935131072998, + "learning_rate": 6.366054573536983e-06, + "loss": 0.2783, + "step": 9181 + }, + { + "epoch": 1.58, + "grad_norm": 11.485203742980957, + "learning_rate": 6.3634803500943885e-06, + "loss": 0.4032, + "step": 9182 + }, + { + "epoch": 1.58, + "grad_norm": 10.090597152709961, + "learning_rate": 6.3609061266517935e-06, + "loss": 0.3144, + "step": 9183 + }, + { + "epoch": 1.58, + "grad_norm": 7.790277004241943, + "learning_rate": 6.358331903209199e-06, + "loss": 0.3183, + "step": 9184 + }, + { + "epoch": 1.58, + "grad_norm": 12.959728240966797, + "learning_rate": 6.355757679766604e-06, + "loss": 0.4813, + "step": 9185 + }, + { + "epoch": 1.58, + "grad_norm": 8.528314590454102, + "learning_rate": 6.3531834563240084e-06, + "loss": 0.4418, + "step": 9186 + }, + { + "epoch": 1.58, + "grad_norm": 8.280416488647461, + "learning_rate": 6.350609232881414e-06, + "loss": 0.3754, + "step": 9187 + }, + { + "epoch": 1.58, + "grad_norm": 12.70815658569336, + "learning_rate": 6.348035009438819e-06, + "loss": 0.4866, + "step": 9188 + }, + { + "epoch": 1.58, + "grad_norm": 13.094280242919922, + "learning_rate": 6.345460785996225e-06, + "loss": 0.4987, + "step": 9189 + }, + { + "epoch": 1.58, + "grad_norm": 11.443658828735352, + "learning_rate": 6.34288656255363e-06, + "loss": 0.4744, + "step": 9190 + }, + { + "epoch": 1.58, + "grad_norm": 12.062834739685059, + "learning_rate": 6.340312339111034e-06, + "loss": 0.4508, + "step": 9191 + }, + { + "epoch": 1.58, + "grad_norm": 8.706581115722656, + "learning_rate": 6.33773811566844e-06, + "loss": 0.4386, + "step": 9192 + }, + { + "epoch": 1.58, + "grad_norm": 11.26957893371582, + "learning_rate": 6.335163892225845e-06, + "loss": 0.401, + "step": 9193 + }, + { + "epoch": 1.58, + "grad_norm": 11.478545188903809, + "learning_rate": 6.332589668783251e-06, + "loss": 0.4432, + "step": 9194 + }, + { + "epoch": 1.58, + "grad_norm": 11.519157409667969, + "learning_rate": 6.330015445340656e-06, + "loss": 0.5666, + "step": 9195 + }, + { + "epoch": 1.58, + "grad_norm": 10.237707138061523, + "learning_rate": 6.327441221898061e-06, + "loss": 0.464, + "step": 9196 + }, + { + "epoch": 1.58, + "grad_norm": 8.189074516296387, + "learning_rate": 6.324866998455466e-06, + "loss": 0.3563, + "step": 9197 + }, + { + "epoch": 1.58, + "grad_norm": 12.335261344909668, + "learning_rate": 6.322292775012871e-06, + "loss": 0.4736, + "step": 9198 + }, + { + "epoch": 1.58, + "grad_norm": 9.836599349975586, + "learning_rate": 6.319718551570277e-06, + "loss": 0.3335, + "step": 9199 + }, + { + "epoch": 1.58, + "grad_norm": 11.865789413452148, + "learning_rate": 6.317144328127682e-06, + "loss": 0.2711, + "step": 9200 + }, + { + "epoch": 1.58, + "grad_norm": 11.779120445251465, + "learning_rate": 6.314570104685087e-06, + "loss": 0.4434, + "step": 9201 + }, + { + "epoch": 1.58, + "grad_norm": 14.948101043701172, + "learning_rate": 6.311995881242492e-06, + "loss": 0.398, + "step": 9202 + }, + { + "epoch": 1.58, + "grad_norm": 7.8967084884643555, + "learning_rate": 6.3094216577998975e-06, + "loss": 0.2803, + "step": 9203 + }, + { + "epoch": 1.58, + "grad_norm": 10.806225776672363, + "learning_rate": 6.3068474343573025e-06, + "loss": 0.3499, + "step": 9204 + }, + { + "epoch": 1.58, + "grad_norm": 11.917614936828613, + "learning_rate": 6.3042732109147074e-06, + "loss": 0.6543, + "step": 9205 + }, + { + "epoch": 1.58, + "grad_norm": 10.861566543579102, + "learning_rate": 6.301698987472113e-06, + "loss": 0.394, + "step": 9206 + }, + { + "epoch": 1.58, + "grad_norm": 13.328132629394531, + "learning_rate": 6.299124764029517e-06, + "loss": 0.3762, + "step": 9207 + }, + { + "epoch": 1.58, + "grad_norm": 7.741944313049316, + "learning_rate": 6.296550540586923e-06, + "loss": 0.266, + "step": 9208 + }, + { + "epoch": 1.58, + "grad_norm": 14.820136070251465, + "learning_rate": 6.293976317144328e-06, + "loss": 0.3392, + "step": 9209 + }, + { + "epoch": 1.58, + "grad_norm": 8.18803882598877, + "learning_rate": 6.291402093701734e-06, + "loss": 0.3968, + "step": 9210 + }, + { + "epoch": 1.58, + "grad_norm": 10.384278297424316, + "learning_rate": 6.288827870259139e-06, + "loss": 0.3269, + "step": 9211 + }, + { + "epoch": 1.58, + "grad_norm": 7.6448140144348145, + "learning_rate": 6.286253646816543e-06, + "loss": 0.242, + "step": 9212 + }, + { + "epoch": 1.58, + "grad_norm": 7.856287002563477, + "learning_rate": 6.283679423373949e-06, + "loss": 0.2546, + "step": 9213 + }, + { + "epoch": 1.58, + "grad_norm": 7.850491523742676, + "learning_rate": 6.281105199931354e-06, + "loss": 0.3218, + "step": 9214 + }, + { + "epoch": 1.58, + "grad_norm": 10.62674617767334, + "learning_rate": 6.27853097648876e-06, + "loss": 0.4675, + "step": 9215 + }, + { + "epoch": 1.58, + "grad_norm": 12.44118595123291, + "learning_rate": 6.275956753046165e-06, + "loss": 0.4394, + "step": 9216 + }, + { + "epoch": 1.58, + "grad_norm": 13.37971305847168, + "learning_rate": 6.273382529603569e-06, + "loss": 0.5001, + "step": 9217 + }, + { + "epoch": 1.58, + "grad_norm": 11.927918434143066, + "learning_rate": 6.270808306160975e-06, + "loss": 0.4025, + "step": 9218 + }, + { + "epoch": 1.58, + "grad_norm": 9.683401107788086, + "learning_rate": 6.26823408271838e-06, + "loss": 0.412, + "step": 9219 + }, + { + "epoch": 1.58, + "grad_norm": 9.079011917114258, + "learning_rate": 6.265659859275786e-06, + "loss": 0.419, + "step": 9220 + }, + { + "epoch": 1.58, + "grad_norm": 9.040717124938965, + "learning_rate": 6.263085635833191e-06, + "loss": 0.2575, + "step": 9221 + }, + { + "epoch": 1.58, + "grad_norm": 8.044816970825195, + "learning_rate": 6.260511412390596e-06, + "loss": 0.2736, + "step": 9222 + }, + { + "epoch": 1.58, + "grad_norm": 9.928059577941895, + "learning_rate": 6.257937188948001e-06, + "loss": 0.4892, + "step": 9223 + }, + { + "epoch": 1.58, + "grad_norm": 13.411823272705078, + "learning_rate": 6.255362965505406e-06, + "loss": 0.6102, + "step": 9224 + }, + { + "epoch": 1.58, + "grad_norm": 12.721311569213867, + "learning_rate": 6.252788742062811e-06, + "loss": 0.4807, + "step": 9225 + }, + { + "epoch": 1.58, + "grad_norm": 9.812295913696289, + "learning_rate": 6.250214518620216e-06, + "loss": 0.5312, + "step": 9226 + }, + { + "epoch": 1.58, + "grad_norm": 12.255032539367676, + "learning_rate": 6.247640295177621e-06, + "loss": 0.6739, + "step": 9227 + }, + { + "epoch": 1.58, + "grad_norm": 18.508779525756836, + "learning_rate": 6.245066071735026e-06, + "loss": 0.5966, + "step": 9228 + }, + { + "epoch": 1.58, + "grad_norm": 10.748486518859863, + "learning_rate": 6.242491848292432e-06, + "loss": 0.4089, + "step": 9229 + }, + { + "epoch": 1.58, + "grad_norm": 17.84903335571289, + "learning_rate": 6.239917624849837e-06, + "loss": 0.5221, + "step": 9230 + }, + { + "epoch": 1.58, + "grad_norm": 8.814446449279785, + "learning_rate": 6.237343401407242e-06, + "loss": 0.5019, + "step": 9231 + }, + { + "epoch": 1.58, + "grad_norm": 12.281661033630371, + "learning_rate": 6.234769177964648e-06, + "loss": 0.5408, + "step": 9232 + }, + { + "epoch": 1.58, + "grad_norm": 14.548513412475586, + "learning_rate": 6.232194954522052e-06, + "loss": 0.4943, + "step": 9233 + }, + { + "epoch": 1.58, + "grad_norm": 12.791305541992188, + "learning_rate": 6.229620731079458e-06, + "loss": 0.4264, + "step": 9234 + }, + { + "epoch": 1.58, + "grad_norm": 9.57929515838623, + "learning_rate": 6.227046507636863e-06, + "loss": 0.3723, + "step": 9235 + }, + { + "epoch": 1.59, + "grad_norm": 11.174640655517578, + "learning_rate": 6.224472284194269e-06, + "loss": 0.4752, + "step": 9236 + }, + { + "epoch": 1.59, + "grad_norm": 17.794071197509766, + "learning_rate": 6.221898060751674e-06, + "loss": 0.4902, + "step": 9237 + }, + { + "epoch": 1.59, + "grad_norm": 14.173076629638672, + "learning_rate": 6.219323837309078e-06, + "loss": 0.3757, + "step": 9238 + }, + { + "epoch": 1.59, + "grad_norm": 10.48410415649414, + "learning_rate": 6.216749613866484e-06, + "loss": 0.2799, + "step": 9239 + }, + { + "epoch": 1.59, + "grad_norm": 6.611170291900635, + "learning_rate": 6.214175390423889e-06, + "loss": 0.3328, + "step": 9240 + }, + { + "epoch": 1.59, + "grad_norm": 7.497977256774902, + "learning_rate": 6.211601166981295e-06, + "loss": 0.4005, + "step": 9241 + }, + { + "epoch": 1.59, + "grad_norm": 8.058220863342285, + "learning_rate": 6.2090269435387e-06, + "loss": 0.3598, + "step": 9242 + }, + { + "epoch": 1.59, + "grad_norm": 10.694328308105469, + "learning_rate": 6.206452720096104e-06, + "loss": 0.3787, + "step": 9243 + }, + { + "epoch": 1.59, + "grad_norm": 11.689227104187012, + "learning_rate": 6.2038784966535096e-06, + "loss": 0.607, + "step": 9244 + }, + { + "epoch": 1.59, + "grad_norm": 9.597221374511719, + "learning_rate": 6.2013042732109146e-06, + "loss": 0.3211, + "step": 9245 + }, + { + "epoch": 1.59, + "grad_norm": 7.371646881103516, + "learning_rate": 6.19873004976832e-06, + "loss": 0.3632, + "step": 9246 + }, + { + "epoch": 1.59, + "grad_norm": 11.772314071655273, + "learning_rate": 6.196155826325725e-06, + "loss": 0.5425, + "step": 9247 + }, + { + "epoch": 1.59, + "grad_norm": 9.395363807678223, + "learning_rate": 6.19358160288313e-06, + "loss": 0.2751, + "step": 9248 + }, + { + "epoch": 1.59, + "grad_norm": 13.440888404846191, + "learning_rate": 6.191007379440535e-06, + "loss": 0.622, + "step": 9249 + }, + { + "epoch": 1.59, + "grad_norm": 12.794041633605957, + "learning_rate": 6.18843315599794e-06, + "loss": 0.3511, + "step": 9250 + }, + { + "epoch": 1.59, + "grad_norm": 14.800326347351074, + "learning_rate": 6.185858932555346e-06, + "loss": 0.5923, + "step": 9251 + }, + { + "epoch": 1.59, + "grad_norm": 12.731034278869629, + "learning_rate": 6.183284709112751e-06, + "loss": 0.5992, + "step": 9252 + }, + { + "epoch": 1.59, + "grad_norm": 8.585034370422363, + "learning_rate": 6.180710485670156e-06, + "loss": 0.3608, + "step": 9253 + }, + { + "epoch": 1.59, + "grad_norm": 15.079184532165527, + "learning_rate": 6.178136262227561e-06, + "loss": 0.5047, + "step": 9254 + }, + { + "epoch": 1.59, + "grad_norm": 10.937750816345215, + "learning_rate": 6.175562038784967e-06, + "loss": 0.3699, + "step": 9255 + }, + { + "epoch": 1.59, + "grad_norm": 8.716346740722656, + "learning_rate": 6.172987815342372e-06, + "loss": 0.4097, + "step": 9256 + }, + { + "epoch": 1.59, + "grad_norm": 9.026440620422363, + "learning_rate": 6.170413591899777e-06, + "loss": 0.401, + "step": 9257 + }, + { + "epoch": 1.59, + "grad_norm": 8.726183891296387, + "learning_rate": 6.167839368457183e-06, + "loss": 0.4434, + "step": 9258 + }, + { + "epoch": 1.59, + "grad_norm": 11.557024955749512, + "learning_rate": 6.165265145014587e-06, + "loss": 0.5816, + "step": 9259 + }, + { + "epoch": 1.59, + "grad_norm": 12.54609203338623, + "learning_rate": 6.162690921571993e-06, + "loss": 0.4758, + "step": 9260 + }, + { + "epoch": 1.59, + "grad_norm": 14.337329864501953, + "learning_rate": 6.160116698129398e-06, + "loss": 0.624, + "step": 9261 + }, + { + "epoch": 1.59, + "grad_norm": 9.918791770935059, + "learning_rate": 6.157542474686804e-06, + "loss": 0.2915, + "step": 9262 + }, + { + "epoch": 1.59, + "grad_norm": 9.622333526611328, + "learning_rate": 6.1549682512442086e-06, + "loss": 0.4079, + "step": 9263 + }, + { + "epoch": 1.59, + "grad_norm": 13.2561674118042, + "learning_rate": 6.152394027801613e-06, + "loss": 0.3809, + "step": 9264 + }, + { + "epoch": 1.59, + "grad_norm": 10.55340576171875, + "learning_rate": 6.1498198043590185e-06, + "loss": 0.5374, + "step": 9265 + }, + { + "epoch": 1.59, + "grad_norm": 8.96012020111084, + "learning_rate": 6.1472455809164235e-06, + "loss": 0.356, + "step": 9266 + }, + { + "epoch": 1.59, + "grad_norm": 14.459707260131836, + "learning_rate": 6.144671357473829e-06, + "loss": 0.6339, + "step": 9267 + }, + { + "epoch": 1.59, + "grad_norm": 8.842348098754883, + "learning_rate": 6.142097134031234e-06, + "loss": 0.2857, + "step": 9268 + }, + { + "epoch": 1.59, + "grad_norm": 11.259499549865723, + "learning_rate": 6.1395229105886385e-06, + "loss": 0.3934, + "step": 9269 + }, + { + "epoch": 1.59, + "grad_norm": 9.485591888427734, + "learning_rate": 6.136948687146044e-06, + "loss": 0.3819, + "step": 9270 + }, + { + "epoch": 1.59, + "grad_norm": 7.002604961395264, + "learning_rate": 6.134374463703449e-06, + "loss": 0.2309, + "step": 9271 + }, + { + "epoch": 1.59, + "grad_norm": 15.152010917663574, + "learning_rate": 6.131800240260855e-06, + "loss": 0.5052, + "step": 9272 + }, + { + "epoch": 1.59, + "grad_norm": 9.924613952636719, + "learning_rate": 6.12922601681826e-06, + "loss": 0.5115, + "step": 9273 + }, + { + "epoch": 1.59, + "grad_norm": 13.100393295288086, + "learning_rate": 6.126651793375665e-06, + "loss": 0.4541, + "step": 9274 + }, + { + "epoch": 1.59, + "grad_norm": 13.020744323730469, + "learning_rate": 6.12407756993307e-06, + "loss": 0.5774, + "step": 9275 + }, + { + "epoch": 1.59, + "grad_norm": 8.655982971191406, + "learning_rate": 6.121503346490475e-06, + "loss": 0.3284, + "step": 9276 + }, + { + "epoch": 1.59, + "grad_norm": 10.302045822143555, + "learning_rate": 6.118929123047881e-06, + "loss": 0.2654, + "step": 9277 + }, + { + "epoch": 1.59, + "grad_norm": 10.965720176696777, + "learning_rate": 6.116354899605286e-06, + "loss": 0.5395, + "step": 9278 + }, + { + "epoch": 1.59, + "grad_norm": 9.560702323913574, + "learning_rate": 6.113780676162691e-06, + "loss": 0.218, + "step": 9279 + }, + { + "epoch": 1.59, + "grad_norm": 11.798632621765137, + "learning_rate": 6.111206452720096e-06, + "loss": 0.3781, + "step": 9280 + }, + { + "epoch": 1.59, + "grad_norm": 9.915528297424316, + "learning_rate": 6.108632229277502e-06, + "loss": 0.2927, + "step": 9281 + }, + { + "epoch": 1.59, + "grad_norm": 10.718039512634277, + "learning_rate": 6.106058005834907e-06, + "loss": 0.3748, + "step": 9282 + }, + { + "epoch": 1.59, + "grad_norm": 12.610981941223145, + "learning_rate": 6.103483782392312e-06, + "loss": 0.4156, + "step": 9283 + }, + { + "epoch": 1.59, + "grad_norm": 8.602054595947266, + "learning_rate": 6.1009095589497175e-06, + "loss": 0.3729, + "step": 9284 + }, + { + "epoch": 1.59, + "grad_norm": 9.089277267456055, + "learning_rate": 6.098335335507122e-06, + "loss": 0.3612, + "step": 9285 + }, + { + "epoch": 1.59, + "grad_norm": 13.430428504943848, + "learning_rate": 6.0957611120645275e-06, + "loss": 0.5332, + "step": 9286 + }, + { + "epoch": 1.59, + "grad_norm": 11.575677871704102, + "learning_rate": 6.0931868886219325e-06, + "loss": 0.3386, + "step": 9287 + }, + { + "epoch": 1.59, + "grad_norm": 11.176405906677246, + "learning_rate": 6.090612665179338e-06, + "loss": 0.494, + "step": 9288 + }, + { + "epoch": 1.59, + "grad_norm": 13.538809776306152, + "learning_rate": 6.088038441736743e-06, + "loss": 0.5416, + "step": 9289 + }, + { + "epoch": 1.59, + "grad_norm": 10.619078636169434, + "learning_rate": 6.0854642182941475e-06, + "loss": 0.4912, + "step": 9290 + }, + { + "epoch": 1.59, + "grad_norm": 19.25139045715332, + "learning_rate": 6.082889994851553e-06, + "loss": 0.3834, + "step": 9291 + }, + { + "epoch": 1.59, + "grad_norm": 7.719689846038818, + "learning_rate": 6.080315771408958e-06, + "loss": 0.3456, + "step": 9292 + }, + { + "epoch": 1.59, + "grad_norm": 11.243241310119629, + "learning_rate": 6.077741547966364e-06, + "loss": 0.5009, + "step": 9293 + }, + { + "epoch": 1.59, + "grad_norm": 8.630461692810059, + "learning_rate": 6.075167324523769e-06, + "loss": 0.3576, + "step": 9294 + }, + { + "epoch": 1.6, + "grad_norm": 14.423848152160645, + "learning_rate": 6.072593101081173e-06, + "loss": 0.5059, + "step": 9295 + }, + { + "epoch": 1.6, + "grad_norm": 13.479129791259766, + "learning_rate": 6.070018877638579e-06, + "loss": 0.5272, + "step": 9296 + }, + { + "epoch": 1.6, + "grad_norm": 9.272512435913086, + "learning_rate": 6.067444654195984e-06, + "loss": 0.3023, + "step": 9297 + }, + { + "epoch": 1.6, + "grad_norm": 11.018547058105469, + "learning_rate": 6.06487043075339e-06, + "loss": 0.5643, + "step": 9298 + }, + { + "epoch": 1.6, + "grad_norm": 9.156089782714844, + "learning_rate": 6.062296207310795e-06, + "loss": 0.3727, + "step": 9299 + }, + { + "epoch": 1.6, + "grad_norm": 12.55975341796875, + "learning_rate": 6.0597219838682e-06, + "loss": 0.2611, + "step": 9300 + }, + { + "epoch": 1.6, + "grad_norm": 10.020785331726074, + "learning_rate": 6.057147760425605e-06, + "loss": 0.3822, + "step": 9301 + }, + { + "epoch": 1.6, + "grad_norm": 13.141971588134766, + "learning_rate": 6.05457353698301e-06, + "loss": 0.5198, + "step": 9302 + }, + { + "epoch": 1.6, + "grad_norm": 12.88416576385498, + "learning_rate": 6.051999313540416e-06, + "loss": 0.5505, + "step": 9303 + }, + { + "epoch": 1.6, + "grad_norm": 8.901981353759766, + "learning_rate": 6.049425090097821e-06, + "loss": 0.2862, + "step": 9304 + }, + { + "epoch": 1.6, + "grad_norm": 7.45460319519043, + "learning_rate": 6.046850866655226e-06, + "loss": 0.3459, + "step": 9305 + }, + { + "epoch": 1.6, + "grad_norm": 10.849007606506348, + "learning_rate": 6.044276643212631e-06, + "loss": 0.3254, + "step": 9306 + }, + { + "epoch": 1.6, + "grad_norm": 16.164941787719727, + "learning_rate": 6.0417024197700365e-06, + "loss": 0.5973, + "step": 9307 + }, + { + "epoch": 1.6, + "grad_norm": 10.208996772766113, + "learning_rate": 6.0391281963274415e-06, + "loss": 0.4844, + "step": 9308 + }, + { + "epoch": 1.6, + "grad_norm": 10.36326789855957, + "learning_rate": 6.0365539728848465e-06, + "loss": 0.3321, + "step": 9309 + }, + { + "epoch": 1.6, + "grad_norm": 14.12881088256836, + "learning_rate": 6.033979749442252e-06, + "loss": 0.4278, + "step": 9310 + }, + { + "epoch": 1.6, + "grad_norm": 13.57091999053955, + "learning_rate": 6.0314055259996564e-06, + "loss": 0.3726, + "step": 9311 + }, + { + "epoch": 1.6, + "grad_norm": 15.636283874511719, + "learning_rate": 6.028831302557062e-06, + "loss": 0.392, + "step": 9312 + }, + { + "epoch": 1.6, + "grad_norm": 11.948846817016602, + "learning_rate": 6.026257079114467e-06, + "loss": 0.2689, + "step": 9313 + }, + { + "epoch": 1.6, + "grad_norm": 8.179314613342285, + "learning_rate": 6.023682855671873e-06, + "loss": 0.368, + "step": 9314 + }, + { + "epoch": 1.6, + "grad_norm": 13.047548294067383, + "learning_rate": 6.021108632229278e-06, + "loss": 0.3608, + "step": 9315 + }, + { + "epoch": 1.6, + "grad_norm": 8.82796859741211, + "learning_rate": 6.018534408786682e-06, + "loss": 0.353, + "step": 9316 + }, + { + "epoch": 1.6, + "grad_norm": 9.916614532470703, + "learning_rate": 6.015960185344088e-06, + "loss": 0.4454, + "step": 9317 + }, + { + "epoch": 1.6, + "grad_norm": 8.81261920928955, + "learning_rate": 6.013385961901493e-06, + "loss": 0.3534, + "step": 9318 + }, + { + "epoch": 1.6, + "grad_norm": 8.931442260742188, + "learning_rate": 6.010811738458899e-06, + "loss": 0.4239, + "step": 9319 + }, + { + "epoch": 1.6, + "grad_norm": 11.180985450744629, + "learning_rate": 6.008237515016304e-06, + "loss": 0.4094, + "step": 9320 + }, + { + "epoch": 1.6, + "grad_norm": 13.366473197937012, + "learning_rate": 6.005663291573708e-06, + "loss": 0.6373, + "step": 9321 + }, + { + "epoch": 1.6, + "grad_norm": 15.796895980834961, + "learning_rate": 6.003089068131114e-06, + "loss": 0.5091, + "step": 9322 + }, + { + "epoch": 1.6, + "grad_norm": 7.969415664672852, + "learning_rate": 6.000514844688519e-06, + "loss": 0.444, + "step": 9323 + }, + { + "epoch": 1.6, + "grad_norm": 9.278087615966797, + "learning_rate": 5.997940621245925e-06, + "loss": 0.2579, + "step": 9324 + }, + { + "epoch": 1.6, + "grad_norm": 12.328569412231445, + "learning_rate": 5.99536639780333e-06, + "loss": 0.4803, + "step": 9325 + }, + { + "epoch": 1.6, + "grad_norm": 14.177692413330078, + "learning_rate": 5.992792174360735e-06, + "loss": 0.4947, + "step": 9326 + }, + { + "epoch": 1.6, + "grad_norm": 9.504905700683594, + "learning_rate": 5.99021795091814e-06, + "loss": 0.3134, + "step": 9327 + }, + { + "epoch": 1.6, + "grad_norm": 12.254817008972168, + "learning_rate": 5.987643727475545e-06, + "loss": 0.4861, + "step": 9328 + }, + { + "epoch": 1.6, + "grad_norm": 13.166810989379883, + "learning_rate": 5.9850695040329504e-06, + "loss": 0.363, + "step": 9329 + }, + { + "epoch": 1.6, + "grad_norm": 7.391650676727295, + "learning_rate": 5.982495280590355e-06, + "loss": 0.2166, + "step": 9330 + }, + { + "epoch": 1.6, + "grad_norm": 11.902772903442383, + "learning_rate": 5.97992105714776e-06, + "loss": 0.5765, + "step": 9331 + }, + { + "epoch": 1.6, + "grad_norm": 9.624422073364258, + "learning_rate": 5.977346833705165e-06, + "loss": 0.4744, + "step": 9332 + }, + { + "epoch": 1.6, + "grad_norm": 15.857745170593262, + "learning_rate": 5.974772610262571e-06, + "loss": 0.4948, + "step": 9333 + }, + { + "epoch": 1.6, + "grad_norm": 10.925243377685547, + "learning_rate": 5.972198386819976e-06, + "loss": 0.3548, + "step": 9334 + }, + { + "epoch": 1.6, + "grad_norm": 10.138570785522461, + "learning_rate": 5.969624163377381e-06, + "loss": 0.3301, + "step": 9335 + }, + { + "epoch": 1.6, + "grad_norm": 11.094758033752441, + "learning_rate": 5.967049939934787e-06, + "loss": 0.4501, + "step": 9336 + }, + { + "epoch": 1.6, + "grad_norm": 13.87204360961914, + "learning_rate": 5.964475716492191e-06, + "loss": 0.6357, + "step": 9337 + }, + { + "epoch": 1.6, + "grad_norm": 10.850410461425781, + "learning_rate": 5.961901493049597e-06, + "loss": 0.4706, + "step": 9338 + }, + { + "epoch": 1.6, + "grad_norm": 9.774003982543945, + "learning_rate": 5.959327269607002e-06, + "loss": 0.2983, + "step": 9339 + }, + { + "epoch": 1.6, + "grad_norm": 11.635725021362305, + "learning_rate": 5.956753046164408e-06, + "loss": 0.4463, + "step": 9340 + }, + { + "epoch": 1.6, + "grad_norm": 12.130509376525879, + "learning_rate": 5.954178822721813e-06, + "loss": 0.2974, + "step": 9341 + }, + { + "epoch": 1.6, + "grad_norm": 10.487428665161133, + "learning_rate": 5.951604599279217e-06, + "loss": 0.3373, + "step": 9342 + }, + { + "epoch": 1.6, + "grad_norm": 12.86618423461914, + "learning_rate": 5.949030375836623e-06, + "loss": 0.4209, + "step": 9343 + }, + { + "epoch": 1.6, + "grad_norm": 13.71156120300293, + "learning_rate": 5.946456152394028e-06, + "loss": 0.4609, + "step": 9344 + }, + { + "epoch": 1.6, + "grad_norm": 10.448272705078125, + "learning_rate": 5.943881928951434e-06, + "loss": 0.4087, + "step": 9345 + }, + { + "epoch": 1.6, + "grad_norm": 12.952189445495605, + "learning_rate": 5.941307705508839e-06, + "loss": 0.5209, + "step": 9346 + }, + { + "epoch": 1.6, + "grad_norm": 10.552382469177246, + "learning_rate": 5.938733482066243e-06, + "loss": 0.4547, + "step": 9347 + }, + { + "epoch": 1.6, + "grad_norm": 11.495433807373047, + "learning_rate": 5.936159258623649e-06, + "loss": 0.4151, + "step": 9348 + }, + { + "epoch": 1.6, + "grad_norm": 12.510540008544922, + "learning_rate": 5.9335850351810536e-06, + "loss": 0.4371, + "step": 9349 + }, + { + "epoch": 1.6, + "grad_norm": 10.461602210998535, + "learning_rate": 5.931010811738459e-06, + "loss": 0.3226, + "step": 9350 + }, + { + "epoch": 1.6, + "grad_norm": 8.195796966552734, + "learning_rate": 5.928436588295864e-06, + "loss": 0.3189, + "step": 9351 + }, + { + "epoch": 1.6, + "grad_norm": 6.487024307250977, + "learning_rate": 5.925862364853269e-06, + "loss": 0.2571, + "step": 9352 + }, + { + "epoch": 1.61, + "grad_norm": 10.922492027282715, + "learning_rate": 5.923288141410674e-06, + "loss": 0.5108, + "step": 9353 + }, + { + "epoch": 1.61, + "grad_norm": 9.246350288391113, + "learning_rate": 5.920713917968079e-06, + "loss": 0.3148, + "step": 9354 + }, + { + "epoch": 1.61, + "grad_norm": 10.257803916931152, + "learning_rate": 5.918139694525485e-06, + "loss": 0.4622, + "step": 9355 + }, + { + "epoch": 1.61, + "grad_norm": 9.560223579406738, + "learning_rate": 5.91556547108289e-06, + "loss": 0.4447, + "step": 9356 + }, + { + "epoch": 1.61, + "grad_norm": 8.579278945922852, + "learning_rate": 5.912991247640295e-06, + "loss": 0.2871, + "step": 9357 + }, + { + "epoch": 1.61, + "grad_norm": 8.096439361572266, + "learning_rate": 5.9104170241977e-06, + "loss": 0.4621, + "step": 9358 + }, + { + "epoch": 1.61, + "grad_norm": 14.200118064880371, + "learning_rate": 5.907842800755106e-06, + "loss": 0.5691, + "step": 9359 + }, + { + "epoch": 1.61, + "grad_norm": 10.151054382324219, + "learning_rate": 5.905268577312511e-06, + "loss": 0.4102, + "step": 9360 + }, + { + "epoch": 1.61, + "grad_norm": 6.757876396179199, + "learning_rate": 5.902694353869916e-06, + "loss": 0.257, + "step": 9361 + }, + { + "epoch": 1.61, + "grad_norm": 7.452271461486816, + "learning_rate": 5.900120130427322e-06, + "loss": 0.3395, + "step": 9362 + }, + { + "epoch": 1.61, + "grad_norm": 10.132145881652832, + "learning_rate": 5.897545906984726e-06, + "loss": 0.4416, + "step": 9363 + }, + { + "epoch": 1.61, + "grad_norm": 9.626276969909668, + "learning_rate": 5.894971683542132e-06, + "loss": 0.3473, + "step": 9364 + }, + { + "epoch": 1.61, + "grad_norm": 12.145108222961426, + "learning_rate": 5.892397460099537e-06, + "loss": 0.4639, + "step": 9365 + }, + { + "epoch": 1.61, + "grad_norm": 11.048547744750977, + "learning_rate": 5.889823236656942e-06, + "loss": 0.3527, + "step": 9366 + }, + { + "epoch": 1.61, + "grad_norm": 13.349409103393555, + "learning_rate": 5.8872490132143476e-06, + "loss": 0.6066, + "step": 9367 + }, + { + "epoch": 1.61, + "grad_norm": 7.787160396575928, + "learning_rate": 5.884674789771752e-06, + "loss": 0.2924, + "step": 9368 + }, + { + "epoch": 1.61, + "grad_norm": 13.691932678222656, + "learning_rate": 5.8821005663291576e-06, + "loss": 0.4399, + "step": 9369 + }, + { + "epoch": 1.61, + "grad_norm": 10.421937942504883, + "learning_rate": 5.8795263428865625e-06, + "loss": 0.4559, + "step": 9370 + }, + { + "epoch": 1.61, + "grad_norm": 12.742603302001953, + "learning_rate": 5.876952119443968e-06, + "loss": 0.642, + "step": 9371 + }, + { + "epoch": 1.61, + "grad_norm": 10.500776290893555, + "learning_rate": 5.874377896001373e-06, + "loss": 0.3622, + "step": 9372 + }, + { + "epoch": 1.61, + "grad_norm": 10.78892707824707, + "learning_rate": 5.8718036725587775e-06, + "loss": 0.5199, + "step": 9373 + }, + { + "epoch": 1.61, + "grad_norm": 10.555265426635742, + "learning_rate": 5.869229449116183e-06, + "loss": 0.3516, + "step": 9374 + }, + { + "epoch": 1.61, + "grad_norm": 7.721681594848633, + "learning_rate": 5.866655225673588e-06, + "loss": 0.3237, + "step": 9375 + }, + { + "epoch": 1.61, + "grad_norm": 15.01641845703125, + "learning_rate": 5.864081002230994e-06, + "loss": 0.4826, + "step": 9376 + }, + { + "epoch": 1.61, + "grad_norm": 7.306987762451172, + "learning_rate": 5.861506778788399e-06, + "loss": 0.2503, + "step": 9377 + }, + { + "epoch": 1.61, + "grad_norm": 8.787321090698242, + "learning_rate": 5.858932555345804e-06, + "loss": 0.2964, + "step": 9378 + }, + { + "epoch": 1.61, + "grad_norm": 10.207783699035645, + "learning_rate": 5.856358331903209e-06, + "loss": 0.4971, + "step": 9379 + }, + { + "epoch": 1.61, + "grad_norm": 12.970616340637207, + "learning_rate": 5.853784108460614e-06, + "loss": 0.4826, + "step": 9380 + }, + { + "epoch": 1.61, + "grad_norm": 8.678253173828125, + "learning_rate": 5.85120988501802e-06, + "loss": 0.3736, + "step": 9381 + }, + { + "epoch": 1.61, + "grad_norm": 9.0572509765625, + "learning_rate": 5.848635661575425e-06, + "loss": 0.3381, + "step": 9382 + }, + { + "epoch": 1.61, + "grad_norm": 10.030983924865723, + "learning_rate": 5.846061438132831e-06, + "loss": 0.3966, + "step": 9383 + }, + { + "epoch": 1.61, + "grad_norm": 10.285091400146484, + "learning_rate": 5.843487214690235e-06, + "loss": 0.4965, + "step": 9384 + }, + { + "epoch": 1.61, + "grad_norm": 7.010630130767822, + "learning_rate": 5.840912991247641e-06, + "loss": 0.2905, + "step": 9385 + }, + { + "epoch": 1.61, + "grad_norm": 11.928468704223633, + "learning_rate": 5.838338767805046e-06, + "loss": 0.3296, + "step": 9386 + }, + { + "epoch": 1.61, + "grad_norm": 15.42886734008789, + "learning_rate": 5.835764544362451e-06, + "loss": 0.332, + "step": 9387 + }, + { + "epoch": 1.61, + "grad_norm": 8.908339500427246, + "learning_rate": 5.8331903209198565e-06, + "loss": 0.4273, + "step": 9388 + }, + { + "epoch": 1.61, + "grad_norm": 7.1969499588012695, + "learning_rate": 5.830616097477261e-06, + "loss": 0.223, + "step": 9389 + }, + { + "epoch": 1.61, + "grad_norm": 9.444570541381836, + "learning_rate": 5.8280418740346665e-06, + "loss": 0.3362, + "step": 9390 + }, + { + "epoch": 1.61, + "grad_norm": 11.364608764648438, + "learning_rate": 5.8254676505920715e-06, + "loss": 0.3234, + "step": 9391 + }, + { + "epoch": 1.61, + "grad_norm": 10.901269912719727, + "learning_rate": 5.8228934271494765e-06, + "loss": 0.5691, + "step": 9392 + }, + { + "epoch": 1.61, + "grad_norm": 10.817156791687012, + "learning_rate": 5.820319203706882e-06, + "loss": 0.3869, + "step": 9393 + }, + { + "epoch": 1.61, + "grad_norm": 7.694347858428955, + "learning_rate": 5.8177449802642865e-06, + "loss": 0.3568, + "step": 9394 + }, + { + "epoch": 1.61, + "grad_norm": 7.901276588439941, + "learning_rate": 5.815170756821692e-06, + "loss": 0.3348, + "step": 9395 + }, + { + "epoch": 1.61, + "grad_norm": 9.192615509033203, + "learning_rate": 5.812596533379097e-06, + "loss": 0.2995, + "step": 9396 + }, + { + "epoch": 1.61, + "grad_norm": 10.507491111755371, + "learning_rate": 5.810022309936503e-06, + "loss": 0.3902, + "step": 9397 + }, + { + "epoch": 1.61, + "grad_norm": 9.135817527770996, + "learning_rate": 5.807448086493908e-06, + "loss": 0.3596, + "step": 9398 + }, + { + "epoch": 1.61, + "grad_norm": 15.679965019226074, + "learning_rate": 5.804873863051312e-06, + "loss": 0.4209, + "step": 9399 + }, + { + "epoch": 1.61, + "grad_norm": 16.001195907592773, + "learning_rate": 5.802299639608718e-06, + "loss": 0.5544, + "step": 9400 + }, + { + "epoch": 1.61, + "grad_norm": 11.406121253967285, + "learning_rate": 5.799725416166123e-06, + "loss": 0.4476, + "step": 9401 + }, + { + "epoch": 1.61, + "grad_norm": 10.545846939086914, + "learning_rate": 5.797151192723529e-06, + "loss": 0.5148, + "step": 9402 + }, + { + "epoch": 1.61, + "grad_norm": 9.235424041748047, + "learning_rate": 5.794576969280934e-06, + "loss": 0.4847, + "step": 9403 + }, + { + "epoch": 1.61, + "grad_norm": 8.171060562133789, + "learning_rate": 5.792002745838339e-06, + "loss": 0.4245, + "step": 9404 + }, + { + "epoch": 1.61, + "grad_norm": 12.361068725585938, + "learning_rate": 5.789428522395744e-06, + "loss": 0.401, + "step": 9405 + }, + { + "epoch": 1.61, + "grad_norm": 11.301602363586426, + "learning_rate": 5.786854298953149e-06, + "loss": 0.4688, + "step": 9406 + }, + { + "epoch": 1.61, + "grad_norm": 8.1406831741333, + "learning_rate": 5.784280075510555e-06, + "loss": 0.2977, + "step": 9407 + }, + { + "epoch": 1.61, + "grad_norm": 7.393590450286865, + "learning_rate": 5.78170585206796e-06, + "loss": 0.2248, + "step": 9408 + }, + { + "epoch": 1.61, + "grad_norm": 10.281673431396484, + "learning_rate": 5.7791316286253655e-06, + "loss": 0.3988, + "step": 9409 + }, + { + "epoch": 1.61, + "grad_norm": 7.977659702301025, + "learning_rate": 5.77655740518277e-06, + "loss": 0.3915, + "step": 9410 + }, + { + "epoch": 1.62, + "grad_norm": 14.32376480102539, + "learning_rate": 5.7739831817401755e-06, + "loss": 0.3113, + "step": 9411 + }, + { + "epoch": 1.62, + "grad_norm": 11.357552528381348, + "learning_rate": 5.7714089582975805e-06, + "loss": 0.441, + "step": 9412 + }, + { + "epoch": 1.62, + "grad_norm": 9.72784423828125, + "learning_rate": 5.7688347348549855e-06, + "loss": 0.4453, + "step": 9413 + }, + { + "epoch": 1.62, + "grad_norm": 17.66362190246582, + "learning_rate": 5.766260511412391e-06, + "loss": 0.74, + "step": 9414 + }, + { + "epoch": 1.62, + "grad_norm": 11.012140274047852, + "learning_rate": 5.7636862879697954e-06, + "loss": 0.5343, + "step": 9415 + }, + { + "epoch": 1.62, + "grad_norm": 9.609058380126953, + "learning_rate": 5.761112064527201e-06, + "loss": 0.28, + "step": 9416 + }, + { + "epoch": 1.62, + "grad_norm": 11.282861709594727, + "learning_rate": 5.758537841084606e-06, + "loss": 0.3836, + "step": 9417 + }, + { + "epoch": 1.62, + "grad_norm": 10.726537704467773, + "learning_rate": 5.755963617642011e-06, + "loss": 0.4459, + "step": 9418 + }, + { + "epoch": 1.62, + "grad_norm": 10.606436729431152, + "learning_rate": 5.753389394199417e-06, + "loss": 0.3538, + "step": 9419 + }, + { + "epoch": 1.62, + "grad_norm": 14.751702308654785, + "learning_rate": 5.750815170756821e-06, + "loss": 0.4457, + "step": 9420 + }, + { + "epoch": 1.62, + "grad_norm": 12.104438781738281, + "learning_rate": 5.748240947314227e-06, + "loss": 0.6976, + "step": 9421 + }, + { + "epoch": 1.62, + "grad_norm": 16.153799057006836, + "learning_rate": 5.745666723871632e-06, + "loss": 0.5936, + "step": 9422 + }, + { + "epoch": 1.62, + "grad_norm": 10.309557914733887, + "learning_rate": 5.743092500429038e-06, + "loss": 0.3608, + "step": 9423 + }, + { + "epoch": 1.62, + "grad_norm": 10.956354141235352, + "learning_rate": 5.740518276986443e-06, + "loss": 0.4579, + "step": 9424 + }, + { + "epoch": 1.62, + "grad_norm": 12.102630615234375, + "learning_rate": 5.737944053543847e-06, + "loss": 0.5007, + "step": 9425 + }, + { + "epoch": 1.62, + "grad_norm": 11.239076614379883, + "learning_rate": 5.735369830101253e-06, + "loss": 0.5125, + "step": 9426 + }, + { + "epoch": 1.62, + "grad_norm": 7.7650980949401855, + "learning_rate": 5.732795606658658e-06, + "loss": 0.36, + "step": 9427 + }, + { + "epoch": 1.62, + "grad_norm": 9.341635704040527, + "learning_rate": 5.730221383216064e-06, + "loss": 0.2929, + "step": 9428 + }, + { + "epoch": 1.62, + "grad_norm": 13.563714027404785, + "learning_rate": 5.727647159773469e-06, + "loss": 0.3478, + "step": 9429 + }, + { + "epoch": 1.62, + "grad_norm": 11.534550666809082, + "learning_rate": 5.725072936330874e-06, + "loss": 0.417, + "step": 9430 + }, + { + "epoch": 1.62, + "grad_norm": 9.148838996887207, + "learning_rate": 5.722498712888279e-06, + "loss": 0.3054, + "step": 9431 + }, + { + "epoch": 1.62, + "grad_norm": 9.314359664916992, + "learning_rate": 5.719924489445684e-06, + "loss": 0.3541, + "step": 9432 + }, + { + "epoch": 1.62, + "grad_norm": 7.9740309715271, + "learning_rate": 5.7173502660030894e-06, + "loss": 0.2978, + "step": 9433 + }, + { + "epoch": 1.62, + "grad_norm": 13.401704788208008, + "learning_rate": 5.7147760425604944e-06, + "loss": 0.4636, + "step": 9434 + }, + { + "epoch": 1.62, + "grad_norm": 14.361881256103516, + "learning_rate": 5.7122018191179e-06, + "loss": 0.5099, + "step": 9435 + }, + { + "epoch": 1.62, + "grad_norm": 7.762758731842041, + "learning_rate": 5.709627595675304e-06, + "loss": 0.2905, + "step": 9436 + }, + { + "epoch": 1.62, + "grad_norm": 9.460415840148926, + "learning_rate": 5.70705337223271e-06, + "loss": 0.3647, + "step": 9437 + }, + { + "epoch": 1.62, + "grad_norm": 9.269363403320312, + "learning_rate": 5.704479148790115e-06, + "loss": 0.5, + "step": 9438 + }, + { + "epoch": 1.62, + "grad_norm": 8.00335693359375, + "learning_rate": 5.70190492534752e-06, + "loss": 0.3988, + "step": 9439 + }, + { + "epoch": 1.62, + "grad_norm": 11.040388107299805, + "learning_rate": 5.699330701904926e-06, + "loss": 0.3993, + "step": 9440 + }, + { + "epoch": 1.62, + "grad_norm": 7.470085144042969, + "learning_rate": 5.69675647846233e-06, + "loss": 0.2645, + "step": 9441 + }, + { + "epoch": 1.62, + "grad_norm": 10.192621231079102, + "learning_rate": 5.694182255019736e-06, + "loss": 0.5402, + "step": 9442 + }, + { + "epoch": 1.62, + "grad_norm": 9.485733985900879, + "learning_rate": 5.691608031577141e-06, + "loss": 0.3496, + "step": 9443 + }, + { + "epoch": 1.62, + "grad_norm": 9.314024925231934, + "learning_rate": 5.689033808134546e-06, + "loss": 0.4641, + "step": 9444 + }, + { + "epoch": 1.62, + "grad_norm": 10.139507293701172, + "learning_rate": 5.686459584691952e-06, + "loss": 0.4792, + "step": 9445 + }, + { + "epoch": 1.62, + "grad_norm": 9.422876358032227, + "learning_rate": 5.683885361249356e-06, + "loss": 0.3295, + "step": 9446 + }, + { + "epoch": 1.62, + "grad_norm": 15.380331039428711, + "learning_rate": 5.681311137806762e-06, + "loss": 0.4889, + "step": 9447 + }, + { + "epoch": 1.62, + "grad_norm": 10.798163414001465, + "learning_rate": 5.678736914364167e-06, + "loss": 0.4406, + "step": 9448 + }, + { + "epoch": 1.62, + "grad_norm": 10.419535636901855, + "learning_rate": 5.676162690921573e-06, + "loss": 0.3113, + "step": 9449 + }, + { + "epoch": 1.62, + "grad_norm": 11.418760299682617, + "learning_rate": 5.673588467478978e-06, + "loss": 0.4324, + "step": 9450 + }, + { + "epoch": 1.62, + "grad_norm": 8.205583572387695, + "learning_rate": 5.671014244036382e-06, + "loss": 0.3619, + "step": 9451 + }, + { + "epoch": 1.62, + "grad_norm": 7.42657995223999, + "learning_rate": 5.668440020593788e-06, + "loss": 0.3224, + "step": 9452 + }, + { + "epoch": 1.62, + "grad_norm": 11.461694717407227, + "learning_rate": 5.665865797151193e-06, + "loss": 0.4596, + "step": 9453 + }, + { + "epoch": 1.62, + "grad_norm": 7.489125728607178, + "learning_rate": 5.663291573708598e-06, + "loss": 0.2557, + "step": 9454 + }, + { + "epoch": 1.62, + "grad_norm": 7.991674423217773, + "learning_rate": 5.660717350266003e-06, + "loss": 0.3223, + "step": 9455 + }, + { + "epoch": 1.62, + "grad_norm": 8.101436614990234, + "learning_rate": 5.658143126823408e-06, + "loss": 0.2953, + "step": 9456 + }, + { + "epoch": 1.62, + "grad_norm": 9.007814407348633, + "learning_rate": 5.655568903380813e-06, + "loss": 0.3529, + "step": 9457 + }, + { + "epoch": 1.62, + "grad_norm": 11.626026153564453, + "learning_rate": 5.652994679938218e-06, + "loss": 0.4621, + "step": 9458 + }, + { + "epoch": 1.62, + "grad_norm": 9.057463645935059, + "learning_rate": 5.650420456495624e-06, + "loss": 0.326, + "step": 9459 + }, + { + "epoch": 1.62, + "grad_norm": 9.18494987487793, + "learning_rate": 5.647846233053029e-06, + "loss": 0.4238, + "step": 9460 + }, + { + "epoch": 1.62, + "grad_norm": 9.687639236450195, + "learning_rate": 5.645272009610435e-06, + "loss": 0.4175, + "step": 9461 + }, + { + "epoch": 1.62, + "grad_norm": 12.35632610321045, + "learning_rate": 5.642697786167839e-06, + "loss": 0.3192, + "step": 9462 + }, + { + "epoch": 1.62, + "grad_norm": 12.568696975708008, + "learning_rate": 5.640123562725245e-06, + "loss": 0.7296, + "step": 9463 + }, + { + "epoch": 1.62, + "grad_norm": 9.392620086669922, + "learning_rate": 5.63754933928265e-06, + "loss": 0.2002, + "step": 9464 + }, + { + "epoch": 1.62, + "grad_norm": 24.698711395263672, + "learning_rate": 5.634975115840055e-06, + "loss": 0.4289, + "step": 9465 + }, + { + "epoch": 1.62, + "grad_norm": 10.210901260375977, + "learning_rate": 5.632400892397461e-06, + "loss": 0.3849, + "step": 9466 + }, + { + "epoch": 1.62, + "grad_norm": 12.081639289855957, + "learning_rate": 5.629826668954865e-06, + "loss": 0.4971, + "step": 9467 + }, + { + "epoch": 1.62, + "grad_norm": 10.283191680908203, + "learning_rate": 5.627252445512271e-06, + "loss": 0.4543, + "step": 9468 + }, + { + "epoch": 1.63, + "grad_norm": 9.628883361816406, + "learning_rate": 5.624678222069676e-06, + "loss": 0.3403, + "step": 9469 + }, + { + "epoch": 1.63, + "grad_norm": 13.887421607971191, + "learning_rate": 5.622103998627081e-06, + "loss": 0.4559, + "step": 9470 + }, + { + "epoch": 1.63, + "grad_norm": 11.605327606201172, + "learning_rate": 5.619529775184487e-06, + "loss": 0.4851, + "step": 9471 + }, + { + "epoch": 1.63, + "grad_norm": 6.49120569229126, + "learning_rate": 5.616955551741891e-06, + "loss": 0.2355, + "step": 9472 + }, + { + "epoch": 1.63, + "grad_norm": 12.835000991821289, + "learning_rate": 5.6143813282992966e-06, + "loss": 0.5566, + "step": 9473 + }, + { + "epoch": 1.63, + "grad_norm": 5.666558265686035, + "learning_rate": 5.6118071048567015e-06, + "loss": 0.2818, + "step": 9474 + }, + { + "epoch": 1.63, + "grad_norm": 13.961830139160156, + "learning_rate": 5.609232881414107e-06, + "loss": 0.4205, + "step": 9475 + }, + { + "epoch": 1.63, + "grad_norm": 13.174569129943848, + "learning_rate": 5.606658657971512e-06, + "loss": 0.5047, + "step": 9476 + }, + { + "epoch": 1.63, + "grad_norm": 9.219046592712402, + "learning_rate": 5.6040844345289165e-06, + "loss": 0.355, + "step": 9477 + }, + { + "epoch": 1.63, + "grad_norm": 10.74807357788086, + "learning_rate": 5.601510211086322e-06, + "loss": 0.412, + "step": 9478 + }, + { + "epoch": 1.63, + "grad_norm": 9.126022338867188, + "learning_rate": 5.598935987643727e-06, + "loss": 0.3001, + "step": 9479 + }, + { + "epoch": 1.63, + "grad_norm": 11.293107032775879, + "learning_rate": 5.596361764201133e-06, + "loss": 0.5309, + "step": 9480 + }, + { + "epoch": 1.63, + "grad_norm": 12.276771545410156, + "learning_rate": 5.593787540758538e-06, + "loss": 0.3959, + "step": 9481 + }, + { + "epoch": 1.63, + "grad_norm": 8.59830379486084, + "learning_rate": 5.591213317315943e-06, + "loss": 0.3412, + "step": 9482 + }, + { + "epoch": 1.63, + "grad_norm": 7.927278995513916, + "learning_rate": 5.588639093873348e-06, + "loss": 0.3523, + "step": 9483 + }, + { + "epoch": 1.63, + "grad_norm": 9.802326202392578, + "learning_rate": 5.586064870430753e-06, + "loss": 0.2781, + "step": 9484 + }, + { + "epoch": 1.63, + "grad_norm": 10.646251678466797, + "learning_rate": 5.583490646988159e-06, + "loss": 0.3949, + "step": 9485 + }, + { + "epoch": 1.63, + "grad_norm": 12.38243579864502, + "learning_rate": 5.580916423545564e-06, + "loss": 0.5674, + "step": 9486 + }, + { + "epoch": 1.63, + "grad_norm": 10.333597183227539, + "learning_rate": 5.57834220010297e-06, + "loss": 0.403, + "step": 9487 + }, + { + "epoch": 1.63, + "grad_norm": 14.630289077758789, + "learning_rate": 5.575767976660374e-06, + "loss": 0.5945, + "step": 9488 + }, + { + "epoch": 1.63, + "grad_norm": 11.838348388671875, + "learning_rate": 5.57319375321778e-06, + "loss": 0.3882, + "step": 9489 + }, + { + "epoch": 1.63, + "grad_norm": 8.97502613067627, + "learning_rate": 5.570619529775185e-06, + "loss": 0.4292, + "step": 9490 + }, + { + "epoch": 1.63, + "grad_norm": 13.213041305541992, + "learning_rate": 5.56804530633259e-06, + "loss": 0.6812, + "step": 9491 + }, + { + "epoch": 1.63, + "grad_norm": 10.27556324005127, + "learning_rate": 5.5654710828899956e-06, + "loss": 0.3498, + "step": 9492 + }, + { + "epoch": 1.63, + "grad_norm": 9.039551734924316, + "learning_rate": 5.5628968594474e-06, + "loss": 0.4898, + "step": 9493 + }, + { + "epoch": 1.63, + "grad_norm": 11.654840469360352, + "learning_rate": 5.5603226360048055e-06, + "loss": 0.4408, + "step": 9494 + }, + { + "epoch": 1.63, + "grad_norm": 12.076395034790039, + "learning_rate": 5.5577484125622105e-06, + "loss": 0.5059, + "step": 9495 + }, + { + "epoch": 1.63, + "grad_norm": 11.76390552520752, + "learning_rate": 5.5551741891196155e-06, + "loss": 0.4985, + "step": 9496 + }, + { + "epoch": 1.63, + "grad_norm": 11.457161903381348, + "learning_rate": 5.552599965677021e-06, + "loss": 0.4021, + "step": 9497 + }, + { + "epoch": 1.63, + "grad_norm": 10.285161972045898, + "learning_rate": 5.5500257422344255e-06, + "loss": 0.2897, + "step": 9498 + }, + { + "epoch": 1.63, + "grad_norm": 13.88736343383789, + "learning_rate": 5.547451518791831e-06, + "loss": 0.5343, + "step": 9499 + }, + { + "epoch": 1.63, + "grad_norm": 11.263001441955566, + "learning_rate": 5.544877295349236e-06, + "loss": 0.4679, + "step": 9500 + }, + { + "epoch": 1.63, + "grad_norm": 11.066070556640625, + "learning_rate": 5.542303071906642e-06, + "loss": 0.4338, + "step": 9501 + }, + { + "epoch": 1.63, + "grad_norm": 11.497499465942383, + "learning_rate": 5.539728848464047e-06, + "loss": 0.4305, + "step": 9502 + }, + { + "epoch": 1.63, + "grad_norm": 9.847160339355469, + "learning_rate": 5.537154625021451e-06, + "loss": 0.3502, + "step": 9503 + }, + { + "epoch": 1.63, + "grad_norm": 7.71319055557251, + "learning_rate": 5.534580401578857e-06, + "loss": 0.3201, + "step": 9504 + }, + { + "epoch": 1.63, + "grad_norm": 10.295954704284668, + "learning_rate": 5.532006178136262e-06, + "loss": 0.3941, + "step": 9505 + }, + { + "epoch": 1.63, + "grad_norm": 12.266483306884766, + "learning_rate": 5.529431954693668e-06, + "loss": 0.3982, + "step": 9506 + }, + { + "epoch": 1.63, + "grad_norm": 11.09190845489502, + "learning_rate": 5.526857731251073e-06, + "loss": 0.4783, + "step": 9507 + }, + { + "epoch": 1.63, + "grad_norm": 12.584628105163574, + "learning_rate": 5.524283507808478e-06, + "loss": 0.4744, + "step": 9508 + }, + { + "epoch": 1.63, + "grad_norm": 14.742584228515625, + "learning_rate": 5.521709284365883e-06, + "loss": 0.514, + "step": 9509 + }, + { + "epoch": 1.63, + "grad_norm": 9.745132446289062, + "learning_rate": 5.519135060923288e-06, + "loss": 0.2851, + "step": 9510 + }, + { + "epoch": 1.63, + "grad_norm": 10.359766960144043, + "learning_rate": 5.516560837480694e-06, + "loss": 0.4297, + "step": 9511 + }, + { + "epoch": 1.63, + "grad_norm": 15.780399322509766, + "learning_rate": 5.513986614038099e-06, + "loss": 0.6842, + "step": 9512 + }, + { + "epoch": 1.63, + "grad_norm": 9.671772003173828, + "learning_rate": 5.5114123905955045e-06, + "loss": 0.3953, + "step": 9513 + }, + { + "epoch": 1.63, + "grad_norm": 12.546340942382812, + "learning_rate": 5.508838167152909e-06, + "loss": 0.483, + "step": 9514 + }, + { + "epoch": 1.63, + "grad_norm": 11.07878589630127, + "learning_rate": 5.5062639437103145e-06, + "loss": 0.4839, + "step": 9515 + }, + { + "epoch": 1.63, + "grad_norm": 9.804040908813477, + "learning_rate": 5.5036897202677195e-06, + "loss": 0.3769, + "step": 9516 + }, + { + "epoch": 1.63, + "grad_norm": 7.916842937469482, + "learning_rate": 5.5011154968251245e-06, + "loss": 0.3174, + "step": 9517 + }, + { + "epoch": 1.63, + "grad_norm": 6.750128746032715, + "learning_rate": 5.49854127338253e-06, + "loss": 0.3253, + "step": 9518 + }, + { + "epoch": 1.63, + "grad_norm": 8.293014526367188, + "learning_rate": 5.4959670499399345e-06, + "loss": 0.2398, + "step": 9519 + }, + { + "epoch": 1.63, + "grad_norm": 10.516221046447754, + "learning_rate": 5.49339282649734e-06, + "loss": 0.2682, + "step": 9520 + }, + { + "epoch": 1.63, + "grad_norm": 13.01073169708252, + "learning_rate": 5.490818603054745e-06, + "loss": 0.4094, + "step": 9521 + }, + { + "epoch": 1.63, + "grad_norm": 8.900822639465332, + "learning_rate": 5.48824437961215e-06, + "loss": 0.3751, + "step": 9522 + }, + { + "epoch": 1.63, + "grad_norm": 10.678243637084961, + "learning_rate": 5.485670156169556e-06, + "loss": 0.393, + "step": 9523 + }, + { + "epoch": 1.63, + "grad_norm": 15.451507568359375, + "learning_rate": 5.48309593272696e-06, + "loss": 0.5546, + "step": 9524 + }, + { + "epoch": 1.63, + "grad_norm": 8.416790962219238, + "learning_rate": 5.480521709284366e-06, + "loss": 0.3119, + "step": 9525 + }, + { + "epoch": 1.63, + "grad_norm": 12.565122604370117, + "learning_rate": 5.477947485841771e-06, + "loss": 0.4713, + "step": 9526 + }, + { + "epoch": 1.63, + "grad_norm": 12.840173721313477, + "learning_rate": 5.475373262399177e-06, + "loss": 0.4917, + "step": 9527 + }, + { + "epoch": 1.64, + "grad_norm": 10.61516284942627, + "learning_rate": 5.472799038956582e-06, + "loss": 0.5948, + "step": 9528 + }, + { + "epoch": 1.64, + "grad_norm": 10.036795616149902, + "learning_rate": 5.470224815513986e-06, + "loss": 0.4526, + "step": 9529 + }, + { + "epoch": 1.64, + "grad_norm": 9.842708587646484, + "learning_rate": 5.467650592071392e-06, + "loss": 0.4411, + "step": 9530 + }, + { + "epoch": 1.64, + "grad_norm": 10.228228569030762, + "learning_rate": 5.465076368628797e-06, + "loss": 0.3512, + "step": 9531 + }, + { + "epoch": 1.64, + "grad_norm": 13.198923110961914, + "learning_rate": 5.462502145186203e-06, + "loss": 0.4442, + "step": 9532 + }, + { + "epoch": 1.64, + "grad_norm": 8.91535472869873, + "learning_rate": 5.459927921743608e-06, + "loss": 0.3202, + "step": 9533 + }, + { + "epoch": 1.64, + "grad_norm": 9.445051193237305, + "learning_rate": 5.457353698301013e-06, + "loss": 0.3955, + "step": 9534 + }, + { + "epoch": 1.64, + "grad_norm": 10.360581398010254, + "learning_rate": 5.454779474858418e-06, + "loss": 0.3812, + "step": 9535 + }, + { + "epoch": 1.64, + "grad_norm": 10.984142303466797, + "learning_rate": 5.452205251415823e-06, + "loss": 0.4418, + "step": 9536 + }, + { + "epoch": 1.64, + "grad_norm": 12.357352256774902, + "learning_rate": 5.4496310279732285e-06, + "loss": 0.4037, + "step": 9537 + }, + { + "epoch": 1.64, + "grad_norm": 12.403053283691406, + "learning_rate": 5.4470568045306334e-06, + "loss": 0.476, + "step": 9538 + }, + { + "epoch": 1.64, + "grad_norm": 11.646989822387695, + "learning_rate": 5.444482581088039e-06, + "loss": 0.3284, + "step": 9539 + }, + { + "epoch": 1.64, + "grad_norm": 12.550874710083008, + "learning_rate": 5.441908357645443e-06, + "loss": 0.6351, + "step": 9540 + }, + { + "epoch": 1.64, + "grad_norm": 13.214956283569336, + "learning_rate": 5.439334134202848e-06, + "loss": 0.3966, + "step": 9541 + }, + { + "epoch": 1.64, + "grad_norm": 9.978095054626465, + "learning_rate": 5.436759910760254e-06, + "loss": 0.3507, + "step": 9542 + }, + { + "epoch": 1.64, + "grad_norm": 9.20533275604248, + "learning_rate": 5.434185687317659e-06, + "loss": 0.2897, + "step": 9543 + }, + { + "epoch": 1.64, + "grad_norm": 9.751108169555664, + "learning_rate": 5.431611463875065e-06, + "loss": 0.4487, + "step": 9544 + }, + { + "epoch": 1.64, + "grad_norm": 8.129634857177734, + "learning_rate": 5.429037240432469e-06, + "loss": 0.3483, + "step": 9545 + }, + { + "epoch": 1.64, + "grad_norm": 9.402081489562988, + "learning_rate": 5.426463016989875e-06, + "loss": 0.3781, + "step": 9546 + }, + { + "epoch": 1.64, + "grad_norm": 14.624606132507324, + "learning_rate": 5.42388879354728e-06, + "loss": 0.5113, + "step": 9547 + }, + { + "epoch": 1.64, + "grad_norm": 14.68389892578125, + "learning_rate": 5.421314570104685e-06, + "loss": 0.633, + "step": 9548 + }, + { + "epoch": 1.64, + "grad_norm": 14.133748054504395, + "learning_rate": 5.418740346662091e-06, + "loss": 0.5521, + "step": 9549 + }, + { + "epoch": 1.64, + "grad_norm": 9.241744995117188, + "learning_rate": 5.416166123219495e-06, + "loss": 0.3083, + "step": 9550 + }, + { + "epoch": 1.64, + "grad_norm": 13.651822090148926, + "learning_rate": 5.413591899776901e-06, + "loss": 0.5847, + "step": 9551 + }, + { + "epoch": 1.64, + "grad_norm": 12.413772583007812, + "learning_rate": 5.411017676334306e-06, + "loss": 0.4548, + "step": 9552 + }, + { + "epoch": 1.64, + "grad_norm": 11.035812377929688, + "learning_rate": 5.408443452891712e-06, + "loss": 0.444, + "step": 9553 + }, + { + "epoch": 1.64, + "grad_norm": 11.18057632446289, + "learning_rate": 5.405869229449117e-06, + "loss": 0.5344, + "step": 9554 + }, + { + "epoch": 1.64, + "grad_norm": 9.828645706176758, + "learning_rate": 5.403295006006521e-06, + "loss": 0.3964, + "step": 9555 + }, + { + "epoch": 1.64, + "grad_norm": 9.996955871582031, + "learning_rate": 5.400720782563927e-06, + "loss": 0.3469, + "step": 9556 + }, + { + "epoch": 1.64, + "grad_norm": 9.058783531188965, + "learning_rate": 5.398146559121332e-06, + "loss": 0.3878, + "step": 9557 + }, + { + "epoch": 1.64, + "grad_norm": 9.217575073242188, + "learning_rate": 5.3955723356787374e-06, + "loss": 0.4707, + "step": 9558 + }, + { + "epoch": 1.64, + "grad_norm": 7.300097465515137, + "learning_rate": 5.392998112236142e-06, + "loss": 0.2818, + "step": 9559 + }, + { + "epoch": 1.64, + "grad_norm": 9.75239372253418, + "learning_rate": 5.390423888793547e-06, + "loss": 0.3886, + "step": 9560 + }, + { + "epoch": 1.64, + "grad_norm": 11.548704147338867, + "learning_rate": 5.387849665350952e-06, + "loss": 0.2526, + "step": 9561 + }, + { + "epoch": 1.64, + "grad_norm": 9.713324546813965, + "learning_rate": 5.385275441908357e-06, + "loss": 0.2906, + "step": 9562 + }, + { + "epoch": 1.64, + "grad_norm": 8.408305168151855, + "learning_rate": 5.382701218465763e-06, + "loss": 0.3239, + "step": 9563 + }, + { + "epoch": 1.64, + "grad_norm": 12.87882137298584, + "learning_rate": 5.380126995023168e-06, + "loss": 0.5022, + "step": 9564 + }, + { + "epoch": 1.64, + "grad_norm": 8.475255966186523, + "learning_rate": 5.377552771580574e-06, + "loss": 0.3187, + "step": 9565 + }, + { + "epoch": 1.64, + "grad_norm": 10.214444160461426, + "learning_rate": 5.374978548137978e-06, + "loss": 0.4406, + "step": 9566 + }, + { + "epoch": 1.64, + "grad_norm": 12.222140312194824, + "learning_rate": 5.372404324695383e-06, + "loss": 0.4956, + "step": 9567 + }, + { + "epoch": 1.64, + "grad_norm": 10.52293872833252, + "learning_rate": 5.369830101252789e-06, + "loss": 0.4043, + "step": 9568 + }, + { + "epoch": 1.64, + "grad_norm": 9.071854591369629, + "learning_rate": 5.367255877810194e-06, + "loss": 0.4452, + "step": 9569 + }, + { + "epoch": 1.64, + "grad_norm": 7.6990203857421875, + "learning_rate": 5.3646816543676e-06, + "loss": 0.2953, + "step": 9570 + }, + { + "epoch": 1.64, + "grad_norm": 12.847419738769531, + "learning_rate": 5.362107430925004e-06, + "loss": 0.4816, + "step": 9571 + }, + { + "epoch": 1.64, + "grad_norm": 9.557892799377441, + "learning_rate": 5.35953320748241e-06, + "loss": 0.2918, + "step": 9572 + }, + { + "epoch": 1.64, + "grad_norm": 8.088288307189941, + "learning_rate": 5.356958984039815e-06, + "loss": 0.5144, + "step": 9573 + }, + { + "epoch": 1.64, + "grad_norm": 12.070624351501465, + "learning_rate": 5.35438476059722e-06, + "loss": 0.4365, + "step": 9574 + }, + { + "epoch": 1.64, + "grad_norm": 11.944831848144531, + "learning_rate": 5.351810537154626e-06, + "loss": 0.342, + "step": 9575 + }, + { + "epoch": 1.64, + "grad_norm": 10.573887825012207, + "learning_rate": 5.34923631371203e-06, + "loss": 0.3637, + "step": 9576 + }, + { + "epoch": 1.64, + "grad_norm": 10.548113822937012, + "learning_rate": 5.3466620902694356e-06, + "loss": 0.5225, + "step": 9577 + }, + { + "epoch": 1.64, + "grad_norm": 13.11074447631836, + "learning_rate": 5.3440878668268406e-06, + "loss": 0.2917, + "step": 9578 + }, + { + "epoch": 1.64, + "grad_norm": 10.624791145324707, + "learning_rate": 5.341513643384246e-06, + "loss": 0.4344, + "step": 9579 + }, + { + "epoch": 1.64, + "grad_norm": 11.08116340637207, + "learning_rate": 5.338939419941651e-06, + "loss": 0.3729, + "step": 9580 + }, + { + "epoch": 1.64, + "grad_norm": 8.875580787658691, + "learning_rate": 5.3363651964990555e-06, + "loss": 0.3392, + "step": 9581 + }, + { + "epoch": 1.64, + "grad_norm": 9.49691104888916, + "learning_rate": 5.333790973056461e-06, + "loss": 0.3364, + "step": 9582 + }, + { + "epoch": 1.64, + "grad_norm": 11.008417129516602, + "learning_rate": 5.331216749613866e-06, + "loss": 0.3941, + "step": 9583 + }, + { + "epoch": 1.64, + "grad_norm": 12.848654747009277, + "learning_rate": 5.328642526171272e-06, + "loss": 0.4664, + "step": 9584 + }, + { + "epoch": 1.64, + "grad_norm": 9.821887016296387, + "learning_rate": 5.326068302728677e-06, + "loss": 0.3384, + "step": 9585 + }, + { + "epoch": 1.65, + "grad_norm": 11.210831642150879, + "learning_rate": 5.323494079286082e-06, + "loss": 0.5148, + "step": 9586 + }, + { + "epoch": 1.65, + "grad_norm": 11.727355003356934, + "learning_rate": 5.320919855843487e-06, + "loss": 0.3022, + "step": 9587 + }, + { + "epoch": 1.65, + "grad_norm": 13.036104202270508, + "learning_rate": 5.318345632400892e-06, + "loss": 0.4292, + "step": 9588 + }, + { + "epoch": 1.65, + "grad_norm": 12.513479232788086, + "learning_rate": 5.315771408958298e-06, + "loss": 0.3374, + "step": 9589 + }, + { + "epoch": 1.65, + "grad_norm": 11.694671630859375, + "learning_rate": 5.313197185515703e-06, + "loss": 0.5588, + "step": 9590 + }, + { + "epoch": 1.65, + "grad_norm": 9.200736999511719, + "learning_rate": 5.310622962073109e-06, + "loss": 0.4175, + "step": 9591 + }, + { + "epoch": 1.65, + "grad_norm": 12.50564193725586, + "learning_rate": 5.308048738630513e-06, + "loss": 0.3543, + "step": 9592 + }, + { + "epoch": 1.65, + "grad_norm": 15.963687896728516, + "learning_rate": 5.305474515187918e-06, + "loss": 0.4715, + "step": 9593 + }, + { + "epoch": 1.65, + "grad_norm": 11.498435974121094, + "learning_rate": 5.302900291745324e-06, + "loss": 0.3656, + "step": 9594 + }, + { + "epoch": 1.65, + "grad_norm": 10.893860816955566, + "learning_rate": 5.300326068302729e-06, + "loss": 0.4367, + "step": 9595 + }, + { + "epoch": 1.65, + "grad_norm": 13.980158805847168, + "learning_rate": 5.2977518448601346e-06, + "loss": 0.6962, + "step": 9596 + }, + { + "epoch": 1.65, + "grad_norm": 12.001141548156738, + "learning_rate": 5.295177621417539e-06, + "loss": 0.3928, + "step": 9597 + }, + { + "epoch": 1.65, + "grad_norm": 12.483039855957031, + "learning_rate": 5.2926033979749445e-06, + "loss": 0.2953, + "step": 9598 + }, + { + "epoch": 1.65, + "grad_norm": 10.969145774841309, + "learning_rate": 5.2900291745323495e-06, + "loss": 0.3239, + "step": 9599 + }, + { + "epoch": 1.65, + "grad_norm": 10.261040687561035, + "learning_rate": 5.2874549510897545e-06, + "loss": 0.3371, + "step": 9600 + }, + { + "epoch": 1.65, + "grad_norm": 11.073018074035645, + "learning_rate": 5.28488072764716e-06, + "loss": 0.3655, + "step": 9601 + }, + { + "epoch": 1.65, + "grad_norm": 11.37199878692627, + "learning_rate": 5.2823065042045645e-06, + "loss": 0.4765, + "step": 9602 + }, + { + "epoch": 1.65, + "grad_norm": 10.612890243530273, + "learning_rate": 5.27973228076197e-06, + "loss": 0.4252, + "step": 9603 + }, + { + "epoch": 1.65, + "grad_norm": 8.203518867492676, + "learning_rate": 5.277158057319375e-06, + "loss": 0.2959, + "step": 9604 + }, + { + "epoch": 1.65, + "grad_norm": 8.584474563598633, + "learning_rate": 5.274583833876781e-06, + "loss": 0.2578, + "step": 9605 + }, + { + "epoch": 1.65, + "grad_norm": 10.519556045532227, + "learning_rate": 5.272009610434186e-06, + "loss": 0.3959, + "step": 9606 + }, + { + "epoch": 1.65, + "grad_norm": 11.884371757507324, + "learning_rate": 5.26943538699159e-06, + "loss": 0.4247, + "step": 9607 + }, + { + "epoch": 1.65, + "grad_norm": 11.644059181213379, + "learning_rate": 5.266861163548996e-06, + "loss": 0.5127, + "step": 9608 + }, + { + "epoch": 1.65, + "grad_norm": 11.072440147399902, + "learning_rate": 5.264286940106401e-06, + "loss": 0.4754, + "step": 9609 + }, + { + "epoch": 1.65, + "grad_norm": 12.92575740814209, + "learning_rate": 5.261712716663807e-06, + "loss": 0.4107, + "step": 9610 + }, + { + "epoch": 1.65, + "grad_norm": 10.279706001281738, + "learning_rate": 5.259138493221212e-06, + "loss": 0.3781, + "step": 9611 + }, + { + "epoch": 1.65, + "grad_norm": 14.406563758850098, + "learning_rate": 5.256564269778617e-06, + "loss": 0.5921, + "step": 9612 + }, + { + "epoch": 1.65, + "grad_norm": 9.601865768432617, + "learning_rate": 5.253990046336022e-06, + "loss": 0.334, + "step": 9613 + }, + { + "epoch": 1.65, + "grad_norm": 9.434653282165527, + "learning_rate": 5.251415822893427e-06, + "loss": 0.365, + "step": 9614 + }, + { + "epoch": 1.65, + "grad_norm": 7.051100254058838, + "learning_rate": 5.248841599450833e-06, + "loss": 0.2711, + "step": 9615 + }, + { + "epoch": 1.65, + "grad_norm": 10.551592826843262, + "learning_rate": 5.246267376008238e-06, + "loss": 0.4609, + "step": 9616 + }, + { + "epoch": 1.65, + "grad_norm": 17.5572452545166, + "learning_rate": 5.2436931525656435e-06, + "loss": 0.4668, + "step": 9617 + }, + { + "epoch": 1.65, + "grad_norm": 9.892151832580566, + "learning_rate": 5.241118929123048e-06, + "loss": 0.3163, + "step": 9618 + }, + { + "epoch": 1.65, + "grad_norm": 10.698821067810059, + "learning_rate": 5.238544705680453e-06, + "loss": 0.5468, + "step": 9619 + }, + { + "epoch": 1.65, + "grad_norm": 8.577296257019043, + "learning_rate": 5.2359704822378585e-06, + "loss": 0.3295, + "step": 9620 + }, + { + "epoch": 1.65, + "grad_norm": 15.120394706726074, + "learning_rate": 5.2333962587952635e-06, + "loss": 0.5822, + "step": 9621 + }, + { + "epoch": 1.65, + "grad_norm": 10.945082664489746, + "learning_rate": 5.230822035352669e-06, + "loss": 0.3464, + "step": 9622 + }, + { + "epoch": 1.65, + "grad_norm": 9.520858764648438, + "learning_rate": 5.2282478119100735e-06, + "loss": 0.5536, + "step": 9623 + }, + { + "epoch": 1.65, + "grad_norm": 10.958598136901855, + "learning_rate": 5.225673588467479e-06, + "loss": 0.4022, + "step": 9624 + }, + { + "epoch": 1.65, + "grad_norm": 8.269060134887695, + "learning_rate": 5.223099365024884e-06, + "loss": 0.3177, + "step": 9625 + }, + { + "epoch": 1.65, + "grad_norm": 10.411349296569824, + "learning_rate": 5.220525141582289e-06, + "loss": 0.4978, + "step": 9626 + }, + { + "epoch": 1.65, + "grad_norm": 13.53354263305664, + "learning_rate": 5.217950918139695e-06, + "loss": 0.5876, + "step": 9627 + }, + { + "epoch": 1.65, + "grad_norm": 16.472549438476562, + "learning_rate": 5.215376694697099e-06, + "loss": 0.4486, + "step": 9628 + }, + { + "epoch": 1.65, + "grad_norm": 13.45311164855957, + "learning_rate": 5.212802471254505e-06, + "loss": 0.5712, + "step": 9629 + }, + { + "epoch": 1.65, + "grad_norm": 9.941349983215332, + "learning_rate": 5.21022824781191e-06, + "loss": 0.4403, + "step": 9630 + }, + { + "epoch": 1.65, + "grad_norm": 9.188843727111816, + "learning_rate": 5.207654024369316e-06, + "loss": 0.4363, + "step": 9631 + }, + { + "epoch": 1.65, + "grad_norm": 9.22233772277832, + "learning_rate": 5.205079800926721e-06, + "loss": 0.4265, + "step": 9632 + }, + { + "epoch": 1.65, + "grad_norm": 11.215051651000977, + "learning_rate": 5.202505577484125e-06, + "loss": 0.2918, + "step": 9633 + }, + { + "epoch": 1.65, + "grad_norm": 12.18620491027832, + "learning_rate": 5.199931354041531e-06, + "loss": 0.5269, + "step": 9634 + }, + { + "epoch": 1.65, + "grad_norm": 12.463600158691406, + "learning_rate": 5.197357130598936e-06, + "loss": 0.4948, + "step": 9635 + }, + { + "epoch": 1.65, + "grad_norm": 8.704997062683105, + "learning_rate": 5.194782907156342e-06, + "loss": 0.4368, + "step": 9636 + }, + { + "epoch": 1.65, + "grad_norm": 10.63602352142334, + "learning_rate": 5.192208683713747e-06, + "loss": 0.3814, + "step": 9637 + }, + { + "epoch": 1.65, + "grad_norm": 7.676228046417236, + "learning_rate": 5.189634460271152e-06, + "loss": 0.2348, + "step": 9638 + }, + { + "epoch": 1.65, + "grad_norm": 12.812514305114746, + "learning_rate": 5.187060236828557e-06, + "loss": 0.3691, + "step": 9639 + }, + { + "epoch": 1.65, + "grad_norm": 8.405218124389648, + "learning_rate": 5.184486013385962e-06, + "loss": 0.3215, + "step": 9640 + }, + { + "epoch": 1.65, + "grad_norm": 11.067056655883789, + "learning_rate": 5.1819117899433675e-06, + "loss": 0.2929, + "step": 9641 + }, + { + "epoch": 1.65, + "grad_norm": 11.108912467956543, + "learning_rate": 5.1793375665007725e-06, + "loss": 0.6292, + "step": 9642 + }, + { + "epoch": 1.65, + "grad_norm": 13.607660293579102, + "learning_rate": 5.176763343058178e-06, + "loss": 0.5454, + "step": 9643 + }, + { + "epoch": 1.66, + "grad_norm": 10.48611831665039, + "learning_rate": 5.1741891196155824e-06, + "loss": 0.4604, + "step": 9644 + }, + { + "epoch": 1.66, + "grad_norm": 8.380729675292969, + "learning_rate": 5.171614896172987e-06, + "loss": 0.4349, + "step": 9645 + }, + { + "epoch": 1.66, + "grad_norm": 12.142107963562012, + "learning_rate": 5.169040672730393e-06, + "loss": 0.5217, + "step": 9646 + }, + { + "epoch": 1.66, + "grad_norm": 11.150938987731934, + "learning_rate": 5.166466449287798e-06, + "loss": 0.2636, + "step": 9647 + }, + { + "epoch": 1.66, + "grad_norm": 9.137070655822754, + "learning_rate": 5.163892225845204e-06, + "loss": 0.3405, + "step": 9648 + }, + { + "epoch": 1.66, + "grad_norm": 16.729690551757812, + "learning_rate": 5.161318002402608e-06, + "loss": 0.5361, + "step": 9649 + }, + { + "epoch": 1.66, + "grad_norm": 10.038084983825684, + "learning_rate": 5.158743778960014e-06, + "loss": 0.3174, + "step": 9650 + }, + { + "epoch": 1.66, + "grad_norm": 9.0342378616333, + "learning_rate": 5.156169555517419e-06, + "loss": 0.3105, + "step": 9651 + }, + { + "epoch": 1.66, + "grad_norm": 14.702507972717285, + "learning_rate": 5.153595332074824e-06, + "loss": 0.5836, + "step": 9652 + }, + { + "epoch": 1.66, + "grad_norm": 7.81276798248291, + "learning_rate": 5.15102110863223e-06, + "loss": 0.2117, + "step": 9653 + }, + { + "epoch": 1.66, + "grad_norm": 15.668899536132812, + "learning_rate": 5.148446885189634e-06, + "loss": 0.6862, + "step": 9654 + }, + { + "epoch": 1.66, + "grad_norm": 9.84668254852295, + "learning_rate": 5.14587266174704e-06, + "loss": 0.4756, + "step": 9655 + }, + { + "epoch": 1.66, + "grad_norm": 8.694463729858398, + "learning_rate": 5.143298438304445e-06, + "loss": 0.2502, + "step": 9656 + }, + { + "epoch": 1.66, + "grad_norm": 11.033036231994629, + "learning_rate": 5.140724214861851e-06, + "loss": 0.3887, + "step": 9657 + }, + { + "epoch": 1.66, + "grad_norm": 9.068087577819824, + "learning_rate": 5.138149991419256e-06, + "loss": 0.399, + "step": 9658 + }, + { + "epoch": 1.66, + "grad_norm": 10.252431869506836, + "learning_rate": 5.13557576797666e-06, + "loss": 0.4944, + "step": 9659 + }, + { + "epoch": 1.66, + "grad_norm": 9.663421630859375, + "learning_rate": 5.133001544534066e-06, + "loss": 0.3324, + "step": 9660 + }, + { + "epoch": 1.66, + "grad_norm": 12.257155418395996, + "learning_rate": 5.130427321091471e-06, + "loss": 0.3699, + "step": 9661 + }, + { + "epoch": 1.66, + "grad_norm": 10.055291175842285, + "learning_rate": 5.1278530976488764e-06, + "loss": 0.4766, + "step": 9662 + }, + { + "epoch": 1.66, + "grad_norm": 9.363739967346191, + "learning_rate": 5.1252788742062814e-06, + "loss": 0.5042, + "step": 9663 + }, + { + "epoch": 1.66, + "grad_norm": 12.342509269714355, + "learning_rate": 5.122704650763686e-06, + "loss": 0.3707, + "step": 9664 + }, + { + "epoch": 1.66, + "grad_norm": 14.173884391784668, + "learning_rate": 5.120130427321091e-06, + "loss": 0.4334, + "step": 9665 + }, + { + "epoch": 1.66, + "grad_norm": 11.110319137573242, + "learning_rate": 5.117556203878496e-06, + "loss": 0.3685, + "step": 9666 + }, + { + "epoch": 1.66, + "grad_norm": 9.817700386047363, + "learning_rate": 5.114981980435902e-06, + "loss": 0.3278, + "step": 9667 + }, + { + "epoch": 1.66, + "grad_norm": 9.422295570373535, + "learning_rate": 5.112407756993307e-06, + "loss": 0.3149, + "step": 9668 + }, + { + "epoch": 1.66, + "grad_norm": 10.302181243896484, + "learning_rate": 5.109833533550713e-06, + "loss": 0.4029, + "step": 9669 + }, + { + "epoch": 1.66, + "grad_norm": 8.484495162963867, + "learning_rate": 5.107259310108117e-06, + "loss": 0.4563, + "step": 9670 + }, + { + "epoch": 1.66, + "grad_norm": 11.984222412109375, + "learning_rate": 5.104685086665522e-06, + "loss": 0.3982, + "step": 9671 + }, + { + "epoch": 1.66, + "grad_norm": 9.562557220458984, + "learning_rate": 5.102110863222928e-06, + "loss": 0.2225, + "step": 9672 + }, + { + "epoch": 1.66, + "grad_norm": 9.58154296875, + "learning_rate": 5.099536639780333e-06, + "loss": 0.5037, + "step": 9673 + }, + { + "epoch": 1.66, + "grad_norm": 12.111811637878418, + "learning_rate": 5.096962416337739e-06, + "loss": 0.4773, + "step": 9674 + }, + { + "epoch": 1.66, + "grad_norm": 9.926980972290039, + "learning_rate": 5.094388192895143e-06, + "loss": 0.5246, + "step": 9675 + }, + { + "epoch": 1.66, + "grad_norm": 9.595392227172852, + "learning_rate": 5.091813969452549e-06, + "loss": 0.361, + "step": 9676 + }, + { + "epoch": 1.66, + "grad_norm": 11.566325187683105, + "learning_rate": 5.089239746009954e-06, + "loss": 0.4003, + "step": 9677 + }, + { + "epoch": 1.66, + "grad_norm": 10.54730224609375, + "learning_rate": 5.086665522567359e-06, + "loss": 0.4316, + "step": 9678 + }, + { + "epoch": 1.66, + "grad_norm": 8.703022003173828, + "learning_rate": 5.084091299124765e-06, + "loss": 0.2642, + "step": 9679 + }, + { + "epoch": 1.66, + "grad_norm": 13.015544891357422, + "learning_rate": 5.081517075682169e-06, + "loss": 0.4576, + "step": 9680 + }, + { + "epoch": 1.66, + "grad_norm": 10.491514205932617, + "learning_rate": 5.078942852239575e-06, + "loss": 0.2702, + "step": 9681 + }, + { + "epoch": 1.66, + "grad_norm": 9.422765731811523, + "learning_rate": 5.0763686287969796e-06, + "loss": 0.3372, + "step": 9682 + }, + { + "epoch": 1.66, + "grad_norm": 14.247221946716309, + "learning_rate": 5.073794405354385e-06, + "loss": 0.4937, + "step": 9683 + }, + { + "epoch": 1.66, + "grad_norm": 11.32010269165039, + "learning_rate": 5.07122018191179e-06, + "loss": 0.5839, + "step": 9684 + }, + { + "epoch": 1.66, + "grad_norm": 10.346248626708984, + "learning_rate": 5.0686459584691945e-06, + "loss": 0.4014, + "step": 9685 + }, + { + "epoch": 1.66, + "grad_norm": 11.298344612121582, + "learning_rate": 5.0660717350266e-06, + "loss": 0.3451, + "step": 9686 + }, + { + "epoch": 1.66, + "grad_norm": 16.95216941833496, + "learning_rate": 5.063497511584005e-06, + "loss": 0.5503, + "step": 9687 + }, + { + "epoch": 1.66, + "grad_norm": 8.768074035644531, + "learning_rate": 5.060923288141411e-06, + "loss": 0.3055, + "step": 9688 + }, + { + "epoch": 1.66, + "grad_norm": 8.51509952545166, + "learning_rate": 5.058349064698816e-06, + "loss": 0.3556, + "step": 9689 + }, + { + "epoch": 1.66, + "grad_norm": 10.47543716430664, + "learning_rate": 5.055774841256221e-06, + "loss": 0.3333, + "step": 9690 + }, + { + "epoch": 1.66, + "grad_norm": 9.400741577148438, + "learning_rate": 5.053200617813626e-06, + "loss": 0.4665, + "step": 9691 + }, + { + "epoch": 1.66, + "grad_norm": 15.153923034667969, + "learning_rate": 5.050626394371031e-06, + "loss": 0.4699, + "step": 9692 + }, + { + "epoch": 1.66, + "grad_norm": 11.132309913635254, + "learning_rate": 5.048052170928437e-06, + "loss": 0.3326, + "step": 9693 + }, + { + "epoch": 1.66, + "grad_norm": 10.555898666381836, + "learning_rate": 5.045477947485842e-06, + "loss": 0.5747, + "step": 9694 + }, + { + "epoch": 1.66, + "grad_norm": 9.92475700378418, + "learning_rate": 5.042903724043248e-06, + "loss": 0.386, + "step": 9695 + }, + { + "epoch": 1.66, + "grad_norm": 9.048136711120605, + "learning_rate": 5.040329500600652e-06, + "loss": 0.4085, + "step": 9696 + }, + { + "epoch": 1.66, + "grad_norm": 14.683354377746582, + "learning_rate": 5.037755277158057e-06, + "loss": 0.4891, + "step": 9697 + }, + { + "epoch": 1.66, + "grad_norm": 9.044388771057129, + "learning_rate": 5.035181053715463e-06, + "loss": 0.5794, + "step": 9698 + }, + { + "epoch": 1.66, + "grad_norm": 10.953500747680664, + "learning_rate": 5.032606830272868e-06, + "loss": 0.3842, + "step": 9699 + }, + { + "epoch": 1.66, + "grad_norm": 9.28654956817627, + "learning_rate": 5.030032606830274e-06, + "loss": 0.3827, + "step": 9700 + }, + { + "epoch": 1.66, + "grad_norm": 10.839948654174805, + "learning_rate": 5.027458383387678e-06, + "loss": 0.4443, + "step": 9701 + }, + { + "epoch": 1.67, + "grad_norm": 9.030762672424316, + "learning_rate": 5.0248841599450836e-06, + "loss": 0.2864, + "step": 9702 + }, + { + "epoch": 1.67, + "grad_norm": 11.66850471496582, + "learning_rate": 5.0223099365024885e-06, + "loss": 0.454, + "step": 9703 + }, + { + "epoch": 1.67, + "grad_norm": 11.376481056213379, + "learning_rate": 5.0197357130598935e-06, + "loss": 0.4531, + "step": 9704 + }, + { + "epoch": 1.67, + "grad_norm": 9.105647087097168, + "learning_rate": 5.017161489617299e-06, + "loss": 0.3653, + "step": 9705 + }, + { + "epoch": 1.67, + "grad_norm": 7.068383693695068, + "learning_rate": 5.0145872661747035e-06, + "loss": 0.3109, + "step": 9706 + }, + { + "epoch": 1.67, + "grad_norm": 8.775964736938477, + "learning_rate": 5.012013042732109e-06, + "loss": 0.4885, + "step": 9707 + }, + { + "epoch": 1.67, + "grad_norm": 16.013874053955078, + "learning_rate": 5.009438819289514e-06, + "loss": 0.4333, + "step": 9708 + }, + { + "epoch": 1.67, + "grad_norm": 11.835769653320312, + "learning_rate": 5.00686459584692e-06, + "loss": 0.4808, + "step": 9709 + }, + { + "epoch": 1.67, + "grad_norm": 10.118885040283203, + "learning_rate": 5.004290372404325e-06, + "loss": 0.3596, + "step": 9710 + }, + { + "epoch": 1.67, + "grad_norm": 11.221210479736328, + "learning_rate": 5.001716148961729e-06, + "loss": 0.4878, + "step": 9711 + }, + { + "epoch": 1.67, + "grad_norm": 9.434014320373535, + "learning_rate": 4.999141925519135e-06, + "loss": 0.3823, + "step": 9712 + }, + { + "epoch": 1.67, + "grad_norm": 12.95885944366455, + "learning_rate": 4.99656770207654e-06, + "loss": 0.344, + "step": 9713 + }, + { + "epoch": 1.67, + "grad_norm": 12.053287506103516, + "learning_rate": 4.993993478633946e-06, + "loss": 0.4008, + "step": 9714 + }, + { + "epoch": 1.67, + "grad_norm": 11.965164184570312, + "learning_rate": 4.991419255191351e-06, + "loss": 0.4148, + "step": 9715 + }, + { + "epoch": 1.67, + "grad_norm": 7.994836807250977, + "learning_rate": 4.988845031748756e-06, + "loss": 0.2957, + "step": 9716 + }, + { + "epoch": 1.67, + "grad_norm": 11.769638061523438, + "learning_rate": 4.986270808306161e-06, + "loss": 0.3409, + "step": 9717 + }, + { + "epoch": 1.67, + "grad_norm": 15.909829139709473, + "learning_rate": 4.983696584863566e-06, + "loss": 0.5229, + "step": 9718 + }, + { + "epoch": 1.67, + "grad_norm": 8.90284252166748, + "learning_rate": 4.981122361420972e-06, + "loss": 0.3628, + "step": 9719 + }, + { + "epoch": 1.67, + "grad_norm": 14.65898609161377, + "learning_rate": 4.978548137978377e-06, + "loss": 0.563, + "step": 9720 + }, + { + "epoch": 1.67, + "grad_norm": 6.9395856857299805, + "learning_rate": 4.9759739145357825e-06, + "loss": 0.4056, + "step": 9721 + }, + { + "epoch": 1.67, + "grad_norm": 13.578889846801758, + "learning_rate": 4.973399691093187e-06, + "loss": 0.4731, + "step": 9722 + }, + { + "epoch": 1.67, + "grad_norm": 10.687485694885254, + "learning_rate": 4.970825467650592e-06, + "loss": 0.5302, + "step": 9723 + }, + { + "epoch": 1.67, + "grad_norm": 12.489301681518555, + "learning_rate": 4.9682512442079975e-06, + "loss": 0.5465, + "step": 9724 + }, + { + "epoch": 1.67, + "grad_norm": 8.20189094543457, + "learning_rate": 4.9656770207654025e-06, + "loss": 0.3079, + "step": 9725 + }, + { + "epoch": 1.67, + "grad_norm": 11.46342945098877, + "learning_rate": 4.963102797322808e-06, + "loss": 0.5348, + "step": 9726 + }, + { + "epoch": 1.67, + "grad_norm": 16.839994430541992, + "learning_rate": 4.9605285738802125e-06, + "loss": 0.6541, + "step": 9727 + }, + { + "epoch": 1.67, + "grad_norm": 7.444540500640869, + "learning_rate": 4.957954350437618e-06, + "loss": 0.2223, + "step": 9728 + }, + { + "epoch": 1.67, + "grad_norm": 8.409849166870117, + "learning_rate": 4.955380126995023e-06, + "loss": 0.3626, + "step": 9729 + }, + { + "epoch": 1.67, + "grad_norm": 9.897289276123047, + "learning_rate": 4.952805903552428e-06, + "loss": 0.3453, + "step": 9730 + }, + { + "epoch": 1.67, + "grad_norm": 10.75826644897461, + "learning_rate": 4.950231680109834e-06, + "loss": 0.4334, + "step": 9731 + }, + { + "epoch": 1.67, + "grad_norm": 8.613908767700195, + "learning_rate": 4.947657456667238e-06, + "loss": 0.4228, + "step": 9732 + }, + { + "epoch": 1.67, + "grad_norm": 15.719470977783203, + "learning_rate": 4.945083233224644e-06, + "loss": 0.427, + "step": 9733 + }, + { + "epoch": 1.67, + "grad_norm": 9.687437057495117, + "learning_rate": 4.942509009782049e-06, + "loss": 0.3567, + "step": 9734 + }, + { + "epoch": 1.67, + "grad_norm": 12.149904251098633, + "learning_rate": 4.939934786339455e-06, + "loss": 0.4149, + "step": 9735 + }, + { + "epoch": 1.67, + "grad_norm": 13.584178924560547, + "learning_rate": 4.93736056289686e-06, + "loss": 0.5128, + "step": 9736 + }, + { + "epoch": 1.67, + "grad_norm": 11.671844482421875, + "learning_rate": 4.934786339454264e-06, + "loss": 0.5259, + "step": 9737 + }, + { + "epoch": 1.67, + "grad_norm": 12.277400016784668, + "learning_rate": 4.93221211601167e-06, + "loss": 0.599, + "step": 9738 + }, + { + "epoch": 1.67, + "grad_norm": 10.343419075012207, + "learning_rate": 4.929637892569075e-06, + "loss": 0.3067, + "step": 9739 + }, + { + "epoch": 1.67, + "grad_norm": 9.915791511535645, + "learning_rate": 4.927063669126481e-06, + "loss": 0.485, + "step": 9740 + }, + { + "epoch": 1.67, + "grad_norm": 10.148524284362793, + "learning_rate": 4.924489445683886e-06, + "loss": 0.4279, + "step": 9741 + }, + { + "epoch": 1.67, + "grad_norm": 8.541462898254395, + "learning_rate": 4.921915222241291e-06, + "loss": 0.3187, + "step": 9742 + }, + { + "epoch": 1.67, + "grad_norm": 11.05405330657959, + "learning_rate": 4.919340998798696e-06, + "loss": 0.3563, + "step": 9743 + }, + { + "epoch": 1.67, + "grad_norm": 14.602696418762207, + "learning_rate": 4.916766775356101e-06, + "loss": 0.3998, + "step": 9744 + }, + { + "epoch": 1.67, + "grad_norm": 10.657196044921875, + "learning_rate": 4.9141925519135065e-06, + "loss": 0.3731, + "step": 9745 + }, + { + "epoch": 1.67, + "grad_norm": 10.555832862854004, + "learning_rate": 4.9116183284709115e-06, + "loss": 0.3885, + "step": 9746 + }, + { + "epoch": 1.67, + "grad_norm": 7.744253635406494, + "learning_rate": 4.909044105028317e-06, + "loss": 0.364, + "step": 9747 + }, + { + "epoch": 1.67, + "grad_norm": 10.683523178100586, + "learning_rate": 4.9064698815857214e-06, + "loss": 0.2569, + "step": 9748 + }, + { + "epoch": 1.67, + "grad_norm": 11.121380805969238, + "learning_rate": 4.9038956581431264e-06, + "loss": 0.3385, + "step": 9749 + }, + { + "epoch": 1.67, + "grad_norm": 9.50162410736084, + "learning_rate": 4.901321434700532e-06, + "loss": 0.365, + "step": 9750 + }, + { + "epoch": 1.67, + "grad_norm": 9.024792671203613, + "learning_rate": 4.898747211257937e-06, + "loss": 0.5786, + "step": 9751 + }, + { + "epoch": 1.67, + "grad_norm": 11.351205825805664, + "learning_rate": 4.896172987815343e-06, + "loss": 0.4454, + "step": 9752 + }, + { + "epoch": 1.67, + "grad_norm": 11.305006980895996, + "learning_rate": 4.893598764372747e-06, + "loss": 0.3918, + "step": 9753 + }, + { + "epoch": 1.67, + "grad_norm": 14.149331092834473, + "learning_rate": 4.891024540930153e-06, + "loss": 0.3914, + "step": 9754 + }, + { + "epoch": 1.67, + "grad_norm": 12.778213500976562, + "learning_rate": 4.888450317487558e-06, + "loss": 0.3568, + "step": 9755 + }, + { + "epoch": 1.67, + "grad_norm": 13.530014038085938, + "learning_rate": 4.885876094044963e-06, + "loss": 0.5542, + "step": 9756 + }, + { + "epoch": 1.67, + "grad_norm": 7.421553134918213, + "learning_rate": 4.883301870602369e-06, + "loss": 0.4059, + "step": 9757 + }, + { + "epoch": 1.67, + "grad_norm": 9.967906951904297, + "learning_rate": 4.880727647159773e-06, + "loss": 0.4492, + "step": 9758 + }, + { + "epoch": 1.67, + "grad_norm": 16.925548553466797, + "learning_rate": 4.878153423717179e-06, + "loss": 0.4873, + "step": 9759 + }, + { + "epoch": 1.67, + "grad_norm": 11.683004379272461, + "learning_rate": 4.875579200274584e-06, + "loss": 0.5783, + "step": 9760 + }, + { + "epoch": 1.68, + "grad_norm": 10.583817481994629, + "learning_rate": 4.87300497683199e-06, + "loss": 0.4882, + "step": 9761 + }, + { + "epoch": 1.68, + "grad_norm": 13.585066795349121, + "learning_rate": 4.870430753389395e-06, + "loss": 0.5634, + "step": 9762 + }, + { + "epoch": 1.68, + "grad_norm": 7.594355583190918, + "learning_rate": 4.867856529946799e-06, + "loss": 0.2008, + "step": 9763 + }, + { + "epoch": 1.68, + "grad_norm": 9.9620943069458, + "learning_rate": 4.865282306504205e-06, + "loss": 0.3649, + "step": 9764 + }, + { + "epoch": 1.68, + "grad_norm": 12.050984382629395, + "learning_rate": 4.86270808306161e-06, + "loss": 0.5158, + "step": 9765 + }, + { + "epoch": 1.68, + "grad_norm": 9.703680038452148, + "learning_rate": 4.8601338596190155e-06, + "loss": 0.3964, + "step": 9766 + }, + { + "epoch": 1.68, + "grad_norm": 11.653826713562012, + "learning_rate": 4.8575596361764204e-06, + "loss": 0.4932, + "step": 9767 + }, + { + "epoch": 1.68, + "grad_norm": 9.42916488647461, + "learning_rate": 4.8549854127338254e-06, + "loss": 0.4299, + "step": 9768 + }, + { + "epoch": 1.68, + "grad_norm": 9.416485786437988, + "learning_rate": 4.85241118929123e-06, + "loss": 0.2953, + "step": 9769 + }, + { + "epoch": 1.68, + "grad_norm": 9.955766677856445, + "learning_rate": 4.849836965848635e-06, + "loss": 0.1855, + "step": 9770 + }, + { + "epoch": 1.68, + "grad_norm": 9.285853385925293, + "learning_rate": 4.847262742406041e-06, + "loss": 0.4909, + "step": 9771 + }, + { + "epoch": 1.68, + "grad_norm": 11.007976531982422, + "learning_rate": 4.844688518963446e-06, + "loss": 0.3296, + "step": 9772 + }, + { + "epoch": 1.68, + "grad_norm": 12.120858192443848, + "learning_rate": 4.842114295520852e-06, + "loss": 0.4508, + "step": 9773 + }, + { + "epoch": 1.68, + "grad_norm": 9.275014877319336, + "learning_rate": 4.839540072078256e-06, + "loss": 0.3586, + "step": 9774 + }, + { + "epoch": 1.68, + "grad_norm": 11.946040153503418, + "learning_rate": 4.836965848635661e-06, + "loss": 0.3457, + "step": 9775 + }, + { + "epoch": 1.68, + "grad_norm": 10.76372241973877, + "learning_rate": 4.834391625193067e-06, + "loss": 0.3692, + "step": 9776 + }, + { + "epoch": 1.68, + "grad_norm": 10.383515357971191, + "learning_rate": 4.831817401750472e-06, + "loss": 0.3021, + "step": 9777 + }, + { + "epoch": 1.68, + "grad_norm": 10.727526664733887, + "learning_rate": 4.829243178307878e-06, + "loss": 0.4489, + "step": 9778 + }, + { + "epoch": 1.68, + "grad_norm": 16.16355323791504, + "learning_rate": 4.826668954865282e-06, + "loss": 0.5248, + "step": 9779 + }, + { + "epoch": 1.68, + "grad_norm": 11.519400596618652, + "learning_rate": 4.824094731422688e-06, + "loss": 0.487, + "step": 9780 + }, + { + "epoch": 1.68, + "grad_norm": 8.917197227478027, + "learning_rate": 4.821520507980093e-06, + "loss": 0.5149, + "step": 9781 + }, + { + "epoch": 1.68, + "grad_norm": 7.641584873199463, + "learning_rate": 4.818946284537498e-06, + "loss": 0.3047, + "step": 9782 + }, + { + "epoch": 1.68, + "grad_norm": 10.241837501525879, + "learning_rate": 4.816372061094904e-06, + "loss": 0.4307, + "step": 9783 + }, + { + "epoch": 1.68, + "grad_norm": 10.343546867370605, + "learning_rate": 4.813797837652308e-06, + "loss": 0.365, + "step": 9784 + }, + { + "epoch": 1.68, + "grad_norm": 8.740839004516602, + "learning_rate": 4.811223614209714e-06, + "loss": 0.326, + "step": 9785 + }, + { + "epoch": 1.68, + "grad_norm": 9.622611045837402, + "learning_rate": 4.808649390767119e-06, + "loss": 0.4491, + "step": 9786 + }, + { + "epoch": 1.68, + "grad_norm": 9.138028144836426, + "learning_rate": 4.806075167324524e-06, + "loss": 0.2401, + "step": 9787 + }, + { + "epoch": 1.68, + "grad_norm": 10.242894172668457, + "learning_rate": 4.803500943881929e-06, + "loss": 0.5856, + "step": 9788 + }, + { + "epoch": 1.68, + "grad_norm": 8.222790718078613, + "learning_rate": 4.8009267204393335e-06, + "loss": 0.3135, + "step": 9789 + }, + { + "epoch": 1.68, + "grad_norm": 8.781777381896973, + "learning_rate": 4.798352496996739e-06, + "loss": 0.4274, + "step": 9790 + }, + { + "epoch": 1.68, + "grad_norm": 7.9878411293029785, + "learning_rate": 4.795778273554144e-06, + "loss": 0.363, + "step": 9791 + }, + { + "epoch": 1.68, + "grad_norm": 11.4044771194458, + "learning_rate": 4.79320405011155e-06, + "loss": 0.4434, + "step": 9792 + }, + { + "epoch": 1.68, + "grad_norm": 7.178708076477051, + "learning_rate": 4.790629826668955e-06, + "loss": 0.3797, + "step": 9793 + }, + { + "epoch": 1.68, + "grad_norm": 10.6813325881958, + "learning_rate": 4.78805560322636e-06, + "loss": 0.5054, + "step": 9794 + }, + { + "epoch": 1.68, + "grad_norm": 7.982411861419678, + "learning_rate": 4.785481379783765e-06, + "loss": 0.2052, + "step": 9795 + }, + { + "epoch": 1.68, + "grad_norm": 17.61693000793457, + "learning_rate": 4.78290715634117e-06, + "loss": 0.5132, + "step": 9796 + }, + { + "epoch": 1.68, + "grad_norm": 9.12075138092041, + "learning_rate": 4.780332932898576e-06, + "loss": 0.2804, + "step": 9797 + }, + { + "epoch": 1.68, + "grad_norm": 9.588252067565918, + "learning_rate": 4.777758709455981e-06, + "loss": 0.3441, + "step": 9798 + }, + { + "epoch": 1.68, + "grad_norm": 8.717166900634766, + "learning_rate": 4.775184486013387e-06, + "loss": 0.4569, + "step": 9799 + }, + { + "epoch": 1.68, + "grad_norm": 9.314568519592285, + "learning_rate": 4.772610262570791e-06, + "loss": 0.3885, + "step": 9800 + }, + { + "epoch": 1.68, + "grad_norm": 8.739814758300781, + "learning_rate": 4.770036039128196e-06, + "loss": 0.3648, + "step": 9801 + }, + { + "epoch": 1.68, + "grad_norm": 8.777087211608887, + "learning_rate": 4.767461815685602e-06, + "loss": 0.3712, + "step": 9802 + }, + { + "epoch": 1.68, + "grad_norm": 10.593404769897461, + "learning_rate": 4.764887592243007e-06, + "loss": 0.3444, + "step": 9803 + }, + { + "epoch": 1.68, + "grad_norm": 12.855645179748535, + "learning_rate": 4.762313368800413e-06, + "loss": 0.5348, + "step": 9804 + }, + { + "epoch": 1.68, + "grad_norm": 12.688872337341309, + "learning_rate": 4.759739145357817e-06, + "loss": 0.5544, + "step": 9805 + }, + { + "epoch": 1.68, + "grad_norm": 6.552645206451416, + "learning_rate": 4.7571649219152226e-06, + "loss": 0.224, + "step": 9806 + }, + { + "epoch": 1.68, + "grad_norm": 12.194353103637695, + "learning_rate": 4.7545906984726276e-06, + "loss": 0.4566, + "step": 9807 + }, + { + "epoch": 1.68, + "grad_norm": 7.01422643661499, + "learning_rate": 4.7520164750300325e-06, + "loss": 0.3212, + "step": 9808 + }, + { + "epoch": 1.68, + "grad_norm": 10.690058708190918, + "learning_rate": 4.749442251587438e-06, + "loss": 0.4046, + "step": 9809 + }, + { + "epoch": 1.68, + "grad_norm": 9.333771705627441, + "learning_rate": 4.7468680281448425e-06, + "loss": 0.3044, + "step": 9810 + }, + { + "epoch": 1.68, + "grad_norm": 14.825360298156738, + "learning_rate": 4.744293804702248e-06, + "loss": 0.4733, + "step": 9811 + }, + { + "epoch": 1.68, + "grad_norm": 9.755243301391602, + "learning_rate": 4.741719581259653e-06, + "loss": 0.4944, + "step": 9812 + }, + { + "epoch": 1.68, + "grad_norm": 4.844978332519531, + "learning_rate": 4.739145357817059e-06, + "loss": 0.2402, + "step": 9813 + }, + { + "epoch": 1.68, + "grad_norm": 12.713988304138184, + "learning_rate": 4.736571134374464e-06, + "loss": 0.3748, + "step": 9814 + }, + { + "epoch": 1.68, + "grad_norm": 8.40971851348877, + "learning_rate": 4.733996910931868e-06, + "loss": 0.4246, + "step": 9815 + }, + { + "epoch": 1.68, + "grad_norm": 10.48250961303711, + "learning_rate": 4.731422687489274e-06, + "loss": 0.4136, + "step": 9816 + }, + { + "epoch": 1.68, + "grad_norm": 12.333423614501953, + "learning_rate": 4.728848464046679e-06, + "loss": 0.3594, + "step": 9817 + }, + { + "epoch": 1.68, + "grad_norm": 15.316245079040527, + "learning_rate": 4.726274240604085e-06, + "loss": 0.3243, + "step": 9818 + }, + { + "epoch": 1.69, + "grad_norm": 10.806426048278809, + "learning_rate": 4.72370001716149e-06, + "loss": 0.367, + "step": 9819 + }, + { + "epoch": 1.69, + "grad_norm": 8.316032409667969, + "learning_rate": 4.721125793718895e-06, + "loss": 0.2928, + "step": 9820 + }, + { + "epoch": 1.69, + "grad_norm": 16.782794952392578, + "learning_rate": 4.7185515702763e-06, + "loss": 0.4303, + "step": 9821 + }, + { + "epoch": 1.69, + "grad_norm": 9.911745071411133, + "learning_rate": 4.715977346833705e-06, + "loss": 0.3829, + "step": 9822 + }, + { + "epoch": 1.69, + "grad_norm": 11.591330528259277, + "learning_rate": 4.713403123391111e-06, + "loss": 0.4084, + "step": 9823 + }, + { + "epoch": 1.69, + "grad_norm": 15.675152778625488, + "learning_rate": 4.710828899948516e-06, + "loss": 0.3473, + "step": 9824 + }, + { + "epoch": 1.69, + "grad_norm": 10.265535354614258, + "learning_rate": 4.7082546765059216e-06, + "loss": 0.477, + "step": 9825 + }, + { + "epoch": 1.69, + "grad_norm": 9.301252365112305, + "learning_rate": 4.705680453063326e-06, + "loss": 0.352, + "step": 9826 + }, + { + "epoch": 1.69, + "grad_norm": 13.527870178222656, + "learning_rate": 4.703106229620731e-06, + "loss": 0.3314, + "step": 9827 + }, + { + "epoch": 1.69, + "grad_norm": 11.516618728637695, + "learning_rate": 4.7005320061781365e-06, + "loss": 0.2983, + "step": 9828 + }, + { + "epoch": 1.69, + "grad_norm": 8.598899841308594, + "learning_rate": 4.6979577827355415e-06, + "loss": 0.3594, + "step": 9829 + }, + { + "epoch": 1.69, + "grad_norm": 10.302850723266602, + "learning_rate": 4.695383559292947e-06, + "loss": 0.3796, + "step": 9830 + }, + { + "epoch": 1.69, + "grad_norm": 8.374128341674805, + "learning_rate": 4.6928093358503515e-06, + "loss": 0.3827, + "step": 9831 + }, + { + "epoch": 1.69, + "grad_norm": 9.87947940826416, + "learning_rate": 4.690235112407757e-06, + "loss": 0.4863, + "step": 9832 + }, + { + "epoch": 1.69, + "grad_norm": 10.927107810974121, + "learning_rate": 4.687660888965162e-06, + "loss": 0.3641, + "step": 9833 + }, + { + "epoch": 1.69, + "grad_norm": 13.19281005859375, + "learning_rate": 4.685086665522567e-06, + "loss": 0.4519, + "step": 9834 + }, + { + "epoch": 1.69, + "grad_norm": 12.90243911743164, + "learning_rate": 4.682512442079973e-06, + "loss": 0.3695, + "step": 9835 + }, + { + "epoch": 1.69, + "grad_norm": 13.137521743774414, + "learning_rate": 4.679938218637377e-06, + "loss": 0.5407, + "step": 9836 + }, + { + "epoch": 1.69, + "grad_norm": 13.40953254699707, + "learning_rate": 4.677363995194783e-06, + "loss": 0.3313, + "step": 9837 + }, + { + "epoch": 1.69, + "grad_norm": 7.945343017578125, + "learning_rate": 4.674789771752188e-06, + "loss": 0.4024, + "step": 9838 + }, + { + "epoch": 1.69, + "grad_norm": 12.178277969360352, + "learning_rate": 4.672215548309594e-06, + "loss": 0.4536, + "step": 9839 + }, + { + "epoch": 1.69, + "grad_norm": 9.213882446289062, + "learning_rate": 4.669641324866999e-06, + "loss": 0.3644, + "step": 9840 + }, + { + "epoch": 1.69, + "grad_norm": 13.61800765991211, + "learning_rate": 4.667067101424403e-06, + "loss": 0.5098, + "step": 9841 + }, + { + "epoch": 1.69, + "grad_norm": 6.695042610168457, + "learning_rate": 4.664492877981809e-06, + "loss": 0.3005, + "step": 9842 + }, + { + "epoch": 1.69, + "grad_norm": 9.714210510253906, + "learning_rate": 4.661918654539214e-06, + "loss": 0.4691, + "step": 9843 + }, + { + "epoch": 1.69, + "grad_norm": 10.154513359069824, + "learning_rate": 4.65934443109662e-06, + "loss": 0.3923, + "step": 9844 + }, + { + "epoch": 1.69, + "grad_norm": 10.203897476196289, + "learning_rate": 4.656770207654025e-06, + "loss": 0.4449, + "step": 9845 + }, + { + "epoch": 1.69, + "grad_norm": 9.143186569213867, + "learning_rate": 4.65419598421143e-06, + "loss": 0.4072, + "step": 9846 + }, + { + "epoch": 1.69, + "grad_norm": 9.76897144317627, + "learning_rate": 4.651621760768835e-06, + "loss": 0.2984, + "step": 9847 + }, + { + "epoch": 1.69, + "grad_norm": 13.279026985168457, + "learning_rate": 4.64904753732624e-06, + "loss": 0.317, + "step": 9848 + }, + { + "epoch": 1.69, + "grad_norm": 13.268970489501953, + "learning_rate": 4.6464733138836455e-06, + "loss": 0.5681, + "step": 9849 + }, + { + "epoch": 1.69, + "grad_norm": 12.448714256286621, + "learning_rate": 4.6438990904410505e-06, + "loss": 0.3394, + "step": 9850 + }, + { + "epoch": 1.69, + "grad_norm": 9.98882007598877, + "learning_rate": 4.641324866998456e-06, + "loss": 0.4784, + "step": 9851 + }, + { + "epoch": 1.69, + "grad_norm": 9.86368465423584, + "learning_rate": 4.6387506435558605e-06, + "loss": 0.4373, + "step": 9852 + }, + { + "epoch": 1.69, + "grad_norm": 11.203152656555176, + "learning_rate": 4.6361764201132654e-06, + "loss": 0.3715, + "step": 9853 + }, + { + "epoch": 1.69, + "grad_norm": 7.811062812805176, + "learning_rate": 4.633602196670671e-06, + "loss": 0.3231, + "step": 9854 + }, + { + "epoch": 1.69, + "grad_norm": 8.87142562866211, + "learning_rate": 4.631027973228076e-06, + "loss": 0.2538, + "step": 9855 + }, + { + "epoch": 1.69, + "grad_norm": 6.872397422790527, + "learning_rate": 4.628453749785482e-06, + "loss": 0.3488, + "step": 9856 + }, + { + "epoch": 1.69, + "grad_norm": 13.994199752807617, + "learning_rate": 4.625879526342886e-06, + "loss": 0.4921, + "step": 9857 + }, + { + "epoch": 1.69, + "grad_norm": 14.76842212677002, + "learning_rate": 4.623305302900292e-06, + "loss": 0.4806, + "step": 9858 + }, + { + "epoch": 1.69, + "grad_norm": 10.93956470489502, + "learning_rate": 4.620731079457697e-06, + "loss": 0.5517, + "step": 9859 + }, + { + "epoch": 1.69, + "grad_norm": 9.051068305969238, + "learning_rate": 4.618156856015102e-06, + "loss": 0.463, + "step": 9860 + }, + { + "epoch": 1.69, + "grad_norm": 11.394959449768066, + "learning_rate": 4.615582632572508e-06, + "loss": 0.3313, + "step": 9861 + }, + { + "epoch": 1.69, + "grad_norm": 12.845617294311523, + "learning_rate": 4.613008409129912e-06, + "loss": 0.5067, + "step": 9862 + }, + { + "epoch": 1.69, + "grad_norm": 7.060649871826172, + "learning_rate": 4.610434185687318e-06, + "loss": 0.296, + "step": 9863 + }, + { + "epoch": 1.69, + "grad_norm": 7.202380180358887, + "learning_rate": 4.607859962244723e-06, + "loss": 0.2721, + "step": 9864 + }, + { + "epoch": 1.69, + "grad_norm": 13.583398818969727, + "learning_rate": 4.605285738802129e-06, + "loss": 0.5201, + "step": 9865 + }, + { + "epoch": 1.69, + "grad_norm": 9.528497695922852, + "learning_rate": 4.602711515359534e-06, + "loss": 0.3013, + "step": 9866 + }, + { + "epoch": 1.69, + "grad_norm": 9.059798240661621, + "learning_rate": 4.600137291916938e-06, + "loss": 0.4016, + "step": 9867 + }, + { + "epoch": 1.69, + "grad_norm": 6.830427646636963, + "learning_rate": 4.597563068474344e-06, + "loss": 0.1739, + "step": 9868 + }, + { + "epoch": 1.69, + "grad_norm": 10.108186721801758, + "learning_rate": 4.594988845031749e-06, + "loss": 0.3077, + "step": 9869 + }, + { + "epoch": 1.69, + "grad_norm": 10.28027057647705, + "learning_rate": 4.5924146215891545e-06, + "loss": 0.3203, + "step": 9870 + }, + { + "epoch": 1.69, + "grad_norm": 8.209270477294922, + "learning_rate": 4.5898403981465594e-06, + "loss": 0.3556, + "step": 9871 + }, + { + "epoch": 1.69, + "grad_norm": 10.611172676086426, + "learning_rate": 4.5872661747039644e-06, + "loss": 0.3661, + "step": 9872 + }, + { + "epoch": 1.69, + "grad_norm": 7.970212459564209, + "learning_rate": 4.5846919512613694e-06, + "loss": 0.2734, + "step": 9873 + }, + { + "epoch": 1.69, + "grad_norm": 9.525469779968262, + "learning_rate": 4.582117727818774e-06, + "loss": 0.323, + "step": 9874 + }, + { + "epoch": 1.69, + "grad_norm": 7.489565372467041, + "learning_rate": 4.57954350437618e-06, + "loss": 0.2885, + "step": 9875 + }, + { + "epoch": 1.69, + "grad_norm": 9.656030654907227, + "learning_rate": 4.576969280933585e-06, + "loss": 0.3498, + "step": 9876 + }, + { + "epoch": 1.7, + "grad_norm": 12.637785911560059, + "learning_rate": 4.574395057490991e-06, + "loss": 0.5924, + "step": 9877 + }, + { + "epoch": 1.7, + "grad_norm": 10.836075782775879, + "learning_rate": 4.571820834048395e-06, + "loss": 0.366, + "step": 9878 + }, + { + "epoch": 1.7, + "grad_norm": 10.544533729553223, + "learning_rate": 4.5692466106058e-06, + "loss": 0.3082, + "step": 9879 + }, + { + "epoch": 1.7, + "grad_norm": 11.82541275024414, + "learning_rate": 4.566672387163206e-06, + "loss": 0.3581, + "step": 9880 + }, + { + "epoch": 1.7, + "grad_norm": 9.754866600036621, + "learning_rate": 4.564098163720611e-06, + "loss": 0.4529, + "step": 9881 + }, + { + "epoch": 1.7, + "grad_norm": 14.363245964050293, + "learning_rate": 4.561523940278017e-06, + "loss": 0.5338, + "step": 9882 + }, + { + "epoch": 1.7, + "grad_norm": 15.042471885681152, + "learning_rate": 4.558949716835421e-06, + "loss": 0.589, + "step": 9883 + }, + { + "epoch": 1.7, + "grad_norm": 7.8993916511535645, + "learning_rate": 4.556375493392827e-06, + "loss": 0.4759, + "step": 9884 + }, + { + "epoch": 1.7, + "grad_norm": 7.697238922119141, + "learning_rate": 4.553801269950232e-06, + "loss": 0.284, + "step": 9885 + }, + { + "epoch": 1.7, + "grad_norm": 13.843331336975098, + "learning_rate": 4.551227046507637e-06, + "loss": 0.4114, + "step": 9886 + }, + { + "epoch": 1.7, + "grad_norm": 6.613215923309326, + "learning_rate": 4.548652823065043e-06, + "loss": 0.3233, + "step": 9887 + }, + { + "epoch": 1.7, + "grad_norm": 12.457439422607422, + "learning_rate": 4.546078599622447e-06, + "loss": 0.672, + "step": 9888 + }, + { + "epoch": 1.7, + "grad_norm": 10.027551651000977, + "learning_rate": 4.543504376179853e-06, + "loss": 0.4279, + "step": 9889 + }, + { + "epoch": 1.7, + "grad_norm": 10.191003799438477, + "learning_rate": 4.540930152737258e-06, + "loss": 0.4173, + "step": 9890 + }, + { + "epoch": 1.7, + "grad_norm": 14.850259780883789, + "learning_rate": 4.538355929294663e-06, + "loss": 0.5251, + "step": 9891 + }, + { + "epoch": 1.7, + "grad_norm": 13.420716285705566, + "learning_rate": 4.535781705852068e-06, + "loss": 0.4322, + "step": 9892 + }, + { + "epoch": 1.7, + "grad_norm": 9.755199432373047, + "learning_rate": 4.5332074824094726e-06, + "loss": 0.3306, + "step": 9893 + }, + { + "epoch": 1.7, + "grad_norm": 8.51894474029541, + "learning_rate": 4.530633258966878e-06, + "loss": 0.3436, + "step": 9894 + }, + { + "epoch": 1.7, + "grad_norm": 8.663911819458008, + "learning_rate": 4.528059035524283e-06, + "loss": 0.4141, + "step": 9895 + }, + { + "epoch": 1.7, + "grad_norm": 10.478439331054688, + "learning_rate": 4.525484812081689e-06, + "loss": 0.3057, + "step": 9896 + }, + { + "epoch": 1.7, + "grad_norm": 9.557174682617188, + "learning_rate": 4.522910588639094e-06, + "loss": 0.3271, + "step": 9897 + }, + { + "epoch": 1.7, + "grad_norm": 7.117825984954834, + "learning_rate": 4.520336365196499e-06, + "loss": 0.3008, + "step": 9898 + }, + { + "epoch": 1.7, + "grad_norm": 10.67201042175293, + "learning_rate": 4.517762141753904e-06, + "loss": 0.3839, + "step": 9899 + }, + { + "epoch": 1.7, + "grad_norm": 7.920360088348389, + "learning_rate": 4.515187918311309e-06, + "loss": 0.3075, + "step": 9900 + }, + { + "epoch": 1.7, + "grad_norm": 11.682703971862793, + "learning_rate": 4.512613694868715e-06, + "loss": 0.2329, + "step": 9901 + }, + { + "epoch": 1.7, + "grad_norm": 11.152132034301758, + "learning_rate": 4.51003947142612e-06, + "loss": 0.5156, + "step": 9902 + }, + { + "epoch": 1.7, + "grad_norm": 7.8434672355651855, + "learning_rate": 4.507465247983526e-06, + "loss": 0.34, + "step": 9903 + }, + { + "epoch": 1.7, + "grad_norm": 10.7037353515625, + "learning_rate": 4.50489102454093e-06, + "loss": 0.2548, + "step": 9904 + }, + { + "epoch": 1.7, + "grad_norm": 9.541150093078613, + "learning_rate": 4.502316801098335e-06, + "loss": 0.378, + "step": 9905 + }, + { + "epoch": 1.7, + "grad_norm": 11.245125770568848, + "learning_rate": 4.499742577655741e-06, + "loss": 0.2657, + "step": 9906 + }, + { + "epoch": 1.7, + "grad_norm": 17.534658432006836, + "learning_rate": 4.497168354213146e-06, + "loss": 0.2804, + "step": 9907 + }, + { + "epoch": 1.7, + "grad_norm": 10.769990921020508, + "learning_rate": 4.494594130770552e-06, + "loss": 0.2776, + "step": 9908 + }, + { + "epoch": 1.7, + "grad_norm": 10.299349784851074, + "learning_rate": 4.492019907327956e-06, + "loss": 0.589, + "step": 9909 + }, + { + "epoch": 1.7, + "grad_norm": 11.773977279663086, + "learning_rate": 4.489445683885362e-06, + "loss": 0.3645, + "step": 9910 + }, + { + "epoch": 1.7, + "grad_norm": 12.670004844665527, + "learning_rate": 4.4868714604427666e-06, + "loss": 0.3539, + "step": 9911 + }, + { + "epoch": 1.7, + "grad_norm": 10.599482536315918, + "learning_rate": 4.4842972370001716e-06, + "loss": 0.3158, + "step": 9912 + }, + { + "epoch": 1.7, + "grad_norm": 12.113405227661133, + "learning_rate": 4.481723013557577e-06, + "loss": 0.6185, + "step": 9913 + }, + { + "epoch": 1.7, + "grad_norm": 10.548078536987305, + "learning_rate": 4.4791487901149815e-06, + "loss": 0.3143, + "step": 9914 + }, + { + "epoch": 1.7, + "grad_norm": 9.87138557434082, + "learning_rate": 4.476574566672387e-06, + "loss": 0.3951, + "step": 9915 + }, + { + "epoch": 1.7, + "grad_norm": 12.681567192077637, + "learning_rate": 4.474000343229792e-06, + "loss": 0.5538, + "step": 9916 + }, + { + "epoch": 1.7, + "grad_norm": 10.560541152954102, + "learning_rate": 4.471426119787197e-06, + "loss": 0.3245, + "step": 9917 + }, + { + "epoch": 1.7, + "grad_norm": 8.184925079345703, + "learning_rate": 4.468851896344603e-06, + "loss": 0.3684, + "step": 9918 + }, + { + "epoch": 1.7, + "grad_norm": 8.797863006591797, + "learning_rate": 4.466277672902007e-06, + "loss": 0.3596, + "step": 9919 + }, + { + "epoch": 1.7, + "grad_norm": 12.702967643737793, + "learning_rate": 4.463703449459413e-06, + "loss": 0.4481, + "step": 9920 + }, + { + "epoch": 1.7, + "grad_norm": 12.415507316589355, + "learning_rate": 4.461129226016818e-06, + "loss": 0.5798, + "step": 9921 + }, + { + "epoch": 1.7, + "grad_norm": 12.53695297241211, + "learning_rate": 4.458555002574224e-06, + "loss": 0.4363, + "step": 9922 + }, + { + "epoch": 1.7, + "grad_norm": 11.94764232635498, + "learning_rate": 4.455980779131629e-06, + "loss": 0.5268, + "step": 9923 + }, + { + "epoch": 1.7, + "grad_norm": 9.764609336853027, + "learning_rate": 4.453406555689034e-06, + "loss": 0.2574, + "step": 9924 + }, + { + "epoch": 1.7, + "grad_norm": 11.214308738708496, + "learning_rate": 4.450832332246439e-06, + "loss": 0.4664, + "step": 9925 + }, + { + "epoch": 1.7, + "grad_norm": 11.65449047088623, + "learning_rate": 4.448258108803844e-06, + "loss": 0.4357, + "step": 9926 + }, + { + "epoch": 1.7, + "grad_norm": 7.322643756866455, + "learning_rate": 4.44568388536125e-06, + "loss": 0.2718, + "step": 9927 + }, + { + "epoch": 1.7, + "grad_norm": 8.729557991027832, + "learning_rate": 4.443109661918655e-06, + "loss": 0.3978, + "step": 9928 + }, + { + "epoch": 1.7, + "grad_norm": 8.054461479187012, + "learning_rate": 4.4405354384760606e-06, + "loss": 0.3249, + "step": 9929 + }, + { + "epoch": 1.7, + "grad_norm": 8.421133995056152, + "learning_rate": 4.437961215033465e-06, + "loss": 0.339, + "step": 9930 + }, + { + "epoch": 1.7, + "grad_norm": 9.247720718383789, + "learning_rate": 4.43538699159087e-06, + "loss": 0.3763, + "step": 9931 + }, + { + "epoch": 1.7, + "grad_norm": 12.756997108459473, + "learning_rate": 4.4328127681482755e-06, + "loss": 0.442, + "step": 9932 + }, + { + "epoch": 1.7, + "grad_norm": 11.602229118347168, + "learning_rate": 4.4302385447056805e-06, + "loss": 0.3772, + "step": 9933 + }, + { + "epoch": 1.7, + "grad_norm": 11.061059951782227, + "learning_rate": 4.427664321263086e-06, + "loss": 0.3439, + "step": 9934 + }, + { + "epoch": 1.7, + "grad_norm": 9.922179222106934, + "learning_rate": 4.4250900978204905e-06, + "loss": 0.2709, + "step": 9935 + }, + { + "epoch": 1.71, + "grad_norm": 11.109374046325684, + "learning_rate": 4.422515874377896e-06, + "loss": 0.5161, + "step": 9936 + }, + { + "epoch": 1.71, + "grad_norm": 8.100266456604004, + "learning_rate": 4.419941650935301e-06, + "loss": 0.3431, + "step": 9937 + }, + { + "epoch": 1.71, + "grad_norm": 11.069289207458496, + "learning_rate": 4.417367427492706e-06, + "loss": 0.3434, + "step": 9938 + }, + { + "epoch": 1.71, + "grad_norm": 9.864737510681152, + "learning_rate": 4.414793204050112e-06, + "loss": 0.4447, + "step": 9939 + }, + { + "epoch": 1.71, + "grad_norm": 12.718376159667969, + "learning_rate": 4.412218980607516e-06, + "loss": 0.3581, + "step": 9940 + }, + { + "epoch": 1.71, + "grad_norm": 9.893866539001465, + "learning_rate": 4.409644757164922e-06, + "loss": 0.3602, + "step": 9941 + }, + { + "epoch": 1.71, + "grad_norm": 8.459463119506836, + "learning_rate": 4.407070533722327e-06, + "loss": 0.2941, + "step": 9942 + }, + { + "epoch": 1.71, + "grad_norm": 8.783381462097168, + "learning_rate": 4.404496310279732e-06, + "loss": 0.434, + "step": 9943 + }, + { + "epoch": 1.71, + "grad_norm": 12.828739166259766, + "learning_rate": 4.401922086837138e-06, + "loss": 0.4314, + "step": 9944 + }, + { + "epoch": 1.71, + "grad_norm": 7.140368938446045, + "learning_rate": 4.399347863394542e-06, + "loss": 0.297, + "step": 9945 + }, + { + "epoch": 1.71, + "grad_norm": 13.443787574768066, + "learning_rate": 4.396773639951948e-06, + "loss": 0.4304, + "step": 9946 + }, + { + "epoch": 1.71, + "grad_norm": 9.49197769165039, + "learning_rate": 4.394199416509353e-06, + "loss": 0.4603, + "step": 9947 + }, + { + "epoch": 1.71, + "grad_norm": 8.293033599853516, + "learning_rate": 4.391625193066759e-06, + "loss": 0.3378, + "step": 9948 + }, + { + "epoch": 1.71, + "grad_norm": 10.394169807434082, + "learning_rate": 4.389050969624164e-06, + "loss": 0.3738, + "step": 9949 + }, + { + "epoch": 1.71, + "grad_norm": 12.421829223632812, + "learning_rate": 4.386476746181569e-06, + "loss": 0.3499, + "step": 9950 + }, + { + "epoch": 1.71, + "grad_norm": 10.849088668823242, + "learning_rate": 4.383902522738974e-06, + "loss": 0.376, + "step": 9951 + }, + { + "epoch": 1.71, + "grad_norm": 8.470183372497559, + "learning_rate": 4.381328299296379e-06, + "loss": 0.4127, + "step": 9952 + }, + { + "epoch": 1.71, + "grad_norm": 10.784804344177246, + "learning_rate": 4.3787540758537845e-06, + "loss": 0.458, + "step": 9953 + }, + { + "epoch": 1.71, + "grad_norm": 16.23931121826172, + "learning_rate": 4.3761798524111895e-06, + "loss": 0.3807, + "step": 9954 + }, + { + "epoch": 1.71, + "grad_norm": 11.81545352935791, + "learning_rate": 4.373605628968595e-06, + "loss": 0.3841, + "step": 9955 + }, + { + "epoch": 1.71, + "grad_norm": 12.764395713806152, + "learning_rate": 4.3710314055259995e-06, + "loss": 0.5208, + "step": 9956 + }, + { + "epoch": 1.71, + "grad_norm": 10.672592163085938, + "learning_rate": 4.3684571820834045e-06, + "loss": 0.3087, + "step": 9957 + }, + { + "epoch": 1.71, + "grad_norm": 11.531686782836914, + "learning_rate": 4.36588295864081e-06, + "loss": 0.4427, + "step": 9958 + }, + { + "epoch": 1.71, + "grad_norm": 9.387245178222656, + "learning_rate": 4.363308735198215e-06, + "loss": 0.3499, + "step": 9959 + }, + { + "epoch": 1.71, + "grad_norm": 10.841809272766113, + "learning_rate": 4.360734511755621e-06, + "loss": 0.3798, + "step": 9960 + }, + { + "epoch": 1.71, + "grad_norm": 10.419737815856934, + "learning_rate": 4.358160288313025e-06, + "loss": 0.344, + "step": 9961 + }, + { + "epoch": 1.71, + "grad_norm": 10.818804740905762, + "learning_rate": 4.355586064870431e-06, + "loss": 0.4238, + "step": 9962 + }, + { + "epoch": 1.71, + "grad_norm": 9.162422180175781, + "learning_rate": 4.353011841427836e-06, + "loss": 0.3439, + "step": 9963 + }, + { + "epoch": 1.71, + "grad_norm": 8.351640701293945, + "learning_rate": 4.350437617985241e-06, + "loss": 0.2175, + "step": 9964 + }, + { + "epoch": 1.71, + "grad_norm": 11.395376205444336, + "learning_rate": 4.347863394542647e-06, + "loss": 0.3205, + "step": 9965 + }, + { + "epoch": 1.71, + "grad_norm": 9.97102165222168, + "learning_rate": 4.345289171100051e-06, + "loss": 0.3443, + "step": 9966 + }, + { + "epoch": 1.71, + "grad_norm": 8.703822135925293, + "learning_rate": 4.342714947657457e-06, + "loss": 0.4079, + "step": 9967 + }, + { + "epoch": 1.71, + "grad_norm": 10.510187149047852, + "learning_rate": 4.340140724214862e-06, + "loss": 0.3666, + "step": 9968 + }, + { + "epoch": 1.71, + "grad_norm": 11.33984088897705, + "learning_rate": 4.337566500772267e-06, + "loss": 0.509, + "step": 9969 + }, + { + "epoch": 1.71, + "grad_norm": 8.041324615478516, + "learning_rate": 4.334992277329673e-06, + "loss": 0.4211, + "step": 9970 + }, + { + "epoch": 1.71, + "grad_norm": 13.166263580322266, + "learning_rate": 4.332418053887077e-06, + "loss": 0.4017, + "step": 9971 + }, + { + "epoch": 1.71, + "grad_norm": 7.305109977722168, + "learning_rate": 4.329843830444483e-06, + "loss": 0.2828, + "step": 9972 + }, + { + "epoch": 1.71, + "grad_norm": 13.540048599243164, + "learning_rate": 4.327269607001888e-06, + "loss": 0.6394, + "step": 9973 + }, + { + "epoch": 1.71, + "grad_norm": 10.86319351196289, + "learning_rate": 4.3246953835592935e-06, + "loss": 0.5214, + "step": 9974 + }, + { + "epoch": 1.71, + "grad_norm": 9.562620162963867, + "learning_rate": 4.3221211601166985e-06, + "loss": 0.4605, + "step": 9975 + }, + { + "epoch": 1.71, + "grad_norm": 9.087349891662598, + "learning_rate": 4.3195469366741034e-06, + "loss": 0.4786, + "step": 9976 + }, + { + "epoch": 1.71, + "grad_norm": 12.751546859741211, + "learning_rate": 4.3169727132315084e-06, + "loss": 0.5242, + "step": 9977 + }, + { + "epoch": 1.71, + "grad_norm": 8.395437240600586, + "learning_rate": 4.314398489788913e-06, + "loss": 0.4307, + "step": 9978 + }, + { + "epoch": 1.71, + "grad_norm": 11.593765258789062, + "learning_rate": 4.311824266346319e-06, + "loss": 0.4633, + "step": 9979 + }, + { + "epoch": 1.71, + "grad_norm": 9.877291679382324, + "learning_rate": 4.309250042903724e-06, + "loss": 0.5356, + "step": 9980 + }, + { + "epoch": 1.71, + "grad_norm": 11.459031105041504, + "learning_rate": 4.30667581946113e-06, + "loss": 0.3569, + "step": 9981 + }, + { + "epoch": 1.71, + "grad_norm": 8.680886268615723, + "learning_rate": 4.304101596018534e-06, + "loss": 0.3856, + "step": 9982 + }, + { + "epoch": 1.71, + "grad_norm": 10.003732681274414, + "learning_rate": 4.301527372575939e-06, + "loss": 0.3501, + "step": 9983 + }, + { + "epoch": 1.71, + "grad_norm": 8.571664810180664, + "learning_rate": 4.298953149133345e-06, + "loss": 0.3511, + "step": 9984 + }, + { + "epoch": 1.71, + "grad_norm": 12.06940746307373, + "learning_rate": 4.29637892569075e-06, + "loss": 0.638, + "step": 9985 + }, + { + "epoch": 1.71, + "grad_norm": 10.53713607788086, + "learning_rate": 4.293804702248156e-06, + "loss": 0.2733, + "step": 9986 + }, + { + "epoch": 1.71, + "grad_norm": 13.928442001342773, + "learning_rate": 4.29123047880556e-06, + "loss": 0.395, + "step": 9987 + }, + { + "epoch": 1.71, + "grad_norm": 9.022892951965332, + "learning_rate": 4.288656255362966e-06, + "loss": 0.238, + "step": 9988 + }, + { + "epoch": 1.71, + "grad_norm": 10.315838813781738, + "learning_rate": 4.286082031920371e-06, + "loss": 0.4526, + "step": 9989 + }, + { + "epoch": 1.71, + "grad_norm": 13.385167121887207, + "learning_rate": 4.283507808477776e-06, + "loss": 0.3602, + "step": 9990 + }, + { + "epoch": 1.71, + "grad_norm": 9.09528636932373, + "learning_rate": 4.280933585035182e-06, + "loss": 0.3881, + "step": 9991 + }, + { + "epoch": 1.71, + "grad_norm": 15.53307056427002, + "learning_rate": 4.278359361592586e-06, + "loss": 0.5135, + "step": 9992 + }, + { + "epoch": 1.71, + "grad_norm": 12.816743850708008, + "learning_rate": 4.275785138149992e-06, + "loss": 0.476, + "step": 9993 + }, + { + "epoch": 1.72, + "grad_norm": 8.927627563476562, + "learning_rate": 4.273210914707397e-06, + "loss": 0.3254, + "step": 9994 + }, + { + "epoch": 1.72, + "grad_norm": 11.05849838256836, + "learning_rate": 4.270636691264802e-06, + "loss": 0.3206, + "step": 9995 + }, + { + "epoch": 1.72, + "grad_norm": 14.244296073913574, + "learning_rate": 4.2680624678222074e-06, + "loss": 0.3888, + "step": 9996 + }, + { + "epoch": 1.72, + "grad_norm": 9.192716598510742, + "learning_rate": 4.2654882443796116e-06, + "loss": 0.2747, + "step": 9997 + }, + { + "epoch": 1.72, + "grad_norm": 9.146156311035156, + "learning_rate": 4.262914020937017e-06, + "loss": 0.3986, + "step": 9998 + }, + { + "epoch": 1.72, + "grad_norm": 12.070487022399902, + "learning_rate": 4.260339797494422e-06, + "loss": 0.4472, + "step": 9999 + }, + { + "epoch": 1.72, + "grad_norm": 12.683749198913574, + "learning_rate": 4.257765574051828e-06, + "loss": 0.5275, + "step": 10000 + }, + { + "epoch": 1.72, + "grad_norm": 9.69218921661377, + "learning_rate": 4.255191350609233e-06, + "loss": 0.4133, + "step": 10001 + }, + { + "epoch": 1.72, + "grad_norm": 9.761098861694336, + "learning_rate": 4.252617127166638e-06, + "loss": 0.4975, + "step": 10002 + }, + { + "epoch": 1.72, + "grad_norm": 9.708329200744629, + "learning_rate": 4.250042903724043e-06, + "loss": 0.2557, + "step": 10003 + }, + { + "epoch": 1.72, + "grad_norm": 7.929306983947754, + "learning_rate": 4.247468680281448e-06, + "loss": 0.3182, + "step": 10004 + }, + { + "epoch": 1.72, + "grad_norm": 12.045339584350586, + "learning_rate": 4.244894456838854e-06, + "loss": 0.3838, + "step": 10005 + }, + { + "epoch": 1.72, + "grad_norm": 10.587363243103027, + "learning_rate": 4.242320233396259e-06, + "loss": 0.4017, + "step": 10006 + }, + { + "epoch": 1.72, + "grad_norm": 8.690743446350098, + "learning_rate": 4.239746009953665e-06, + "loss": 0.3563, + "step": 10007 + }, + { + "epoch": 1.72, + "grad_norm": 10.331488609313965, + "learning_rate": 4.237171786511069e-06, + "loss": 0.6087, + "step": 10008 + }, + { + "epoch": 1.72, + "grad_norm": 10.600420951843262, + "learning_rate": 4.234597563068474e-06, + "loss": 0.3599, + "step": 10009 + }, + { + "epoch": 1.72, + "grad_norm": 7.76400089263916, + "learning_rate": 4.23202333962588e-06, + "loss": 0.2751, + "step": 10010 + }, + { + "epoch": 1.72, + "grad_norm": 11.677290916442871, + "learning_rate": 4.229449116183285e-06, + "loss": 0.4324, + "step": 10011 + }, + { + "epoch": 1.72, + "grad_norm": 9.26309871673584, + "learning_rate": 4.226874892740691e-06, + "loss": 0.4589, + "step": 10012 + }, + { + "epoch": 1.72, + "grad_norm": 10.799513816833496, + "learning_rate": 4.224300669298095e-06, + "loss": 0.4897, + "step": 10013 + }, + { + "epoch": 1.72, + "grad_norm": 7.122469902038574, + "learning_rate": 4.221726445855501e-06, + "loss": 0.2205, + "step": 10014 + }, + { + "epoch": 1.72, + "grad_norm": 10.203651428222656, + "learning_rate": 4.219152222412906e-06, + "loss": 0.3603, + "step": 10015 + }, + { + "epoch": 1.72, + "grad_norm": 7.591609001159668, + "learning_rate": 4.2165779989703106e-06, + "loss": 0.2476, + "step": 10016 + }, + { + "epoch": 1.72, + "grad_norm": 10.541642189025879, + "learning_rate": 4.214003775527716e-06, + "loss": 0.5272, + "step": 10017 + }, + { + "epoch": 1.72, + "grad_norm": 12.456793785095215, + "learning_rate": 4.2114295520851205e-06, + "loss": 0.5535, + "step": 10018 + }, + { + "epoch": 1.72, + "grad_norm": 9.074565887451172, + "learning_rate": 4.208855328642526e-06, + "loss": 0.4096, + "step": 10019 + }, + { + "epoch": 1.72, + "grad_norm": 11.155806541442871, + "learning_rate": 4.206281105199931e-06, + "loss": 0.488, + "step": 10020 + }, + { + "epoch": 1.72, + "grad_norm": 21.50942611694336, + "learning_rate": 4.203706881757336e-06, + "loss": 0.4609, + "step": 10021 + }, + { + "epoch": 1.72, + "grad_norm": 11.536234855651855, + "learning_rate": 4.201132658314742e-06, + "loss": 0.4203, + "step": 10022 + }, + { + "epoch": 1.72, + "grad_norm": 11.204566955566406, + "learning_rate": 4.198558434872146e-06, + "loss": 0.5778, + "step": 10023 + }, + { + "epoch": 1.72, + "grad_norm": 13.064000129699707, + "learning_rate": 4.195984211429552e-06, + "loss": 0.3768, + "step": 10024 + }, + { + "epoch": 1.72, + "grad_norm": 9.241158485412598, + "learning_rate": 4.193409987986957e-06, + "loss": 0.3996, + "step": 10025 + }, + { + "epoch": 1.72, + "grad_norm": 8.174125671386719, + "learning_rate": 4.190835764544363e-06, + "loss": 0.3392, + "step": 10026 + }, + { + "epoch": 1.72, + "grad_norm": 9.345174789428711, + "learning_rate": 4.188261541101768e-06, + "loss": 0.3971, + "step": 10027 + }, + { + "epoch": 1.72, + "grad_norm": 16.275466918945312, + "learning_rate": 4.185687317659173e-06, + "loss": 0.5066, + "step": 10028 + }, + { + "epoch": 1.72, + "grad_norm": 7.62310791015625, + "learning_rate": 4.183113094216578e-06, + "loss": 0.3347, + "step": 10029 + }, + { + "epoch": 1.72, + "grad_norm": 10.437335014343262, + "learning_rate": 4.180538870773983e-06, + "loss": 0.3518, + "step": 10030 + }, + { + "epoch": 1.72, + "grad_norm": 9.76176929473877, + "learning_rate": 4.177964647331389e-06, + "loss": 0.4014, + "step": 10031 + }, + { + "epoch": 1.72, + "grad_norm": 10.781759262084961, + "learning_rate": 4.175390423888794e-06, + "loss": 0.45, + "step": 10032 + }, + { + "epoch": 1.72, + "grad_norm": 11.804431915283203, + "learning_rate": 4.1728162004462e-06, + "loss": 0.5412, + "step": 10033 + }, + { + "epoch": 1.72, + "grad_norm": 15.3230619430542, + "learning_rate": 4.170241977003604e-06, + "loss": 0.3673, + "step": 10034 + }, + { + "epoch": 1.72, + "grad_norm": 16.02815055847168, + "learning_rate": 4.167667753561009e-06, + "loss": 0.4242, + "step": 10035 + }, + { + "epoch": 1.72, + "grad_norm": 15.48589038848877, + "learning_rate": 4.1650935301184145e-06, + "loss": 0.4381, + "step": 10036 + }, + { + "epoch": 1.72, + "grad_norm": 13.080061912536621, + "learning_rate": 4.1625193066758195e-06, + "loss": 0.3745, + "step": 10037 + }, + { + "epoch": 1.72, + "grad_norm": 7.295418739318848, + "learning_rate": 4.159945083233225e-06, + "loss": 0.2397, + "step": 10038 + }, + { + "epoch": 1.72, + "grad_norm": 14.337566375732422, + "learning_rate": 4.1573708597906295e-06, + "loss": 0.4904, + "step": 10039 + }, + { + "epoch": 1.72, + "grad_norm": 11.389517784118652, + "learning_rate": 4.154796636348035e-06, + "loss": 0.362, + "step": 10040 + }, + { + "epoch": 1.72, + "grad_norm": 10.826855659484863, + "learning_rate": 4.15222241290544e-06, + "loss": 0.2595, + "step": 10041 + }, + { + "epoch": 1.72, + "grad_norm": 10.343708992004395, + "learning_rate": 4.149648189462845e-06, + "loss": 0.3669, + "step": 10042 + }, + { + "epoch": 1.72, + "grad_norm": 7.76363468170166, + "learning_rate": 4.147073966020251e-06, + "loss": 0.2913, + "step": 10043 + }, + { + "epoch": 1.72, + "grad_norm": 9.270110130310059, + "learning_rate": 4.144499742577655e-06, + "loss": 0.4417, + "step": 10044 + }, + { + "epoch": 1.72, + "grad_norm": 9.014410018920898, + "learning_rate": 4.141925519135061e-06, + "loss": 0.4179, + "step": 10045 + }, + { + "epoch": 1.72, + "grad_norm": 7.9216227531433105, + "learning_rate": 4.139351295692466e-06, + "loss": 0.2627, + "step": 10046 + }, + { + "epoch": 1.72, + "grad_norm": 9.388548851013184, + "learning_rate": 4.136777072249871e-06, + "loss": 0.4104, + "step": 10047 + }, + { + "epoch": 1.72, + "grad_norm": 8.152256965637207, + "learning_rate": 4.134202848807277e-06, + "loss": 0.3494, + "step": 10048 + }, + { + "epoch": 1.72, + "grad_norm": 8.463470458984375, + "learning_rate": 4.131628625364682e-06, + "loss": 0.2555, + "step": 10049 + }, + { + "epoch": 1.72, + "grad_norm": 10.243407249450684, + "learning_rate": 4.129054401922087e-06, + "loss": 0.3652, + "step": 10050 + }, + { + "epoch": 1.72, + "grad_norm": 13.6102933883667, + "learning_rate": 4.126480178479492e-06, + "loss": 0.2998, + "step": 10051 + }, + { + "epoch": 1.73, + "grad_norm": 14.25294017791748, + "learning_rate": 4.123905955036898e-06, + "loss": 0.512, + "step": 10052 + }, + { + "epoch": 1.73, + "grad_norm": 11.124979019165039, + "learning_rate": 4.121331731594303e-06, + "loss": 0.3626, + "step": 10053 + }, + { + "epoch": 1.73, + "grad_norm": 7.336242198944092, + "learning_rate": 4.118757508151708e-06, + "loss": 0.215, + "step": 10054 + }, + { + "epoch": 1.73, + "grad_norm": 10.061466217041016, + "learning_rate": 4.116183284709113e-06, + "loss": 0.3215, + "step": 10055 + }, + { + "epoch": 1.73, + "grad_norm": 11.13331413269043, + "learning_rate": 4.113609061266518e-06, + "loss": 0.4244, + "step": 10056 + }, + { + "epoch": 1.73, + "grad_norm": 13.146357536315918, + "learning_rate": 4.1110348378239235e-06, + "loss": 0.3712, + "step": 10057 + }, + { + "epoch": 1.73, + "grad_norm": 12.410157203674316, + "learning_rate": 4.1084606143813285e-06, + "loss": 0.3266, + "step": 10058 + }, + { + "epoch": 1.73, + "grad_norm": 7.439155578613281, + "learning_rate": 4.105886390938734e-06, + "loss": 0.2689, + "step": 10059 + }, + { + "epoch": 1.73, + "grad_norm": 6.937128067016602, + "learning_rate": 4.1033121674961385e-06, + "loss": 0.298, + "step": 10060 + }, + { + "epoch": 1.73, + "grad_norm": 12.979667663574219, + "learning_rate": 4.1007379440535435e-06, + "loss": 0.4274, + "step": 10061 + }, + { + "epoch": 1.73, + "grad_norm": 11.426986694335938, + "learning_rate": 4.098163720610949e-06, + "loss": 0.4087, + "step": 10062 + }, + { + "epoch": 1.73, + "grad_norm": 9.391172409057617, + "learning_rate": 4.095589497168354e-06, + "loss": 0.3698, + "step": 10063 + }, + { + "epoch": 1.73, + "grad_norm": 15.403900146484375, + "learning_rate": 4.09301527372576e-06, + "loss": 0.4726, + "step": 10064 + }, + { + "epoch": 1.73, + "grad_norm": 8.048985481262207, + "learning_rate": 4.090441050283164e-06, + "loss": 0.1954, + "step": 10065 + }, + { + "epoch": 1.73, + "grad_norm": 11.735633850097656, + "learning_rate": 4.087866826840569e-06, + "loss": 0.2995, + "step": 10066 + }, + { + "epoch": 1.73, + "grad_norm": 11.164621353149414, + "learning_rate": 4.085292603397975e-06, + "loss": 0.2752, + "step": 10067 + }, + { + "epoch": 1.73, + "grad_norm": 8.355125427246094, + "learning_rate": 4.08271837995538e-06, + "loss": 0.4, + "step": 10068 + }, + { + "epoch": 1.73, + "grad_norm": 8.642196655273438, + "learning_rate": 4.080144156512786e-06, + "loss": 0.5601, + "step": 10069 + }, + { + "epoch": 1.73, + "grad_norm": 7.689550399780273, + "learning_rate": 4.07756993307019e-06, + "loss": 0.325, + "step": 10070 + }, + { + "epoch": 1.73, + "grad_norm": 11.884004592895508, + "learning_rate": 4.074995709627596e-06, + "loss": 0.3896, + "step": 10071 + }, + { + "epoch": 1.73, + "grad_norm": 10.69339370727539, + "learning_rate": 4.072421486185001e-06, + "loss": 0.386, + "step": 10072 + }, + { + "epoch": 1.73, + "grad_norm": 12.559497833251953, + "learning_rate": 4.069847262742406e-06, + "loss": 0.3497, + "step": 10073 + }, + { + "epoch": 1.73, + "grad_norm": 8.112516403198242, + "learning_rate": 4.067273039299812e-06, + "loss": 0.2648, + "step": 10074 + }, + { + "epoch": 1.73, + "grad_norm": 10.096497535705566, + "learning_rate": 4.064698815857217e-06, + "loss": 0.4449, + "step": 10075 + }, + { + "epoch": 1.73, + "grad_norm": 8.329072952270508, + "learning_rate": 4.062124592414622e-06, + "loss": 0.3324, + "step": 10076 + }, + { + "epoch": 1.73, + "grad_norm": 13.353119850158691, + "learning_rate": 4.059550368972027e-06, + "loss": 0.369, + "step": 10077 + }, + { + "epoch": 1.73, + "grad_norm": 8.638492584228516, + "learning_rate": 4.0569761455294325e-06, + "loss": 0.2372, + "step": 10078 + }, + { + "epoch": 1.73, + "grad_norm": 9.622969627380371, + "learning_rate": 4.0544019220868375e-06, + "loss": 0.2513, + "step": 10079 + }, + { + "epoch": 1.73, + "grad_norm": 12.213968276977539, + "learning_rate": 4.0518276986442425e-06, + "loss": 0.5631, + "step": 10080 + }, + { + "epoch": 1.73, + "grad_norm": 11.203161239624023, + "learning_rate": 4.0492534752016474e-06, + "loss": 0.4392, + "step": 10081 + }, + { + "epoch": 1.73, + "grad_norm": 10.644662857055664, + "learning_rate": 4.0466792517590524e-06, + "loss": 0.4, + "step": 10082 + }, + { + "epoch": 1.73, + "grad_norm": 11.43625545501709, + "learning_rate": 4.044105028316458e-06, + "loss": 0.3789, + "step": 10083 + }, + { + "epoch": 1.73, + "grad_norm": 8.148271560668945, + "learning_rate": 4.041530804873863e-06, + "loss": 0.3693, + "step": 10084 + }, + { + "epoch": 1.73, + "grad_norm": 9.060578346252441, + "learning_rate": 4.038956581431269e-06, + "loss": 0.1837, + "step": 10085 + }, + { + "epoch": 1.73, + "grad_norm": 17.784847259521484, + "learning_rate": 4.036382357988673e-06, + "loss": 0.4735, + "step": 10086 + }, + { + "epoch": 1.73, + "grad_norm": 15.989152908325195, + "learning_rate": 4.033808134546078e-06, + "loss": 0.3457, + "step": 10087 + }, + { + "epoch": 1.73, + "grad_norm": 9.844850540161133, + "learning_rate": 4.031233911103484e-06, + "loss": 0.3449, + "step": 10088 + }, + { + "epoch": 1.73, + "grad_norm": 11.894591331481934, + "learning_rate": 4.028659687660889e-06, + "loss": 0.4263, + "step": 10089 + }, + { + "epoch": 1.73, + "grad_norm": 8.945180892944336, + "learning_rate": 4.026085464218295e-06, + "loss": 0.2866, + "step": 10090 + }, + { + "epoch": 1.73, + "grad_norm": 12.589032173156738, + "learning_rate": 4.023511240775699e-06, + "loss": 0.6389, + "step": 10091 + }, + { + "epoch": 1.73, + "grad_norm": 15.864961624145508, + "learning_rate": 4.020937017333104e-06, + "loss": 0.6178, + "step": 10092 + }, + { + "epoch": 1.73, + "grad_norm": 10.489991188049316, + "learning_rate": 4.01836279389051e-06, + "loss": 0.3439, + "step": 10093 + }, + { + "epoch": 1.73, + "grad_norm": 12.373434066772461, + "learning_rate": 4.015788570447915e-06, + "loss": 0.3362, + "step": 10094 + }, + { + "epoch": 1.73, + "grad_norm": 6.838357448577881, + "learning_rate": 4.013214347005321e-06, + "loss": 0.2354, + "step": 10095 + }, + { + "epoch": 1.73, + "grad_norm": 10.028657913208008, + "learning_rate": 4.010640123562725e-06, + "loss": 0.281, + "step": 10096 + }, + { + "epoch": 1.73, + "grad_norm": 10.155238151550293, + "learning_rate": 4.008065900120131e-06, + "loss": 0.2869, + "step": 10097 + }, + { + "epoch": 1.73, + "grad_norm": 8.65730094909668, + "learning_rate": 4.005491676677536e-06, + "loss": 0.4494, + "step": 10098 + }, + { + "epoch": 1.73, + "grad_norm": 9.49343490600586, + "learning_rate": 4.002917453234941e-06, + "loss": 0.41, + "step": 10099 + }, + { + "epoch": 1.73, + "grad_norm": 10.031082153320312, + "learning_rate": 4.0003432297923464e-06, + "loss": 0.3188, + "step": 10100 + }, + { + "epoch": 1.73, + "grad_norm": 11.496113777160645, + "learning_rate": 3.9977690063497514e-06, + "loss": 0.3532, + "step": 10101 + }, + { + "epoch": 1.73, + "grad_norm": 10.07313060760498, + "learning_rate": 3.995194782907156e-06, + "loss": 0.4757, + "step": 10102 + }, + { + "epoch": 1.73, + "grad_norm": 13.511941909790039, + "learning_rate": 3.992620559464561e-06, + "loss": 0.4832, + "step": 10103 + }, + { + "epoch": 1.73, + "grad_norm": 9.121332168579102, + "learning_rate": 3.990046336021967e-06, + "loss": 0.2619, + "step": 10104 + }, + { + "epoch": 1.73, + "grad_norm": 12.800068855285645, + "learning_rate": 3.987472112579372e-06, + "loss": 0.4312, + "step": 10105 + }, + { + "epoch": 1.73, + "grad_norm": 9.701355934143066, + "learning_rate": 3.984897889136777e-06, + "loss": 0.3494, + "step": 10106 + }, + { + "epoch": 1.73, + "grad_norm": 14.681970596313477, + "learning_rate": 3.982323665694182e-06, + "loss": 0.3549, + "step": 10107 + }, + { + "epoch": 1.73, + "grad_norm": 7.654945373535156, + "learning_rate": 3.979749442251587e-06, + "loss": 0.2984, + "step": 10108 + }, + { + "epoch": 1.73, + "grad_norm": 8.594856262207031, + "learning_rate": 3.977175218808993e-06, + "loss": 0.3414, + "step": 10109 + }, + { + "epoch": 1.74, + "grad_norm": 10.8992338180542, + "learning_rate": 3.974600995366398e-06, + "loss": 0.5667, + "step": 10110 + }, + { + "epoch": 1.74, + "grad_norm": 8.656254768371582, + "learning_rate": 3.972026771923804e-06, + "loss": 0.464, + "step": 10111 + }, + { + "epoch": 1.74, + "grad_norm": 10.926775932312012, + "learning_rate": 3.969452548481208e-06, + "loss": 0.3728, + "step": 10112 + }, + { + "epoch": 1.74, + "grad_norm": 14.054173469543457, + "learning_rate": 3.966878325038613e-06, + "loss": 0.5664, + "step": 10113 + }, + { + "epoch": 1.74, + "grad_norm": 18.295503616333008, + "learning_rate": 3.964304101596019e-06, + "loss": 0.4331, + "step": 10114 + }, + { + "epoch": 1.74, + "grad_norm": 32.960628509521484, + "learning_rate": 3.961729878153424e-06, + "loss": 0.3698, + "step": 10115 + }, + { + "epoch": 1.74, + "grad_norm": 11.869596481323242, + "learning_rate": 3.95915565471083e-06, + "loss": 0.4341, + "step": 10116 + }, + { + "epoch": 1.74, + "grad_norm": 13.166833877563477, + "learning_rate": 3.956581431268234e-06, + "loss": 0.3685, + "step": 10117 + }, + { + "epoch": 1.74, + "grad_norm": 9.592395782470703, + "learning_rate": 3.954007207825639e-06, + "loss": 0.4397, + "step": 10118 + }, + { + "epoch": 1.74, + "grad_norm": 10.791773796081543, + "learning_rate": 3.951432984383045e-06, + "loss": 0.3871, + "step": 10119 + }, + { + "epoch": 1.74, + "grad_norm": 9.0762357711792, + "learning_rate": 3.94885876094045e-06, + "loss": 0.3591, + "step": 10120 + }, + { + "epoch": 1.74, + "grad_norm": 14.5891695022583, + "learning_rate": 3.946284537497855e-06, + "loss": 0.5207, + "step": 10121 + }, + { + "epoch": 1.74, + "grad_norm": 10.399431228637695, + "learning_rate": 3.9437103140552596e-06, + "loss": 0.352, + "step": 10122 + }, + { + "epoch": 1.74, + "grad_norm": 11.89193058013916, + "learning_rate": 3.941136090612665e-06, + "loss": 0.4992, + "step": 10123 + }, + { + "epoch": 1.74, + "grad_norm": 7.691988468170166, + "learning_rate": 3.93856186717007e-06, + "loss": 0.159, + "step": 10124 + }, + { + "epoch": 1.74, + "grad_norm": 8.005675315856934, + "learning_rate": 3.935987643727475e-06, + "loss": 0.3312, + "step": 10125 + }, + { + "epoch": 1.74, + "grad_norm": 5.908039569854736, + "learning_rate": 3.933413420284881e-06, + "loss": 0.1449, + "step": 10126 + }, + { + "epoch": 1.74, + "grad_norm": 10.309003829956055, + "learning_rate": 3.930839196842286e-06, + "loss": 0.4602, + "step": 10127 + }, + { + "epoch": 1.74, + "grad_norm": 8.464003562927246, + "learning_rate": 3.928264973399691e-06, + "loss": 0.3839, + "step": 10128 + }, + { + "epoch": 1.74, + "grad_norm": 9.687711715698242, + "learning_rate": 3.925690749957096e-06, + "loss": 0.3445, + "step": 10129 + }, + { + "epoch": 1.74, + "grad_norm": 12.281744956970215, + "learning_rate": 3.923116526514502e-06, + "loss": 0.5241, + "step": 10130 + }, + { + "epoch": 1.74, + "grad_norm": 8.498844146728516, + "learning_rate": 3.920542303071907e-06, + "loss": 0.327, + "step": 10131 + }, + { + "epoch": 1.74, + "grad_norm": 9.091933250427246, + "learning_rate": 3.917968079629312e-06, + "loss": 0.2824, + "step": 10132 + }, + { + "epoch": 1.74, + "grad_norm": 13.312525749206543, + "learning_rate": 3.915393856186717e-06, + "loss": 0.5629, + "step": 10133 + }, + { + "epoch": 1.74, + "grad_norm": 11.121058464050293, + "learning_rate": 3.912819632744122e-06, + "loss": 0.4358, + "step": 10134 + }, + { + "epoch": 1.74, + "grad_norm": 10.324707984924316, + "learning_rate": 3.910245409301528e-06, + "loss": 0.3947, + "step": 10135 + }, + { + "epoch": 1.74, + "grad_norm": 8.844686508178711, + "learning_rate": 3.907671185858933e-06, + "loss": 0.2847, + "step": 10136 + }, + { + "epoch": 1.74, + "grad_norm": 13.180686950683594, + "learning_rate": 3.905096962416339e-06, + "loss": 0.3813, + "step": 10137 + }, + { + "epoch": 1.74, + "grad_norm": 8.596451759338379, + "learning_rate": 3.902522738973743e-06, + "loss": 0.3386, + "step": 10138 + }, + { + "epoch": 1.74, + "grad_norm": 10.376317977905273, + "learning_rate": 3.899948515531148e-06, + "loss": 0.4449, + "step": 10139 + }, + { + "epoch": 1.74, + "grad_norm": 8.76968002319336, + "learning_rate": 3.8973742920885536e-06, + "loss": 0.1939, + "step": 10140 + }, + { + "epoch": 1.74, + "grad_norm": 17.617429733276367, + "learning_rate": 3.8948000686459585e-06, + "loss": 0.511, + "step": 10141 + }, + { + "epoch": 1.74, + "grad_norm": 10.86064624786377, + "learning_rate": 3.892225845203364e-06, + "loss": 0.3824, + "step": 10142 + }, + { + "epoch": 1.74, + "grad_norm": 7.791238784790039, + "learning_rate": 3.8896516217607685e-06, + "loss": 0.3047, + "step": 10143 + }, + { + "epoch": 1.74, + "grad_norm": 9.900047302246094, + "learning_rate": 3.8870773983181735e-06, + "loss": 0.2892, + "step": 10144 + }, + { + "epoch": 1.74, + "grad_norm": 9.590638160705566, + "learning_rate": 3.884503174875579e-06, + "loss": 0.3112, + "step": 10145 + }, + { + "epoch": 1.74, + "grad_norm": 9.809215545654297, + "learning_rate": 3.881928951432984e-06, + "loss": 0.3487, + "step": 10146 + }, + { + "epoch": 1.74, + "grad_norm": 10.319319725036621, + "learning_rate": 3.87935472799039e-06, + "loss": 0.5492, + "step": 10147 + }, + { + "epoch": 1.74, + "grad_norm": 8.404623031616211, + "learning_rate": 3.876780504547794e-06, + "loss": 0.3197, + "step": 10148 + }, + { + "epoch": 1.74, + "grad_norm": 10.93860149383545, + "learning_rate": 3.8742062811052e-06, + "loss": 0.2761, + "step": 10149 + }, + { + "epoch": 1.74, + "grad_norm": 12.618767738342285, + "learning_rate": 3.871632057662605e-06, + "loss": 0.6052, + "step": 10150 + }, + { + "epoch": 1.74, + "grad_norm": 7.340433597564697, + "learning_rate": 3.86905783422001e-06, + "loss": 0.2744, + "step": 10151 + }, + { + "epoch": 1.74, + "grad_norm": 14.260811805725098, + "learning_rate": 3.866483610777416e-06, + "loss": 0.606, + "step": 10152 + }, + { + "epoch": 1.74, + "grad_norm": 13.70333194732666, + "learning_rate": 3.863909387334821e-06, + "loss": 0.2935, + "step": 10153 + }, + { + "epoch": 1.74, + "grad_norm": 10.234350204467773, + "learning_rate": 3.861335163892226e-06, + "loss": 0.3622, + "step": 10154 + }, + { + "epoch": 1.74, + "grad_norm": 11.689064025878906, + "learning_rate": 3.858760940449631e-06, + "loss": 0.5567, + "step": 10155 + }, + { + "epoch": 1.74, + "grad_norm": 8.362519264221191, + "learning_rate": 3.856186717007037e-06, + "loss": 0.3005, + "step": 10156 + }, + { + "epoch": 1.74, + "grad_norm": 12.870559692382812, + "learning_rate": 3.853612493564442e-06, + "loss": 0.3534, + "step": 10157 + }, + { + "epoch": 1.74, + "grad_norm": 9.29106616973877, + "learning_rate": 3.851038270121847e-06, + "loss": 0.2802, + "step": 10158 + }, + { + "epoch": 1.74, + "grad_norm": 10.681184768676758, + "learning_rate": 3.848464046679252e-06, + "loss": 0.3886, + "step": 10159 + }, + { + "epoch": 1.74, + "grad_norm": 10.985565185546875, + "learning_rate": 3.845889823236657e-06, + "loss": 0.4138, + "step": 10160 + }, + { + "epoch": 1.74, + "grad_norm": 13.857745170593262, + "learning_rate": 3.8433155997940625e-06, + "loss": 0.3003, + "step": 10161 + }, + { + "epoch": 1.74, + "grad_norm": 11.65396785736084, + "learning_rate": 3.8407413763514675e-06, + "loss": 0.4674, + "step": 10162 + }, + { + "epoch": 1.74, + "grad_norm": 12.1290864944458, + "learning_rate": 3.838167152908873e-06, + "loss": 0.4457, + "step": 10163 + }, + { + "epoch": 1.74, + "grad_norm": 10.762251853942871, + "learning_rate": 3.8355929294662775e-06, + "loss": 0.3151, + "step": 10164 + }, + { + "epoch": 1.74, + "grad_norm": 9.683486938476562, + "learning_rate": 3.8330187060236825e-06, + "loss": 0.4095, + "step": 10165 + }, + { + "epoch": 1.74, + "grad_norm": 8.60052490234375, + "learning_rate": 3.830444482581088e-06, + "loss": 0.5007, + "step": 10166 + }, + { + "epoch": 1.74, + "grad_norm": 8.125239372253418, + "learning_rate": 3.827870259138493e-06, + "loss": 0.3678, + "step": 10167 + }, + { + "epoch": 1.74, + "grad_norm": 10.572195053100586, + "learning_rate": 3.825296035695899e-06, + "loss": 0.383, + "step": 10168 + }, + { + "epoch": 1.75, + "grad_norm": 12.062673568725586, + "learning_rate": 3.822721812253303e-06, + "loss": 0.3307, + "step": 10169 + }, + { + "epoch": 1.75, + "grad_norm": 8.542654991149902, + "learning_rate": 3.820147588810708e-06, + "loss": 0.3823, + "step": 10170 + }, + { + "epoch": 1.75, + "grad_norm": 9.79784870147705, + "learning_rate": 3.817573365368114e-06, + "loss": 0.3434, + "step": 10171 + }, + { + "epoch": 1.75, + "grad_norm": 12.200736045837402, + "learning_rate": 3.814999141925519e-06, + "loss": 0.2691, + "step": 10172 + }, + { + "epoch": 1.75, + "grad_norm": 10.891195297241211, + "learning_rate": 3.8124249184829245e-06, + "loss": 0.3118, + "step": 10173 + }, + { + "epoch": 1.75, + "grad_norm": 10.681863784790039, + "learning_rate": 3.8098506950403295e-06, + "loss": 0.3609, + "step": 10174 + }, + { + "epoch": 1.75, + "grad_norm": 8.811197280883789, + "learning_rate": 3.8072764715977353e-06, + "loss": 0.425, + "step": 10175 + }, + { + "epoch": 1.75, + "grad_norm": 7.526089668273926, + "learning_rate": 3.80470224815514e-06, + "loss": 0.3092, + "step": 10176 + }, + { + "epoch": 1.75, + "grad_norm": 8.435626029968262, + "learning_rate": 3.802128024712545e-06, + "loss": 0.4314, + "step": 10177 + }, + { + "epoch": 1.75, + "grad_norm": 15.051081657409668, + "learning_rate": 3.7995538012699503e-06, + "loss": 0.3363, + "step": 10178 + }, + { + "epoch": 1.75, + "grad_norm": 8.169252395629883, + "learning_rate": 3.7969795778273553e-06, + "loss": 0.3322, + "step": 10179 + }, + { + "epoch": 1.75, + "grad_norm": 8.432186126708984, + "learning_rate": 3.794405354384761e-06, + "loss": 0.3272, + "step": 10180 + }, + { + "epoch": 1.75, + "grad_norm": 9.028060913085938, + "learning_rate": 3.7918311309421657e-06, + "loss": 0.4354, + "step": 10181 + }, + { + "epoch": 1.75, + "grad_norm": 11.405692100524902, + "learning_rate": 3.7892569074995715e-06, + "loss": 0.3692, + "step": 10182 + }, + { + "epoch": 1.75, + "grad_norm": 11.497088432312012, + "learning_rate": 3.786682684056976e-06, + "loss": 0.4336, + "step": 10183 + }, + { + "epoch": 1.75, + "grad_norm": 12.881348609924316, + "learning_rate": 3.784108460614381e-06, + "loss": 0.4159, + "step": 10184 + }, + { + "epoch": 1.75, + "grad_norm": 10.188570022583008, + "learning_rate": 3.781534237171787e-06, + "loss": 0.4665, + "step": 10185 + }, + { + "epoch": 1.75, + "grad_norm": 10.117717742919922, + "learning_rate": 3.7789600137291914e-06, + "loss": 0.3599, + "step": 10186 + }, + { + "epoch": 1.75, + "grad_norm": 9.287288665771484, + "learning_rate": 3.7763857902865973e-06, + "loss": 0.2787, + "step": 10187 + }, + { + "epoch": 1.75, + "grad_norm": 14.764989852905273, + "learning_rate": 3.7738115668440023e-06, + "loss": 0.389, + "step": 10188 + }, + { + "epoch": 1.75, + "grad_norm": 11.528788566589355, + "learning_rate": 3.7712373434014077e-06, + "loss": 0.3383, + "step": 10189 + }, + { + "epoch": 1.75, + "grad_norm": 9.32126235961914, + "learning_rate": 3.7686631199588127e-06, + "loss": 0.2743, + "step": 10190 + }, + { + "epoch": 1.75, + "grad_norm": 15.467133522033691, + "learning_rate": 3.7660888965162172e-06, + "loss": 0.4855, + "step": 10191 + }, + { + "epoch": 1.75, + "grad_norm": 11.054813385009766, + "learning_rate": 3.763514673073623e-06, + "loss": 0.3842, + "step": 10192 + }, + { + "epoch": 1.75, + "grad_norm": 12.813760757446289, + "learning_rate": 3.760940449631028e-06, + "loss": 0.409, + "step": 10193 + }, + { + "epoch": 1.75, + "grad_norm": 11.009858131408691, + "learning_rate": 3.7583662261884335e-06, + "loss": 0.3529, + "step": 10194 + }, + { + "epoch": 1.75, + "grad_norm": 10.135607719421387, + "learning_rate": 3.7557920027458384e-06, + "loss": 0.3987, + "step": 10195 + }, + { + "epoch": 1.75, + "grad_norm": 16.250125885009766, + "learning_rate": 3.753217779303243e-06, + "loss": 0.3077, + "step": 10196 + }, + { + "epoch": 1.75, + "grad_norm": 12.336618423461914, + "learning_rate": 3.750643555860649e-06, + "loss": 0.5505, + "step": 10197 + }, + { + "epoch": 1.75, + "grad_norm": 13.917937278747559, + "learning_rate": 3.7480693324180543e-06, + "loss": 0.3855, + "step": 10198 + }, + { + "epoch": 1.75, + "grad_norm": 8.916064262390137, + "learning_rate": 3.745495108975459e-06, + "loss": 0.2829, + "step": 10199 + }, + { + "epoch": 1.75, + "grad_norm": 10.031928062438965, + "learning_rate": 3.7429208855328642e-06, + "loss": 0.5476, + "step": 10200 + }, + { + "epoch": 1.75, + "grad_norm": 13.900164604187012, + "learning_rate": 3.7403466620902696e-06, + "loss": 0.5107, + "step": 10201 + }, + { + "epoch": 1.75, + "grad_norm": 10.412485122680664, + "learning_rate": 3.7377724386476746e-06, + "loss": 0.3165, + "step": 10202 + }, + { + "epoch": 1.75, + "grad_norm": 12.321499824523926, + "learning_rate": 3.73519821520508e-06, + "loss": 0.5741, + "step": 10203 + }, + { + "epoch": 1.75, + "grad_norm": 7.0417375564575195, + "learning_rate": 3.732623991762485e-06, + "loss": 0.322, + "step": 10204 + }, + { + "epoch": 1.75, + "grad_norm": 9.423901557922363, + "learning_rate": 3.7300497683198904e-06, + "loss": 0.479, + "step": 10205 + }, + { + "epoch": 1.75, + "grad_norm": 9.220063209533691, + "learning_rate": 3.7274755448772954e-06, + "loss": 0.3528, + "step": 10206 + }, + { + "epoch": 1.75, + "grad_norm": 8.971394538879395, + "learning_rate": 3.7249013214347004e-06, + "loss": 0.379, + "step": 10207 + }, + { + "epoch": 1.75, + "grad_norm": 12.72043228149414, + "learning_rate": 3.722327097992106e-06, + "loss": 0.489, + "step": 10208 + }, + { + "epoch": 1.75, + "grad_norm": 13.088316917419434, + "learning_rate": 3.719752874549511e-06, + "loss": 0.437, + "step": 10209 + }, + { + "epoch": 1.75, + "grad_norm": 11.305198669433594, + "learning_rate": 3.7171786511069162e-06, + "loss": 0.5663, + "step": 10210 + }, + { + "epoch": 1.75, + "grad_norm": 8.237088203430176, + "learning_rate": 3.7146044276643216e-06, + "loss": 0.2819, + "step": 10211 + }, + { + "epoch": 1.75, + "grad_norm": 10.895726203918457, + "learning_rate": 3.712030204221726e-06, + "loss": 0.4606, + "step": 10212 + }, + { + "epoch": 1.75, + "grad_norm": 10.748737335205078, + "learning_rate": 3.7094559807791316e-06, + "loss": 0.3876, + "step": 10213 + }, + { + "epoch": 1.75, + "grad_norm": 9.259161949157715, + "learning_rate": 3.706881757336537e-06, + "loss": 0.4186, + "step": 10214 + }, + { + "epoch": 1.75, + "grad_norm": 8.028119087219238, + "learning_rate": 3.704307533893942e-06, + "loss": 0.2588, + "step": 10215 + }, + { + "epoch": 1.75, + "grad_norm": 8.59246826171875, + "learning_rate": 3.7017333104513474e-06, + "loss": 0.3857, + "step": 10216 + }, + { + "epoch": 1.75, + "grad_norm": 10.146876335144043, + "learning_rate": 3.6991590870087524e-06, + "loss": 0.4382, + "step": 10217 + }, + { + "epoch": 1.75, + "grad_norm": 10.523021697998047, + "learning_rate": 3.696584863566158e-06, + "loss": 0.3964, + "step": 10218 + }, + { + "epoch": 1.75, + "grad_norm": 12.233314514160156, + "learning_rate": 3.694010640123563e-06, + "loss": 0.3836, + "step": 10219 + }, + { + "epoch": 1.75, + "grad_norm": 8.950338363647461, + "learning_rate": 3.691436416680968e-06, + "loss": 0.3473, + "step": 10220 + }, + { + "epoch": 1.75, + "grad_norm": 13.352520942687988, + "learning_rate": 3.688862193238373e-06, + "loss": 0.4426, + "step": 10221 + }, + { + "epoch": 1.75, + "grad_norm": 10.945180892944336, + "learning_rate": 3.686287969795778e-06, + "loss": 0.4705, + "step": 10222 + }, + { + "epoch": 1.75, + "grad_norm": 7.934719085693359, + "learning_rate": 3.6837137463531836e-06, + "loss": 0.2289, + "step": 10223 + }, + { + "epoch": 1.75, + "grad_norm": 11.892194747924805, + "learning_rate": 3.681139522910589e-06, + "loss": 0.5061, + "step": 10224 + }, + { + "epoch": 1.75, + "grad_norm": 12.008149147033691, + "learning_rate": 3.6785652994679936e-06, + "loss": 0.4234, + "step": 10225 + }, + { + "epoch": 1.75, + "grad_norm": 13.622702598571777, + "learning_rate": 3.675991076025399e-06, + "loss": 0.5187, + "step": 10226 + }, + { + "epoch": 1.76, + "grad_norm": 12.735066413879395, + "learning_rate": 3.6734168525828044e-06, + "loss": 0.3654, + "step": 10227 + }, + { + "epoch": 1.76, + "grad_norm": 9.49288558959961, + "learning_rate": 3.6708426291402094e-06, + "loss": 0.3758, + "step": 10228 + }, + { + "epoch": 1.76, + "grad_norm": 10.763015747070312, + "learning_rate": 3.668268405697615e-06, + "loss": 0.4298, + "step": 10229 + }, + { + "epoch": 1.76, + "grad_norm": 8.570923805236816, + "learning_rate": 3.6656941822550198e-06, + "loss": 0.2124, + "step": 10230 + }, + { + "epoch": 1.76, + "grad_norm": 12.82702922821045, + "learning_rate": 3.663119958812425e-06, + "loss": 0.3749, + "step": 10231 + }, + { + "epoch": 1.76, + "grad_norm": 8.77657699584961, + "learning_rate": 3.66054573536983e-06, + "loss": 0.2517, + "step": 10232 + }, + { + "epoch": 1.76, + "grad_norm": 10.193717002868652, + "learning_rate": 3.657971511927235e-06, + "loss": 0.4446, + "step": 10233 + }, + { + "epoch": 1.76, + "grad_norm": 6.4744038581848145, + "learning_rate": 3.6553972884846406e-06, + "loss": 0.1988, + "step": 10234 + }, + { + "epoch": 1.76, + "grad_norm": 8.41050910949707, + "learning_rate": 3.6528230650420456e-06, + "loss": 0.3939, + "step": 10235 + }, + { + "epoch": 1.76, + "grad_norm": 11.848515510559082, + "learning_rate": 3.650248841599451e-06, + "loss": 0.4534, + "step": 10236 + }, + { + "epoch": 1.76, + "grad_norm": 12.180014610290527, + "learning_rate": 3.6476746181568564e-06, + "loss": 0.2749, + "step": 10237 + }, + { + "epoch": 1.76, + "grad_norm": 18.61145782470703, + "learning_rate": 3.645100394714261e-06, + "loss": 0.3995, + "step": 10238 + }, + { + "epoch": 1.76, + "grad_norm": 9.235429763793945, + "learning_rate": 3.6425261712716664e-06, + "loss": 0.3171, + "step": 10239 + }, + { + "epoch": 1.76, + "grad_norm": 10.68899154663086, + "learning_rate": 3.6399519478290718e-06, + "loss": 0.4255, + "step": 10240 + }, + { + "epoch": 1.76, + "grad_norm": 9.805556297302246, + "learning_rate": 3.6373777243864768e-06, + "loss": 0.4189, + "step": 10241 + }, + { + "epoch": 1.76, + "grad_norm": 10.117131233215332, + "learning_rate": 3.634803500943882e-06, + "loss": 0.5465, + "step": 10242 + }, + { + "epoch": 1.76, + "grad_norm": 12.267644882202148, + "learning_rate": 3.632229277501287e-06, + "loss": 0.4355, + "step": 10243 + }, + { + "epoch": 1.76, + "grad_norm": 10.417571067810059, + "learning_rate": 3.6296550540586926e-06, + "loss": 0.3252, + "step": 10244 + }, + { + "epoch": 1.76, + "grad_norm": 8.063173294067383, + "learning_rate": 3.6270808306160976e-06, + "loss": 0.2651, + "step": 10245 + }, + { + "epoch": 1.76, + "grad_norm": 8.665863990783691, + "learning_rate": 3.6245066071735025e-06, + "loss": 0.3893, + "step": 10246 + }, + { + "epoch": 1.76, + "grad_norm": 13.454741477966309, + "learning_rate": 3.621932383730908e-06, + "loss": 0.6182, + "step": 10247 + }, + { + "epoch": 1.76, + "grad_norm": 11.875772476196289, + "learning_rate": 3.619358160288313e-06, + "loss": 0.4422, + "step": 10248 + }, + { + "epoch": 1.76, + "grad_norm": 8.632668495178223, + "learning_rate": 3.6167839368457184e-06, + "loss": 0.3494, + "step": 10249 + }, + { + "epoch": 1.76, + "grad_norm": 8.414258003234863, + "learning_rate": 3.6142097134031238e-06, + "loss": 0.1634, + "step": 10250 + }, + { + "epoch": 1.76, + "grad_norm": 13.760488510131836, + "learning_rate": 3.6116354899605283e-06, + "loss": 0.3774, + "step": 10251 + }, + { + "epoch": 1.76, + "grad_norm": 9.845131874084473, + "learning_rate": 3.6090612665179337e-06, + "loss": 0.3897, + "step": 10252 + }, + { + "epoch": 1.76, + "grad_norm": 8.781501770019531, + "learning_rate": 3.606487043075339e-06, + "loss": 0.3187, + "step": 10253 + }, + { + "epoch": 1.76, + "grad_norm": 13.929583549499512, + "learning_rate": 3.603912819632744e-06, + "loss": 0.469, + "step": 10254 + }, + { + "epoch": 1.76, + "grad_norm": 11.2886323928833, + "learning_rate": 3.6013385961901495e-06, + "loss": 0.2773, + "step": 10255 + }, + { + "epoch": 1.76, + "grad_norm": 7.797028541564941, + "learning_rate": 3.5987643727475545e-06, + "loss": 0.2498, + "step": 10256 + }, + { + "epoch": 1.76, + "grad_norm": 11.603740692138672, + "learning_rate": 3.59619014930496e-06, + "loss": 0.4764, + "step": 10257 + }, + { + "epoch": 1.76, + "grad_norm": 10.143132209777832, + "learning_rate": 3.593615925862365e-06, + "loss": 0.2996, + "step": 10258 + }, + { + "epoch": 1.76, + "grad_norm": 13.65654182434082, + "learning_rate": 3.59104170241977e-06, + "loss": 0.2893, + "step": 10259 + }, + { + "epoch": 1.76, + "grad_norm": 8.181256294250488, + "learning_rate": 3.5884674789771753e-06, + "loss": 0.3335, + "step": 10260 + }, + { + "epoch": 1.76, + "grad_norm": 7.574747085571289, + "learning_rate": 3.5858932555345803e-06, + "loss": 0.3222, + "step": 10261 + }, + { + "epoch": 1.76, + "grad_norm": 10.166397094726562, + "learning_rate": 3.5833190320919857e-06, + "loss": 0.3921, + "step": 10262 + }, + { + "epoch": 1.76, + "grad_norm": 9.948464393615723, + "learning_rate": 3.580744808649391e-06, + "loss": 0.3042, + "step": 10263 + }, + { + "epoch": 1.76, + "grad_norm": 9.119213104248047, + "learning_rate": 3.5781705852067957e-06, + "loss": 0.2746, + "step": 10264 + }, + { + "epoch": 1.76, + "grad_norm": 10.38940143585205, + "learning_rate": 3.575596361764201e-06, + "loss": 0.5122, + "step": 10265 + }, + { + "epoch": 1.76, + "grad_norm": 7.604372024536133, + "learning_rate": 3.5730221383216065e-06, + "loss": 0.3038, + "step": 10266 + }, + { + "epoch": 1.76, + "grad_norm": 7.439919948577881, + "learning_rate": 3.5704479148790115e-06, + "loss": 0.2691, + "step": 10267 + }, + { + "epoch": 1.76, + "grad_norm": 9.319008827209473, + "learning_rate": 3.567873691436417e-06, + "loss": 0.4428, + "step": 10268 + }, + { + "epoch": 1.76, + "grad_norm": 11.040205955505371, + "learning_rate": 3.565299467993822e-06, + "loss": 0.2964, + "step": 10269 + }, + { + "epoch": 1.76, + "grad_norm": 9.960899353027344, + "learning_rate": 3.5627252445512273e-06, + "loss": 0.3532, + "step": 10270 + }, + { + "epoch": 1.76, + "grad_norm": 9.433367729187012, + "learning_rate": 3.5601510211086323e-06, + "loss": 0.3188, + "step": 10271 + }, + { + "epoch": 1.76, + "grad_norm": 11.964232444763184, + "learning_rate": 3.5575767976660373e-06, + "loss": 0.5936, + "step": 10272 + }, + { + "epoch": 1.76, + "grad_norm": 10.903236389160156, + "learning_rate": 3.5550025742234427e-06, + "loss": 0.5645, + "step": 10273 + }, + { + "epoch": 1.76, + "grad_norm": 14.747243881225586, + "learning_rate": 3.5524283507808477e-06, + "loss": 0.6095, + "step": 10274 + }, + { + "epoch": 1.76, + "grad_norm": 7.366724491119385, + "learning_rate": 3.549854127338253e-06, + "loss": 0.2494, + "step": 10275 + }, + { + "epoch": 1.76, + "grad_norm": 15.175317764282227, + "learning_rate": 3.5472799038956585e-06, + "loss": 0.4956, + "step": 10276 + }, + { + "epoch": 1.76, + "grad_norm": 12.862730979919434, + "learning_rate": 3.544705680453063e-06, + "loss": 0.3139, + "step": 10277 + }, + { + "epoch": 1.76, + "grad_norm": 8.148316383361816, + "learning_rate": 3.5421314570104685e-06, + "loss": 0.3412, + "step": 10278 + }, + { + "epoch": 1.76, + "grad_norm": 10.67927360534668, + "learning_rate": 3.539557233567874e-06, + "loss": 0.4003, + "step": 10279 + }, + { + "epoch": 1.76, + "grad_norm": 10.142306327819824, + "learning_rate": 3.536983010125279e-06, + "loss": 0.4048, + "step": 10280 + }, + { + "epoch": 1.76, + "grad_norm": 14.645539283752441, + "learning_rate": 3.5344087866826843e-06, + "loss": 0.3232, + "step": 10281 + }, + { + "epoch": 1.76, + "grad_norm": 11.56098461151123, + "learning_rate": 3.5318345632400893e-06, + "loss": 0.3882, + "step": 10282 + }, + { + "epoch": 1.76, + "grad_norm": 8.415877342224121, + "learning_rate": 3.5292603397974947e-06, + "loss": 0.2145, + "step": 10283 + }, + { + "epoch": 1.76, + "grad_norm": 12.05709171295166, + "learning_rate": 3.5266861163548997e-06, + "loss": 0.4451, + "step": 10284 + }, + { + "epoch": 1.77, + "grad_norm": 13.837220191955566, + "learning_rate": 3.5241118929123047e-06, + "loss": 0.5567, + "step": 10285 + }, + { + "epoch": 1.77, + "grad_norm": 10.209320068359375, + "learning_rate": 3.52153766946971e-06, + "loss": 0.513, + "step": 10286 + }, + { + "epoch": 1.77, + "grad_norm": 8.352028846740723, + "learning_rate": 3.518963446027115e-06, + "loss": 0.3012, + "step": 10287 + }, + { + "epoch": 1.77, + "grad_norm": 11.550464630126953, + "learning_rate": 3.5163892225845205e-06, + "loss": 0.4202, + "step": 10288 + }, + { + "epoch": 1.77, + "grad_norm": 10.208223342895508, + "learning_rate": 3.513814999141926e-06, + "loss": 0.4685, + "step": 10289 + }, + { + "epoch": 1.77, + "grad_norm": 7.388490676879883, + "learning_rate": 3.5112407756993305e-06, + "loss": 0.2887, + "step": 10290 + }, + { + "epoch": 1.77, + "grad_norm": 16.14285659790039, + "learning_rate": 3.508666552256736e-06, + "loss": 0.5275, + "step": 10291 + }, + { + "epoch": 1.77, + "grad_norm": 10.451848030090332, + "learning_rate": 3.5060923288141413e-06, + "loss": 0.6252, + "step": 10292 + }, + { + "epoch": 1.77, + "grad_norm": 10.768813133239746, + "learning_rate": 3.5035181053715463e-06, + "loss": 0.3542, + "step": 10293 + }, + { + "epoch": 1.77, + "grad_norm": 8.399510383605957, + "learning_rate": 3.5009438819289517e-06, + "loss": 0.4854, + "step": 10294 + }, + { + "epoch": 1.77, + "grad_norm": 8.238245964050293, + "learning_rate": 3.4983696584863567e-06, + "loss": 0.3979, + "step": 10295 + }, + { + "epoch": 1.77, + "grad_norm": 8.908125877380371, + "learning_rate": 3.495795435043762e-06, + "loss": 0.4981, + "step": 10296 + }, + { + "epoch": 1.77, + "grad_norm": 9.450529098510742, + "learning_rate": 3.493221211601167e-06, + "loss": 0.3135, + "step": 10297 + }, + { + "epoch": 1.77, + "grad_norm": 7.324727535247803, + "learning_rate": 3.490646988158572e-06, + "loss": 0.2525, + "step": 10298 + }, + { + "epoch": 1.77, + "grad_norm": 7.562472820281982, + "learning_rate": 3.4880727647159775e-06, + "loss": 0.263, + "step": 10299 + }, + { + "epoch": 1.77, + "grad_norm": 9.205446243286133, + "learning_rate": 3.4854985412733824e-06, + "loss": 0.301, + "step": 10300 + }, + { + "epoch": 1.77, + "grad_norm": 13.021312713623047, + "learning_rate": 3.482924317830788e-06, + "loss": 0.463, + "step": 10301 + }, + { + "epoch": 1.77, + "grad_norm": 11.65982437133789, + "learning_rate": 3.4803500943881933e-06, + "loss": 0.4438, + "step": 10302 + }, + { + "epoch": 1.77, + "grad_norm": 8.424298286437988, + "learning_rate": 3.477775870945598e-06, + "loss": 0.2931, + "step": 10303 + }, + { + "epoch": 1.77, + "grad_norm": 8.964347839355469, + "learning_rate": 3.4752016475030032e-06, + "loss": 0.3709, + "step": 10304 + }, + { + "epoch": 1.77, + "grad_norm": 12.874922752380371, + "learning_rate": 3.4726274240604087e-06, + "loss": 0.3808, + "step": 10305 + }, + { + "epoch": 1.77, + "grad_norm": 12.361870765686035, + "learning_rate": 3.4700532006178136e-06, + "loss": 0.4141, + "step": 10306 + }, + { + "epoch": 1.77, + "grad_norm": 13.834250450134277, + "learning_rate": 3.467478977175219e-06, + "loss": 0.5122, + "step": 10307 + }, + { + "epoch": 1.77, + "grad_norm": 11.968622207641602, + "learning_rate": 3.464904753732624e-06, + "loss": 0.3735, + "step": 10308 + }, + { + "epoch": 1.77, + "grad_norm": 10.21124267578125, + "learning_rate": 3.4623305302900295e-06, + "loss": 0.2812, + "step": 10309 + }, + { + "epoch": 1.77, + "grad_norm": 11.070713996887207, + "learning_rate": 3.4597563068474344e-06, + "loss": 0.3327, + "step": 10310 + }, + { + "epoch": 1.77, + "grad_norm": 12.503514289855957, + "learning_rate": 3.4571820834048394e-06, + "loss": 0.4168, + "step": 10311 + }, + { + "epoch": 1.77, + "grad_norm": 9.725349426269531, + "learning_rate": 3.454607859962245e-06, + "loss": 0.33, + "step": 10312 + }, + { + "epoch": 1.77, + "grad_norm": 13.072580337524414, + "learning_rate": 3.4520336365196502e-06, + "loss": 0.3008, + "step": 10313 + }, + { + "epoch": 1.77, + "grad_norm": 12.163908004760742, + "learning_rate": 3.4494594130770552e-06, + "loss": 0.4394, + "step": 10314 + }, + { + "epoch": 1.77, + "grad_norm": 11.541789054870605, + "learning_rate": 3.4468851896344606e-06, + "loss": 0.5652, + "step": 10315 + }, + { + "epoch": 1.77, + "grad_norm": 9.008191108703613, + "learning_rate": 3.444310966191865e-06, + "loss": 0.3386, + "step": 10316 + }, + { + "epoch": 1.77, + "grad_norm": 12.454290390014648, + "learning_rate": 3.4417367427492706e-06, + "loss": 0.5166, + "step": 10317 + }, + { + "epoch": 1.77, + "grad_norm": 10.802310943603516, + "learning_rate": 3.439162519306676e-06, + "loss": 0.3082, + "step": 10318 + }, + { + "epoch": 1.77, + "grad_norm": 10.244376182556152, + "learning_rate": 3.436588295864081e-06, + "loss": 0.3266, + "step": 10319 + }, + { + "epoch": 1.77, + "grad_norm": 14.181434631347656, + "learning_rate": 3.4340140724214864e-06, + "loss": 0.4626, + "step": 10320 + }, + { + "epoch": 1.77, + "grad_norm": 7.8023176193237305, + "learning_rate": 3.4314398489788914e-06, + "loss": 0.3633, + "step": 10321 + }, + { + "epoch": 1.77, + "grad_norm": 13.527910232543945, + "learning_rate": 3.428865625536297e-06, + "loss": 0.4064, + "step": 10322 + }, + { + "epoch": 1.77, + "grad_norm": 9.953227043151855, + "learning_rate": 3.426291402093702e-06, + "loss": 0.2656, + "step": 10323 + }, + { + "epoch": 1.77, + "grad_norm": 11.34505558013916, + "learning_rate": 3.423717178651107e-06, + "loss": 0.4373, + "step": 10324 + }, + { + "epoch": 1.77, + "grad_norm": 13.77633285522461, + "learning_rate": 3.4211429552085122e-06, + "loss": 0.3254, + "step": 10325 + }, + { + "epoch": 1.77, + "grad_norm": 9.321101188659668, + "learning_rate": 3.4185687317659176e-06, + "loss": 0.4738, + "step": 10326 + }, + { + "epoch": 1.77, + "grad_norm": 9.482454299926758, + "learning_rate": 3.4159945083233226e-06, + "loss": 0.3438, + "step": 10327 + }, + { + "epoch": 1.77, + "grad_norm": 11.79236125946045, + "learning_rate": 3.413420284880728e-06, + "loss": 0.2604, + "step": 10328 + }, + { + "epoch": 1.77, + "grad_norm": 12.345510482788086, + "learning_rate": 3.4108460614381326e-06, + "loss": 0.5073, + "step": 10329 + }, + { + "epoch": 1.77, + "grad_norm": 7.138515472412109, + "learning_rate": 3.408271837995538e-06, + "loss": 0.3654, + "step": 10330 + }, + { + "epoch": 1.77, + "grad_norm": 7.56738805770874, + "learning_rate": 3.4056976145529434e-06, + "loss": 0.2656, + "step": 10331 + }, + { + "epoch": 1.77, + "grad_norm": 10.79387378692627, + "learning_rate": 3.4031233911103484e-06, + "loss": 0.4011, + "step": 10332 + }, + { + "epoch": 1.77, + "grad_norm": 8.253780364990234, + "learning_rate": 3.400549167667754e-06, + "loss": 0.3602, + "step": 10333 + }, + { + "epoch": 1.77, + "grad_norm": 13.453286170959473, + "learning_rate": 3.397974944225159e-06, + "loss": 0.3837, + "step": 10334 + }, + { + "epoch": 1.77, + "grad_norm": 13.63301944732666, + "learning_rate": 3.3954007207825638e-06, + "loss": 0.3533, + "step": 10335 + }, + { + "epoch": 1.77, + "grad_norm": 8.314065933227539, + "learning_rate": 3.392826497339969e-06, + "loss": 0.2284, + "step": 10336 + }, + { + "epoch": 1.77, + "grad_norm": 8.389789581298828, + "learning_rate": 3.390252273897374e-06, + "loss": 0.2565, + "step": 10337 + }, + { + "epoch": 1.77, + "grad_norm": 6.987936019897461, + "learning_rate": 3.3876780504547796e-06, + "loss": 0.197, + "step": 10338 + }, + { + "epoch": 1.77, + "grad_norm": 12.762155532836914, + "learning_rate": 3.385103827012185e-06, + "loss": 0.3853, + "step": 10339 + }, + { + "epoch": 1.77, + "grad_norm": 8.206766128540039, + "learning_rate": 3.38252960356959e-06, + "loss": 0.3187, + "step": 10340 + }, + { + "epoch": 1.77, + "grad_norm": 12.465702056884766, + "learning_rate": 3.3799553801269954e-06, + "loss": 0.527, + "step": 10341 + }, + { + "epoch": 1.77, + "grad_norm": 11.702652931213379, + "learning_rate": 3.3773811566844e-06, + "loss": 0.3219, + "step": 10342 + }, + { + "epoch": 1.78, + "grad_norm": 9.759143829345703, + "learning_rate": 3.3748069332418054e-06, + "loss": 0.4821, + "step": 10343 + }, + { + "epoch": 1.78, + "grad_norm": 13.18113899230957, + "learning_rate": 3.3722327097992108e-06, + "loss": 0.3623, + "step": 10344 + }, + { + "epoch": 1.78, + "grad_norm": 10.89564323425293, + "learning_rate": 3.3696584863566158e-06, + "loss": 0.4238, + "step": 10345 + }, + { + "epoch": 1.78, + "grad_norm": 12.624650955200195, + "learning_rate": 3.367084262914021e-06, + "loss": 0.3402, + "step": 10346 + }, + { + "epoch": 1.78, + "grad_norm": 8.608189582824707, + "learning_rate": 3.364510039471426e-06, + "loss": 0.4109, + "step": 10347 + }, + { + "epoch": 1.78, + "grad_norm": 12.358651161193848, + "learning_rate": 3.361935816028831e-06, + "loss": 0.3574, + "step": 10348 + }, + { + "epoch": 1.78, + "grad_norm": 9.220965385437012, + "learning_rate": 3.3593615925862366e-06, + "loss": 0.3318, + "step": 10349 + }, + { + "epoch": 1.78, + "grad_norm": 13.647798538208008, + "learning_rate": 3.3567873691436416e-06, + "loss": 0.3023, + "step": 10350 + }, + { + "epoch": 1.78, + "grad_norm": 9.34692668914795, + "learning_rate": 3.354213145701047e-06, + "loss": 0.3374, + "step": 10351 + }, + { + "epoch": 1.78, + "grad_norm": 10.064382553100586, + "learning_rate": 3.3516389222584524e-06, + "loss": 0.2876, + "step": 10352 + }, + { + "epoch": 1.78, + "grad_norm": 11.457650184631348, + "learning_rate": 3.3490646988158574e-06, + "loss": 0.3867, + "step": 10353 + }, + { + "epoch": 1.78, + "grad_norm": 12.781315803527832, + "learning_rate": 3.3464904753732628e-06, + "loss": 0.4221, + "step": 10354 + }, + { + "epoch": 1.78, + "grad_norm": 9.486967086791992, + "learning_rate": 3.3439162519306673e-06, + "loss": 0.3239, + "step": 10355 + }, + { + "epoch": 1.78, + "grad_norm": 12.00626277923584, + "learning_rate": 3.3413420284880728e-06, + "loss": 0.3696, + "step": 10356 + }, + { + "epoch": 1.78, + "grad_norm": 9.245110511779785, + "learning_rate": 3.338767805045478e-06, + "loss": 0.3123, + "step": 10357 + }, + { + "epoch": 1.78, + "grad_norm": 13.767106056213379, + "learning_rate": 3.336193581602883e-06, + "loss": 0.4389, + "step": 10358 + }, + { + "epoch": 1.78, + "grad_norm": 10.81732177734375, + "learning_rate": 3.3336193581602886e-06, + "loss": 0.4039, + "step": 10359 + }, + { + "epoch": 1.78, + "grad_norm": 8.436320304870605, + "learning_rate": 3.3310451347176935e-06, + "loss": 0.3122, + "step": 10360 + }, + { + "epoch": 1.78, + "grad_norm": 9.913230895996094, + "learning_rate": 3.3284709112750985e-06, + "loss": 0.3536, + "step": 10361 + }, + { + "epoch": 1.78, + "grad_norm": 13.325385093688965, + "learning_rate": 3.325896687832504e-06, + "loss": 0.3868, + "step": 10362 + }, + { + "epoch": 1.78, + "grad_norm": 10.295690536499023, + "learning_rate": 3.323322464389909e-06, + "loss": 0.4019, + "step": 10363 + }, + { + "epoch": 1.78, + "grad_norm": 10.48462200164795, + "learning_rate": 3.3207482409473143e-06, + "loss": 0.3442, + "step": 10364 + }, + { + "epoch": 1.78, + "grad_norm": 10.8226318359375, + "learning_rate": 3.3181740175047198e-06, + "loss": 0.3471, + "step": 10365 + }, + { + "epoch": 1.78, + "grad_norm": 11.76678466796875, + "learning_rate": 3.3155997940621247e-06, + "loss": 0.3863, + "step": 10366 + }, + { + "epoch": 1.78, + "grad_norm": 7.450419902801514, + "learning_rate": 3.31302557061953e-06, + "loss": 0.3797, + "step": 10367 + }, + { + "epoch": 1.78, + "grad_norm": 13.025487899780273, + "learning_rate": 3.3104513471769347e-06, + "loss": 0.4222, + "step": 10368 + }, + { + "epoch": 1.78, + "grad_norm": 12.639636039733887, + "learning_rate": 3.30787712373434e-06, + "loss": 0.3518, + "step": 10369 + }, + { + "epoch": 1.78, + "grad_norm": 8.625659942626953, + "learning_rate": 3.3053029002917455e-06, + "loss": 0.3278, + "step": 10370 + }, + { + "epoch": 1.78, + "grad_norm": 11.003331184387207, + "learning_rate": 3.3027286768491505e-06, + "loss": 0.3522, + "step": 10371 + }, + { + "epoch": 1.78, + "grad_norm": 7.612837314605713, + "learning_rate": 3.300154453406556e-06, + "loss": 0.2022, + "step": 10372 + }, + { + "epoch": 1.78, + "grad_norm": 10.15909481048584, + "learning_rate": 3.297580229963961e-06, + "loss": 0.4, + "step": 10373 + }, + { + "epoch": 1.78, + "grad_norm": 10.829818725585938, + "learning_rate": 3.295006006521366e-06, + "loss": 0.3181, + "step": 10374 + }, + { + "epoch": 1.78, + "grad_norm": 9.90761661529541, + "learning_rate": 3.2924317830787713e-06, + "loss": 0.3898, + "step": 10375 + }, + { + "epoch": 1.78, + "grad_norm": 13.742633819580078, + "learning_rate": 3.2898575596361763e-06, + "loss": 0.519, + "step": 10376 + }, + { + "epoch": 1.78, + "grad_norm": 12.075239181518555, + "learning_rate": 3.2872833361935817e-06, + "loss": 0.4766, + "step": 10377 + }, + { + "epoch": 1.78, + "grad_norm": 8.430364608764648, + "learning_rate": 3.284709112750987e-06, + "loss": 0.4269, + "step": 10378 + }, + { + "epoch": 1.78, + "grad_norm": 15.02288818359375, + "learning_rate": 3.282134889308392e-06, + "loss": 0.3027, + "step": 10379 + }, + { + "epoch": 1.78, + "grad_norm": 10.890666007995605, + "learning_rate": 3.2795606658657975e-06, + "loss": 0.2773, + "step": 10380 + }, + { + "epoch": 1.78, + "grad_norm": 13.060396194458008, + "learning_rate": 3.276986442423202e-06, + "loss": 0.4201, + "step": 10381 + }, + { + "epoch": 1.78, + "grad_norm": 13.831722259521484, + "learning_rate": 3.2744122189806075e-06, + "loss": 0.4713, + "step": 10382 + }, + { + "epoch": 1.78, + "grad_norm": 9.686012268066406, + "learning_rate": 3.271837995538013e-06, + "loss": 0.2566, + "step": 10383 + }, + { + "epoch": 1.78, + "grad_norm": 15.444820404052734, + "learning_rate": 3.269263772095418e-06, + "loss": 0.5268, + "step": 10384 + }, + { + "epoch": 1.78, + "grad_norm": 8.96072006225586, + "learning_rate": 3.2666895486528233e-06, + "loss": 0.423, + "step": 10385 + }, + { + "epoch": 1.78, + "grad_norm": 11.017065048217773, + "learning_rate": 3.2641153252102283e-06, + "loss": 0.5799, + "step": 10386 + }, + { + "epoch": 1.78, + "grad_norm": 6.117496013641357, + "learning_rate": 3.2615411017676333e-06, + "loss": 0.232, + "step": 10387 + }, + { + "epoch": 1.78, + "grad_norm": 8.22362995147705, + "learning_rate": 3.2589668783250387e-06, + "loss": 0.3084, + "step": 10388 + }, + { + "epoch": 1.78, + "grad_norm": 8.4487943649292, + "learning_rate": 3.2563926548824437e-06, + "loss": 0.2844, + "step": 10389 + }, + { + "epoch": 1.78, + "grad_norm": 8.250666618347168, + "learning_rate": 3.253818431439849e-06, + "loss": 0.3653, + "step": 10390 + }, + { + "epoch": 1.78, + "grad_norm": 8.97622299194336, + "learning_rate": 3.2512442079972545e-06, + "loss": 0.4187, + "step": 10391 + }, + { + "epoch": 1.78, + "grad_norm": 9.81460952758789, + "learning_rate": 3.2486699845546595e-06, + "loss": 0.3936, + "step": 10392 + }, + { + "epoch": 1.78, + "grad_norm": 12.792677879333496, + "learning_rate": 3.246095761112065e-06, + "loss": 0.5237, + "step": 10393 + }, + { + "epoch": 1.78, + "grad_norm": 12.203180313110352, + "learning_rate": 3.2435215376694695e-06, + "loss": 0.2558, + "step": 10394 + }, + { + "epoch": 1.78, + "grad_norm": 14.556968688964844, + "learning_rate": 3.240947314226875e-06, + "loss": 0.4362, + "step": 10395 + }, + { + "epoch": 1.78, + "grad_norm": 13.968944549560547, + "learning_rate": 3.2383730907842803e-06, + "loss": 0.511, + "step": 10396 + }, + { + "epoch": 1.78, + "grad_norm": 9.981884002685547, + "learning_rate": 3.2357988673416853e-06, + "loss": 0.4566, + "step": 10397 + }, + { + "epoch": 1.78, + "grad_norm": 8.601639747619629, + "learning_rate": 3.2332246438990907e-06, + "loss": 0.3828, + "step": 10398 + }, + { + "epoch": 1.78, + "grad_norm": 9.621521949768066, + "learning_rate": 3.2306504204564957e-06, + "loss": 0.3328, + "step": 10399 + }, + { + "epoch": 1.78, + "grad_norm": 11.16479778289795, + "learning_rate": 3.2280761970139007e-06, + "loss": 0.4389, + "step": 10400 + }, + { + "epoch": 1.78, + "grad_norm": 10.212334632873535, + "learning_rate": 3.225501973571306e-06, + "loss": 0.3302, + "step": 10401 + }, + { + "epoch": 1.79, + "grad_norm": 10.324894905090332, + "learning_rate": 3.222927750128711e-06, + "loss": 0.4328, + "step": 10402 + }, + { + "epoch": 1.79, + "grad_norm": 7.898311138153076, + "learning_rate": 3.2203535266861165e-06, + "loss": 0.3092, + "step": 10403 + }, + { + "epoch": 1.79, + "grad_norm": 12.091353416442871, + "learning_rate": 3.217779303243522e-06, + "loss": 0.4459, + "step": 10404 + }, + { + "epoch": 1.79, + "grad_norm": 7.136836528778076, + "learning_rate": 3.215205079800927e-06, + "loss": 0.277, + "step": 10405 + }, + { + "epoch": 1.79, + "grad_norm": 17.946653366088867, + "learning_rate": 3.2126308563583323e-06, + "loss": 0.3875, + "step": 10406 + }, + { + "epoch": 1.79, + "grad_norm": 15.466854095458984, + "learning_rate": 3.210056632915737e-06, + "loss": 0.4126, + "step": 10407 + }, + { + "epoch": 1.79, + "grad_norm": 16.457801818847656, + "learning_rate": 3.2074824094731423e-06, + "loss": 0.4534, + "step": 10408 + }, + { + "epoch": 1.79, + "grad_norm": 12.460442543029785, + "learning_rate": 3.2049081860305477e-06, + "loss": 0.4763, + "step": 10409 + }, + { + "epoch": 1.79, + "grad_norm": 14.811309814453125, + "learning_rate": 3.2023339625879527e-06, + "loss": 0.4383, + "step": 10410 + }, + { + "epoch": 1.79, + "grad_norm": 11.980772018432617, + "learning_rate": 3.199759739145358e-06, + "loss": 0.4323, + "step": 10411 + }, + { + "epoch": 1.79, + "grad_norm": 12.63910961151123, + "learning_rate": 3.197185515702763e-06, + "loss": 0.3878, + "step": 10412 + }, + { + "epoch": 1.79, + "grad_norm": 8.61581802368164, + "learning_rate": 3.194611292260168e-06, + "loss": 0.414, + "step": 10413 + }, + { + "epoch": 1.79, + "grad_norm": 11.028787612915039, + "learning_rate": 3.1920370688175735e-06, + "loss": 0.3732, + "step": 10414 + }, + { + "epoch": 1.79, + "grad_norm": 11.4498291015625, + "learning_rate": 3.1894628453749784e-06, + "loss": 0.4567, + "step": 10415 + }, + { + "epoch": 1.79, + "grad_norm": 9.622383117675781, + "learning_rate": 3.186888621932384e-06, + "loss": 0.3014, + "step": 10416 + }, + { + "epoch": 1.79, + "grad_norm": 9.299242973327637, + "learning_rate": 3.1843143984897893e-06, + "loss": 0.3553, + "step": 10417 + }, + { + "epoch": 1.79, + "grad_norm": 12.99066162109375, + "learning_rate": 3.1817401750471942e-06, + "loss": 0.4837, + "step": 10418 + }, + { + "epoch": 1.79, + "grad_norm": 7.404476642608643, + "learning_rate": 3.1791659516045997e-06, + "loss": 0.2378, + "step": 10419 + }, + { + "epoch": 1.79, + "grad_norm": 11.202580451965332, + "learning_rate": 3.1765917281620042e-06, + "loss": 0.3031, + "step": 10420 + }, + { + "epoch": 1.79, + "grad_norm": 13.294025421142578, + "learning_rate": 3.1740175047194096e-06, + "loss": 0.3783, + "step": 10421 + }, + { + "epoch": 1.79, + "grad_norm": 11.027019500732422, + "learning_rate": 3.171443281276815e-06, + "loss": 0.3435, + "step": 10422 + }, + { + "epoch": 1.79, + "grad_norm": 10.20676326751709, + "learning_rate": 3.16886905783422e-06, + "loss": 0.3646, + "step": 10423 + }, + { + "epoch": 1.79, + "grad_norm": 8.397150993347168, + "learning_rate": 3.1662948343916254e-06, + "loss": 0.3074, + "step": 10424 + }, + { + "epoch": 1.79, + "grad_norm": 9.634303092956543, + "learning_rate": 3.1637206109490304e-06, + "loss": 0.4436, + "step": 10425 + }, + { + "epoch": 1.79, + "grad_norm": 9.696736335754395, + "learning_rate": 3.1611463875064354e-06, + "loss": 0.4129, + "step": 10426 + }, + { + "epoch": 1.79, + "grad_norm": 11.378373146057129, + "learning_rate": 3.158572164063841e-06, + "loss": 0.318, + "step": 10427 + }, + { + "epoch": 1.79, + "grad_norm": 9.294855117797852, + "learning_rate": 3.155997940621246e-06, + "loss": 0.4607, + "step": 10428 + }, + { + "epoch": 1.79, + "grad_norm": 7.040972709655762, + "learning_rate": 3.1534237171786512e-06, + "loss": 0.3901, + "step": 10429 + }, + { + "epoch": 1.79, + "grad_norm": 10.023202896118164, + "learning_rate": 3.1508494937360566e-06, + "loss": 0.4375, + "step": 10430 + }, + { + "epoch": 1.79, + "grad_norm": 9.713528633117676, + "learning_rate": 3.1482752702934616e-06, + "loss": 0.5196, + "step": 10431 + }, + { + "epoch": 1.79, + "grad_norm": 9.742056846618652, + "learning_rate": 3.145701046850867e-06, + "loss": 0.3954, + "step": 10432 + }, + { + "epoch": 1.79, + "grad_norm": 8.226908683776855, + "learning_rate": 3.1431268234082716e-06, + "loss": 0.343, + "step": 10433 + }, + { + "epoch": 1.79, + "grad_norm": 6.437498092651367, + "learning_rate": 3.140552599965677e-06, + "loss": 0.2502, + "step": 10434 + }, + { + "epoch": 1.79, + "grad_norm": 9.046233177185059, + "learning_rate": 3.1379783765230824e-06, + "loss": 0.3234, + "step": 10435 + }, + { + "epoch": 1.79, + "grad_norm": 9.71757984161377, + "learning_rate": 3.1354041530804874e-06, + "loss": 0.3604, + "step": 10436 + }, + { + "epoch": 1.79, + "grad_norm": 12.548288345336914, + "learning_rate": 3.132829929637893e-06, + "loss": 0.4339, + "step": 10437 + }, + { + "epoch": 1.79, + "grad_norm": 9.2647066116333, + "learning_rate": 3.130255706195298e-06, + "loss": 0.3799, + "step": 10438 + }, + { + "epoch": 1.79, + "grad_norm": 9.122357368469238, + "learning_rate": 3.127681482752703e-06, + "loss": 0.2217, + "step": 10439 + }, + { + "epoch": 1.79, + "grad_norm": 7.940947532653809, + "learning_rate": 3.125107259310108e-06, + "loss": 0.2518, + "step": 10440 + }, + { + "epoch": 1.79, + "grad_norm": 10.62997055053711, + "learning_rate": 3.122533035867513e-06, + "loss": 0.3733, + "step": 10441 + }, + { + "epoch": 1.79, + "grad_norm": 6.836966037750244, + "learning_rate": 3.1199588124249186e-06, + "loss": 0.1797, + "step": 10442 + }, + { + "epoch": 1.79, + "grad_norm": 14.723001480102539, + "learning_rate": 3.117384588982324e-06, + "loss": 0.4777, + "step": 10443 + }, + { + "epoch": 1.79, + "grad_norm": 6.959872722625732, + "learning_rate": 3.114810365539729e-06, + "loss": 0.2769, + "step": 10444 + }, + { + "epoch": 1.79, + "grad_norm": 8.834935188293457, + "learning_rate": 3.1122361420971344e-06, + "loss": 0.3563, + "step": 10445 + }, + { + "epoch": 1.79, + "grad_norm": 10.509300231933594, + "learning_rate": 3.109661918654539e-06, + "loss": 0.3915, + "step": 10446 + }, + { + "epoch": 1.79, + "grad_norm": 10.07111930847168, + "learning_rate": 3.1070876952119444e-06, + "loss": 0.3267, + "step": 10447 + }, + { + "epoch": 1.79, + "grad_norm": 11.043294906616211, + "learning_rate": 3.10451347176935e-06, + "loss": 0.4405, + "step": 10448 + }, + { + "epoch": 1.79, + "grad_norm": 15.18427562713623, + "learning_rate": 3.1019392483267548e-06, + "loss": 0.5181, + "step": 10449 + }, + { + "epoch": 1.79, + "grad_norm": 9.779806137084961, + "learning_rate": 3.09936502488416e-06, + "loss": 0.2407, + "step": 10450 + }, + { + "epoch": 1.79, + "grad_norm": 11.736502647399902, + "learning_rate": 3.096790801441565e-06, + "loss": 0.3411, + "step": 10451 + }, + { + "epoch": 1.79, + "grad_norm": 9.774645805358887, + "learning_rate": 3.09421657799897e-06, + "loss": 0.4571, + "step": 10452 + }, + { + "epoch": 1.79, + "grad_norm": 12.356518745422363, + "learning_rate": 3.0916423545563756e-06, + "loss": 0.4493, + "step": 10453 + }, + { + "epoch": 1.79, + "grad_norm": 12.229302406311035, + "learning_rate": 3.0890681311137806e-06, + "loss": 0.3212, + "step": 10454 + }, + { + "epoch": 1.79, + "grad_norm": 9.343244552612305, + "learning_rate": 3.086493907671186e-06, + "loss": 0.2629, + "step": 10455 + }, + { + "epoch": 1.79, + "grad_norm": 11.548308372497559, + "learning_rate": 3.0839196842285914e-06, + "loss": 0.409, + "step": 10456 + }, + { + "epoch": 1.79, + "grad_norm": 10.149595260620117, + "learning_rate": 3.0813454607859964e-06, + "loss": 0.3767, + "step": 10457 + }, + { + "epoch": 1.79, + "grad_norm": 10.739317893981934, + "learning_rate": 3.078771237343402e-06, + "loss": 0.5334, + "step": 10458 + }, + { + "epoch": 1.79, + "grad_norm": 8.763712882995605, + "learning_rate": 3.0761970139008064e-06, + "loss": 0.335, + "step": 10459 + }, + { + "epoch": 1.8, + "grad_norm": 7.9581499099731445, + "learning_rate": 3.0736227904582118e-06, + "loss": 0.287, + "step": 10460 + }, + { + "epoch": 1.8, + "grad_norm": 9.953165054321289, + "learning_rate": 3.071048567015617e-06, + "loss": 0.3215, + "step": 10461 + }, + { + "epoch": 1.8, + "grad_norm": 10.7612943649292, + "learning_rate": 3.068474343573022e-06, + "loss": 0.3962, + "step": 10462 + }, + { + "epoch": 1.8, + "grad_norm": 11.252572059631348, + "learning_rate": 3.0659001201304276e-06, + "loss": 0.3712, + "step": 10463 + }, + { + "epoch": 1.8, + "grad_norm": 9.732776641845703, + "learning_rate": 3.0633258966878326e-06, + "loss": 0.4333, + "step": 10464 + }, + { + "epoch": 1.8, + "grad_norm": 13.15046501159668, + "learning_rate": 3.0607516732452375e-06, + "loss": 0.4144, + "step": 10465 + }, + { + "epoch": 1.8, + "grad_norm": 10.169456481933594, + "learning_rate": 3.058177449802643e-06, + "loss": 0.3307, + "step": 10466 + }, + { + "epoch": 1.8, + "grad_norm": 13.577850341796875, + "learning_rate": 3.055603226360048e-06, + "loss": 0.539, + "step": 10467 + }, + { + "epoch": 1.8, + "grad_norm": 10.831425666809082, + "learning_rate": 3.0530290029174534e-06, + "loss": 0.5813, + "step": 10468 + }, + { + "epoch": 1.8, + "grad_norm": 13.429529190063477, + "learning_rate": 3.0504547794748588e-06, + "loss": 0.6707, + "step": 10469 + }, + { + "epoch": 1.8, + "grad_norm": 13.450451850891113, + "learning_rate": 3.0478805560322638e-06, + "loss": 0.485, + "step": 10470 + }, + { + "epoch": 1.8, + "grad_norm": 9.50145149230957, + "learning_rate": 3.045306332589669e-06, + "loss": 0.3956, + "step": 10471 + }, + { + "epoch": 1.8, + "grad_norm": 10.277445793151855, + "learning_rate": 3.0427321091470737e-06, + "loss": 0.2804, + "step": 10472 + }, + { + "epoch": 1.8, + "grad_norm": 11.106534004211426, + "learning_rate": 3.040157885704479e-06, + "loss": 0.4831, + "step": 10473 + }, + { + "epoch": 1.8, + "grad_norm": 12.272245407104492, + "learning_rate": 3.0375836622618846e-06, + "loss": 0.434, + "step": 10474 + }, + { + "epoch": 1.8, + "grad_norm": 8.200016021728516, + "learning_rate": 3.0350094388192895e-06, + "loss": 0.3605, + "step": 10475 + }, + { + "epoch": 1.8, + "grad_norm": 7.604349136352539, + "learning_rate": 3.032435215376695e-06, + "loss": 0.319, + "step": 10476 + }, + { + "epoch": 1.8, + "grad_norm": 13.937857627868652, + "learning_rate": 3.0298609919341e-06, + "loss": 0.4413, + "step": 10477 + }, + { + "epoch": 1.8, + "grad_norm": 11.702960968017578, + "learning_rate": 3.027286768491505e-06, + "loss": 0.3216, + "step": 10478 + }, + { + "epoch": 1.8, + "grad_norm": 8.42029857635498, + "learning_rate": 3.0247125450489103e-06, + "loss": 0.3126, + "step": 10479 + }, + { + "epoch": 1.8, + "grad_norm": 12.8287992477417, + "learning_rate": 3.0221383216063153e-06, + "loss": 0.4181, + "step": 10480 + }, + { + "epoch": 1.8, + "grad_norm": 13.196413040161133, + "learning_rate": 3.0195640981637207e-06, + "loss": 0.3181, + "step": 10481 + }, + { + "epoch": 1.8, + "grad_norm": 10.274801254272461, + "learning_rate": 3.016989874721126e-06, + "loss": 0.3049, + "step": 10482 + }, + { + "epoch": 1.8, + "grad_norm": 8.921868324279785, + "learning_rate": 3.014415651278531e-06, + "loss": 0.4739, + "step": 10483 + }, + { + "epoch": 1.8, + "grad_norm": 9.26804256439209, + "learning_rate": 3.0118414278359365e-06, + "loss": 0.3415, + "step": 10484 + }, + { + "epoch": 1.8, + "grad_norm": 12.426739692687988, + "learning_rate": 3.009267204393341e-06, + "loss": 0.4155, + "step": 10485 + }, + { + "epoch": 1.8, + "grad_norm": 12.302928924560547, + "learning_rate": 3.0066929809507465e-06, + "loss": 0.4515, + "step": 10486 + }, + { + "epoch": 1.8, + "grad_norm": 11.730430603027344, + "learning_rate": 3.004118757508152e-06, + "loss": 0.3515, + "step": 10487 + }, + { + "epoch": 1.8, + "grad_norm": 10.60925579071045, + "learning_rate": 3.001544534065557e-06, + "loss": 0.4456, + "step": 10488 + }, + { + "epoch": 1.8, + "grad_norm": 10.795517921447754, + "learning_rate": 2.9989703106229623e-06, + "loss": 0.5469, + "step": 10489 + }, + { + "epoch": 1.8, + "grad_norm": 9.022937774658203, + "learning_rate": 2.9963960871803673e-06, + "loss": 0.3824, + "step": 10490 + }, + { + "epoch": 1.8, + "grad_norm": 12.785831451416016, + "learning_rate": 2.9938218637377723e-06, + "loss": 0.6176, + "step": 10491 + }, + { + "epoch": 1.8, + "grad_norm": 12.09007740020752, + "learning_rate": 2.9912476402951777e-06, + "loss": 0.3668, + "step": 10492 + }, + { + "epoch": 1.8, + "grad_norm": 10.240140914916992, + "learning_rate": 2.9886734168525827e-06, + "loss": 0.3375, + "step": 10493 + }, + { + "epoch": 1.8, + "grad_norm": 8.147300720214844, + "learning_rate": 2.986099193409988e-06, + "loss": 0.2585, + "step": 10494 + }, + { + "epoch": 1.8, + "grad_norm": 12.183375358581543, + "learning_rate": 2.9835249699673935e-06, + "loss": 0.324, + "step": 10495 + }, + { + "epoch": 1.8, + "grad_norm": 15.306490898132324, + "learning_rate": 2.9809507465247985e-06, + "loss": 0.4229, + "step": 10496 + }, + { + "epoch": 1.8, + "grad_norm": 9.241902351379395, + "learning_rate": 2.978376523082204e-06, + "loss": 0.4034, + "step": 10497 + }, + { + "epoch": 1.8, + "grad_norm": 13.103628158569336, + "learning_rate": 2.9758022996396085e-06, + "loss": 0.4971, + "step": 10498 + }, + { + "epoch": 1.8, + "grad_norm": 12.960226058959961, + "learning_rate": 2.973228076197014e-06, + "loss": 0.4651, + "step": 10499 + }, + { + "epoch": 1.8, + "grad_norm": 10.878409385681152, + "learning_rate": 2.9706538527544193e-06, + "loss": 0.4128, + "step": 10500 + }, + { + "epoch": 1.8, + "grad_norm": 8.149812698364258, + "learning_rate": 2.9680796293118243e-06, + "loss": 0.3229, + "step": 10501 + }, + { + "epoch": 1.8, + "grad_norm": 9.156515121459961, + "learning_rate": 2.9655054058692297e-06, + "loss": 0.2426, + "step": 10502 + }, + { + "epoch": 1.8, + "grad_norm": 12.101290702819824, + "learning_rate": 2.9629311824266347e-06, + "loss": 0.5479, + "step": 10503 + }, + { + "epoch": 1.8, + "grad_norm": 9.113824844360352, + "learning_rate": 2.9603569589840397e-06, + "loss": 0.2417, + "step": 10504 + }, + { + "epoch": 1.8, + "grad_norm": 17.302640914916992, + "learning_rate": 2.957782735541445e-06, + "loss": 0.471, + "step": 10505 + }, + { + "epoch": 1.8, + "grad_norm": 17.096059799194336, + "learning_rate": 2.95520851209885e-06, + "loss": 0.3605, + "step": 10506 + }, + { + "epoch": 1.8, + "grad_norm": 9.7495698928833, + "learning_rate": 2.9526342886562555e-06, + "loss": 0.3854, + "step": 10507 + }, + { + "epoch": 1.8, + "grad_norm": 9.65858268737793, + "learning_rate": 2.950060065213661e-06, + "loss": 0.2668, + "step": 10508 + }, + { + "epoch": 1.8, + "grad_norm": 10.74853515625, + "learning_rate": 2.947485841771066e-06, + "loss": 0.328, + "step": 10509 + }, + { + "epoch": 1.8, + "grad_norm": 11.641555786132812, + "learning_rate": 2.944911618328471e-06, + "loss": 0.4549, + "step": 10510 + }, + { + "epoch": 1.8, + "grad_norm": 14.706530570983887, + "learning_rate": 2.942337394885876e-06, + "loss": 0.4255, + "step": 10511 + }, + { + "epoch": 1.8, + "grad_norm": 15.585078239440918, + "learning_rate": 2.9397631714432813e-06, + "loss": 0.4299, + "step": 10512 + }, + { + "epoch": 1.8, + "grad_norm": 6.333610534667969, + "learning_rate": 2.9371889480006867e-06, + "loss": 0.1562, + "step": 10513 + }, + { + "epoch": 1.8, + "grad_norm": 9.311840057373047, + "learning_rate": 2.9346147245580917e-06, + "loss": 0.2095, + "step": 10514 + }, + { + "epoch": 1.8, + "grad_norm": 10.91805648803711, + "learning_rate": 2.932040501115497e-06, + "loss": 0.5584, + "step": 10515 + }, + { + "epoch": 1.8, + "grad_norm": 11.733766555786133, + "learning_rate": 2.929466277672902e-06, + "loss": 0.3841, + "step": 10516 + }, + { + "epoch": 1.8, + "grad_norm": 10.359426498413086, + "learning_rate": 2.926892054230307e-06, + "loss": 0.3687, + "step": 10517 + }, + { + "epoch": 1.81, + "grad_norm": 10.029149055480957, + "learning_rate": 2.9243178307877125e-06, + "loss": 0.2796, + "step": 10518 + }, + { + "epoch": 1.81, + "grad_norm": 10.420923233032227, + "learning_rate": 2.9217436073451175e-06, + "loss": 0.3452, + "step": 10519 + }, + { + "epoch": 1.81, + "grad_norm": 12.36894702911377, + "learning_rate": 2.919169383902523e-06, + "loss": 0.3778, + "step": 10520 + }, + { + "epoch": 1.81, + "grad_norm": 8.38289737701416, + "learning_rate": 2.9165951604599283e-06, + "loss": 0.3045, + "step": 10521 + }, + { + "epoch": 1.81, + "grad_norm": 9.05566692352295, + "learning_rate": 2.9140209370173333e-06, + "loss": 0.2796, + "step": 10522 + }, + { + "epoch": 1.81, + "grad_norm": 13.177206039428711, + "learning_rate": 2.9114467135747382e-06, + "loss": 0.4427, + "step": 10523 + }, + { + "epoch": 1.81, + "grad_norm": 9.995177268981934, + "learning_rate": 2.9088724901321432e-06, + "loss": 0.3749, + "step": 10524 + }, + { + "epoch": 1.81, + "grad_norm": 10.074337005615234, + "learning_rate": 2.9062982666895486e-06, + "loss": 0.4551, + "step": 10525 + }, + { + "epoch": 1.81, + "grad_norm": 9.752120971679688, + "learning_rate": 2.903724043246954e-06, + "loss": 0.3661, + "step": 10526 + }, + { + "epoch": 1.81, + "grad_norm": 8.213761329650879, + "learning_rate": 2.901149819804359e-06, + "loss": 0.2887, + "step": 10527 + }, + { + "epoch": 1.81, + "grad_norm": 8.839393615722656, + "learning_rate": 2.8985755963617645e-06, + "loss": 0.2288, + "step": 10528 + }, + { + "epoch": 1.81, + "grad_norm": 13.381450653076172, + "learning_rate": 2.8960013729191694e-06, + "loss": 0.3318, + "step": 10529 + }, + { + "epoch": 1.81, + "grad_norm": 7.615515232086182, + "learning_rate": 2.8934271494765744e-06, + "loss": 0.2597, + "step": 10530 + }, + { + "epoch": 1.81, + "grad_norm": 9.628311157226562, + "learning_rate": 2.89085292603398e-06, + "loss": 0.4385, + "step": 10531 + }, + { + "epoch": 1.81, + "grad_norm": 9.040813446044922, + "learning_rate": 2.888278702591385e-06, + "loss": 0.3115, + "step": 10532 + }, + { + "epoch": 1.81, + "grad_norm": 8.033926963806152, + "learning_rate": 2.8857044791487902e-06, + "loss": 0.4985, + "step": 10533 + }, + { + "epoch": 1.81, + "grad_norm": 10.547693252563477, + "learning_rate": 2.8831302557061956e-06, + "loss": 0.3176, + "step": 10534 + }, + { + "epoch": 1.81, + "grad_norm": 10.174239158630371, + "learning_rate": 2.8805560322636006e-06, + "loss": 0.4017, + "step": 10535 + }, + { + "epoch": 1.81, + "grad_norm": 11.291644096374512, + "learning_rate": 2.8779818088210056e-06, + "loss": 0.3642, + "step": 10536 + }, + { + "epoch": 1.81, + "grad_norm": 13.902597427368164, + "learning_rate": 2.8754075853784106e-06, + "loss": 0.3852, + "step": 10537 + }, + { + "epoch": 1.81, + "grad_norm": 6.805796146392822, + "learning_rate": 2.872833361935816e-06, + "loss": 0.2278, + "step": 10538 + }, + { + "epoch": 1.81, + "grad_norm": 10.182024002075195, + "learning_rate": 2.8702591384932214e-06, + "loss": 0.3623, + "step": 10539 + }, + { + "epoch": 1.81, + "grad_norm": 9.437385559082031, + "learning_rate": 2.8676849150506264e-06, + "loss": 0.4775, + "step": 10540 + }, + { + "epoch": 1.81, + "grad_norm": 11.261222839355469, + "learning_rate": 2.865110691608032e-06, + "loss": 0.3547, + "step": 10541 + }, + { + "epoch": 1.81, + "grad_norm": 13.4518404006958, + "learning_rate": 2.862536468165437e-06, + "loss": 0.3631, + "step": 10542 + }, + { + "epoch": 1.81, + "grad_norm": 10.53903865814209, + "learning_rate": 2.859962244722842e-06, + "loss": 0.2195, + "step": 10543 + }, + { + "epoch": 1.81, + "grad_norm": 8.522038459777832, + "learning_rate": 2.8573880212802472e-06, + "loss": 0.2779, + "step": 10544 + }, + { + "epoch": 1.81, + "grad_norm": 12.359233856201172, + "learning_rate": 2.854813797837652e-06, + "loss": 0.4727, + "step": 10545 + }, + { + "epoch": 1.81, + "grad_norm": 10.91067886352539, + "learning_rate": 2.8522395743950576e-06, + "loss": 0.3306, + "step": 10546 + }, + { + "epoch": 1.81, + "grad_norm": 10.986410140991211, + "learning_rate": 2.849665350952463e-06, + "loss": 0.3438, + "step": 10547 + }, + { + "epoch": 1.81, + "grad_norm": 8.411774635314941, + "learning_rate": 2.847091127509868e-06, + "loss": 0.2605, + "step": 10548 + }, + { + "epoch": 1.81, + "grad_norm": 8.776211738586426, + "learning_rate": 2.844516904067273e-06, + "loss": 0.2656, + "step": 10549 + }, + { + "epoch": 1.81, + "grad_norm": 13.012097358703613, + "learning_rate": 2.841942680624678e-06, + "loss": 0.419, + "step": 10550 + }, + { + "epoch": 1.81, + "grad_norm": 9.288351058959961, + "learning_rate": 2.8393684571820834e-06, + "loss": 0.2623, + "step": 10551 + }, + { + "epoch": 1.81, + "grad_norm": 11.65538215637207, + "learning_rate": 2.836794233739489e-06, + "loss": 0.4126, + "step": 10552 + }, + { + "epoch": 1.81, + "grad_norm": 7.261811256408691, + "learning_rate": 2.834220010296894e-06, + "loss": 0.346, + "step": 10553 + }, + { + "epoch": 1.81, + "grad_norm": 10.221818923950195, + "learning_rate": 2.831645786854299e-06, + "loss": 0.2815, + "step": 10554 + }, + { + "epoch": 1.81, + "grad_norm": 11.045987129211426, + "learning_rate": 2.829071563411704e-06, + "loss": 0.4013, + "step": 10555 + }, + { + "epoch": 1.81, + "grad_norm": 13.827857971191406, + "learning_rate": 2.826497339969109e-06, + "loss": 0.3692, + "step": 10556 + }, + { + "epoch": 1.81, + "grad_norm": 10.186592102050781, + "learning_rate": 2.8239231165265146e-06, + "loss": 0.3897, + "step": 10557 + }, + { + "epoch": 1.81, + "grad_norm": 9.697921752929688, + "learning_rate": 2.8213488930839196e-06, + "loss": 0.3147, + "step": 10558 + }, + { + "epoch": 1.81, + "grad_norm": 11.759190559387207, + "learning_rate": 2.818774669641325e-06, + "loss": 0.3754, + "step": 10559 + }, + { + "epoch": 1.81, + "grad_norm": 11.325169563293457, + "learning_rate": 2.8162004461987304e-06, + "loss": 0.4412, + "step": 10560 + }, + { + "epoch": 1.81, + "grad_norm": 11.071754455566406, + "learning_rate": 2.8136262227561354e-06, + "loss": 0.4155, + "step": 10561 + }, + { + "epoch": 1.81, + "grad_norm": 11.39793872833252, + "learning_rate": 2.8110519993135404e-06, + "loss": 0.4341, + "step": 10562 + }, + { + "epoch": 1.81, + "grad_norm": 8.861881256103516, + "learning_rate": 2.8084777758709454e-06, + "loss": 0.5423, + "step": 10563 + }, + { + "epoch": 1.81, + "grad_norm": 11.513286590576172, + "learning_rate": 2.8059035524283508e-06, + "loss": 0.4751, + "step": 10564 + }, + { + "epoch": 1.81, + "grad_norm": 12.263647079467773, + "learning_rate": 2.803329328985756e-06, + "loss": 0.3686, + "step": 10565 + }, + { + "epoch": 1.81, + "grad_norm": 9.746999740600586, + "learning_rate": 2.800755105543161e-06, + "loss": 0.3899, + "step": 10566 + }, + { + "epoch": 1.81, + "grad_norm": 11.294825553894043, + "learning_rate": 2.7981808821005666e-06, + "loss": 0.4866, + "step": 10567 + }, + { + "epoch": 1.81, + "grad_norm": 15.171945571899414, + "learning_rate": 2.7956066586579716e-06, + "loss": 0.5816, + "step": 10568 + }, + { + "epoch": 1.81, + "grad_norm": 12.502220153808594, + "learning_rate": 2.7930324352153766e-06, + "loss": 0.4326, + "step": 10569 + }, + { + "epoch": 1.81, + "grad_norm": 11.98552131652832, + "learning_rate": 2.790458211772782e-06, + "loss": 0.436, + "step": 10570 + }, + { + "epoch": 1.81, + "grad_norm": 11.707756996154785, + "learning_rate": 2.787883988330187e-06, + "loss": 0.2927, + "step": 10571 + }, + { + "epoch": 1.81, + "grad_norm": 8.05388069152832, + "learning_rate": 2.7853097648875924e-06, + "loss": 0.3858, + "step": 10572 + }, + { + "epoch": 1.81, + "grad_norm": 14.113991737365723, + "learning_rate": 2.7827355414449978e-06, + "loss": 0.465, + "step": 10573 + }, + { + "epoch": 1.81, + "grad_norm": 10.309956550598145, + "learning_rate": 2.7801613180024028e-06, + "loss": 0.3354, + "step": 10574 + }, + { + "epoch": 1.81, + "grad_norm": 9.145524024963379, + "learning_rate": 2.7775870945598078e-06, + "loss": 0.2914, + "step": 10575 + }, + { + "epoch": 1.81, + "grad_norm": 12.339261054992676, + "learning_rate": 2.7750128711172127e-06, + "loss": 0.476, + "step": 10576 + }, + { + "epoch": 1.82, + "grad_norm": 7.763864994049072, + "learning_rate": 2.772438647674618e-06, + "loss": 0.2999, + "step": 10577 + }, + { + "epoch": 1.82, + "grad_norm": 17.158241271972656, + "learning_rate": 2.7698644242320236e-06, + "loss": 0.5204, + "step": 10578 + }, + { + "epoch": 1.82, + "grad_norm": 12.755532264709473, + "learning_rate": 2.7672902007894285e-06, + "loss": 0.346, + "step": 10579 + }, + { + "epoch": 1.82, + "grad_norm": 10.783491134643555, + "learning_rate": 2.764715977346834e-06, + "loss": 0.421, + "step": 10580 + }, + { + "epoch": 1.82, + "grad_norm": 13.496931076049805, + "learning_rate": 2.762141753904239e-06, + "loss": 0.4242, + "step": 10581 + }, + { + "epoch": 1.82, + "grad_norm": 8.753774642944336, + "learning_rate": 2.759567530461644e-06, + "loss": 0.2941, + "step": 10582 + }, + { + "epoch": 1.82, + "grad_norm": 10.52796745300293, + "learning_rate": 2.7569933070190493e-06, + "loss": 0.3719, + "step": 10583 + }, + { + "epoch": 1.82, + "grad_norm": 7.800555229187012, + "learning_rate": 2.7544190835764543e-06, + "loss": 0.2398, + "step": 10584 + }, + { + "epoch": 1.82, + "grad_norm": 7.637390613555908, + "learning_rate": 2.7518448601338597e-06, + "loss": 0.2663, + "step": 10585 + }, + { + "epoch": 1.82, + "grad_norm": 14.037017822265625, + "learning_rate": 2.749270636691265e-06, + "loss": 0.4499, + "step": 10586 + }, + { + "epoch": 1.82, + "grad_norm": 9.532233238220215, + "learning_rate": 2.74669641324867e-06, + "loss": 0.2082, + "step": 10587 + }, + { + "epoch": 1.82, + "grad_norm": 12.800809860229492, + "learning_rate": 2.744122189806075e-06, + "loss": 0.3746, + "step": 10588 + }, + { + "epoch": 1.82, + "grad_norm": 10.036859512329102, + "learning_rate": 2.74154796636348e-06, + "loss": 0.338, + "step": 10589 + }, + { + "epoch": 1.82, + "grad_norm": 9.914230346679688, + "learning_rate": 2.7389737429208855e-06, + "loss": 0.4457, + "step": 10590 + }, + { + "epoch": 1.82, + "grad_norm": 7.960654258728027, + "learning_rate": 2.736399519478291e-06, + "loss": 0.243, + "step": 10591 + }, + { + "epoch": 1.82, + "grad_norm": 11.706157684326172, + "learning_rate": 2.733825296035696e-06, + "loss": 0.5498, + "step": 10592 + }, + { + "epoch": 1.82, + "grad_norm": 8.224727630615234, + "learning_rate": 2.7312510725931013e-06, + "loss": 0.3512, + "step": 10593 + }, + { + "epoch": 1.82, + "grad_norm": 8.958061218261719, + "learning_rate": 2.7286768491505063e-06, + "loss": 0.2833, + "step": 10594 + }, + { + "epoch": 1.82, + "grad_norm": 14.539462089538574, + "learning_rate": 2.7261026257079113e-06, + "loss": 0.3899, + "step": 10595 + }, + { + "epoch": 1.82, + "grad_norm": 11.695630073547363, + "learning_rate": 2.7235284022653167e-06, + "loss": 0.6017, + "step": 10596 + }, + { + "epoch": 1.82, + "grad_norm": 14.647037506103516, + "learning_rate": 2.7209541788227217e-06, + "loss": 0.3946, + "step": 10597 + }, + { + "epoch": 1.82, + "grad_norm": 13.245928764343262, + "learning_rate": 2.718379955380127e-06, + "loss": 0.439, + "step": 10598 + }, + { + "epoch": 1.82, + "grad_norm": 8.757686614990234, + "learning_rate": 2.7158057319375325e-06, + "loss": 0.2704, + "step": 10599 + }, + { + "epoch": 1.82, + "grad_norm": 12.669482231140137, + "learning_rate": 2.7132315084949375e-06, + "loss": 0.3674, + "step": 10600 + }, + { + "epoch": 1.82, + "grad_norm": 12.072402954101562, + "learning_rate": 2.7106572850523425e-06, + "loss": 0.411, + "step": 10601 + }, + { + "epoch": 1.82, + "grad_norm": 11.48277473449707, + "learning_rate": 2.7080830616097475e-06, + "loss": 0.5575, + "step": 10602 + }, + { + "epoch": 1.82, + "grad_norm": 12.001754760742188, + "learning_rate": 2.705508838167153e-06, + "loss": 0.4023, + "step": 10603 + }, + { + "epoch": 1.82, + "grad_norm": 10.322209358215332, + "learning_rate": 2.7029346147245583e-06, + "loss": 0.3281, + "step": 10604 + }, + { + "epoch": 1.82, + "grad_norm": 9.982292175292969, + "learning_rate": 2.7003603912819633e-06, + "loss": 0.3397, + "step": 10605 + }, + { + "epoch": 1.82, + "grad_norm": 10.836981773376465, + "learning_rate": 2.6977861678393687e-06, + "loss": 0.2666, + "step": 10606 + }, + { + "epoch": 1.82, + "grad_norm": 10.477084159851074, + "learning_rate": 2.6952119443967737e-06, + "loss": 0.338, + "step": 10607 + }, + { + "epoch": 1.82, + "grad_norm": 12.189910888671875, + "learning_rate": 2.6926377209541787e-06, + "loss": 0.3507, + "step": 10608 + }, + { + "epoch": 1.82, + "grad_norm": 9.914125442504883, + "learning_rate": 2.690063497511584e-06, + "loss": 0.3637, + "step": 10609 + }, + { + "epoch": 1.82, + "grad_norm": 9.882555961608887, + "learning_rate": 2.687489274068989e-06, + "loss": 0.4148, + "step": 10610 + }, + { + "epoch": 1.82, + "grad_norm": 9.456730842590332, + "learning_rate": 2.6849150506263945e-06, + "loss": 0.2627, + "step": 10611 + }, + { + "epoch": 1.82, + "grad_norm": 14.048463821411133, + "learning_rate": 2.6823408271838e-06, + "loss": 0.3487, + "step": 10612 + }, + { + "epoch": 1.82, + "grad_norm": 11.227310180664062, + "learning_rate": 2.679766603741205e-06, + "loss": 0.2775, + "step": 10613 + }, + { + "epoch": 1.82, + "grad_norm": 10.190180778503418, + "learning_rate": 2.67719238029861e-06, + "loss": 0.3063, + "step": 10614 + }, + { + "epoch": 1.82, + "grad_norm": 9.883695602416992, + "learning_rate": 2.674618156856015e-06, + "loss": 0.4502, + "step": 10615 + }, + { + "epoch": 1.82, + "grad_norm": 14.71688461303711, + "learning_rate": 2.6720439334134203e-06, + "loss": 0.5068, + "step": 10616 + }, + { + "epoch": 1.82, + "grad_norm": 11.551656723022461, + "learning_rate": 2.6694697099708257e-06, + "loss": 0.2955, + "step": 10617 + }, + { + "epoch": 1.82, + "grad_norm": 10.369060516357422, + "learning_rate": 2.6668954865282307e-06, + "loss": 0.2576, + "step": 10618 + }, + { + "epoch": 1.82, + "grad_norm": 8.58068561553955, + "learning_rate": 2.664321263085636e-06, + "loss": 0.3137, + "step": 10619 + }, + { + "epoch": 1.82, + "grad_norm": 10.581269264221191, + "learning_rate": 2.661747039643041e-06, + "loss": 0.3654, + "step": 10620 + }, + { + "epoch": 1.82, + "grad_norm": 7.775806427001953, + "learning_rate": 2.659172816200446e-06, + "loss": 0.2647, + "step": 10621 + }, + { + "epoch": 1.82, + "grad_norm": 10.271175384521484, + "learning_rate": 2.6565985927578515e-06, + "loss": 0.3589, + "step": 10622 + }, + { + "epoch": 1.82, + "grad_norm": 8.722084999084473, + "learning_rate": 2.6540243693152565e-06, + "loss": 0.3768, + "step": 10623 + }, + { + "epoch": 1.82, + "grad_norm": 8.016916275024414, + "learning_rate": 2.651450145872662e-06, + "loss": 0.3311, + "step": 10624 + }, + { + "epoch": 1.82, + "grad_norm": 13.523698806762695, + "learning_rate": 2.6488759224300673e-06, + "loss": 0.3587, + "step": 10625 + }, + { + "epoch": 1.82, + "grad_norm": 12.232637405395508, + "learning_rate": 2.6463016989874723e-06, + "loss": 0.333, + "step": 10626 + }, + { + "epoch": 1.82, + "grad_norm": 33.780067443847656, + "learning_rate": 2.6437274755448773e-06, + "loss": 0.2737, + "step": 10627 + }, + { + "epoch": 1.82, + "grad_norm": 13.261059761047363, + "learning_rate": 2.6411532521022822e-06, + "loss": 0.5378, + "step": 10628 + }, + { + "epoch": 1.82, + "grad_norm": 16.210847854614258, + "learning_rate": 2.6385790286596877e-06, + "loss": 0.5803, + "step": 10629 + }, + { + "epoch": 1.82, + "grad_norm": 17.004446029663086, + "learning_rate": 2.636004805217093e-06, + "loss": 0.5725, + "step": 10630 + }, + { + "epoch": 1.82, + "grad_norm": 8.11176586151123, + "learning_rate": 2.633430581774498e-06, + "loss": 0.252, + "step": 10631 + }, + { + "epoch": 1.82, + "grad_norm": 13.555098533630371, + "learning_rate": 2.6308563583319035e-06, + "loss": 0.5343, + "step": 10632 + }, + { + "epoch": 1.82, + "grad_norm": 12.724139213562012, + "learning_rate": 2.6282821348893085e-06, + "loss": 0.2942, + "step": 10633 + }, + { + "epoch": 1.82, + "grad_norm": 10.260353088378906, + "learning_rate": 2.6257079114467134e-06, + "loss": 0.3384, + "step": 10634 + }, + { + "epoch": 1.83, + "grad_norm": 14.129500389099121, + "learning_rate": 2.623133688004119e-06, + "loss": 0.4465, + "step": 10635 + }, + { + "epoch": 1.83, + "grad_norm": 10.664830207824707, + "learning_rate": 2.620559464561524e-06, + "loss": 0.3828, + "step": 10636 + }, + { + "epoch": 1.83, + "grad_norm": 8.48660659790039, + "learning_rate": 2.6179852411189292e-06, + "loss": 0.243, + "step": 10637 + }, + { + "epoch": 1.83, + "grad_norm": 10.969393730163574, + "learning_rate": 2.6154110176763347e-06, + "loss": 0.3194, + "step": 10638 + }, + { + "epoch": 1.83, + "grad_norm": 8.693145751953125, + "learning_rate": 2.6128367942337396e-06, + "loss": 0.2953, + "step": 10639 + }, + { + "epoch": 1.83, + "grad_norm": 8.943690299987793, + "learning_rate": 2.6102625707911446e-06, + "loss": 0.3923, + "step": 10640 + }, + { + "epoch": 1.83, + "grad_norm": 5.0937981605529785, + "learning_rate": 2.6076883473485496e-06, + "loss": 0.1444, + "step": 10641 + }, + { + "epoch": 1.83, + "grad_norm": 8.566240310668945, + "learning_rate": 2.605114123905955e-06, + "loss": 0.3607, + "step": 10642 + }, + { + "epoch": 1.83, + "grad_norm": 7.5882415771484375, + "learning_rate": 2.6025399004633604e-06, + "loss": 0.2362, + "step": 10643 + }, + { + "epoch": 1.83, + "grad_norm": 8.648507118225098, + "learning_rate": 2.5999656770207654e-06, + "loss": 0.3617, + "step": 10644 + }, + { + "epoch": 1.83, + "grad_norm": 7.666114330291748, + "learning_rate": 2.597391453578171e-06, + "loss": 0.2689, + "step": 10645 + }, + { + "epoch": 1.83, + "grad_norm": 11.404210090637207, + "learning_rate": 2.594817230135576e-06, + "loss": 0.4654, + "step": 10646 + }, + { + "epoch": 1.83, + "grad_norm": 10.755274772644043, + "learning_rate": 2.592243006692981e-06, + "loss": 0.3033, + "step": 10647 + }, + { + "epoch": 1.83, + "grad_norm": 9.978460311889648, + "learning_rate": 2.5896687832503862e-06, + "loss": 0.3809, + "step": 10648 + }, + { + "epoch": 1.83, + "grad_norm": 10.506619453430176, + "learning_rate": 2.5870945598077912e-06, + "loss": 0.474, + "step": 10649 + }, + { + "epoch": 1.83, + "grad_norm": 11.281198501586914, + "learning_rate": 2.5845203363651966e-06, + "loss": 0.4336, + "step": 10650 + }, + { + "epoch": 1.83, + "grad_norm": 14.126219749450684, + "learning_rate": 2.581946112922602e-06, + "loss": 0.2947, + "step": 10651 + }, + { + "epoch": 1.83, + "grad_norm": 9.406610488891602, + "learning_rate": 2.579371889480007e-06, + "loss": 0.3549, + "step": 10652 + }, + { + "epoch": 1.83, + "grad_norm": 10.784308433532715, + "learning_rate": 2.576797666037412e-06, + "loss": 0.2985, + "step": 10653 + }, + { + "epoch": 1.83, + "grad_norm": 9.192948341369629, + "learning_rate": 2.574223442594817e-06, + "loss": 0.3816, + "step": 10654 + }, + { + "epoch": 1.83, + "grad_norm": 10.522122383117676, + "learning_rate": 2.5716492191522224e-06, + "loss": 0.3252, + "step": 10655 + }, + { + "epoch": 1.83, + "grad_norm": 10.236865997314453, + "learning_rate": 2.569074995709628e-06, + "loss": 0.31, + "step": 10656 + }, + { + "epoch": 1.83, + "grad_norm": 11.734746932983398, + "learning_rate": 2.566500772267033e-06, + "loss": 0.2426, + "step": 10657 + }, + { + "epoch": 1.83, + "grad_norm": 11.649822235107422, + "learning_rate": 2.5639265488244382e-06, + "loss": 0.4776, + "step": 10658 + }, + { + "epoch": 1.83, + "grad_norm": 8.669262886047363, + "learning_rate": 2.561352325381843e-06, + "loss": 0.3362, + "step": 10659 + }, + { + "epoch": 1.83, + "grad_norm": 9.61310863494873, + "learning_rate": 2.558778101939248e-06, + "loss": 0.2083, + "step": 10660 + }, + { + "epoch": 1.83, + "grad_norm": 10.194008827209473, + "learning_rate": 2.5562038784966536e-06, + "loss": 0.3197, + "step": 10661 + }, + { + "epoch": 1.83, + "grad_norm": 10.544124603271484, + "learning_rate": 2.5536296550540586e-06, + "loss": 0.3584, + "step": 10662 + }, + { + "epoch": 1.83, + "grad_norm": 8.95773696899414, + "learning_rate": 2.551055431611464e-06, + "loss": 0.2641, + "step": 10663 + }, + { + "epoch": 1.83, + "grad_norm": 10.994258880615234, + "learning_rate": 2.5484812081688694e-06, + "loss": 0.3933, + "step": 10664 + }, + { + "epoch": 1.83, + "grad_norm": 9.87153434753418, + "learning_rate": 2.5459069847262744e-06, + "loss": 0.2925, + "step": 10665 + }, + { + "epoch": 1.83, + "grad_norm": 10.127371788024902, + "learning_rate": 2.5433327612836794e-06, + "loss": 0.275, + "step": 10666 + }, + { + "epoch": 1.83, + "grad_norm": 10.941168785095215, + "learning_rate": 2.5407585378410844e-06, + "loss": 0.3046, + "step": 10667 + }, + { + "epoch": 1.83, + "grad_norm": 10.67513656616211, + "learning_rate": 2.5381843143984898e-06, + "loss": 0.3292, + "step": 10668 + }, + { + "epoch": 1.83, + "grad_norm": 8.871071815490723, + "learning_rate": 2.535610090955895e-06, + "loss": 0.2954, + "step": 10669 + }, + { + "epoch": 1.83, + "grad_norm": 9.738022804260254, + "learning_rate": 2.5330358675133e-06, + "loss": 0.3821, + "step": 10670 + }, + { + "epoch": 1.83, + "grad_norm": 11.993451118469238, + "learning_rate": 2.5304616440707056e-06, + "loss": 0.4609, + "step": 10671 + }, + { + "epoch": 1.83, + "grad_norm": 9.69482421875, + "learning_rate": 2.5278874206281106e-06, + "loss": 0.4201, + "step": 10672 + }, + { + "epoch": 1.83, + "grad_norm": 9.218208312988281, + "learning_rate": 2.5253131971855156e-06, + "loss": 0.3566, + "step": 10673 + }, + { + "epoch": 1.83, + "grad_norm": 12.55072021484375, + "learning_rate": 2.522738973742921e-06, + "loss": 0.3988, + "step": 10674 + }, + { + "epoch": 1.83, + "grad_norm": 10.507923126220703, + "learning_rate": 2.520164750300326e-06, + "loss": 0.4861, + "step": 10675 + }, + { + "epoch": 1.83, + "grad_norm": 13.510515213012695, + "learning_rate": 2.5175905268577314e-06, + "loss": 0.3368, + "step": 10676 + }, + { + "epoch": 1.83, + "grad_norm": 8.693829536437988, + "learning_rate": 2.515016303415137e-06, + "loss": 0.2396, + "step": 10677 + }, + { + "epoch": 1.83, + "grad_norm": 10.560959815979004, + "learning_rate": 2.5124420799725418e-06, + "loss": 0.4916, + "step": 10678 + }, + { + "epoch": 1.83, + "grad_norm": 13.765018463134766, + "learning_rate": 2.5098678565299468e-06, + "loss": 0.6102, + "step": 10679 + }, + { + "epoch": 1.83, + "grad_norm": 9.950790405273438, + "learning_rate": 2.5072936330873518e-06, + "loss": 0.4473, + "step": 10680 + }, + { + "epoch": 1.83, + "grad_norm": 13.45733642578125, + "learning_rate": 2.504719409644757e-06, + "loss": 0.4512, + "step": 10681 + }, + { + "epoch": 1.83, + "grad_norm": 11.298639297485352, + "learning_rate": 2.5021451862021626e-06, + "loss": 0.3228, + "step": 10682 + }, + { + "epoch": 1.83, + "grad_norm": 11.985605239868164, + "learning_rate": 2.4995709627595676e-06, + "loss": 0.5333, + "step": 10683 + }, + { + "epoch": 1.83, + "grad_norm": 12.020630836486816, + "learning_rate": 2.496996739316973e-06, + "loss": 0.5044, + "step": 10684 + }, + { + "epoch": 1.83, + "grad_norm": 12.97929859161377, + "learning_rate": 2.494422515874378e-06, + "loss": 0.3551, + "step": 10685 + }, + { + "epoch": 1.83, + "grad_norm": 8.300394058227539, + "learning_rate": 2.491848292431783e-06, + "loss": 0.408, + "step": 10686 + }, + { + "epoch": 1.83, + "grad_norm": 8.84925365447998, + "learning_rate": 2.4892740689891884e-06, + "loss": 0.3481, + "step": 10687 + }, + { + "epoch": 1.83, + "grad_norm": 10.51993179321289, + "learning_rate": 2.4866998455465933e-06, + "loss": 0.3779, + "step": 10688 + }, + { + "epoch": 1.83, + "grad_norm": 8.055931091308594, + "learning_rate": 2.4841256221039988e-06, + "loss": 0.3127, + "step": 10689 + }, + { + "epoch": 1.83, + "grad_norm": 8.135323524475098, + "learning_rate": 2.481551398661404e-06, + "loss": 0.3064, + "step": 10690 + }, + { + "epoch": 1.83, + "grad_norm": 9.34423828125, + "learning_rate": 2.478977175218809e-06, + "loss": 0.2762, + "step": 10691 + }, + { + "epoch": 1.83, + "grad_norm": 11.478259086608887, + "learning_rate": 2.476402951776214e-06, + "loss": 0.4732, + "step": 10692 + }, + { + "epoch": 1.84, + "grad_norm": 7.674147129058838, + "learning_rate": 2.473828728333619e-06, + "loss": 0.3008, + "step": 10693 + }, + { + "epoch": 1.84, + "grad_norm": 12.84994125366211, + "learning_rate": 2.4712545048910245e-06, + "loss": 0.4315, + "step": 10694 + }, + { + "epoch": 1.84, + "grad_norm": 11.25035285949707, + "learning_rate": 2.46868028144843e-06, + "loss": 0.2382, + "step": 10695 + }, + { + "epoch": 1.84, + "grad_norm": 8.903910636901855, + "learning_rate": 2.466106058005835e-06, + "loss": 0.3797, + "step": 10696 + }, + { + "epoch": 1.84, + "grad_norm": 8.617475509643555, + "learning_rate": 2.4635318345632403e-06, + "loss": 0.3788, + "step": 10697 + }, + { + "epoch": 1.84, + "grad_norm": 11.769457817077637, + "learning_rate": 2.4609576111206453e-06, + "loss": 0.3622, + "step": 10698 + }, + { + "epoch": 1.84, + "grad_norm": 8.13018798828125, + "learning_rate": 2.4583833876780503e-06, + "loss": 0.2796, + "step": 10699 + }, + { + "epoch": 1.84, + "grad_norm": 12.063539505004883, + "learning_rate": 2.4558091642354557e-06, + "loss": 0.3168, + "step": 10700 + }, + { + "epoch": 1.84, + "grad_norm": 11.092525482177734, + "learning_rate": 2.4532349407928607e-06, + "loss": 0.4787, + "step": 10701 + }, + { + "epoch": 1.84, + "grad_norm": 9.005880355834961, + "learning_rate": 2.450660717350266e-06, + "loss": 0.2767, + "step": 10702 + }, + { + "epoch": 1.84, + "grad_norm": 11.908269882202148, + "learning_rate": 2.4480864939076715e-06, + "loss": 0.27, + "step": 10703 + }, + { + "epoch": 1.84, + "grad_norm": 10.34350299835205, + "learning_rate": 2.4455122704650765e-06, + "loss": 0.4592, + "step": 10704 + }, + { + "epoch": 1.84, + "grad_norm": 7.498222827911377, + "learning_rate": 2.4429380470224815e-06, + "loss": 0.393, + "step": 10705 + }, + { + "epoch": 1.84, + "grad_norm": 9.66064739227295, + "learning_rate": 2.4403638235798865e-06, + "loss": 0.4346, + "step": 10706 + }, + { + "epoch": 1.84, + "grad_norm": 16.21183967590332, + "learning_rate": 2.437789600137292e-06, + "loss": 0.3454, + "step": 10707 + }, + { + "epoch": 1.84, + "grad_norm": 8.232462882995605, + "learning_rate": 2.4352153766946973e-06, + "loss": 0.3814, + "step": 10708 + }, + { + "epoch": 1.84, + "grad_norm": 9.885994911193848, + "learning_rate": 2.4326411532521023e-06, + "loss": 0.4438, + "step": 10709 + }, + { + "epoch": 1.84, + "grad_norm": 11.98500919342041, + "learning_rate": 2.4300669298095077e-06, + "loss": 0.4261, + "step": 10710 + }, + { + "epoch": 1.84, + "grad_norm": 9.819098472595215, + "learning_rate": 2.4274927063669127e-06, + "loss": 0.4565, + "step": 10711 + }, + { + "epoch": 1.84, + "grad_norm": 9.974862098693848, + "learning_rate": 2.4249184829243177e-06, + "loss": 0.455, + "step": 10712 + }, + { + "epoch": 1.84, + "grad_norm": 9.547346115112305, + "learning_rate": 2.422344259481723e-06, + "loss": 0.4271, + "step": 10713 + }, + { + "epoch": 1.84, + "grad_norm": 8.870933532714844, + "learning_rate": 2.419770036039128e-06, + "loss": 0.3525, + "step": 10714 + }, + { + "epoch": 1.84, + "grad_norm": 11.99450969696045, + "learning_rate": 2.4171958125965335e-06, + "loss": 0.5769, + "step": 10715 + }, + { + "epoch": 1.84, + "grad_norm": 14.365693092346191, + "learning_rate": 2.414621589153939e-06, + "loss": 0.4526, + "step": 10716 + }, + { + "epoch": 1.84, + "grad_norm": 13.111851692199707, + "learning_rate": 2.412047365711344e-06, + "loss": 0.364, + "step": 10717 + }, + { + "epoch": 1.84, + "grad_norm": 13.152235984802246, + "learning_rate": 2.409473142268749e-06, + "loss": 0.3813, + "step": 10718 + }, + { + "epoch": 1.84, + "grad_norm": 8.690625190734863, + "learning_rate": 2.406898918826154e-06, + "loss": 0.394, + "step": 10719 + }, + { + "epoch": 1.84, + "grad_norm": 10.726323127746582, + "learning_rate": 2.4043246953835593e-06, + "loss": 0.3937, + "step": 10720 + }, + { + "epoch": 1.84, + "grad_norm": 12.641265869140625, + "learning_rate": 2.4017504719409647e-06, + "loss": 0.4748, + "step": 10721 + }, + { + "epoch": 1.84, + "grad_norm": 7.505012035369873, + "learning_rate": 2.3991762484983697e-06, + "loss": 0.3154, + "step": 10722 + }, + { + "epoch": 1.84, + "grad_norm": 12.628533363342285, + "learning_rate": 2.396602025055775e-06, + "loss": 0.4944, + "step": 10723 + }, + { + "epoch": 1.84, + "grad_norm": 12.972872734069824, + "learning_rate": 2.39402780161318e-06, + "loss": 0.3235, + "step": 10724 + }, + { + "epoch": 1.84, + "grad_norm": 7.067104816436768, + "learning_rate": 2.391453578170585e-06, + "loss": 0.2797, + "step": 10725 + }, + { + "epoch": 1.84, + "grad_norm": 11.320572853088379, + "learning_rate": 2.3888793547279905e-06, + "loss": 0.3946, + "step": 10726 + }, + { + "epoch": 1.84, + "grad_norm": 11.184109687805176, + "learning_rate": 2.3863051312853955e-06, + "loss": 0.4283, + "step": 10727 + }, + { + "epoch": 1.84, + "grad_norm": 8.515572547912598, + "learning_rate": 2.383730907842801e-06, + "loss": 0.4074, + "step": 10728 + }, + { + "epoch": 1.84, + "grad_norm": 8.552699089050293, + "learning_rate": 2.3811566844002063e-06, + "loss": 0.3594, + "step": 10729 + }, + { + "epoch": 1.84, + "grad_norm": 12.412210464477539, + "learning_rate": 2.3785824609576113e-06, + "loss": 0.4901, + "step": 10730 + }, + { + "epoch": 1.84, + "grad_norm": 9.148717880249023, + "learning_rate": 2.3760082375150163e-06, + "loss": 0.4589, + "step": 10731 + }, + { + "epoch": 1.84, + "grad_norm": 11.961956977844238, + "learning_rate": 2.3734340140724213e-06, + "loss": 0.4587, + "step": 10732 + }, + { + "epoch": 1.84, + "grad_norm": 13.488950729370117, + "learning_rate": 2.3708597906298267e-06, + "loss": 0.4681, + "step": 10733 + }, + { + "epoch": 1.84, + "grad_norm": 10.658753395080566, + "learning_rate": 2.368285567187232e-06, + "loss": 0.42, + "step": 10734 + }, + { + "epoch": 1.84, + "grad_norm": 7.891936302185059, + "learning_rate": 2.365711343744637e-06, + "loss": 0.3699, + "step": 10735 + }, + { + "epoch": 1.84, + "grad_norm": 10.638575553894043, + "learning_rate": 2.3631371203020425e-06, + "loss": 0.4098, + "step": 10736 + }, + { + "epoch": 1.84, + "grad_norm": 16.642004013061523, + "learning_rate": 2.3605628968594475e-06, + "loss": 0.4884, + "step": 10737 + }, + { + "epoch": 1.84, + "grad_norm": 14.215935707092285, + "learning_rate": 2.3579886734168525e-06, + "loss": 0.3861, + "step": 10738 + }, + { + "epoch": 1.84, + "grad_norm": 10.332646369934082, + "learning_rate": 2.355414449974258e-06, + "loss": 0.3031, + "step": 10739 + }, + { + "epoch": 1.84, + "grad_norm": 10.552823066711426, + "learning_rate": 2.352840226531663e-06, + "loss": 0.3673, + "step": 10740 + }, + { + "epoch": 1.84, + "grad_norm": 7.942591190338135, + "learning_rate": 2.3502660030890683e-06, + "loss": 0.3187, + "step": 10741 + }, + { + "epoch": 1.84, + "grad_norm": 12.71848201751709, + "learning_rate": 2.3476917796464737e-06, + "loss": 0.4059, + "step": 10742 + }, + { + "epoch": 1.84, + "grad_norm": 12.311237335205078, + "learning_rate": 2.3451175562038787e-06, + "loss": 0.3469, + "step": 10743 + }, + { + "epoch": 1.84, + "grad_norm": 7.839985370635986, + "learning_rate": 2.3425433327612836e-06, + "loss": 0.2815, + "step": 10744 + }, + { + "epoch": 1.84, + "grad_norm": 7.345728874206543, + "learning_rate": 2.3399691093186886e-06, + "loss": 0.3124, + "step": 10745 + }, + { + "epoch": 1.84, + "grad_norm": 8.57292366027832, + "learning_rate": 2.337394885876094e-06, + "loss": 0.273, + "step": 10746 + }, + { + "epoch": 1.84, + "grad_norm": 12.953744888305664, + "learning_rate": 2.3348206624334995e-06, + "loss": 0.3599, + "step": 10747 + }, + { + "epoch": 1.84, + "grad_norm": 8.740678787231445, + "learning_rate": 2.3322464389909044e-06, + "loss": 0.3269, + "step": 10748 + }, + { + "epoch": 1.84, + "grad_norm": 18.84109878540039, + "learning_rate": 2.32967221554831e-06, + "loss": 0.4604, + "step": 10749 + }, + { + "epoch": 1.84, + "grad_norm": 9.938599586486816, + "learning_rate": 2.327097992105715e-06, + "loss": 0.4271, + "step": 10750 + }, + { + "epoch": 1.85, + "grad_norm": 12.845032691955566, + "learning_rate": 2.32452376866312e-06, + "loss": 0.4105, + "step": 10751 + }, + { + "epoch": 1.85, + "grad_norm": 11.102261543273926, + "learning_rate": 2.3219495452205252e-06, + "loss": 0.4717, + "step": 10752 + }, + { + "epoch": 1.85, + "grad_norm": 9.609228134155273, + "learning_rate": 2.3193753217779302e-06, + "loss": 0.4263, + "step": 10753 + }, + { + "epoch": 1.85, + "grad_norm": 7.596081733703613, + "learning_rate": 2.3168010983353356e-06, + "loss": 0.2193, + "step": 10754 + }, + { + "epoch": 1.85, + "grad_norm": 11.777579307556152, + "learning_rate": 2.314226874892741e-06, + "loss": 0.4518, + "step": 10755 + }, + { + "epoch": 1.85, + "grad_norm": 8.152752876281738, + "learning_rate": 2.311652651450146e-06, + "loss": 0.3354, + "step": 10756 + }, + { + "epoch": 1.85, + "grad_norm": 13.82956314086914, + "learning_rate": 2.309078428007551e-06, + "loss": 0.4699, + "step": 10757 + }, + { + "epoch": 1.85, + "grad_norm": 11.48739242553711, + "learning_rate": 2.306504204564956e-06, + "loss": 0.417, + "step": 10758 + }, + { + "epoch": 1.85, + "grad_norm": 12.494852066040039, + "learning_rate": 2.3039299811223614e-06, + "loss": 0.4977, + "step": 10759 + }, + { + "epoch": 1.85, + "grad_norm": 12.750948905944824, + "learning_rate": 2.301355757679767e-06, + "loss": 0.455, + "step": 10760 + }, + { + "epoch": 1.85, + "grad_norm": 9.151665687561035, + "learning_rate": 2.298781534237172e-06, + "loss": 0.4942, + "step": 10761 + }, + { + "epoch": 1.85, + "grad_norm": 32.819061279296875, + "learning_rate": 2.2962073107945772e-06, + "loss": 0.404, + "step": 10762 + }, + { + "epoch": 1.85, + "grad_norm": 11.47062873840332, + "learning_rate": 2.2936330873519822e-06, + "loss": 0.3854, + "step": 10763 + }, + { + "epoch": 1.85, + "grad_norm": 11.467101097106934, + "learning_rate": 2.291058863909387e-06, + "loss": 0.3279, + "step": 10764 + }, + { + "epoch": 1.85, + "grad_norm": 8.407057762145996, + "learning_rate": 2.2884846404667926e-06, + "loss": 0.3094, + "step": 10765 + }, + { + "epoch": 1.85, + "grad_norm": 11.819005012512207, + "learning_rate": 2.2859104170241976e-06, + "loss": 0.3628, + "step": 10766 + }, + { + "epoch": 1.85, + "grad_norm": 10.826458930969238, + "learning_rate": 2.283336193581603e-06, + "loss": 0.2743, + "step": 10767 + }, + { + "epoch": 1.85, + "grad_norm": 7.1718525886535645, + "learning_rate": 2.2807619701390084e-06, + "loss": 0.3215, + "step": 10768 + }, + { + "epoch": 1.85, + "grad_norm": 13.493207931518555, + "learning_rate": 2.2781877466964134e-06, + "loss": 0.3257, + "step": 10769 + }, + { + "epoch": 1.85, + "grad_norm": 10.476923942565918, + "learning_rate": 2.2756135232538184e-06, + "loss": 0.3036, + "step": 10770 + }, + { + "epoch": 1.85, + "grad_norm": 11.054359436035156, + "learning_rate": 2.2730392998112234e-06, + "loss": 0.4715, + "step": 10771 + }, + { + "epoch": 1.85, + "grad_norm": 7.640109062194824, + "learning_rate": 2.270465076368629e-06, + "loss": 0.3296, + "step": 10772 + }, + { + "epoch": 1.85, + "grad_norm": 10.762101173400879, + "learning_rate": 2.267890852926034e-06, + "loss": 0.3601, + "step": 10773 + }, + { + "epoch": 1.85, + "grad_norm": 11.953591346740723, + "learning_rate": 2.265316629483439e-06, + "loss": 0.4362, + "step": 10774 + }, + { + "epoch": 1.85, + "grad_norm": 11.266918182373047, + "learning_rate": 2.2627424060408446e-06, + "loss": 0.4505, + "step": 10775 + }, + { + "epoch": 1.85, + "grad_norm": 9.953780174255371, + "learning_rate": 2.2601681825982496e-06, + "loss": 0.4516, + "step": 10776 + }, + { + "epoch": 1.85, + "grad_norm": 10.710752487182617, + "learning_rate": 2.2575939591556546e-06, + "loss": 0.3218, + "step": 10777 + }, + { + "epoch": 1.85, + "grad_norm": 6.6055755615234375, + "learning_rate": 2.25501973571306e-06, + "loss": 0.2625, + "step": 10778 + }, + { + "epoch": 1.85, + "grad_norm": 10.41867446899414, + "learning_rate": 2.252445512270465e-06, + "loss": 0.3481, + "step": 10779 + }, + { + "epoch": 1.85, + "grad_norm": 12.75260066986084, + "learning_rate": 2.2498712888278704e-06, + "loss": 0.3312, + "step": 10780 + }, + { + "epoch": 1.85, + "grad_norm": 9.806591033935547, + "learning_rate": 2.247297065385276e-06, + "loss": 0.4483, + "step": 10781 + }, + { + "epoch": 1.85, + "grad_norm": 10.854302406311035, + "learning_rate": 2.244722841942681e-06, + "loss": 0.3797, + "step": 10782 + }, + { + "epoch": 1.85, + "grad_norm": 8.459409713745117, + "learning_rate": 2.2421486185000858e-06, + "loss": 0.3149, + "step": 10783 + }, + { + "epoch": 1.85, + "grad_norm": 8.881391525268555, + "learning_rate": 2.2395743950574908e-06, + "loss": 0.4413, + "step": 10784 + }, + { + "epoch": 1.85, + "grad_norm": 12.098734855651855, + "learning_rate": 2.237000171614896e-06, + "loss": 0.4845, + "step": 10785 + }, + { + "epoch": 1.85, + "grad_norm": 10.4308443069458, + "learning_rate": 2.2344259481723016e-06, + "loss": 0.3798, + "step": 10786 + }, + { + "epoch": 1.85, + "grad_norm": 9.48336410522461, + "learning_rate": 2.2318517247297066e-06, + "loss": 0.4244, + "step": 10787 + }, + { + "epoch": 1.85, + "grad_norm": 16.419635772705078, + "learning_rate": 2.229277501287112e-06, + "loss": 0.4723, + "step": 10788 + }, + { + "epoch": 1.85, + "grad_norm": 7.306260585784912, + "learning_rate": 2.226703277844517e-06, + "loss": 0.3261, + "step": 10789 + }, + { + "epoch": 1.85, + "grad_norm": 12.113286972045898, + "learning_rate": 2.224129054401922e-06, + "loss": 0.5654, + "step": 10790 + }, + { + "epoch": 1.85, + "grad_norm": 12.769453048706055, + "learning_rate": 2.2215548309593274e-06, + "loss": 0.4038, + "step": 10791 + }, + { + "epoch": 1.85, + "grad_norm": 9.740857124328613, + "learning_rate": 2.2189806075167324e-06, + "loss": 0.3157, + "step": 10792 + }, + { + "epoch": 1.85, + "grad_norm": 13.538228034973145, + "learning_rate": 2.2164063840741378e-06, + "loss": 0.4004, + "step": 10793 + }, + { + "epoch": 1.85, + "grad_norm": 6.721349716186523, + "learning_rate": 2.213832160631543e-06, + "loss": 0.2371, + "step": 10794 + }, + { + "epoch": 1.85, + "grad_norm": 9.922874450683594, + "learning_rate": 2.211257937188948e-06, + "loss": 0.368, + "step": 10795 + }, + { + "epoch": 1.85, + "grad_norm": 12.995516777038574, + "learning_rate": 2.208683713746353e-06, + "loss": 0.4539, + "step": 10796 + }, + { + "epoch": 1.85, + "grad_norm": 9.98942756652832, + "learning_rate": 2.206109490303758e-06, + "loss": 0.3035, + "step": 10797 + }, + { + "epoch": 1.85, + "grad_norm": 12.0841064453125, + "learning_rate": 2.2035352668611636e-06, + "loss": 0.288, + "step": 10798 + }, + { + "epoch": 1.85, + "grad_norm": 12.027642250061035, + "learning_rate": 2.200961043418569e-06, + "loss": 0.3204, + "step": 10799 + }, + { + "epoch": 1.85, + "grad_norm": 8.990629196166992, + "learning_rate": 2.198386819975974e-06, + "loss": 0.3326, + "step": 10800 + }, + { + "epoch": 1.85, + "grad_norm": 9.817361831665039, + "learning_rate": 2.1958125965333794e-06, + "loss": 0.5591, + "step": 10801 + }, + { + "epoch": 1.85, + "grad_norm": 10.330004692077637, + "learning_rate": 2.1932383730907843e-06, + "loss": 0.4329, + "step": 10802 + }, + { + "epoch": 1.85, + "grad_norm": 8.918294906616211, + "learning_rate": 2.1906641496481893e-06, + "loss": 0.333, + "step": 10803 + }, + { + "epoch": 1.85, + "grad_norm": 15.731829643249512, + "learning_rate": 2.1880899262055947e-06, + "loss": 0.318, + "step": 10804 + }, + { + "epoch": 1.85, + "grad_norm": 9.897744178771973, + "learning_rate": 2.1855157027629997e-06, + "loss": 0.3439, + "step": 10805 + }, + { + "epoch": 1.85, + "grad_norm": 8.820111274719238, + "learning_rate": 2.182941479320405e-06, + "loss": 0.3696, + "step": 10806 + }, + { + "epoch": 1.85, + "grad_norm": 15.642640113830566, + "learning_rate": 2.1803672558778106e-06, + "loss": 0.5032, + "step": 10807 + }, + { + "epoch": 1.85, + "grad_norm": 14.144488334655762, + "learning_rate": 2.1777930324352155e-06, + "loss": 0.3735, + "step": 10808 + }, + { + "epoch": 1.85, + "grad_norm": 9.489876747131348, + "learning_rate": 2.1752188089926205e-06, + "loss": 0.3798, + "step": 10809 + }, + { + "epoch": 1.86, + "grad_norm": 12.61406135559082, + "learning_rate": 2.1726445855500255e-06, + "loss": 0.3387, + "step": 10810 + }, + { + "epoch": 1.86, + "grad_norm": 10.711216926574707, + "learning_rate": 2.170070362107431e-06, + "loss": 0.4337, + "step": 10811 + }, + { + "epoch": 1.86, + "grad_norm": 12.59476375579834, + "learning_rate": 2.1674961386648363e-06, + "loss": 0.4709, + "step": 10812 + }, + { + "epoch": 1.86, + "grad_norm": 9.587848663330078, + "learning_rate": 2.1649219152222413e-06, + "loss": 0.3485, + "step": 10813 + }, + { + "epoch": 1.86, + "grad_norm": 12.932808876037598, + "learning_rate": 2.1623476917796467e-06, + "loss": 0.3501, + "step": 10814 + }, + { + "epoch": 1.86, + "grad_norm": 8.602524757385254, + "learning_rate": 2.1597734683370517e-06, + "loss": 0.3511, + "step": 10815 + }, + { + "epoch": 1.86, + "grad_norm": 7.017490863800049, + "learning_rate": 2.1571992448944567e-06, + "loss": 0.2124, + "step": 10816 + }, + { + "epoch": 1.86, + "grad_norm": 8.48502254486084, + "learning_rate": 2.154625021451862e-06, + "loss": 0.2939, + "step": 10817 + }, + { + "epoch": 1.86, + "grad_norm": 8.827862739562988, + "learning_rate": 2.152050798009267e-06, + "loss": 0.3149, + "step": 10818 + }, + { + "epoch": 1.86, + "grad_norm": 13.767644882202148, + "learning_rate": 2.1494765745666725e-06, + "loss": 0.2953, + "step": 10819 + }, + { + "epoch": 1.86, + "grad_norm": 12.464248657226562, + "learning_rate": 2.146902351124078e-06, + "loss": 0.3244, + "step": 10820 + }, + { + "epoch": 1.86, + "grad_norm": 8.992477416992188, + "learning_rate": 2.144328127681483e-06, + "loss": 0.3256, + "step": 10821 + }, + { + "epoch": 1.86, + "grad_norm": 12.161433219909668, + "learning_rate": 2.141753904238888e-06, + "loss": 0.3607, + "step": 10822 + }, + { + "epoch": 1.86, + "grad_norm": 8.0374755859375, + "learning_rate": 2.139179680796293e-06, + "loss": 0.2819, + "step": 10823 + }, + { + "epoch": 1.86, + "grad_norm": 11.524991035461426, + "learning_rate": 2.1366054573536983e-06, + "loss": 0.5337, + "step": 10824 + }, + { + "epoch": 1.86, + "grad_norm": 10.048221588134766, + "learning_rate": 2.1340312339111037e-06, + "loss": 0.3204, + "step": 10825 + }, + { + "epoch": 1.86, + "grad_norm": 12.081626892089844, + "learning_rate": 2.1314570104685087e-06, + "loss": 0.4938, + "step": 10826 + }, + { + "epoch": 1.86, + "grad_norm": 7.9117021560668945, + "learning_rate": 2.128882787025914e-06, + "loss": 0.3639, + "step": 10827 + }, + { + "epoch": 1.86, + "grad_norm": 10.109366416931152, + "learning_rate": 2.126308563583319e-06, + "loss": 0.2826, + "step": 10828 + }, + { + "epoch": 1.86, + "grad_norm": 13.635122299194336, + "learning_rate": 2.123734340140724e-06, + "loss": 0.4808, + "step": 10829 + }, + { + "epoch": 1.86, + "grad_norm": 7.299768924713135, + "learning_rate": 2.1211601166981295e-06, + "loss": 0.2482, + "step": 10830 + }, + { + "epoch": 1.86, + "grad_norm": 13.62306022644043, + "learning_rate": 2.1185858932555345e-06, + "loss": 0.4508, + "step": 10831 + }, + { + "epoch": 1.86, + "grad_norm": 8.016575813293457, + "learning_rate": 2.11601166981294e-06, + "loss": 0.2345, + "step": 10832 + }, + { + "epoch": 1.86, + "grad_norm": 15.547022819519043, + "learning_rate": 2.1134374463703453e-06, + "loss": 0.4968, + "step": 10833 + }, + { + "epoch": 1.86, + "grad_norm": 16.415725708007812, + "learning_rate": 2.1108632229277503e-06, + "loss": 0.291, + "step": 10834 + }, + { + "epoch": 1.86, + "grad_norm": 11.098642349243164, + "learning_rate": 2.1082889994851553e-06, + "loss": 0.3459, + "step": 10835 + }, + { + "epoch": 1.86, + "grad_norm": 11.48742961883545, + "learning_rate": 2.1057147760425603e-06, + "loss": 0.4204, + "step": 10836 + }, + { + "epoch": 1.86, + "grad_norm": 8.175433158874512, + "learning_rate": 2.1031405525999657e-06, + "loss": 0.2679, + "step": 10837 + }, + { + "epoch": 1.86, + "grad_norm": 11.38789176940918, + "learning_rate": 2.100566329157371e-06, + "loss": 0.3646, + "step": 10838 + }, + { + "epoch": 1.86, + "grad_norm": 7.256763935089111, + "learning_rate": 2.097992105714776e-06, + "loss": 0.2612, + "step": 10839 + }, + { + "epoch": 1.86, + "grad_norm": 11.745893478393555, + "learning_rate": 2.0954178822721815e-06, + "loss": 0.2839, + "step": 10840 + }, + { + "epoch": 1.86, + "grad_norm": 14.312209129333496, + "learning_rate": 2.0928436588295865e-06, + "loss": 0.4288, + "step": 10841 + }, + { + "epoch": 1.86, + "grad_norm": 11.442017555236816, + "learning_rate": 2.0902694353869915e-06, + "loss": 0.4444, + "step": 10842 + }, + { + "epoch": 1.86, + "grad_norm": 9.069182395935059, + "learning_rate": 2.087695211944397e-06, + "loss": 0.4258, + "step": 10843 + }, + { + "epoch": 1.86, + "grad_norm": 7.49484920501709, + "learning_rate": 2.085120988501802e-06, + "loss": 0.3, + "step": 10844 + }, + { + "epoch": 1.86, + "grad_norm": 7.370274066925049, + "learning_rate": 2.0825467650592073e-06, + "loss": 0.2795, + "step": 10845 + }, + { + "epoch": 1.86, + "grad_norm": 13.82091999053955, + "learning_rate": 2.0799725416166127e-06, + "loss": 0.33, + "step": 10846 + }, + { + "epoch": 1.86, + "grad_norm": 8.931026458740234, + "learning_rate": 2.0773983181740177e-06, + "loss": 0.3417, + "step": 10847 + }, + { + "epoch": 1.86, + "grad_norm": 8.414534568786621, + "learning_rate": 2.0748240947314227e-06, + "loss": 0.3247, + "step": 10848 + }, + { + "epoch": 1.86, + "grad_norm": 13.010655403137207, + "learning_rate": 2.0722498712888276e-06, + "loss": 0.5502, + "step": 10849 + }, + { + "epoch": 1.86, + "grad_norm": 9.234877586364746, + "learning_rate": 2.069675647846233e-06, + "loss": 0.3277, + "step": 10850 + }, + { + "epoch": 1.86, + "grad_norm": 11.902608871459961, + "learning_rate": 2.0671014244036385e-06, + "loss": 0.4241, + "step": 10851 + }, + { + "epoch": 1.86, + "grad_norm": 11.112380981445312, + "learning_rate": 2.0645272009610435e-06, + "loss": 0.4556, + "step": 10852 + }, + { + "epoch": 1.86, + "grad_norm": 8.702977180480957, + "learning_rate": 2.061952977518449e-06, + "loss": 0.3529, + "step": 10853 + }, + { + "epoch": 1.86, + "grad_norm": 8.624886512756348, + "learning_rate": 2.059378754075854e-06, + "loss": 0.2972, + "step": 10854 + }, + { + "epoch": 1.86, + "grad_norm": 8.750003814697266, + "learning_rate": 2.056804530633259e-06, + "loss": 0.3776, + "step": 10855 + }, + { + "epoch": 1.86, + "grad_norm": 9.668874740600586, + "learning_rate": 2.0542303071906643e-06, + "loss": 0.3964, + "step": 10856 + }, + { + "epoch": 1.86, + "grad_norm": 9.533783912658691, + "learning_rate": 2.0516560837480692e-06, + "loss": 0.2578, + "step": 10857 + }, + { + "epoch": 1.86, + "grad_norm": 6.777493000030518, + "learning_rate": 2.0490818603054746e-06, + "loss": 0.2758, + "step": 10858 + }, + { + "epoch": 1.86, + "grad_norm": 10.253680229187012, + "learning_rate": 2.04650763686288e-06, + "loss": 0.4073, + "step": 10859 + }, + { + "epoch": 1.86, + "grad_norm": 11.718351364135742, + "learning_rate": 2.0439334134202846e-06, + "loss": 0.5825, + "step": 10860 + }, + { + "epoch": 1.86, + "grad_norm": 10.605161666870117, + "learning_rate": 2.04135918997769e-06, + "loss": 0.3916, + "step": 10861 + }, + { + "epoch": 1.86, + "grad_norm": 8.781625747680664, + "learning_rate": 2.038784966535095e-06, + "loss": 0.2946, + "step": 10862 + }, + { + "epoch": 1.86, + "grad_norm": 9.692581176757812, + "learning_rate": 2.0362107430925004e-06, + "loss": 0.3037, + "step": 10863 + }, + { + "epoch": 1.86, + "grad_norm": 16.491222381591797, + "learning_rate": 2.033636519649906e-06, + "loss": 0.3868, + "step": 10864 + }, + { + "epoch": 1.86, + "grad_norm": 6.820777893066406, + "learning_rate": 2.031062296207311e-06, + "loss": 0.3183, + "step": 10865 + }, + { + "epoch": 1.86, + "grad_norm": 9.390010833740234, + "learning_rate": 2.0284880727647162e-06, + "loss": 0.2387, + "step": 10866 + }, + { + "epoch": 1.86, + "grad_norm": 10.670175552368164, + "learning_rate": 2.0259138493221212e-06, + "loss": 0.4745, + "step": 10867 + }, + { + "epoch": 1.87, + "grad_norm": 14.558318138122559, + "learning_rate": 2.0233396258795262e-06, + "loss": 0.3502, + "step": 10868 + }, + { + "epoch": 1.87, + "grad_norm": 10.584383964538574, + "learning_rate": 2.0207654024369316e-06, + "loss": 0.2779, + "step": 10869 + }, + { + "epoch": 1.87, + "grad_norm": 8.472787857055664, + "learning_rate": 2.0181911789943366e-06, + "loss": 0.3735, + "step": 10870 + }, + { + "epoch": 1.87, + "grad_norm": 8.116923332214355, + "learning_rate": 2.015616955551742e-06, + "loss": 0.3172, + "step": 10871 + }, + { + "epoch": 1.87, + "grad_norm": 8.554486274719238, + "learning_rate": 2.0130427321091474e-06, + "loss": 0.3193, + "step": 10872 + }, + { + "epoch": 1.87, + "grad_norm": 11.012552261352539, + "learning_rate": 2.010468508666552e-06, + "loss": 0.3107, + "step": 10873 + }, + { + "epoch": 1.87, + "grad_norm": 10.351103782653809, + "learning_rate": 2.0078942852239574e-06, + "loss": 0.3081, + "step": 10874 + }, + { + "epoch": 1.87, + "grad_norm": 13.335416793823242, + "learning_rate": 2.0053200617813624e-06, + "loss": 0.4063, + "step": 10875 + }, + { + "epoch": 1.87, + "grad_norm": 14.989412307739258, + "learning_rate": 2.002745838338768e-06, + "loss": 0.5465, + "step": 10876 + }, + { + "epoch": 1.87, + "grad_norm": 10.931973457336426, + "learning_rate": 2.0001716148961732e-06, + "loss": 0.2691, + "step": 10877 + }, + { + "epoch": 1.87, + "grad_norm": 11.633210182189941, + "learning_rate": 1.997597391453578e-06, + "loss": 0.5472, + "step": 10878 + }, + { + "epoch": 1.87, + "grad_norm": 12.967374801635742, + "learning_rate": 1.9950231680109836e-06, + "loss": 0.408, + "step": 10879 + }, + { + "epoch": 1.87, + "grad_norm": 10.11751651763916, + "learning_rate": 1.9924489445683886e-06, + "loss": 0.4009, + "step": 10880 + }, + { + "epoch": 1.87, + "grad_norm": 10.455737113952637, + "learning_rate": 1.9898747211257936e-06, + "loss": 0.3342, + "step": 10881 + }, + { + "epoch": 1.87, + "grad_norm": 9.098809242248535, + "learning_rate": 1.987300497683199e-06, + "loss": 0.2926, + "step": 10882 + }, + { + "epoch": 1.87, + "grad_norm": 9.390904426574707, + "learning_rate": 1.984726274240604e-06, + "loss": 0.4593, + "step": 10883 + }, + { + "epoch": 1.87, + "grad_norm": 15.999994277954102, + "learning_rate": 1.9821520507980094e-06, + "loss": 0.4293, + "step": 10884 + }, + { + "epoch": 1.87, + "grad_norm": 10.855880737304688, + "learning_rate": 1.979577827355415e-06, + "loss": 0.3421, + "step": 10885 + }, + { + "epoch": 1.87, + "grad_norm": 9.927961349487305, + "learning_rate": 1.9770036039128194e-06, + "loss": 0.3545, + "step": 10886 + }, + { + "epoch": 1.87, + "grad_norm": 13.199851036071777, + "learning_rate": 1.974429380470225e-06, + "loss": 0.3846, + "step": 10887 + }, + { + "epoch": 1.87, + "grad_norm": 9.086398124694824, + "learning_rate": 1.9718551570276298e-06, + "loss": 0.3284, + "step": 10888 + }, + { + "epoch": 1.87, + "grad_norm": 7.92074728012085, + "learning_rate": 1.969280933585035e-06, + "loss": 0.2864, + "step": 10889 + }, + { + "epoch": 1.87, + "grad_norm": 12.711901664733887, + "learning_rate": 1.9667067101424406e-06, + "loss": 0.4471, + "step": 10890 + }, + { + "epoch": 1.87, + "grad_norm": 9.300307273864746, + "learning_rate": 1.9641324866998456e-06, + "loss": 0.3492, + "step": 10891 + }, + { + "epoch": 1.87, + "grad_norm": 9.916529655456543, + "learning_rate": 1.961558263257251e-06, + "loss": 0.3667, + "step": 10892 + }, + { + "epoch": 1.87, + "grad_norm": 10.481868743896484, + "learning_rate": 1.958984039814656e-06, + "loss": 0.3685, + "step": 10893 + }, + { + "epoch": 1.87, + "grad_norm": 9.224878311157227, + "learning_rate": 1.956409816372061e-06, + "loss": 0.3121, + "step": 10894 + }, + { + "epoch": 1.87, + "grad_norm": 9.719337463378906, + "learning_rate": 1.9538355929294664e-06, + "loss": 0.3187, + "step": 10895 + }, + { + "epoch": 1.87, + "grad_norm": 9.698870658874512, + "learning_rate": 1.9512613694868714e-06, + "loss": 0.3979, + "step": 10896 + }, + { + "epoch": 1.87, + "grad_norm": 7.925223350524902, + "learning_rate": 1.9486871460442768e-06, + "loss": 0.2924, + "step": 10897 + }, + { + "epoch": 1.87, + "grad_norm": 10.224217414855957, + "learning_rate": 1.946112922601682e-06, + "loss": 0.3423, + "step": 10898 + }, + { + "epoch": 1.87, + "grad_norm": 11.26035213470459, + "learning_rate": 1.9435386991590868e-06, + "loss": 0.3215, + "step": 10899 + }, + { + "epoch": 1.87, + "grad_norm": 15.702499389648438, + "learning_rate": 1.940964475716492e-06, + "loss": 0.4997, + "step": 10900 + }, + { + "epoch": 1.87, + "grad_norm": 7.4429731369018555, + "learning_rate": 1.938390252273897e-06, + "loss": 0.2643, + "step": 10901 + }, + { + "epoch": 1.87, + "grad_norm": 10.223000526428223, + "learning_rate": 1.9358160288313026e-06, + "loss": 0.3761, + "step": 10902 + }, + { + "epoch": 1.87, + "grad_norm": 12.797672271728516, + "learning_rate": 1.933241805388708e-06, + "loss": 0.4087, + "step": 10903 + }, + { + "epoch": 1.87, + "grad_norm": 13.24771785736084, + "learning_rate": 1.930667581946113e-06, + "loss": 0.4276, + "step": 10904 + }, + { + "epoch": 1.87, + "grad_norm": 9.165146827697754, + "learning_rate": 1.9280933585035184e-06, + "loss": 0.2561, + "step": 10905 + }, + { + "epoch": 1.87, + "grad_norm": 11.10590934753418, + "learning_rate": 1.9255191350609234e-06, + "loss": 0.4682, + "step": 10906 + }, + { + "epoch": 1.87, + "grad_norm": 10.769000053405762, + "learning_rate": 1.9229449116183283e-06, + "loss": 0.4245, + "step": 10907 + }, + { + "epoch": 1.87, + "grad_norm": 9.64234447479248, + "learning_rate": 1.9203706881757338e-06, + "loss": 0.2173, + "step": 10908 + }, + { + "epoch": 1.87, + "grad_norm": 10.308282852172852, + "learning_rate": 1.9177964647331387e-06, + "loss": 0.3512, + "step": 10909 + }, + { + "epoch": 1.87, + "grad_norm": 10.698214530944824, + "learning_rate": 1.915222241290544e-06, + "loss": 0.4464, + "step": 10910 + }, + { + "epoch": 1.87, + "grad_norm": 9.663969039916992, + "learning_rate": 1.9126480178479496e-06, + "loss": 0.3552, + "step": 10911 + }, + { + "epoch": 1.87, + "grad_norm": 10.192497253417969, + "learning_rate": 1.910073794405354e-06, + "loss": 0.4219, + "step": 10912 + }, + { + "epoch": 1.87, + "grad_norm": 11.867043495178223, + "learning_rate": 1.9074995709627595e-06, + "loss": 0.3177, + "step": 10913 + }, + { + "epoch": 1.87, + "grad_norm": 8.983952522277832, + "learning_rate": 1.9049253475201647e-06, + "loss": 0.2394, + "step": 10914 + }, + { + "epoch": 1.87, + "grad_norm": 7.460039138793945, + "learning_rate": 1.90235112407757e-06, + "loss": 0.2937, + "step": 10915 + }, + { + "epoch": 1.87, + "grad_norm": 13.977998733520508, + "learning_rate": 1.8997769006349751e-06, + "loss": 0.4923, + "step": 10916 + }, + { + "epoch": 1.87, + "grad_norm": 10.566569328308105, + "learning_rate": 1.8972026771923805e-06, + "loss": 0.34, + "step": 10917 + }, + { + "epoch": 1.87, + "grad_norm": 12.486238479614258, + "learning_rate": 1.8946284537497857e-06, + "loss": 0.3451, + "step": 10918 + }, + { + "epoch": 1.87, + "grad_norm": 12.05168628692627, + "learning_rate": 1.8920542303071905e-06, + "loss": 0.1917, + "step": 10919 + }, + { + "epoch": 1.87, + "grad_norm": 10.568209648132324, + "learning_rate": 1.8894800068645957e-06, + "loss": 0.3181, + "step": 10920 + }, + { + "epoch": 1.87, + "grad_norm": 8.249594688415527, + "learning_rate": 1.8869057834220011e-06, + "loss": 0.2863, + "step": 10921 + }, + { + "epoch": 1.87, + "grad_norm": 8.555765151977539, + "learning_rate": 1.8843315599794063e-06, + "loss": 0.3128, + "step": 10922 + }, + { + "epoch": 1.87, + "grad_norm": 7.785953998565674, + "learning_rate": 1.8817573365368115e-06, + "loss": 0.2663, + "step": 10923 + }, + { + "epoch": 1.87, + "grad_norm": 9.51054573059082, + "learning_rate": 1.8791831130942167e-06, + "loss": 0.3521, + "step": 10924 + }, + { + "epoch": 1.87, + "grad_norm": 11.163633346557617, + "learning_rate": 1.8766088896516215e-06, + "loss": 0.3209, + "step": 10925 + }, + { + "epoch": 1.88, + "grad_norm": 9.81087589263916, + "learning_rate": 1.8740346662090271e-06, + "loss": 0.3311, + "step": 10926 + }, + { + "epoch": 1.88, + "grad_norm": 11.272926330566406, + "learning_rate": 1.8714604427664321e-06, + "loss": 0.4884, + "step": 10927 + }, + { + "epoch": 1.88, + "grad_norm": 10.91628646850586, + "learning_rate": 1.8688862193238373e-06, + "loss": 0.3836, + "step": 10928 + }, + { + "epoch": 1.88, + "grad_norm": 8.258402824401855, + "learning_rate": 1.8663119958812425e-06, + "loss": 0.2882, + "step": 10929 + }, + { + "epoch": 1.88, + "grad_norm": 9.744312286376953, + "learning_rate": 1.8637377724386477e-06, + "loss": 0.3744, + "step": 10930 + }, + { + "epoch": 1.88, + "grad_norm": 6.320903301239014, + "learning_rate": 1.861163548996053e-06, + "loss": 0.1566, + "step": 10931 + }, + { + "epoch": 1.88, + "grad_norm": 10.218724250793457, + "learning_rate": 1.8585893255534581e-06, + "loss": 0.3718, + "step": 10932 + }, + { + "epoch": 1.88, + "grad_norm": 13.666526794433594, + "learning_rate": 1.856015102110863e-06, + "loss": 0.4264, + "step": 10933 + }, + { + "epoch": 1.88, + "grad_norm": 9.000092506408691, + "learning_rate": 1.8534408786682685e-06, + "loss": 0.3998, + "step": 10934 + }, + { + "epoch": 1.88, + "grad_norm": 10.815633773803711, + "learning_rate": 1.8508666552256737e-06, + "loss": 0.5304, + "step": 10935 + }, + { + "epoch": 1.88, + "grad_norm": 11.508477210998535, + "learning_rate": 1.848292431783079e-06, + "loss": 0.2781, + "step": 10936 + }, + { + "epoch": 1.88, + "grad_norm": 9.919846534729004, + "learning_rate": 1.845718208340484e-06, + "loss": 0.5268, + "step": 10937 + }, + { + "epoch": 1.88, + "grad_norm": 9.646406173706055, + "learning_rate": 1.843143984897889e-06, + "loss": 0.5366, + "step": 10938 + }, + { + "epoch": 1.88, + "grad_norm": 11.416731834411621, + "learning_rate": 1.8405697614552945e-06, + "loss": 0.5303, + "step": 10939 + }, + { + "epoch": 1.88, + "grad_norm": 12.099388122558594, + "learning_rate": 1.8379955380126995e-06, + "loss": 0.3049, + "step": 10940 + }, + { + "epoch": 1.88, + "grad_norm": 9.014859199523926, + "learning_rate": 1.8354213145701047e-06, + "loss": 0.2689, + "step": 10941 + }, + { + "epoch": 1.88, + "grad_norm": 10.663738250732422, + "learning_rate": 1.8328470911275099e-06, + "loss": 0.4063, + "step": 10942 + }, + { + "epoch": 1.88, + "grad_norm": 9.820629119873047, + "learning_rate": 1.830272867684915e-06, + "loss": 0.3735, + "step": 10943 + }, + { + "epoch": 1.88, + "grad_norm": 14.905858039855957, + "learning_rate": 1.8276986442423203e-06, + "loss": 0.4487, + "step": 10944 + }, + { + "epoch": 1.88, + "grad_norm": 9.693678855895996, + "learning_rate": 1.8251244207997255e-06, + "loss": 0.3747, + "step": 10945 + }, + { + "epoch": 1.88, + "grad_norm": 8.894797325134277, + "learning_rate": 1.8225501973571305e-06, + "loss": 0.3577, + "step": 10946 + }, + { + "epoch": 1.88, + "grad_norm": 9.730839729309082, + "learning_rate": 1.8199759739145359e-06, + "loss": 0.2407, + "step": 10947 + }, + { + "epoch": 1.88, + "grad_norm": 13.638737678527832, + "learning_rate": 1.817401750471941e-06, + "loss": 0.6203, + "step": 10948 + }, + { + "epoch": 1.88, + "grad_norm": 10.555971145629883, + "learning_rate": 1.8148275270293463e-06, + "loss": 0.272, + "step": 10949 + }, + { + "epoch": 1.88, + "grad_norm": 8.601404190063477, + "learning_rate": 1.8122533035867513e-06, + "loss": 0.3058, + "step": 10950 + }, + { + "epoch": 1.88, + "grad_norm": 15.684060096740723, + "learning_rate": 1.8096790801441565e-06, + "loss": 0.4711, + "step": 10951 + }, + { + "epoch": 1.88, + "grad_norm": 9.077581405639648, + "learning_rate": 1.8071048567015619e-06, + "loss": 0.3769, + "step": 10952 + }, + { + "epoch": 1.88, + "grad_norm": 7.523407936096191, + "learning_rate": 1.8045306332589669e-06, + "loss": 0.2662, + "step": 10953 + }, + { + "epoch": 1.88, + "grad_norm": 10.453701972961426, + "learning_rate": 1.801956409816372e-06, + "loss": 0.3374, + "step": 10954 + }, + { + "epoch": 1.88, + "grad_norm": 12.322737693786621, + "learning_rate": 1.7993821863737773e-06, + "loss": 0.409, + "step": 10955 + }, + { + "epoch": 1.88, + "grad_norm": 11.129615783691406, + "learning_rate": 1.7968079629311825e-06, + "loss": 0.3592, + "step": 10956 + }, + { + "epoch": 1.88, + "grad_norm": 11.893600463867188, + "learning_rate": 1.7942337394885877e-06, + "loss": 0.4282, + "step": 10957 + }, + { + "epoch": 1.88, + "grad_norm": 8.170722007751465, + "learning_rate": 1.7916595160459929e-06, + "loss": 0.2453, + "step": 10958 + }, + { + "epoch": 1.88, + "grad_norm": 8.843855857849121, + "learning_rate": 1.7890852926033979e-06, + "loss": 0.297, + "step": 10959 + }, + { + "epoch": 1.88, + "grad_norm": 13.154544830322266, + "learning_rate": 1.7865110691608033e-06, + "loss": 0.2612, + "step": 10960 + }, + { + "epoch": 1.88, + "grad_norm": 10.611052513122559, + "learning_rate": 1.7839368457182085e-06, + "loss": 0.3408, + "step": 10961 + }, + { + "epoch": 1.88, + "grad_norm": 8.195291519165039, + "learning_rate": 1.7813626222756137e-06, + "loss": 0.2961, + "step": 10962 + }, + { + "epoch": 1.88, + "grad_norm": 9.155021667480469, + "learning_rate": 1.7787883988330186e-06, + "loss": 0.2582, + "step": 10963 + }, + { + "epoch": 1.88, + "grad_norm": 8.679885864257812, + "learning_rate": 1.7762141753904238e-06, + "loss": 0.2969, + "step": 10964 + }, + { + "epoch": 1.88, + "grad_norm": 15.09830379486084, + "learning_rate": 1.7736399519478293e-06, + "loss": 0.2912, + "step": 10965 + }, + { + "epoch": 1.88, + "grad_norm": 14.981791496276855, + "learning_rate": 1.7710657285052342e-06, + "loss": 0.3144, + "step": 10966 + }, + { + "epoch": 1.88, + "grad_norm": 14.299041748046875, + "learning_rate": 1.7684915050626394e-06, + "loss": 0.4067, + "step": 10967 + }, + { + "epoch": 1.88, + "grad_norm": 8.790837287902832, + "learning_rate": 1.7659172816200446e-06, + "loss": 0.3289, + "step": 10968 + }, + { + "epoch": 1.88, + "grad_norm": 9.538524627685547, + "learning_rate": 1.7633430581774498e-06, + "loss": 0.3175, + "step": 10969 + }, + { + "epoch": 1.88, + "grad_norm": 9.780279159545898, + "learning_rate": 1.760768834734855e-06, + "loss": 0.3629, + "step": 10970 + }, + { + "epoch": 1.88, + "grad_norm": 14.61856746673584, + "learning_rate": 1.7581946112922602e-06, + "loss": 0.4479, + "step": 10971 + }, + { + "epoch": 1.88, + "grad_norm": 10.927382469177246, + "learning_rate": 1.7556203878496652e-06, + "loss": 0.2956, + "step": 10972 + }, + { + "epoch": 1.88, + "grad_norm": 11.458925247192383, + "learning_rate": 1.7530461644070706e-06, + "loss": 0.2484, + "step": 10973 + }, + { + "epoch": 1.88, + "grad_norm": 10.202786445617676, + "learning_rate": 1.7504719409644758e-06, + "loss": 0.415, + "step": 10974 + }, + { + "epoch": 1.88, + "grad_norm": 8.060401916503906, + "learning_rate": 1.747897717521881e-06, + "loss": 0.3452, + "step": 10975 + }, + { + "epoch": 1.88, + "grad_norm": 13.941603660583496, + "learning_rate": 1.745323494079286e-06, + "loss": 0.48, + "step": 10976 + }, + { + "epoch": 1.88, + "grad_norm": 13.705573081970215, + "learning_rate": 1.7427492706366912e-06, + "loss": 0.5061, + "step": 10977 + }, + { + "epoch": 1.88, + "grad_norm": 11.721020698547363, + "learning_rate": 1.7401750471940966e-06, + "loss": 0.3106, + "step": 10978 + }, + { + "epoch": 1.88, + "grad_norm": 14.752761840820312, + "learning_rate": 1.7376008237515016e-06, + "loss": 0.4474, + "step": 10979 + }, + { + "epoch": 1.88, + "grad_norm": 12.939994812011719, + "learning_rate": 1.7350266003089068e-06, + "loss": 0.5227, + "step": 10980 + }, + { + "epoch": 1.88, + "grad_norm": 11.127387046813965, + "learning_rate": 1.732452376866312e-06, + "loss": 0.6356, + "step": 10981 + }, + { + "epoch": 1.88, + "grad_norm": 12.27786922454834, + "learning_rate": 1.7298781534237172e-06, + "loss": 0.4401, + "step": 10982 + }, + { + "epoch": 1.88, + "grad_norm": 8.561688423156738, + "learning_rate": 1.7273039299811224e-06, + "loss": 0.328, + "step": 10983 + }, + { + "epoch": 1.89, + "grad_norm": 11.691143035888672, + "learning_rate": 1.7247297065385276e-06, + "loss": 0.3278, + "step": 10984 + }, + { + "epoch": 1.89, + "grad_norm": 10.351495742797852, + "learning_rate": 1.7221554830959326e-06, + "loss": 0.3805, + "step": 10985 + }, + { + "epoch": 1.89, + "grad_norm": 5.489583969116211, + "learning_rate": 1.719581259653338e-06, + "loss": 0.2829, + "step": 10986 + }, + { + "epoch": 1.89, + "grad_norm": 12.070416450500488, + "learning_rate": 1.7170070362107432e-06, + "loss": 0.2392, + "step": 10987 + }, + { + "epoch": 1.89, + "grad_norm": 13.467737197875977, + "learning_rate": 1.7144328127681484e-06, + "loss": 0.3205, + "step": 10988 + }, + { + "epoch": 1.89, + "grad_norm": 8.04840087890625, + "learning_rate": 1.7118585893255534e-06, + "loss": 0.397, + "step": 10989 + }, + { + "epoch": 1.89, + "grad_norm": 13.456292152404785, + "learning_rate": 1.7092843658829588e-06, + "loss": 0.6786, + "step": 10990 + }, + { + "epoch": 1.89, + "grad_norm": 10.699073791503906, + "learning_rate": 1.706710142440364e-06, + "loss": 0.4608, + "step": 10991 + }, + { + "epoch": 1.89, + "grad_norm": 10.682391166687012, + "learning_rate": 1.704135918997769e-06, + "loss": 0.5574, + "step": 10992 + }, + { + "epoch": 1.89, + "grad_norm": 12.479904174804688, + "learning_rate": 1.7015616955551742e-06, + "loss": 0.3646, + "step": 10993 + }, + { + "epoch": 1.89, + "grad_norm": 10.163212776184082, + "learning_rate": 1.6989874721125794e-06, + "loss": 0.4495, + "step": 10994 + }, + { + "epoch": 1.89, + "grad_norm": 12.567753791809082, + "learning_rate": 1.6964132486699846e-06, + "loss": 0.3117, + "step": 10995 + }, + { + "epoch": 1.89, + "grad_norm": 9.026771545410156, + "learning_rate": 1.6938390252273898e-06, + "loss": 0.2662, + "step": 10996 + }, + { + "epoch": 1.89, + "grad_norm": 13.500985145568848, + "learning_rate": 1.691264801784795e-06, + "loss": 0.511, + "step": 10997 + }, + { + "epoch": 1.89, + "grad_norm": 9.901535034179688, + "learning_rate": 1.6886905783422e-06, + "loss": 0.4055, + "step": 10998 + }, + { + "epoch": 1.89, + "grad_norm": 10.854602813720703, + "learning_rate": 1.6861163548996054e-06, + "loss": 0.405, + "step": 10999 + }, + { + "epoch": 1.89, + "grad_norm": 7.663098335266113, + "learning_rate": 1.6835421314570106e-06, + "loss": 0.2184, + "step": 11000 + }, + { + "epoch": 1.89, + "grad_norm": 11.24438190460205, + "learning_rate": 1.6809679080144156e-06, + "loss": 0.3804, + "step": 11001 + }, + { + "epoch": 1.89, + "grad_norm": 11.910487174987793, + "learning_rate": 1.6783936845718208e-06, + "loss": 0.3671, + "step": 11002 + }, + { + "epoch": 1.89, + "grad_norm": 9.503626823425293, + "learning_rate": 1.6758194611292262e-06, + "loss": 0.3057, + "step": 11003 + }, + { + "epoch": 1.89, + "grad_norm": 10.804804801940918, + "learning_rate": 1.6732452376866314e-06, + "loss": 0.3449, + "step": 11004 + }, + { + "epoch": 1.89, + "grad_norm": 12.306525230407715, + "learning_rate": 1.6706710142440364e-06, + "loss": 0.3832, + "step": 11005 + }, + { + "epoch": 1.89, + "grad_norm": 11.503520965576172, + "learning_rate": 1.6680967908014416e-06, + "loss": 0.3029, + "step": 11006 + }, + { + "epoch": 1.89, + "grad_norm": 15.522000312805176, + "learning_rate": 1.6655225673588468e-06, + "loss": 0.3566, + "step": 11007 + }, + { + "epoch": 1.89, + "grad_norm": 11.334946632385254, + "learning_rate": 1.662948343916252e-06, + "loss": 0.3406, + "step": 11008 + }, + { + "epoch": 1.89, + "grad_norm": 10.630239486694336, + "learning_rate": 1.6603741204736572e-06, + "loss": 0.3247, + "step": 11009 + }, + { + "epoch": 1.89, + "grad_norm": 9.048665046691895, + "learning_rate": 1.6577998970310624e-06, + "loss": 0.2981, + "step": 11010 + }, + { + "epoch": 1.89, + "grad_norm": 7.395168304443359, + "learning_rate": 1.6552256735884674e-06, + "loss": 0.2665, + "step": 11011 + }, + { + "epoch": 1.89, + "grad_norm": 9.834129333496094, + "learning_rate": 1.6526514501458728e-06, + "loss": 0.2973, + "step": 11012 + }, + { + "epoch": 1.89, + "grad_norm": 6.7796783447265625, + "learning_rate": 1.650077226703278e-06, + "loss": 0.3329, + "step": 11013 + }, + { + "epoch": 1.89, + "grad_norm": 12.606521606445312, + "learning_rate": 1.647503003260683e-06, + "loss": 0.5186, + "step": 11014 + }, + { + "epoch": 1.89, + "grad_norm": 10.214790344238281, + "learning_rate": 1.6449287798180882e-06, + "loss": 0.41, + "step": 11015 + }, + { + "epoch": 1.89, + "grad_norm": 8.67099380493164, + "learning_rate": 1.6423545563754936e-06, + "loss": 0.4352, + "step": 11016 + }, + { + "epoch": 1.89, + "grad_norm": 13.937704086303711, + "learning_rate": 1.6397803329328988e-06, + "loss": 0.4888, + "step": 11017 + }, + { + "epoch": 1.89, + "grad_norm": 11.618767738342285, + "learning_rate": 1.6372061094903038e-06, + "loss": 0.3589, + "step": 11018 + }, + { + "epoch": 1.89, + "grad_norm": 8.796728134155273, + "learning_rate": 1.634631886047709e-06, + "loss": 0.3502, + "step": 11019 + }, + { + "epoch": 1.89, + "grad_norm": 7.404500484466553, + "learning_rate": 1.6320576626051142e-06, + "loss": 0.2487, + "step": 11020 + }, + { + "epoch": 1.89, + "grad_norm": 8.910385131835938, + "learning_rate": 1.6294834391625193e-06, + "loss": 0.3413, + "step": 11021 + }, + { + "epoch": 1.89, + "grad_norm": 10.829666137695312, + "learning_rate": 1.6269092157199245e-06, + "loss": 0.3072, + "step": 11022 + }, + { + "epoch": 1.89, + "grad_norm": 7.673471927642822, + "learning_rate": 1.6243349922773297e-06, + "loss": 0.2127, + "step": 11023 + }, + { + "epoch": 1.89, + "grad_norm": 9.837130546569824, + "learning_rate": 1.6217607688347347e-06, + "loss": 0.3571, + "step": 11024 + }, + { + "epoch": 1.89, + "grad_norm": 16.69570541381836, + "learning_rate": 1.6191865453921401e-06, + "loss": 0.4973, + "step": 11025 + }, + { + "epoch": 1.89, + "grad_norm": 10.976789474487305, + "learning_rate": 1.6166123219495453e-06, + "loss": 0.4729, + "step": 11026 + }, + { + "epoch": 1.89, + "grad_norm": 11.088257789611816, + "learning_rate": 1.6140380985069503e-06, + "loss": 0.3667, + "step": 11027 + }, + { + "epoch": 1.89, + "grad_norm": 6.058648109436035, + "learning_rate": 1.6114638750643555e-06, + "loss": 0.2157, + "step": 11028 + }, + { + "epoch": 1.89, + "grad_norm": 9.641792297363281, + "learning_rate": 1.608889651621761e-06, + "loss": 0.3405, + "step": 11029 + }, + { + "epoch": 1.89, + "grad_norm": 7.001545429229736, + "learning_rate": 1.6063154281791661e-06, + "loss": 0.2472, + "step": 11030 + }, + { + "epoch": 1.89, + "grad_norm": 11.410353660583496, + "learning_rate": 1.6037412047365711e-06, + "loss": 0.4894, + "step": 11031 + }, + { + "epoch": 1.89, + "grad_norm": 17.552005767822266, + "learning_rate": 1.6011669812939763e-06, + "loss": 0.4725, + "step": 11032 + }, + { + "epoch": 1.89, + "grad_norm": 12.280860900878906, + "learning_rate": 1.5985927578513815e-06, + "loss": 0.5346, + "step": 11033 + }, + { + "epoch": 1.89, + "grad_norm": 9.028883934020996, + "learning_rate": 1.5960185344087867e-06, + "loss": 0.3666, + "step": 11034 + }, + { + "epoch": 1.89, + "grad_norm": 13.481372833251953, + "learning_rate": 1.593444310966192e-06, + "loss": 0.4333, + "step": 11035 + }, + { + "epoch": 1.89, + "grad_norm": 13.702973365783691, + "learning_rate": 1.5908700875235971e-06, + "loss": 0.2984, + "step": 11036 + }, + { + "epoch": 1.89, + "grad_norm": 11.116095542907715, + "learning_rate": 1.5882958640810021e-06, + "loss": 0.3005, + "step": 11037 + }, + { + "epoch": 1.89, + "grad_norm": 8.090060234069824, + "learning_rate": 1.5857216406384075e-06, + "loss": 0.2367, + "step": 11038 + }, + { + "epoch": 1.89, + "grad_norm": 14.669819831848145, + "learning_rate": 1.5831474171958127e-06, + "loss": 0.48, + "step": 11039 + }, + { + "epoch": 1.89, + "grad_norm": 6.91246223449707, + "learning_rate": 1.5805731937532177e-06, + "loss": 0.2103, + "step": 11040 + }, + { + "epoch": 1.89, + "grad_norm": 13.193394660949707, + "learning_rate": 1.577998970310623e-06, + "loss": 0.2906, + "step": 11041 + }, + { + "epoch": 1.89, + "grad_norm": 9.936698913574219, + "learning_rate": 1.5754247468680283e-06, + "loss": 0.3704, + "step": 11042 + }, + { + "epoch": 1.9, + "grad_norm": 8.773835182189941, + "learning_rate": 1.5728505234254335e-06, + "loss": 0.3421, + "step": 11043 + }, + { + "epoch": 1.9, + "grad_norm": 9.810704231262207, + "learning_rate": 1.5702762999828385e-06, + "loss": 0.3327, + "step": 11044 + }, + { + "epoch": 1.9, + "grad_norm": 12.33499526977539, + "learning_rate": 1.5677020765402437e-06, + "loss": 0.3793, + "step": 11045 + }, + { + "epoch": 1.9, + "grad_norm": 11.707942008972168, + "learning_rate": 1.565127853097649e-06, + "loss": 0.3876, + "step": 11046 + }, + { + "epoch": 1.9, + "grad_norm": 13.534838676452637, + "learning_rate": 1.562553629655054e-06, + "loss": 0.3662, + "step": 11047 + }, + { + "epoch": 1.9, + "grad_norm": 9.828907012939453, + "learning_rate": 1.5599794062124593e-06, + "loss": 0.3065, + "step": 11048 + }, + { + "epoch": 1.9, + "grad_norm": 8.582770347595215, + "learning_rate": 1.5574051827698645e-06, + "loss": 0.3508, + "step": 11049 + }, + { + "epoch": 1.9, + "grad_norm": 12.885123252868652, + "learning_rate": 1.5548309593272695e-06, + "loss": 0.4519, + "step": 11050 + }, + { + "epoch": 1.9, + "grad_norm": 13.808953285217285, + "learning_rate": 1.552256735884675e-06, + "loss": 0.3399, + "step": 11051 + }, + { + "epoch": 1.9, + "grad_norm": 15.436074256896973, + "learning_rate": 1.54968251244208e-06, + "loss": 0.5282, + "step": 11052 + }, + { + "epoch": 1.9, + "grad_norm": 6.765783786773682, + "learning_rate": 1.547108288999485e-06, + "loss": 0.1748, + "step": 11053 + }, + { + "epoch": 1.9, + "grad_norm": 11.8151216506958, + "learning_rate": 1.5445340655568903e-06, + "loss": 0.3572, + "step": 11054 + }, + { + "epoch": 1.9, + "grad_norm": 11.112709045410156, + "learning_rate": 1.5419598421142957e-06, + "loss": 0.2839, + "step": 11055 + }, + { + "epoch": 1.9, + "grad_norm": 9.86545181274414, + "learning_rate": 1.539385618671701e-06, + "loss": 0.3148, + "step": 11056 + }, + { + "epoch": 1.9, + "grad_norm": 8.274297714233398, + "learning_rate": 1.5368113952291059e-06, + "loss": 0.2154, + "step": 11057 + }, + { + "epoch": 1.9, + "grad_norm": 9.033451080322266, + "learning_rate": 1.534237171786511e-06, + "loss": 0.3413, + "step": 11058 + }, + { + "epoch": 1.9, + "grad_norm": 11.844738006591797, + "learning_rate": 1.5316629483439163e-06, + "loss": 0.3977, + "step": 11059 + }, + { + "epoch": 1.9, + "grad_norm": 10.7937650680542, + "learning_rate": 1.5290887249013215e-06, + "loss": 0.3504, + "step": 11060 + }, + { + "epoch": 1.9, + "grad_norm": 13.066435813903809, + "learning_rate": 1.5265145014587267e-06, + "loss": 0.4311, + "step": 11061 + }, + { + "epoch": 1.9, + "grad_norm": 9.249638557434082, + "learning_rate": 1.5239402780161319e-06, + "loss": 0.3725, + "step": 11062 + }, + { + "epoch": 1.9, + "grad_norm": 9.696634292602539, + "learning_rate": 1.5213660545735369e-06, + "loss": 0.3621, + "step": 11063 + }, + { + "epoch": 1.9, + "grad_norm": 11.671460151672363, + "learning_rate": 1.5187918311309423e-06, + "loss": 0.3871, + "step": 11064 + }, + { + "epoch": 1.9, + "grad_norm": 9.716444969177246, + "learning_rate": 1.5162176076883475e-06, + "loss": 0.3548, + "step": 11065 + }, + { + "epoch": 1.9, + "grad_norm": 12.064604759216309, + "learning_rate": 1.5136433842457525e-06, + "loss": 0.5071, + "step": 11066 + }, + { + "epoch": 1.9, + "grad_norm": 8.194353103637695, + "learning_rate": 1.5110691608031577e-06, + "loss": 0.5182, + "step": 11067 + }, + { + "epoch": 1.9, + "grad_norm": 9.161458969116211, + "learning_rate": 1.508494937360563e-06, + "loss": 0.4692, + "step": 11068 + }, + { + "epoch": 1.9, + "grad_norm": 8.44462776184082, + "learning_rate": 1.5059207139179683e-06, + "loss": 0.2873, + "step": 11069 + }, + { + "epoch": 1.9, + "grad_norm": 13.861928939819336, + "learning_rate": 1.5033464904753733e-06, + "loss": 0.4623, + "step": 11070 + }, + { + "epoch": 1.9, + "grad_norm": 14.47918701171875, + "learning_rate": 1.5007722670327785e-06, + "loss": 0.3618, + "step": 11071 + }, + { + "epoch": 1.9, + "grad_norm": 12.619352340698242, + "learning_rate": 1.4981980435901837e-06, + "loss": 0.3762, + "step": 11072 + }, + { + "epoch": 1.9, + "grad_norm": 12.443218231201172, + "learning_rate": 1.4956238201475889e-06, + "loss": 0.5267, + "step": 11073 + }, + { + "epoch": 1.9, + "grad_norm": 19.459564208984375, + "learning_rate": 1.493049596704994e-06, + "loss": 0.3748, + "step": 11074 + }, + { + "epoch": 1.9, + "grad_norm": 8.884991645812988, + "learning_rate": 1.4904753732623993e-06, + "loss": 0.2525, + "step": 11075 + }, + { + "epoch": 1.9, + "grad_norm": 11.193331718444824, + "learning_rate": 1.4879011498198042e-06, + "loss": 0.4248, + "step": 11076 + }, + { + "epoch": 1.9, + "grad_norm": 14.723840713500977, + "learning_rate": 1.4853269263772097e-06, + "loss": 0.353, + "step": 11077 + }, + { + "epoch": 1.9, + "grad_norm": 7.666188716888428, + "learning_rate": 1.4827527029346149e-06, + "loss": 0.2274, + "step": 11078 + }, + { + "epoch": 1.9, + "grad_norm": 8.302162170410156, + "learning_rate": 1.4801784794920198e-06, + "loss": 0.3173, + "step": 11079 + }, + { + "epoch": 1.9, + "grad_norm": 8.9813871383667, + "learning_rate": 1.477604256049425e-06, + "loss": 0.3018, + "step": 11080 + }, + { + "epoch": 1.9, + "grad_norm": 8.430132865905762, + "learning_rate": 1.4750300326068304e-06, + "loss": 0.3852, + "step": 11081 + }, + { + "epoch": 1.9, + "grad_norm": 7.488740921020508, + "learning_rate": 1.4724558091642354e-06, + "loss": 0.2931, + "step": 11082 + }, + { + "epoch": 1.9, + "grad_norm": 12.134601593017578, + "learning_rate": 1.4698815857216406e-06, + "loss": 0.426, + "step": 11083 + }, + { + "epoch": 1.9, + "grad_norm": 6.19254732131958, + "learning_rate": 1.4673073622790458e-06, + "loss": 0.2565, + "step": 11084 + }, + { + "epoch": 1.9, + "grad_norm": 11.47159481048584, + "learning_rate": 1.464733138836451e-06, + "loss": 0.3586, + "step": 11085 + }, + { + "epoch": 1.9, + "grad_norm": 9.662041664123535, + "learning_rate": 1.4621589153938562e-06, + "loss": 0.2908, + "step": 11086 + }, + { + "epoch": 1.9, + "grad_norm": 9.238934516906738, + "learning_rate": 1.4595846919512614e-06, + "loss": 0.3454, + "step": 11087 + }, + { + "epoch": 1.9, + "grad_norm": 13.97555923461914, + "learning_rate": 1.4570104685086666e-06, + "loss": 0.3494, + "step": 11088 + }, + { + "epoch": 1.9, + "grad_norm": 9.415300369262695, + "learning_rate": 1.4544362450660716e-06, + "loss": 0.2691, + "step": 11089 + }, + { + "epoch": 1.9, + "grad_norm": 10.385876655578613, + "learning_rate": 1.451862021623477e-06, + "loss": 0.2755, + "step": 11090 + }, + { + "epoch": 1.9, + "grad_norm": 14.479072570800781, + "learning_rate": 1.4492877981808822e-06, + "loss": 0.3505, + "step": 11091 + }, + { + "epoch": 1.9, + "grad_norm": 16.806800842285156, + "learning_rate": 1.4467135747382872e-06, + "loss": 0.5207, + "step": 11092 + }, + { + "epoch": 1.9, + "grad_norm": 10.432601928710938, + "learning_rate": 1.4441393512956924e-06, + "loss": 0.4147, + "step": 11093 + }, + { + "epoch": 1.9, + "grad_norm": 9.812528610229492, + "learning_rate": 1.4415651278530978e-06, + "loss": 0.2677, + "step": 11094 + }, + { + "epoch": 1.9, + "grad_norm": 12.20866870880127, + "learning_rate": 1.4389909044105028e-06, + "loss": 0.4399, + "step": 11095 + }, + { + "epoch": 1.9, + "grad_norm": 10.870978355407715, + "learning_rate": 1.436416680967908e-06, + "loss": 0.2924, + "step": 11096 + }, + { + "epoch": 1.9, + "grad_norm": 11.04464054107666, + "learning_rate": 1.4338424575253132e-06, + "loss": 0.2668, + "step": 11097 + }, + { + "epoch": 1.9, + "grad_norm": 7.724349498748779, + "learning_rate": 1.4312682340827184e-06, + "loss": 0.2505, + "step": 11098 + }, + { + "epoch": 1.9, + "grad_norm": 9.885076522827148, + "learning_rate": 1.4286940106401236e-06, + "loss": 0.3736, + "step": 11099 + }, + { + "epoch": 1.9, + "grad_norm": 11.8551607131958, + "learning_rate": 1.4261197871975288e-06, + "loss": 0.4088, + "step": 11100 + }, + { + "epoch": 1.91, + "grad_norm": 8.75053882598877, + "learning_rate": 1.423545563754934e-06, + "loss": 0.4597, + "step": 11101 + }, + { + "epoch": 1.91, + "grad_norm": 17.84624671936035, + "learning_rate": 1.420971340312339e-06, + "loss": 0.633, + "step": 11102 + }, + { + "epoch": 1.91, + "grad_norm": 10.965681076049805, + "learning_rate": 1.4183971168697444e-06, + "loss": 0.3649, + "step": 11103 + }, + { + "epoch": 1.91, + "grad_norm": 17.497034072875977, + "learning_rate": 1.4158228934271496e-06, + "loss": 0.3716, + "step": 11104 + }, + { + "epoch": 1.91, + "grad_norm": 9.71844482421875, + "learning_rate": 1.4132486699845546e-06, + "loss": 0.3703, + "step": 11105 + }, + { + "epoch": 1.91, + "grad_norm": 7.784379959106445, + "learning_rate": 1.4106744465419598e-06, + "loss": 0.2385, + "step": 11106 + }, + { + "epoch": 1.91, + "grad_norm": 8.765949249267578, + "learning_rate": 1.4081002230993652e-06, + "loss": 0.3182, + "step": 11107 + }, + { + "epoch": 1.91, + "grad_norm": 10.988414764404297, + "learning_rate": 1.4055259996567702e-06, + "loss": 0.3591, + "step": 11108 + }, + { + "epoch": 1.91, + "grad_norm": 13.155403137207031, + "learning_rate": 1.4029517762141754e-06, + "loss": 0.6233, + "step": 11109 + }, + { + "epoch": 1.91, + "grad_norm": 10.289401054382324, + "learning_rate": 1.4003775527715806e-06, + "loss": 0.2637, + "step": 11110 + }, + { + "epoch": 1.91, + "grad_norm": 7.276763439178467, + "learning_rate": 1.3978033293289858e-06, + "loss": 0.382, + "step": 11111 + }, + { + "epoch": 1.91, + "grad_norm": 9.864279747009277, + "learning_rate": 1.395229105886391e-06, + "loss": 0.2796, + "step": 11112 + }, + { + "epoch": 1.91, + "grad_norm": 11.87149715423584, + "learning_rate": 1.3926548824437962e-06, + "loss": 0.3887, + "step": 11113 + }, + { + "epoch": 1.91, + "grad_norm": 11.398486137390137, + "learning_rate": 1.3900806590012014e-06, + "loss": 0.3937, + "step": 11114 + }, + { + "epoch": 1.91, + "grad_norm": 11.466192245483398, + "learning_rate": 1.3875064355586064e-06, + "loss": 0.4295, + "step": 11115 + }, + { + "epoch": 1.91, + "grad_norm": 7.146299362182617, + "learning_rate": 1.3849322121160118e-06, + "loss": 0.2447, + "step": 11116 + }, + { + "epoch": 1.91, + "grad_norm": 13.072288513183594, + "learning_rate": 1.382357988673417e-06, + "loss": 0.578, + "step": 11117 + }, + { + "epoch": 1.91, + "grad_norm": 9.480597496032715, + "learning_rate": 1.379783765230822e-06, + "loss": 0.3693, + "step": 11118 + }, + { + "epoch": 1.91, + "grad_norm": 9.855088233947754, + "learning_rate": 1.3772095417882272e-06, + "loss": 0.3826, + "step": 11119 + }, + { + "epoch": 1.91, + "grad_norm": 16.57455062866211, + "learning_rate": 1.3746353183456326e-06, + "loss": 0.5271, + "step": 11120 + }, + { + "epoch": 1.91, + "grad_norm": 11.553418159484863, + "learning_rate": 1.3720610949030376e-06, + "loss": 0.4354, + "step": 11121 + }, + { + "epoch": 1.91, + "grad_norm": 19.02069091796875, + "learning_rate": 1.3694868714604428e-06, + "loss": 0.5303, + "step": 11122 + }, + { + "epoch": 1.91, + "grad_norm": 9.128046035766602, + "learning_rate": 1.366912648017848e-06, + "loss": 0.279, + "step": 11123 + }, + { + "epoch": 1.91, + "grad_norm": 11.339672088623047, + "learning_rate": 1.3643384245752532e-06, + "loss": 0.3368, + "step": 11124 + }, + { + "epoch": 1.91, + "grad_norm": 13.95443058013916, + "learning_rate": 1.3617642011326584e-06, + "loss": 0.5186, + "step": 11125 + }, + { + "epoch": 1.91, + "grad_norm": 8.590944290161133, + "learning_rate": 1.3591899776900636e-06, + "loss": 0.4422, + "step": 11126 + }, + { + "epoch": 1.91, + "grad_norm": 15.974928855895996, + "learning_rate": 1.3566157542474688e-06, + "loss": 0.4472, + "step": 11127 + }, + { + "epoch": 1.91, + "grad_norm": 9.079082489013672, + "learning_rate": 1.3540415308048737e-06, + "loss": 0.2701, + "step": 11128 + }, + { + "epoch": 1.91, + "grad_norm": 20.837074279785156, + "learning_rate": 1.3514673073622792e-06, + "loss": 0.36, + "step": 11129 + }, + { + "epoch": 1.91, + "grad_norm": 10.154749870300293, + "learning_rate": 1.3488930839196844e-06, + "loss": 0.2335, + "step": 11130 + }, + { + "epoch": 1.91, + "grad_norm": 8.293107032775879, + "learning_rate": 1.3463188604770893e-06, + "loss": 0.3503, + "step": 11131 + }, + { + "epoch": 1.91, + "grad_norm": 9.760516166687012, + "learning_rate": 1.3437446370344945e-06, + "loss": 0.4766, + "step": 11132 + }, + { + "epoch": 1.91, + "grad_norm": 9.617878913879395, + "learning_rate": 1.3411704135919e-06, + "loss": 0.3759, + "step": 11133 + }, + { + "epoch": 1.91, + "grad_norm": 8.617166519165039, + "learning_rate": 1.338596190149305e-06, + "loss": 0.3392, + "step": 11134 + }, + { + "epoch": 1.91, + "grad_norm": 12.61210823059082, + "learning_rate": 1.3360219667067101e-06, + "loss": 0.6003, + "step": 11135 + }, + { + "epoch": 1.91, + "grad_norm": 6.657895088195801, + "learning_rate": 1.3334477432641153e-06, + "loss": 0.2443, + "step": 11136 + }, + { + "epoch": 1.91, + "grad_norm": 10.559221267700195, + "learning_rate": 1.3308735198215205e-06, + "loss": 0.3253, + "step": 11137 + }, + { + "epoch": 1.91, + "grad_norm": 8.400186538696289, + "learning_rate": 1.3282992963789257e-06, + "loss": 0.158, + "step": 11138 + }, + { + "epoch": 1.91, + "grad_norm": 9.523914337158203, + "learning_rate": 1.325725072936331e-06, + "loss": 0.4054, + "step": 11139 + }, + { + "epoch": 1.91, + "grad_norm": 11.095930099487305, + "learning_rate": 1.3231508494937361e-06, + "loss": 0.2519, + "step": 11140 + }, + { + "epoch": 1.91, + "grad_norm": 8.52552318572998, + "learning_rate": 1.3205766260511411e-06, + "loss": 0.3217, + "step": 11141 + }, + { + "epoch": 1.91, + "grad_norm": 7.015631675720215, + "learning_rate": 1.3180024026085465e-06, + "loss": 0.3141, + "step": 11142 + }, + { + "epoch": 1.91, + "grad_norm": 7.959336280822754, + "learning_rate": 1.3154281791659517e-06, + "loss": 0.2927, + "step": 11143 + }, + { + "epoch": 1.91, + "grad_norm": 11.793444633483887, + "learning_rate": 1.3128539557233567e-06, + "loss": 0.4361, + "step": 11144 + }, + { + "epoch": 1.91, + "grad_norm": 9.20043659210205, + "learning_rate": 1.310279732280762e-06, + "loss": 0.2795, + "step": 11145 + }, + { + "epoch": 1.91, + "grad_norm": 12.522153854370117, + "learning_rate": 1.3077055088381673e-06, + "loss": 0.3831, + "step": 11146 + }, + { + "epoch": 1.91, + "grad_norm": 11.44690227508545, + "learning_rate": 1.3051312853955723e-06, + "loss": 0.239, + "step": 11147 + }, + { + "epoch": 1.91, + "grad_norm": 9.916020393371582, + "learning_rate": 1.3025570619529775e-06, + "loss": 0.2487, + "step": 11148 + }, + { + "epoch": 1.91, + "grad_norm": 17.011587142944336, + "learning_rate": 1.2999828385103827e-06, + "loss": 0.5096, + "step": 11149 + }, + { + "epoch": 1.91, + "grad_norm": 10.361563682556152, + "learning_rate": 1.297408615067788e-06, + "loss": 0.4364, + "step": 11150 + }, + { + "epoch": 1.91, + "grad_norm": 10.001625061035156, + "learning_rate": 1.2948343916251931e-06, + "loss": 0.2891, + "step": 11151 + }, + { + "epoch": 1.91, + "grad_norm": 10.219432830810547, + "learning_rate": 1.2922601681825983e-06, + "loss": 0.3015, + "step": 11152 + }, + { + "epoch": 1.91, + "grad_norm": 9.579834938049316, + "learning_rate": 1.2896859447400035e-06, + "loss": 0.2665, + "step": 11153 + }, + { + "epoch": 1.91, + "grad_norm": 12.218266487121582, + "learning_rate": 1.2871117212974085e-06, + "loss": 0.3915, + "step": 11154 + }, + { + "epoch": 1.91, + "grad_norm": 7.835561752319336, + "learning_rate": 1.284537497854814e-06, + "loss": 0.2177, + "step": 11155 + }, + { + "epoch": 1.91, + "grad_norm": 11.71733283996582, + "learning_rate": 1.2819632744122191e-06, + "loss": 0.3994, + "step": 11156 + }, + { + "epoch": 1.91, + "grad_norm": 9.122781753540039, + "learning_rate": 1.279389050969624e-06, + "loss": 0.2778, + "step": 11157 + }, + { + "epoch": 1.91, + "grad_norm": 13.859009742736816, + "learning_rate": 1.2768148275270293e-06, + "loss": 0.5948, + "step": 11158 + }, + { + "epoch": 1.92, + "grad_norm": 8.164999961853027, + "learning_rate": 1.2742406040844347e-06, + "loss": 0.2499, + "step": 11159 + }, + { + "epoch": 1.92, + "grad_norm": 13.167304039001465, + "learning_rate": 1.2716663806418397e-06, + "loss": 0.4165, + "step": 11160 + }, + { + "epoch": 1.92, + "grad_norm": 10.132265090942383, + "learning_rate": 1.2690921571992449e-06, + "loss": 0.2645, + "step": 11161 + }, + { + "epoch": 1.92, + "grad_norm": 10.252290725708008, + "learning_rate": 1.26651793375665e-06, + "loss": 0.3353, + "step": 11162 + }, + { + "epoch": 1.92, + "grad_norm": 11.0601806640625, + "learning_rate": 1.2639437103140553e-06, + "loss": 0.4035, + "step": 11163 + }, + { + "epoch": 1.92, + "grad_norm": 14.308443069458008, + "learning_rate": 1.2613694868714605e-06, + "loss": 0.4274, + "step": 11164 + }, + { + "epoch": 1.92, + "grad_norm": 11.666142463684082, + "learning_rate": 1.2587952634288657e-06, + "loss": 0.611, + "step": 11165 + }, + { + "epoch": 1.92, + "grad_norm": 11.5977201461792, + "learning_rate": 1.2562210399862709e-06, + "loss": 0.3677, + "step": 11166 + }, + { + "epoch": 1.92, + "grad_norm": 10.810057640075684, + "learning_rate": 1.2536468165436759e-06, + "loss": 0.3956, + "step": 11167 + }, + { + "epoch": 1.92, + "grad_norm": 12.103277206420898, + "learning_rate": 1.2510725931010813e-06, + "loss": 0.2853, + "step": 11168 + }, + { + "epoch": 1.92, + "grad_norm": 6.228283882141113, + "learning_rate": 1.2484983696584865e-06, + "loss": 0.2452, + "step": 11169 + }, + { + "epoch": 1.92, + "grad_norm": 9.345582962036133, + "learning_rate": 1.2459241462158915e-06, + "loss": 0.3881, + "step": 11170 + }, + { + "epoch": 1.92, + "grad_norm": 11.230448722839355, + "learning_rate": 1.2433499227732967e-06, + "loss": 0.2997, + "step": 11171 + }, + { + "epoch": 1.92, + "grad_norm": 8.033206939697266, + "learning_rate": 1.240775699330702e-06, + "loss": 0.333, + "step": 11172 + }, + { + "epoch": 1.92, + "grad_norm": 12.548929214477539, + "learning_rate": 1.238201475888107e-06, + "loss": 0.3719, + "step": 11173 + }, + { + "epoch": 1.92, + "grad_norm": 12.063002586364746, + "learning_rate": 1.2356272524455123e-06, + "loss": 0.374, + "step": 11174 + }, + { + "epoch": 1.92, + "grad_norm": 15.224303245544434, + "learning_rate": 1.2330530290029175e-06, + "loss": 0.4004, + "step": 11175 + }, + { + "epoch": 1.92, + "grad_norm": 10.494648933410645, + "learning_rate": 1.2304788055603227e-06, + "loss": 0.4149, + "step": 11176 + }, + { + "epoch": 1.92, + "grad_norm": 10.038595199584961, + "learning_rate": 1.2279045821177279e-06, + "loss": 0.2261, + "step": 11177 + }, + { + "epoch": 1.92, + "grad_norm": 15.000349998474121, + "learning_rate": 1.225330358675133e-06, + "loss": 0.4082, + "step": 11178 + }, + { + "epoch": 1.92, + "grad_norm": 11.931147575378418, + "learning_rate": 1.2227561352325383e-06, + "loss": 0.2104, + "step": 11179 + }, + { + "epoch": 1.92, + "grad_norm": 7.060903549194336, + "learning_rate": 1.2201819117899433e-06, + "loss": 0.2106, + "step": 11180 + }, + { + "epoch": 1.92, + "grad_norm": 11.333086013793945, + "learning_rate": 1.2176076883473487e-06, + "loss": 0.382, + "step": 11181 + }, + { + "epoch": 1.92, + "grad_norm": 8.011672973632812, + "learning_rate": 1.2150334649047539e-06, + "loss": 0.3036, + "step": 11182 + }, + { + "epoch": 1.92, + "grad_norm": 9.26968002319336, + "learning_rate": 1.2124592414621588e-06, + "loss": 0.294, + "step": 11183 + }, + { + "epoch": 1.92, + "grad_norm": 10.636792182922363, + "learning_rate": 1.209885018019564e-06, + "loss": 0.4169, + "step": 11184 + }, + { + "epoch": 1.92, + "grad_norm": 10.713167190551758, + "learning_rate": 1.2073107945769695e-06, + "loss": 0.3351, + "step": 11185 + }, + { + "epoch": 1.92, + "grad_norm": 11.600278854370117, + "learning_rate": 1.2047365711343744e-06, + "loss": 0.4731, + "step": 11186 + }, + { + "epoch": 1.92, + "grad_norm": 10.932683944702148, + "learning_rate": 1.2021623476917796e-06, + "loss": 0.3507, + "step": 11187 + }, + { + "epoch": 1.92, + "grad_norm": 8.079802513122559, + "learning_rate": 1.1995881242491848e-06, + "loss": 0.3308, + "step": 11188 + }, + { + "epoch": 1.92, + "grad_norm": 9.066193580627441, + "learning_rate": 1.19701390080659e-06, + "loss": 0.4347, + "step": 11189 + }, + { + "epoch": 1.92, + "grad_norm": 12.60873031616211, + "learning_rate": 1.1944396773639952e-06, + "loss": 0.4299, + "step": 11190 + }, + { + "epoch": 1.92, + "grad_norm": 9.062819480895996, + "learning_rate": 1.1918654539214004e-06, + "loss": 0.3523, + "step": 11191 + }, + { + "epoch": 1.92, + "grad_norm": 11.772015571594238, + "learning_rate": 1.1892912304788056e-06, + "loss": 0.2923, + "step": 11192 + }, + { + "epoch": 1.92, + "grad_norm": 8.713652610778809, + "learning_rate": 1.1867170070362106e-06, + "loss": 0.3126, + "step": 11193 + }, + { + "epoch": 1.92, + "grad_norm": 8.670111656188965, + "learning_rate": 1.184142783593616e-06, + "loss": 0.3533, + "step": 11194 + }, + { + "epoch": 1.92, + "grad_norm": 7.896197319030762, + "learning_rate": 1.1815685601510212e-06, + "loss": 0.4245, + "step": 11195 + }, + { + "epoch": 1.92, + "grad_norm": 10.288776397705078, + "learning_rate": 1.1789943367084262e-06, + "loss": 0.4578, + "step": 11196 + }, + { + "epoch": 1.92, + "grad_norm": 13.529693603515625, + "learning_rate": 1.1764201132658314e-06, + "loss": 0.3531, + "step": 11197 + }, + { + "epoch": 1.92, + "grad_norm": 10.612212181091309, + "learning_rate": 1.1738458898232368e-06, + "loss": 0.2996, + "step": 11198 + }, + { + "epoch": 1.92, + "grad_norm": 11.519620895385742, + "learning_rate": 1.1712716663806418e-06, + "loss": 0.2407, + "step": 11199 + }, + { + "epoch": 1.92, + "grad_norm": 7.2696943283081055, + "learning_rate": 1.168697442938047e-06, + "loss": 0.236, + "step": 11200 + }, + { + "epoch": 1.92, + "grad_norm": 8.432975769042969, + "learning_rate": 1.1661232194954522e-06, + "loss": 0.2414, + "step": 11201 + }, + { + "epoch": 1.92, + "grad_norm": 9.555484771728516, + "learning_rate": 1.1635489960528574e-06, + "loss": 0.4583, + "step": 11202 + }, + { + "epoch": 1.92, + "grad_norm": 10.341927528381348, + "learning_rate": 1.1609747726102626e-06, + "loss": 0.3313, + "step": 11203 + }, + { + "epoch": 1.92, + "grad_norm": 10.018180847167969, + "learning_rate": 1.1584005491676678e-06, + "loss": 0.423, + "step": 11204 + }, + { + "epoch": 1.92, + "grad_norm": 7.9628214836120605, + "learning_rate": 1.155826325725073e-06, + "loss": 0.3947, + "step": 11205 + }, + { + "epoch": 1.92, + "grad_norm": 8.402603149414062, + "learning_rate": 1.153252102282478e-06, + "loss": 0.2585, + "step": 11206 + }, + { + "epoch": 1.92, + "grad_norm": 14.141619682312012, + "learning_rate": 1.1506778788398834e-06, + "loss": 0.3434, + "step": 11207 + }, + { + "epoch": 1.92, + "grad_norm": 7.781804084777832, + "learning_rate": 1.1481036553972886e-06, + "loss": 0.2694, + "step": 11208 + }, + { + "epoch": 1.92, + "grad_norm": 9.032614707946777, + "learning_rate": 1.1455294319546936e-06, + "loss": 0.4094, + "step": 11209 + }, + { + "epoch": 1.92, + "grad_norm": 9.427877426147461, + "learning_rate": 1.1429552085120988e-06, + "loss": 0.2521, + "step": 11210 + }, + { + "epoch": 1.92, + "grad_norm": 9.758294105529785, + "learning_rate": 1.1403809850695042e-06, + "loss": 0.3904, + "step": 11211 + }, + { + "epoch": 1.92, + "grad_norm": 8.965252876281738, + "learning_rate": 1.1378067616269092e-06, + "loss": 0.3828, + "step": 11212 + }, + { + "epoch": 1.92, + "grad_norm": 9.592247009277344, + "learning_rate": 1.1352325381843144e-06, + "loss": 0.3502, + "step": 11213 + }, + { + "epoch": 1.92, + "grad_norm": 12.019633293151855, + "learning_rate": 1.1326583147417196e-06, + "loss": 0.3179, + "step": 11214 + }, + { + "epoch": 1.92, + "grad_norm": 9.783926963806152, + "learning_rate": 1.1300840912991248e-06, + "loss": 0.3423, + "step": 11215 + }, + { + "epoch": 1.92, + "grad_norm": 9.411629676818848, + "learning_rate": 1.12750986785653e-06, + "loss": 0.3384, + "step": 11216 + }, + { + "epoch": 1.93, + "grad_norm": 9.852890014648438, + "learning_rate": 1.1249356444139352e-06, + "loss": 0.3336, + "step": 11217 + }, + { + "epoch": 1.93, + "grad_norm": 11.638039588928223, + "learning_rate": 1.1223614209713404e-06, + "loss": 0.3948, + "step": 11218 + }, + { + "epoch": 1.93, + "grad_norm": 15.062429428100586, + "learning_rate": 1.1197871975287454e-06, + "loss": 0.444, + "step": 11219 + }, + { + "epoch": 1.93, + "grad_norm": 11.246325492858887, + "learning_rate": 1.1172129740861508e-06, + "loss": 0.3967, + "step": 11220 + }, + { + "epoch": 1.93, + "grad_norm": 15.875120162963867, + "learning_rate": 1.114638750643556e-06, + "loss": 0.4064, + "step": 11221 + }, + { + "epoch": 1.93, + "grad_norm": 8.381048202514648, + "learning_rate": 1.112064527200961e-06, + "loss": 0.2384, + "step": 11222 + }, + { + "epoch": 1.93, + "grad_norm": 11.927454948425293, + "learning_rate": 1.1094903037583662e-06, + "loss": 0.3917, + "step": 11223 + }, + { + "epoch": 1.93, + "grad_norm": 10.526918411254883, + "learning_rate": 1.1069160803157716e-06, + "loss": 0.3228, + "step": 11224 + }, + { + "epoch": 1.93, + "grad_norm": 15.221765518188477, + "learning_rate": 1.1043418568731766e-06, + "loss": 0.3269, + "step": 11225 + }, + { + "epoch": 1.93, + "grad_norm": 8.908487319946289, + "learning_rate": 1.1017676334305818e-06, + "loss": 0.2706, + "step": 11226 + }, + { + "epoch": 1.93, + "grad_norm": 12.794954299926758, + "learning_rate": 1.099193409987987e-06, + "loss": 0.205, + "step": 11227 + }, + { + "epoch": 1.93, + "grad_norm": 10.273452758789062, + "learning_rate": 1.0966191865453922e-06, + "loss": 0.4591, + "step": 11228 + }, + { + "epoch": 1.93, + "grad_norm": 11.273874282836914, + "learning_rate": 1.0940449631027974e-06, + "loss": 0.4452, + "step": 11229 + }, + { + "epoch": 1.93, + "grad_norm": 8.868019104003906, + "learning_rate": 1.0914707396602026e-06, + "loss": 0.3404, + "step": 11230 + }, + { + "epoch": 1.93, + "grad_norm": 10.350021362304688, + "learning_rate": 1.0888965162176078e-06, + "loss": 0.3758, + "step": 11231 + }, + { + "epoch": 1.93, + "grad_norm": 12.006874084472656, + "learning_rate": 1.0863222927750128e-06, + "loss": 0.4104, + "step": 11232 + }, + { + "epoch": 1.93, + "grad_norm": 8.243325233459473, + "learning_rate": 1.0837480693324182e-06, + "loss": 0.3459, + "step": 11233 + }, + { + "epoch": 1.93, + "grad_norm": 10.974726676940918, + "learning_rate": 1.0811738458898234e-06, + "loss": 0.3966, + "step": 11234 + }, + { + "epoch": 1.93, + "grad_norm": 12.79494571685791, + "learning_rate": 1.0785996224472284e-06, + "loss": 0.4917, + "step": 11235 + }, + { + "epoch": 1.93, + "grad_norm": 15.913002014160156, + "learning_rate": 1.0760253990046336e-06, + "loss": 0.5121, + "step": 11236 + }, + { + "epoch": 1.93, + "grad_norm": 11.53303337097168, + "learning_rate": 1.073451175562039e-06, + "loss": 0.3243, + "step": 11237 + }, + { + "epoch": 1.93, + "grad_norm": 21.39658546447754, + "learning_rate": 1.070876952119444e-06, + "loss": 0.269, + "step": 11238 + }, + { + "epoch": 1.93, + "grad_norm": 14.369318008422852, + "learning_rate": 1.0683027286768492e-06, + "loss": 0.3039, + "step": 11239 + }, + { + "epoch": 1.93, + "grad_norm": 10.533215522766113, + "learning_rate": 1.0657285052342544e-06, + "loss": 0.4033, + "step": 11240 + }, + { + "epoch": 1.93, + "grad_norm": 11.617938041687012, + "learning_rate": 1.0631542817916596e-06, + "loss": 0.4906, + "step": 11241 + }, + { + "epoch": 1.93, + "grad_norm": 10.701202392578125, + "learning_rate": 1.0605800583490647e-06, + "loss": 0.349, + "step": 11242 + }, + { + "epoch": 1.93, + "grad_norm": 6.699152946472168, + "learning_rate": 1.05800583490647e-06, + "loss": 0.2965, + "step": 11243 + }, + { + "epoch": 1.93, + "grad_norm": 18.104108810424805, + "learning_rate": 1.0554316114638751e-06, + "loss": 0.4275, + "step": 11244 + }, + { + "epoch": 1.93, + "grad_norm": 9.235733032226562, + "learning_rate": 1.0528573880212801e-06, + "loss": 0.4241, + "step": 11245 + }, + { + "epoch": 1.93, + "grad_norm": 10.381830215454102, + "learning_rate": 1.0502831645786855e-06, + "loss": 0.3633, + "step": 11246 + }, + { + "epoch": 1.93, + "grad_norm": 8.662819862365723, + "learning_rate": 1.0477089411360907e-06, + "loss": 0.444, + "step": 11247 + }, + { + "epoch": 1.93, + "grad_norm": 14.038917541503906, + "learning_rate": 1.0451347176934957e-06, + "loss": 0.3129, + "step": 11248 + }, + { + "epoch": 1.93, + "grad_norm": 8.602962493896484, + "learning_rate": 1.042560494250901e-06, + "loss": 0.2215, + "step": 11249 + }, + { + "epoch": 1.93, + "grad_norm": 11.234943389892578, + "learning_rate": 1.0399862708083063e-06, + "loss": 0.4591, + "step": 11250 + }, + { + "epoch": 1.93, + "grad_norm": 12.435638427734375, + "learning_rate": 1.0374120473657113e-06, + "loss": 0.3963, + "step": 11251 + }, + { + "epoch": 1.93, + "grad_norm": 10.230363845825195, + "learning_rate": 1.0348378239231165e-06, + "loss": 0.3475, + "step": 11252 + }, + { + "epoch": 1.93, + "grad_norm": 6.6542887687683105, + "learning_rate": 1.0322636004805217e-06, + "loss": 0.2066, + "step": 11253 + }, + { + "epoch": 1.93, + "grad_norm": 14.208277702331543, + "learning_rate": 1.029689377037927e-06, + "loss": 0.587, + "step": 11254 + }, + { + "epoch": 1.93, + "grad_norm": 9.624720573425293, + "learning_rate": 1.0271151535953321e-06, + "loss": 0.4105, + "step": 11255 + }, + { + "epoch": 1.93, + "grad_norm": 5.926789283752441, + "learning_rate": 1.0245409301527373e-06, + "loss": 0.2069, + "step": 11256 + }, + { + "epoch": 1.93, + "grad_norm": 8.228850364685059, + "learning_rate": 1.0219667067101423e-06, + "loss": 0.2753, + "step": 11257 + }, + { + "epoch": 1.93, + "grad_norm": 13.834033966064453, + "learning_rate": 1.0193924832675475e-06, + "loss": 0.5825, + "step": 11258 + }, + { + "epoch": 1.93, + "grad_norm": 10.35616397857666, + "learning_rate": 1.016818259824953e-06, + "loss": 0.2947, + "step": 11259 + }, + { + "epoch": 1.93, + "grad_norm": 8.541860580444336, + "learning_rate": 1.0142440363823581e-06, + "loss": 0.3636, + "step": 11260 + }, + { + "epoch": 1.93, + "grad_norm": 8.137368202209473, + "learning_rate": 1.0116698129397631e-06, + "loss": 0.3446, + "step": 11261 + }, + { + "epoch": 1.93, + "grad_norm": 10.032513618469238, + "learning_rate": 1.0090955894971683e-06, + "loss": 0.3341, + "step": 11262 + }, + { + "epoch": 1.93, + "grad_norm": 10.584372520446777, + "learning_rate": 1.0065213660545737e-06, + "loss": 0.4229, + "step": 11263 + }, + { + "epoch": 1.93, + "grad_norm": 9.587027549743652, + "learning_rate": 1.0039471426119787e-06, + "loss": 0.3627, + "step": 11264 + }, + { + "epoch": 1.93, + "grad_norm": 10.200761795043945, + "learning_rate": 1.001372919169384e-06, + "loss": 0.3423, + "step": 11265 + }, + { + "epoch": 1.93, + "grad_norm": 11.629083633422852, + "learning_rate": 9.98798695726789e-07, + "loss": 0.4505, + "step": 11266 + }, + { + "epoch": 1.93, + "grad_norm": 9.095871925354004, + "learning_rate": 9.962244722841943e-07, + "loss": 0.3581, + "step": 11267 + }, + { + "epoch": 1.93, + "grad_norm": 7.397126197814941, + "learning_rate": 9.936502488415995e-07, + "loss": 0.2732, + "step": 11268 + }, + { + "epoch": 1.93, + "grad_norm": 8.796599388122559, + "learning_rate": 9.910760253990047e-07, + "loss": 0.2283, + "step": 11269 + }, + { + "epoch": 1.93, + "grad_norm": 12.334604263305664, + "learning_rate": 9.885018019564097e-07, + "loss": 0.3054, + "step": 11270 + }, + { + "epoch": 1.93, + "grad_norm": 11.330229759216309, + "learning_rate": 9.859275785138149e-07, + "loss": 0.3471, + "step": 11271 + }, + { + "epoch": 1.93, + "grad_norm": 11.068267822265625, + "learning_rate": 9.833533550712203e-07, + "loss": 0.3788, + "step": 11272 + }, + { + "epoch": 1.93, + "grad_norm": 7.766244411468506, + "learning_rate": 9.807791316286255e-07, + "loss": 0.2363, + "step": 11273 + }, + { + "epoch": 1.93, + "grad_norm": 9.55380630493164, + "learning_rate": 9.782049081860305e-07, + "loss": 0.4018, + "step": 11274 + }, + { + "epoch": 1.93, + "grad_norm": 12.42378044128418, + "learning_rate": 9.756306847434357e-07, + "loss": 0.3952, + "step": 11275 + }, + { + "epoch": 1.94, + "grad_norm": 8.504379272460938, + "learning_rate": 9.73056461300841e-07, + "loss": 0.3481, + "step": 11276 + }, + { + "epoch": 1.94, + "grad_norm": 15.353805541992188, + "learning_rate": 9.70482237858246e-07, + "loss": 0.5317, + "step": 11277 + }, + { + "epoch": 1.94, + "grad_norm": 11.899829864501953, + "learning_rate": 9.679080144156513e-07, + "loss": 0.3624, + "step": 11278 + }, + { + "epoch": 1.94, + "grad_norm": 6.905879974365234, + "learning_rate": 9.653337909730565e-07, + "loss": 0.4465, + "step": 11279 + }, + { + "epoch": 1.94, + "grad_norm": 11.24138355255127, + "learning_rate": 9.627595675304617e-07, + "loss": 0.3696, + "step": 11280 + }, + { + "epoch": 1.94, + "grad_norm": 13.197113990783691, + "learning_rate": 9.601853440878669e-07, + "loss": 0.3602, + "step": 11281 + }, + { + "epoch": 1.94, + "grad_norm": 13.318521499633789, + "learning_rate": 9.57611120645272e-07, + "loss": 0.5945, + "step": 11282 + }, + { + "epoch": 1.94, + "grad_norm": 10.276877403259277, + "learning_rate": 9.55036897202677e-07, + "loss": 0.2997, + "step": 11283 + }, + { + "epoch": 1.94, + "grad_norm": 11.022278785705566, + "learning_rate": 9.524626737600824e-07, + "loss": 0.3282, + "step": 11284 + }, + { + "epoch": 1.94, + "grad_norm": 10.468074798583984, + "learning_rate": 9.498884503174876e-07, + "loss": 0.318, + "step": 11285 + }, + { + "epoch": 1.94, + "grad_norm": 8.843952178955078, + "learning_rate": 9.473142268748929e-07, + "loss": 0.3547, + "step": 11286 + }, + { + "epoch": 1.94, + "grad_norm": 9.174036026000977, + "learning_rate": 9.447400034322979e-07, + "loss": 0.3074, + "step": 11287 + }, + { + "epoch": 1.94, + "grad_norm": 10.60664176940918, + "learning_rate": 9.421657799897032e-07, + "loss": 0.4209, + "step": 11288 + }, + { + "epoch": 1.94, + "grad_norm": 10.336174011230469, + "learning_rate": 9.395915565471084e-07, + "loss": 0.423, + "step": 11289 + }, + { + "epoch": 1.94, + "grad_norm": 10.120474815368652, + "learning_rate": 9.370173331045136e-07, + "loss": 0.3692, + "step": 11290 + }, + { + "epoch": 1.94, + "grad_norm": 12.143680572509766, + "learning_rate": 9.344431096619187e-07, + "loss": 0.3587, + "step": 11291 + }, + { + "epoch": 1.94, + "grad_norm": 11.772634506225586, + "learning_rate": 9.318688862193239e-07, + "loss": 0.3199, + "step": 11292 + }, + { + "epoch": 1.94, + "grad_norm": 12.608824729919434, + "learning_rate": 9.292946627767291e-07, + "loss": 0.3796, + "step": 11293 + }, + { + "epoch": 1.94, + "grad_norm": 10.068592071533203, + "learning_rate": 9.267204393341343e-07, + "loss": 0.3352, + "step": 11294 + }, + { + "epoch": 1.94, + "grad_norm": 9.383529663085938, + "learning_rate": 9.241462158915395e-07, + "loss": 0.318, + "step": 11295 + }, + { + "epoch": 1.94, + "grad_norm": 11.84504508972168, + "learning_rate": 9.215719924489445e-07, + "loss": 0.3878, + "step": 11296 + }, + { + "epoch": 1.94, + "grad_norm": 7.539249897003174, + "learning_rate": 9.189977690063497e-07, + "loss": 0.2165, + "step": 11297 + }, + { + "epoch": 1.94, + "grad_norm": 11.671634674072266, + "learning_rate": 9.164235455637549e-07, + "loss": 0.3436, + "step": 11298 + }, + { + "epoch": 1.94, + "grad_norm": 9.809889793395996, + "learning_rate": 9.138493221211601e-07, + "loss": 0.4075, + "step": 11299 + }, + { + "epoch": 1.94, + "grad_norm": 8.600805282592773, + "learning_rate": 9.112750986785652e-07, + "loss": 0.3933, + "step": 11300 + }, + { + "epoch": 1.94, + "grad_norm": 10.200301170349121, + "learning_rate": 9.087008752359705e-07, + "loss": 0.3252, + "step": 11301 + }, + { + "epoch": 1.94, + "grad_norm": 10.493424415588379, + "learning_rate": 9.061266517933756e-07, + "loss": 0.3976, + "step": 11302 + }, + { + "epoch": 1.94, + "grad_norm": 9.592477798461914, + "learning_rate": 9.035524283507809e-07, + "loss": 0.4543, + "step": 11303 + }, + { + "epoch": 1.94, + "grad_norm": 12.08369255065918, + "learning_rate": 9.00978204908186e-07, + "loss": 0.3364, + "step": 11304 + }, + { + "epoch": 1.94, + "grad_norm": 14.590408325195312, + "learning_rate": 8.984039814655912e-07, + "loss": 0.5278, + "step": 11305 + }, + { + "epoch": 1.94, + "grad_norm": 8.600781440734863, + "learning_rate": 8.958297580229964e-07, + "loss": 0.2189, + "step": 11306 + }, + { + "epoch": 1.94, + "grad_norm": 10.556259155273438, + "learning_rate": 8.932555345804016e-07, + "loss": 0.2317, + "step": 11307 + }, + { + "epoch": 1.94, + "grad_norm": 8.474226951599121, + "learning_rate": 8.906813111378068e-07, + "loss": 0.3711, + "step": 11308 + }, + { + "epoch": 1.94, + "grad_norm": 8.599727630615234, + "learning_rate": 8.881070876952119e-07, + "loss": 0.3542, + "step": 11309 + }, + { + "epoch": 1.94, + "grad_norm": 8.618650436401367, + "learning_rate": 8.855328642526171e-07, + "loss": 0.3074, + "step": 11310 + }, + { + "epoch": 1.94, + "grad_norm": 8.354700088500977, + "learning_rate": 8.829586408100223e-07, + "loss": 0.3029, + "step": 11311 + }, + { + "epoch": 1.94, + "grad_norm": 7.634174346923828, + "learning_rate": 8.803844173674275e-07, + "loss": 0.336, + "step": 11312 + }, + { + "epoch": 1.94, + "grad_norm": 8.879085540771484, + "learning_rate": 8.778101939248326e-07, + "loss": 0.3534, + "step": 11313 + }, + { + "epoch": 1.94, + "grad_norm": 11.613602638244629, + "learning_rate": 8.752359704822379e-07, + "loss": 0.3587, + "step": 11314 + }, + { + "epoch": 1.94, + "grad_norm": 7.624441146850586, + "learning_rate": 8.72661747039643e-07, + "loss": 0.268, + "step": 11315 + }, + { + "epoch": 1.94, + "grad_norm": 11.6231050491333, + "learning_rate": 8.700875235970483e-07, + "loss": 0.2694, + "step": 11316 + }, + { + "epoch": 1.94, + "grad_norm": 10.00921630859375, + "learning_rate": 8.675133001544534e-07, + "loss": 0.4135, + "step": 11317 + }, + { + "epoch": 1.94, + "grad_norm": 9.385370254516602, + "learning_rate": 8.649390767118586e-07, + "loss": 0.4183, + "step": 11318 + }, + { + "epoch": 1.94, + "grad_norm": 9.136366844177246, + "learning_rate": 8.623648532692638e-07, + "loss": 0.4017, + "step": 11319 + }, + { + "epoch": 1.94, + "grad_norm": 8.964530944824219, + "learning_rate": 8.59790629826669e-07, + "loss": 0.3124, + "step": 11320 + }, + { + "epoch": 1.94, + "grad_norm": 13.475838661193848, + "learning_rate": 8.572164063840742e-07, + "loss": 0.5261, + "step": 11321 + }, + { + "epoch": 1.94, + "grad_norm": 11.14635181427002, + "learning_rate": 8.546421829414794e-07, + "loss": 0.3229, + "step": 11322 + }, + { + "epoch": 1.94, + "grad_norm": 9.979029655456543, + "learning_rate": 8.520679594988845e-07, + "loss": 0.3312, + "step": 11323 + }, + { + "epoch": 1.94, + "grad_norm": 14.051238059997559, + "learning_rate": 8.494937360562897e-07, + "loss": 0.443, + "step": 11324 + }, + { + "epoch": 1.94, + "grad_norm": 8.99988842010498, + "learning_rate": 8.469195126136949e-07, + "loss": 0.3289, + "step": 11325 + }, + { + "epoch": 1.94, + "grad_norm": 10.240226745605469, + "learning_rate": 8.443452891711e-07, + "loss": 0.3315, + "step": 11326 + }, + { + "epoch": 1.94, + "grad_norm": 8.262712478637695, + "learning_rate": 8.417710657285053e-07, + "loss": 0.3172, + "step": 11327 + }, + { + "epoch": 1.94, + "grad_norm": 8.68488597869873, + "learning_rate": 8.391968422859104e-07, + "loss": 0.3532, + "step": 11328 + }, + { + "epoch": 1.94, + "grad_norm": 11.156920433044434, + "learning_rate": 8.366226188433157e-07, + "loss": 0.4471, + "step": 11329 + }, + { + "epoch": 1.94, + "grad_norm": 11.043469429016113, + "learning_rate": 8.340483954007208e-07, + "loss": 0.343, + "step": 11330 + }, + { + "epoch": 1.94, + "grad_norm": 9.056516647338867, + "learning_rate": 8.31474171958126e-07, + "loss": 0.2761, + "step": 11331 + }, + { + "epoch": 1.94, + "grad_norm": 8.607675552368164, + "learning_rate": 8.288999485155312e-07, + "loss": 0.3714, + "step": 11332 + }, + { + "epoch": 1.94, + "grad_norm": 10.879067420959473, + "learning_rate": 8.263257250729364e-07, + "loss": 0.3696, + "step": 11333 + }, + { + "epoch": 1.95, + "grad_norm": 12.252850532531738, + "learning_rate": 8.237515016303415e-07, + "loss": 0.6638, + "step": 11334 + }, + { + "epoch": 1.95, + "grad_norm": 9.101922988891602, + "learning_rate": 8.211772781877468e-07, + "loss": 0.326, + "step": 11335 + }, + { + "epoch": 1.95, + "grad_norm": 8.902658462524414, + "learning_rate": 8.186030547451519e-07, + "loss": 0.3754, + "step": 11336 + }, + { + "epoch": 1.95, + "grad_norm": 7.206715106964111, + "learning_rate": 8.160288313025571e-07, + "loss": 0.4128, + "step": 11337 + }, + { + "epoch": 1.95, + "grad_norm": 7.075551509857178, + "learning_rate": 8.134546078599623e-07, + "loss": 0.1763, + "step": 11338 + }, + { + "epoch": 1.95, + "grad_norm": 12.898223876953125, + "learning_rate": 8.108803844173674e-07, + "loss": 0.3561, + "step": 11339 + }, + { + "epoch": 1.95, + "grad_norm": 11.690729141235352, + "learning_rate": 8.083061609747727e-07, + "loss": 0.3557, + "step": 11340 + }, + { + "epoch": 1.95, + "grad_norm": 9.656914710998535, + "learning_rate": 8.057319375321778e-07, + "loss": 0.3916, + "step": 11341 + }, + { + "epoch": 1.95, + "grad_norm": 10.575813293457031, + "learning_rate": 8.031577140895831e-07, + "loss": 0.3707, + "step": 11342 + }, + { + "epoch": 1.95, + "grad_norm": 13.833656311035156, + "learning_rate": 8.005834906469882e-07, + "loss": 0.4552, + "step": 11343 + }, + { + "epoch": 1.95, + "grad_norm": 9.008463859558105, + "learning_rate": 7.980092672043934e-07, + "loss": 0.3894, + "step": 11344 + }, + { + "epoch": 1.95, + "grad_norm": 7.926922798156738, + "learning_rate": 7.954350437617986e-07, + "loss": 0.2971, + "step": 11345 + }, + { + "epoch": 1.95, + "grad_norm": 9.602967262268066, + "learning_rate": 7.928608203192038e-07, + "loss": 0.25, + "step": 11346 + }, + { + "epoch": 1.95, + "grad_norm": 10.169910430908203, + "learning_rate": 7.902865968766089e-07, + "loss": 0.3526, + "step": 11347 + }, + { + "epoch": 1.95, + "grad_norm": 11.690129280090332, + "learning_rate": 7.877123734340142e-07, + "loss": 0.3324, + "step": 11348 + }, + { + "epoch": 1.95, + "grad_norm": 10.213464736938477, + "learning_rate": 7.851381499914193e-07, + "loss": 0.2848, + "step": 11349 + }, + { + "epoch": 1.95, + "grad_norm": 10.605606079101562, + "learning_rate": 7.825639265488245e-07, + "loss": 0.4787, + "step": 11350 + }, + { + "epoch": 1.95, + "grad_norm": 14.166177749633789, + "learning_rate": 7.799897031062297e-07, + "loss": 0.5206, + "step": 11351 + }, + { + "epoch": 1.95, + "grad_norm": 9.196728706359863, + "learning_rate": 7.774154796636347e-07, + "loss": 0.2789, + "step": 11352 + }, + { + "epoch": 1.95, + "grad_norm": 11.934510231018066, + "learning_rate": 7.7484125622104e-07, + "loss": 0.4525, + "step": 11353 + }, + { + "epoch": 1.95, + "grad_norm": 8.63411808013916, + "learning_rate": 7.722670327784451e-07, + "loss": 0.2259, + "step": 11354 + }, + { + "epoch": 1.95, + "grad_norm": 9.784299850463867, + "learning_rate": 7.696928093358504e-07, + "loss": 0.28, + "step": 11355 + }, + { + "epoch": 1.95, + "grad_norm": 12.000056266784668, + "learning_rate": 7.671185858932555e-07, + "loss": 0.5209, + "step": 11356 + }, + { + "epoch": 1.95, + "grad_norm": 10.76236629486084, + "learning_rate": 7.645443624506607e-07, + "loss": 0.3218, + "step": 11357 + }, + { + "epoch": 1.95, + "grad_norm": 10.138952255249023, + "learning_rate": 7.619701390080659e-07, + "loss": 0.3785, + "step": 11358 + }, + { + "epoch": 1.95, + "grad_norm": 14.027183532714844, + "learning_rate": 7.593959155654711e-07, + "loss": 0.3568, + "step": 11359 + }, + { + "epoch": 1.95, + "grad_norm": 12.467564582824707, + "learning_rate": 7.568216921228762e-07, + "loss": 0.5007, + "step": 11360 + }, + { + "epoch": 1.95, + "grad_norm": 17.798078536987305, + "learning_rate": 7.542474686802815e-07, + "loss": 0.5194, + "step": 11361 + }, + { + "epoch": 1.95, + "grad_norm": 8.058542251586914, + "learning_rate": 7.516732452376866e-07, + "loss": 0.3352, + "step": 11362 + }, + { + "epoch": 1.95, + "grad_norm": 12.526508331298828, + "learning_rate": 7.490990217950918e-07, + "loss": 0.4279, + "step": 11363 + }, + { + "epoch": 1.95, + "grad_norm": 8.043108940124512, + "learning_rate": 7.46524798352497e-07, + "loss": 0.2351, + "step": 11364 + }, + { + "epoch": 1.95, + "grad_norm": 14.996644020080566, + "learning_rate": 7.439505749099021e-07, + "loss": 0.3735, + "step": 11365 + }, + { + "epoch": 1.95, + "grad_norm": 10.366191864013672, + "learning_rate": 7.413763514673074e-07, + "loss": 0.4054, + "step": 11366 + }, + { + "epoch": 1.95, + "grad_norm": 8.20408821105957, + "learning_rate": 7.388021280247125e-07, + "loss": 0.3294, + "step": 11367 + }, + { + "epoch": 1.95, + "grad_norm": 8.842229843139648, + "learning_rate": 7.362279045821177e-07, + "loss": 0.2984, + "step": 11368 + }, + { + "epoch": 1.95, + "grad_norm": 6.793534278869629, + "learning_rate": 7.336536811395229e-07, + "loss": 0.255, + "step": 11369 + }, + { + "epoch": 1.95, + "grad_norm": 7.305765628814697, + "learning_rate": 7.310794576969281e-07, + "loss": 0.2969, + "step": 11370 + }, + { + "epoch": 1.95, + "grad_norm": 8.11915397644043, + "learning_rate": 7.285052342543333e-07, + "loss": 0.3607, + "step": 11371 + }, + { + "epoch": 1.95, + "grad_norm": 9.45128345489502, + "learning_rate": 7.259310108117385e-07, + "loss": 0.2797, + "step": 11372 + }, + { + "epoch": 1.95, + "grad_norm": 11.961358070373535, + "learning_rate": 7.233567873691436e-07, + "loss": 0.2651, + "step": 11373 + }, + { + "epoch": 1.95, + "grad_norm": 13.083110809326172, + "learning_rate": 7.207825639265489e-07, + "loss": 0.4404, + "step": 11374 + }, + { + "epoch": 1.95, + "grad_norm": 6.668498992919922, + "learning_rate": 7.18208340483954e-07, + "loss": 0.2143, + "step": 11375 + }, + { + "epoch": 1.95, + "grad_norm": 12.162538528442383, + "learning_rate": 7.156341170413592e-07, + "loss": 0.278, + "step": 11376 + }, + { + "epoch": 1.95, + "grad_norm": 9.628896713256836, + "learning_rate": 7.130598935987644e-07, + "loss": 0.3406, + "step": 11377 + }, + { + "epoch": 1.95, + "grad_norm": 9.505866050720215, + "learning_rate": 7.104856701561695e-07, + "loss": 0.3576, + "step": 11378 + }, + { + "epoch": 1.95, + "grad_norm": 10.133254051208496, + "learning_rate": 7.079114467135748e-07, + "loss": 0.3205, + "step": 11379 + }, + { + "epoch": 1.95, + "grad_norm": 11.557565689086914, + "learning_rate": 7.053372232709799e-07, + "loss": 0.4144, + "step": 11380 + }, + { + "epoch": 1.95, + "grad_norm": 10.075443267822266, + "learning_rate": 7.027629998283851e-07, + "loss": 0.3218, + "step": 11381 + }, + { + "epoch": 1.95, + "grad_norm": 9.629512786865234, + "learning_rate": 7.001887763857903e-07, + "loss": 0.3068, + "step": 11382 + }, + { + "epoch": 1.95, + "grad_norm": 8.45313835144043, + "learning_rate": 6.976145529431955e-07, + "loss": 0.244, + "step": 11383 + }, + { + "epoch": 1.95, + "grad_norm": 10.560997009277344, + "learning_rate": 6.950403295006007e-07, + "loss": 0.3099, + "step": 11384 + }, + { + "epoch": 1.95, + "grad_norm": 10.60513973236084, + "learning_rate": 6.924661060580059e-07, + "loss": 0.3831, + "step": 11385 + }, + { + "epoch": 1.95, + "grad_norm": 11.306890487670898, + "learning_rate": 6.89891882615411e-07, + "loss": 0.2036, + "step": 11386 + }, + { + "epoch": 1.95, + "grad_norm": 6.6257123947143555, + "learning_rate": 6.873176591728163e-07, + "loss": 0.2673, + "step": 11387 + }, + { + "epoch": 1.95, + "grad_norm": 11.490890502929688, + "learning_rate": 6.847434357302214e-07, + "loss": 0.3954, + "step": 11388 + }, + { + "epoch": 1.95, + "grad_norm": 8.047118186950684, + "learning_rate": 6.821692122876266e-07, + "loss": 0.33, + "step": 11389 + }, + { + "epoch": 1.95, + "grad_norm": 8.06814956665039, + "learning_rate": 6.795949888450318e-07, + "loss": 0.3274, + "step": 11390 + }, + { + "epoch": 1.95, + "grad_norm": 10.393421173095703, + "learning_rate": 6.770207654024369e-07, + "loss": 0.4495, + "step": 11391 + }, + { + "epoch": 1.96, + "grad_norm": 12.045222282409668, + "learning_rate": 6.744465419598422e-07, + "loss": 0.5531, + "step": 11392 + }, + { + "epoch": 1.96, + "grad_norm": 10.564979553222656, + "learning_rate": 6.718723185172473e-07, + "loss": 0.3655, + "step": 11393 + }, + { + "epoch": 1.96, + "grad_norm": 11.417673110961914, + "learning_rate": 6.692980950746525e-07, + "loss": 0.4051, + "step": 11394 + }, + { + "epoch": 1.96, + "grad_norm": 11.536788940429688, + "learning_rate": 6.667238716320577e-07, + "loss": 0.3846, + "step": 11395 + }, + { + "epoch": 1.96, + "grad_norm": 7.755039691925049, + "learning_rate": 6.641496481894629e-07, + "loss": 0.2846, + "step": 11396 + }, + { + "epoch": 1.96, + "grad_norm": 10.713797569274902, + "learning_rate": 6.615754247468681e-07, + "loss": 0.4076, + "step": 11397 + }, + { + "epoch": 1.96, + "grad_norm": 14.087739944458008, + "learning_rate": 6.590012013042733e-07, + "loss": 0.3359, + "step": 11398 + }, + { + "epoch": 1.96, + "grad_norm": 9.078948020935059, + "learning_rate": 6.564269778616784e-07, + "loss": 0.3524, + "step": 11399 + }, + { + "epoch": 1.96, + "grad_norm": 7.877887725830078, + "learning_rate": 6.538527544190837e-07, + "loss": 0.2715, + "step": 11400 + }, + { + "epoch": 1.96, + "grad_norm": 6.709405899047852, + "learning_rate": 6.512785309764888e-07, + "loss": 0.1988, + "step": 11401 + }, + { + "epoch": 1.96, + "grad_norm": 8.651058197021484, + "learning_rate": 6.48704307533894e-07, + "loss": 0.2806, + "step": 11402 + }, + { + "epoch": 1.96, + "grad_norm": 8.915276527404785, + "learning_rate": 6.461300840912992e-07, + "loss": 0.3012, + "step": 11403 + }, + { + "epoch": 1.96, + "grad_norm": 8.318134307861328, + "learning_rate": 6.435558606487042e-07, + "loss": 0.1687, + "step": 11404 + }, + { + "epoch": 1.96, + "grad_norm": 8.65805721282959, + "learning_rate": 6.409816372061096e-07, + "loss": 0.4185, + "step": 11405 + }, + { + "epoch": 1.96, + "grad_norm": 10.805819511413574, + "learning_rate": 6.384074137635146e-07, + "loss": 0.3746, + "step": 11406 + }, + { + "epoch": 1.96, + "grad_norm": 8.475388526916504, + "learning_rate": 6.358331903209198e-07, + "loss": 0.3137, + "step": 11407 + }, + { + "epoch": 1.96, + "grad_norm": 9.643163681030273, + "learning_rate": 6.33258966878325e-07, + "loss": 0.2668, + "step": 11408 + }, + { + "epoch": 1.96, + "grad_norm": 10.919766426086426, + "learning_rate": 6.306847434357302e-07, + "loss": 0.3774, + "step": 11409 + }, + { + "epoch": 1.96, + "grad_norm": 9.508401870727539, + "learning_rate": 6.281105199931354e-07, + "loss": 0.3102, + "step": 11410 + }, + { + "epoch": 1.96, + "grad_norm": 12.121211051940918, + "learning_rate": 6.255362965505406e-07, + "loss": 0.4604, + "step": 11411 + }, + { + "epoch": 1.96, + "grad_norm": 11.517192840576172, + "learning_rate": 6.229620731079457e-07, + "loss": 0.4134, + "step": 11412 + }, + { + "epoch": 1.96, + "grad_norm": 8.313142776489258, + "learning_rate": 6.20387849665351e-07, + "loss": 0.3172, + "step": 11413 + }, + { + "epoch": 1.96, + "grad_norm": 7.615242004394531, + "learning_rate": 6.178136262227561e-07, + "loss": 0.3419, + "step": 11414 + }, + { + "epoch": 1.96, + "grad_norm": 11.35338020324707, + "learning_rate": 6.152394027801613e-07, + "loss": 0.3038, + "step": 11415 + }, + { + "epoch": 1.96, + "grad_norm": 9.667108535766602, + "learning_rate": 6.126651793375665e-07, + "loss": 0.3588, + "step": 11416 + }, + { + "epoch": 1.96, + "grad_norm": 9.784387588500977, + "learning_rate": 6.100909558949716e-07, + "loss": 0.307, + "step": 11417 + }, + { + "epoch": 1.96, + "grad_norm": 8.328282356262207, + "learning_rate": 6.075167324523769e-07, + "loss": 0.3785, + "step": 11418 + }, + { + "epoch": 1.96, + "grad_norm": 10.699987411499023, + "learning_rate": 6.04942509009782e-07, + "loss": 0.3058, + "step": 11419 + }, + { + "epoch": 1.96, + "grad_norm": 10.649855613708496, + "learning_rate": 6.023682855671872e-07, + "loss": 0.3671, + "step": 11420 + }, + { + "epoch": 1.96, + "grad_norm": 10.536017417907715, + "learning_rate": 5.997940621245924e-07, + "loss": 0.5921, + "step": 11421 + }, + { + "epoch": 1.96, + "grad_norm": 10.392743110656738, + "learning_rate": 5.972198386819976e-07, + "loss": 0.4402, + "step": 11422 + }, + { + "epoch": 1.96, + "grad_norm": 19.151330947875977, + "learning_rate": 5.946456152394028e-07, + "loss": 0.3639, + "step": 11423 + }, + { + "epoch": 1.96, + "grad_norm": 11.538474082946777, + "learning_rate": 5.92071391796808e-07, + "loss": 0.591, + "step": 11424 + }, + { + "epoch": 1.96, + "grad_norm": 8.823904037475586, + "learning_rate": 5.894971683542131e-07, + "loss": 0.2764, + "step": 11425 + }, + { + "epoch": 1.96, + "grad_norm": 8.737130165100098, + "learning_rate": 5.869229449116184e-07, + "loss": 0.3678, + "step": 11426 + }, + { + "epoch": 1.96, + "grad_norm": 11.101218223571777, + "learning_rate": 5.843487214690235e-07, + "loss": 0.5497, + "step": 11427 + }, + { + "epoch": 1.96, + "grad_norm": 7.815122604370117, + "learning_rate": 5.817744980264287e-07, + "loss": 0.2557, + "step": 11428 + }, + { + "epoch": 1.96, + "grad_norm": 19.70182991027832, + "learning_rate": 5.792002745838339e-07, + "loss": 0.4925, + "step": 11429 + }, + { + "epoch": 1.96, + "grad_norm": 8.772957801818848, + "learning_rate": 5.76626051141239e-07, + "loss": 0.3222, + "step": 11430 + }, + { + "epoch": 1.96, + "grad_norm": 6.422170162200928, + "learning_rate": 5.740518276986443e-07, + "loss": 0.1953, + "step": 11431 + }, + { + "epoch": 1.96, + "grad_norm": 7.435391902923584, + "learning_rate": 5.714776042560494e-07, + "loss": 0.2463, + "step": 11432 + }, + { + "epoch": 1.96, + "grad_norm": 8.697405815124512, + "learning_rate": 5.689033808134546e-07, + "loss": 0.3534, + "step": 11433 + }, + { + "epoch": 1.96, + "grad_norm": 9.7539701461792, + "learning_rate": 5.663291573708598e-07, + "loss": 0.4687, + "step": 11434 + }, + { + "epoch": 1.96, + "grad_norm": 9.636242866516113, + "learning_rate": 5.63754933928265e-07, + "loss": 0.2845, + "step": 11435 + }, + { + "epoch": 1.96, + "grad_norm": 9.384602546691895, + "learning_rate": 5.611807104856702e-07, + "loss": 0.2499, + "step": 11436 + }, + { + "epoch": 1.96, + "grad_norm": 10.37716293334961, + "learning_rate": 5.586064870430754e-07, + "loss": 0.3624, + "step": 11437 + }, + { + "epoch": 1.96, + "grad_norm": 7.166197776794434, + "learning_rate": 5.560322636004805e-07, + "loss": 0.2766, + "step": 11438 + }, + { + "epoch": 1.96, + "grad_norm": 11.577507019042969, + "learning_rate": 5.534580401578858e-07, + "loss": 0.4619, + "step": 11439 + }, + { + "epoch": 1.96, + "grad_norm": 11.988529205322266, + "learning_rate": 5.508838167152909e-07, + "loss": 0.3986, + "step": 11440 + }, + { + "epoch": 1.96, + "grad_norm": 9.408734321594238, + "learning_rate": 5.483095932726961e-07, + "loss": 0.3293, + "step": 11441 + }, + { + "epoch": 1.96, + "grad_norm": 11.547457695007324, + "learning_rate": 5.457353698301013e-07, + "loss": 0.4507, + "step": 11442 + }, + { + "epoch": 1.96, + "grad_norm": 9.997553825378418, + "learning_rate": 5.431611463875064e-07, + "loss": 0.4103, + "step": 11443 + }, + { + "epoch": 1.96, + "grad_norm": 8.524253845214844, + "learning_rate": 5.405869229449117e-07, + "loss": 0.3093, + "step": 11444 + }, + { + "epoch": 1.96, + "grad_norm": 7.29480504989624, + "learning_rate": 5.380126995023168e-07, + "loss": 0.2626, + "step": 11445 + }, + { + "epoch": 1.96, + "grad_norm": 15.253386497497559, + "learning_rate": 5.35438476059722e-07, + "loss": 0.4418, + "step": 11446 + }, + { + "epoch": 1.96, + "grad_norm": 11.01857852935791, + "learning_rate": 5.328642526171272e-07, + "loss": 0.3053, + "step": 11447 + }, + { + "epoch": 1.96, + "grad_norm": 12.180018424987793, + "learning_rate": 5.302900291745324e-07, + "loss": 0.2964, + "step": 11448 + }, + { + "epoch": 1.96, + "grad_norm": 8.832536697387695, + "learning_rate": 5.277158057319376e-07, + "loss": 0.256, + "step": 11449 + }, + { + "epoch": 1.96, + "grad_norm": 11.6122407913208, + "learning_rate": 5.251415822893428e-07, + "loss": 0.4412, + "step": 11450 + }, + { + "epoch": 1.97, + "grad_norm": 9.61803913116455, + "learning_rate": 5.225673588467479e-07, + "loss": 0.5073, + "step": 11451 + }, + { + "epoch": 1.97, + "grad_norm": 11.499238967895508, + "learning_rate": 5.199931354041532e-07, + "loss": 0.4001, + "step": 11452 + }, + { + "epoch": 1.97, + "grad_norm": 8.861466407775879, + "learning_rate": 5.174189119615583e-07, + "loss": 0.4356, + "step": 11453 + }, + { + "epoch": 1.97, + "grad_norm": 9.302648544311523, + "learning_rate": 5.148446885189635e-07, + "loss": 0.285, + "step": 11454 + }, + { + "epoch": 1.97, + "grad_norm": 10.945225715637207, + "learning_rate": 5.122704650763687e-07, + "loss": 0.4054, + "step": 11455 + }, + { + "epoch": 1.97, + "grad_norm": 11.368644714355469, + "learning_rate": 5.096962416337738e-07, + "loss": 0.3205, + "step": 11456 + }, + { + "epoch": 1.97, + "grad_norm": 8.485661506652832, + "learning_rate": 5.071220181911791e-07, + "loss": 0.2192, + "step": 11457 + }, + { + "epoch": 1.97, + "grad_norm": 13.474787712097168, + "learning_rate": 5.045477947485842e-07, + "loss": 0.5874, + "step": 11458 + }, + { + "epoch": 1.97, + "grad_norm": 10.382576942443848, + "learning_rate": 5.019735713059894e-07, + "loss": 0.3911, + "step": 11459 + }, + { + "epoch": 1.97, + "grad_norm": 9.274835586547852, + "learning_rate": 4.993993478633946e-07, + "loss": 0.3661, + "step": 11460 + }, + { + "epoch": 1.97, + "grad_norm": 7.426583766937256, + "learning_rate": 4.968251244207998e-07, + "loss": 0.2625, + "step": 11461 + }, + { + "epoch": 1.97, + "grad_norm": 11.829323768615723, + "learning_rate": 4.942509009782048e-07, + "loss": 0.3157, + "step": 11462 + }, + { + "epoch": 1.97, + "grad_norm": 9.745725631713867, + "learning_rate": 4.916766775356101e-07, + "loss": 0.3127, + "step": 11463 + }, + { + "epoch": 1.97, + "grad_norm": 9.34257698059082, + "learning_rate": 4.891024540930152e-07, + "loss": 0.4637, + "step": 11464 + }, + { + "epoch": 1.97, + "grad_norm": 10.127771377563477, + "learning_rate": 4.865282306504205e-07, + "loss": 0.3486, + "step": 11465 + }, + { + "epoch": 1.97, + "grad_norm": 10.810133934020996, + "learning_rate": 4.839540072078256e-07, + "loss": 0.3189, + "step": 11466 + }, + { + "epoch": 1.97, + "grad_norm": 7.137019634246826, + "learning_rate": 4.813797837652308e-07, + "loss": 0.2494, + "step": 11467 + }, + { + "epoch": 1.97, + "grad_norm": 8.305639266967773, + "learning_rate": 4.78805560322636e-07, + "loss": 0.278, + "step": 11468 + }, + { + "epoch": 1.97, + "grad_norm": 12.243700981140137, + "learning_rate": 4.762313368800412e-07, + "loss": 0.412, + "step": 11469 + }, + { + "epoch": 1.97, + "grad_norm": 11.298053741455078, + "learning_rate": 4.7365711343744644e-07, + "loss": 0.4562, + "step": 11470 + }, + { + "epoch": 1.97, + "grad_norm": 8.916495323181152, + "learning_rate": 4.710828899948516e-07, + "loss": 0.2969, + "step": 11471 + }, + { + "epoch": 1.97, + "grad_norm": 8.318690299987793, + "learning_rate": 4.685086665522568e-07, + "loss": 0.3448, + "step": 11472 + }, + { + "epoch": 1.97, + "grad_norm": 11.141204833984375, + "learning_rate": 4.6593444310966193e-07, + "loss": 0.4195, + "step": 11473 + }, + { + "epoch": 1.97, + "grad_norm": 10.106829643249512, + "learning_rate": 4.6336021966706713e-07, + "loss": 0.5072, + "step": 11474 + }, + { + "epoch": 1.97, + "grad_norm": 7.209422588348389, + "learning_rate": 4.607859962244723e-07, + "loss": 0.2639, + "step": 11475 + }, + { + "epoch": 1.97, + "grad_norm": 14.521828651428223, + "learning_rate": 4.5821177278187747e-07, + "loss": 0.3201, + "step": 11476 + }, + { + "epoch": 1.97, + "grad_norm": 9.716654777526855, + "learning_rate": 4.556375493392826e-07, + "loss": 0.424, + "step": 11477 + }, + { + "epoch": 1.97, + "grad_norm": 10.334619522094727, + "learning_rate": 4.530633258966878e-07, + "loss": 0.348, + "step": 11478 + }, + { + "epoch": 1.97, + "grad_norm": 10.950033187866211, + "learning_rate": 4.50489102454093e-07, + "loss": 0.2928, + "step": 11479 + }, + { + "epoch": 1.97, + "grad_norm": 8.429141998291016, + "learning_rate": 4.479148790114982e-07, + "loss": 0.2919, + "step": 11480 + }, + { + "epoch": 1.97, + "grad_norm": 8.359455108642578, + "learning_rate": 4.453406555689034e-07, + "loss": 0.3272, + "step": 11481 + }, + { + "epoch": 1.97, + "grad_norm": 8.862103462219238, + "learning_rate": 4.4276643212630856e-07, + "loss": 0.3789, + "step": 11482 + }, + { + "epoch": 1.97, + "grad_norm": 10.794952392578125, + "learning_rate": 4.4019220868371376e-07, + "loss": 0.5016, + "step": 11483 + }, + { + "epoch": 1.97, + "grad_norm": 8.989912986755371, + "learning_rate": 4.3761798524111896e-07, + "loss": 0.2662, + "step": 11484 + }, + { + "epoch": 1.97, + "grad_norm": 10.99366569519043, + "learning_rate": 4.3504376179852416e-07, + "loss": 0.4709, + "step": 11485 + }, + { + "epoch": 1.97, + "grad_norm": 12.567079544067383, + "learning_rate": 4.324695383559293e-07, + "loss": 0.4098, + "step": 11486 + }, + { + "epoch": 1.97, + "grad_norm": 9.405452728271484, + "learning_rate": 4.298953149133345e-07, + "loss": 0.2751, + "step": 11487 + }, + { + "epoch": 1.97, + "grad_norm": 10.602827072143555, + "learning_rate": 4.273210914707397e-07, + "loss": 0.4951, + "step": 11488 + }, + { + "epoch": 1.97, + "grad_norm": 13.505640983581543, + "learning_rate": 4.2474686802814485e-07, + "loss": 0.318, + "step": 11489 + }, + { + "epoch": 1.97, + "grad_norm": 10.013648986816406, + "learning_rate": 4.2217264458555e-07, + "loss": 0.272, + "step": 11490 + }, + { + "epoch": 1.97, + "grad_norm": 14.087907791137695, + "learning_rate": 4.195984211429552e-07, + "loss": 0.4187, + "step": 11491 + }, + { + "epoch": 1.97, + "grad_norm": 13.0941162109375, + "learning_rate": 4.170241977003604e-07, + "loss": 0.4704, + "step": 11492 + }, + { + "epoch": 1.97, + "grad_norm": 10.989205360412598, + "learning_rate": 4.144499742577656e-07, + "loss": 0.302, + "step": 11493 + }, + { + "epoch": 1.97, + "grad_norm": 11.566054344177246, + "learning_rate": 4.1187575081517074e-07, + "loss": 0.4751, + "step": 11494 + }, + { + "epoch": 1.97, + "grad_norm": 13.2603759765625, + "learning_rate": 4.0930152737257594e-07, + "loss": 0.2806, + "step": 11495 + }, + { + "epoch": 1.97, + "grad_norm": 14.102025032043457, + "learning_rate": 4.0672730392998114e-07, + "loss": 0.356, + "step": 11496 + }, + { + "epoch": 1.97, + "grad_norm": 11.15386962890625, + "learning_rate": 4.0415308048738634e-07, + "loss": 0.3403, + "step": 11497 + }, + { + "epoch": 1.97, + "grad_norm": 8.41845703125, + "learning_rate": 4.0157885704479154e-07, + "loss": 0.3056, + "step": 11498 + }, + { + "epoch": 1.97, + "grad_norm": 12.357941627502441, + "learning_rate": 3.990046336021967e-07, + "loss": 0.3618, + "step": 11499 + }, + { + "epoch": 1.97, + "grad_norm": 10.907411575317383, + "learning_rate": 3.964304101596019e-07, + "loss": 0.331, + "step": 11500 + }, + { + "epoch": 1.97, + "grad_norm": 11.477226257324219, + "learning_rate": 3.938561867170071e-07, + "loss": 0.3956, + "step": 11501 + }, + { + "epoch": 1.97, + "grad_norm": 11.849809646606445, + "learning_rate": 3.912819632744122e-07, + "loss": 0.4089, + "step": 11502 + }, + { + "epoch": 1.97, + "grad_norm": 12.161151885986328, + "learning_rate": 3.8870773983181737e-07, + "loss": 0.4616, + "step": 11503 + }, + { + "epoch": 1.97, + "grad_norm": 9.105419158935547, + "learning_rate": 3.8613351638922257e-07, + "loss": 0.3344, + "step": 11504 + }, + { + "epoch": 1.97, + "grad_norm": 9.301834106445312, + "learning_rate": 3.8355929294662777e-07, + "loss": 0.3531, + "step": 11505 + }, + { + "epoch": 1.97, + "grad_norm": 13.82765007019043, + "learning_rate": 3.8098506950403297e-07, + "loss": 0.61, + "step": 11506 + }, + { + "epoch": 1.97, + "grad_norm": 10.907184600830078, + "learning_rate": 3.784108460614381e-07, + "loss": 0.2997, + "step": 11507 + }, + { + "epoch": 1.97, + "grad_norm": 15.38486385345459, + "learning_rate": 3.758366226188433e-07, + "loss": 0.3336, + "step": 11508 + }, + { + "epoch": 1.98, + "grad_norm": 9.507502555847168, + "learning_rate": 3.732623991762485e-07, + "loss": 0.3728, + "step": 11509 + }, + { + "epoch": 1.98, + "grad_norm": 9.295052528381348, + "learning_rate": 3.706881757336537e-07, + "loss": 0.3916, + "step": 11510 + }, + { + "epoch": 1.98, + "grad_norm": 8.334773063659668, + "learning_rate": 3.6811395229105886e-07, + "loss": 0.3485, + "step": 11511 + }, + { + "epoch": 1.98, + "grad_norm": 8.45616626739502, + "learning_rate": 3.6553972884846406e-07, + "loss": 0.3185, + "step": 11512 + }, + { + "epoch": 1.98, + "grad_norm": 7.822600841522217, + "learning_rate": 3.6296550540586926e-07, + "loss": 0.3291, + "step": 11513 + }, + { + "epoch": 1.98, + "grad_norm": 9.113861083984375, + "learning_rate": 3.6039128196327446e-07, + "loss": 0.389, + "step": 11514 + }, + { + "epoch": 1.98, + "grad_norm": 8.824539184570312, + "learning_rate": 3.578170585206796e-07, + "loss": 0.2315, + "step": 11515 + }, + { + "epoch": 1.98, + "grad_norm": 7.730411529541016, + "learning_rate": 3.5524283507808475e-07, + "loss": 0.2284, + "step": 11516 + }, + { + "epoch": 1.98, + "grad_norm": 8.9005126953125, + "learning_rate": 3.5266861163548995e-07, + "loss": 0.4206, + "step": 11517 + }, + { + "epoch": 1.98, + "grad_norm": 18.357040405273438, + "learning_rate": 3.5009438819289515e-07, + "loss": 0.4062, + "step": 11518 + }, + { + "epoch": 1.98, + "grad_norm": 10.8555326461792, + "learning_rate": 3.4752016475030035e-07, + "loss": 0.4162, + "step": 11519 + }, + { + "epoch": 1.98, + "grad_norm": 9.863471031188965, + "learning_rate": 3.449459413077055e-07, + "loss": 0.3734, + "step": 11520 + }, + { + "epoch": 1.98, + "grad_norm": 9.063223838806152, + "learning_rate": 3.423717178651107e-07, + "loss": 0.364, + "step": 11521 + }, + { + "epoch": 1.98, + "grad_norm": 12.022010803222656, + "learning_rate": 3.397974944225159e-07, + "loss": 0.5075, + "step": 11522 + }, + { + "epoch": 1.98, + "grad_norm": 8.83696460723877, + "learning_rate": 3.372232709799211e-07, + "loss": 0.2929, + "step": 11523 + }, + { + "epoch": 1.98, + "grad_norm": 8.155284881591797, + "learning_rate": 3.3464904753732624e-07, + "loss": 0.3201, + "step": 11524 + }, + { + "epoch": 1.98, + "grad_norm": 9.256797790527344, + "learning_rate": 3.3207482409473143e-07, + "loss": 0.2956, + "step": 11525 + }, + { + "epoch": 1.98, + "grad_norm": 10.252130508422852, + "learning_rate": 3.2950060065213663e-07, + "loss": 0.3463, + "step": 11526 + }, + { + "epoch": 1.98, + "grad_norm": 10.993163108825684, + "learning_rate": 3.2692637720954183e-07, + "loss": 0.3171, + "step": 11527 + }, + { + "epoch": 1.98, + "grad_norm": 8.930692672729492, + "learning_rate": 3.24352153766947e-07, + "loss": 0.3235, + "step": 11528 + }, + { + "epoch": 1.98, + "grad_norm": 11.87640380859375, + "learning_rate": 3.217779303243521e-07, + "loss": 0.3065, + "step": 11529 + }, + { + "epoch": 1.98, + "grad_norm": 7.991257667541504, + "learning_rate": 3.192037068817573e-07, + "loss": 0.3639, + "step": 11530 + }, + { + "epoch": 1.98, + "grad_norm": 10.389147758483887, + "learning_rate": 3.166294834391625e-07, + "loss": 0.4295, + "step": 11531 + }, + { + "epoch": 1.98, + "grad_norm": 8.487982749938965, + "learning_rate": 3.140552599965677e-07, + "loss": 0.327, + "step": 11532 + }, + { + "epoch": 1.98, + "grad_norm": 9.464696884155273, + "learning_rate": 3.1148103655397287e-07, + "loss": 0.32, + "step": 11533 + }, + { + "epoch": 1.98, + "grad_norm": 14.410228729248047, + "learning_rate": 3.0890681311137807e-07, + "loss": 0.3767, + "step": 11534 + }, + { + "epoch": 1.98, + "grad_norm": 8.981040954589844, + "learning_rate": 3.0633258966878327e-07, + "loss": 0.3519, + "step": 11535 + }, + { + "epoch": 1.98, + "grad_norm": 10.440129280090332, + "learning_rate": 3.0375836622618847e-07, + "loss": 0.3847, + "step": 11536 + }, + { + "epoch": 1.98, + "grad_norm": 11.9198637008667, + "learning_rate": 3.011841427835936e-07, + "loss": 0.4402, + "step": 11537 + }, + { + "epoch": 1.98, + "grad_norm": 9.214505195617676, + "learning_rate": 2.986099193409988e-07, + "loss": 0.2856, + "step": 11538 + }, + { + "epoch": 1.98, + "grad_norm": 9.527960777282715, + "learning_rate": 2.96035695898404e-07, + "loss": 0.3397, + "step": 11539 + }, + { + "epoch": 1.98, + "grad_norm": 9.586348533630371, + "learning_rate": 2.934614724558092e-07, + "loss": 0.2686, + "step": 11540 + }, + { + "epoch": 1.98, + "grad_norm": 10.140045166015625, + "learning_rate": 2.9088724901321436e-07, + "loss": 0.3593, + "step": 11541 + }, + { + "epoch": 1.98, + "grad_norm": 17.366416931152344, + "learning_rate": 2.883130255706195e-07, + "loss": 0.3621, + "step": 11542 + }, + { + "epoch": 1.98, + "grad_norm": 12.255388259887695, + "learning_rate": 2.857388021280247e-07, + "loss": 0.2811, + "step": 11543 + }, + { + "epoch": 1.98, + "grad_norm": 8.306804656982422, + "learning_rate": 2.831645786854299e-07, + "loss": 0.3178, + "step": 11544 + }, + { + "epoch": 1.98, + "grad_norm": 10.351367950439453, + "learning_rate": 2.805903552428351e-07, + "loss": 0.2738, + "step": 11545 + }, + { + "epoch": 1.98, + "grad_norm": 7.161386966705322, + "learning_rate": 2.7801613180024024e-07, + "loss": 0.2006, + "step": 11546 + }, + { + "epoch": 1.98, + "grad_norm": 10.230098724365234, + "learning_rate": 2.7544190835764544e-07, + "loss": 0.3282, + "step": 11547 + }, + { + "epoch": 1.98, + "grad_norm": 14.078783988952637, + "learning_rate": 2.7286768491505064e-07, + "loss": 0.4008, + "step": 11548 + }, + { + "epoch": 1.98, + "grad_norm": 6.660420894622803, + "learning_rate": 2.7029346147245584e-07, + "loss": 0.2171, + "step": 11549 + }, + { + "epoch": 1.98, + "grad_norm": 10.891387939453125, + "learning_rate": 2.67719238029861e-07, + "loss": 0.3924, + "step": 11550 + }, + { + "epoch": 1.98, + "grad_norm": 11.31053638458252, + "learning_rate": 2.651450145872662e-07, + "loss": 0.5193, + "step": 11551 + }, + { + "epoch": 1.98, + "grad_norm": 9.492470741271973, + "learning_rate": 2.625707911446714e-07, + "loss": 0.3821, + "step": 11552 + }, + { + "epoch": 1.98, + "grad_norm": 11.105793952941895, + "learning_rate": 2.599965677020766e-07, + "loss": 0.3579, + "step": 11553 + }, + { + "epoch": 1.98, + "grad_norm": 10.469000816345215, + "learning_rate": 2.5742234425948173e-07, + "loss": 0.416, + "step": 11554 + }, + { + "epoch": 1.98, + "grad_norm": 12.040057182312012, + "learning_rate": 2.548481208168869e-07, + "loss": 0.3274, + "step": 11555 + }, + { + "epoch": 1.98, + "grad_norm": 13.669187545776367, + "learning_rate": 2.522738973742921e-07, + "loss": 0.4086, + "step": 11556 + }, + { + "epoch": 1.98, + "grad_norm": 8.471062660217285, + "learning_rate": 2.496996739316973e-07, + "loss": 0.3147, + "step": 11557 + }, + { + "epoch": 1.98, + "grad_norm": 9.899096488952637, + "learning_rate": 2.471254504891024e-07, + "loss": 0.4109, + "step": 11558 + }, + { + "epoch": 1.98, + "grad_norm": 8.59607982635498, + "learning_rate": 2.445512270465076e-07, + "loss": 0.3408, + "step": 11559 + }, + { + "epoch": 1.98, + "grad_norm": 13.245586395263672, + "learning_rate": 2.419770036039128e-07, + "loss": 0.5837, + "step": 11560 + }, + { + "epoch": 1.98, + "grad_norm": 8.268532752990723, + "learning_rate": 2.39402780161318e-07, + "loss": 0.2982, + "step": 11561 + }, + { + "epoch": 1.98, + "grad_norm": 20.093299865722656, + "learning_rate": 2.3682855671872322e-07, + "loss": 0.4422, + "step": 11562 + }, + { + "epoch": 1.98, + "grad_norm": 9.338587760925293, + "learning_rate": 2.342543332761284e-07, + "loss": 0.3539, + "step": 11563 + }, + { + "epoch": 1.98, + "grad_norm": 13.079157829284668, + "learning_rate": 2.3168010983353356e-07, + "loss": 0.3752, + "step": 11564 + }, + { + "epoch": 1.98, + "grad_norm": 7.305372714996338, + "learning_rate": 2.2910588639093874e-07, + "loss": 0.3345, + "step": 11565 + }, + { + "epoch": 1.98, + "grad_norm": 9.811758041381836, + "learning_rate": 2.265316629483439e-07, + "loss": 0.2837, + "step": 11566 + }, + { + "epoch": 1.99, + "grad_norm": 8.363420486450195, + "learning_rate": 2.239574395057491e-07, + "loss": 0.2655, + "step": 11567 + }, + { + "epoch": 1.99, + "grad_norm": 10.844328880310059, + "learning_rate": 2.2138321606315428e-07, + "loss": 0.3373, + "step": 11568 + }, + { + "epoch": 1.99, + "grad_norm": 11.258129119873047, + "learning_rate": 2.1880899262055948e-07, + "loss": 0.3602, + "step": 11569 + }, + { + "epoch": 1.99, + "grad_norm": 11.300853729248047, + "learning_rate": 2.1623476917796465e-07, + "loss": 0.3643, + "step": 11570 + }, + { + "epoch": 1.99, + "grad_norm": 6.862691402435303, + "learning_rate": 2.1366054573536985e-07, + "loss": 0.2271, + "step": 11571 + }, + { + "epoch": 1.99, + "grad_norm": 13.394877433776855, + "learning_rate": 2.11086322292775e-07, + "loss": 0.3863, + "step": 11572 + }, + { + "epoch": 1.99, + "grad_norm": 8.647296905517578, + "learning_rate": 2.085120988501802e-07, + "loss": 0.3639, + "step": 11573 + }, + { + "epoch": 1.99, + "grad_norm": 18.53976058959961, + "learning_rate": 2.0593787540758537e-07, + "loss": 0.3572, + "step": 11574 + }, + { + "epoch": 1.99, + "grad_norm": 5.477564334869385, + "learning_rate": 2.0336365196499057e-07, + "loss": 0.1911, + "step": 11575 + }, + { + "epoch": 1.99, + "grad_norm": 8.660491943359375, + "learning_rate": 2.0078942852239577e-07, + "loss": 0.2693, + "step": 11576 + }, + { + "epoch": 1.99, + "grad_norm": 16.215484619140625, + "learning_rate": 1.9821520507980094e-07, + "loss": 0.3393, + "step": 11577 + }, + { + "epoch": 1.99, + "grad_norm": 10.43334674835205, + "learning_rate": 1.956409816372061e-07, + "loss": 0.3334, + "step": 11578 + }, + { + "epoch": 1.99, + "grad_norm": 11.9304780960083, + "learning_rate": 1.9306675819461129e-07, + "loss": 0.4064, + "step": 11579 + }, + { + "epoch": 1.99, + "grad_norm": 9.014012336730957, + "learning_rate": 1.9049253475201648e-07, + "loss": 0.317, + "step": 11580 + }, + { + "epoch": 1.99, + "grad_norm": 13.0802583694458, + "learning_rate": 1.8791831130942166e-07, + "loss": 0.3601, + "step": 11581 + }, + { + "epoch": 1.99, + "grad_norm": 8.10100269317627, + "learning_rate": 1.8534408786682686e-07, + "loss": 0.2967, + "step": 11582 + }, + { + "epoch": 1.99, + "grad_norm": 11.166520118713379, + "learning_rate": 1.8276986442423203e-07, + "loss": 0.1733, + "step": 11583 + }, + { + "epoch": 1.99, + "grad_norm": 9.802959442138672, + "learning_rate": 1.8019564098163723e-07, + "loss": 0.2037, + "step": 11584 + }, + { + "epoch": 1.99, + "grad_norm": 11.004393577575684, + "learning_rate": 1.7762141753904237e-07, + "loss": 0.3187, + "step": 11585 + }, + { + "epoch": 1.99, + "grad_norm": 12.01358699798584, + "learning_rate": 1.7504719409644757e-07, + "loss": 0.3986, + "step": 11586 + }, + { + "epoch": 1.99, + "grad_norm": 11.093853950500488, + "learning_rate": 1.7247297065385275e-07, + "loss": 0.5979, + "step": 11587 + }, + { + "epoch": 1.99, + "grad_norm": 9.381300926208496, + "learning_rate": 1.6989874721125795e-07, + "loss": 0.4252, + "step": 11588 + }, + { + "epoch": 1.99, + "grad_norm": 41.779701232910156, + "learning_rate": 1.6732452376866312e-07, + "loss": 0.4688, + "step": 11589 + }, + { + "epoch": 1.99, + "grad_norm": 9.901070594787598, + "learning_rate": 1.6475030032606832e-07, + "loss": 0.4482, + "step": 11590 + }, + { + "epoch": 1.99, + "grad_norm": 8.64988899230957, + "learning_rate": 1.621760768834735e-07, + "loss": 0.3563, + "step": 11591 + }, + { + "epoch": 1.99, + "grad_norm": 8.116276741027832, + "learning_rate": 1.5960185344087866e-07, + "loss": 0.3584, + "step": 11592 + }, + { + "epoch": 1.99, + "grad_norm": 8.5116605758667, + "learning_rate": 1.5702762999828386e-07, + "loss": 0.4026, + "step": 11593 + }, + { + "epoch": 1.99, + "grad_norm": 8.244938850402832, + "learning_rate": 1.5445340655568903e-07, + "loss": 0.4058, + "step": 11594 + }, + { + "epoch": 1.99, + "grad_norm": 8.948546409606934, + "learning_rate": 1.5187918311309423e-07, + "loss": 0.4042, + "step": 11595 + }, + { + "epoch": 1.99, + "grad_norm": 9.164525032043457, + "learning_rate": 1.493049596704994e-07, + "loss": 0.3639, + "step": 11596 + }, + { + "epoch": 1.99, + "grad_norm": 8.158011436462402, + "learning_rate": 1.467307362279046e-07, + "loss": 0.3336, + "step": 11597 + }, + { + "epoch": 1.99, + "grad_norm": 11.70641803741455, + "learning_rate": 1.4415651278530975e-07, + "loss": 0.4431, + "step": 11598 + }, + { + "epoch": 1.99, + "grad_norm": 5.583722114562988, + "learning_rate": 1.4158228934271495e-07, + "loss": 0.2764, + "step": 11599 + }, + { + "epoch": 1.99, + "grad_norm": 12.039341926574707, + "learning_rate": 1.3900806590012012e-07, + "loss": 0.2335, + "step": 11600 + }, + { + "epoch": 1.99, + "grad_norm": 12.198591232299805, + "learning_rate": 1.3643384245752532e-07, + "loss": 0.3343, + "step": 11601 + }, + { + "epoch": 1.99, + "grad_norm": 7.904633522033691, + "learning_rate": 1.338596190149305e-07, + "loss": 0.2966, + "step": 11602 + }, + { + "epoch": 1.99, + "grad_norm": 10.212183952331543, + "learning_rate": 1.312853955723357e-07, + "loss": 0.3424, + "step": 11603 + }, + { + "epoch": 1.99, + "grad_norm": 9.84640884399414, + "learning_rate": 1.2871117212974087e-07, + "loss": 0.4, + "step": 11604 + }, + { + "epoch": 1.99, + "grad_norm": 7.661840915679932, + "learning_rate": 1.2613694868714604e-07, + "loss": 0.2402, + "step": 11605 + }, + { + "epoch": 1.99, + "grad_norm": 9.088878631591797, + "learning_rate": 1.235627252445512e-07, + "loss": 0.2863, + "step": 11606 + }, + { + "epoch": 1.99, + "grad_norm": 8.745158195495605, + "learning_rate": 1.209885018019564e-07, + "loss": 0.2985, + "step": 11607 + }, + { + "epoch": 1.99, + "grad_norm": 7.851995468139648, + "learning_rate": 1.1841427835936161e-07, + "loss": 0.2763, + "step": 11608 + }, + { + "epoch": 1.99, + "grad_norm": 9.716765403747559, + "learning_rate": 1.1584005491676678e-07, + "loss": 0.253, + "step": 11609 + }, + { + "epoch": 1.99, + "grad_norm": 12.23376750946045, + "learning_rate": 1.1326583147417195e-07, + "loss": 0.4726, + "step": 11610 + }, + { + "epoch": 1.99, + "grad_norm": 8.617305755615234, + "learning_rate": 1.1069160803157714e-07, + "loss": 0.389, + "step": 11611 + }, + { + "epoch": 1.99, + "grad_norm": 8.755290985107422, + "learning_rate": 1.0811738458898233e-07, + "loss": 0.3865, + "step": 11612 + }, + { + "epoch": 1.99, + "grad_norm": 10.882112503051758, + "learning_rate": 1.055431611463875e-07, + "loss": 0.4901, + "step": 11613 + }, + { + "epoch": 1.99, + "grad_norm": 5.497066974639893, + "learning_rate": 1.0296893770379268e-07, + "loss": 0.1278, + "step": 11614 + }, + { + "epoch": 1.99, + "grad_norm": 11.85369873046875, + "learning_rate": 1.0039471426119788e-07, + "loss": 0.322, + "step": 11615 + }, + { + "epoch": 1.99, + "grad_norm": 9.569109916687012, + "learning_rate": 9.782049081860306e-08, + "loss": 0.4142, + "step": 11616 + }, + { + "epoch": 1.99, + "grad_norm": 7.69202184677124, + "learning_rate": 9.524626737600824e-08, + "loss": 0.305, + "step": 11617 + }, + { + "epoch": 1.99, + "grad_norm": 13.631950378417969, + "learning_rate": 9.267204393341343e-08, + "loss": 0.3786, + "step": 11618 + }, + { + "epoch": 1.99, + "grad_norm": 7.579807758331299, + "learning_rate": 9.009782049081861e-08, + "loss": 0.2518, + "step": 11619 + }, + { + "epoch": 1.99, + "grad_norm": 10.297823905944824, + "learning_rate": 8.752359704822379e-08, + "loss": 0.327, + "step": 11620 + }, + { + "epoch": 1.99, + "grad_norm": 10.749382972717285, + "learning_rate": 8.494937360562897e-08, + "loss": 0.4666, + "step": 11621 + }, + { + "epoch": 1.99, + "grad_norm": 12.204568862915039, + "learning_rate": 8.237515016303416e-08, + "loss": 0.3926, + "step": 11622 + }, + { + "epoch": 1.99, + "grad_norm": 9.774746894836426, + "learning_rate": 7.980092672043933e-08, + "loss": 0.2924, + "step": 11623 + }, + { + "epoch": 1.99, + "grad_norm": 6.393989086151123, + "learning_rate": 7.722670327784452e-08, + "loss": 0.1916, + "step": 11624 + }, + { + "epoch": 2.0, + "grad_norm": 9.441868782043457, + "learning_rate": 7.46524798352497e-08, + "loss": 0.4196, + "step": 11625 + }, + { + "epoch": 2.0, + "grad_norm": 13.14520263671875, + "learning_rate": 7.207825639265488e-08, + "loss": 0.3645, + "step": 11626 + }, + { + "epoch": 2.0, + "grad_norm": 11.262258529663086, + "learning_rate": 6.950403295006006e-08, + "loss": 0.3976, + "step": 11627 + }, + { + "epoch": 2.0, + "grad_norm": 6.72003173828125, + "learning_rate": 6.692980950746525e-08, + "loss": 0.2126, + "step": 11628 + }, + { + "epoch": 2.0, + "grad_norm": 10.395048141479492, + "learning_rate": 6.435558606487043e-08, + "loss": 0.3385, + "step": 11629 + }, + { + "epoch": 2.0, + "grad_norm": 9.435748100280762, + "learning_rate": 6.17813626222756e-08, + "loss": 0.3397, + "step": 11630 + }, + { + "epoch": 2.0, + "grad_norm": 9.424450874328613, + "learning_rate": 5.9207139179680805e-08, + "loss": 0.3764, + "step": 11631 + }, + { + "epoch": 2.0, + "grad_norm": 12.107622146606445, + "learning_rate": 5.663291573708598e-08, + "loss": 0.434, + "step": 11632 + }, + { + "epoch": 2.0, + "grad_norm": 11.096739768981934, + "learning_rate": 5.405869229449116e-08, + "loss": 0.4002, + "step": 11633 + }, + { + "epoch": 2.0, + "grad_norm": 10.735634803771973, + "learning_rate": 5.148446885189634e-08, + "loss": 0.4333, + "step": 11634 + }, + { + "epoch": 2.0, + "grad_norm": 11.462284088134766, + "learning_rate": 4.891024540930153e-08, + "loss": 0.3572, + "step": 11635 + }, + { + "epoch": 2.0, + "grad_norm": 15.17946720123291, + "learning_rate": 4.6336021966706714e-08, + "loss": 0.3477, + "step": 11636 + }, + { + "epoch": 2.0, + "grad_norm": 10.753332138061523, + "learning_rate": 4.3761798524111893e-08, + "loss": 0.3611, + "step": 11637 + }, + { + "epoch": 2.0, + "grad_norm": 8.240086555480957, + "learning_rate": 4.118757508151708e-08, + "loss": 0.3462, + "step": 11638 + }, + { + "epoch": 2.0, + "grad_norm": 11.988096237182617, + "learning_rate": 3.861335163892226e-08, + "loss": 0.4948, + "step": 11639 + }, + { + "epoch": 2.0, + "grad_norm": 9.858365058898926, + "learning_rate": 3.603912819632744e-08, + "loss": 0.2489, + "step": 11640 + }, + { + "epoch": 2.0, + "grad_norm": 7.523008346557617, + "learning_rate": 3.3464904753732624e-08, + "loss": 0.3755, + "step": 11641 + }, + { + "epoch": 2.0, + "grad_norm": 11.568328857421875, + "learning_rate": 3.08906813111378e-08, + "loss": 0.3955, + "step": 11642 + }, + { + "epoch": 2.0, + "grad_norm": 9.07970905303955, + "learning_rate": 2.831645786854299e-08, + "loss": 0.3319, + "step": 11643 + }, + { + "epoch": 2.0, + "grad_norm": 8.71036434173584, + "learning_rate": 2.574223442594817e-08, + "loss": 0.3837, + "step": 11644 + }, + { + "epoch": 2.0, + "grad_norm": 13.206975936889648, + "learning_rate": 2.3168010983353357e-08, + "loss": 0.4032, + "step": 11645 + }, + { + "epoch": 2.0, + "grad_norm": 8.85912036895752, + "learning_rate": 2.059378754075854e-08, + "loss": 0.3112, + "step": 11646 + }, + { + "epoch": 2.0, + "grad_norm": 8.981271743774414, + "learning_rate": 1.801956409816372e-08, + "loss": 0.2749, + "step": 11647 + }, + { + "epoch": 2.0, + "grad_norm": 10.14116382598877, + "learning_rate": 1.54453406555689e-08, + "loss": 0.4656, + "step": 11648 + }, + { + "epoch": 2.0, + "grad_norm": 9.546759605407715, + "learning_rate": 1.2871117212974086e-08, + "loss": 0.296, + "step": 11649 + }, + { + "epoch": 2.0, + "grad_norm": 10.54394817352295, + "learning_rate": 1.029689377037927e-08, + "loss": 0.4372, + "step": 11650 + }, + { + "epoch": 2.0, + "grad_norm": 7.853440761566162, + "learning_rate": 7.72267032778445e-09, + "loss": 0.2876, + "step": 11651 + }, + { + "epoch": 2.0, + "grad_norm": 11.25448989868164, + "learning_rate": 5.148446885189635e-09, + "loss": 0.5023, + "step": 11652 + }, + { + "epoch": 2.0, + "grad_norm": 7.559902667999268, + "learning_rate": 2.5742234425948174e-09, + "loss": 0.2573, + "step": 11653 + }, + { + "epoch": 2.0, + "grad_norm": 20.35457992553711, + "learning_rate": 0.0, + "loss": 0.5802, + "step": 11654 + }, + { + "epoch": 2.0, + "step": 11654, + "total_flos": 2.1922863214339584e+17, + "train_loss": 0.7619943554932388, + "train_runtime": 3561.7574, + "train_samples_per_second": 314.078, + "train_steps_per_second": 3.272 + } + ], + "logging_steps": 1.0, + "max_steps": 11654, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "total_flos": 2.1922863214339584e+17, + "train_batch_size": 96, + "trial_name": null, + "trial_params": null +}